From 30ec628a0cb66f658fa00f9ffbd238b46c2d42f3 Mon Sep 17 00:00:00 2001
From: Sean Hefty <sean.hefty@intel.com>
Date: Thu, 7 Nov 2013 12:18:59 -0800
Subject: [PATCH] libfabric: Initial commit

libfabric overview

libfabric is an extensible framework for application access to fabric
services.  The framework supports multiple providers, including
providers built into the library.  The layout of the libfabric source
tree is outlined below.  Note that the library is still under
development and full functionality is missing.

include/rdma
------------
Contains header files for the framework, including the base framework
APIs in fabric.h.  Sample APIs are available for message queue
operations, RDMA operations, and tagged messages.  Proposed APIs and
objects that support communication and data transfer functionality are
found in fi_domain.h.

fabric.h - Base framework APIs
fi_domain.h - General resource management objects
fi_socket.h - Base communication object

src
---
Contains the base implementation for the framework and the kernel
supported APIs.

src/fabric - Base framework implementation
src/ucma - Interface to kernel rdma cm ABI
src/uverbs - Interface to kernel verbs ABI

examples
--------
Includes simple examples that demonstrate how an application can use
the framework and the various API sets.

examples/perf - Simple latency/bandwidth test.
examples/provinfo - Lists available provider information.

prov
----
Providers built into the libfabric library are under the prov
subdirectory.

prov/ibverbs - This is a *sample* provider that sits over libibverbs.
It is NOT meant as a real provider because of the overhead that results
from converting libfabric calls directly into libibverbs calls.  It is
intended to show how a hardware vendor can implement an optimized
version of their provider library for libfabric.

prov/mlx4 - This is a sample provider that works in conjunction with
the ibverbs provider.  It is mostly unchanged from the existing libmlx4
verbs provider.

prov/psm - This is a sample provider that sits over the Intel PSM
library.

prov/rdmacm - Incorporates the librdmacm functionality into libfabric.
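As a quick orientation, the sketch below shows the minimal query path an
application takes to discover available providers through the framework.
It is distilled from examples/provinfo.c in this patch (the zeroed hints
structure, the fi_getinfo() call, and the "domain" output line come from
that example); it is illustrative only, since the interfaces are still
under development and may change.

	#include <stdio.h>
	#include <string.h>

	#include <rdma/fabric.h>

	int main(void)
	{
		struct fi_info hints, *fi, *cur;
		int ret;

		/* Zeroed hints ask for information on all providers. */
		memset(&hints, 0, sizeof hints);

		ret = fi_getinfo(NULL, NULL, &hints, &fi);
		if (ret) {
			printf("fi_getinfo %s\n", strerror(-ret));
			return ret;
		}

		/* Each returned fi_info entry describes one provider domain. */
		for (cur = fi; cur; cur = cur->next)
			printf("domain: %s\n", cur->domain_name);

		fi_freeinfo(fi);
		return 0;
	}

The program is linked against the libfabric library built by this patch;
examples/perf.c extends the same flow by opening a domain and socket for
the selected provider and running data transfers over it.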
Signed-off-by: Sean Hefty <sean.hefty@intel.com> --- AUTHORS | 4 + COPYING | 378 ++ Makefile.am | 141 + README | 10 + autogen.sh | 9 + configure.ac | 139 + examples/perf.c | 657 ++++ examples/provinfo.c | 95 + examples/shared.c | 111 + examples/shared.h | 53 + include/fi.h | 132 + include/infiniband/ib.h | 107 + include/rdma/fabric.h | 387 ++ include/rdma/fi_arch.h | 117 + include/rdma/fi_atomic.h | 54 + include/rdma/fi_cm.h | 118 + include/rdma/fi_domain.h | 434 +++ include/rdma/fi_errno.h | 179 + include/rdma/fi_prov.h | 78 + include/rdma/fi_rdma.h | 93 + include/rdma/fi_socket.h | 187 + include/rdma/fi_tagged.h | 111 + include/rdma/fi_ucma.h | 718 ++++ include/rdma/fi_umad.h | 112 + include/rdma/fi_uverbs.h | 1289 +++++++ libfabric.spec.in | 71 + man/fi_getinfo.3 | 53 + man/fi_open.3 | 27 + man/fi_socket.3 | 30 + prov/ibverbs/AUTHORS | 4 + prov/ibverbs/COPYING | 378 ++ prov/ibverbs/include/infiniband/driver.h | 145 + prov/ibverbs/include/infiniband/marshall.h | 64 + prov/ibverbs/include/infiniband/opcode.h | 147 + prov/ibverbs/include/infiniband/verbs.h | 1158 ++++++ prov/ibverbs/src/cmd.c | 879 +++++ prov/ibverbs/src/device.c | 255 ++ prov/ibverbs/src/enum_strs.c | 128 + prov/ibverbs/src/fi_verbs.c | 1277 +++++++ prov/ibverbs/src/ibverbs.h | 62 + prov/ibverbs/src/init.c | 473 +++ prov/ibverbs/src/marshall.c | 144 + prov/ibverbs/src/memory.c | 719 ++++ prov/ibverbs/src/verbs.c | 534 +++ prov/mlx4/AUTHORS | 1 + prov/mlx4/COPYING | 378 ++ prov/mlx4/src/buf.c | 65 + prov/mlx4/src/cq.c | 480 +++ prov/mlx4/src/dbrec.c | 154 + prov/mlx4/src/doorbell.h | 63 + prov/mlx4/src/mlx4-abi.h | 108 + prov/mlx4/src/mlx4.c | 276 ++ prov/mlx4/src/mlx4.h | 350 ++ prov/mlx4/src/mlx4_verbs.c | 741 ++++ prov/mlx4/src/qp.c | 702 ++++ prov/mlx4/src/srq.c | 175 + prov/mlx4/src/wqe.h | 121 + prov/psm/AUTHORS | 1 + prov/psm/COPYING | 378 ++ prov/psm/src/psmx.h | 91 + prov/psm/src/psmx_av.c | 145 + prov/psm/src/psmx_cm.c | 105 + prov/psm/src/psmx_domain.c | 177 + prov/psm/src/psmx_ec.c | 205 + prov/psm/src/psmx_init.c | 163 + prov/psm/src/psmx_sock.c | 177 + prov/psm/src/psmx_tagged.c | 173 + prov/psm/src/psmx_util.c | 270 ++ prov/rdmacm/AUTHORS | 1 + prov/rdmacm/COPYING | 378 ++ prov/rdmacm/examples/common.c | 168 + prov/rdmacm/examples/common.h | 94 + prov/rdmacm/examples/rcopy.c | 628 ++++ prov/rdmacm/examples/riostream.c | 639 ++++ prov/rdmacm/examples/rstream.c | 609 +++ prov/rdmacm/examples/udpong.c | 568 +++ prov/rdmacm/include/rdma/rdma_cma.h | 684 ++++ prov/rdmacm/include/rdma/rdma_verbs.h | 316 ++ prov/rdmacm/include/rdma/rsocket.h | 99 + prov/rdmacm/src/acm.c | 439 +++ prov/rdmacm/src/addrinfo.c | 327 ++ prov/rdmacm/src/cma.c | 2210 +++++++++++ prov/rdmacm/src/cma.h | 155 + prov/rdmacm/src/indexer.c | 166 + prov/rdmacm/src/indexer.h | 144 + prov/rdmacm/src/preload.c | 1057 ++++++ prov/rdmacm/src/rsocket.c | 3970 ++++++++++++++++++++ src/fabric.c | 306 ++ src/libfabric.map | 38 + src/ucma.c | 497 +++ src/uverbs.c | 710 ++++ 91 files changed, 31963 insertions(+) create mode 100644 AUTHORS create mode 100644 COPYING create mode 100644 Makefile.am create mode 100644 README create mode 100755 autogen.sh create mode 100644 configure.ac create mode 100644 examples/perf.c create mode 100644 examples/provinfo.c create mode 100644 examples/shared.c create mode 100644 examples/shared.h create mode 100644 include/fi.h create mode 100644 include/infiniband/ib.h create mode 100644 include/rdma/fabric.h create mode 100644 include/rdma/fi_arch.h create mode 100644 include/rdma/fi_atomic.h create mode 100644 
include/rdma/fi_cm.h create mode 100644 include/rdma/fi_domain.h create mode 100644 include/rdma/fi_errno.h create mode 100644 include/rdma/fi_prov.h create mode 100644 include/rdma/fi_rdma.h create mode 100644 include/rdma/fi_socket.h create mode 100644 include/rdma/fi_tagged.h create mode 100644 include/rdma/fi_ucma.h create mode 100644 include/rdma/fi_umad.h create mode 100644 include/rdma/fi_uverbs.h create mode 100644 libfabric.spec.in create mode 100755 man/fi_getinfo.3 create mode 100644 man/fi_open.3 create mode 100644 man/fi_socket.3 create mode 100644 prov/ibverbs/AUTHORS create mode 100644 prov/ibverbs/COPYING create mode 100644 prov/ibverbs/include/infiniband/driver.h create mode 100644 prov/ibverbs/include/infiniband/marshall.h create mode 100644 prov/ibverbs/include/infiniband/opcode.h create mode 100644 prov/ibverbs/include/infiniband/verbs.h create mode 100644 prov/ibverbs/src/cmd.c create mode 100644 prov/ibverbs/src/device.c create mode 100644 prov/ibverbs/src/enum_strs.c create mode 100644 prov/ibverbs/src/fi_verbs.c create mode 100644 prov/ibverbs/src/ibverbs.h create mode 100644 prov/ibverbs/src/init.c create mode 100644 prov/ibverbs/src/marshall.c create mode 100644 prov/ibverbs/src/memory.c create mode 100644 prov/ibverbs/src/verbs.c create mode 100644 prov/mlx4/AUTHORS create mode 100644 prov/mlx4/COPYING create mode 100644 prov/mlx4/src/buf.c create mode 100644 prov/mlx4/src/cq.c create mode 100644 prov/mlx4/src/dbrec.c create mode 100644 prov/mlx4/src/doorbell.h create mode 100644 prov/mlx4/src/mlx4-abi.h create mode 100644 prov/mlx4/src/mlx4.c create mode 100644 prov/mlx4/src/mlx4.h create mode 100644 prov/mlx4/src/mlx4_verbs.c create mode 100644 prov/mlx4/src/qp.c create mode 100644 prov/mlx4/src/srq.c create mode 100644 prov/mlx4/src/wqe.h create mode 100644 prov/psm/AUTHORS create mode 100644 prov/psm/COPYING create mode 100644 prov/psm/src/psmx.h create mode 100644 prov/psm/src/psmx_av.c create mode 100644 prov/psm/src/psmx_cm.c create mode 100644 prov/psm/src/psmx_domain.c create mode 100644 prov/psm/src/psmx_ec.c create mode 100644 prov/psm/src/psmx_init.c create mode 100644 prov/psm/src/psmx_sock.c create mode 100644 prov/psm/src/psmx_tagged.c create mode 100644 prov/psm/src/psmx_util.c create mode 100644 prov/rdmacm/AUTHORS create mode 100644 prov/rdmacm/COPYING create mode 100644 prov/rdmacm/examples/common.c create mode 100644 prov/rdmacm/examples/common.h create mode 100644 prov/rdmacm/examples/rcopy.c create mode 100644 prov/rdmacm/examples/riostream.c create mode 100644 prov/rdmacm/examples/rstream.c create mode 100644 prov/rdmacm/examples/udpong.c create mode 100644 prov/rdmacm/include/rdma/rdma_cma.h create mode 100644 prov/rdmacm/include/rdma/rdma_verbs.h create mode 100644 prov/rdmacm/include/rdma/rsocket.h create mode 100644 prov/rdmacm/src/acm.c create mode 100644 prov/rdmacm/src/addrinfo.c create mode 100644 prov/rdmacm/src/cma.c create mode 100644 prov/rdmacm/src/cma.h create mode 100644 prov/rdmacm/src/indexer.c create mode 100644 prov/rdmacm/src/indexer.h create mode 100644 prov/rdmacm/src/preload.c create mode 100644 prov/rdmacm/src/rsocket.c create mode 100644 src/fabric.c create mode 100644 src/libfabric.map create mode 100644 src/ucma.c create mode 100644 src/uverbs.c diff --git a/AUTHORS b/AUTHORS new file mode 100644 index 00000000000..fcea3504a51 --- /dev/null +++ b/AUTHORS @@ -0,0 +1,4 @@ +Roland Dreier <roland@topspin.com> +Dotan Barak <dotanba@gmail.com> +Sean Hefty <sean.hefty@intel.com> +Michael S. 
Tsirkin <mst@mellanox.co.il> diff --git a/COPYING b/COPYING new file mode 100644 index 00000000000..39f3831585f --- /dev/null +++ b/COPYING @@ -0,0 +1,378 @@ +This software is available to you under a choice of one of two +licenses. You may choose to be licensed under the terms of the the +OpenIB.org BSD license or the GNU General Public License (GPL) Version +2, both included below. + +Copyright (c) 2005 Intel Corporation. All rights reserved. + +================================================================== + + OpenIB.org BSD license + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + +================================================================== + + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc. + 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Library General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. 
+ + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. 
+ + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. 
However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. 
Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. 
+ + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) year name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + <signature of Ty Coon>, 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Library General +Public License instead of this License. 
diff --git a/Makefile.am b/Makefile.am new file mode 100644 index 00000000000..74bb7bbe617 --- /dev/null +++ b/Makefile.am @@ -0,0 +1,141 @@ +AM_CPPFLAGS = -I$(srcdir)/include -I$(srcdir)/prov/ibverbs/include \ + -I$(srcdir)/prov/rdmacm/include + +lib_LTLIBRARIES = src/libfabric.la + +ACLOCAL_AMFLAGS = -I config +AM_CFLAGS = -g -Wall -D_GNU_SOURCE + +src_libfabric_la_CFLAGS = $(AM_CFLAGS) -DSYSCONFDIR=\"$(sysconfdir)\" -DRDMADIR=\"@rdmadir@\" + +if HAVE_LD_VERSION_SCRIPT + libfabric_version_script = -Wl,--version-script=$(srcdir)/src/libfabric.map +else + libfabric_version_script = +endif + +src_libfabric_la_SOURCES = src/fabric.c src/uverbs.c src/ucma.c \ + prov/ibverbs/src/cmd.c \ + prov/ibverbs/src/device.c \ + prov/ibverbs/src/enum_strs.c \ + prov/ibverbs/src/fi_verbs.c \ + prov/ibverbs/src/init.c \ + prov/ibverbs/src/marshall.c \ + prov/ibverbs/src/memory.c \ + prov/ibverbs/src/verbs.c \ + prov/rdmacm/src/acm.c \ + prov/rdmacm/src/addrinfo.c \ + prov/rdmacm/src/cma.c \ + prov/rdmacm/src/indexer.c \ + prov/rdmacm/src/rsocket.c \ + prov/mlx4/src/buf.c \ + prov/mlx4/src/cq.c \ + prov/mlx4/src/dbrec.c \ + prov/mlx4/src/mlx4.c \ + prov/mlx4/src/qp.c \ + prov/mlx4/src/srq.c \ + prov/mlx4/src/mlx4_verbs.c + +if HAVE_PSM +src_libfabric_la_SOURCES += prov/psm/src/psmx_init.c \ + prov/psm/src/psmx_domain.c \ + prov/psm/src/psmx_ec.c \ + prov/psm/src/psmx_av.c \ + prov/psm/src/psmx_sock.c \ + prov/psm/src/psmx_cm.c \ + prov/psm/src/psmx_tagged.c \ + prov/psm/src/psmx_util.c +endif + +src_libfabric_la_LDFLAGS = -version-info 1 -export-dynamic \ + $(libfabric_version_script) + +src_libfabric_la_DEPENDENCIES = $(srcdir)/src/libfabric.map + +bin_PROGRAMS = \ + prov/rdmacm/examples/rstream \ + prov/rdmacm/examples/rcopy \ + prov/rdmacm/examples/riostream \ + prov/rdmacm/examples/udpong \ + examples/fi_provinfo \ + examples/fi_perf + +prov_rdmacm_examples_rstream_SOURCES = \ + prov/rdmacm/examples/rstream.c \ + prov/rdmacm/examples/common.c +prov_rdmacm_examples_rstream_LDADD = \ + $(top_builddir)/src/libfabric.la +prov_rdmacm_examples_riostream_SOURCES = \ + prov/rdmacm/examples/riostream.c \ + prov/rdmacm/examples/common.c +prov_rdmacm_examples_riostream_LDADD = \ + $(top_builddir)/src/libfabric.la +prov_rdmacm_examples_rcopy_SOURCES = \ + prov/rdmacm/examples/rcopy.c +prov_rdmacm_examples_rcopy_LDADD = \ + $(top_builddir)/src/libfabric.la +prov_rdmacm_examples_udpong_SOURCES = \ + prov/rdmacm/examples/udpong.c \ + prov/rdmacm/examples/common.c +prov_rdmacm_examples_udpong_LDADD = \ + $(top_builddir)/src/libfabric.la +examples_fi_provinfo_SOURCES = \ + examples/provinfo.c \ + examples/shared.c +examples_fi_provinfo_LDADD = \ + $(top_builddir)/src/libfabric.la +examples_fi_perf_SOURCES = \ + examples/perf.c \ + examples/shared.c +examples_fi_perf_LDADD = \ + $(top_builddir)/src/libfabric.la + +libfabricincludedir = $(includedir)/rdma +infinibandincludedir = $(includedir)/infiniband + +libfabricinclude_HEADERS = $(top_srcdir)/include/rdma/fabric.h \ + $(top_srcdir)/include/rdma/fi_arch.h \ + $(top_srcdir)/include/rdma/fi_atomic.h \ + $(top_srcdir)/include/rdma/fi_cm.h \ + $(top_srcdir)/include/rdma/fi_domain.h \ + $(top_srcdir)/include/rdma/fi_prov.h \ + $(top_srcdir)/include/rdma/fi_rdma.h \ + $(top_srcdir)/include/rdma/fi_socket.h \ + $(top_srcdir)/include/rdma/fi_errno.h \ + $(top_srcdir)/include/rdma/fi_tagged.h \ + $(top_srcdir)/include/rdma/fi_ucma.h \ + $(top_srcdir)/include/rdma/fi_umad.h \ + $(top_srcdir)/include/rdma/fi_uverbs.h \ + $(top_srcdir)/prov/rdmacm/include/rdma/rsocket.h + 
+infinibandinclude_HEADERS = $(top_srcdir)/include/infiniband/ib.h + +man_MANS = man/fi_getinfo.3 man/fi_socket.3 man/fi_open.3 + +EXTRA_DIST = include/fi.h src/libfabric.map libfabric.spec.in $(man_MANS) \ + prov/ibverbs/include/infiniband/driver.h \ + prov/ibverbs/include/infiniband/marshall.h \ + prov/ibverbs/include/infiniband/opcode.h \ + prov/ibverbs/include/infiniband/sa.h \ + prov/ibverbs/include/infiniband/sa-kern-abi.h \ + prov/ibverbs/include/infiniband/verbs.h \ + prov/ibverbs/src/ibverbs.h \ + prov/rdmacm/include/rdma/rdma_cma.h \ + prov/rdmacm/include/rdma/rdma_verbs.h \ + prov/rdmacm/src/cma.h \ + prov/rdmacm/src/indexer.h \ + prov/mlx4/src/doorbell.h \ + prov/mlx4/src/mlx4.h \ + prov/mlx4/src/mlx4-abi.h \ + prov/mlx4/wqe.h \ + examples/shared.h + +dist-hook: libfabric.spec + cp libfabric.spec $(distdir) + +install-data-hook: + cd $(DESTDIR)$(mandir)/man3 && \ + $(RM) fi_freeinfo.3 && \ + $(RM) fi_close.3 && \ + $(LN_S) fi_getinfo.3 fi_freeinfo.3 && \ + $(LN_S) fi_open.3 fi_close.3 diff --git a/README b/README new file mode 100644 index 00000000000..1cebdef42c8 --- /dev/null +++ b/README @@ -0,0 +1,10 @@ +This README is for userspace RDMA fabric library. + +Building +======== +To make this directory, run: +./autogen.sh && ./configure && make && make install + +Typically the autogen and configure steps only need be done the first +time unless configure.ac or Makefile.am changes. + diff --git a/autogen.sh b/autogen.sh new file mode 100755 index 00000000000..f433312161d --- /dev/null +++ b/autogen.sh @@ -0,0 +1,9 @@ +#! /bin/sh + +set -x +test -d ./config || mkdir ./config +aclocal -I config +libtoolize --force --copy +autoheader +automake --foreign --add-missing --copy +autoconf diff --git a/configure.ac b/configure.ac new file mode 100644 index 00000000000..3bbb05a64d4 --- /dev/null +++ b/configure.ac @@ -0,0 +1,139 @@ +dnl Process this file with autoconf to produce a configure script. + +AC_PREREQ(2.57) +AC_INIT(libfabric, 0.0.1, linux-rdma@vger.kernel.org) +AC_CONFIG_SRCDIR([src/fabric.c]) +AC_CONFIG_AUX_DIR(config) +AC_CONFIG_MACRO_DIR(config) +AC_CONFIG_HEADERS(config.h) +AM_INIT_AUTOMAKE(libfabric, 0.0.1) +m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])]) + +AC_ARG_ENABLE([debug], + [AS_HELP_STRING([--enable-debug], + [Enable debugging @<:@default=no@:>@]) + ], + [CFLAGS="$CFLAGS -g -O0 -Wall"], + [enable_debug=no]) + +dnl Fix autoconf's habit of adding -g -O2 by default +AS_IF([test -z "$CFLAGS"], + [CFLAGS='-O2 -DNDEBUG -Wall']) + +AM_PROG_LIBTOOL + +AC_ARG_WITH([valgrind], + AC_HELP_STRING([--with-valgrind], + [Enable valgrind annotations - default NO])) + +if test "$with_valgrind" != "" && test "$with_valgrind" != "no"; then + AC_DEFINE([INCLUDE_VALGRIND], 1, + [Define to 1 to enable valgrind annotations]) + if test -d $with_valgrind; then + CPPFLAGS="$CPPLFAGS -I$with_valgrind/include" + fi +fi + +AC_ARG_ENABLE(libcheck, [ --disable-libcheck do not test for presence of libraries], +[ if test "$enableval" = "no"; then + disable_libcheck=yes + fi +]) + +dnl Checks for programs +AC_PROG_CC + +dnl Checks for typedefs, structures, and compiler characteristics. +AC_C_CONST +AC_CHECK_SIZEOF(long) + +dnl Checks for libraries +AC_CHECK_LIB(dl, dlsym, [], + AC_MSG_ERROR([dlsym() not found. libfabric requires libdl.])) +AC_CHECK_LIB(pthread, pthread_mutex_init, [], + AC_MSG_ERROR([pthread_mutex_init() not found. 
libfabric requires libpthread.])) + +dnl Check for gcc atomic intrinsics +AC_MSG_CHECKING(compiler support for atomics) +AC_TRY_LINK([int i = 0;], + [ return __sync_add_and_fetch(&i, 1) != __sync_sub_and_fetch(&i, 1); ], + [ AC_MSG_RESULT(yes) ], + [ + AC_MSG_RESULT(no) + AC_DEFINE(DEFINE_ATOMICS, 1, [Set to 1 to implement atomics]) + ]) + +dnl Checks for header files. +AC_HEADER_STDC + +if test "$disable_libcheck" != "yes"; then +if test "$with_valgrind" != "" && test "$with_valgrind" != "no"; then +AC_CHECK_HEADER(valgrind/memcheck.h, [], + AC_MSG_ERROR([valgrind requested but <valgrind/memcheck.h> not found.])) +fi +fi + +AC_CACHE_CHECK(whether ld accepts --version-script, ac_cv_version_script, + if test -n "`$LD --help < /dev/null 2>/dev/null | grep version-script`"; then + ac_cv_version_script=yes + else + ac_cv_version_script=no + fi) + +AM_CONDITIONAL(HAVE_LD_VERSION_SCRIPT, test "$ac_cv_version_script" = "yes") + +AC_ARG_ENABLE([psm], + [AS_HELP_STRING([--enable-psm], + [Enable PSM provider @<:@default=no@:>@]) + ], + [AC_DEFINE([HAVE_PSM], [1], [Define if PSM is enabled]) + LIBS="-lpsm_infinipath $LIBS"], + [enable_psm=no]) + +AC_ARG_WITH([psm], + [AS_HELP_STRING([--with-psm=@<:@PSM installation path@:>@], + [Provide path to PSM installation]) + ], + [AS_CASE([$with_psm], + [yes|no], [AC_DEFINE([HAVE_PSM], [1], [Define if PSM is enabled])], + [CPPFLAGS="-I$with_psm/include $CPPFLAGS" + LDFLAGS="-L$with_psm/lib64 -Wl,-rpath=$with_psm/lib64 $LDFLAGS" + LIBS="-lpsm_infinipath $LIBS" + AC_DEFINE([HAVE_PSM], [1], [Define if PSM is enabled])]) + ]) + +AC_ARG_WITH([psm-include], + [AS_HELP_STRING([--with-psm-include=@<:@PSM include path@:>@], + [Provide path to PSM include files]) + ], + [AS_CASE([$with_psm_include], + [yes|no], [AC_DEFINE([HAVE_PSM], [1], [Define if PSM is enabled])], + [CPPFLAGS="-I$with_psm_include $CPPFLAGS" + AC_DEFINE([HAVE_PSM], [1], [Define if PSM is enabled]) + ]) + ]) + +AC_ARG_WITH([psm-lib], + [AS_HELP_STRING([--with-psm-lib=@<:@PSM library path@:>@], + [Provide path to PSM library files]) + ], + [AS_CASE([$with_psm_lib], + [yes|no], [], + [LDFLAGS="-L$with_psm_lib -Wl,-rpath=$with_psm_lib $LDFLAGS" + LIBS="-lpsm_infinipath $LIBS" + AC_DEFINE([HAVE_PSM], [1], [Define if PSM is enabled]) + ]) + ]) + +AS_IF([test x"$enable_psm" = x"yes"], + [AC_CHECK_LIB(psm_infinipath, psm_init, + [AC_CHECK_HEADER([psm.h], [], + [AC_MSG_ERROR([psm.h not found. Provide the correct path to PSM with --with-psm-include (or --with-psm)])] + )], + AC_MSG_ERROR([psm_init() not found. Provide the correct path to PSM --with-psm-lib]))], + [AC_MSG_NOTICE(PSM not enabled)]) + +AM_CONDITIONAL([HAVE_PSM], [test x"$enable_psm" = x"yes"]) + +AC_CONFIG_FILES([Makefile libfabric.spec]) +AC_OUTPUT diff --git a/examples/perf.c b/examples/perf.c new file mode 100644 index 00000000000..6260aea9a51 --- /dev/null +++ b/examples/perf.c @@ -0,0 +1,657 @@ +/* + * Copyright (c) 2013 Intel Corporation. All rights reserved. + * + * This software is available to you under the OpenIB.org BSD license + * below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AWV + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <strings.h> +#include <errno.h> +#include <getopt.h> +#include <sys/types.h> +#include <sys/socket.h> +#include <sys/time.h> +#include <sys/wait.h> +#include <netdb.h> +#include <fcntl.h> +#include <unistd.h> +#include <netinet/in.h> +#include <netinet/tcp.h> + +#include <rdma/fabric.h> +#include <rdma/fi_domain.h> +#include <rdma/fi_errno.h> +#include <rdma/fi_socket.h> +#include <rdma/fi_cm.h> +#include "shared.h" + + +struct test_size_param { + int size; + int option; +}; + +static struct test_size_param test_size[] = { + { 1 << 6, 0 }, + { 1 << 7, 1 }, { (1 << 7) + (1 << 6), 1}, + { 1 << 8, 1 }, { (1 << 8) + (1 << 7), 1}, + { 1 << 9, 1 }, { (1 << 9) + (1 << 8), 1}, + { 1 << 10, 1 }, { (1 << 10) + (1 << 9), 1}, + { 1 << 11, 1 }, { (1 << 11) + (1 << 10), 1}, + { 1 << 12, 0 }, { (1 << 12) + (1 << 11), 1}, + { 1 << 13, 1 }, { (1 << 13) + (1 << 12), 1}, + { 1 << 14, 1 }, { (1 << 14) + (1 << 13), 1}, + { 1 << 15, 1 }, { (1 << 15) + (1 << 14), 1}, + { 1 << 16, 0 }, { (1 << 16) + (1 << 15), 1}, + { 1 << 17, 1 }, { (1 << 17) + (1 << 16), 1}, + { 1 << 18, 1 }, { (1 << 18) + (1 << 17), 1}, + { 1 << 19, 1 }, { (1 << 19) + (1 << 18), 1}, + { 1 << 20, 0 }, { (1 << 20) + (1 << 19), 1}, + { 1 << 21, 1 }, { (1 << 21) + (1 << 20), 1}, + { 1 << 22, 1 }, { (1 << 22) + (1 << 21), 1}, +}; +#define TEST_CNT (sizeof test_size / sizeof test_size[0]) + +enum perf_optimization { + opt_latency, + opt_bandwidth +}; + +#define SEND_CONTEXT NULL + +static int custom; +static enum perf_optimization optimization; +static int size_option; +static int iterations = 1; +static int transfer_size = 1000; +static int transfer_count = 1000; +/* TODO: make max_credits dynamic based on user input or socket size */ +static int max_credits = 128; +static int credits = 128; +static char test_name[10] = "custom"; +static struct timeval start, end; +static void *buf; +static size_t buffer_size; + +static struct fi_info hints; +static char *dst_addr, *src_addr; +static char *port = "9228"; +static fid_t lfs, ldom, lcm; +static fid_t fs, dom, mr, cq; + + +static void show_perf(void) +{ + char str[32]; + float usec; + long long bytes; + + usec = (end.tv_sec - start.tv_sec) * 1000000 + (end.tv_usec - start.tv_usec); + bytes = (long long) iterations * transfer_count * transfer_size * 2; + + /* name size transfers iterations bytes seconds Gb/sec usec/xfer */ + printf("%-10s", test_name); + size_str(str, sizeof str, transfer_size); + printf("%-8s", str); + cnt_str(str, sizeof str, transfer_count); + printf("%-8s", str); + cnt_str(str, sizeof str, iterations); + printf("%-8s", str); + size_str(str, sizeof str, bytes); + printf("%-8s", str); + printf("%8.2fs%10.2f%11.2f\n", + usec / 1000000., (bytes * 8) / (1000. 
* usec), + (usec / iterations) / (transfer_count * 2)); +} + +static void init_latency_test(int size) +{ + char sstr[5]; + + size_str(sstr, sizeof sstr, size); + snprintf(test_name, sizeof test_name, "%s_lat", sstr); + transfer_count = 1; + transfer_size = size; + iterations = size_to_count(transfer_size); +} + +static void init_bandwidth_test(int size) +{ + char sstr[5]; + + size_str(sstr, sizeof sstr, size); + snprintf(test_name, sizeof test_name, "%s_bw", sstr); + iterations = 1; + transfer_size = size; + transfer_count = size_to_count(transfer_size); +} + +static int poll_all(void) +{ + struct fi_ec_entry comp; + int ret; + + do { + ret = fi_ec_read(cq, &comp, sizeof comp); + if (ret > 0) { + if (comp.op_context == SEND_CONTEXT) + credits++; + } else if (ret < 0) { + printf("Completion queue read %d (%s)\n", ret, fi_strerror(-ret)); + return ret; + } + } while (ret); + return 0; +} + +static int send_xfer(int size) +{ + struct fi_ec_entry comp; + int ret; + + while (!credits) { + ret = fi_ec_read(cq, &comp, sizeof comp); + if (ret > 0) { + if (comp.op_context == SEND_CONTEXT) + goto post; + } else if (ret < 0) { + printf("Completion queue read %d (%s)\n", ret, fi_strerror(-ret)); + return ret; + } + } + + credits--; +post: + ret = fi_sendmem(fs, buf, size, fi_mr_desc(mr), SEND_CONTEXT); + if (ret) + printf("fi_write %d (%s)\n", ret, fi_strerror(-ret)); + + return ret; +} + +static int recv_xfer(int size) +{ + struct fi_ec_entry comp; + int ret; + + while (1) { + ret = fi_ec_read(cq, &comp, sizeof comp); + if (ret > 0) { + if (comp.op_context == SEND_CONTEXT) + credits++; + else + break; + } else if (ret < 0) { + printf("Completion queue read %d (%s)\n", ret, fi_strerror(-ret)); + return ret; + } + } + + ret = fi_recvmem(fs, buf, buffer_size, fi_mr_desc(mr), buf); + if (ret) + printf("fi_recvmem %d (%s)\n", ret, fi_strerror(-ret)); + + return ret; +} + +static int sync_test(void) +{ + int ret; + + while (credits < max_credits) + poll_all(); + + ret = dst_addr ? send_xfer(16) : recv_xfer(16); + if (ret) + return ret; + + return dst_addr ? recv_xfer(16) : send_xfer(16); +} + +static int run_test(void) +{ + int ret, i, t; + + ret = sync_test(); + if (ret) + goto out; + + gettimeofday(&start, NULL); + for (i = 0; i < iterations; i++) { + for (t = 0; t < transfer_count; t++) { + ret = dst_addr ? send_xfer(transfer_size) : + recv_xfer(transfer_size); + if (ret) + goto out; + } + + for (t = 0; t < transfer_count; t++) { + ret = dst_addr ? 
recv_xfer(transfer_size) : + send_xfer(transfer_size); + if (ret) + goto out; + } + } + gettimeofday(&end, NULL); + show_perf(); + ret = 0; + +out: + return ret; +} + +static int alloc_cm_ec(fid_t dom, fid_t *cm_ec) +{ + struct fi_ec_attr cm_attr; + int ret; + + memset(&cm_attr, 0, sizeof cm_attr); + cm_attr.ec_mask = FI_EC_ATTR_MASK_V1; + cm_attr.domain = FI_EC_DOMAIN_CM; + cm_attr.type = FI_EC_QUEUE; + cm_attr.format = FI_EC_FORMAT_CM; + cm_attr.wait_obj = FI_EC_WAIT_FD; + cm_attr.flags = FI_AUTO_RESET; + ret = fi_ec_open(dom, &cm_attr, cm_ec, NULL); + if (ret) + printf("fi_ec_open cm %s\n", fi_strerror(-ret)); + + return ret; +} + +static void free_lres(void) +{ + fi_close(lcm); + fi_close(ldom); +} + +static int alloc_lres(struct fi_info *fi) +{ + int ret; + + ret = fi_open(NULL, fi, 0, &ldom, NULL); + if (ret) { + printf("fi_open %s %s\n", fi->domain_name, fi_strerror(-ret)); + return ret; + } + + ret = alloc_cm_ec(ldom, &lcm); + if (ret) + fi_close(ldom); + + return ret; +} + +static void free_res(void) +{ + fi_mr_unreg(mr); + fi_close(cq); + fi_close(dom); + free(buf); +} + +static int alloc_res(struct fi_info *fi) +{ + struct fi_ec_attr cq_attr; + int ret; + + buffer_size = !custom ? test_size[TEST_CNT - 1].size : transfer_size; + buf = malloc(buffer_size); + if (!buf) { + perror("malloc"); + return -1; + } + + ret = fi_open(NULL, fi, 0, &dom, NULL); + if (ret) { + printf("fi_open %s %s\n", fi->domain_name, fi_strerror(-ret)); + goto err1; + } + + memset(&cq_attr, 0, sizeof cq_attr); + cq_attr.ec_mask = FI_EC_ATTR_MASK_V1; + cq_attr.domain = FI_EC_DOMAIN_COMP; + cq_attr.type = FI_EC_QUEUE; + cq_attr.format = FI_EC_FORMAT_CONTEXT; + cq_attr.wait_obj = FI_EC_WAIT_NONE; + cq_attr.size = max_credits << 1; + ret = fi_ec_open(dom, &cq_attr, &cq, NULL); + if (ret) { + printf("fi_eq_open comp %s\n", fi_strerror(-ret)); + goto err2; + } + + ret = fi_mr_reg(dom, buf, buffer_size, &mr, 0, NULL); + if (ret) { + printf("fi_mr_reg %s\n", fi_strerror(-ret)); + goto err3; + } + return 0; + +err3: + fi_close(cq); +err2: + fi_close(dom); +err1: + free(buf); + return ret; +} + +static int bind_fid(fid_t sock, fid_t res, uint64_t flags) +{ + struct fi_resource fr; + int ret; + + fr.fid = res; + fr.flags = flags; + ret = fi_bind(sock, &fr, 1); + if (ret) + printf("fi_bind %s\n", fi_strerror(-ret)); + return ret; +} + +static int bind_lres(void) +{ + return bind_fid(lfs, lcm, 0); +} + +static int bind_res(void) +{ + int ret; + + ret = bind_fid(fs, cq, FI_SEND | FI_RECV); + if (!ret) { + ret = fi_recvmem(fs, buf, buffer_size, fi_mr_desc(mr), buf); + if (ret) + printf("fi_read %d (%s)\n", ret, fi_strerror(-ret)); + } + return ret; +} + +static int server_listen(void) +{ + struct fi_info *fi; + int ret; + + hints.flags = FI_PASSIVE; + ret = fi_getinfo(src_addr, port, &hints, &fi); + if (ret) { + printf("fi_getinfo %s\n", strerror(-ret)); + return ret; + } + + ret = fi_socket(fi, &lfs, NULL); + if (ret) { + printf("fi_socket %s\n", fi_strerror(-ret)); + goto err1; + } + + ret = alloc_lres(fi); + if (ret) + goto err2; + + ret = bind_lres(); + if (ret) + goto err3; + + ret = fi_listen(lfs); + if (ret) { + printf("fi_listen %s\n", fi_strerror(-ret)); + goto err3; + } + + fi_freeinfo(fi); + return 0; +err3: + free_lres(); +err2: + fi_close(lfs); +err1: + fi_freeinfo(fi); + return ret; +} + +static int server_connect(void) +{ + struct fi_ec_cm_entry entry; + ssize_t rd; + int ret; + + rd = fi_ec_read(lcm, &entry, sizeof entry); + if (rd != sizeof entry) { + printf("fi_ec_read %zd %s\n", rd, fi_strerror((int) 
-rd)); + return (int) rd; + } + + if (entry.event != FI_CONNREQ) { + printf("Unexpected CM event %d\n", entry.event); + ret = -FI_EOTHER; + goto err1; + } + + ret = fi_socket(entry.info, &fs, NULL); + if (ret) { + printf("fi_socket for req %s\n", fi_strerror(-ret)); + goto err1; + } + + ret = alloc_res(entry.info); + if (ret) + goto err2; + + ret = bind_res(); + if (ret) + goto err3; + + ret = fi_accept(fs, NULL, 0); + if (ret) { + printf("fi_accept %s\n", fi_strerror(-ret)); + goto err3; + } + + fi_freeinfo(entry.info); + return 0; + +err3: + free_res(); +err2: + fi_close(fs); +err1: + fi_freeinfo(entry.info); + return ret; +} + +static int client_connect(void) +{ + struct fi_info *fi; + int ret; + + if (src_addr) { + ret = getaddr(src_addr, NULL, (struct sockaddr **) &hints.src_addr, + (socklen_t *) &hints.src_addrlen); + if (ret) + printf("source address error %s\n", gai_strerror(ret)); + } + + ret = fi_getinfo(dst_addr, port, &hints, &fi); + if (ret) { + printf("fi_getinfo %s\n", strerror(-ret)); + goto err1; + } + + ret = fi_socket(fi, &fs, NULL); + if (ret) { + printf("fi_socket %s\n", fi_strerror(-ret)); + goto err2; + } + + ret = alloc_res(fi); + if (ret) + goto err3; + + ret = bind_res(); + if (ret) + goto err4; + + ret = fi_connect(fs, NULL, 0); + if (ret) { + printf("fi_connect %s\n", fi_strerror(-ret)); + goto err4; + } + + if (hints.src_addr) + free(hints.src_addr); + fi_freeinfo(fi); + return 0; + +err4: + free_res(); +err3: + fi_close(fs); +err2: + fi_freeinfo(fi); +err1: + if (hints.src_addr) + free(hints.src_addr); + return ret; +} + +static int run(void) +{ + int i, ret = 0; + + if (!dst_addr) { + ret = server_listen(); + if (ret) + return ret; + } + + printf("%-10s%-8s%-8s%-8s%-8s%8s %10s%13s\n", + "name", "bytes", "xfers", "iters", "total", "time", "Gb/sec", "usec/xfer"); + if (!custom) { + optimization = opt_latency; + ret = dst_addr ? client_connect() : server_connect(); + if (ret) + return ret; + + for (i = 0; i < TEST_CNT; i++) { + if (test_size[i].option > size_option) + continue; + init_latency_test(test_size[i].size); + run_test(); + } + + /* + * disable bandwidth test until we have a correct flooding + * message protocol + fi_shutdown(fs, 0); + poll_all(); + fi_close(fs); + free_res(); + + optimization = opt_bandwidth; + ret = dst_addr ? client_connect() : server_connect(); + if (ret) + return ret; + + for (i = 0; i < TEST_CNT; i++) { + if (test_size[i].option > size_option) + continue; + init_bandwidth_test(test_size[i].size); + run_test(); + } + */ + } else { + ret = dst_addr ? 
client_connect() : server_connect(); + if (ret) + return ret; + + ret = run_test(); + } + + while (credits < max_credits) + poll_all(); + fi_shutdown(fs, 0); + fi_close(fs); + free_res(); + if (!dst_addr) + free_lres(); + return ret; +} + +int main(int argc, char **argv) +{ + int op, ret; + + while ((op = getopt(argc, argv, "d:n:p:s:C:I:S:")) != -1) { + switch (op) { + case 'd': + dst_addr = optarg; + break; + case 'n': + hints.domain_name = optarg; + break; + case 'p': + port = optarg; + break; + case 's': + src_addr = optarg; + break; + case 'C': + custom = 1; + transfer_count = atoi(optarg); + break; + case 'I': + custom = 1; + iterations = atoi(optarg); + break; + case 'S': + if (!strncasecmp("all", optarg, 3)) { + size_option = 1; + } else { + custom = 1; + transfer_size = atoi(optarg); + } + break; + default: + printf("usage: %s\n", argv[0]); + printf("\t[-d destination_address]\n"); + printf("\t[-n domain_name]\n"); + printf("\t[-p port_number]\n"); + printf("\t[-s source_address]\n"); + printf("\t[-C transfer_count]\n"); + printf("\t[-I iterations]\n"); + printf("\t[-S transfer_size or 'all']\n"); + exit(1); + } + } + + hints.type = FID_MSG; + ret = run(); + return ret; +} diff --git a/examples/provinfo.c b/examples/provinfo.c new file mode 100644 index 00000000000..3e09cc7a0fa --- /dev/null +++ b/examples/provinfo.c @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2013 Intel Corporation. All rights reserved. + * + * This software is available to you under the OpenIB.org BSD license + * below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AWV + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include <errno.h> +#include <getopt.h> +#include <netdb.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/socket.h> +#include <sys/types.h> + +#include <rdma/fabric.h> +#include "shared.h" + + +static struct fi_info hints; +static char *dst_addr; + +static int run(void) +{ + struct fi_info *fi, *cur; + int ret; + + ret = fi_getinfo(dst_addr, NULL, &hints, &fi); + if (ret) { + printf("fi_getinfo %s\n", strerror(-ret)); + return ret; + } + + for (cur = fi; cur; cur = cur->next) { + printf("domain: %s\n", cur->domain_name); + } + + return ret; +} + +int main(int argc, char **argv) +{ + int op, ret; + + while ((op = getopt(argc, argv, "d:n:s:")) != -1) { + switch (op) { + case 'd': + dst_addr = optarg; + break; + case 'n': + hints.domain_name = optarg; + break; + case 's': + ret = getaddr(optarg, NULL, (struct sockaddr **) &hints.src_addr, + (socklen_t *) &hints.src_addrlen); + if (ret) { + printf("source address error %s\n", + gai_strerror(errno)); + } + break; + default: + printf("usage: %s\n", argv[0]); + printf("\t[-d destination_address]\n"); + printf("\t[-n domain_name]\n"); + printf("\t[-s source_address]\n"); + exit(1); + } + } + + ret = run(); + return ret; +} diff --git a/examples/shared.c b/examples/shared.c new file mode 100644 index 00000000000..7925f41cf1b --- /dev/null +++ b/examples/shared.c @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2013 Intel Corporation. All rights reserved. + * + * This software is available to you under the OpenIB.org BSD license + * below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AWV + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include <errno.h> +#include <netdb.h> +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <sys/socket.h> +#include <sys/types.h> + +#include "shared.h" + + +int getaddr(char *node, char *service, struct sockaddr **addr, socklen_t *len) +{ + struct addrinfo *ai; + int ret; + + ret = getaddrinfo(node, service, NULL, &ai); + if (ret) + return ret; + + if ((*addr = malloc(ai->ai_addrlen))) { + memcpy(*addr, ai->ai_addr, ai->ai_addrlen); + *len = ai->ai_addrlen; + } else { + ret = EAI_MEMORY; + } + + freeaddrinfo(ai); + return ret; +} + +void size_str(char *str, size_t ssize, long long size) +{ + long long base, fraction = 0; + char mag; + + if (size >= (1 << 30)) { + base = 1 << 30; + mag = 'g'; + } else if (size >= (1 << 20)) { + base = 1 << 20; + mag = 'm'; + } else if (size >= (1 << 10)) { + base = 1 << 10; + mag = 'k'; + } else { + base = 1; + mag = '\0'; + } + + if (size / base < 10) + fraction = (size % base) * 10 / base; + if (fraction) { + snprintf(str, ssize, "%lld.%lld%c", size / base, fraction, mag); + } else { + snprintf(str, ssize, "%lld%c", size / base, mag); + } +} + +void cnt_str(char *str, size_t ssize, long long cnt) +{ + if (cnt >= 1000000000) + snprintf(str, ssize, "%lldb", cnt / 1000000000); + else if (cnt >= 1000000) + snprintf(str, ssize, "%lldm", cnt / 1000000); + else if (cnt >= 1000) + snprintf(str, ssize, "%lldk", cnt / 1000); + else + snprintf(str, ssize, "%lld", cnt); +} + +int size_to_count(int size) +{ + if (size >= (1 << 20)) + return 100; + else if (size >= (1 << 16)) + return 1000; + else if (size >= (1 << 10)) + return 10000; + else + return 100000; +} diff --git a/examples/shared.h b/examples/shared.h new file mode 100644 index 00000000000..1fb3660852e --- /dev/null +++ b/examples/shared.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2013 Intel Corporation. All rights reserved. + * + * This software is available to you under the OpenIB.org BSD license + * below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AWV + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef _SHARED_H_ +#define _SHARED_H_ + +#include <sys/socket.h> +#include <sys/types.h> + +#include <rdma/fabric.h> + +#ifdef __cplusplus +extern "C" { +#endif + + +int getaddr(char *node, char *service, struct sockaddr **addr, socklen_t *len); +void size_str(char *str, size_t ssize, long long size); +void cnt_str(char *str, size_t ssize, long long cnt); +int size_to_count(int size); + + +#ifdef __cplusplus +} +#endif + +#endif /* _SHARED_H_ */ diff --git a/include/fi.h b/include/fi.h new file mode 100644 index 00000000000..2c6d237d0fb --- /dev/null +++ b/include/fi.h @@ -0,0 +1,132 @@ +/* + * Copyright (c) 2013 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenFabrics.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _FI_H_ +#define _FI_H_ + +#include <endian.h> +#include <byteswap.h> +#include <rdma/fabric.h> +#include <rdma/fi_prov.h> + + +#ifdef __cplusplus +extern "C" { +#endif + +#define PFX "libfabric: " + +#ifdef INCLUDE_VALGRIND +# include <valgrind/memcheck.h> +# ifndef VALGRIND_MAKE_MEM_DEFINED +# warning "Valgrind requested, but VALGRIND_MAKE_MEM_DEFINED undefined" +# endif +#endif + +#ifndef VALGRIND_MAKE_MEM_DEFINED +# define VALGRIND_MAKE_MEM_DEFINED(addr, len) +#endif + +#if __BYTE_ORDER == __LITTLE_ENDIAN +static inline be64_t htonll(uint64_t x) { return bswap_64(x); } +static inline uint64_t ntohll(be64_t x) { return bswap_64(x); } +#else +static inline be64_t htonll(uint64_t x) { return x; } +static inline uint64_t ntohll(be64_t x) { return x; } +#endif + +#define max(a, b) ((a) > (b) ? a : b) +#define min(a, b) ((a) < (b) ? 
a : b) + +struct fi_prov { + struct fi_prov *next; + struct fi_ops_prov *ops; +}; + +struct uv_dev { + struct uv_dev *next; + char sysfs_name[FI_NAME_MAX]; + char dev_name[FI_NAME_MAX]; + char sysfs_path[FI_PATH_MAX]; + char dev_path[FI_PATH_MAX]; +}; + +extern int uv_abi_ver; +extern struct uv_dev *udev_head, *udev_tail; + +int fi_init(void); + +void uv_ini(void); +void uv_fini(void); +int uv_init(void); + +void ibv_ini(void); +void ibv_fini(void); + +void ucma_ini(void); +void ucma_fini(void); +int ucma_init(void); + +void rdma_cm_ini(void); +void rdma_cm_fini(void); + +void mlx4_ini(void); +void mlx4_fini(void); + +#ifdef HAVE_PSM +void psmx_ini(void); +void psmx_fini(void); +#else +#define psmx_ini() +#define psmx_fini() +#endif + +const char *fi_sysfs_path(void); +int fi_read_file(const char *dir, const char *file, char *buf, size_t size); +void __fi_freeinfo(struct fi_info *info); + +#define IBV_PREFIX "ibv" +#ifndef SYSCONFDIR +#define SYSCONFDIR "/etc" +#endif +#ifndef RDMADIR +#define RDMADIR "rdma" +#endif +#define RDMA_CONF_DIR SYSCONFDIR "/" RDMADIR +#define FI_CONF_DIR RDMA_CONF_DIR "/fabric" + + +#ifdef __cplusplus +} +#endif + +#endif /* _FI_H_ */ diff --git a/include/infiniband/ib.h b/include/infiniband/ib.h new file mode 100644 index 00000000000..2e5029ac29b --- /dev/null +++ b/include/infiniband/ib.h @@ -0,0 +1,107 @@ +/* + * Copyright (c) 2010 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#if !defined(_RDMA_IB_H) +#define _RDMA_IB_H + +#include <linux/types.h> +#include <string.h> + +#ifndef AF_IB +#define AF_IB 27 +#endif +#ifndef PF_IB +#define PF_IB AF_IB +#endif + +#ifndef __be16 +#define __be16 __u16 +#endif +#ifndef __be32 +#define __be32 __u32 +#endif +#ifndef __be64 +#define __be64 __u64 +#endif + +struct ib_addr { + union { + __u8 uib_addr8[16]; + __be16 uib_addr16[8]; + __be32 uib_addr32[4]; + __be64 uib_addr64[2]; + } ib_u; +#define sib_addr8 ib_u.uib_addr8 +#define sib_addr16 ib_u.uib_addr16 +#define sib_addr32 ib_u.uib_addr32 +#define sib_addr64 ib_u.uib_addr64 +#define sib_raw ib_u.uib_addr8 +#define sib_subnet_prefix ib_u.uib_addr64[0] +#define sib_interface_id ib_u.uib_addr64[1] +}; + +static inline int ib_addr_any(const struct ib_addr *a) +{ + return ((a->sib_addr64[0] | a->sib_addr64[1]) == 0); +} + +static inline int ib_addr_loopback(const struct ib_addr *a) +{ + return ((a->sib_addr32[0] | a->sib_addr32[1] | + a->sib_addr32[2] | (a->sib_addr32[3] ^ htonl(1))) == 0); +} + +static inline void ib_addr_set(struct ib_addr *addr, + __be32 w1, __be32 w2, __be32 w3, __be32 w4) +{ + addr->sib_addr32[0] = w1; + addr->sib_addr32[1] = w2; + addr->sib_addr32[2] = w3; + addr->sib_addr32[3] = w4; +} + +static inline int ib_addr_cmp(const struct ib_addr *a1, const struct ib_addr *a2) +{ + return memcmp(a1, a2, sizeof(struct ib_addr)); +} + +struct sockaddr_ib { + unsigned short int sib_family; /* AF_IB */ + __be16 sib_pkey; + __be32 sib_flowinfo; + struct ib_addr sib_addr; + __be64 sib_sid; + __be64 sib_sid_mask; + __u64 sib_scope_id; +}; + +#endif /* _RDMA_IB_H */ diff --git a/include/rdma/fabric.h b/include/rdma/fabric.h new file mode 100644 index 00000000000..2fa6c84201c --- /dev/null +++ b/include/rdma/fabric.h @@ -0,0 +1,387 @@ +/* + * Copyright (c) 2013 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenFabrics.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
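For illustration, a sockaddr_ib for the IB loopback address can be built with the helpers above; the pkey value and the zeroed service ID fields are placeholders, since real values come from the fabric.

#include <string.h>
#include <arpa/inet.h>
#include <infiniband/ib.h>

static void example_loopback_sib(struct sockaddr_ib *sib)
{
	memset(sib, 0, sizeof *sib);
	sib->sib_family = AF_IB;
	sib->sib_pkey = 0xFFFF;		/* placeholder: default partition */
	/* ::1-style layout; ib_addr_loopback(&sib->sib_addr) now returns nonzero */
	ib_addr_set(&sib->sib_addr, 0, 0, 0, htonl(1));
}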
+ */ + +#ifndef _FABRIC_H_ +#define _FABRIC_H_ + +#include <stdint.h> +#include <stddef.h> +#include <assert.h> +#include <sys/socket.h> +#include <assert.h> + +#ifdef __cplusplus +extern "C" { +#endif + + +typedef uint16_t be16_t; +typedef uint32_t be32_t; +typedef uint64_t be64_t; + +#ifndef container_of +#define container_of(ptr, type, field) \ + ((type *) ((char *)ptr - offsetof(type, field))) +#endif + +enum { + FI_PATH_MAX = 256, + FI_NAME_MAX = 64, + FI_VERSION_MAX = 64 +}; + +/* fi_info and operation flags - pass into socket ops calls. + * A user may also set these on a socket by using fcntl, which has the + * affect of applying them to all applicable operations. + */ + +/* PASSIVE - Indicates that the allocated socket will be used + * to listen for connection requests. + * fi_info + */ +#define FI_PASSIVE (1ULL << 0) +/* NUMERICHOST - The node parameter passed into fi_getinfo is a + * numeric IP address or GID. When set, name resolution is not + * performed. + * fi_info + */ +#define FI_NUMERICHOST (1ULL << 1) +/* FAMILY - If set, then the node parameter passed into fi_getinfo + * is encoded address. The format of the address is given by the + * sa_family field in fi_info. This flag is needed by providers + * in order to determine if an address is an IPv6 or GID based + * address. + * fi_info + */ +//#define FI_FAMILY (1ULL << 2) + +/* AUTO_RESET - automatically resets the event queue to generate + * a new wake-up event on the next entry. Example use: + * 1. wait on eq wait object -- poll(fd) + * 2. wait object is ready -- fd is readable + * 3. read eq to retrieve events + * 4. continue reading until read returns 0 + */ +#define FI_AUTO_RESET (1ULL << 7) + +/* fi_info type, fcntl, fi_open flags */ + +/* Reserve lower 8-bits for type selection + * fi_info type, fi_open, fcntl + */ +#define FI_NONBLOCK (1ULL << 8) +/* Reserve lower 8-bits for type selection + * fi_info type, fi_open, fcntl + */ +#define FI_SYNC (1ULL << 9) +/* EXCL - Indicates that the specified domain should not share + * resources with another opened domain. By default, resources + * associated with a resource domain are shared across all open + * calls by the same process. + * reserve lower 8-bits for type selection + * fi_info type, fi_open, fcntl + */ +#define FI_EXCL (1ULL << 10) +/* BUFFERED_RECV - If set, the provider should attempt to queue inbound + * data that arrives before a receive buffer has been posted. In the + * absence of this flag, any messages that arrive before a receive is + * posted are lost. + * When set, the user must use struct fi_context * as their per + * operation context. + * reserve lower 8-bits for type selection + * fi_info type, fi_open, fcntl + */ +/* TODO: Should buffered be its own bit */ +#define FI_BUFFERED_RECV (1ULL << 11) +/* CANCEL - Indicates that the user wants the ability to cancel + * the operation if it does not complete first. Providers use this + * to return a handle to the request, which the user may then cancel. + * Also used by search to indicate that a request should be canceled. + * fi_info type, fi_open, fcntl, data transfer ops + */ +#define FI_CANCEL (1ULL << 12) +/* SHARED_RECV - A socket created with this flag will share the same + * receive queue as other sockets created on the same domain. + * fi_info type, fi_open, fcntl + */ +/* TODO: should shared be its own bit? */ +#define FI_SHARED_RECV (1ULL << 13) +/* READ - Used to enable read access to data buffers. + */ +#define FI_READ (1ULL << 14) +/* WRITE - Used to enable write access to data buffers. 
+ */ +#define FI_WRITE (1ULL << 15) +/* RECV - Report recv completion EQs + */ +/* TODO: Use with buffered_recv / shared_recv? */ +#define FI_RECV (1ULL << 16) +/* SEND - Report send completion EQs + */ +/* TODO: Use with buffered_send? */ +#define FI_SEND (1ULL << 17) + +/* fcntl and data transfer ops */ + +#define FI_DONTWAIT FI_NONBLOCK +#define FI_PEEK (1ULL << 25) +/* ERRQUEUE - A read operation should retrieve any queued error data. + * In the case of a failure, a read operation may return an error code, + * indicating that an operation has failed and extended error data is + * available. Queued error data must be read before additional + * completions may be read. + * + * Added eq.readerr call, which should eliminate the need for this. + */ +#define FI_ERRQUEUE (1ULL << 26) +/* TRUNC - Signals that received data has been truncated. + */ +#define FI_TRUNC (1ULL << 27) +/* CTRUNC - Indicates that control data was truncated. Use case? + */ +#define FI_CTRUNC (1ULL << 28) +#define FI_ATRUNC (1ULL << 29) +/* IMM - Indicates that immediate data is available. IMM data is + * communicated to a receiver through completion data, rather than + * appearing in targeted receive buffers. + */ +#define FI_IMM (1ULL << 30) +/* NOCOMP - Indicates that no completion should be generated for the + * specified operation. + */ +#define FI_NOCOMP (1ULL << 31) +/* MORE: Indicates that additional requests are pending. Providers may + * use this to optimize access to hardware. + */ +#define FI_MORE (1ULL << 32) +/* SIGNAL - Indicates if a completion event should be generated. + */ +#define FI_SIGNAL (1ULL << 33) +/* BUFFERED_SEND - If set, the outbound data buffer should be returned + * to user immediately after the call returns, even if the operation is + * handled asynchronously. This may require that the provider copy + * the data into a local buffer and transfer out of that buffer. + */ +#define FI_BUFFERED_SEND (1ULL << 34) +/* ACK - Indicates that a completion event is not generated until the operation + * initiated is acknowledged by the remote side */ +#define FI_ACK (1ULL << 35) + +/* ERRINLINE - Error events are reported inline with other events, rather + * than through a separate error queue (see ERRQUEUE). + */ +#define FI_ERRINLINE (1ULL << 36) +/* REMOTE - Indicates remote access + */ +#define FI_REMOTE (1ULL << 37) + + +/* + * Format for 'vectored' data transfer calls: sendv, writev, etc. + */ +enum fi_iov_format { + FI_IOV, /* struct iovec */ + FI_IOMV, /* struct fi_iomv */ + FI_IOTAGGED, /* struct fi_iotagged */ + FI_IOTAGGEDV, /* struct fi_iotaggedv */ +}; + +/* + * Format for transport addresses: sendto, writeto, etc. + */ +enum fi_addr_format { + FI_ADDR, /* void * fi_addr */ + FI_AV, /* struct fi_av_addr */ + FI_ADDR_INDEX, /* size_t fi_addr */ + FI_INFO_ADDR, /* struct fi_info_addr */ + FI_SOCKADDR, /* struct sockaddr */ + FI_SOCKADDR_IN, /* struct sockaddr_in */ + FI_SOCKADDR_IN6, /* struct sockaddr_in6 */ + FI_SOCKADDR_IB, /* struct sockaddr_ib */ +}; + +struct fi_info { + struct fi_info *next; + size_t size; + uint64_t flags; + uint64_t type; + uint64_t protocol; + enum fi_iov_format iov_format; + enum fi_addr_format addr_format; + enum fi_addr_format info_addr_format; + size_t src_addrlen; + size_t dst_addrlen; + void *src_addr; + void *dst_addr; + /*char *src_canonname;*/ + /*char *dst_canonname;*/ + /* Authorization key is intended to limit communication with only + * those sockets sharing the same key. 
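A sketch of how these flags and formats are expected to combine in fi_getinfo() hints; which fields a given provider honors is still open, so the selection below is only an example.

#include <rdma/fabric.h>

static void example_hints(struct fi_info *hints)
{
	/* Listening socket; buffer unexpected messages (requires
	 * struct fi_context per operation); allow cancellation. */
	hints->flags = FI_PASSIVE | FI_BUFFERED_RECV | FI_CANCEL;
	hints->iov_format = FI_IOV;		/* struct iovec transfers */
	hints->addr_format = FI_SOCKADDR_IN;	/* sockaddr_in addressing */
}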
+ */ + size_t auth_keylen; + void *auth_key; + /* A shared_fd is intended to allow a domain to share resources + * and data with other processes that have access to the same + * shared_fd. Based on XRC work. + */ + int shared_fd; + char *domain_name; + size_t datalen; + void *data; +}; + +enum { + FID_CLASS_UNSPEC, + FID_CLASS_SOCKET, + FID_CLASS_RESOURCE_DOMAIN, + FID_CLASS_INTERFACE, + FID_CLASS_AV, + FID_CLASS_MR, + FID_CLASS_EC +}; + +/* See FI_BUFFERED_RECV, FI_CANCEL */ +struct fi_context { + void *internal[4]; +}; + +struct fid; +typedef struct fid *fid_t; + +struct fi_resource { + fid_t fid; + uint64_t flags; +}; + +struct fi_ops { + size_t size; + int (*close)(fid_t fid); + /* Associate resources with this object */ + int (*bind)(fid_t fid, struct fi_resource *fids, int nfids); + /* Operation that completes after all previous async requests complete */ + int (*sync)(fid_t fid, uint64_t flags, void *context); + /* low-level control - similar to fcntl & ioctl operations */ + int (*control)(fid_t fid, int command, void *arg); +}; + +/* All fabric interface descriptors must start with this structure */ +struct fid { + int fclass; + int size; + void *context; + struct fi_ops *ops; +}; + +#define FI_PREFIX "fi" +#define FI_DOMAIN_NAMES "domains" +#define FI_UNBOUND_NAME "local" + +int fi_getinfo(char *node, char *service, struct fi_info *hints, + struct fi_info **info); +void fi_freeinfo(struct fi_info *info); + +/* Either name or info must be provided. Providing both is allowed. */ +int fi_open(char *name, struct fi_info *info, uint64_t flags, + fid_t *fid, void *context); +/* + * Allocate a fabric socket. A fabric socket is a software construct. + */ +int fi_socket(struct fi_info *info, fid_t *fid, void *context); + +#define FI_ASSERT_CLASS(fid, f_class) assert(fid->fclass == f_class) +#define FI_ASSERT_FIELD(ptr, ftype, field) assert(ptr->size > offsetof(ftype, field)) +#define FI_ASSERT_OPS(fid, ftype, ops) FI_ASSERT_FIELD(fid, ftype, ops) +#define FI_ASSERT_OP(ops, otype, op) FI_ASSERT_FIELD(ops, otype, op) + +static inline int fi_close(fid_t fid) +{ + FI_ASSERT_OPS(fid, struct fid, ops); + FI_ASSERT_OP(fid->ops, struct fi_ops, close); + return fid->ops->close(fid); +} +#define fi_destroy(fid) fi_close(fid) + +static inline int fi_bind(fid_t fid, struct fi_resource *fids, int nfids) +{ + FI_ASSERT_OPS(fid, struct fid, ops); + FI_ASSERT_OP(fid->ops, struct fi_ops, bind); + return fid->ops->bind(fid, fids, nfids); +} + +static inline int fi_sync(fid_t fid, uint64_t flags, void *context) +{ + FI_ASSERT_OPS(fid, struct fid, ops); + FI_ASSERT_OP(fid->ops, struct fi_ops, sync); + return fid->ops->sync(fid, flags, context); +} + +/* control commands */ +enum { + FI_GETFIDFLAG, /* uint64_t flags */ + FI_SETFIDFLAG, /* uint64_t flags */ + FI_GETOPSFLAG, /* uint64_t flags */ + FI_SETOPSFLAG, /* uint64_t flags */ + + /* Duplicate a fid_t. This allows for 2 fids that refer to a single + * HW resource. Each fid may reference functions that are optimized + * for different use cases. + */ + FI_DUPFID, /* fid_t * */ + FI_GETECWAIT, /* void * wait object */ + + /* Start/stop an internal progress thread. This is only needed if the + * provider does not support active_progress, and the app does not + * want to poll for progress. + */ + FI_STARTPROGRESS, /* NULL - flags? */ + FI_STOPPROGRESS /* NULL - flags? */ +}; + +/* + * fi_control may be used to set the flags for data transfer operations. This + * is done using the FI_SETOPSFLAG command with arg a uint64_t flags value. 
The + * FI_READ, FI_WRITE, FI_SEND, FI_RECV flags indicate the type of data transfer + * that the flags should apply to, with other flags OR'ed in. + */ +static inline int fi_control(fid_t fid, int command, void *arg) +{ + FI_ASSERT_OPS(fid, struct fid, ops); + FI_ASSERT_OP(fid->ops, struct fi_ops, control); + return fid->ops->control(fid, command, arg); +} + + +#ifdef __cplusplus +} +#endif + +#endif /* _FABRIC_H_ */ diff --git a/include/rdma/fi_arch.h b/include/rdma/fi_arch.h new file mode 100644 index 00000000000..41616e2ff95 --- /dev/null +++ b/include/rdma/fi_arch.h @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2005 Topspin Communications. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _FI_ARCH_H_ +#define _FI_ARCH_H_ + +#include <stdint.h> + +/* + * Architecture-specific defines. Currently, an architecture is + * required to implement the following operations: + * + * mb() - memory barrier. No loads or stores may be reordered across + * this macro by either the compiler or the CPU. + * rmb() - read memory barrier. No loads may be reordered across this + * macro by either the compiler or the CPU. + * wmb() - write memory barrier. No stores may be reordered across + * this macro by either the compiler or the CPU. + * wc_wmb() - flush write combine buffers. No write-combined writes + * will be reordered across this macro by either the compiler or + * the CPU. + */ + +#if defined(__i386__) + +#define mb() asm volatile("lock; addl $0,0(%%esp) " ::: "memory") +#define rmb() mb() +#define wmb() asm volatile("" ::: "memory") +#define wc_wmb() mb() + +#elif defined(__x86_64__) + +/* + * Only use lfence for mb() and rmb() because we don't care about + * ordering against non-temporal stores (for now at least). 
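Pulling the fabric.h pieces together, one plausible call sequence based only on the prototypes above: query providers, allocate a socket from the first result, set default send flags with FI_SETOPSFLAG as described, and clean up. The negative-errno return convention matches examples/provinfo.c.

#include <rdma/fabric.h>

static int example_open(char *node, char *service)
{
	struct fi_info hints = { .flags = FI_PASSIVE };
	struct fi_info *info;
	uint64_t op_flags = FI_SEND | FI_BUFFERED_SEND;
	fid_t sock;
	int ret;

	ret = fi_getinfo(node, service, &hints, &info);
	if (ret)
		return ret;

	ret = fi_socket(info, &sock, NULL);
	if (!ret) {
		/* default flags for subsequent send operations */
		fi_control(sock, FI_SETOPSFLAG, &op_flags);
		fi_close(sock);
	}
	fi_freeinfo(info);
	return ret;
}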
+ */ +#define mb() asm volatile("lfence" ::: "memory") +#define rmb() mb() +#define wmb() asm volatile("" ::: "memory") +#define wc_wmb() asm volatile("sfence" ::: "memory") + +#elif defined(__PPC64__) + +#define mb() asm volatile("sync" ::: "memory") +#define rmb() asm volatile("lwsync" ::: "memory") +#define wmb() mb() +#define wc_wmb() wmb() + +#elif defined(__ia64__) + +#define mb() asm volatile("mf" ::: "memory") +#define rmb() mb() +#define wmb() mb() +#define wc_wmb() asm volatile("fwb" ::: "memory") + +#elif defined(__PPC__) + +#define mb() asm volatile("sync" ::: "memory") +#define rmb() mb() +#define wmb() mb() +#define wc_wmb() wmb() + +#elif defined(__sparc_v9__) + +#define mb() asm volatile("membar #LoadLoad | #LoadStore | #StoreStore | #StoreLoad" ::: "memory") +#define rmb() asm volatile("membar #LoadLoad" ::: "memory") +#define wmb() asm volatile("membar #StoreStore" ::: "memory") +#define wc_wmb() wmb() + +#elif defined(__sparc__) + +#define mb() asm volatile("" ::: "memory") +#define rmb() mb() +#define wmb() mb() +#define wc_wmb() wmb() + +#else + +#warning No architecture specific defines found. Using generic implementation. + +#define mb() asm volatile("" ::: "memory") +#define rmb() mb() +#define wmb() mb() +#define wc_wmb() wmb() + +#endif + +#endif /* _FI_ARCH_H_ */ diff --git a/include/rdma/fi_atomic.h b/include/rdma/fi_atomic.h new file mode 100644 index 00000000000..f5bb994d4c9 --- /dev/null +++ b/include/rdma/fi_atomic.h @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2013 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenFabrics.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _FI_ATOMIC_H_ +#define _FI_ATOMIC_H_ + +#include <rdma/fi_socket.h> + + +#ifdef __cplusplus +extern "C" { +#endif + + +struct fi_ops_atomic { + size_t size; + /* add/compare_swap */ +}; + + +#ifdef __cplusplus +} +#endif + +#endif /* _FI_ATOMIC_H_ */ diff --git a/include/rdma/fi_cm.h b/include/rdma/fi_cm.h new file mode 100644 index 00000000000..1105ad1e5c5 --- /dev/null +++ b/include/rdma/fi_cm.h @@ -0,0 +1,118 @@ +/* + * Copyright (c) 2013 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
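A toy sketch of the barrier macros above in a flag-guarded hand-off between a producer and a consumer; the shared variables are illustrative only.

#include <rdma/fi_arch.h>

static volatile int ready;
static int payload;

static void producer(int value)
{
	payload = value;
	wmb();		/* publish the payload before the flag */
	ready = 1;
}

static int consumer(void)
{
	while (!ready)
		;
	rmb();		/* do not let the payload read pass the flag read */
	return payload;
}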
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenFabrics.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _FI_CM_H_ +#define _FI_CM_H_ + +#include <rdma/fi_socket.h> + + +#ifdef __cplusplus +extern "C" { +#endif + + +struct fi_ops_cm { + size_t size; + int (*getname)(fid_t fid, void *addr, size_t *addrlen); + int (*getpeer)(fid_t fid, void *addr, size_t *addrlen); + int (*connect)(fid_t fid, const void *param, size_t paramlen); + int (*listen)(fid_t fid); + int (*accept)(fid_t fid, const void *param, size_t paramlen); + int (*reject)(fid_t fid, struct fi_info *info, + const void *param, size_t paramlen); + int (*shutdown)(fid_t fid, uint64_t flags); + int (*join)(fid_t fid, void *addr, void **fi_addr, uint64_t flags); + int (*leave)(fid_t fid, void *addr, void *fi_addr, uint64_t flags); +}; + +static inline int fi_getsockname(fid_t fid, void *addr, size_t *addrlen) +{ + struct fid_socket *sock = container_of(fid, struct fid_socket, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_SOCKET); + FI_ASSERT_OPS(fid, struct fid_socket, cm); + FI_ASSERT_OP(sock->cm, struct fi_ops_cm, getname); + return sock->cm->getname(fid, addr, addrlen); +} + +static inline int fi_listen(fid_t fid) +{ + struct fid_socket *sock = container_of(fid, struct fid_socket, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_SOCKET); + FI_ASSERT_OPS(fid, struct fid_socket, cm); + FI_ASSERT_OP(sock->cm, struct fi_ops_cm, listen); + return sock->cm->listen(fid); +} + +static inline int fi_connect(fid_t fid, const void *param, size_t paramlen) +{ + struct fid_socket *sock = container_of(fid, struct fid_socket, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_SOCKET); + FI_ASSERT_OPS(fid, struct fid_socket, cm); + FI_ASSERT_OP(sock->cm, struct fi_ops_cm, connect); + return sock->cm->connect(fid, param, paramlen); +} + +static inline int fi_accept(fid_t fid, const void *param, size_t paramlen) +{ + struct fid_socket *sock = container_of(fid, struct fid_socket, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_SOCKET); + FI_ASSERT_OPS(fid, struct fid_socket, cm); + FI_ASSERT_OP(sock->cm, struct fi_ops_cm, accept); + return sock->cm->accept(fid, param, paramlen); +} + +static inline int fi_reject(fid_t fid, struct fi_info *info, + const void *param, size_t paramlen) +{ + struct fid_socket *sock = container_of(fid, struct fid_socket, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_SOCKET); + FI_ASSERT_OPS(fid, struct fid_socket, cm); + 
FI_ASSERT_OP(sock->cm, struct fi_ops_cm, reject); + return sock->cm->reject(fid, info, param, paramlen); +} + +static inline int fi_shutdown(fid_t fid, uint64_t flags) +{ + struct fid_socket *sock = container_of(fid, struct fid_socket, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_SOCKET); + FI_ASSERT_OPS(fid, struct fid_socket, cm); + FI_ASSERT_OP(sock->cm, struct fi_ops_cm, shutdown); + return sock->cm->shutdown(fid, flags); +} + + +#ifdef __cplusplus +} +#endif + +#endif /* _FI_CM_H_ */ diff --git a/include/rdma/fi_domain.h b/include/rdma/fi_domain.h new file mode 100644 index 00000000000..152041cc2e9 --- /dev/null +++ b/include/rdma/fi_domain.h @@ -0,0 +1,434 @@ +/* + * Copyright (c) 2013 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenFabrics.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _FI_DOMAIN_H_ +#define _FI_DOMAIN_H_ + +#include <rdma/fabric.h> + + +#ifdef __cplusplus +extern "C" { +#endif + + +struct fi_iomv { + void *addr; + size_t len; + uint64_t mem_desc; +}; + +/* TODO: Will this be used? */ +struct fi_iotagged { + uint64_t itag_addr; + be64_t itag_tag; + be64_t itag_mask; +}; + +/* TODO: Will this be used? */ +struct fi_iotaggedv { + uint64_t itag_addr; + be64_t itag_tag; + be64_t itag_mask; + uint64_t itag_desc; +}; + +/* + * AV = Address Vector + * Maps and stores transport/network addresses. + */ + +struct fi_av_addr { + fid_t av; + uint64_t av_index; +}; + +enum fi_av_type { + FI_AV_MAP, + FI_AV_TABLE +}; + +enum { + FI_AV_ATTR_TYPE = 1 << 0, + FI_AV_ATTR_ADDR_FORMAT = 1 << 1, + FI_AV_ATTR_ADDRLEN = 1 << 2, + FI_AV_ATTR_SIZE = 1 << 3, + FI_AV_ATTR_FLAGS = 1 << 4, + FI_AV_ATTR_MASK_V1 = (FI_AV_ATTR_FLAGS << 1) - 1 +}; + +struct fi_av_attr { + int av_mask; + enum fi_av_type type; + enum fi_addr_format addr_format; + size_t addrlen; + size_t count; + uint64_t flags; +}; + +struct fi_ops_av { + size_t size; + int (*insert)(fid_t fid, const void *addr, size_t count, + void **fi_addr, uint64_t flags); + int (*remove)(fid_t fid, void *fi_addr, size_t count, + uint64_t flags); +}; + +struct fid_av { + struct fid fid; + struct fi_ops_av *ops; +}; + + +/* + * MR = Memory Region + * Tracks registered memory regions, primarily for remote access, + * but also for local access until we can remove that need. 
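The connection-management wrappers above split naturally into an active and a passive side; a minimal sketch follows, with event handling (waiting for FI_CONNREQ/FI_CONNECTED on an event collector) omitted and the private data purely illustrative.

#include <rdma/fi_cm.h>

static int example_client(fid_t sock)
{
	const char greeting[] = "hello";

	return fi_connect(sock, greeting, sizeof greeting);
}

static int example_server(fid_t passive_sock)
{
	/* ...retrieve the FI_CONNREQ event, open a socket for the new
	 * connection, then fi_accept() on that socket... */
	return fi_listen(passive_sock);
}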
+ */ +struct fid_mr { + struct fid fid; + uint64_t mem_desc; + be64_t key; +}; + + +/* + * EC = Event Collector + * Used to report various events and the completion of asynchronous + * operations. + */ +enum fi_ec_domain { + FI_EC_DOMAIN_GENERAL, + FI_EC_DOMAIN_COMP, + FI_EC_DOMAIN_CM, + FI_EC_DOMAIN_AV +}; + +enum fi_ec_type { + FI_EC_QUEUE, + FI_EC_COUNTER +}; + +enum fi_ec_format { + FI_EC_FORMAT_UNSPEC, + FI_EC_FORMAT_CONTEXT, + FI_EC_FORMAT_COMP, + FI_EC_FORMAT_DATA, + FI_EC_FORMAT_TAGGED, + FI_EC_FORMAT_ERR, + FI_EC_FORMAT_CM +}; + +/* Use fi_control GETECWAIT to get underlying wait object */ +enum fi_ec_wait_obj { + FI_EC_WAIT_NONE, + FI_EC_WAIT_FD +}; + +enum fi_ec_wait_cond { + FI_EC_COND_NONE, + FI_EC_COND_THRESHOLD /* size_t threshold */ +}; + +enum { + FI_EC_ATTR_DOMAIN = 1 << 0, + FI_EC_ATTR_TYPE = 1 << 1, + FI_EC_ATTR_FORMAT = 1 << 2, + FI_EC_ATTR_WAIT_OBJ = 1 << 3, + FI_EC_ATTR_WAIT_COND = 1 << 4, + FI_EC_ATTR_SIZE = 1 << 5, + FI_EC_ATTR_VECTOR = 1 << 6, + FI_EC_ATTR_FLAGS = 1 << 7, + FI_EC_ATTR_COND = 1 << 8, + FI_EC_ATTR_MASK_V1 = (FI_EC_ATTR_COND << 1) - 1 +}; + +struct fi_ec_attr { + int ec_mask; + enum fi_ec_domain domain; + enum fi_ec_type type; + enum fi_ec_format format; + enum fi_ec_wait_obj wait_obj; + enum fi_ec_wait_cond wait_cond; + size_t size; + int signaling_vector; + uint64_t flags; + /* If AUTO_RESET is enabled, and wait_cond is not NONE */ + void *cond; +}; + +struct fi_ec_entry { + void *op_context; +}; + +struct fi_ec_comp_entry { + void *op_context; + uint64_t flags; + size_t len; +}; + +struct fi_ec_data_entry { + void *op_context; + void *buf; + uint64_t flags; + size_t len; + /* data depends on operation and/or flags - e.g. immediate data */ + uint64_t data; +}; + +struct fi_ec_tagged_entry { + void *op_context; + void *buf; + uint64_t flags; + size_t len; + uint64_t data; + uint64_t tag; + size_t olen; +}; + +struct fi_ec_err_entry { + void *fid_context; + void *op_context; + uint64_t flags; + int err; + int prov_errno; + uint64_t data; + /* prov_data is available until the next time the EQ is read */ + void *prov_data; +}; + +enum fi_cm_event { + FI_CONNREQ, + FI_CONNECTED, + FI_SHUTDOWN +}; + +struct fi_ec_cm_entry { + void *fid_context; + uint64_t flags; + enum fi_cm_event event; + /* user must call fi_freeinfo to release info */ + struct fi_info *info; + /* connection data placed here, up to space provided */ + uint8_t data[0]; +}; + +struct fi_ops_ec { + size_t size; + ssize_t (*read)(fid_t fid, void *buf, size_t len); + ssize_t (*readfrom)(fid_t fid, void *buf, size_t len, + void *src_addr, size_t *addrlen); + ssize_t (*readerr)(fid_t fid, void *buf, size_t len, uint64_t flags); + ssize_t (*write)(fid_t fid, void *buf, size_t len); + int (*reset)(fid_t fid, void *cond); + ssize_t (*condread)(fid_t fid, void *buf, size_t len, void *cond); + ssize_t (*condreadfrom)(fid_t fid, void *buf, size_t len, + void *src_addr, size_t *addrlen, void *cond); + const char * (*strerror)(fid_t fid, int prov_errno, void *prov_data, + void *buf, size_t len); +}; + +struct fid_ec { + struct fid fid; + struct fi_ops_ec *ops; +}; + + +enum fi_progress { + FI_PROGRESS_AUTO, + FI_PROGRESS_INDIRECT, /* progress possible through any domain call */ + FI_PROGRESS_EXPLICIT /* user must explicitly request progress */ +}; + +/* + * The thought is that domain attributes should be relative to what it can + * provide to the applications, and is not intended as a set of available + * hardware limits. 
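A rough polling loop against the event collector interfaces above, using the fi_ec_* wrappers defined later in this header; the attribute selection and the assumption that a zero return means the queue is empty are both illustrative.

#include <stdio.h>
#include <rdma/fi_domain.h>

static int example_poll(fid_t domain)
{
	struct fi_ec_attr attr = {
		.ec_mask = FI_EC_ATTR_DOMAIN | FI_EC_ATTR_TYPE |
			   FI_EC_ATTR_FORMAT | FI_EC_ATTR_SIZE,
		.domain = FI_EC_DOMAIN_COMP,
		.type = FI_EC_QUEUE,
		.format = FI_EC_FORMAT_COMP,
		.size = 64,
	};
	struct fi_ec_comp_entry comp;
	struct fi_ec_err_entry err;
	fid_t ec;
	ssize_t ret;

	ret = fi_ec_open(domain, &attr, &ec, NULL);
	if (ret)
		return (int) ret;

	for (;;) {
		ret = fi_ec_read(ec, &comp, sizeof comp);
		if (ret > 0)
			break;		/* completion retrieved */
		if (ret < 0) {
			fi_ec_readerr(ec, &err, sizeof err, 0);
			fprintf(stderr, "ec error %d\n", err.err);
			break;
		}
		/* 0: nothing queued yet, poll again */
	}
	fi_close(ec);
	return 0;
}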
+ */ +struct fi_domain_attr { + /* Note to providers: set prov_attr to static struct */ + size_t prov_attr_size; + void *prov_attr; + size_t max_auth_key_size; + enum fi_progress progress; +}; + +struct fi_ops_domain { + size_t size; + int (*progress)(fid_t fid); + int (*query)(fid_t fid, struct fi_domain_attr *attr, size_t *attrlen); + int (*av_open)(fid_t fid, struct fi_av_attr *attr, fid_t *av, + void *context); + int (*ec_open)(fid_t fid, struct fi_ec_attr *attr, fid_t *ec, + void *context); + int (*mr_reg)(fid_t fid, const void *buf, size_t len, fid_t *mr, + uint64_t flags, void *context); + int (*mr_regv)(fid_t fid, const struct iovec *iov, size_t count, + fid_t *mr, uint64_t flags, void *context); +}; + +struct fid_domain { + struct fid fid; + struct fi_ops_domain *ops; +}; + +static inline int fi_ec_open(fid_t fid, struct fi_ec_attr *attr, fid_t *ec, + void *context) +{ + struct fid_domain *domain = container_of(fid, struct fid_domain, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_RESOURCE_DOMAIN); + FI_ASSERT_OPS(fid, struct fid_domain, ops); + FI_ASSERT_OP(domain->ops, struct fi_ops_domain, ec_open); + return domain->ops->ec_open(fid, attr, ec, context); +} + +static inline ssize_t fi_ec_read(fid_t fid, void *buf, size_t len) +{ + struct fid_ec *ec = container_of(fid, struct fid_ec, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_EC); + FI_ASSERT_OPS(fid, struct fid_ec, ops); + FI_ASSERT_OP(ec->ops, struct fi_ops_ec, read); + return ec->ops->read(fid, buf, len); +} + +static inline ssize_t fi_ec_readfrom(fid_t fid, void *buf, size_t len, + void *src_addr, size_t *addrlen) +{ + struct fid_ec *ec = container_of(fid, struct fid_ec, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_EC); + FI_ASSERT_OPS(fid, struct fid_ec, ops); + FI_ASSERT_OP(ec->ops, struct fi_ops_ec, readfrom); + return ec->ops->readfrom(fid, buf, len, src_addr, addrlen); +} + +static inline ssize_t fi_ec_readerr(fid_t fid, void *buf, size_t len, uint64_t flags) +{ + struct fid_ec *ec = container_of(fid, struct fid_ec, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_EC); + FI_ASSERT_OPS(fid, struct fid_ec, ops); + FI_ASSERT_OP(ec->ops, struct fi_ops_ec, readerr); + return ec->ops->readerr(fid, buf, len, flags); +} + +static inline int fi_ec_reset(fid_t fid, void *cond) +{ + struct fid_ec *ec = container_of(fid, struct fid_ec, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_EC); + FI_ASSERT_OPS(fid, struct fid_ec, ops); + FI_ASSERT_OP(ec->ops, struct fi_ops_ec, reset); + return ec->ops->reset(fid, cond); +} + +static inline const char * fi_ec_strerror(fid_t fid, int prov_errno, void *prov_data, + void *buf, size_t len) +{ + struct fid_ec *ec = container_of(fid, struct fid_ec, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_EC); + FI_ASSERT_OPS(fid, struct fid_ec, ops); + FI_ASSERT_OP(ec->ops, struct fi_ops_ec, strerror); + return ec->ops->strerror(fid, prov_errno, prov_data, buf, len); +} + +static inline int fi_mr_reg(fid_t fid, const void *buf, size_t len, + fid_t *mr, uint64_t flags, void *context) +{ + struct fid_domain *domain = container_of(fid, struct fid_domain, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_RESOURCE_DOMAIN); + FI_ASSERT_OPS(fid, struct fid_domain, ops); + FI_ASSERT_OP(domain->ops, struct fi_ops_domain, mr_reg); + return domain->ops->mr_reg(fid, buf, len, mr, flags, context); +} + +static inline uint64_t fi_mr_desc(fid_t fid) +{ + struct fid_mr *mr = container_of(fid, struct fid_mr, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_MR); + FI_ASSERT_FIELD(fid, struct fid_mr, mem_desc); + return mr->mem_desc; +} + +static inline be64_t fi_mr_key(fid_t fid) +{ + 
struct fid_mr *mr = container_of(fid, struct fid_mr, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_MR); + FI_ASSERT_FIELD(fid, struct fid_mr, key); + return mr->key; +} + +static inline int fi_mr_unreg(fid_t fid) +{ + FI_ASSERT_CLASS(fid, FID_CLASS_MR); + return fi_close(fid); +} + +static inline int fi_av_open(fid_t fid, struct fi_av_attr *attr, fid_t *av, + void *context) +{ + struct fid_domain *domain = container_of(fid, struct fid_domain, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_RESOURCE_DOMAIN); + FI_ASSERT_OPS(fid, struct fid_domain, ops); + FI_ASSERT_OP(domain->ops, struct fi_ops_domain, av_open); + return domain->ops->av_open(fid, attr, av, context); +} + +static inline int fi_av_map(fid_t fid, const void *addr, size_t count, + void **fi_addr, uint64_t flags) +{ + struct fid_av *av = container_of(fid, struct fid_av, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_AV); + FI_ASSERT_OPS(fid, struct fid_av, ops); + FI_ASSERT_OP(av->ops, struct fi_ops_av, insert); + return av->ops->insert(fid, addr, count, fi_addr, flags); +} + +static inline int fi_av_unmap(fid_t fid, void *fi_addr, size_t count, + uint64_t flags) +{ + struct fid_av *av = container_of(fid, struct fid_av, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_AV); + FI_ASSERT_OPS(fid, struct fid_av, ops); + FI_ASSERT_OP(av->ops, struct fi_ops_av, remove); + return av->ops->remove(fid, fi_addr, count, flags); +} + +static inline int fi_av_sync(fid_t fid, uint64_t flags, void *context) +{ + FI_ASSERT_CLASS(fid, FID_CLASS_AV); + return fi_sync(fid, flags, context); +} + + +#ifdef __cplusplus +} +#endif + +#endif /* _FI_DOMAIN_H_ */ diff --git a/include/rdma/fi_errno.h b/include/rdma/fi_errno.h new file mode 100644 index 00000000000..980f108d133 --- /dev/null +++ b/include/rdma/fi_errno.h @@ -0,0 +1,179 @@ +/* + * Copyright (c) 2013 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenFabrics.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
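Memory registration and address vectors tie into the domain the same way; a minimal sketch using the wrappers above, where domain is assumed to be an open resource domain and peers holds up to 16 sockaddr_in entries.

#include <netinet/in.h>
#include <rdma/fi_domain.h>

static int example_mr_av(fid_t domain, void *buf, size_t len,
			 struct sockaddr_in *peers, size_t npeers)
{
	struct fi_av_attr av_attr = {
		.av_mask = FI_AV_ATTR_TYPE | FI_AV_ATTR_ADDR_FORMAT,
		.type = FI_AV_MAP,
		.addr_format = FI_SOCKADDR_IN,
	};
	void *fi_addrs[16];
	fid_t mr, av;
	int ret;

	if (npeers > 16)
		npeers = 16;

	ret = fi_mr_reg(domain, buf, len, &mr, FI_READ | FI_WRITE, NULL);
	if (ret)
		return ret;
	/* fi_mr_desc(mr) and fi_mr_key(mr) feed the data transfer calls */

	ret = fi_av_open(domain, &av_attr, &av, NULL);
	if (!ret) {
		ret = fi_av_map(av, peers, npeers, fi_addrs, 0);
		fi_close(av);
	}
	fi_mr_unreg(mr);
	return ret;
}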
+ */
+
+#ifndef _FI_ERRNO_H_
+#define _FI_ERRNO_H_
+
+#include <errno.h>
+
+/* FI directly mapped errno values */
+
+#define FI_EPERM EPERM /* Operation not permitted */
+#define FI_ENOENT ENOENT /* No such file or directory */
+#define FI_ESRCH ESRCH /* No such process */
+#define FI_EINTR EINTR /* Interrupted system call */
+#define FI_EIO EIO /* I/O error */
+#define FI_ENXIO ENXIO /* No such device or address */
+#define FI_E2BIG E2BIG /* Argument list too long */
+#define FI_ENOEXEC ENOEXEC /* Exec format error */
+#define FI_EBADF EBADF /* Bad file number */
+#define FI_ECHILD ECHILD /* No child processes */
+#define FI_EAGAIN EAGAIN /* Try again */
+#define FI_ENOMEM ENOMEM /* Out of memory */
+#define FI_EACCES EACCES /* Permission denied */
+#define FI_EFAULT EFAULT /* Bad address */
+#define FI_ENOTBLK ENOTBLK /* Block device required */
+#define FI_EBUSY EBUSY /* Device or resource busy */
+#define FI_EEXIST EEXIST /* File exists */
+#define FI_EXDEV EXDEV /* Cross-device link */
+#define FI_ENODEV ENODEV /* No such device */
+#define FI_ENOTDIR ENOTDIR /* Not a directory */
+#define FI_EISDIR EISDIR /* Is a directory */
+#define FI_EINVAL EINVAL /* Invalid argument */
+#define FI_ENFILE ENFILE /* File table overflow */
+#define FI_EMFILE EMFILE /* Too many open files */
+#define FI_ENOTTY ENOTTY /* Not a typewriter */
+#define FI_ETXTBSY ETXTBSY /* Text file busy */
+#define FI_EFBIG EFBIG /* File too large */
+#define FI_ENOSPC ENOSPC /* No space left on device */
+#define FI_ESPIPE ESPIPE /* Illegal seek */
+#define FI_EROFS EROFS /* Read-only file system */
+#define FI_EMLINK EMLINK /* Too many links */
+#define FI_EPIPE EPIPE /* Broken pipe */
+#define FI_EDOM EDOM /* Math argument out of domain of func */
+#define FI_ERANGE ERANGE /* Math result not representable */
+#define FI_EDEADLK EDEADLK /* Resource deadlock would occur */
+#define FI_ENAMETOOLONG ENAMETOOLONG /* File name too long */
+#define FI_ENOLCK ENOLCK /* No record locks available */
+#define FI_ENOSYS ENOSYS /* Function not implemented */
+#define FI_ENOTEMPTY ENOTEMPTY /* Directory not empty */
+#define FI_ELOOP ELOOP /* Too many symbolic links encountered */
+#define FI_EWOULDBLOCK EWOULDBLOCK /* Operation would block */
+#define FI_ENOMSG ENOMSG /* No message of desired type */
+#define FI_EIDRM EIDRM /* Identifier removed */
+#define FI_ECHRNG ECHRNG /* Channel number out of range */
+#define FI_EL2NSYNC EL2NSYNC /* Level 2 not synchronized */
+#define FI_EL3HLT EL3HLT /* Level 3 halted */
+#define FI_EL3RST EL3RST /* Level 3 reset */
+#define FI_ELNRNG ELNRNG /* Link number out of range */
+#define FI_EUNATCH EUNATCH /* Protocol driver not attached */
+#define FI_ENOCSI ENOCSI /* No CSI structure available */
+#define FI_EL2HLT EL2HLT /* Level 2 halted */
+#define FI_EBADE EBADE /* Invalid exchange */
+#define FI_EBADR EBADR /* Invalid request descriptor */
+#define FI_EXFULL EXFULL /* Exchange full */
+#define FI_ENOANO ENOANO /* No anode */
+#define FI_EBADRQC EBADRQC /* Invalid request code */
+#define FI_EBADSLT EBADSLT /* Invalid slot */
+#define FI_EDEADLOCK EDEADLOCK /* Resource deadlock would occur */
+#define FI_EBFONT EBFONT /* Bad font file format */
+#define FI_ENOSTR ENOSTR /* Device not a stream */
+#define FI_ENODATA ENODATA /* No data available */
+#define FI_ETIME ETIME /* Timer expired */
+#define FI_ENOSR ENOSR /* Out of streams resources */
+#define FI_ENONET ENONET /* Machine is not on the network */
+#define FI_ENOPKG ENOPKG /* Package not installed */
+#define FI_EREMOTE EREMOTE
/* Object is remote */
+#define FI_ENOLINK ENOLINK /* Link has been severed */
+#define FI_EADV EADV /* Advertise error */
+#define FI_ESRMNT ESRMNT /* Srmount error */
+#define FI_ECOMM ECOMM /* Communication error on send */
+#define FI_EPROTO EPROTO /* Protocol error */
+#define FI_EMULTIHOP EMULTIHOP /* Multihop attempted */
+#define FI_EDOTDOT EDOTDOT /* RFS specific error */
+#define FI_EBADMSG EBADMSG /* Not a data message */
+#define FI_EOVERFLOW EOVERFLOW /* Value too large for defined data type */
+#define FI_ENOTUNIQ ENOTUNIQ /* Name not unique on network */
+#define FI_EBADFD EBADFD /* File descriptor in bad state */
+#define FI_EREMCHG EREMCHG /* Remote address changed */
+#define FI_ELIBACC ELIBACC /* Can not access a needed shared library */
+#define FI_ELIBBAD ELIBBAD /* Accessing a corrupted shared library */
+#define FI_ELIBSCN ELIBSCN /* .lib section in a.out corrupted */
+#define FI_ELIBMAX ELIBMAX /* Attempting to link in too many shared libraries */
+#define FI_ELIBEXEC ELIBEXEC /* Cannot exec a shared library directly */
+#define FI_EILSEQ EILSEQ /* Illegal byte sequence */
+#define FI_ERESTART ERESTART /* Interrupted system call should be restarted */
+#define FI_ESTRPIPE ESTRPIPE /* Streams pipe error */
+#define FI_EUSERS EUSERS /* Too many users */
+#define FI_ENOTSOCK ENOTSOCK /* Socket operation on non-socket */
+#define FI_EDESTADDRREQ EDESTADDRREQ /* Destination address required */
+#define FI_EMSGSIZE EMSGSIZE /* Message too long */
+#define FI_EPROTOTYPE EPROTOTYPE /* Protocol wrong type for socket */
+#define FI_ENOPROTOOPT ENOPROTOOPT /* Protocol not available */
+#define FI_EPROTONOSUPPORT EPROTONOSUPPORT /* Protocol not supported */
+#define FI_ESOCKTNOSUPPORT ESOCKTNOSUPPORT /* Socket type not supported */
+#define FI_EOPNOTSUPP EOPNOTSUPP /* Operation not supported on transport endpoint */
+#define FI_EPFNOSUPPORT EPFNOSUPPORT /* Protocol family not supported */
+#define FI_EAFNOSUPPORT EAFNOSUPPORT /* Address family not supported by protocol */
+#define FI_EADDRINUSE EADDRINUSE /* Address already in use */
+#define FI_EADDRNOTAVAIL EADDRNOTAVAIL /* Cannot assign requested address */
+#define FI_ENETDOWN ENETDOWN /* Network is down */
+#define FI_ENETUNREACH ENETUNREACH /* Network is unreachable */
+#define FI_ENETRESET ENETRESET /* Network dropped connection because of reset */
+#define FI_ECONNABORTED ECONNABORTED /* Software caused connection abort */
+#define FI_ECONNRESET ECONNRESET /* Connection reset by peer */
+#define FI_ENOBUFS ENOBUFS /* No buffer space available */
+#define FI_EISCONN EISCONN /* Transport endpoint is already connected */
+#define FI_ENOTCONN ENOTCONN /* Transport endpoint is not connected */
+#define FI_ESHUTDOWN ESHUTDOWN /* Cannot send after transport endpoint shutdown */
+#define FI_ETOOMANYREFS ETOOMANYREFS /* Too many references: cannot splice */
+#define FI_ETIMEDOUT ETIMEDOUT /* Connection timed out */
+#define FI_ECONNREFUSED ECONNREFUSED /* Connection refused */
+#define FI_EHOSTDOWN EHOSTDOWN /* Host is down */
+#define FI_EHOSTUNREACH EHOSTUNREACH /* No route to host */
+#define FI_EALREADY EALREADY /* Operation already in progress */
+#define FI_EINPROGRESS EINPROGRESS /* Operation now in progress */
+#define FI_ESTALE ESTALE /* Stale NFS file handle */
+#define FI_EUCLEAN EUCLEAN /* Structure needs cleaning */
+#define FI_ENOTNAM ENOTNAM /* Not a XENIX named type file */
+#define FI_ENAVAIL ENAVAIL /* No XENIX semaphores available */
+#define FI_EISNAM EISNAM /* Is a named type file */
+#define FI_EREMOTEIO
EREMOTEIO /* Remote I/O error */ +#define FI_EDQUOT EDQUOT /* Quota exceeded */ +#define FI_ENOMEDIUM ENOMEDIUM /* No medium found */ +#define FI_EMEDIUMTYPE EMEDIUMTYPE /* Wrong medium type */ +#define FI_ECANCELED ECANCELED /* Operation Canceled */ +#define FI_ENOKEY ENOKEY /* Required key not available */ +#define FI_EKEYEXPIRED EKEYEXPIRED /* Key has expired */ +#define FI_EKEYREVOKED EKEYREVOKED /* Key has been revoked */ +#define FI_EKEYREJECTED EKEYREJECTED /* Key was rejected by service */ +#define FI_EOWNERDEAD EOWNERDEAD /* Owner died */ +#define FI_ENOTRECOVERABLE ENOTRECOVERABLE /* State not recoverable */ + +/* FI specific return values: >= 256 */ + +#define FI_EOTHER 256 /* Unspecified error */ +#define FI_ETOOSMALL 257 /* Provided buffer is too small */ + +const char *fi_strerror(int errnum); + +#endif /* _FI_ERRNO_H_ */ diff --git a/include/rdma/fi_prov.h b/include/rdma/fi_prov.h new file mode 100644 index 00000000000..90cc52b6746 --- /dev/null +++ b/include/rdma/fi_prov.h @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005, 2006 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2005 PathScale, Inc. All rights reserved. + * Copyright (c) 2013 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenFabrics.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _FI_PROV_H_ +#define _FI_PROV_H_ + +#include <rdma/fabric.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Extension that low-level drivers should add to their .so filename + * (probably via libtool "-release" option). For example a low-level + * driver named "libfoo" should build a plug-in named "libfoo-fi.so". 
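Callers are expected to see these values as negative returns, as examples/provinfo.c already does with strerror(); a small helper along those lines:

#include <stdio.h>
#include <rdma/fi_errno.h>

static void example_report(const char *call, int ret)
{
	if (ret < 0)
		fprintf(stderr, "%s: %s (%d)\n", call, fi_strerror(-ret), -ret);
}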
+ */ +#define FI_LIB_EXTENSION fi + +struct fi_ops_prov { + size_t size; + int (*getinfo)(char *node, char *service, struct fi_info *hints, + struct fi_info **info); + int (*freeinfo)(struct fi_info *info); + int (*socket)(struct fi_info *info, fid_t *fid, void *context); + int (*open)(const char *name, struct fi_info *info, uint64_t flags, + fid_t *fid, void *context); +}; + +void fi_register(struct fi_ops_prov *ops); + +#define FI_LIB_CLASS_NAME "libfabric" + +struct fi_ops_lib { + size_t size; + size_t (*context_size)(void); + const char * (*sysfs_path)(void); + int (*read_file)(const char *dir, const char *file, + char *buf, size_t size); +}; + +#ifdef __cplusplus +} +#endif + +#endif /* _FI_PROV_H_ */ diff --git a/include/rdma/fi_rdma.h b/include/rdma/fi_rdma.h new file mode 100644 index 00000000000..27ede3537b7 --- /dev/null +++ b/include/rdma/fi_rdma.h @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2013 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenFabrics.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
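A built-in provider registers a static fi_ops_prov through fi_register(), typically from its *_ini() hook declared in include/fi.h; the "example" names and the stub getinfo below are placeholders.

#include <rdma/fabric.h>
#include <rdma/fi_errno.h>
#include <rdma/fi_prov.h>

static int example_getinfo(char *node, char *service, struct fi_info *hints,
			   struct fi_info **info)
{
	return -FI_ENOSYS;	/* stub */
}

static struct fi_ops_prov example_prov_ops = {
	.size = sizeof(struct fi_ops_prov),
	.getinfo = example_getinfo,
};

void example_ini(void)
{
	fi_register(&example_prov_ops);
}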
+ */ + +#ifndef _FI_RDMA_H_ +#define _FI_RDMA_H_ + +#include <rdma/fi_socket.h> + + +#ifdef __cplusplus +extern "C" { +#endif + + +struct fi_rdma_iov { + uint64_t addr; + size_t len; + be64_t key; +}; + +struct fi_msg_rdma { + const void *msg_iov; + size_t iov_count; + const void *addr; + const struct fi_rdma_iov *rdma_iov; + size_t rdma_iov_count; + void *context; + uint64_t data; +}; + +struct fi_ops_rdma { + size_t size; + int (*read)(fid_t fid, void *buf, size_t len, uint64_t addr, + be64_t key, void *context); + int (*readmem)(fid_t fid, void *buf, size_t len, uint64_t mem_desc, + uint64_t addr, be64_t key, void *context); + int (*readv)(fid_t fid, const void *iov, size_t count, uint64_t addr, + be64_t key, void *context); + int (*readfrom)(fid_t fid, void *buf, size_t len, const void *src_addr, + uint64_t addr, be64_t key, void *context); + int (*readmemfrom)(fid_t fid, void *buf, size_t len, uint64_t mem_desc, + const void *src_addr, uint64_t addr, be64_t key, + void *context); + int (*readmsg)(fid_t fid, const struct fi_msg_rdma *msg, uint64_t flags); + int (*write)(fid_t fid, const void *buf, size_t len, uint64_t addr, + be64_t key, void *context); + int (*writemem)(fid_t fid, const void *buf, size_t len, uint64_t mem_desc, + uint64_t addr, be64_t key, void *context); + int (*writev)(fid_t fid, const void *iov, size_t count, uint64_t addr, + be64_t key, void *context); + int (*writememto)(fid_t fid, const void *buf, size_t len, uint64_t mem_desc, + const void *dst_addr, uint64_t addr, be64_t key, + void *context); + int (*writeto)(fid_t fid, const void *buf, size_t len, const void *dst_addr, + uint64_t addr, be64_t key, void *context); + int (*writemsg)(fid_t fid, const struct fi_msg_rdma *msg, uint64_t flags); +}; + + +#ifdef __cplusplus +} +#endif + +#endif /* _FI_RDMA_H_ */ diff --git a/include/rdma/fi_socket.h b/include/rdma/fi_socket.h new file mode 100644 index 00000000000..544a6ddcb59 --- /dev/null +++ b/include/rdma/fi_socket.h @@ -0,0 +1,187 @@ +/* + * Copyright (c) 2013 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenFabrics.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
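A sketch of a one-sided write through the operation table above; the rdma pointer would come from the socket's fi_ops_rdma member (defined in fi_socket.h below), and addr/key describe a remote region advertised by the peer.

#include <rdma/fi_rdma.h>

static int example_rdma_write(fid_t sock, struct fi_ops_rdma *rdma,
			      const void *buf, size_t len,
			      uint64_t addr, be64_t key, void *context)
{
	return rdma->write(sock, buf, len, addr, key, context);
}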
+ */ + +#ifndef _FI_SOCKET_H_ +#define _FI_SOCKET_H_ + +#include <sys/socket.h> +#include <rdma/fabric.h> +#include <stddef.h> + + +#ifdef __cplusplus +extern "C" { +#endif + + +enum fid_type { + FID_UNSPEC, + FID_MSG, /* pick a better name */ + FID_STREAM, + FID_DGRAM, + FID_RAW, + FID_RDM, + FID_PACKET, + FID_MAX +}; + +#define FID_TYPE_MASK 0xFF + +enum fi_proto { + FI_PROTO_UNSPEC, + FI_PROTO_IB_RC, + FI_PROTO_IWARP, + FI_PROTO_IB_UC, + FI_PROTO_IB_UD, + FI_PROTO_IB_XRC, + FI_PROTO_RAW, + FI_PROTO_MAX +}; + +#define FI_PROTO_MASK 0xFF +#define FI_PROTO_MSG (1ULL << 8) +#define FI_PROTO_RDMA (1ULL << 9) +#define FI_PROTO_TAGGED (1ULL << 10) +#define FI_PROTO_ATOMICS (1ULL << 11) +#define FI_PROTO_MULTICAST (1ULL << 12) /* multicast uses MSG ops */ +/*#define FI_PROTO_COLLECTIVES (1ULL << 13)*/ + +struct fi_msg { + const void *msg_iov; + size_t iov_count; + const void *addr; + void *context; + uint64_t data; +}; + +struct fi_ops_sock { + size_t size; + ssize_t (*cancel)(fid_t fid, struct fi_context *context); + /* syncto? (fid_t fid, void *addr, uint64_t flags, void *context); */ + int (*getopt)(fid_t fid, int level, int optname, + void *optval, size_t *optlen); + int (*setopt)(fid_t fid, int level, int optname, + const void *optval, size_t optlen); +}; + +struct fi_ops_msg { + size_t size; + ssize_t (*recv)(fid_t fid, void *buf, size_t len, void *context); + ssize_t (*recvmem)(fid_t fid, void *buf, size_t len, uint64_t mem_desc, + void *context); + ssize_t (*recvv)(fid_t fid, const void *iov, size_t count, void *context); + ssize_t (*recvfrom)(fid_t fid, void *buf, size_t len, + const void *src_addr, void *context); + ssize_t (*recvmemfrom)(fid_t fid, void *buf, size_t len, uint64_t mem_desc, + const void *src_addr, void *context); + ssize_t (*recvmsg)(fid_t fid, const struct fi_msg *msg, uint64_t flags); + ssize_t (*send)(fid_t fid, const void *buf, size_t len, void *context); + ssize_t (*sendmem)(fid_t fid, const void *buf, size_t len, + uint64_t mem_desc, void *context); + ssize_t (*sendv)(fid_t fid, const void *iov, size_t count, void *context); + ssize_t (*sendto)(fid_t fid, const void *buf, size_t len, + const void *dest_addr, void *context); + ssize_t (*sendmemto)(fid_t fid, const void *buf, size_t len, uint64_t mem_desc, + const void *dest_addr, void *context); + ssize_t (*sendmsg)(fid_t fid, const struct fi_msg *msg, uint64_t flags); +}; + +struct fi_ops_cm; +struct fi_ops_rdma; +struct fi_ops_tagged; +/* struct fi_ops_atomic; */ +/* struct fi_ops_collectives; */ + +/* + * Calls which modify the properties of a socket (control, setopt, bind, ...) + * must be serialized against all other operations. Those calls may modify the + * operations referenced by a socket in order to optimize the data transfer code + * paths. + * + * A provider may allocate the minimal size structure needed to support the + * ops requested by the user. 
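+ *
+ * For illustration only (a sketch of one possible provider strategy,
+ * not a requirement of this API): a provider could publish a minimal
+ * set of message ops when the socket is created, then switch the
+ * socket to an optimized ops vector from inside a serialized property
+ * call such as bind.  The names below are hypothetical and the
+ * argument list is abbreviated:
+ *
+ *	static struct fi_ops_msg generic_msg_ops;	(hypothetical)
+ *	static struct fi_ops_msg inline_msg_ops;	(hypothetical)
+ *
+ *	int sample_prov_bind(fid_t fid)
+ *	{
+ *		struct fid_socket *sock;
+ *
+ *		sock = container_of(fid, struct fid_socket, fid);
+ *		sock->msg = &inline_msg_ops;	(swap in fast-path ops)
+ *		return 0;
+ *	}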
+ */ +struct fid_socket { + struct fid fid; + struct fi_ops_sock *ops; + struct fi_ops_msg *msg; + struct fi_ops_cm *cm; + struct fi_ops_rdma *rdma; + struct fi_ops_tagged *tagged; + /* struct fi_ops_atomics *atomic; */ +}; + +static inline ssize_t fi_cancel(fid_t fid, struct fi_context *context) +{ + struct fid_socket *sock = container_of(fid, struct fid_socket, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_SOCKET); + FI_ASSERT_OPS(fid, struct fid_socket, ops); + FI_ASSERT_OP(sock->ops, struct fi_ops_sock, cancel); + return sock->ops->cancel(fid, context); +} + +static inline ssize_t fi_setsockopt(fid_t fid, int level, int optname, + const void *optval, size_t optlen) +{ + struct fid_socket *sock = container_of(fid, struct fid_socket, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_SOCKET); + FI_ASSERT_OPS(fid, struct fid_socket, ops); + FI_ASSERT_OP(sock->ops, struct fi_ops_sock, setopt); + return sock->ops->setopt(fid, level, optname, optval, optlen); +} + +static inline ssize_t fi_recvmem(fid_t fid, void *buf, size_t len, + uint64_t mem_desc, void *context) +{ + struct fid_socket *sock = container_of(fid, struct fid_socket, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_SOCKET); + FI_ASSERT_OPS(fid, struct fid_socket, msg); + FI_ASSERT_OP(sock->msg, struct fi_ops_msg, recvmem); + return sock->msg->recvmem(fid, buf, len, mem_desc, context); +} + +static inline ssize_t fi_sendmem(fid_t fid, void *buf, size_t len, + uint64_t mem_desc, void *context) +{ + struct fid_socket *sock = container_of(fid, struct fid_socket, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_SOCKET); + FI_ASSERT_OPS(fid, struct fid_socket, msg); + FI_ASSERT_OP(sock->msg, struct fi_ops_msg, sendmem); + return sock->msg->sendmem(fid, buf, len, mem_desc, context); +} + + +#ifdef __cplusplus +} +#endif + +#endif /* _FI_SOCKET_H_ */ diff --git a/include/rdma/fi_tagged.h b/include/rdma/fi_tagged.h new file mode 100644 index 00000000000..b1631d84079 --- /dev/null +++ b/include/rdma/fi_tagged.h @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2013 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenFabrics.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef _FI_TAGGED_H_ +#define _FI_TAGGED_H_ + +#include <assert.h> +#include <rdma/fi_socket.h> + + +#ifdef __cplusplus +extern "C" { +#endif + +struct fi_msg_tagged { + const void *msg_iov; + size_t iov_count; + const void *addr; + be64_t tag; + be64_t mask; + void *context; + uint64_t data; +}; + +struct fi_ops_tagged { + size_t size; + ssize_t (*recv)(fid_t fid, void *buf, size_t len, + be64_t tag, be64_t mask, void *context); + ssize_t (*recvv)(fid_t fid, const void *iov, size_t count, + be64_t tag, be64_t mask, void *context); + ssize_t (*recvfrom)(fid_t fid, void *buf, size_t len, const void *src_addr, + be64_t tag, be64_t mask, void *context); + ssize_t (*recvmsg)(fid_t fid, const struct fi_msg_tagged *msg, uint64_t flags); + ssize_t (*send)(fid_t fid, const void *buf, size_t len, be64_t tag, + void *context); + ssize_t (*sendv)(fid_t fid, const void *iov, size_t count, be64_t tag, + void *context); + ssize_t (*sendto)(fid_t fid, const void *buf, size_t len, + const void *dest_addr, be64_t tag, void *context); + ssize_t (*sendmsg)(fid_t fid, const struct fi_msg_tagged *msg, uint64_t flags); + ssize_t (*search)(fid_t fid, be64_t *tag, be64_t mask, uint64_t flags, + void *src_addr, size_t *src_addrlen, size_t *len, void *context); +}; + +static inline ssize_t +fi_tsendto(fid_t fid, const void *buf, size_t len, + const void *dest_addr, be64_t tag, void *context) +{ + struct fid_socket *sock = container_of(fid, struct fid_socket, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_SOCKET); + FI_ASSERT_OPS(fid, struct fid_socket, tagged); + FI_ASSERT_OP(sock->tagged, struct fi_ops_tagged, sendto); + return sock->tagged->sendto(fid, buf, len, dest_addr, tag, context); +} + +static inline ssize_t +fi_trecvfrom(fid_t fid, void *buf, size_t len, const void *src_addr, + be64_t tag, be64_t mask, void *context) +{ + struct fid_socket *sock = container_of(fid, struct fid_socket, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_SOCKET); + FI_ASSERT_OPS(fid, struct fid_socket, tagged); + FI_ASSERT_OP(sock->tagged, struct fi_ops_tagged, recvfrom); + return sock->tagged->recvfrom(fid, buf, len, src_addr, tag, mask, context); +} + +static inline ssize_t +fi_tsearch(fid_t fid, be64_t *tag, be64_t mask, uint64_t flags, + void *src_addr, size_t *src_addrlen, size_t *len, void *context) +{ + struct fid_socket *sock = container_of(fid, struct fid_socket, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_SOCKET); + FI_ASSERT_OPS(fid, struct fid_socket, tagged); + FI_ASSERT_OP(sock->tagged, struct fi_ops_tagged, search); + return sock->tagged->search(fid, tag, mask, flags, src_addr, src_addrlen, len, context); +} + +#ifdef __cplusplus +} +#endif + +#endif /* _FI_TAGGED_H_ */ diff --git a/include/rdma/fi_ucma.h b/include/rdma/fi_ucma.h new file mode 100644 index 00000000000..36d2b8a5e5a --- /dev/null +++ b/include/rdma/fi_ucma.h @@ -0,0 +1,718 @@ +/* + * Copyright (c) 2005-2013 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _FI_UCMA_H_ +#define _FI_UCMA_H_ + +#include <linux/types.h> +#include <sys/socket.h> +#include <netinet/in.h> + +#include <rdma/fabric.h> +#include <rdma/fi_uverbs.h> + +#ifdef __cplusplus +extern "C" { +#endif + + +struct ibv_kern_path_rec { + __u8 dgid[16]; + __u8 sgid[16]; + __u16 dlid; + __u16 slid; + __u32 raw_traffic; + __u32 flow_label; + __u32 reversible; + __u32 mtu; + __u16 pkey; + __u8 hop_limit; + __u8 traffic_class; + __u8 numb_path; + __u8 sl; + __u8 mtu_selector; + __u8 rate_selector; + __u8 rate; + __u8 packet_life_time_selector; + __u8 packet_life_time; + __u8 preference; +}; + +#define IBV_PATH_RECORD_REVERSIBLE 0x80 + +struct ibv_path_record { + uint64_t service_id; + uint8_t dgid[16]; + uint8_t sgid[16]; + uint16_t dlid; + uint16_t slid; + uint32_t flowlabel_hoplimit; /* resv-31:28 flow label-27:8 hop limit-7:0*/ + uint8_t tclass; + uint8_t reversible_numpath; /* reversible-7:7 num path-6:0 */ + uint16_t pkey; + uint16_t qosclass_sl; /* qos class-15:4 sl-3:0 */ + uint8_t mtu; /* mtu selector-7:6 mtu-5:0 */ + uint8_t rate; /* rate selector-7:6 rate-5:0 */ + uint8_t packetlifetime; /* lifetime selector-7:6 lifetime-5:0 */ + uint8_t preference; + uint8_t reserved[6]; +}; + +#define IBV_PATH_FLAG_GMP (1<<0) +#define IBV_PATH_FLAG_PRIMARY (1<<1) +#define IBV_PATH_FLAG_ALTERNATE (1<<2) +#define IBV_PATH_FLAG_OUTBOUND (1<<3) +#define IBV_PATH_FLAG_INBOUND (1<<4) +#define IBV_PATH_FLAG_INBOUND_REVERSE (1<<5) +#define IBV_PATH_FLAG_BIDIRECTIONAL (IBV_PATH_FLAG_OUTBOUND | \ + IBV_PATH_FLAG_INBOUND_REVERSE) + +struct ibv_path_data { + uint32_t flags; + uint32_t reserved; + struct ibv_path_record path; +}; + + +/* + * This file must be kept in sync with the kernel's version of rdma_user_cm.h + */ + +#define RDMA_USER_CM_MIN_ABI_VERSION 4 +#define RDMA_USER_CM_MAX_ABI_VERSION 4 + +#define RDMA_MAX_PRIVATE_DATA 256 + +enum { + UCMA_CMD_CREATE_ID, + UCMA_CMD_DESTROY_ID, + UCMA_CMD_BIND_IP, + UCMA_CMD_RESOLVE_IP, + UCMA_CMD_RESOLVE_ROUTE, + UCMA_CMD_QUERY_ROUTE, + UCMA_CMD_CONNECT, + UCMA_CMD_LISTEN, + UCMA_CMD_ACCEPT, + UCMA_CMD_REJECT, + UCMA_CMD_DISCONNECT, + UCMA_CMD_INIT_QP_ATTR, + UCMA_CMD_GET_EVENT, + UCMA_CMD_GET_OPTION, /* unused */ + UCMA_CMD_SET_OPTION, + UCMA_CMD_NOTIFY, + UCMA_CMD_JOIN_IP_MCAST, + UCMA_CMD_LEAVE_MCAST, + UCMA_CMD_MIGRATE_ID, + UCMA_CMD_QUERY, + UCMA_CMD_BIND, + UCMA_CMD_RESOLVE_ADDR, + UCMA_CMD_JOIN_MCAST +}; + +struct ucma_abi_cmd_hdr { + __u32 cmd; + __u16 in; + __u16 out; +}; + +struct ucma_abi_create_id { + __u32 cmd; + __u16 in; + __u16 out; + __u64 uid; + __u64 response; + __u16 ps; + __u8 qp_type; + __u8 reserved[5]; +}; + +struct ucma_abi_create_id_resp { + __u32 id; +}; + +struct ucma_abi_destroy_id { + __u32 cmd; + __u16 in; + __u16 out; + __u64 response; + __u32 id; + __u32 reserved; +}; + +struct 
ucma_abi_destroy_id_resp { + __u32 events_reported; +}; + +struct ucma_abi_bind_ip { + __u32 cmd; + __u16 in; + __u16 out; + __u64 response; + struct sockaddr_in6 addr; + __u32 id; +}; + +struct ucma_abi_bind { + __u32 cmd; + __u16 in; + __u16 out; + __u32 id; + __u16 addr_size; + __u16 reserved; + struct sockaddr_storage addr; +}; + +struct ucma_abi_resolve_ip { + __u32 cmd; + __u16 in; + __u16 out; + struct sockaddr_in6 src_addr; + struct sockaddr_in6 dst_addr; + __u32 id; + __u32 timeout_ms; +}; + +struct ucma_abi_resolve_addr { + __u32 cmd; + __u16 in; + __u16 out; + __u32 id; + __u32 timeout_ms; + __u16 src_size; + __u16 dst_size; + __u32 reserved; + struct sockaddr_storage src_addr; + struct sockaddr_storage dst_addr; +}; + +struct ucma_abi_resolve_route { + __u32 cmd; + __u16 in; + __u16 out; + __u32 id; + __u32 timeout_ms; +}; + +enum { + UCMA_QUERY_ADDR, + UCMA_QUERY_PATH, + UCMA_QUERY_GID +}; + +struct ucma_abi_query { + __u32 cmd; + __u16 in; + __u16 out; + __u64 response; + __u32 id; + __u32 option; +}; + +struct ucma_abi_query_route_resp { + __u64 node_guid; + struct ibv_kern_path_rec ib_route[2]; + struct sockaddr_in6 src_addr; + struct sockaddr_in6 dst_addr; + __u32 num_paths; + __u8 port_num; + __u8 reserved[3]; +}; + +struct ucma_abi_query_addr_resp { + __u64 node_guid; + __u8 port_num; + __u8 reserved; + __u16 pkey; + __u16 src_size; + __u16 dst_size; + struct sockaddr_storage src_addr; + struct sockaddr_storage dst_addr; +}; + +struct ucma_abi_query_path_resp { + __u32 num_paths; + __u32 reserved; + struct ibv_path_data path_data[0]; +}; + +struct ucma_abi_conn_param { + __u32 qp_num; + __u32 reserved; + __u8 private_data[RDMA_MAX_PRIVATE_DATA]; + __u8 private_data_len; + __u8 srq; + __u8 responder_resources; + __u8 initiator_depth; + __u8 flow_control; + __u8 retry_count; + __u8 rnr_retry_count; + __u8 valid; +}; + +struct ucma_abi_ud_param { + __u32 qp_num; + __u32 qkey; + struct ibv_kern_ah_attr ah_attr; + __u8 private_data[RDMA_MAX_PRIVATE_DATA]; + __u8 private_data_len; + __u8 reserved[7]; + __u8 reserved2[4]; /* Round to 8-byte boundary to support 32/64 */ +}; + +struct ucma_abi_connect { + __u32 cmd; + __u16 in; + __u16 out; + struct ucma_abi_conn_param conn_param; + __u32 id; + __u32 reserved; +}; + +struct ucma_abi_listen { + __u32 cmd; + __u16 in; + __u16 out; + __u32 id; + __u32 backlog; +}; + +struct ucma_abi_accept { + __u32 cmd; + __u16 in; + __u16 out; + __u64 uid; + struct ucma_abi_conn_param conn_param; + __u32 id; + __u32 reserved; +}; + +struct ucma_abi_reject { + __u32 cmd; + __u16 in; + __u16 out; + __u32 id; + __u8 private_data_len; + __u8 reserved[3]; + __u8 private_data[RDMA_MAX_PRIVATE_DATA]; +}; + +struct ucma_abi_disconnect { + __u32 cmd; + __u16 in; + __u16 out; + __u32 id; +}; + +struct ucma_abi_init_qp_attr { + __u32 cmd; + __u16 in; + __u16 out; + __u64 response; + __u32 id; + __u32 qp_state; +}; + +struct ucma_abi_notify { + __u32 cmd; + __u16 in; + __u16 out; + __u32 id; + __u32 event; +}; + +struct ucma_abi_join_ip_mcast { + __u32 cmd; + __u16 in; + __u16 out; + __u64 response; /* ucma_abi_create_id_resp */ + __u64 uid; + struct sockaddr_in6 addr; + __u32 id; +}; + +struct ucma_abi_join_mcast { + __u32 cmd; + __u16 in; + __u16 out; + __u64 response; /* rdma_ucma_create_id_resp */ + __u64 uid; + __u32 id; + __u16 addr_size; + __u16 reserved; + struct sockaddr_storage addr; +}; + +struct ucma_abi_get_event { + __u32 cmd; + __u16 in; + __u16 out; + __u64 response; +}; + +struct ucma_abi_event_resp { + __u64 uid; + __u32 id; + __u32 event; + 
__u32 status; + union { + struct ucma_abi_conn_param conn; + struct ucma_abi_ud_param ud; + } param; +}; + +struct ucma_abi_set_option { + __u32 cmd; + __u16 in; + __u16 out; + __u64 optval; + __u32 id; + __u32 level; + __u32 optname; + __u32 optlen; +}; + +struct ucma_abi_migrate_id { + __u32 cmd; + __u16 in; + __u16 out; + __u64 response; + __u32 id; + __u32 fd; +}; + +struct ucma_abi_migrate_resp { + __u32 events_reported; +}; + + +struct fi_ops_ucma { + size_t size; + int (*create_id)(fid_t fid, + struct ucma_abi_create_id *cmd, size_t cmd_size, + struct ucma_abi_create_id_resp *resp, size_t resp_size); + int (*destroy_id)(fid_t fid, + struct ucma_abi_destroy_id *cmd, size_t cmd_size, + struct ucma_abi_destroy_id_resp *resp, size_t resp_size); + int (*bind_ip)(fid_t fid, + struct ucma_abi_bind_ip *cmd, size_t cmd_size); + int (*bind)(fid_t fid, + struct ucma_abi_bind *cmd, size_t cmd_size); + int (*resolve_ip)(fid_t fid, + struct ucma_abi_resolve_ip *cmd, size_t cmd_size); + int (*resolve_addr)(fid_t fid, + struct ucma_abi_resolve_addr *cmd, size_t cmd_size); + int (*resolve_route)(fid_t fid, + struct ucma_abi_resolve_route *cmd, size_t cmd_size); + int (*query_route)(fid_t fid, + struct ucma_abi_query *cmd, size_t cmd_size, + struct ucma_abi_query_route_resp *resp, size_t resp_size); + int (*query)(fid_t fid, + struct ucma_abi_query *cmd, size_t cmd_size, + void *resp, size_t resp_size); + int (*connect)(fid_t fid, + struct ucma_abi_connect *cmd, size_t cmd_size); + int (*listen)(fid_t fid, + struct ucma_abi_listen *cmd, size_t cmd_size); + int (*accept)(fid_t fid, + struct ucma_abi_accept *cmd, size_t cmd_size); + int (*reject)(fid_t fid, + struct ucma_abi_reject *cmd, size_t cmd_size); + int (*disconnect)(fid_t fid, + struct ucma_abi_disconnect *cmd, size_t cmd_size); + int (*init_qp_attr)(fid_t fid, + struct ucma_abi_init_qp_attr *cmd, size_t cmd_size, + struct ibv_kern_qp_attr *resp, size_t resp_size); + int (*get_event)(fid_t fid, + struct ucma_abi_get_event *cmd, size_t cmd_size, + struct ucma_abi_event_resp *resp, size_t resp_size); + int (*set_option)(fid_t fid, + struct ucma_abi_set_option *cmd, size_t cmd_size); + int (*notify)(fid_t fid, + struct ucma_abi_notify *cmd, size_t cmd_size); + int (*join_ip_mcast)(fid_t fid, + struct ucma_abi_join_ip_mcast *cmd, size_t cmd_size, + struct ucma_abi_create_id_resp *resp, size_t resp_size); + int (*join_mcast)(fid_t fid, + struct ucma_abi_join_mcast *cmd, size_t cmd_size, + struct ucma_abi_create_id_resp *resp, size_t resp_size); + int (*leave_mcast)(fid_t fid, + struct ucma_abi_destroy_id *cmd, size_t cmd_size, + struct ucma_abi_destroy_id_resp *resp, size_t resp_size); + int (*migrate_id)(fid_t fid, + struct ucma_abi_migrate_id *cmd, size_t cmd_size, + struct ucma_abi_migrate_resp *resp, size_t resp_size); +}; + +#define FI_UCMA_INTERFACE "ucma" + +struct fid_ucma { + struct fid fid; + int fd; + struct fi_ops_ucma *ops; + +}; + +static inline int ucma_create_id(fid_t fid, + struct ucma_abi_create_id *cmd, size_t cmd_size, + struct ucma_abi_create_id_resp *resp, size_t resp_size) +{ + struct fid_ucma *ucma = container_of(fid, struct fid_ucma, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + FI_ASSERT_OPS(fid, struct fid_ucma, ops); + FI_ASSERT_OP(ucma->ops, struct fi_ops_ucma, create_id); + return ucma->ops->create_id(fid, cmd, cmd_size, resp, resp_size); +} + +static inline int ucma_destroy_id(fid_t fid, + struct ucma_abi_destroy_id *cmd, size_t cmd_size, + struct ucma_abi_destroy_id_resp *resp, size_t resp_size) +{ + struct 
fid_ucma *ucma = container_of(fid, struct fid_ucma, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + FI_ASSERT_OPS(fid, struct fid_ucma, ops); + FI_ASSERT_OP(ucma->ops, struct fi_ops_ucma, destroy_id); + return ucma->ops->destroy_id(fid, cmd, cmd_size, resp, resp_size); +} + +static inline int ucma_bind_ip(fid_t fid, + struct ucma_abi_bind_ip *cmd, size_t cmd_size) +{ + struct fid_ucma *ucma = container_of(fid, struct fid_ucma, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + FI_ASSERT_OPS(fid, struct fid_ucma, ops); + FI_ASSERT_OP(ucma->ops, struct fi_ops_ucma, bind_ip); + return ucma->ops->bind_ip(fid, cmd, cmd_size); +} + +static inline int ucma_bind(fid_t fid, + struct ucma_abi_bind *cmd, size_t cmd_size) +{ + struct fid_ucma *ucma = container_of(fid, struct fid_ucma, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + FI_ASSERT_OPS(fid, struct fid_ucma, ops); + FI_ASSERT_OP(ucma->ops, struct fi_ops_ucma, bind); + return ucma->ops->bind(fid, cmd, cmd_size); +} + +static inline int ucma_resolve_ip(fid_t fid, + struct ucma_abi_resolve_ip *cmd, size_t cmd_size) +{ + struct fid_ucma *ucma = container_of(fid, struct fid_ucma, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + FI_ASSERT_OPS(fid, struct fid_ucma, ops); + FI_ASSERT_OP(ucma->ops, struct fi_ops_ucma, resolve_ip); + return ucma->ops->resolve_ip(fid, cmd, cmd_size); +} + +static inline int ucma_resolve_addr(fid_t fid, + struct ucma_abi_resolve_addr *cmd, size_t cmd_size) +{ + struct fid_ucma *ucma = container_of(fid, struct fid_ucma, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + FI_ASSERT_OPS(fid, struct fid_ucma, ops); + FI_ASSERT_OP(ucma->ops, struct fi_ops_ucma, resolve_addr); + return ucma->ops->resolve_addr(fid, cmd, cmd_size); +} + +static inline int ucma_resolve_route(fid_t fid, + struct ucma_abi_resolve_route *cmd, size_t cmd_size) +{ + struct fid_ucma *ucma = container_of(fid, struct fid_ucma, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + FI_ASSERT_OPS(fid, struct fid_ucma, ops); + FI_ASSERT_OP(ucma->ops, struct fi_ops_ucma, resolve_route); + return ucma->ops->resolve_route(fid, cmd, cmd_size); +} + +static inline int ucma_query_route(fid_t fid, + struct ucma_abi_query *cmd, size_t cmd_size, + struct ucma_abi_query_route_resp *resp, size_t resp_size) +{ + struct fid_ucma *ucma = container_of(fid, struct fid_ucma, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + FI_ASSERT_OPS(fid, struct fid_ucma, ops); + FI_ASSERT_OP(ucma->ops, struct fi_ops_ucma, query_route); + return ucma->ops->query_route(fid, cmd, cmd_size, resp, resp_size); +} + +static inline int ucma_query(fid_t fid, + struct ucma_abi_query *cmd, size_t cmd_size, + void *resp, size_t resp_size) +{ + struct fid_ucma *ucma = container_of(fid, struct fid_ucma, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + FI_ASSERT_OPS(fid, struct fid_ucma, ops); + FI_ASSERT_OP(ucma->ops, struct fi_ops_ucma, query); + return ucma->ops->query(fid, cmd, cmd_size, resp, resp_size); +} + +static inline int ucma_connect(fid_t fid, + struct ucma_abi_connect *cmd, size_t cmd_size) +{ + struct fid_ucma *ucma = container_of(fid, struct fid_ucma, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + FI_ASSERT_OPS(fid, struct fid_ucma, ops); + FI_ASSERT_OP(ucma->ops, struct fi_ops_ucma, connect); + return ucma->ops->connect(fid, cmd, cmd_size); +} + +static inline int ucma_listen(fid_t fid, + struct ucma_abi_listen *cmd, size_t cmd_size) +{ + struct fid_ucma *ucma = container_of(fid, struct fid_ucma, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + 
FI_ASSERT_OPS(fid, struct fid_ucma, ops); + FI_ASSERT_OP(ucma->ops, struct fi_ops_ucma, listen); + return ucma->ops->listen(fid, cmd, cmd_size); +} + +static inline int ucma_accept(fid_t fid, + struct ucma_abi_accept *cmd, size_t cmd_size) +{ + struct fid_ucma *ucma = container_of(fid, struct fid_ucma, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + FI_ASSERT_OPS(fid, struct fid_ucma, ops); + FI_ASSERT_OP(ucma->ops, struct fi_ops_ucma, accept); + return ucma->ops->accept(fid, cmd, cmd_size); +} + +static inline int ucma_reject(fid_t fid, + struct ucma_abi_reject *cmd, size_t cmd_size) +{ + struct fid_ucma *ucma = container_of(fid, struct fid_ucma, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + FI_ASSERT_OPS(fid, struct fid_ucma, ops); + FI_ASSERT_OP(ucma->ops, struct fi_ops_ucma, reject); + return ucma->ops->reject(fid, cmd, cmd_size); +} + +static inline int ucma_disconnect(fid_t fid, + struct ucma_abi_disconnect *cmd, size_t cmd_size) +{ + struct fid_ucma *ucma = container_of(fid, struct fid_ucma, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + FI_ASSERT_OPS(fid, struct fid_ucma, ops); + FI_ASSERT_OP(ucma->ops, struct fi_ops_ucma, disconnect); + return ucma->ops->disconnect(fid, cmd, cmd_size); +} + +static inline int ucma_init_qp_attr(fid_t fid, + struct ucma_abi_init_qp_attr *cmd, size_t cmd_size, + struct ibv_kern_qp_attr *resp, size_t resp_size) +{ + struct fid_ucma *ucma = container_of(fid, struct fid_ucma, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + FI_ASSERT_OPS(fid, struct fid_ucma, ops); + FI_ASSERT_OP(ucma->ops, struct fi_ops_ucma, init_qp_attr); + return ucma->ops->init_qp_attr(fid, cmd, cmd_size, resp, resp_size); +} + +static inline int ucma_get_event(fid_t fid, + struct ucma_abi_get_event *cmd, size_t cmd_size, + struct ucma_abi_event_resp *resp, size_t resp_size) +{ + struct fid_ucma *ucma = container_of(fid, struct fid_ucma, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + FI_ASSERT_OPS(fid, struct fid_ucma, ops); + FI_ASSERT_OP(ucma->ops, struct fi_ops_ucma, get_event); + return ucma->ops->get_event(fid, cmd, cmd_size, resp, resp_size); +} + +static inline int ucma_set_option(fid_t fid, + struct ucma_abi_set_option *cmd, size_t cmd_size) +{ + struct fid_ucma *ucma = container_of(fid, struct fid_ucma, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + FI_ASSERT_OPS(fid, struct fid_ucma, ops); + FI_ASSERT_OP(ucma->ops, struct fi_ops_ucma, set_option); + return ucma->ops->set_option(fid, cmd, cmd_size); +} + +static inline int ucma_notify(fid_t fid, + struct ucma_abi_notify *cmd, size_t cmd_size) +{ + struct fid_ucma *ucma = container_of(fid, struct fid_ucma, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + FI_ASSERT_OPS(fid, struct fid_ucma, ops); + FI_ASSERT_OP(ucma->ops, struct fi_ops_ucma, notify); + return ucma->ops->notify(fid, cmd, cmd_size); +} + +static inline int ucma_join_ip_mcast(fid_t fid, + struct ucma_abi_join_ip_mcast *cmd, size_t cmd_size, + struct ucma_abi_create_id_resp *resp, size_t resp_size) +{ + struct fid_ucma *ucma = container_of(fid, struct fid_ucma, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + FI_ASSERT_OPS(fid, struct fid_ucma, ops); + FI_ASSERT_OP(ucma->ops, struct fi_ops_ucma, join_ip_mcast); + return ucma->ops->join_ip_mcast(fid, cmd, cmd_size, resp, resp_size); +} + +static inline int ucma_join_mcast(fid_t fid, + struct ucma_abi_join_mcast *cmd, size_t cmd_size, + struct ucma_abi_create_id_resp *resp, size_t resp_size) +{ + struct fid_ucma *ucma = container_of(fid, struct fid_ucma, fid); + 
FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + FI_ASSERT_OPS(fid, struct fid_ucma, ops); + FI_ASSERT_OP(ucma->ops, struct fi_ops_ucma, join_mcast); + return ucma->ops->join_mcast(fid, cmd, cmd_size, resp, resp_size); +} + +static inline int ucma_leave_mcast(fid_t fid, + struct ucma_abi_destroy_id *cmd, size_t cmd_size, + struct ucma_abi_destroy_id_resp *resp, size_t resp_size) +{ + struct fid_ucma *ucma = container_of(fid, struct fid_ucma, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + FI_ASSERT_OPS(fid, struct fid_ucma, ops); + FI_ASSERT_OP(ucma->ops, struct fi_ops_ucma, leave_mcast); + return ucma->ops->leave_mcast(fid, cmd, cmd_size, resp, resp_size); +} + +static inline int ucma_migrate_id(fid_t fid, + struct ucma_abi_migrate_id *cmd, size_t cmd_size, + struct ucma_abi_migrate_resp *resp, size_t resp_size) +{ + struct fid_ucma *ucma = container_of(fid, struct fid_ucma, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + FI_ASSERT_OPS(fid, struct fid_ucma, ops); + FI_ASSERT_OP(ucma->ops, struct fi_ops_ucma, migrate_id); + return ucma->ops->migrate_id(fid, cmd, cmd_size, resp, resp_size); +} + +#ifdef __cplusplus +} +#endif + +#endif /* _FI_UCMA_H_ */ diff --git a/include/rdma/fi_umad.h b/include/rdma/fi_umad.h new file mode 100644 index 00000000000..d0d18ecf58c --- /dev/null +++ b/include/rdma/fi_umad.h @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Voltaire, Inc. All rights reserved. + * Copyright (c) 2013 Intel Corp., Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef _FI_UMAD_H_ +#define _FI_UMAD_H_ + +#include <linux/types.h> +#include <linux/ioctl.h> + + +#ifdef __cplusplus +extern "C" { +#endif + + +/* + * This file must be kept in sync with the kernel's version of ib_user_mad.h + */ + +#define UMAD_MIN_ABI_VERSION 5 +#define UMAD_MAX_ABI_VERSION 5 + + +struct umad_hdr { + __u32 id; + __u32 status; + __u32 timeout_ms; + __u32 retries; + __u32 length; + __be32 qpn; + __be32 qkey; + __be16 lid; + __u8 sl; + __u8 path_bits; + __u8 grh_present; + __u8 gid_index; + __u8 hop_limit; + __u8 traffic_class; + __u8 gid[16]; + __be32 flow_label; + __u16 pkey_index; + __u8 reserved[6]; +}; + +struct umad_data { + struct umad_hdr hdr; + __u64 data[0]; +}; + +typedef unsigned long __attribute__((aligned(4))) packed_ulong; +#define UMAD_LONGS_PER_METHOD_MASK (128 / (8 * sizeof (long))) + +struct umad_reg_req { + __u32 id; + packed_ulong method_mask[UMAD_LONGS_PER_METHOD_MASK]; + __u8 qpn; + __u8 mgmt_class; + __u8 mgmt_class_version; + __u8 oui[3]; + __u8 rmpp_version; +}; + +#define UMAD_IOCTL_MAGIC 0x1b +#define UMAD_REGISTER_AGENT _IOWR(UMAD_IOCTL_MAGIC, 1, struct umad_reg_req) +#define UMAD_UNREGISTER_AGENT _IOW(UMAD_IOCTL_MAGIC, 2, __u32) +#define UMAD_ENABLE_PKEY _IO(UMAD_IOCTL_MAGIC, 3) + + +#define FI_UVERBS_CLASS_NAME "umad" +#define FI_UMAD_OPS (4ULL << FI_OPS_LIB_SHIFT) + +struct fi_umad_ops { + size_t size; + int (*get_abi)(void); +}; + +#ifdef __cplusplus +} +#endif + +#endif /* _FI_UMAD_H_ */ diff --git a/include/rdma/fi_uverbs.h b/include/rdma/fi_uverbs.h new file mode 100644 index 00000000000..e3d2ba19487 --- /dev/null +++ b/include/rdma/fi_uverbs.h @@ -0,0 +1,1289 @@ +/* + * Copyright (c) 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved. + * Copyright (c) 2005 PathScale, Inc. All rights reserved. + * Copyright (c) 2013 Intel Corporation, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef _FI_UVERBS_H_ +#define _FI_UVERBS_H_ + + +#include <linux/types.h> +#include <rdma/fabric.h> + + +#ifdef __cplusplus +extern "C" { +#endif + + +/* + * This file must be kept in sync with the kernel's version of ib_user_verbs.h + */ + +#define UVERBS_MIN_ABI_VERSION 6 +#define UVERBS_MAX_ABI_VERSION 6 + +enum { + UVERBS_CMD_GET_CONTEXT, + UVERBS_CMD_QUERY_DEVICE, + UVERBS_CMD_QUERY_PORT, + UVERBS_CMD_ALLOC_PD, + UVERBS_CMD_DEALLOC_PD, + UVERBS_CMD_CREATE_AH, + UVERBS_CMD_MODIFY_AH, /* unused */ + UVERBS_CMD_QUERY_AH, /* unused */ + UVERBS_CMD_DESTROY_AH, + UVERBS_CMD_REG_MR, + UVERBS_CMD_REG_SMR, /* unused */ + UVERBS_CMD_REREG_MR, /* unused */ + UVERBS_CMD_QUERY_MR, /* unused */ + UVERBS_CMD_DEREG_MR, + UVERBS_CMD_ALLOC_MW, /* unused */ + UVERBS_CMD_BIND_MW, /* unused */ + UVERBS_CMD_DEALLOC_MW, /* unused */ + UVERBS_CMD_CREATE_COMP_CHANNEL, + UVERBS_CMD_CREATE_CQ, + UVERBS_CMD_RESIZE_CQ, + UVERBS_CMD_DESTROY_CQ, + UVERBS_CMD_POLL_CQ, + UVERBS_CMD_PEEK_CQ, + UVERBS_CMD_REQ_NOTIFY_CQ, + UVERBS_CMD_CREATE_QP, + UVERBS_CMD_QUERY_QP, + UVERBS_CMD_MODIFY_QP, + UVERBS_CMD_DESTROY_QP, + UVERBS_CMD_POST_SEND, + UVERBS_CMD_POST_RECV, + UVERBS_CMD_ATTACH_MCAST, + UVERBS_CMD_DETACH_MCAST, + UVERBS_CMD_CREATE_SRQ, + UVERBS_CMD_MODIFY_SRQ, + UVERBS_CMD_QUERY_SRQ, + UVERBS_CMD_DESTROY_SRQ, + UVERBS_CMD_POST_SRQ_RECV, + UVERBS_CMD_OPEN_XRCD, /* TODO */ + UVERBS_CMD_CLOSE_XRCD, /* TODO */ + UVERBS_CMD_CREATE_XSRQ, /* TODO */ + UVERBS_CMD_OPEN_QP, /* TODO */ +}; + +/* + * Make sure that all structs defined in this file remain laid out so + * that they pack the same way on 32-bit and 64-bit architectures (to + * avoid incompatibility between 32-bit userspace and 64-bit kernels). + * Specifically: + * - Do not use pointer types -- pass pointers in __u64 instead. + * - Make sure that any structure larger than 4 bytes is padded to a + * multiple of 8 bytes. Otherwise the structure size will be + * different between 32-bit and 64-bit architectures. + */ + +struct ibv_kern_async_event { + __u64 element; + __u32 event_type; + __u32 reserved; +}; + +struct ibv_comp_event { + __u64 cq_handle; +}; + +/* + * All commands from userspace should start with a __u32 command field + * followed by __u16 in_words and out_words fields (which give the + * length of the command block and response buffer if any in 32-bit + * words). The kernel driver will read these fields first and read + * the rest of the command struct based on these value. 
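+ *
+ * Illustration (this mirrors the convention libibverbs uses when
+ * building such requests; it is not mandated by this header): a caller
+ * issuing a query-device command with a struct ibv_query_device cmd
+ * and a struct ibv_query_device_resp resp would fill the header as
+ *
+ *	cmd.command   = UVERBS_CMD_QUERY_DEVICE;
+ *	cmd.in_words  = sizeof(cmd) / 4;
+ *	cmd.out_words = sizeof(resp) / 4;
+ *	cmd.response  = (uintptr_t) &resp;
+ *
+ * before passing &cmd, sizeof(cmd), &resp, and sizeof(resp) to the
+ * matching fi_ops_uverbs entry point.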
+ */ + +struct ibv_query_params { + __u32 command; + __u16 in_words; + __u16 out_words; + __u64 response; +}; + +struct ibv_query_params_resp { + __u32 num_cq_events; +}; + +struct ibv_get_context { + __u32 command; + __u16 in_words; + __u16 out_words; + __u64 response; + __u64 driver_data[0]; +}; + +struct ibv_get_context_resp { + __u32 async_fd; + __u32 num_comp_vectors; +}; + +struct ibv_query_device { + __u32 command; + __u16 in_words; + __u16 out_words; + __u64 response; + __u64 driver_data[0]; +}; + +struct ibv_query_device_resp { + __u64 fw_ver; + __u64 node_guid; + __u64 sys_image_guid; + __u64 max_mr_size; + __u64 page_size_cap; + __u32 vendor_id; + __u32 vendor_part_id; + __u32 hw_ver; + __u32 max_qp; + __u32 max_qp_wr; + __u32 device_cap_flags; + __u32 max_sge; + __u32 max_sge_rd; + __u32 max_cq; + __u32 max_cqe; + __u32 max_mr; + __u32 max_pd; + __u32 max_qp_rd_atom; + __u32 max_ee_rd_atom; + __u32 max_res_rd_atom; + __u32 max_qp_init_rd_atom; + __u32 max_ee_init_rd_atom; + __u32 atomic_cap; + __u32 max_ee; + __u32 max_rdd; + __u32 max_mw; + __u32 max_raw_ipv6_qp; + __u32 max_raw_ethy_qp; + __u32 max_mcast_grp; + __u32 max_mcast_qp_attach; + __u32 max_total_mcast_qp_attach; + __u32 max_ah; + __u32 max_fmr; + __u32 max_map_per_fmr; + __u32 max_srq; + __u32 max_srq_wr; + __u32 max_srq_sge; + __u16 max_pkeys; + __u8 local_ca_ack_delay; + __u8 phys_port_cnt; + __u8 reserved[4]; +}; + +struct ibv_query_port { + __u32 command; + __u16 in_words; + __u16 out_words; + __u64 response; + __u8 port_num; + __u8 reserved[7]; + __u64 driver_data[0]; +}; + +struct ibv_query_port_resp { + __u32 port_cap_flags; + __u32 max_msg_sz; + __u32 bad_pkey_cntr; + __u32 qkey_viol_cntr; + __u32 gid_tbl_len; + __u16 pkey_tbl_len; + __u16 lid; + __u16 sm_lid; + __u8 state; + __u8 max_mtu; + __u8 active_mtu; + __u8 lmc; + __u8 max_vl_num; + __u8 sm_sl; + __u8 subnet_timeout; + __u8 init_type_reply; + __u8 active_width; + __u8 active_speed; + __u8 phys_state; + __u8 link_layer; + __u8 reserved[2]; +}; + +struct ibv_alloc_pd { + __u32 command; + __u16 in_words; + __u16 out_words; + __u64 response; + __u64 driver_data[0]; +}; + +struct ibv_alloc_pd_resp { + __u32 pd_handle; +}; + +struct ibv_dealloc_pd { + __u32 command; + __u16 in_words; + __u16 out_words; + __u32 pd_handle; +}; + +struct ibv_open_xrcd { + __u32 command; + __u16 in_words; + __u16 out_words; + __u64 response; + __u32 fd; + __u32 oflags; + __u64 driver_data[0]; +}; + +struct ibv_open_xrcd_resp { + __u32 xrcd_handle; +}; + +struct ibv_close_xrcd { + __u32 command; + __u16 in_words; + __u16 out_words; + __u32 xrcd_handle; +}; + +struct ibv_reg_mr { + __u32 command; + __u16 in_words; + __u16 out_words; + __u64 response; + __u64 start; + __u64 length; + __u64 hca_va; + __u32 pd_handle; + __u32 access_flags; + __u64 driver_data[0]; +}; + +struct ibv_reg_mr_resp { + __u32 mr_handle; + __u32 lkey; + __u32 rkey; +}; + +struct ibv_dereg_mr { + __u32 command; + __u16 in_words; + __u16 out_words; + __u32 mr_handle; +}; + +struct ibv_create_comp_channel { + __u32 command; + __u16 in_words; + __u16 out_words; + __u64 response; +}; + +struct ibv_create_comp_channel_resp { + __u32 fd; +}; + +struct ibv_create_cq { + __u32 command; + __u16 in_words; + __u16 out_words; + __u64 response; + __u64 user_handle; + __u32 cqe; + __u32 comp_vector; + __s32 comp_channel; + __u32 reserved; + __u64 driver_data[0]; +}; + +struct ibv_create_cq_resp { + __u32 cq_handle; + __u32 cqe; +}; + +struct ibv_kern_wc { + __u64 wr_id; + __u32 status; + __u32 opcode; + __u32 vendor_err; 
+ __u32 byte_len; + __u32 imm_data; + __u32 qp_num; + __u32 src_qp; + __u32 wc_flags; + __u16 pkey_index; + __u16 slid; + __u8 sl; + __u8 dlid_path_bits; + __u8 port_num; + __u8 reserved; +}; + +struct ibv_poll_cq { + __u32 command; + __u16 in_words; + __u16 out_words; + __u64 response; + __u32 cq_handle; + __u32 ne; +}; + +struct ibv_poll_cq_resp { + __u32 count; + __u32 reserved; + struct ibv_kern_wc wc[0]; +}; + +struct ibv_req_notify_cq { + __u32 command; + __u16 in_words; + __u16 out_words; + __u32 cq_handle; + __u32 solicited; +}; + +struct ibv_resize_cq { + __u32 command; + __u16 in_words; + __u16 out_words; + __u64 response; + __u32 cq_handle; + __u32 cqe; + __u64 driver_data[0]; +}; + +struct ibv_resize_cq_resp { + __u32 cqe; + __u32 reserved; + __u64 driver_data[0]; +}; + +struct ibv_destroy_cq { + __u32 command; + __u16 in_words; + __u16 out_words; + __u64 response; + __u32 cq_handle; + __u32 reserved; +}; + +struct ibv_destroy_cq_resp { + __u32 comp_events_reported; + __u32 async_events_reported; +}; + +struct ibv_kern_global_route { + __u8 dgid[16]; + __u32 flow_label; + __u8 sgid_index; + __u8 hop_limit; + __u8 traffic_class; + __u8 reserved; +}; + +struct ibv_kern_ah_attr { + struct ibv_kern_global_route grh; + __u16 dlid; + __u8 sl; + __u8 src_path_bits; + __u8 static_rate; + __u8 is_global; + __u8 port_num; + __u8 reserved; +}; + +struct ibv_kern_qp_attr { + __u32 qp_attr_mask; + __u32 qp_state; + __u32 cur_qp_state; + __u32 path_mtu; + __u32 path_mig_state; + __u32 qkey; + __u32 rq_psn; + __u32 sq_psn; + __u32 dest_qp_num; + __u32 qp_access_flags; + + struct ibv_kern_ah_attr ah_attr; + struct ibv_kern_ah_attr alt_ah_attr; + + /* ib_qp_cap */ + __u32 max_send_wr; + __u32 max_recv_wr; + __u32 max_send_sge; + __u32 max_recv_sge; + __u32 max_inline_data; + + __u16 pkey_index; + __u16 alt_pkey_index; + __u8 en_sqd_async_notify; + __u8 sq_draining; + __u8 max_rd_atomic; + __u8 max_dest_rd_atomic; + __u8 min_rnr_timer; + __u8 port_num; + __u8 timeout; + __u8 retry_cnt; + __u8 rnr_retry; + __u8 alt_port_num; + __u8 alt_timeout; + __u8 reserved[5]; +}; + +struct ibv_create_qp { + __u32 command; + __u16 in_words; + __u16 out_words; + __u64 response; + __u64 user_handle; + __u32 pd_handle; + __u32 send_cq_handle; + __u32 recv_cq_handle; + __u32 srq_handle; + __u32 max_send_wr; + __u32 max_recv_wr; + __u32 max_send_sge; + __u32 max_recv_sge; + __u32 max_inline_data; + __u8 sq_sig_all; + __u8 qp_type; + __u8 is_srq; + __u8 reserved; + __u64 driver_data[0]; +}; + +struct ibv_open_qp { + __u32 command; + __u16 in_words; + __u16 out_words; + __u64 response; + __u64 user_handle; + __u32 pd_handle; + __u32 qpn; + __u8 qp_type; + __u8 reserved[7]; + __u64 driver_data[0]; +}; + +/* also used for open response */ +struct ibv_create_qp_resp { + __u32 qp_handle; + __u32 qpn; + __u32 max_send_wr; + __u32 max_recv_wr; + __u32 max_send_sge; + __u32 max_recv_sge; + __u32 max_inline_data; + __u32 reserved; +}; + +struct ibv_qp_dest { + __u8 dgid[16]; + __u32 flow_label; + __u16 dlid; + __u16 reserved; + __u8 sgid_index; + __u8 hop_limit; + __u8 traffic_class; + __u8 sl; + __u8 src_path_bits; + __u8 static_rate; + __u8 is_global; + __u8 port_num; +}; + +struct ibv_query_qp { + __u32 command; + __u16 in_words; + __u16 out_words; + __u64 response; + __u32 qp_handle; + __u32 attr_mask; + __u64 driver_data[0]; +}; + +struct ibv_query_qp_resp { + struct ibv_qp_dest dest; + struct ibv_qp_dest alt_dest; + __u32 max_send_wr; + __u32 max_recv_wr; + __u32 max_send_sge; + __u32 max_recv_sge; + __u32 
max_inline_data; + __u32 qkey; + __u32 rq_psn; + __u32 sq_psn; + __u32 dest_qp_num; + __u32 qp_access_flags; + __u16 pkey_index; + __u16 alt_pkey_index; + __u8 qp_state; + __u8 cur_qp_state; + __u8 path_mtu; + __u8 path_mig_state; + __u8 sq_draining; + __u8 max_rd_atomic; + __u8 max_dest_rd_atomic; + __u8 min_rnr_timer; + __u8 port_num; + __u8 timeout; + __u8 retry_cnt; + __u8 rnr_retry; + __u8 alt_port_num; + __u8 alt_timeout; + __u8 sq_sig_all; + __u8 reserved[5]; + __u64 driver_data[0]; +}; + +struct ibv_modify_qp { + __u32 command; + __u16 in_words; + __u16 out_words; + struct ibv_qp_dest dest; + struct ibv_qp_dest alt_dest; + __u32 qp_handle; + __u32 attr_mask; + __u32 qkey; + __u32 rq_psn; + __u32 sq_psn; + __u32 dest_qp_num; + __u32 qp_access_flags; + __u16 pkey_index; + __u16 alt_pkey_index; + __u8 qp_state; + __u8 cur_qp_state; + __u8 path_mtu; + __u8 path_mig_state; + __u8 en_sqd_async_notify; + __u8 max_rd_atomic; + __u8 max_dest_rd_atomic; + __u8 min_rnr_timer; + __u8 port_num; + __u8 timeout; + __u8 retry_cnt; + __u8 rnr_retry; + __u8 alt_port_num; + __u8 alt_timeout; + __u8 reserved[2]; + __u64 driver_data[0]; +}; + +struct ibv_destroy_qp { + __u32 command; + __u16 in_words; + __u16 out_words; + __u64 response; + __u32 qp_handle; + __u32 reserved; +}; + +struct ibv_destroy_qp_resp { + __u32 events_reported; +}; + +struct ibv_kern_send_wr { + __u64 wr_id; + __u32 num_sge; + __u32 opcode; + __u32 send_flags; + __u32 imm_data; + union { + struct { + __u64 remote_addr; + __u32 rkey; + __u32 reserved; + } rdma; + struct { + __u64 remote_addr; + __u64 compare_add; + __u64 swap; + __u32 rkey; + __u32 reserved; + } atomic; + struct { + __u32 ah; + __u32 remote_qpn; + __u32 remote_qkey; + __u32 reserved; + } ud; + struct { + __u64 reserved[3]; + __u32 reserved2; + __u32 remote_srqn; + } xrc; + } wr; +}; + +struct ibv_post_send { + __u32 command; + __u16 in_words; + __u16 out_words; + __u64 response; + __u32 qp_handle; + __u32 wr_count; + __u32 sge_count; + __u32 wqe_size; + struct ibv_kern_send_wr send_wr[0]; +}; + +struct ibv_post_send_resp { + __u32 bad_wr; +}; + +struct ibv_kern_recv_wr { + __u64 wr_id; + __u32 num_sge; + __u32 reserved; +}; + +struct ibv_post_recv { + __u32 command; + __u16 in_words; + __u16 out_words; + __u64 response; + __u32 qp_handle; + __u32 wr_count; + __u32 sge_count; + __u32 wqe_size; + struct ibv_kern_recv_wr recv_wr[0]; +}; + +struct ibv_post_recv_resp { + __u32 bad_wr; +}; + +struct ibv_post_srq_recv { + __u32 command; + __u16 in_words; + __u16 out_words; + __u64 response; + __u32 srq_handle; + __u32 wr_count; + __u32 sge_count; + __u32 wqe_size; + struct ibv_kern_recv_wr recv_wr[0]; +}; + +struct ibv_post_srq_recv_resp { + __u32 bad_wr; +}; + +struct ibv_create_ah { + __u32 command; + __u16 in_words; + __u16 out_words; + __u64 response; + __u64 user_handle; + __u32 pd_handle; + __u32 reserved; + struct ibv_kern_ah_attr attr; +}; + +struct ibv_create_ah_resp { + __u32 handle; +}; + +struct ibv_destroy_ah { + __u32 command; + __u16 in_words; + __u16 out_words; + __u32 ah_handle; +}; + +struct ibv_attach_mcast { + __u32 command; + __u16 in_words; + __u16 out_words; + __u8 gid[16]; + __u32 qp_handle; + __u16 mlid; + __u16 reserved; + __u64 driver_data[0]; +}; + +struct ibv_detach_mcast { + __u32 command; + __u16 in_words; + __u16 out_words; + __u8 gid[16]; + __u32 qp_handle; + __u16 mlid; + __u16 reserved; + __u64 driver_data[0]; +}; + +struct ibv_create_srq { + __u32 command; + __u16 in_words; + __u16 out_words; + __u64 response; + __u64 user_handle; + 
__u32 pd_handle; + __u32 max_wr; + __u32 max_sge; + __u32 srq_limit; + __u64 driver_data[0]; +}; + +struct ibv_create_xsrq { + __u32 command; + __u16 in_words; + __u16 out_words; + __u64 response; + __u64 user_handle; + __u32 srq_type; + __u32 pd_handle; + __u32 max_wr; + __u32 max_sge; + __u32 srq_limit; + __u32 reserved; + __u32 xrcd_handle; + __u32 cq_handle; + __u64 driver_data[0]; +}; + +struct ibv_create_srq_resp { + __u32 srq_handle; + __u32 max_wr; + __u32 max_sge; + __u32 srqn; +}; + +struct ibv_modify_srq { + __u32 command; + __u16 in_words; + __u16 out_words; + __u32 srq_handle; + __u32 attr_mask; + __u32 max_wr; + __u32 srq_limit; + __u64 driver_data[0]; +}; + +struct ibv_query_srq { + __u32 command; + __u16 in_words; + __u16 out_words; + __u64 response; + __u32 srq_handle; + __u32 reserved; + __u64 driver_data[0]; +}; + +struct ibv_query_srq_resp { + __u32 max_wr; + __u32 max_sge; + __u32 srq_limit; + __u32 reserved; +}; + +struct ibv_destroy_srq { + __u32 command; + __u16 in_words; + __u16 out_words; + __u64 response; + __u32 srq_handle; + __u32 reserved; +}; + +struct ibv_destroy_srq_resp { + __u32 events_reported; +}; + + +struct fi_ops_uverbs { + size_t size; + int (*get_context)(fid_t fid, + struct ibv_get_context *cmd, size_t cmd_size, + struct ibv_get_context_resp *resp, size_t resp_size); + int (*query_device)(fid_t fid, + struct ibv_query_device *cmd, size_t cmd_size, + struct ibv_query_device_resp *resp, size_t resp_size); + int (*query_port)(fid_t fid, + struct ibv_query_port *cmd, size_t cmd_size, + struct ibv_query_port_resp *resp, size_t resp_size); + int (*alloc_pd)(fid_t fid, + struct ibv_alloc_pd *cmd, size_t cmd_size, + struct ibv_alloc_pd_resp *resp, size_t resp_size); + int (*dealloc_pd)(fid_t fid, + struct ibv_dealloc_pd *cmd, size_t cmd_size); + int (*create_ah)(fid_t fid, + struct ibv_create_ah *cmd, size_t cmd_size, + struct ibv_create_ah_resp *resp, size_t resp_size); + int (*destroy_ah)(fid_t fid, + struct ibv_destroy_ah *cmd, size_t cmd_size); + int (*open_xrcd)(fid_t fid, + struct ibv_open_xrcd *cmd, size_t cmd_size, + struct ibv_open_xrcd_resp *resp, size_t resp_size); + int (*close_xrcd)(fid_t fid, + struct ibv_close_xrcd *cmd, size_t cmd_size); + int (*reg_mr)(fid_t fid, + struct ibv_reg_mr *cmd, size_t cmd_size, + struct ibv_reg_mr_resp *resp, size_t resp_size); + int (*dereg_mr)(fid_t fid, + struct ibv_dereg_mr *cd, size_t cmd_size); + int (*create_comp_channel)(fid_t fid, + struct ibv_create_comp_channel *cmd, size_t cmd_size, + struct ibv_create_comp_channel_resp *resp, size_t resp_size); + int (*create_cq)(fid_t fid, + struct ibv_create_cq *cmd, size_t cmd_size, + struct ibv_create_cq_resp *resp, size_t resp_size); + int (*poll_cq)(fid_t fid, + struct ibv_poll_cq *cmd, size_t cmd_size, + struct ibv_poll_cq_resp *resp, size_t resp_size); + int (*req_notify_cq)(fid_t fid, + struct ibv_req_notify_cq *cmd, size_t cmd_size); + int (*resize_cq)(fid_t fid, + struct ibv_resize_cq *cmd, size_t cmd_size, + struct ibv_resize_cq_resp *resp, size_t resp_size); + int (*destroy_cq)(fid_t fid, + struct ibv_destroy_cq *cmd, size_t cmd_size, + struct ibv_destroy_cq_resp *resp, size_t resp_size); + int (*create_srq)(fid_t fid, + struct ibv_create_srq *cmd, size_t cmd_size, + struct ibv_create_srq_resp *resp, size_t resp_size); + int (*modify_srq)(fid_t fid, + struct ibv_modify_srq *cmd, size_t cmd_size); + int (*query_srq)(fid_t fid, + struct ibv_query_srq *cmd, size_t cmd_size, + struct ibv_query_srq_resp *resp, size_t resp_size); + int (*destroy_srq)(fid_t 
fid, + struct ibv_destroy_srq *cmd, size_t cmd_size, + struct ibv_destroy_srq_resp *resp, size_t resp_size); + int (*create_qp)(fid_t fid, + struct ibv_create_qp *cmd, size_t cmd_size, + struct ibv_create_qp_resp *resp, size_t resp_size); + int (*open_qp)(fid_t fid, + struct ibv_open_qp *cmd, size_t cmd_size, + struct ibv_create_qp_resp *resp, size_t resp_size); + int (*query_qp)(fid_t fid, + struct ibv_query_qp *cmd, size_t cmd_size, + struct ibv_query_qp_resp *resp, size_t resp_size); + int (*modify_qp)(fid_t fid, + struct ibv_modify_qp *cmd, size_t cmd_size); + int (*destroy_qp)(fid_t fid, + struct ibv_destroy_qp *cmd, size_t cmd_size, + struct ibv_destroy_qp_resp *resp, size_t resp_size); + int (*post_send)(fid_t fid, + struct ibv_post_send *cmd, size_t cmd_size, + struct ibv_post_send_resp *resp, size_t resp_size); + int (*post_recv)(fid_t fid, + struct ibv_post_recv *cmd, size_t cmd_size, + struct ibv_post_recv_resp *resp, size_t resp_size); + int (*post_srq_recv)(fid_t fid, + struct ibv_post_srq_recv *cmd, size_t cmd_size, + struct ibv_post_srq_recv_resp *resp, size_t resp_size); + int (*attach_mcast)(fid_t fid, + struct ibv_attach_mcast *cmd, size_t cmd_size); + int (*detach_mcast)(fid_t fid, + struct ibv_detach_mcast *cmd, size_t cmd_size); +}; + +struct fid_uverbs { + struct fid fid; + int fd; + struct fi_ops_uverbs *ops; +}; + +#define FI_UVERBS_INTERFACE "uverbs" + +static inline int +uv_get_context(fid_t fid, + struct ibv_get_context *cmd, size_t cmd_size, + struct ibv_get_context_resp *resp, size_t resp_size) +{ + struct fid_uverbs *uv = container_of(fid, struct fid_uverbs, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + FI_ASSERT_OPS(fid, struct fid_uverbs, ops); + FI_ASSERT_OP(uv->ops, struct fi_ops_uverbs, get_context); + return uv->ops->get_context(fid, cmd, cmd_size, resp, resp_size); +} + +static inline int +uv_query_device(fid_t fid, + struct ibv_query_device *cmd, size_t cmd_size, + struct ibv_query_device_resp *resp, size_t resp_size) +{ + struct fid_uverbs *uv = container_of(fid, struct fid_uverbs, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + FI_ASSERT_OPS(fid, struct fid_uverbs, ops); + FI_ASSERT_OP(uv->ops, struct fi_ops_uverbs, query_device); + return uv->ops->query_device(fid, cmd, cmd_size, resp, resp_size); +} + +static inline int +uv_query_port(fid_t fid, + struct ibv_query_port *cmd, size_t cmd_size, + struct ibv_query_port_resp *resp, size_t resp_size) +{ + struct fid_uverbs *uv = container_of(fid, struct fid_uverbs, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + FI_ASSERT_OPS(fid, struct fid_uverbs, ops); + FI_ASSERT_OP(uv->ops, struct fi_ops_uverbs, query_port); + return uv->ops->query_port(fid, cmd, cmd_size, resp, resp_size); +} + +static inline int +uv_alloc_pd(fid_t fid, + struct ibv_alloc_pd *cmd, size_t cmd_size, + struct ibv_alloc_pd_resp *resp, size_t resp_size) +{ + struct fid_uverbs *uv = container_of(fid, struct fid_uverbs, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + FI_ASSERT_OPS(fid, struct fid_uverbs, ops); + FI_ASSERT_OP(uv->ops, struct fi_ops_uverbs, alloc_pd); + return uv->ops->alloc_pd(fid, cmd, cmd_size, resp, resp_size); +} + +static inline int +uv_dealloc_pd(fid_t fid, + struct ibv_dealloc_pd *cmd, size_t cmd_size) +{ + struct fid_uverbs *uv = container_of(fid, struct fid_uverbs, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + FI_ASSERT_OPS(fid, struct fid_uverbs, ops); + FI_ASSERT_OP(uv->ops, struct fi_ops_uverbs, dealloc_pd); + return uv->ops->dealloc_pd(fid, cmd, cmd_size); +} + +static inline int 
+uv_create_ah(fid_t fid, + struct ibv_create_ah *cmd, size_t cmd_size, + struct ibv_create_ah_resp *resp, size_t resp_size) +{ + struct fid_uverbs *uv = container_of(fid, struct fid_uverbs, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + FI_ASSERT_OPS(fid, struct fid_uverbs, ops); + FI_ASSERT_OP(uv->ops, struct fi_ops_uverbs, create_ah); + return uv->ops->create_ah(fid, cmd, cmd_size, resp, resp_size); +} + +static inline int +uv_destroy_ah(fid_t fid, + struct ibv_destroy_ah *cmd, size_t cmd_size) +{ + struct fid_uverbs *uv = container_of(fid, struct fid_uverbs, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + FI_ASSERT_OPS(fid, struct fid_uverbs, ops); + FI_ASSERT_OP(uv->ops, struct fi_ops_uverbs, destroy_ah); + return uv->ops->destroy_ah(fid, cmd, cmd_size); +} + +static inline int +uv_open_xrcd(fid_t fid, + struct ibv_open_xrcd *cmd, size_t cmd_size, + struct ibv_open_xrcd_resp *resp, size_t resp_size) +{ + struct fid_uverbs *uv = container_of(fid, struct fid_uverbs, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + FI_ASSERT_OPS(fid, struct fid_uverbs, ops); + FI_ASSERT_OP(uv->ops, struct fi_ops_uverbs, open_xrcd); + return uv->ops->open_xrcd(fid, cmd, cmd_size, resp, resp_size); +} + +static inline int +uv_close_xrcd(fid_t fid, + struct ibv_close_xrcd *cmd, size_t cmd_size) +{ + struct fid_uverbs *uv = container_of(fid, struct fid_uverbs, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + FI_ASSERT_OPS(fid, struct fid_uverbs, ops); + FI_ASSERT_OP(uv->ops, struct fi_ops_uverbs, close_xrcd); + return uv->ops->close_xrcd(fid, cmd, cmd_size); +} + +static inline int +uv_reg_mr(fid_t fid, + struct ibv_reg_mr *cmd, size_t cmd_size, + struct ibv_reg_mr_resp *resp, size_t resp_size) +{ + struct fid_uverbs *uv = container_of(fid, struct fid_uverbs, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + FI_ASSERT_OPS(fid, struct fid_uverbs, ops); + FI_ASSERT_OP(uv->ops, struct fi_ops_uverbs, reg_mr); + return uv->ops->reg_mr(fid, cmd, cmd_size, resp, resp_size); +} + +static inline int +uv_dereg_mr(fid_t fid, + struct ibv_dereg_mr *cmd, size_t cmd_size) +{ + struct fid_uverbs *uv = container_of(fid, struct fid_uverbs, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + FI_ASSERT_OPS(fid, struct fid_uverbs, ops); + FI_ASSERT_OP(uv->ops, struct fi_ops_uverbs, dereg_mr); + return uv->ops->dereg_mr(fid, cmd, cmd_size); +} + +static inline int +uv_create_comp_channel(fid_t fid, + struct ibv_create_comp_channel *cmd, size_t cmd_size, + struct ibv_create_comp_channel_resp *resp, size_t resp_size) +{ + struct fid_uverbs *uv = container_of(fid, struct fid_uverbs, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + FI_ASSERT_OPS(fid, struct fid_uverbs, ops); + FI_ASSERT_OP(uv->ops, struct fi_ops_uverbs, create_comp_channel); + return uv->ops->create_comp_channel(fid, cmd, cmd_size, resp, resp_size); +} + +static inline int +uv_create_cq(fid_t fid, + struct ibv_create_cq *cmd, size_t cmd_size, + struct ibv_create_cq_resp *resp, size_t resp_size) +{ + struct fid_uverbs *uv = container_of(fid, struct fid_uverbs, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + FI_ASSERT_OPS(fid, struct fid_uverbs, ops); + FI_ASSERT_OP(uv->ops, struct fi_ops_uverbs, create_cq); + return uv->ops->create_cq(fid, cmd, cmd_size, resp, resp_size); +} + +static inline int +uv_poll_cq(fid_t fid, + struct ibv_poll_cq *cmd, size_t cmd_size, + struct ibv_poll_cq_resp *resp, size_t resp_size) +{ + struct fid_uverbs *uv = container_of(fid, struct fid_uverbs, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + 
FI_ASSERT_OPS(fid, struct fid_uverbs, ops); + FI_ASSERT_OP(uv->ops, struct fi_ops_uverbs, poll_cq); + return uv->ops->poll_cq(fid, cmd, cmd_size, resp, resp_size); +} + +static inline int +uv_req_notify_cq(fid_t fid, + struct ibv_req_notify_cq *cmd, size_t cmd_size) +{ + struct fid_uverbs *uv = container_of(fid, struct fid_uverbs, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + FI_ASSERT_OPS(fid, struct fid_uverbs, ops); + FI_ASSERT_OP(uv->ops, struct fi_ops_uverbs, req_notify_cq); + return uv->ops->req_notify_cq(fid, cmd, cmd_size); +} + +static inline int +uv_resize_cq(fid_t fid, + struct ibv_resize_cq *cmd, size_t cmd_size, + struct ibv_resize_cq_resp *resp, size_t resp_size) +{ + struct fid_uverbs *uv = container_of(fid, struct fid_uverbs, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + FI_ASSERT_OPS(fid, struct fid_uverbs, ops); + FI_ASSERT_OP(uv->ops, struct fi_ops_uverbs, resize_cq); + return uv->ops->resize_cq(fid, cmd, cmd_size, resp, resp_size); +} + +static inline int +uv_destroy_cq(fid_t fid, + struct ibv_destroy_cq *cmd, size_t cmd_size, + struct ibv_destroy_cq_resp *resp, size_t resp_size) +{ + struct fid_uverbs *uv = container_of(fid, struct fid_uverbs, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + FI_ASSERT_OPS(fid, struct fid_uverbs, ops); + FI_ASSERT_OP(uv->ops, struct fi_ops_uverbs, destroy_cq); + return uv->ops->destroy_cq(fid, cmd, cmd_size, resp, resp_size); +} + +static inline int +uv_create_srq(fid_t fid, + struct ibv_create_srq *cmd, size_t cmd_size, + struct ibv_create_srq_resp *resp, size_t resp_size) +{ + struct fid_uverbs *uv = container_of(fid, struct fid_uverbs, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + FI_ASSERT_OPS(fid, struct fid_uverbs, ops); + FI_ASSERT_OP(uv->ops, struct fi_ops_uverbs, create_srq); + return uv->ops->create_srq(fid, cmd, cmd_size, resp, resp_size); +} + +static inline int +uv_modify_srq(fid_t fid, + struct ibv_modify_srq *cmd, size_t cmd_size) +{ + struct fid_uverbs *uv = container_of(fid, struct fid_uverbs, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + FI_ASSERT_OPS(fid, struct fid_uverbs, ops); + FI_ASSERT_OP(uv->ops, struct fi_ops_uverbs, modify_srq); + return uv->ops->modify_srq(fid, cmd, cmd_size); +} + +static inline int +uv_query_srq(fid_t fid, + struct ibv_query_srq *cmd, size_t cmd_size, + struct ibv_query_srq_resp *resp, size_t resp_size) +{ + struct fid_uverbs *uv = container_of(fid, struct fid_uverbs, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + FI_ASSERT_OPS(fid, struct fid_uverbs, ops); + FI_ASSERT_OP(uv->ops, struct fi_ops_uverbs, query_srq); + return uv->ops->query_srq(fid, cmd, cmd_size, resp, resp_size); +} + +static inline int +uv_destroy_srq(fid_t fid, + struct ibv_destroy_srq *cmd, size_t cmd_size, + struct ibv_destroy_srq_resp *resp, size_t resp_size) +{ + struct fid_uverbs *uv = container_of(fid, struct fid_uverbs, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + FI_ASSERT_OPS(fid, struct fid_uverbs, ops); + FI_ASSERT_OP(uv->ops, struct fi_ops_uverbs, destroy_srq); + return uv->ops->destroy_srq(fid, cmd, cmd_size, resp, resp_size); +} + +static inline int +uv_create_qp(fid_t fid, + struct ibv_create_qp *cmd, size_t cmd_size, + struct ibv_create_qp_resp *resp, size_t resp_size) +{ + struct fid_uverbs *uv = container_of(fid, struct fid_uverbs, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + FI_ASSERT_OPS(fid, struct fid_uverbs, ops); + FI_ASSERT_OP(uv->ops, struct fi_ops_uverbs, create_qp); + return uv->ops->create_qp(fid, cmd, cmd_size, resp, resp_size); +} + 
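The uv_*() wrappers above all follow the same pattern: recover the enclosing struct fid_uverbs from the fid_t, sanity-check the fid class and ops table, and forward the marshalled kernel command through the provider's fi_ops_uverbs entry. The sketch below (illustrative only, not part of this patch) shows how a provider might wire up such an ops table; the handler names are hypothetical, and the command/response marshalling details of the kernel uverbs ABI are omitted.

    #include <errno.h>
    #include <unistd.h>
    #include <rdma/fi_uverbs.h>

    /* Hypothetical handler: forward a get_context command to the kernel
     * uverbs device file referenced by uv->fd.  Returning results through
     * the command's response buffer is omitted for brevity. */
    static int x_get_context(fid_t fid, struct ibv_get_context *cmd,
                             size_t cmd_size,
                             struct ibv_get_context_resp *resp,
                             size_t resp_size)
    {
            /* Same container_of pattern used by the uv_*() wrappers above. */
            struct fid_uverbs *uv = container_of(fid, struct fid_uverbs, fid);

            (void) resp; (void) resp_size;
            if (write(uv->fd, cmd, cmd_size) != (ssize_t) cmd_size)
                    return -errno;
            return 0;
    }

    static struct fi_ops_uverbs x_uverbs_ops = {
            .get_context = x_get_context,
            /* ... remaining command handlers follow the same shape ... */
    };

A provider would then place a struct fid_uverbs around its open command fd, point its ops member at a table like x_uverbs_ops, and hand the embedded fid back to callers of the "uverbs" (FI_UVERBS_INTERFACE) interface.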
+static inline int +uv_open_qp(fid_t fid, + struct ibv_open_qp *cmd, size_t cmd_size, + struct ibv_create_qp_resp *resp, size_t resp_size) +{ + struct fid_uverbs *uv = container_of(fid, struct fid_uverbs, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + FI_ASSERT_OPS(fid, struct fid_uverbs, ops); + FI_ASSERT_OP(uv->ops, struct fi_ops_uverbs, open_qp); + return uv->ops->open_qp(fid, cmd, cmd_size, resp, resp_size); +} + +static inline int +uv_query_qp(fid_t fid, + struct ibv_query_qp *cmd, size_t cmd_size, + struct ibv_query_qp_resp *resp, size_t resp_size) +{ + struct fid_uverbs *uv = container_of(fid, struct fid_uverbs, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + FI_ASSERT_OPS(fid, struct fid_uverbs, ops); + FI_ASSERT_OP(uv->ops, struct fi_ops_uverbs, query_qp); + return uv->ops->query_qp(fid, cmd, cmd_size, resp, resp_size); +} + +static inline int +uv_modify_qp(fid_t fid, + struct ibv_modify_qp *cmd, size_t cmd_size) +{ + struct fid_uverbs *uv = container_of(fid, struct fid_uverbs, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + FI_ASSERT_OPS(fid, struct fid_uverbs, ops); + FI_ASSERT_OP(uv->ops, struct fi_ops_uverbs, modify_qp); + return uv->ops->modify_qp(fid, cmd, cmd_size); +} + +static inline int +uv_destroy_qp(fid_t fid, + struct ibv_destroy_qp *cmd, size_t cmd_size, + struct ibv_destroy_qp_resp *resp, size_t resp_size) +{ + struct fid_uverbs *uv = container_of(fid, struct fid_uverbs, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + FI_ASSERT_OPS(fid, struct fid_uverbs, ops); + FI_ASSERT_OP(uv->ops, struct fi_ops_uverbs, destroy_qp); + return uv->ops->destroy_qp(fid, cmd, cmd_size, resp, resp_size); +} + +static inline int +uv_post_send(fid_t fid, + struct ibv_post_send *cmd, size_t cmd_size, + struct ibv_post_send_resp *resp, size_t resp_size) +{ + struct fid_uverbs *uv = container_of(fid, struct fid_uverbs, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + FI_ASSERT_OPS(fid, struct fid_uverbs, ops); + FI_ASSERT_OP(uv->ops, struct fi_ops_uverbs, post_send); + return uv->ops->post_send(fid, cmd, cmd_size, resp, resp_size); +} + +static inline int +uv_post_recv(fid_t fid, + struct ibv_post_recv *cmd, size_t cmd_size, + struct ibv_post_recv_resp *resp, size_t resp_size) +{ + struct fid_uverbs *uv = container_of(fid, struct fid_uverbs, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + FI_ASSERT_OPS(fid, struct fid_uverbs, ops); + FI_ASSERT_OP(uv->ops, struct fi_ops_uverbs, post_recv); + return uv->ops->post_recv(fid, cmd, cmd_size, resp, resp_size); +} + +static inline int +uv_post_srq_recv(fid_t fid, + struct ibv_post_srq_recv *cmd, size_t cmd_size, + struct ibv_post_srq_recv_resp *resp, size_t resp_size) +{ + struct fid_uverbs *uv = container_of(fid, struct fid_uverbs, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + FI_ASSERT_OPS(fid, struct fid_uverbs, ops); + FI_ASSERT_OP(uv->ops, struct fi_ops_uverbs, post_srq_recv); + return uv->ops->post_srq_recv(fid, cmd, cmd_size, resp, resp_size); +} + +static inline int +uv_attach_mcast(fid_t fid, + struct ibv_attach_mcast *cmd, size_t cmd_size) +{ + struct fid_uverbs *uv = container_of(fid, struct fid_uverbs, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + FI_ASSERT_OPS(fid, struct fid_uverbs, ops); + FI_ASSERT_OP(uv->ops, struct fi_ops_uverbs, attach_mcast); + return uv->ops->attach_mcast(fid, cmd, cmd_size); +} + +static inline int +uv_detach_mcast(fid_t fid, + struct ibv_detach_mcast *cmd, size_t cmd_size) +{ + struct fid_uverbs *uv = container_of(fid, struct fid_uverbs, fid); + FI_ASSERT_CLASS(fid, 
FID_CLASS_INTERFACE); + FI_ASSERT_OPS(fid, struct fid_uverbs, ops); + FI_ASSERT_OP(uv->ops, struct fi_ops_uverbs, detach_mcast); + return uv->ops->detach_mcast(fid, cmd, cmd_size); +} + + +#ifdef __cplusplus +} +#endif + +#endif /* _FI_UVERBS_H_ */ diff --git a/libfabric.spec.in b/libfabric.spec.in new file mode 100644 index 00000000000..c59388c94c8 --- /dev/null +++ b/libfabric.spec.in @@ -0,0 +1,71 @@ +%define ver @VERSION@ + +Name: libfabric +Version: 0.0.1 +Release: 1%{?dist} +Summary: Userspace RDMA Fabric Interfaces + +Group: System Environment/Libraries +License: GPLv2 or BSD +Url: http://www.openfabrics.org/ +Source: http://www.openfabrics.org/downloads/fabrics/%{name}-%{version}.tar.gz +BuildRoot: %{_tmppath}/%{name}-%{version}-%{release}-root-%(%{__id_u} -n) + +%description +libfabric provides a userspace API to access high-performance fabric +services, such as RDMA. + +%package devel +Summary: Development files for the libfabric library +Group: System Environment/Libraries + +%description devel +Development files for the libfabric library. + +%package utils +Summary: Examples for the libfabric library +Group: System Environment/Libraries +Requires: %{name} = %{version}-%{release} + +%description utils +Example test programs for the libfabric library. + +%prep +%setup -q -n %{name}-%{ver} + +%build +%configure +make %{?_smp_mflags} + +%install +rm -rf $RPM_BUILD_ROOT +%makeinstall +# remove unpackaged files from the buildroot +rm -f $RPM_BUILD_ROOT%{_libdir}/*.la + +%clean +rm -rf $RPM_BUILD_ROOT + +%post -p /sbin/ldconfig +%postun -p /sbin/ldconfig + +%files +%defattr(-,root,root,-) +%{_libdir}/lib*.so.* +%doc AUTHORS COPYING README + +%files devel +%defattr(-,root,root) +%{_libdir}/libfabric*.so +%{_libdir}/*.a +%{_includedir}/* +%{_mandir}/man3/* +%{_mandir}/man7/* + +%files utils +%defattr(-,root,root,-) +%{_bindir}/* +%{_mandir}/man1/* + +%changelog + diff --git a/man/fi_getinfo.3 b/man/fi_getinfo.3 new file mode 100755 index 00000000000..4b4987f7915 --- /dev/null +++ b/man/fi_getinfo.3 @@ -0,0 +1,53 @@ +.TH "FI_GETINFO" 3 "2013-11-01" "libfabric" "Libfabric Programmer's Manual" libfabric +.SH NAME +fi_getinfo / fi_freeinfo \- Obtain/free fabric interface information +.SH SYNOPSIS +.B "#include <rdma/fabric.h>" +.P +.B "int" fi_getinfo +.BI "(char *" node "," +.BI "char *" service "," +.BI "struct fi_info *" hints "," +.BI "struct fi_info **" info ");" +.P +.B "int" fi_freeinfo +.BI "(struct fi_info *" info ");" +.SH ARGUMENTS +.IP "node" 12 +Optional, name or fabric address to resolve. +.IP "service" 12 +Optional, service name or port number of address. +.IP "hints" 12 +Reference to an fi_info structure containing hints about the type +of service the caller supports. +.IP "info" 12 +A pointer to a linked list of fi_info structures containing response +information. +.SH "DESCRIPTION" +Resolves the destination node and service address and returns +information needed to establish communication. Provides the +fabric interface functional equivalent to getaddrinfo. +.SH "RETURN VALUE" +Returns 0 on success, or errno on error. +.SH "NOTES" +Either node, service, or hints must be provided. If hints are provided, the +operation will be controlled by hints.ai_flags. If FI_PASSIVE is +specified, the call will resolve address information for use on the +passive side of a connection. +If node is provided, fi_getinfo will attempt to resolve the fabric address +to the given node. The hints parameter, if provided, +may be used to control the resulting output as indicated below. 
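A minimal usage sketch (illustrative, not part of the manual page) based only on the prototypes and fi_info fields documented here; the node and service strings are placeholders:

    #include <stdio.h>
    #include <rdma/fabric.h>

    int print_fabric_info(void)
    {
            struct fi_info *info, *cur;
            int ret;

            ret = fi_getinfo("node0", "7471", NULL, &info);
            if (ret)
                    return ret;

            /* Walk the linked list of results; next is NULL on the last entry. */
            for (cur = info; cur; cur = cur->next)
                    printf("fi_info entry of size %zu\n", (size_t) cur->size);

            fi_freeinfo(info);
            return 0;
    }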
+If node is not given, fi_getinfo will attempt to resolve the fabric addressing +information based on the provided hints. +The caller must call fi_freeinfo to release fi_info structures returned +by this call. +.SH "fi_info" +.IP "next" 12 +Pointer to the next fi_info structure in the list. Will be NULL +if no more structures exist. +.IP "size" 12 +Size of the fi_info structure, used for compatibility. +.IP "write more!!!" 12 +Details are left as an exercise for the reader. +.SH "SEE ALSO" +fi_open(3) diff --git a/man/fi_open.3 b/man/fi_open.3 new file mode 100644 index 00000000000..e2c9896bf30 --- /dev/null +++ b/man/fi_open.3 @@ -0,0 +1,27 @@ +.TH "FI_OPEN" 3 "2013-11-01" "libfabric" "Libfabric Programmer's Manual" libfabric +.SH NAME +fi_open / fi_close \- Open/close a fabric interface +.SH SYNOPSIS +.B "#include <rdma/fabric.h>" +.P +.B "int" fi_open +.BI "(char *" name "," +.BI WRITE ME!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +.BI "fid_t *" fid ");" +.P +.B "int" fi_close +.BI "(fid_t " fid ");" +.SH ARGUMENTS +.IP "name" 12 +The name of the interface to open. +.IP "fid" 12 +On success, points to the opened interface. +.SH "DESCRIPTION" +Opens a fabric interface. +.SH "RETURN VALUE" +Returns 0 on success, or errno on error. +.SH "NOTES" +Opens a fabric interface. Users should call fi_close to release +the interface. +.SH "SEE ALSO" +fi_getinfo(3) diff --git a/man/fi_socket.3 b/man/fi_socket.3 new file mode 100644 index 00000000000..83ce8cf2e48 --- /dev/null +++ b/man/fi_socket.3 @@ -0,0 +1,30 @@ +.TH "FI_SOCKET" 3 "2013-11-01" "libfabric" "Libfabric Programmer's Manual" libfabric +.SH NAME +fi_socket / fi_close \- Allocate/close a fabric interface socket +.SH SYNOPSIS +.B "#include <rdma/fabric.h>" +.P +.B "int" fi_socket +.BI "(struct fi_info *" info "," +.BI "fid_t *" fid "," +.BI "void *" context ");" +.P +.B "int" fi_close +.BI "(fid_t " fid ");" +.SH ARGUMENTS +.IP "info" 12 +Details about the fabric interface socket to be opened. +.IP "fid" 12 +On success, points to the opened fabric socket. +.IP "context" 12 +User specified context associated with the socket. +.SH "DESCRIPTION" +Opens a fabric interface socket. +.SH "RETURN VALUE" +Returns 0 on success, or errno on error. +.SH "NOTES" +Opens a fabric socket corresponding to the requested fabric interface +information. Users should call fi_close to release all resources allocated +fo the fabric socket. +.SH "SEE ALSO" +fi_getinfo(3), fi_open(3) diff --git a/prov/ibverbs/AUTHORS b/prov/ibverbs/AUTHORS new file mode 100644 index 00000000000..fcea3504a51 --- /dev/null +++ b/prov/ibverbs/AUTHORS @@ -0,0 +1,4 @@ +Roland Dreier <roland@topspin.com> +Dotan Barak <dotanba@gmail.com> +Sean Hefty <sean.hefty@intel.com> +Michael S. Tsirkin <mst@mellanox.co.il> diff --git a/prov/ibverbs/COPYING b/prov/ibverbs/COPYING new file mode 100644 index 00000000000..ee1a79ffabf --- /dev/null +++ b/prov/ibverbs/COPYING @@ -0,0 +1,378 @@ +This software is available to you under a choice of one of two +licenses. You may choose to be licensed under the terms of the the +OpenIB.org BSD license or the GNU General Public License (GPL) Version +2, both included below. + +Copyright (c) 2004 Topspin Communications. All rights reserved. 
+ +================================================================== + + OpenIB.org BSD license + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + +================================================================== + + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc. + 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Library General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. 
+ + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. 
+ + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. 
+ +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. 
If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. 
+ + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) year name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + <signature of Ty Coon>, 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Library General +Public License instead of this License. diff --git a/prov/ibverbs/include/infiniband/driver.h b/prov/ibverbs/include/infiniband/driver.h new file mode 100644 index 00000000000..49353b632c6 --- /dev/null +++ b/prov/ibverbs/include/infiniband/driver.h @@ -0,0 +1,145 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005, 2006 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2005 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef INFINIBAND_DRIVER_H +#define INFINIBAND_DRIVER_H + +#include <infiniband/verbs.h> +#include <rdma/fi_uverbs.h> + + +#ifdef __cplusplus +# define BEGIN_C_DECLS extern "C" { +# define END_C_DECLS } +#else /* !__cplusplus */ +# define BEGIN_C_DECLS +# define END_C_DECLS +#endif /* __cplusplus */ + +/* + * Extension that low-level drivers should add to their .so filename + * (probably via libtool "-release" option). For example a low-level + * driver named "libfoo" should build a plug-in named "libfoo-rdmav2.so". + */ +#define IBV_DEVICE_LIBRARY_EXTENSION rdmav2 + +typedef struct ibv_device *(*ibv_driver_init_func)(const char *uverbs_sys_path, + int abi_version); + +void ibv_register_driver(const char *name, ibv_driver_init_func init_func); +int ibv_cmd_get_context(struct ibv_context *context, struct ibv_get_context *cmd, + size_t cmd_size, struct ibv_get_context_resp *resp, + size_t resp_size); +int ibv_cmd_query_device(struct ibv_context *context, + struct ibv_device_attr *device_attr, + uint64_t *raw_fw_ver, + struct ibv_query_device *cmd, size_t cmd_size); +int ibv_cmd_query_port(struct ibv_context *context, uint8_t port_num, + struct ibv_port_attr *port_attr, + struct ibv_query_port *cmd, size_t cmd_size); +int ibv_cmd_query_gid(struct ibv_context *context, uint8_t port_num, + int index, union ibv_gid *gid); +int ibv_cmd_query_pkey(struct ibv_context *context, uint8_t port_num, + int index, uint16_t *pkey); +int ibv_cmd_alloc_pd(struct ibv_context *context, struct ibv_pd *pd, + struct ibv_alloc_pd *cmd, size_t cmd_size, + struct ibv_alloc_pd_resp *resp, size_t resp_size); +int ibv_cmd_dealloc_pd(struct ibv_pd *pd); +#define IBV_CMD_REG_MR_HAS_RESP_PARAMS +int ibv_cmd_reg_mr(struct ibv_pd *pd, void *addr, size_t length, + uint64_t hca_va, int access, + struct ibv_mr *mr, struct ibv_reg_mr *cmd, + size_t cmd_size, + struct ibv_reg_mr_resp *resp, size_t resp_size); +int ibv_cmd_dereg_mr(struct ibv_mr *mr); +int ibv_cmd_create_cq(struct ibv_context *context, int cqe, + struct ibv_comp_channel *channel, + int comp_vector, struct ibv_cq *cq, + struct ibv_create_cq *cmd, size_t cmd_size, + struct ibv_create_cq_resp *resp, size_t resp_size); +int ibv_cmd_poll_cq(struct ibv_cq *cq, int ne, struct ibv_wc *wc); +int ibv_cmd_req_notify_cq(struct ibv_cq *cq, int solicited_only); +#define IBV_CMD_RESIZE_CQ_HAS_RESP_PARAMS +int ibv_cmd_resize_cq(struct ibv_cq *cq, int cqe, + struct ibv_resize_cq *cmd, size_t cmd_size, + struct ibv_resize_cq_resp *resp, size_t resp_size); +int ibv_cmd_destroy_cq(struct ibv_cq *cq); + +int ibv_cmd_create_srq(struct ibv_pd *pd, + struct ibv_srq *srq, struct ibv_srq_init_attr *attr, + struct ibv_create_srq *cmd, size_t cmd_size, + struct ibv_create_srq_resp *resp, size_t resp_size); +int ibv_cmd_modify_srq(struct ibv_srq *srq, + struct ibv_srq_attr 
*srq_attr, + int srq_attr_mask, + struct ibv_modify_srq *cmd, size_t cmd_size); +int ibv_cmd_query_srq(struct ibv_srq *srq, + struct ibv_srq_attr *srq_attr, + struct ibv_query_srq *cmd, size_t cmd_size); +int ibv_cmd_destroy_srq(struct ibv_srq *srq); + +int ibv_cmd_create_qp(struct ibv_pd *pd, + struct ibv_qp *qp, struct ibv_qp_init_attr *attr, + struct ibv_create_qp *cmd, size_t cmd_size, + struct ibv_create_qp_resp *resp, size_t resp_size); +int ibv_cmd_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *qp_attr, + int attr_mask, + struct ibv_qp_init_attr *qp_init_attr, + struct ibv_query_qp *cmd, size_t cmd_size); +int ibv_cmd_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, + int attr_mask, + struct ibv_modify_qp *cmd, size_t cmd_size); +int ibv_cmd_destroy_qp(struct ibv_qp *qp); +int ibv_cmd_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr, + struct ibv_send_wr **bad_wr); +int ibv_cmd_post_recv(struct ibv_qp *ibqp, struct ibv_recv_wr *wr, + struct ibv_recv_wr **bad_wr); +int ibv_cmd_post_srq_recv(struct ibv_srq *srq, struct ibv_recv_wr *wr, + struct ibv_recv_wr **bad_wr); +int ibv_cmd_create_ah(struct ibv_pd *pd, struct ibv_ah *ah, + struct ibv_ah_attr *attr); +int ibv_cmd_destroy_ah(struct ibv_ah *ah); +int ibv_cmd_attach_mcast(struct ibv_qp *qp, const union ibv_gid *gid, uint16_t lid); +int ibv_cmd_detach_mcast(struct ibv_qp *qp, const union ibv_gid *gid, uint16_t lid); + +int ibv_dontfork_range(void *base, size_t size); +int ibv_dofork_range(void *base, size_t size); + +/* + * sysfs helper functions + */ +#define ibv_get_sysfs_path fi_sysfs_path +#define ibv_read_sysfs_file fi_read_file + +#endif /* INFINIBAND_DRIVER_H */ diff --git a/prov/ibverbs/include/infiniband/marshall.h b/prov/ibverbs/include/infiniband/marshall.h new file mode 100644 index 00000000000..48493fa121d --- /dev/null +++ b/prov/ibverbs/include/infiniband/marshall.h @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2005 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef INFINIBAND_MARSHALL_H +#define INFINIBAND_MARSHALL_H + +#include <infiniband/verbs.h> +#include <rdma/fi_uverbs.h> +#include <rdma/fi_ucma.h> + +#ifdef __cplusplus +# define BEGIN_C_DECLS extern "C" { +# define END_C_DECLS } +#else /* !__cplusplus */ +# define BEGIN_C_DECLS +# define END_C_DECLS +#endif /* __cplusplus */ + +BEGIN_C_DECLS + +void ibv_copy_qp_attr_from_kern(struct ibv_qp_attr *dst, + struct ibv_kern_qp_attr *src); + +void ibv_copy_ah_attr_from_kern(struct ibv_ah_attr *dst, + struct ibv_kern_ah_attr *src); + +void ibv_copy_path_rec_from_kern(struct ibv_sa_path_rec *dst, + struct ibv_kern_path_rec *src); + +void ibv_copy_path_rec_to_kern(struct ibv_kern_path_rec *dst, + struct ibv_sa_path_rec *src); + +END_C_DECLS + +#endif /* INFINIBAND_MARSHALL_H */ diff --git a/prov/ibverbs/include/infiniband/opcode.h b/prov/ibverbs/include/infiniband/opcode.h new file mode 100644 index 00000000000..fd4bc96a2c9 --- /dev/null +++ b/prov/ibverbs/include/infiniband/opcode.h @@ -0,0 +1,147 @@ +/* + * Copyright (c) 2005 Topspin Communications. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef INFINIBAND_OPCODE_H +#define INFINIBAND_OPCODE_H + +/* + * This macro cleans up the definitions of constants for BTH opcodes. + * It is used to define constants such as IBV_OPCODE_UD_SEND_ONLY, + * which becomes IBV_OPCODE_UD + IBV_OPCODE_SEND_ONLY, and this gives + * the correct value. + * + * In short, user code should use the constants defined using the + * macro rather than worrying about adding together other constants. 
+*/ +#define IBV_OPCODE(transport, op) \ + IBV_OPCODE_ ## transport ## _ ## op = \ + IBV_OPCODE_ ## transport + IBV_OPCODE_ ## op + +enum { + /* transport types -- just used to define real constants */ + IBV_OPCODE_RC = 0x00, + IBV_OPCODE_UC = 0x20, + IBV_OPCODE_RD = 0x40, + IBV_OPCODE_UD = 0x60, + + /* operations -- just used to define real constants */ + IBV_OPCODE_SEND_FIRST = 0x00, + IBV_OPCODE_SEND_MIDDLE = 0x01, + IBV_OPCODE_SEND_LAST = 0x02, + IBV_OPCODE_SEND_LAST_WITH_IMMEDIATE = 0x03, + IBV_OPCODE_SEND_ONLY = 0x04, + IBV_OPCODE_SEND_ONLY_WITH_IMMEDIATE = 0x05, + IBV_OPCODE_RDMA_WRITE_FIRST = 0x06, + IBV_OPCODE_RDMA_WRITE_MIDDLE = 0x07, + IBV_OPCODE_RDMA_WRITE_LAST = 0x08, + IBV_OPCODE_RDMA_WRITE_LAST_WITH_IMMEDIATE = 0x09, + IBV_OPCODE_RDMA_WRITE_ONLY = 0x0a, + IBV_OPCODE_RDMA_WRITE_ONLY_WITH_IMMEDIATE = 0x0b, + IBV_OPCODE_RDMA_READ_REQUEST = 0x0c, + IBV_OPCODE_RDMA_READ_RESPONSE_FIRST = 0x0d, + IBV_OPCODE_RDMA_READ_RESPONSE_MIDDLE = 0x0e, + IBV_OPCODE_RDMA_READ_RESPONSE_LAST = 0x0f, + IBV_OPCODE_RDMA_READ_RESPONSE_ONLY = 0x10, + IBV_OPCODE_ACKNOWLEDGE = 0x11, + IBV_OPCODE_ATOMIC_ACKNOWLEDGE = 0x12, + IBV_OPCODE_COMPARE_SWAP = 0x13, + IBV_OPCODE_FETCH_ADD = 0x14, + + /* real constants follow -- see comment about above IBV_OPCODE() + macro for more details */ + + /* RC */ + IBV_OPCODE(RC, SEND_FIRST), + IBV_OPCODE(RC, SEND_MIDDLE), + IBV_OPCODE(RC, SEND_LAST), + IBV_OPCODE(RC, SEND_LAST_WITH_IMMEDIATE), + IBV_OPCODE(RC, SEND_ONLY), + IBV_OPCODE(RC, SEND_ONLY_WITH_IMMEDIATE), + IBV_OPCODE(RC, RDMA_WRITE_FIRST), + IBV_OPCODE(RC, RDMA_WRITE_MIDDLE), + IBV_OPCODE(RC, RDMA_WRITE_LAST), + IBV_OPCODE(RC, RDMA_WRITE_LAST_WITH_IMMEDIATE), + IBV_OPCODE(RC, RDMA_WRITE_ONLY), + IBV_OPCODE(RC, RDMA_WRITE_ONLY_WITH_IMMEDIATE), + IBV_OPCODE(RC, RDMA_READ_REQUEST), + IBV_OPCODE(RC, RDMA_READ_RESPONSE_FIRST), + IBV_OPCODE(RC, RDMA_READ_RESPONSE_MIDDLE), + IBV_OPCODE(RC, RDMA_READ_RESPONSE_LAST), + IBV_OPCODE(RC, RDMA_READ_RESPONSE_ONLY), + IBV_OPCODE(RC, ACKNOWLEDGE), + IBV_OPCODE(RC, ATOMIC_ACKNOWLEDGE), + IBV_OPCODE(RC, COMPARE_SWAP), + IBV_OPCODE(RC, FETCH_ADD), + + /* UC */ + IBV_OPCODE(UC, SEND_FIRST), + IBV_OPCODE(UC, SEND_MIDDLE), + IBV_OPCODE(UC, SEND_LAST), + IBV_OPCODE(UC, SEND_LAST_WITH_IMMEDIATE), + IBV_OPCODE(UC, SEND_ONLY), + IBV_OPCODE(UC, SEND_ONLY_WITH_IMMEDIATE), + IBV_OPCODE(UC, RDMA_WRITE_FIRST), + IBV_OPCODE(UC, RDMA_WRITE_MIDDLE), + IBV_OPCODE(UC, RDMA_WRITE_LAST), + IBV_OPCODE(UC, RDMA_WRITE_LAST_WITH_IMMEDIATE), + IBV_OPCODE(UC, RDMA_WRITE_ONLY), + IBV_OPCODE(UC, RDMA_WRITE_ONLY_WITH_IMMEDIATE), + + /* RD */ + IBV_OPCODE(RD, SEND_FIRST), + IBV_OPCODE(RD, SEND_MIDDLE), + IBV_OPCODE(RD, SEND_LAST), + IBV_OPCODE(RD, SEND_LAST_WITH_IMMEDIATE), + IBV_OPCODE(RD, SEND_ONLY), + IBV_OPCODE(RD, SEND_ONLY_WITH_IMMEDIATE), + IBV_OPCODE(RD, RDMA_WRITE_FIRST), + IBV_OPCODE(RD, RDMA_WRITE_MIDDLE), + IBV_OPCODE(RD, RDMA_WRITE_LAST), + IBV_OPCODE(RD, RDMA_WRITE_LAST_WITH_IMMEDIATE), + IBV_OPCODE(RD, RDMA_WRITE_ONLY), + IBV_OPCODE(RD, RDMA_WRITE_ONLY_WITH_IMMEDIATE), + IBV_OPCODE(RD, RDMA_READ_REQUEST), + IBV_OPCODE(RD, RDMA_READ_RESPONSE_FIRST), + IBV_OPCODE(RD, RDMA_READ_RESPONSE_MIDDLE), + IBV_OPCODE(RD, RDMA_READ_RESPONSE_LAST), + IBV_OPCODE(RD, RDMA_READ_RESPONSE_ONLY), + IBV_OPCODE(RD, ACKNOWLEDGE), + IBV_OPCODE(RD, ATOMIC_ACKNOWLEDGE), + IBV_OPCODE(RD, COMPARE_SWAP), + IBV_OPCODE(RD, FETCH_ADD), + + /* UD */ + IBV_OPCODE(UD, SEND_ONLY), + IBV_OPCODE(UD, SEND_ONLY_WITH_IMMEDIATE) +}; + +#endif /* INFINIBAND_OPCODE_H */ diff --git a/prov/ibverbs/include/infiniband/verbs.h 
b/prov/ibverbs/include/infiniband/verbs.h new file mode 100644 index 00000000000..a0158ff1df8 --- /dev/null +++ b/prov/ibverbs/include/infiniband/verbs.h @@ -0,0 +1,1158 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2004 Intel Corporation. All rights reserved. + * Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2005 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef INFINIBAND_VERBS_H +#define INFINIBAND_VERBS_H + +#include <stdint.h> +#include <pthread.h> +#include <rdma/fabric.h> + +#ifdef __cplusplus +# define BEGIN_C_DECLS extern "C" { +# define END_C_DECLS } +#else /* !__cplusplus */ +# define BEGIN_C_DECLS +# define END_C_DECLS +#endif /* __cplusplus */ + +#if __GNUC__ >= 3 +# define __attribute_const __attribute__((const)) +#else +# define __attribute_const +#endif + +BEGIN_C_DECLS + +union ibv_gid { + uint8_t raw[16]; + struct { + uint64_t subnet_prefix; + uint64_t interface_id; + } global; +}; + +struct ibv_sa_path_rec { + union ibv_gid dgid; + union ibv_gid sgid; + uint16_t dlid; + uint16_t slid; + int raw_traffic; + /* reserved */ + uint32_t flow_label; + uint8_t hop_limit; + uint8_t traffic_class; + int reversible; + uint8_t numb_path; + uint16_t pkey; + /* reserved */ + uint8_t sl; + uint8_t mtu_selector; + uint8_t mtu; + uint8_t rate_selector; + uint8_t rate; + uint8_t packet_life_time_selector; + uint8_t packet_life_time; + uint8_t preference; +}; + +enum ibv_node_type { + IBV_NODE_UNKNOWN = -1, + IBV_NODE_CA = 1, + IBV_NODE_SWITCH, + IBV_NODE_ROUTER, + IBV_NODE_RNIC +}; + +enum ibv_transport_type { + IBV_TRANSPORT_UNKNOWN = -1, + IBV_TRANSPORT_IB = 0, + IBV_TRANSPORT_IWARP +}; + +enum ibv_device_cap_flags { + IBV_DEVICE_RESIZE_MAX_WR = 1, + IBV_DEVICE_BAD_PKEY_CNTR = 1 << 1, + IBV_DEVICE_BAD_QKEY_CNTR = 1 << 2, + IBV_DEVICE_RAW_MULTI = 1 << 3, + IBV_DEVICE_AUTO_PATH_MIG = 1 << 4, + IBV_DEVICE_CHANGE_PHY_PORT = 1 << 5, + IBV_DEVICE_UD_AV_PORT_ENFORCE = 1 << 6, + IBV_DEVICE_CURR_QP_STATE_MOD = 1 << 7, + IBV_DEVICE_SHUTDOWN_PORT = 1 << 8, + IBV_DEVICE_INIT_TYPE = 1 << 9, + IBV_DEVICE_PORT_ACTIVE_EVENT = 1 << 10, + IBV_DEVICE_SYS_IMAGE_GUID = 1 << 11, + IBV_DEVICE_RC_RNR_NAK_GEN = 1 << 12, + IBV_DEVICE_SRQ_RESIZE = 1 << 13, + IBV_DEVICE_N_NOTIFY_CQ = 1 << 14 +}; + +enum ibv_atomic_cap { + IBV_ATOMIC_NONE, + IBV_ATOMIC_HCA, + IBV_ATOMIC_GLOB +}; + +struct ibv_device_attr { + char fw_ver[64]; + uint64_t node_guid; + uint64_t sys_image_guid; + uint64_t max_mr_size; + uint64_t page_size_cap; + uint32_t vendor_id; + uint32_t vendor_part_id; + uint32_t hw_ver; + int max_qp; + int max_qp_wr; + int device_cap_flags; + int max_sge; + int max_sge_rd; + int max_cq; + int max_cqe; + int max_mr; + int max_pd; + int max_qp_rd_atom; + int max_ee_rd_atom; + int max_res_rd_atom; + int max_qp_init_rd_atom; + int max_ee_init_rd_atom; + enum ibv_atomic_cap atomic_cap; + int max_ee; + int max_rdd; + int max_mw; + int max_raw_ipv6_qp; + int max_raw_ethy_qp; + int max_mcast_grp; + int max_mcast_qp_attach; + int max_total_mcast_qp_attach; + int max_ah; + int max_fmr; + int max_map_per_fmr; + int max_srq; + int max_srq_wr; + int max_srq_sge; + uint16_t max_pkeys; + uint8_t local_ca_ack_delay; + uint8_t phys_port_cnt; +}; + +enum ibv_mtu { + IBV_MTU_256 = 1, + IBV_MTU_512 = 2, + IBV_MTU_1024 = 3, + IBV_MTU_2048 = 4, + IBV_MTU_4096 = 5 +}; + +enum ibv_port_state { + IBV_PORT_NOP = 0, + IBV_PORT_DOWN = 1, + IBV_PORT_INIT = 2, + IBV_PORT_ARMED = 3, + IBV_PORT_ACTIVE = 4, + IBV_PORT_ACTIVE_DEFER = 5 +}; + +enum { + IBV_LINK_LAYER_UNSPECIFIED, + IBV_LINK_LAYER_INFINIBAND, + IBV_LINK_LAYER_ETHERNET, +}; + +struct ibv_port_attr { + enum ibv_port_state state; + enum ibv_mtu max_mtu; + enum ibv_mtu active_mtu; + int gid_tbl_len; + uint32_t port_cap_flags; + uint32_t max_msg_sz; + uint32_t bad_pkey_cntr; + uint32_t qkey_viol_cntr; + uint16_t pkey_tbl_len; + uint16_t lid; + uint16_t sm_lid; + uint8_t lmc; + uint8_t max_vl_num; + uint8_t sm_sl; + uint8_t 
subnet_timeout; + uint8_t init_type_reply; + uint8_t active_width; + uint8_t active_speed; + uint8_t phys_state; + uint8_t link_layer; + uint8_t reserved; +}; + +enum ibv_event_type { + IBV_EVENT_CQ_ERR, + IBV_EVENT_QP_FATAL, + IBV_EVENT_QP_REQ_ERR, + IBV_EVENT_QP_ACCESS_ERR, + IBV_EVENT_COMM_EST, + IBV_EVENT_SQ_DRAINED, + IBV_EVENT_PATH_MIG, + IBV_EVENT_PATH_MIG_ERR, + IBV_EVENT_DEVICE_FATAL, + IBV_EVENT_PORT_ACTIVE, + IBV_EVENT_PORT_ERR, + IBV_EVENT_LID_CHANGE, + IBV_EVENT_PKEY_CHANGE, + IBV_EVENT_SM_CHANGE, + IBV_EVENT_SRQ_ERR, + IBV_EVENT_SRQ_LIMIT_REACHED, + IBV_EVENT_QP_LAST_WQE_REACHED, + IBV_EVENT_CLIENT_REREGISTER, + IBV_EVENT_GID_CHANGE, +}; + +struct ibv_async_event { + union { + struct ibv_cq *cq; + struct ibv_qp *qp; + struct ibv_srq *srq; + int port_num; + } element; + enum ibv_event_type event_type; +}; + +enum ibv_wc_status { + IBV_WC_SUCCESS, + IBV_WC_LOC_LEN_ERR, + IBV_WC_LOC_QP_OP_ERR, + IBV_WC_LOC_EEC_OP_ERR, + IBV_WC_LOC_PROT_ERR, + IBV_WC_WR_FLUSH_ERR, + IBV_WC_MW_BIND_ERR, + IBV_WC_BAD_RESP_ERR, + IBV_WC_LOC_ACCESS_ERR, + IBV_WC_REM_INV_REQ_ERR, + IBV_WC_REM_ACCESS_ERR, + IBV_WC_REM_OP_ERR, + IBV_WC_RETRY_EXC_ERR, + IBV_WC_RNR_RETRY_EXC_ERR, + IBV_WC_LOC_RDD_VIOL_ERR, + IBV_WC_REM_INV_RD_REQ_ERR, + IBV_WC_REM_ABORT_ERR, + IBV_WC_INV_EECN_ERR, + IBV_WC_INV_EEC_STATE_ERR, + IBV_WC_FATAL_ERR, + IBV_WC_RESP_TIMEOUT_ERR, + IBV_WC_GENERAL_ERR +}; +const char *ibv_wc_status_str(enum ibv_wc_status status); + +enum ibv_wc_opcode { + IBV_WC_SEND, + IBV_WC_RDMA_WRITE, + IBV_WC_RDMA_READ, + IBV_WC_COMP_SWAP, + IBV_WC_FETCH_ADD, + IBV_WC_BIND_MW, +/* + * Set value of IBV_WC_RECV so consumers can test if a completion is a + * receive by testing (opcode & IBV_WC_RECV). + */ + IBV_WC_RECV = 1 << 7, + IBV_WC_RECV_RDMA_WITH_IMM +}; + +enum ibv_wc_flags { + IBV_WC_GRH = 1 << 0, + IBV_WC_WITH_IMM = 1 << 1 +}; + +struct ibv_wc { + uint64_t wr_id; + enum ibv_wc_status status; + enum ibv_wc_opcode opcode; + uint32_t vendor_err; + uint32_t byte_len; + uint32_t imm_data; /* in network byte order */ + uint32_t qp_num; + uint32_t src_qp; + int wc_flags; + uint16_t pkey_index; + uint16_t slid; + uint8_t sl; + uint8_t dlid_path_bits; +}; + +enum ibv_access_flags { + IBV_ACCESS_LOCAL_WRITE = 1, + IBV_ACCESS_REMOTE_WRITE = (1<<1), + IBV_ACCESS_REMOTE_READ = (1<<2), + IBV_ACCESS_REMOTE_ATOMIC = (1<<3), + IBV_ACCESS_MW_BIND = (1<<4) +}; + +struct ibv_pd { + struct ibv_context *context; + uint32_t handle; +}; + +enum ibv_rereg_mr_flags { + IBV_REREG_MR_CHANGE_TRANSLATION = (1 << 0), + IBV_REREG_MR_CHANGE_PD = (1 << 1), + IBV_REREG_MR_CHANGE_ACCESS = (1 << 2), + IBV_REREG_MR_KEEP_VALID = (1 << 3) +}; + +struct ibv_mr { + struct ibv_context *context; + struct ibv_pd *pd; + void *addr; + size_t length; + uint32_t handle; + uint32_t lkey; + uint32_t rkey; +}; + +enum ibv_mw_type { + IBV_MW_TYPE_1 = 1, + IBV_MW_TYPE_2 = 2 +}; + +struct ibv_mw { + struct ibv_context *context; + struct ibv_pd *pd; + uint32_t rkey; +}; + +struct ibv_global_route { + union ibv_gid dgid; + uint32_t flow_label; + uint8_t sgid_index; + uint8_t hop_limit; + uint8_t traffic_class; +}; + +struct ibv_grh { + uint32_t version_tclass_flow; + uint16_t paylen; + uint8_t next_hdr; + uint8_t hop_limit; + union ibv_gid sgid; + union ibv_gid dgid; +}; + +enum ibv_rate { + IBV_RATE_MAX = 0, + IBV_RATE_2_5_GBPS = 2, + IBV_RATE_5_GBPS = 5, + IBV_RATE_10_GBPS = 3, + IBV_RATE_20_GBPS = 6, + IBV_RATE_30_GBPS = 4, + IBV_RATE_40_GBPS = 7, + IBV_RATE_60_GBPS = 8, + IBV_RATE_80_GBPS = 9, + IBV_RATE_120_GBPS = 10, + IBV_RATE_14_GBPS = 11, + 
IBV_RATE_56_GBPS = 12, + IBV_RATE_112_GBPS = 13, + IBV_RATE_168_GBPS = 14, + IBV_RATE_25_GBPS = 15, + IBV_RATE_100_GBPS = 16, + IBV_RATE_200_GBPS = 17, + IBV_RATE_300_GBPS = 18 +}; + +/** + * ibv_rate_to_mult - Convert the IB rate enum to a multiple of the + * base rate of 2.5 Gbit/sec. For example, IBV_RATE_5_GBPS will be + * converted to 2, since 5 Gbit/sec is 2 * 2.5 Gbit/sec. + * @rate: rate to convert. + */ +int ibv_rate_to_mult(enum ibv_rate rate) __attribute_const; + +/** + * mult_to_ibv_rate - Convert a multiple of 2.5 Gbit/sec to an IB rate enum. + * @mult: multiple to convert. + */ +enum ibv_rate mult_to_ibv_rate(int mult) __attribute_const; + +/** + * ibv_rate_to_mbps - Convert the IB rate enum to Mbit/sec. + * For example, IBV_RATE_5_GBPS will return the value 5000. + * @rate: rate to convert. + */ +int ibv_rate_to_mbps(enum ibv_rate rate) __attribute_const; + +/** + * mbps_to_ibv_rate - Convert a Mbit/sec value to an IB rate enum. + * @mbps: value to convert. + */ +enum ibv_rate mbps_to_ibv_rate(int mbps) __attribute_const; + +struct ibv_ah_attr { + struct ibv_global_route grh; + uint16_t dlid; + uint8_t sl; + uint8_t src_path_bits; + uint8_t static_rate; + uint8_t is_global; + uint8_t port_num; +}; + +enum ibv_srq_attr_mask { + IBV_SRQ_MAX_WR = 1 << 0, + IBV_SRQ_LIMIT = 1 << 1 +}; + +struct ibv_srq_attr { + uint32_t max_wr; + uint32_t max_sge; + uint32_t srq_limit; +}; + +struct ibv_srq_init_attr { + void *srq_context; + struct ibv_srq_attr attr; +}; + +enum ibv_qp_type { + IBV_QPT_RC = 2, + IBV_QPT_UC, + IBV_QPT_UD, + IBV_QPT_RAW_PACKET = 8 +}; + +struct ibv_qp_cap { + uint32_t max_send_wr; + uint32_t max_recv_wr; + uint32_t max_send_sge; + uint32_t max_recv_sge; + uint32_t max_inline_data; +}; + +struct ibv_qp_init_attr { + void *qp_context; + struct ibv_cq *send_cq; + struct ibv_cq *recv_cq; + struct ibv_srq *srq; + struct ibv_qp_cap cap; + enum ibv_qp_type qp_type; + int sq_sig_all; +}; + +enum ibv_qp_attr_mask { + IBV_QP_STATE = 1 << 0, + IBV_QP_CUR_STATE = 1 << 1, + IBV_QP_EN_SQD_ASYNC_NOTIFY = 1 << 2, + IBV_QP_ACCESS_FLAGS = 1 << 3, + IBV_QP_PKEY_INDEX = 1 << 4, + IBV_QP_PORT = 1 << 5, + IBV_QP_QKEY = 1 << 6, + IBV_QP_AV = 1 << 7, + IBV_QP_PATH_MTU = 1 << 8, + IBV_QP_TIMEOUT = 1 << 9, + IBV_QP_RETRY_CNT = 1 << 10, + IBV_QP_RNR_RETRY = 1 << 11, + IBV_QP_RQ_PSN = 1 << 12, + IBV_QP_MAX_QP_RD_ATOMIC = 1 << 13, + IBV_QP_ALT_PATH = 1 << 14, + IBV_QP_MIN_RNR_TIMER = 1 << 15, + IBV_QP_SQ_PSN = 1 << 16, + IBV_QP_MAX_DEST_RD_ATOMIC = 1 << 17, + IBV_QP_PATH_MIG_STATE = 1 << 18, + IBV_QP_CAP = 1 << 19, + IBV_QP_DEST_QPN = 1 << 20 +}; + +enum ibv_qp_state { + IBV_QPS_RESET, + IBV_QPS_INIT, + IBV_QPS_RTR, + IBV_QPS_RTS, + IBV_QPS_SQD, + IBV_QPS_SQE, + IBV_QPS_ERR +}; + +enum ibv_mig_state { + IBV_MIG_MIGRATED, + IBV_MIG_REARM, + IBV_MIG_ARMED +}; + +struct ibv_qp_attr { + enum ibv_qp_state qp_state; + enum ibv_qp_state cur_qp_state; + enum ibv_mtu path_mtu; + enum ibv_mig_state path_mig_state; + uint32_t qkey; + uint32_t rq_psn; + uint32_t sq_psn; + uint32_t dest_qp_num; + int qp_access_flags; + struct ibv_qp_cap cap; + struct ibv_ah_attr ah_attr; + struct ibv_ah_attr alt_ah_attr; + uint16_t pkey_index; + uint16_t alt_pkey_index; + uint8_t en_sqd_async_notify; + uint8_t sq_draining; + uint8_t max_rd_atomic; + uint8_t max_dest_rd_atomic; + uint8_t min_rnr_timer; + uint8_t port_num; + uint8_t timeout; + uint8_t retry_cnt; + uint8_t rnr_retry; + uint8_t alt_port_num; + uint8_t alt_timeout; +}; + +enum ibv_wr_opcode { + IBV_WR_RDMA_WRITE, + IBV_WR_RDMA_WRITE_WITH_IMM, + IBV_WR_SEND, + 
IBV_WR_SEND_WITH_IMM, + IBV_WR_RDMA_READ, + IBV_WR_ATOMIC_CMP_AND_SWP, + IBV_WR_ATOMIC_FETCH_AND_ADD +}; + +enum ibv_send_flags { + IBV_SEND_FENCE = 1 << 0, + IBV_SEND_SIGNALED = 1 << 1, + IBV_SEND_SOLICITED = 1 << 2, + IBV_SEND_INLINE = 1 << 3 +}; + +struct ibv_sge { + uint64_t addr; + uint32_t length; + uint32_t lkey; +}; + +struct ibv_send_wr { + uint64_t wr_id; + struct ibv_send_wr *next; + struct ibv_sge *sg_list; + int num_sge; + enum ibv_wr_opcode opcode; + int send_flags; + uint32_t imm_data; /* in network byte order */ + union { + struct { + uint64_t remote_addr; + uint32_t rkey; + } rdma; + struct { + uint64_t remote_addr; + uint64_t compare_add; + uint64_t swap; + uint32_t rkey; + } atomic; + struct { + struct ibv_ah *ah; + uint32_t remote_qpn; + uint32_t remote_qkey; + } ud; + } wr; +}; + +struct ibv_recv_wr { + uint64_t wr_id; + struct ibv_recv_wr *next; + struct ibv_sge *sg_list; + int num_sge; +}; + +struct ibv_mw_bind { + uint64_t wr_id; + struct ibv_mr *mr; + void *addr; + size_t length; + int send_flags; + int mw_access_flags; +}; + +struct ibv_srq { + struct ibv_context *context; + void *srq_context; + struct ibv_pd *pd; + uint32_t handle; + + pthread_mutex_t mutex; + pthread_cond_t cond; + uint32_t events_completed; +}; + +struct ibv_qp { + struct ibv_context *context; + void *qp_context; + struct ibv_pd *pd; + struct ibv_cq *send_cq; + struct ibv_cq *recv_cq; + struct ibv_srq *srq; + uint32_t handle; + uint32_t qp_num; + enum ibv_qp_state state; + enum ibv_qp_type qp_type; + + pthread_mutex_t mutex; + pthread_cond_t cond; + uint32_t events_completed; +}; + +struct ibv_comp_channel { + struct ibv_context *context; + int fd; + int refcnt; +}; + +struct ibv_cq { + struct ibv_context *context; + struct ibv_comp_channel *channel; + void *cq_context; + uint32_t handle; + int cqe; + + pthread_mutex_t mutex; + pthread_cond_t cond; + uint32_t comp_events_completed; + uint32_t async_events_completed; +}; + +struct ibv_ah { + struct ibv_context *context; + struct ibv_pd *pd; + uint32_t handle; +}; + +struct ibv_device; +struct ibv_context; + +struct ibv_device_ops { + struct ibv_context * (*alloc_context)(struct ibv_device *device, fid_t fid); + void (*free_context)(struct ibv_context *context); +}; + +enum { + IBV_SYSFS_NAME_MAX = 64, + IBV_SYSFS_PATH_MAX = 256 +}; + +struct ibv_device { + struct ibv_device_ops ops; + enum ibv_node_type node_type; + enum ibv_transport_type transport_type; + /* Name of underlying kernel IB device, eg "mthca0" */ + char name[IBV_SYSFS_NAME_MAX]; + /* Name of uverbs device, eg "uverbs0" */ + char dev_name[IBV_SYSFS_NAME_MAX]; + /* Path to infiniband_verbs class device in sysfs */ + char dev_path[IBV_SYSFS_PATH_MAX]; + /* Path to infiniband class device in sysfs */ + char ibdev_path[IBV_SYSFS_PATH_MAX]; +}; + +struct ibv_context_ops { + int (*query_device)(struct ibv_context *context, + struct ibv_device_attr *device_attr); + int (*query_port)(struct ibv_context *context, uint8_t port_num, + struct ibv_port_attr *port_attr); + struct ibv_pd * (*alloc_pd)(struct ibv_context *context); + int (*dealloc_pd)(struct ibv_pd *pd); + struct ibv_mr * (*reg_mr)(struct ibv_pd *pd, void *addr, size_t length, + int access); + struct ibv_mr * (*rereg_mr)(struct ibv_mr *mr, + int flags, + struct ibv_pd *pd, void *addr, + size_t length, + int access); + int (*dereg_mr)(struct ibv_mr *mr); + struct ibv_mw * (*alloc_mw)(struct ibv_pd *pd, enum ibv_mw_type type); + int (*bind_mw)(struct ibv_qp *qp, struct ibv_mw *mw, + struct ibv_mw_bind *mw_bind); + int 
(*dealloc_mw)(struct ibv_mw *mw); + struct ibv_cq * (*create_cq)(struct ibv_context *context, int cqe, + struct ibv_comp_channel *channel, + int comp_vector); + int (*poll_cq)(struct ibv_cq *cq, int num_entries, struct ibv_wc *wc); + int (*req_notify_cq)(struct ibv_cq *cq, int solicited_only); + void (*cq_event)(struct ibv_cq *cq); + int (*resize_cq)(struct ibv_cq *cq, int cqe); + int (*destroy_cq)(struct ibv_cq *cq); + struct ibv_srq * (*create_srq)(struct ibv_pd *pd, + struct ibv_srq_init_attr *srq_init_attr); + int (*modify_srq)(struct ibv_srq *srq, + struct ibv_srq_attr *srq_attr, + int srq_attr_mask); + int (*query_srq)(struct ibv_srq *srq, + struct ibv_srq_attr *srq_attr); + int (*destroy_srq)(struct ibv_srq *srq); + int (*post_srq_recv)(struct ibv_srq *srq, + struct ibv_recv_wr *recv_wr, + struct ibv_recv_wr **bad_recv_wr); + struct ibv_qp * (*create_qp)(struct ibv_pd *pd, struct ibv_qp_init_attr *attr); + int (*query_qp)(struct ibv_qp *qp, struct ibv_qp_attr *attr, + int attr_mask, + struct ibv_qp_init_attr *init_attr); + int (*modify_qp)(struct ibv_qp *qp, struct ibv_qp_attr *attr, + int attr_mask); + int (*destroy_qp)(struct ibv_qp *qp); + int (*post_send)(struct ibv_qp *qp, struct ibv_send_wr *wr, + struct ibv_send_wr **bad_wr); + int (*post_recv)(struct ibv_qp *qp, struct ibv_recv_wr *wr, + struct ibv_recv_wr **bad_wr); + struct ibv_ah * (*create_ah)(struct ibv_pd *pd, struct ibv_ah_attr *attr); + int (*destroy_ah)(struct ibv_ah *ah); + int (*attach_mcast)(struct ibv_qp *qp, const union ibv_gid *gid, + uint16_t lid); + int (*detach_mcast)(struct ibv_qp *qp, const union ibv_gid *gid, + uint16_t lid); + void (*async_event)(struct ibv_async_event *event); +}; + +struct ibv_context { + struct ibv_device *device; + struct ibv_context_ops ops; + int cmd_fd; + int async_fd; + int num_comp_vectors; + pthread_mutex_t mutex; + void *abi_compat; + fid_t uv_fid; +}; + +/** + * ibv_get_device_list - Get list of IB devices currently available + * @num_devices: optional. if non-NULL, set to the number of devices + * returned in the array. + * + * Return a NULL-terminated array of IB devices. The array can be + * released with ibv_free_device_list(). + */ +struct ibv_device **ibv_get_device_list(int *num_devices); + +/** + * ibv_free_device_list - Free list from ibv_get_device_list() + * + * Free an array of devices returned from ibv_get_device_list(). Once + * the array is freed, pointers to devices that were not opened with + * ibv_open_device() are no longer valid. Client code must open all + * devices it intends to use before calling ibv_free_device_list(). + */ +void ibv_free_device_list(struct ibv_device **list); + +/** + * ibv_get_device_name - Return kernel device name + */ +const char *ibv_get_device_name(struct ibv_device *device); + +/** + * ibv_get_device_guid - Return device's node GUID + */ +uint64_t ibv_get_device_guid(struct ibv_device *device); + +/** + * ibv_open_device - Initialize device for use + */ +struct ibv_context *ibv_open_device(struct ibv_device *device); + +/** + * ibv_close_device - Release device + */ +int ibv_close_device(struct ibv_context *context); + +/** + * ibv_get_async_event - Get next async event + * @event: Pointer to use to return async event + * + * All async events returned by ibv_get_async_event() must eventually + * be acknowledged with ibv_ack_async_event(). + */ +int ibv_get_async_event(struct ibv_context *context, + struct ibv_async_event *event); + +/** + * ibv_ack_async_event - Acknowledge an async event + * @event: Event to be acknowledged. 
+ * + * All async events which are returned by ibv_get_async_event() must + * be acknowledged. To avoid races, destroying an object (CQ, SRQ or + * QP) will wait for all affiliated events to be acknowledged, so + * there should be a one-to-one correspondence between acks and + * successful gets. + */ +void ibv_ack_async_event(struct ibv_async_event *event); + +/** + * ibv_query_device - Get device properties + */ +int ibv_query_device(struct ibv_context *context, + struct ibv_device_attr *device_attr); + +/** + * ibv_query_port - Get port properties + */ +int ibv_query_port(struct ibv_context *context, uint8_t port_num, + struct ibv_port_attr *port_attr); + +/** + * ibv_query_gid - Get a GID table entry + */ +int ibv_query_gid(struct ibv_context *context, uint8_t port_num, + int index, union ibv_gid *gid); + +/** + * ibv_query_pkey - Get a P_Key table entry + */ +int ibv_query_pkey(struct ibv_context *context, uint8_t port_num, + int index, uint16_t *pkey); + +/** + * ibv_alloc_pd - Allocate a protection domain + */ +struct ibv_pd *ibv_alloc_pd(struct ibv_context *context); + +/** + * ibv_dealloc_pd - Free a protection domain + */ +int ibv_dealloc_pd(struct ibv_pd *pd); + +/** + * ibv_reg_mr - Register a memory region + */ +struct ibv_mr *ibv_reg_mr(struct ibv_pd *pd, void *addr, + size_t length, int access); + +/** + * ibv_dereg_mr - Deregister a memory region + */ +int ibv_dereg_mr(struct ibv_mr *mr); + +/** + * ibv_create_comp_channel - Create a completion event channel + */ +struct ibv_comp_channel *ibv_create_comp_channel(struct ibv_context *context); + +/** + * ibv_destroy_comp_channel - Destroy a completion event channel + */ +int ibv_destroy_comp_channel(struct ibv_comp_channel *channel); + +/** + * ibv_create_cq - Create a completion queue + * @context - Context CQ will be attached to + * @cqe - Minimum number of entries required for CQ + * @cq_context - Consumer-supplied context returned for completion events + * @channel - Completion channel where completion events will be queued. + * May be NULL if completion events will not be used. + * @comp_vector - Completion vector used to signal completion events. + * Must be >= 0 and < context->num_comp_vectors. + */ +struct ibv_cq *ibv_create_cq(struct ibv_context *context, int cqe, + void *cq_context, + struct ibv_comp_channel *channel, + int comp_vector); + +/** + * ibv_resize_cq - Modifies the capacity of the CQ. + * @cq: The CQ to resize. + * @cqe: The minimum size of the CQ. + * + * Users can examine the cq structure to determine the actual CQ size. + */ +int ibv_resize_cq(struct ibv_cq *cq, int cqe); + +/** + * ibv_destroy_cq - Destroy a completion queue + */ +int ibv_destroy_cq(struct ibv_cq *cq); + +/** + * ibv_get_cq_event - Read next CQ event + * @channel: Channel to get next event from. + * @cq: Used to return pointer to CQ. + * @cq_context: Used to return consumer-supplied CQ context. + * + * All completion events returned by ibv_get_cq_event() must + * eventually be acknowledged with ibv_ack_cq_events(). + */ +int ibv_get_cq_event(struct ibv_comp_channel *channel, + struct ibv_cq **cq, void **cq_context); + +/** + * ibv_ack_cq_events - Acknowledge CQ completion events + * @cq: CQ to acknowledge events for + * @nevents: Number of events to acknowledge. + * + * All completion events which are returned by ibv_get_cq_event() must + * be acknowledged. To avoid races, ibv_destroy_cq() will wait for + * all completion events to be acknowledged, so there should be a + * one-to-one correspondence between acks and successful gets. 
An + * application may accumulate multiple completion events and + * acknowledge them in a single call to ibv_ack_cq_events() by passing + * the number of events to ack in @nevents. + */ +void ibv_ack_cq_events(struct ibv_cq *cq, unsigned int nevents); + +/** + * ibv_poll_cq - Poll a CQ for work completions + * @cq: the CQ being polled + * @num_entries: maximum number of completions to return + * @wc: array of at least @num_entries of &struct ibv_wc where completions + * will be returned + * + * Poll a CQ for (possibly multiple) completions. If the return value + * is < 0, an error occurred. If the return value is >= 0, it is the + * number of completions returned. If the return value is + * non-negative and strictly less than num_entries, then the CQ was + * emptied. + */ +static inline int ibv_poll_cq(struct ibv_cq *cq, int num_entries, struct ibv_wc *wc) +{ + return cq->context->ops.poll_cq(cq, num_entries, wc); +} + +/** + * ibv_req_notify_cq - Request completion notification on a CQ. An + * event will be added to the completion channel associated with the + * CQ when an entry is added to the CQ. + * @cq: The completion queue to request notification for. + * @solicited_only: If non-zero, an event will be generated only for + * the next solicited CQ entry. If zero, any CQ entry, solicited or + * not, will generate an event. + */ +static inline int ibv_req_notify_cq(struct ibv_cq *cq, int solicited_only) +{ + return cq->context->ops.req_notify_cq(cq, solicited_only); +} + +/** + * ibv_create_srq - Creates an SRQ associated with the specified protection + * domain. + * @pd: The protection domain associated with the SRQ. + * @srq_init_attr: A list of initial attributes required to create the SRQ. + * + * srq_attr->max_wr and srq_attr->max_sge are read to determine the + * requested size of the SRQ, and set to the actual values allocated + * on return. If ibv_create_srq() succeeds, then max_wr and max_sge + * will always be at least as large as the requested values. + */ +struct ibv_srq *ibv_create_srq(struct ibv_pd *pd, + struct ibv_srq_init_attr *srq_init_attr); + +/** + * ibv_modify_srq - Modifies the attributes for the specified SRQ. + * @srq: The SRQ to modify. + * @srq_attr: On input, specifies the SRQ attributes to modify. On output, + * the current values of selected SRQ attributes are returned. + * @srq_attr_mask: A bit-mask used to specify which attributes of the SRQ + * are being modified. + * + * The mask may contain IBV_SRQ_MAX_WR to resize the SRQ and/or + * IBV_SRQ_LIMIT to set the SRQ's limit and request notification when + * the number of receives queued drops below the limit. + */ +int ibv_modify_srq(struct ibv_srq *srq, + struct ibv_srq_attr *srq_attr, + int srq_attr_mask); + +/** + * ibv_query_srq - Returns the attribute list and current values for the + * specified SRQ. + * @srq: The SRQ to query. + * @srq_attr: The attributes of the specified SRQ. + */ +int ibv_query_srq(struct ibv_srq *srq, struct ibv_srq_attr *srq_attr); + +/** + * ibv_destroy_srq - Destroys the specified SRQ. + * @srq: The SRQ to destroy. + */ +int ibv_destroy_srq(struct ibv_srq *srq); + +/** + * ibv_post_srq_recv - Posts a list of work requests to the specified SRQ. + * @srq: The SRQ to post the work request on. + * @recv_wr: A list of work requests to post on the receive queue. + * @bad_recv_wr: On an immediate failure, this parameter will reference + * the work request that failed to be posted on the SRQ.
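+ * + * A minimal usage sketch; buf, len, mr, and srq below are illustrative + * placeholders assumed to have been created elsewhere: + * + *	struct ibv_sge sge = { + *		.addr   = (uintptr_t) buf, + *		.length = len, + *		.lkey   = mr->lkey + *	}; + *	struct ibv_recv_wr wr = { .wr_id = 1, .sg_list = &sge, .num_sge = 1 }; + *	struct ibv_recv_wr *bad_wr; + * + *	if (ibv_post_srq_recv(srq, &wr, &bad_wr)) + *		fprintf(stderr, "post_srq_recv failed at wr_id %llu\n", + *			(unsigned long long) bad_wr->wr_id);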
+ */ +static inline int ibv_post_srq_recv(struct ibv_srq *srq, + struct ibv_recv_wr *recv_wr, + struct ibv_recv_wr **bad_recv_wr) +{ + return srq->context->ops.post_srq_recv(srq, recv_wr, bad_recv_wr); +} + +/** + * ibv_create_qp - Create a queue pair. + */ +struct ibv_qp *ibv_create_qp(struct ibv_pd *pd, + struct ibv_qp_init_attr *qp_init_attr); + +/** + * ibv_modify_qp - Modify a queue pair. + */ +int ibv_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, + int attr_mask); + +/** + * ibv_query_qp - Returns the attribute list and current values for the + * specified QP. + * @qp: The QP to query. + * @attr: The attributes of the specified QP. + * @attr_mask: A bit-mask used to select specific attributes to query. + * @init_attr: Additional attributes of the selected QP. + * + * The qp_attr_mask may be used to limit the query to gathering only the + * selected attributes. + */ +int ibv_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, + int attr_mask, + struct ibv_qp_init_attr *init_attr); + +/** + * ibv_destroy_qp - Destroy a queue pair. + */ +int ibv_destroy_qp(struct ibv_qp *qp); + +/** + * ibv_post_send - Post a list of work requests to a send queue. + * + * If IBV_SEND_INLINE flag is set, the data buffers can be reused + * immediately after the call returns. + */ +static inline int ibv_post_send(struct ibv_qp *qp, struct ibv_send_wr *wr, + struct ibv_send_wr **bad_wr) +{ + return qp->context->ops.post_send(qp, wr, bad_wr); +} + +/** + * ibv_post_recv - Post a list of work requests to a receive queue. + */ +static inline int ibv_post_recv(struct ibv_qp *qp, struct ibv_recv_wr *wr, + struct ibv_recv_wr **bad_wr) +{ + return qp->context->ops.post_recv(qp, wr, bad_wr); +} + +/** + * ibv_create_ah - Create an address handle. + */ +struct ibv_ah *ibv_create_ah(struct ibv_pd *pd, struct ibv_ah_attr *attr); + +/** + * ibv_init_ah_from_wc - Initializes address handle attributes from a + * work completion. + * @context: Device context on which the received message arrived. + * @port_num: Port on which the received message arrived. + * @wc: Work completion associated with the received message. + * @grh: References the received global route header. This parameter is + * ignored unless the work completion indicates that the GRH is valid. + * @ah_attr: Returned attributes that can be used when creating an address + * handle for replying to the message. + */ +int ibv_init_ah_from_wc(struct ibv_context *context, uint8_t port_num, + struct ibv_wc *wc, struct ibv_grh *grh, + struct ibv_ah_attr *ah_attr); + +/** + * ibv_create_ah_from_wc - Creates an address handle associated with the + * sender of the specified work completion. + * @pd: The protection domain associated with the address handle. + * @wc: Work completion information associated with a received message. + * @grh: References the received global route header. This parameter is + * ignored unless the work completion indicates that the GRH is valid. + * @port_num: The outbound port number to associate with the address. + * + * The address handle is used to reference a local or global destination + * in all UD QP post sends. + */ +struct ibv_ah *ibv_create_ah_from_wc(struct ibv_pd *pd, struct ibv_wc *wc, + struct ibv_grh *grh, uint8_t port_num); + +/** + * ibv_destroy_ah - Destroy an address handle. + */ +int ibv_destroy_ah(struct ibv_ah *ah); + +/** + * ibv_attach_mcast - Attaches the specified QP to a multicast group. + * @qp: QP to attach to the multicast group. The QP must be a UD QP. + * @gid: Multicast group GID. 
+ * @lid: Multicast group LID in host byte order. + * + * In order to route multicast packets correctly, subnet + * administration must have created the multicast group and configured + * the fabric appropriately. The port associated with the specified + * QP must also be a member of the multicast group. + */ +int ibv_attach_mcast(struct ibv_qp *qp, const union ibv_gid *gid, uint16_t lid); + +/** + * ibv_detach_mcast - Detaches the specified QP from a multicast group. + * @qp: QP to detach from the multicast group. + * @gid: Multicast group GID. + * @lid: Multicast group LID in host byte order. + */ +int ibv_detach_mcast(struct ibv_qp *qp, const union ibv_gid *gid, uint16_t lid); + +/** + * ibv_fork_init - Prepare data structures so that fork() may be used + * safely. If this function is not called or returns a non-zero + * status, then libibverbs data structures are not fork()-safe and the + * effect of an application calling fork() is undefined. + */ +int ibv_fork_init(void); + +/** + * ibv_node_type_str - Return string describing node_type enum value + */ +const char *ibv_node_type_str(enum ibv_node_type node_type); + +/** + * ibv_port_state_str - Return string describing port_state enum value + */ +const char *ibv_port_state_str(enum ibv_port_state port_state); + +/** + * ibv_event_type_str - Return string describing event_type enum value + */ +const char *ibv_event_type_str(enum ibv_event_type event); + +END_C_DECLS + +# undef __attribute_const + + +#endif /* INFINIBAND_VERBS_H */ diff --git a/prov/ibverbs/src/cmd.c b/prov/ibverbs/src/cmd.c new file mode 100644 index 00000000000..0a240d47237 --- /dev/null +++ b/prov/ibverbs/src/cmd.c @@ -0,0 +1,879 @@ +/* + * Copyright (c) 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 PathScale, Inc. All rights reserved. + * Copyright (c) 2006 Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#if HAVE_CONFIG_H +# include <config.h> +#endif /* HAVE_CONFIG_H */ + +#include <stdio.h> +#include <unistd.h> +#include <stdlib.h> +#include <errno.h> +#include <alloca.h> +#include <string.h> + +#include <rdma/fi_uverbs.h> +#include "ibverbs.h" + +int ibv_cmd_get_context(struct ibv_context *context, struct ibv_get_context *cmd, + size_t cmd_size, struct ibv_get_context_resp *resp, + size_t resp_size) +{ + int ret; + + ret = uv_get_context(context->uv_fid, cmd, cmd_size, resp, resp_size); + if (ret) + return ret; + + context->async_fd = resp->async_fd; + context->num_comp_vectors = resp->num_comp_vectors; + + return 0; +} + +int ibv_cmd_query_device(struct ibv_context *context, + struct ibv_device_attr *device_attr, + uint64_t *raw_fw_ver, + struct ibv_query_device *cmd, size_t cmd_size) +{ + struct ibv_query_device_resp resp; + int ret; + + ret = uv_query_device(context->uv_fid, cmd, cmd_size, &resp, sizeof resp); + if (ret) + return ret; + + memset(device_attr->fw_ver, 0, sizeof device_attr->fw_ver); + *raw_fw_ver = resp.fw_ver; + device_attr->node_guid = resp.node_guid; + device_attr->sys_image_guid = resp.sys_image_guid; + device_attr->max_mr_size = resp.max_mr_size; + device_attr->page_size_cap = resp.page_size_cap; + device_attr->vendor_id = resp.vendor_id; + device_attr->vendor_part_id = resp.vendor_part_id; + device_attr->hw_ver = resp.hw_ver; + device_attr->max_qp = resp.max_qp; + device_attr->max_qp_wr = resp.max_qp_wr; + device_attr->device_cap_flags = resp.device_cap_flags; + device_attr->max_sge = resp.max_sge; + device_attr->max_sge_rd = resp.max_sge_rd; + device_attr->max_cq = resp.max_cq; + device_attr->max_cqe = resp.max_cqe; + device_attr->max_mr = resp.max_mr; + device_attr->max_pd = resp.max_pd; + device_attr->max_qp_rd_atom = resp.max_qp_rd_atom; + device_attr->max_ee_rd_atom = resp.max_ee_rd_atom; + device_attr->max_res_rd_atom = resp.max_res_rd_atom; + device_attr->max_qp_init_rd_atom = resp.max_qp_init_rd_atom; + device_attr->max_ee_init_rd_atom = resp.max_ee_init_rd_atom; + device_attr->atomic_cap = resp.atomic_cap; + device_attr->max_ee = resp.max_ee; + device_attr->max_rdd = resp.max_rdd; + device_attr->max_mw = resp.max_mw; + device_attr->max_raw_ipv6_qp = resp.max_raw_ipv6_qp; + device_attr->max_raw_ethy_qp = resp.max_raw_ethy_qp; + device_attr->max_mcast_grp = resp.max_mcast_grp; + device_attr->max_mcast_qp_attach = resp.max_mcast_qp_attach; + device_attr->max_total_mcast_qp_attach = resp.max_total_mcast_qp_attach; + device_attr->max_ah = resp.max_ah; + device_attr->max_fmr = resp.max_fmr; + device_attr->max_map_per_fmr = resp.max_map_per_fmr; + device_attr->max_srq = resp.max_srq; + device_attr->max_srq_wr = resp.max_srq_wr; + device_attr->max_srq_sge = resp.max_srq_sge; + device_attr->max_pkeys = resp.max_pkeys; + device_attr->local_ca_ack_delay = resp.local_ca_ack_delay; + device_attr->phys_port_cnt = resp.phys_port_cnt; + + return 0; +} + +int ibv_cmd_query_port(struct ibv_context *context, uint8_t port_num, + struct ibv_port_attr *port_attr, + struct ibv_query_port *cmd, size_t cmd_size) +{ + struct ibv_query_port_resp resp; + int ret; + + cmd->port_num = port_num; + memset(cmd->reserved, 0, sizeof cmd->reserved); + + ret = uv_query_port(context->uv_fid, cmd, cmd_size, &resp, sizeof resp); + if (ret) + return ret; + + port_attr->state = resp.state; + port_attr->max_mtu = resp.max_mtu; + port_attr->active_mtu = resp.active_mtu; + port_attr->gid_tbl_len = resp.gid_tbl_len; + port_attr->port_cap_flags = resp.port_cap_flags; + port_attr->max_msg_sz = 
resp.max_msg_sz; + port_attr->bad_pkey_cntr = resp.bad_pkey_cntr; + port_attr->qkey_viol_cntr = resp.qkey_viol_cntr; + port_attr->pkey_tbl_len = resp.pkey_tbl_len; + port_attr->lid = resp.lid; + port_attr->sm_lid = resp.sm_lid; + port_attr->lmc = resp.lmc; + port_attr->max_vl_num = resp.max_vl_num; + port_attr->sm_sl = resp.sm_sl; + port_attr->subnet_timeout = resp.subnet_timeout; + port_attr->init_type_reply = resp.init_type_reply; + port_attr->active_width = resp.active_width; + port_attr->active_speed = resp.active_speed; + port_attr->phys_state = resp.phys_state; + port_attr->link_layer = resp.link_layer; + + return 0; +} + +int ibv_cmd_alloc_pd(struct ibv_context *context, struct ibv_pd *pd, + struct ibv_alloc_pd *cmd, size_t cmd_size, + struct ibv_alloc_pd_resp *resp, size_t resp_size) +{ + int ret; + + ret = uv_alloc_pd(context->uv_fid, cmd, cmd_size, resp, resp_size); + if (ret) + return ret; + + pd->handle = resp->pd_handle; + pd->context = context; + + return 0; +} + +int ibv_cmd_dealloc_pd(struct ibv_pd *pd) +{ + struct ibv_dealloc_pd cmd; + + cmd.pd_handle = pd->handle; + return uv_dealloc_pd(pd->context->uv_fid, &cmd, sizeof cmd); +} + +int ibv_cmd_reg_mr(struct ibv_pd *pd, void *addr, size_t length, + uint64_t hca_va, int access, + struct ibv_mr *mr, struct ibv_reg_mr *cmd, + size_t cmd_size, + struct ibv_reg_mr_resp *resp, size_t resp_size) +{ + int ret; + + cmd->start = (uintptr_t) addr; + cmd->length = length; + cmd->hca_va = hca_va; + cmd->pd_handle = pd->handle; + cmd->access_flags = access; + + ret = uv_reg_mr(pd->context->uv_fid, cmd, cmd_size, resp, resp_size); + if (ret) + return ret; + + mr->handle = resp->mr_handle; + mr->lkey = resp->lkey; + mr->rkey = resp->rkey; + mr->context = pd->context; + + return 0; +} + +int ibv_cmd_dereg_mr(struct ibv_mr *mr) +{ + struct ibv_dereg_mr cmd; + + cmd.mr_handle = mr->handle; + return uv_dereg_mr(mr->context->uv_fid, &cmd, sizeof cmd); +} + +int ibv_cmd_create_cq(struct ibv_context *context, int cqe, + struct ibv_comp_channel *channel, + int comp_vector, struct ibv_cq *cq, + struct ibv_create_cq *cmd, size_t cmd_size, + struct ibv_create_cq_resp *resp, size_t resp_size) +{ + int ret; + + cmd->user_handle = (uintptr_t) cq; + cmd->cqe = cqe; + cmd->comp_vector = comp_vector; + cmd->comp_channel = channel ? 
channel->fd : -1; + cmd->reserved = 0; + + ret = uv_create_cq(context->uv_fid, cmd, cmd_size, resp, resp_size); + if (ret) + return ret; + + cq->handle = resp->cq_handle; + cq->cqe = resp->cqe; + cq->context = context; + + return 0; +} + +int ibv_cmd_poll_cq(struct ibv_cq *ibcq, int ne, struct ibv_wc *wc) +{ + struct ibv_poll_cq cmd; + struct ibv_poll_cq_resp *resp; + int i; + int rsize; + int ret; + + rsize = sizeof *resp + ne * sizeof(struct ibv_kern_wc); + resp = malloc(rsize); + if (!resp) + return -1; + + cmd.cq_handle = ibcq->handle; + cmd.ne = ne; + + ret = uv_poll_cq(ibcq->context->uv_fid, &cmd, sizeof cmd, resp, rsize); + if (ret) { + errno = ret; + ret = -1; + goto out; + } + + for (i = 0; i < resp->count; i++) { + wc[i].wr_id = resp->wc[i].wr_id; + wc[i].status = resp->wc[i].status; + wc[i].opcode = resp->wc[i].opcode; + wc[i].vendor_err = resp->wc[i].vendor_err; + wc[i].byte_len = resp->wc[i].byte_len; + wc[i].imm_data = resp->wc[i].imm_data; + wc[i].qp_num = resp->wc[i].qp_num; + wc[i].src_qp = resp->wc[i].src_qp; + wc[i].wc_flags = resp->wc[i].wc_flags; + wc[i].pkey_index = resp->wc[i].pkey_index; + wc[i].slid = resp->wc[i].slid; + wc[i].sl = resp->wc[i].sl; + wc[i].dlid_path_bits = resp->wc[i].dlid_path_bits; + } + + ret = resp->count; + +out: + free(resp); + return ret; +} + +int ibv_cmd_req_notify_cq(struct ibv_cq *ibcq, int solicited_only) +{ + struct ibv_req_notify_cq cmd; + + cmd.cq_handle = ibcq->handle; + cmd.solicited = !!solicited_only; + + return uv_req_notify_cq(ibcq->context->uv_fid, &cmd, sizeof cmd); +} + +int ibv_cmd_resize_cq(struct ibv_cq *cq, int cqe, + struct ibv_resize_cq *cmd, size_t cmd_size, + struct ibv_resize_cq_resp *resp, size_t resp_size) +{ + int ret; + + cmd->cq_handle = cq->handle; + cmd->cqe = cqe; + + ret = uv_resize_cq(cq->context->uv_fid, cmd, cmd_size, resp, resp_size); + if (ret) + return ret; + + cq->cqe = resp->cqe; + + return 0; +} + +int ibv_cmd_destroy_cq(struct ibv_cq *cq) +{ + struct ibv_destroy_cq cmd; + struct ibv_destroy_cq_resp resp; + int ret; + + cmd.cq_handle = cq->handle; + cmd.reserved = 0; + + ret = uv_destroy_cq(cq->context->uv_fid, &cmd, sizeof cmd, &resp, sizeof resp); + if (ret) + return ret; + + pthread_mutex_lock(&cq->mutex); + while (cq->comp_events_completed != resp.comp_events_reported || + cq->async_events_completed != resp.async_events_reported) + pthread_cond_wait(&cq->cond, &cq->mutex); + pthread_mutex_unlock(&cq->mutex); + + return 0; +} + +int ibv_cmd_create_srq(struct ibv_pd *pd, + struct ibv_srq *srq, struct ibv_srq_init_attr *attr, + struct ibv_create_srq *cmd, size_t cmd_size, + struct ibv_create_srq_resp *resp, size_t resp_size) +{ + int ret; + + cmd->user_handle = (uintptr_t) srq; + cmd->pd_handle = pd->handle; + cmd->max_wr = attr->attr.max_wr; + cmd->max_sge = attr->attr.max_sge; + cmd->srq_limit = attr->attr.srq_limit; + + ret = uv_create_srq(pd->context->uv_fid, cmd, cmd_size, resp, resp_size); + if (ret) + return ret; + + srq->handle = resp->srq_handle; + srq->context = pd->context; + attr->attr.max_wr = resp->max_wr; + attr->attr.max_sge = resp->max_sge; + + return 0; +} + +int ibv_cmd_modify_srq(struct ibv_srq *srq, + struct ibv_srq_attr *srq_attr, + int srq_attr_mask, + struct ibv_modify_srq *cmd, size_t cmd_size) +{ + cmd->srq_handle = srq->handle; + cmd->attr_mask = srq_attr_mask; + cmd->max_wr = srq_attr->max_wr; + cmd->srq_limit = srq_attr->srq_limit; + + return uv_modify_srq(srq->context->uv_fid, cmd, cmd_size); +} + +int ibv_cmd_query_srq(struct ibv_srq *srq, struct ibv_srq_attr 
*srq_attr, + struct ibv_query_srq *cmd, size_t cmd_size) +{ + struct ibv_query_srq_resp resp; + int ret; + + cmd->srq_handle = srq->handle; + cmd->reserved = 0; + + ret = uv_query_srq(srq->context->uv_fid, cmd, cmd_size, &resp, sizeof resp); + if (ret) + return ret; + + srq_attr->max_wr = resp.max_wr; + srq_attr->max_sge = resp.max_sge; + srq_attr->srq_limit = resp.srq_limit; + + return 0; +} + +int ibv_cmd_destroy_srq(struct ibv_srq *srq) +{ + struct ibv_destroy_srq cmd; + struct ibv_destroy_srq_resp resp; + int ret; + + cmd.srq_handle = srq->handle; + cmd.reserved = 0; + + ret = uv_destroy_srq(srq->context->uv_fid, &cmd, sizeof cmd, &resp, sizeof resp); + if (ret) + return ret; + + pthread_mutex_lock(&srq->mutex); + while (srq->events_completed != resp.events_reported) + pthread_cond_wait(&srq->cond, &srq->mutex); + pthread_mutex_unlock(&srq->mutex); + + return 0; +} + +int ibv_cmd_create_qp(struct ibv_pd *pd, + struct ibv_qp *qp, struct ibv_qp_init_attr *attr, + struct ibv_create_qp *cmd, size_t cmd_size, + struct ibv_create_qp_resp *resp, size_t resp_size) +{ + int ret; + + cmd->user_handle = (uintptr_t) qp; + cmd->pd_handle = pd->handle; + cmd->send_cq_handle = attr->send_cq->handle; + cmd->recv_cq_handle = attr->recv_cq->handle; + cmd->srq_handle = attr->srq ? attr->srq->handle : 0; + cmd->max_send_wr = attr->cap.max_send_wr; + cmd->max_recv_wr = attr->cap.max_recv_wr; + cmd->max_send_sge = attr->cap.max_send_sge; + cmd->max_recv_sge = attr->cap.max_recv_sge; + cmd->max_inline_data = attr->cap.max_inline_data; + cmd->sq_sig_all = attr->sq_sig_all; + cmd->qp_type = attr->qp_type; + cmd->is_srq = !!attr->srq; + cmd->reserved = 0; + + ret = uv_create_qp(pd->context->uv_fid, cmd, cmd_size, resp, resp_size); + if (ret) + return ret; + + qp->handle = resp->qp_handle; + qp->qp_num = resp->qpn; + qp->context = pd->context; + + attr->cap.max_recv_sge = resp->max_recv_sge; + attr->cap.max_send_sge = resp->max_send_sge; + attr->cap.max_recv_wr = resp->max_recv_wr; + attr->cap.max_send_wr = resp->max_send_wr; + attr->cap.max_inline_data = resp->max_inline_data; + + return 0; +} + +int ibv_cmd_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, + int attr_mask, + struct ibv_qp_init_attr *init_attr, + struct ibv_query_qp *cmd, size_t cmd_size) +{ + struct ibv_query_qp_resp resp; + int ret; + + cmd->qp_handle = qp->handle; + cmd->attr_mask = attr_mask; + + ret = uv_query_qp(qp->context->uv_fid, cmd, cmd_size, &resp, sizeof resp); + if (ret) + return ret; + + attr->qkey = resp.qkey; + attr->rq_psn = resp.rq_psn; + attr->sq_psn = resp.sq_psn; + attr->dest_qp_num = resp.dest_qp_num; + attr->qp_access_flags = resp.qp_access_flags; + attr->pkey_index = resp.pkey_index; + attr->alt_pkey_index = resp.alt_pkey_index; + attr->qp_state = resp.qp_state; + attr->cur_qp_state = resp.cur_qp_state; + attr->path_mtu = resp.path_mtu; + attr->path_mig_state = resp.path_mig_state; + attr->sq_draining = resp.sq_draining; + attr->max_rd_atomic = resp.max_rd_atomic; + attr->max_dest_rd_atomic = resp.max_dest_rd_atomic; + attr->min_rnr_timer = resp.min_rnr_timer; + attr->port_num = resp.port_num; + attr->timeout = resp.timeout; + attr->retry_cnt = resp.retry_cnt; + attr->rnr_retry = resp.rnr_retry; + attr->alt_port_num = resp.alt_port_num; + attr->alt_timeout = resp.alt_timeout; + attr->cap.max_send_wr = resp.max_send_wr; + attr->cap.max_recv_wr = resp.max_recv_wr; + attr->cap.max_send_sge = resp.max_send_sge; + attr->cap.max_recv_sge = resp.max_recv_sge; + attr->cap.max_inline_data = resp.max_inline_data; + + 
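+ /* Copy the primary and alternate path address-vector attributes returned by the kernel into the caller's ah_attr and alt_ah_attr. */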
memcpy(attr->ah_attr.grh.dgid.raw, resp.dest.dgid, 16); + attr->ah_attr.grh.flow_label = resp.dest.flow_label; + attr->ah_attr.dlid = resp.dest.dlid; + attr->ah_attr.grh.sgid_index = resp.dest.sgid_index; + attr->ah_attr.grh.hop_limit = resp.dest.hop_limit; + attr->ah_attr.grh.traffic_class = resp.dest.traffic_class; + attr->ah_attr.sl = resp.dest.sl; + attr->ah_attr.src_path_bits = resp.dest.src_path_bits; + attr->ah_attr.static_rate = resp.dest.static_rate; + attr->ah_attr.is_global = resp.dest.is_global; + attr->ah_attr.port_num = resp.dest.port_num; + + memcpy(attr->alt_ah_attr.grh.dgid.raw, resp.alt_dest.dgid, 16); + attr->alt_ah_attr.grh.flow_label = resp.alt_dest.flow_label; + attr->alt_ah_attr.dlid = resp.alt_dest.dlid; + attr->alt_ah_attr.grh.sgid_index = resp.alt_dest.sgid_index; + attr->alt_ah_attr.grh.hop_limit = resp.alt_dest.hop_limit; + attr->alt_ah_attr.grh.traffic_class = resp.alt_dest.traffic_class; + attr->alt_ah_attr.sl = resp.alt_dest.sl; + attr->alt_ah_attr.src_path_bits = resp.alt_dest.src_path_bits; + attr->alt_ah_attr.static_rate = resp.alt_dest.static_rate; + attr->alt_ah_attr.is_global = resp.alt_dest.is_global; + attr->alt_ah_attr.port_num = resp.alt_dest.port_num; + + init_attr->qp_context = qp->qp_context; + init_attr->send_cq = qp->send_cq; + init_attr->recv_cq = qp->recv_cq; + init_attr->srq = qp->srq; + init_attr->qp_type = qp->qp_type; + init_attr->cap.max_send_wr = resp.max_send_wr; + init_attr->cap.max_recv_wr = resp.max_recv_wr; + init_attr->cap.max_send_sge = resp.max_send_sge; + init_attr->cap.max_recv_sge = resp.max_recv_sge; + init_attr->cap.max_inline_data = resp.max_inline_data; + init_attr->sq_sig_all = resp.sq_sig_all; + + return 0; +} + +int ibv_cmd_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, + int attr_mask, + struct ibv_modify_qp *cmd, size_t cmd_size) +{ + cmd->qp_handle = qp->handle; + cmd->attr_mask = attr_mask; + cmd->qkey = attr->qkey; + cmd->rq_psn = attr->rq_psn; + cmd->sq_psn = attr->sq_psn; + cmd->dest_qp_num = attr->dest_qp_num; + cmd->qp_access_flags = attr->qp_access_flags; + cmd->pkey_index = attr->pkey_index; + cmd->alt_pkey_index = attr->alt_pkey_index; + cmd->qp_state = attr->qp_state; + cmd->cur_qp_state = attr->cur_qp_state; + cmd->path_mtu = attr->path_mtu; + cmd->path_mig_state = attr->path_mig_state; + cmd->en_sqd_async_notify = attr->en_sqd_async_notify; + cmd->max_rd_atomic = attr->max_rd_atomic; + cmd->max_dest_rd_atomic = attr->max_dest_rd_atomic; + cmd->min_rnr_timer = attr->min_rnr_timer; + cmd->port_num = attr->port_num; + cmd->timeout = attr->timeout; + cmd->retry_cnt = attr->retry_cnt; + cmd->rnr_retry = attr->rnr_retry; + cmd->alt_port_num = attr->alt_port_num; + cmd->alt_timeout = attr->alt_timeout; + + memcpy(cmd->dest.dgid, attr->ah_attr.grh.dgid.raw, 16); + cmd->dest.flow_label = attr->ah_attr.grh.flow_label; + cmd->dest.dlid = attr->ah_attr.dlid; + cmd->dest.reserved = 0; + cmd->dest.sgid_index = attr->ah_attr.grh.sgid_index; + cmd->dest.hop_limit = attr->ah_attr.grh.hop_limit; + cmd->dest.traffic_class = attr->ah_attr.grh.traffic_class; + cmd->dest.sl = attr->ah_attr.sl; + cmd->dest.src_path_bits = attr->ah_attr.src_path_bits; + cmd->dest.static_rate = attr->ah_attr.static_rate; + cmd->dest.is_global = attr->ah_attr.is_global; + cmd->dest.port_num = attr->ah_attr.port_num; + + memcpy(cmd->alt_dest.dgid, attr->alt_ah_attr.grh.dgid.raw, 16); + cmd->alt_dest.flow_label = attr->alt_ah_attr.grh.flow_label; + cmd->alt_dest.dlid = attr->alt_ah_attr.dlid; + cmd->alt_dest.reserved = 0; + 
cmd->alt_dest.sgid_index = attr->alt_ah_attr.grh.sgid_index; + cmd->alt_dest.hop_limit = attr->alt_ah_attr.grh.hop_limit; + cmd->alt_dest.traffic_class = attr->alt_ah_attr.grh.traffic_class; + cmd->alt_dest.sl = attr->alt_ah_attr.sl; + cmd->alt_dest.src_path_bits = attr->alt_ah_attr.src_path_bits; + cmd->alt_dest.static_rate = attr->alt_ah_attr.static_rate; + cmd->alt_dest.is_global = attr->alt_ah_attr.is_global; + cmd->alt_dest.port_num = attr->alt_ah_attr.port_num; + + cmd->reserved[0] = cmd->reserved[1] = 0; + + return uv_modify_qp(qp->context->uv_fid, cmd, cmd_size); +} + +int ibv_cmd_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr, + struct ibv_send_wr **bad_wr) +{ + struct ibv_post_send *cmd; + struct ibv_post_send_resp resp; + struct ibv_send_wr *i; + struct ibv_kern_send_wr *n, *tmp; + struct ibv_sge *s; + unsigned wr_count = 0; + unsigned sge_count = 0; + int cmd_size; + int ret; + + for (i = wr; i; i = i->next) { + wr_count++; + sge_count += i->num_sge; + } + + cmd_size = sizeof *cmd + wr_count * sizeof *n + sge_count * sizeof *s; + cmd = alloca(cmd_size); + + cmd->qp_handle = ibqp->handle; + cmd->wr_count = wr_count; + cmd->sge_count = sge_count; + cmd->wqe_size = sizeof *n; + + n = (struct ibv_kern_send_wr *) ((void *) cmd + sizeof *cmd); + s = (struct ibv_sge *) (n + wr_count); + + tmp = n; + for (i = wr; i; i = i->next) { + tmp->wr_id = i->wr_id; + tmp->num_sge = i->num_sge; + tmp->opcode = i->opcode; + tmp->send_flags = i->send_flags; + tmp->imm_data = i->imm_data; + if (ibqp->qp_type == IBV_QPT_UD) { + tmp->wr.ud.ah = i->wr.ud.ah->handle; + tmp->wr.ud.remote_qpn = i->wr.ud.remote_qpn; + tmp->wr.ud.remote_qkey = i->wr.ud.remote_qkey; + } else { + switch (i->opcode) { + case IBV_WR_RDMA_WRITE: + case IBV_WR_RDMA_WRITE_WITH_IMM: + case IBV_WR_RDMA_READ: + tmp->wr.rdma.remote_addr = + i->wr.rdma.remote_addr; + tmp->wr.rdma.rkey = i->wr.rdma.rkey; + break; + case IBV_WR_ATOMIC_CMP_AND_SWP: + case IBV_WR_ATOMIC_FETCH_AND_ADD: + tmp->wr.atomic.remote_addr = + i->wr.atomic.remote_addr; + tmp->wr.atomic.compare_add = + i->wr.atomic.compare_add; + tmp->wr.atomic.swap = i->wr.atomic.swap; + tmp->wr.atomic.rkey = i->wr.atomic.rkey; + break; + default: + break; + } + } + + if (tmp->num_sge) { + memcpy(s, i->sg_list, tmp->num_sge * sizeof *s); + s += tmp->num_sge; + } + + tmp++; + } + + resp.bad_wr = 0; + ret = uv_post_send(ibqp->context->uv_fid, cmd, cmd_size, &resp, sizeof resp); + + wr_count = resp.bad_wr; + if (wr_count) { + i = wr; + while (--wr_count) + i = i->next; + *bad_wr = i; + } else if (ret) + *bad_wr = wr; + + return ret; +} + +int ibv_cmd_post_recv(struct ibv_qp *ibqp, struct ibv_recv_wr *wr, + struct ibv_recv_wr **bad_wr) +{ + struct ibv_post_recv *cmd; + struct ibv_post_recv_resp resp; + struct ibv_recv_wr *i; + struct ibv_kern_recv_wr *n, *tmp; + struct ibv_sge *s; + unsigned wr_count = 0; + unsigned sge_count = 0; + int cmd_size; + int ret; + + for (i = wr; i; i = i->next) { + wr_count++; + sge_count += i->num_sge; + } + + cmd_size = sizeof *cmd + wr_count * sizeof *n + sge_count * sizeof *s; + cmd = alloca(cmd_size); + + cmd->qp_handle = ibqp->handle; + cmd->wr_count = wr_count; + cmd->sge_count = sge_count; + cmd->wqe_size = sizeof *n; + + n = (struct ibv_kern_recv_wr *) ((void *) cmd + sizeof *cmd); + s = (struct ibv_sge *) (n + wr_count); + + tmp = n; + for (i = wr; i; i = i->next) { + tmp->wr_id = i->wr_id; + tmp->num_sge = i->num_sge; + + if (tmp->num_sge) { + memcpy(s, i->sg_list, tmp->num_sge * sizeof *s); + s += tmp->num_sge; + } + + tmp++; + } + + 
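+ /* All work requests and scatter-gather entries are now marshalled; issue the command. On failure, resp.bad_wr identifies the request that could not be posted, and the loop below maps it back to a pointer for the caller. */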
resp.bad_wr = 0; + ret = uv_post_recv(ibqp->context->uv_fid, cmd, cmd_size, &resp, sizeof resp); + + wr_count = resp.bad_wr; + if (wr_count) { + i = wr; + while (--wr_count) + i = i->next; + *bad_wr = i; + } else if (ret) + *bad_wr = wr; + + return ret; +} + +int ibv_cmd_post_srq_recv(struct ibv_srq *srq, struct ibv_recv_wr *wr, + struct ibv_recv_wr **bad_wr) +{ + struct ibv_post_srq_recv *cmd; + struct ibv_post_srq_recv_resp resp; + struct ibv_recv_wr *i; + struct ibv_kern_recv_wr *n, *tmp; + struct ibv_sge *s; + unsigned wr_count = 0; + unsigned sge_count = 0; + int cmd_size; + int ret; + + for (i = wr; i; i = i->next) { + wr_count++; + sge_count += i->num_sge; + } + + cmd_size = sizeof *cmd + wr_count * sizeof *n + sge_count * sizeof *s; + cmd = alloca(cmd_size); + + cmd->srq_handle = srq->handle; + cmd->wr_count = wr_count; + cmd->sge_count = sge_count; + cmd->wqe_size = sizeof *n; + + n = (struct ibv_kern_recv_wr *) ((void *) cmd + sizeof *cmd); + s = (struct ibv_sge *) (n + wr_count); + + tmp = n; + for (i = wr; i; i = i->next) { + tmp->wr_id = i->wr_id; + tmp->num_sge = i->num_sge; + + if (tmp->num_sge) { + memcpy(s, i->sg_list, tmp->num_sge * sizeof *s); + s += tmp->num_sge; + } + + tmp++; + } + + resp.bad_wr = 0; + ret = uv_post_srq_recv(srq->context->uv_fid, cmd, cmd_size, &resp, sizeof resp); + + wr_count = resp.bad_wr; + if (wr_count) { + i = wr; + while (--wr_count) + i = i->next; + *bad_wr = i; + } else if (ret) + *bad_wr = wr; + + return ret; +} + +int ibv_cmd_create_ah(struct ibv_pd *pd, struct ibv_ah *ah, + struct ibv_ah_attr *attr) +{ + struct ibv_create_ah cmd; + struct ibv_create_ah_resp resp; + int ret; + + cmd.user_handle = (uintptr_t) ah; + cmd.pd_handle = pd->handle; + cmd.attr.dlid = attr->dlid; + cmd.attr.sl = attr->sl; + cmd.attr.src_path_bits = attr->src_path_bits; + cmd.attr.static_rate = attr->static_rate; + cmd.attr.is_global = attr->is_global; + cmd.attr.port_num = attr->port_num; + cmd.attr.grh.flow_label = attr->grh.flow_label; + cmd.attr.grh.sgid_index = attr->grh.sgid_index; + cmd.attr.grh.hop_limit = attr->grh.hop_limit; + cmd.attr.grh.traffic_class = attr->grh.traffic_class; + memcpy(cmd.attr.grh.dgid, attr->grh.dgid.raw, 16); + + ret = uv_create_ah(pd->context->uv_fid, &cmd, sizeof cmd, &resp, sizeof resp); + if (ret) + return ret; + + ah->handle = resp.handle; + ah->context = pd->context; + + return 0; +} + +int ibv_cmd_destroy_ah(struct ibv_ah *ah) +{ + struct ibv_destroy_ah cmd; + + cmd.ah_handle = ah->handle; + return uv_destroy_ah(ah->context->uv_fid, &cmd, sizeof cmd); +} + +int ibv_cmd_destroy_qp(struct ibv_qp *qp) +{ + struct ibv_destroy_qp cmd; + struct ibv_destroy_qp_resp resp; + int ret; + + cmd.qp_handle = qp->handle; + cmd.reserved = 0; + + ret = uv_destroy_qp(qp->context->uv_fid, &cmd, sizeof cmd, &resp, sizeof resp); + if (ret) + return ret; + + pthread_mutex_lock(&qp->mutex); + while (qp->events_completed != resp.events_reported) + pthread_cond_wait(&qp->cond, &qp->mutex); + pthread_mutex_unlock(&qp->mutex); + + return 0; +} + +int ibv_cmd_attach_mcast(struct ibv_qp *qp, const union ibv_gid *gid, uint16_t lid) +{ + struct ibv_attach_mcast cmd; + + memcpy(cmd.gid, gid->raw, sizeof cmd.gid); + cmd.qp_handle = qp->handle; + cmd.mlid = lid; + cmd.reserved = 0; + + return uv_attach_mcast(qp->context->uv_fid, &cmd, sizeof cmd); +} + +int ibv_cmd_detach_mcast(struct ibv_qp *qp, const union ibv_gid *gid, uint16_t lid) +{ + struct ibv_detach_mcast cmd; + + memcpy(cmd.gid, gid->raw, sizeof cmd.gid); + cmd.qp_handle = qp->handle; + cmd.mlid = 
lid; + cmd.reserved = 0; + + return uv_detach_mcast(qp->context->uv_fid, &cmd, sizeof cmd); +} diff --git a/prov/ibverbs/src/device.c b/prov/ibverbs/src/device.c new file mode 100644 index 00000000000..429a25e08ff --- /dev/null +++ b/prov/ibverbs/src/device.c @@ -0,0 +1,255 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#if HAVE_CONFIG_H +# include <config.h> +#endif /* HAVE_CONFIG_H */ + +#include <stdio.h> +#include <netinet/in.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <unistd.h> +#include <stdlib.h> +#include <alloca.h> +#include <errno.h> + +#include <rdma/fi_uverbs.h> +#include <fi.h> + +#include "ibverbs.h" + +static pthread_once_t device_list_once = PTHREAD_ONCE_INIT; +static int num_devices; +static struct ibv_device **device_list; + +static void count_devices(void) +{ + num_devices = ibverbs_init(&device_list); +} + +struct ibv_device **ibv_get_device_list(int *num) +{ + struct ibv_device **l; + int i; + + if (num) + *num = 0; + + pthread_once(&device_list_once, count_devices); + + if (num_devices < 0) { + errno = -num_devices; + return NULL; + } + + l = calloc(num_devices + 1, sizeof (struct ibv_device *)); + if (!l) { + errno = ENOMEM; + return NULL; + } + + for (i = 0; i < num_devices; ++i) + l[i] = device_list[i]; + if (num) + *num = num_devices; + + return l; +} + +void ibv_free_device_list(struct ibv_device **list) +{ + free(list); +} + +const char *ibv_get_device_name(struct ibv_device *device) +{ + return device->name; +} + +uint64_t ibv_get_device_guid(struct ibv_device *device) +{ + char attr[24]; + uint64_t guid = 0; + uint16_t parts[4]; + int i; + + if (fi_read_file(device->ibdev_path, "node_guid", attr, sizeof attr) < 0) + return 0; + + if (sscanf(attr, "%hx:%hx:%hx:%hx", + parts, parts + 1, parts + 2, parts + 3) != 4) + return 0; + + for (i = 0; i < 4; ++i) + guid = (guid << 16) | parts[i]; + + return htonll(guid); +} + +struct ibv_context *ibv_open_device(struct ibv_device *device) +{ + struct ibv_context *context; + struct fid_uverbs *uv; + fid_t uv_fid; + char *uv_name; + int ret; + + if (asprintf(&uv_name, 
"uverbs/%s", device->dev_name) < 0) + return NULL; + + ret = fi_open(uv_name, NULL, 0, &uv_fid, NULL); + free(uv_name); + if (ret) + return NULL; + + uv = (struct fid_uverbs *) uv_fid; + context = device->ops.alloc_context(device, uv_fid); + if (!context) { + fi_close(uv_fid); + return NULL; + } + + context->device = device; + context->cmd_fd = uv->fd; + uv_fid->context = context; + pthread_mutex_init(&context->mutex, NULL); + return context; +} + +int ibv_close_device(struct ibv_context *context) +{ + int async_fd = context->async_fd; + fid_t uv_fid; + + context->device->ops.free_context(context); + uv_fid = context->uv_fid; + close(async_fd); + fi_close(uv_fid); + return 0; +} + +int ibv_get_async_event(struct ibv_context *context, + struct ibv_async_event *event) +{ + struct ibv_kern_async_event ev; + + if (read(context->async_fd, &ev, sizeof ev) != sizeof ev) + return -1; + + event->event_type = ev.event_type; + + switch (event->event_type) { + case IBV_EVENT_CQ_ERR: + event->element.cq = (void *) (uintptr_t) ev.element; + break; + + case IBV_EVENT_QP_FATAL: + case IBV_EVENT_QP_REQ_ERR: + case IBV_EVENT_QP_ACCESS_ERR: + case IBV_EVENT_COMM_EST: + case IBV_EVENT_SQ_DRAINED: + case IBV_EVENT_PATH_MIG: + case IBV_EVENT_PATH_MIG_ERR: + case IBV_EVENT_QP_LAST_WQE_REACHED: + event->element.qp = (void *) (uintptr_t) ev.element; + break; + + case IBV_EVENT_SRQ_ERR: + case IBV_EVENT_SRQ_LIMIT_REACHED: + event->element.srq = (void *) (uintptr_t) ev.element; + break; + + default: + event->element.port_num = ev.element; + break; + } + + if (context->ops.async_event) + context->ops.async_event(event); + + return 0; +} + +void ibv_ack_async_event(struct ibv_async_event *event) +{ + switch (event->event_type) { + case IBV_EVENT_CQ_ERR: + { + struct ibv_cq *cq = event->element.cq; + + pthread_mutex_lock(&cq->mutex); + ++cq->async_events_completed; + pthread_cond_signal(&cq->cond); + pthread_mutex_unlock(&cq->mutex); + + return; + } + + case IBV_EVENT_QP_FATAL: + case IBV_EVENT_QP_REQ_ERR: + case IBV_EVENT_QP_ACCESS_ERR: + case IBV_EVENT_COMM_EST: + case IBV_EVENT_SQ_DRAINED: + case IBV_EVENT_PATH_MIG: + case IBV_EVENT_PATH_MIG_ERR: + case IBV_EVENT_QP_LAST_WQE_REACHED: + { + struct ibv_qp *qp = event->element.qp; + + pthread_mutex_lock(&qp->mutex); + ++qp->events_completed; + pthread_cond_signal(&qp->cond); + pthread_mutex_unlock(&qp->mutex); + + return; + } + + case IBV_EVENT_SRQ_ERR: + case IBV_EVENT_SRQ_LIMIT_REACHED: + { + struct ibv_srq *srq = event->element.srq; + + pthread_mutex_lock(&srq->mutex); + ++srq->events_completed; + pthread_cond_signal(&srq->cond); + pthread_mutex_unlock(&srq->mutex); + + return; + } + + default: + return; + } +} diff --git a/prov/ibverbs/src/enum_strs.c b/prov/ibverbs/src/enum_strs.c new file mode 100644 index 00000000000..54d71a6e209 --- /dev/null +++ b/prov/ibverbs/src/enum_strs.c @@ -0,0 +1,128 @@ +/* + * Copyright (c) 2008 Lawrence Livermore National Laboratory + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <infiniband/verbs.h> + +const char *ibv_node_type_str(enum ibv_node_type node_type) +{ + static const char *const node_type_str[] = { + [IBV_NODE_CA] = "InfiniBand channel adapter", + [IBV_NODE_SWITCH] = "InfiniBand switch", + [IBV_NODE_ROUTER] = "InfiniBand router", + [IBV_NODE_RNIC] = "iWARP NIC" + }; + + if (node_type < IBV_NODE_CA || node_type > IBV_NODE_RNIC) + return "unknown"; + + return node_type_str[node_type]; +} + +const char *ibv_port_state_str(enum ibv_port_state port_state) +{ + static const char *const port_state_str[] = { + [IBV_PORT_NOP] = "no state change (NOP)", + [IBV_PORT_DOWN] = "down", + [IBV_PORT_INIT] = "init", + [IBV_PORT_ARMED] = "armed", + [IBV_PORT_ACTIVE] = "active", + [IBV_PORT_ACTIVE_DEFER] = "active defer" + }; + + if (port_state < IBV_PORT_NOP || port_state > IBV_PORT_ACTIVE_DEFER) + return "unknown"; + + return port_state_str[port_state]; +} + +const char *ibv_event_type_str(enum ibv_event_type event) +{ + static const char *const event_type_str[] = { + [IBV_EVENT_CQ_ERR] = "CQ error", + [IBV_EVENT_QP_FATAL] = "local work queue catastrophic error", + [IBV_EVENT_QP_REQ_ERR] = "invalid request local work queue error", + [IBV_EVENT_QP_ACCESS_ERR] = "local access violation work queue error", + [IBV_EVENT_COMM_EST] = "communication established", + [IBV_EVENT_SQ_DRAINED] = "send queue drained", + [IBV_EVENT_PATH_MIG] = "path migrated", + [IBV_EVENT_PATH_MIG_ERR] = "path migration request error", + [IBV_EVENT_DEVICE_FATAL] = "local catastrophic error", + [IBV_EVENT_PORT_ACTIVE] = "port active", + [IBV_EVENT_PORT_ERR] = "port error", + [IBV_EVENT_LID_CHANGE] = "LID change", + [IBV_EVENT_PKEY_CHANGE] = "P_Key change", + [IBV_EVENT_SM_CHANGE] = "SM change", + [IBV_EVENT_SRQ_ERR] = "SRQ catastrophic error", + [IBV_EVENT_SRQ_LIMIT_REACHED] = "SRQ limit reached", + [IBV_EVENT_QP_LAST_WQE_REACHED] = "last WQE reached", + [IBV_EVENT_CLIENT_REREGISTER] = "client reregistration", + [IBV_EVENT_GID_CHANGE] = "GID table change" + }; + + if (event < IBV_EVENT_CQ_ERR || event > IBV_EVENT_GID_CHANGE) + return "unknown"; + + return event_type_str[event]; +} + +const char *ibv_wc_status_str(enum ibv_wc_status status) +{ + static const char *const wc_status_str[] = { + [IBV_WC_SUCCESS] = "success", + [IBV_WC_LOC_LEN_ERR] = "local length error", + [IBV_WC_LOC_QP_OP_ERR] = "local QP operation error", + [IBV_WC_LOC_EEC_OP_ERR] = "local EE context operation error", + [IBV_WC_LOC_PROT_ERR] = "local protection error", + [IBV_WC_WR_FLUSH_ERR] = "Work Request Flushed Error", + [IBV_WC_MW_BIND_ERR] = "memory management operation error", + [IBV_WC_BAD_RESP_ERR] = "bad response error", + [IBV_WC_LOC_ACCESS_ERR] = "local access error", + [IBV_WC_REM_INV_REQ_ERR] = "remote invalid request error", + [IBV_WC_REM_ACCESS_ERR] = "remote access error", + [IBV_WC_REM_OP_ERR] 
= "remote operation error", + [IBV_WC_RETRY_EXC_ERR] = "transport retry counter exceeded", + [IBV_WC_RNR_RETRY_EXC_ERR] = "RNR retry counter exceeded", + [IBV_WC_LOC_RDD_VIOL_ERR] = "local RDD violation error", + [IBV_WC_REM_INV_RD_REQ_ERR] = "remote invalid RD request", + [IBV_WC_REM_ABORT_ERR] = "aborted error", + [IBV_WC_INV_EECN_ERR] = "invalid EE context number", + [IBV_WC_INV_EEC_STATE_ERR] = "invalid EE context state", + [IBV_WC_FATAL_ERR] = "fatal error", + [IBV_WC_RESP_TIMEOUT_ERR] = "response timeout error", + [IBV_WC_GENERAL_ERR] = "general error" + }; + + if (status < IBV_WC_SUCCESS || status > IBV_WC_GENERAL_ERR) + return "unknown"; + + return wc_status_str[status]; +} diff --git a/prov/ibverbs/src/fi_verbs.c b/prov/ibverbs/src/fi_verbs.c new file mode 100644 index 00000000000..f9286f71ab2 --- /dev/null +++ b/prov/ibverbs/src/fi_verbs.c @@ -0,0 +1,1277 @@ +/* + * Copyright (c) 2013 Intel Corporation, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#if HAVE_CONFIG_H +# include <config.h> +#endif /* HAVE_CONFIG_H */ + +#include <errno.h> +#include <fcntl.h> +#include <poll.h> +#include <stdlib.h> +#include <string.h> +#include <stdio.h> + +#include <infiniband/verbs.h> +#include <rdma/rdma_cma.h> + +#include <rdma/fabric.h> +#include <rdma/fi_cm.h> +#include <rdma/fi_domain.h> +#include <rdma/fi_prov.h> +#include <rdma/fi_socket.h> +#include <rdma/fi_rdma.h> + +#include "ibverbs.h" + + +struct ibv_domain { + struct fid_domain domain_fid; + struct ibv_context *verbs; + struct ibv_pd *pd; +}; + +struct ibv_ec { + struct fid_ec fid; + enum fi_ec_domain ec_domain; + struct ibv_domain *domain; +}; + +struct ibv_ec_comp { + struct ibv_ec ec; + struct ibv_comp_channel *channel; + struct ibv_cq *cq; + uint64_t flags; + struct ibv_wc wc; +}; + +struct ibv_ec_cm { + struct ibv_ec ec; + struct rdma_event_channel *channel; + uint64_t flags; + struct fi_ec_err_entry err; +}; + +struct ibv_mem_desc { + struct fid_mr mr_fid; + struct ibv_mr *mr; + struct ibv_domain *domain; +}; + +struct ibv_msg_socket { + struct fid_socket socket_fid; + struct rdma_cm_id *id; + struct ibv_ec_cm *cm_ec; + struct ibv_ec_comp *rec; + struct ibv_ec_comp *sec; + uint32_t inline_size; +}; + +static char def_send_wr[16] = "384"; +static char def_recv_wr[16] = "384"; +static char def_send_sge[16] = "4"; +static char def_recv_sge[16] = "4"; +static char def_inline_data[16] = "64"; + +static int ibv_check_domain(const char *name) +{ + return (!name || !strncmp(name, IBV_PREFIX "/", sizeof(IBV_PREFIX))) ? + 0 : -ENODATA; +} + +/* + * TODO: this is not the full set of checks which are needed + */ +static int ibv_fi_to_rai(struct fi_info *fi, struct rdma_addrinfo *rai) +{ + memset(rai, 0, sizeof *rai); + if (fi->flags & FI_PASSIVE) + rai->ai_flags = RAI_PASSIVE; + if (fi->flags & FI_NUMERICHOST) + rai->ai_flags |= RAI_NUMERICHOST; +// if (fi->flags & FI_FAMILY) +// rai->ai_flags |= RAI_FAMILY; + +// rai->ai_family = fi->sa_family; + if (fi->type == FID_MSG || fi->protocol & FI_PROTO_RDMA || + ((fi->protocol & FI_PROTO_MASK) == FI_PROTO_IB_RC) || + ((fi->protocol & FI_PROTO_MASK) == FI_PROTO_IWARP)) { + rai->ai_qp_type = IBV_QPT_RC; + rai->ai_port_space = RDMA_PS_TCP; + } else if (fi->type == FID_DGRAM || + ((fi->protocol & FI_PROTO_MASK) == FI_PROTO_IB_UD)) { + rai->ai_qp_type = IBV_QPT_UD; + rai->ai_port_space = RDMA_PS_UDP; + } + + if (fi->src_addrlen) { + if (!(rai->ai_src_addr = malloc(fi->src_addrlen))) + return ENOMEM; + memcpy(rai->ai_src_addr, fi->src_addr, fi->src_addrlen); + rai->ai_src_len = fi->src_addrlen; + } + if (fi->dst_addrlen) { + if (!(rai->ai_dst_addr = malloc(fi->dst_addrlen))) + return ENOMEM; + memcpy(rai->ai_dst_addr, fi->dst_addr, fi->dst_addrlen); + rai->ai_dst_len = fi->dst_addrlen; + } +// if (fi->src_canonname) +// rai->ai_src_canonname = strdup(fi->src_canonname); +// if (fi->dst_canonname) +// rai->ai_dst_canonname = strdup(fi->dst_canonname); + + return 0; +} + + static int ibv_rai_to_fi(struct rdma_addrinfo *rai, struct fi_info *fi) + { + memset(fi, 0, sizeof *fi); + if (rai->ai_flags & RAI_PASSIVE) + fi->flags = RAI_PASSIVE; + + // fi->sa_family = rai->ai_family; + if (rai->ai_qp_type == IBV_QPT_RC || rai->ai_port_space == RDMA_PS_TCP) { + fi->protocol = FI_PROTO_MSG | FI_PROTO_RDMA; + fi->type = FID_MSG; + } else if (rai->ai_qp_type == IBV_QPT_UD || + rai->ai_port_space == RDMA_PS_UDP) { + fi->protocol = FI_PROTO_IB_UD | FI_PROTO_MSG; + fi->type = FID_DGRAM; + } + + if (rai->ai_src_len) { + if (!(fi->src_addr = 
malloc(rai->ai_src_len))) + return ENOMEM; + memcpy(fi->src_addr, rai->ai_src_addr, rai->ai_src_len); + fi->src_addrlen = rai->ai_src_len; + } + if (rai->ai_dst_len) { + if (!(fi->dst_addr = malloc(rai->ai_dst_len))) + return ENOMEM; + memcpy(fi->dst_addr, rai->ai_dst_addr, rai->ai_dst_len); + fi->dst_addrlen = rai->ai_dst_len; + } + // if (rai->ai_src_canonname) + // fi->src_canonname = strdup(rai->ai_src_canonname); + // if (rai->ai_dst_canonname) + // fi->dst_canonname = strdup(rai->ai_dst_canonname); + + return 0; + } + +static int ibv_getinfo(char *node, char *service, struct fi_info *hints, + struct fi_info **info) +{ + struct rdma_addrinfo rai_hints, *rai; + struct fi_info *fi; + struct rdma_cm_id *id; + int ret; + + if (hints) { + ret = ibv_check_domain(hints->domain_name); + if (ret) + return ret; + + ret = ibv_fi_to_rai(hints, &rai_hints); + if (ret) + return ret; + + ret = rdma_getaddrinfo(node, service, &rai_hints, &rai); + } else { + ret = rdma_getaddrinfo(node, service, NULL, &rai); + } + if (ret) + return -errno; + + if (!(fi = malloc(sizeof *fi))) { + ret = ENOMEM; + goto err1; + } + + ret = ibv_rai_to_fi(rai, fi); + if (ret) + goto err2; + + ret = rdma_create_ep(&id, rai, NULL, NULL); + if (ret) { + ret = -errno; + goto err2; + } + rdma_freeaddrinfo(rai); + + if (!fi->src_addr) { + fi->src_addrlen = rdma_addrlen(rdma_get_local_addr(id)); + if (!(fi->src_addr = malloc(fi->src_addrlen))) { + ret = -ENOMEM; + goto err3; + } + memcpy(fi->src_addr, rdma_get_local_addr(id), fi->src_addrlen); + } + + if (id->verbs) { + if (!(fi->domain_name = malloc(FI_NAME_MAX))) { + ret = -ENOMEM; + goto err3; + } + strcpy(fi->domain_name, IBV_PREFIX "/"); + strcpy(&fi->domain_name[sizeof(IBV_PREFIX)], id->verbs->device->name); + } else { + fi->domain_name = strdup(IBV_PREFIX "/" FI_UNBOUND_NAME); + } + + fi->data = id; + fi->datalen = sizeof id; + *info = fi; + return 0; + +err3: + rdma_destroy_ep(id); +err2: + __fi_freeinfo(fi); +err1: + rdma_freeaddrinfo(rai); + return ret; +} + +static int ibv_freeinfo(struct fi_info *info) +{ + int ret; + + ret = ibv_check_domain(info->domain_name); + if (ret) + return ret; + + if (info->data) { + rdma_destroy_ep(info->data); + info->data = NULL; + } + __fi_freeinfo(info); + return 0; +} + +static int ibv_msg_socket_create_qp(struct ibv_msg_socket *sock) +{ + struct ibv_qp_init_attr attr; + + /* TODO: serialize access to string buffers */ + fi_read_file(FI_CONF_DIR, "def_send_wr", + def_send_wr, sizeof def_send_wr); + fi_read_file(FI_CONF_DIR, "def_recv_wr", + def_recv_wr, sizeof def_recv_wr); + fi_read_file(FI_CONF_DIR, "def_send_sge", + def_send_sge, sizeof def_send_sge); + fi_read_file(FI_CONF_DIR, "def_recv_sge", + def_recv_sge, sizeof def_recv_sge); + fi_read_file(FI_CONF_DIR, "def_inline_data", + def_inline_data, sizeof def_inline_data); + + attr.cap.max_send_wr = atoi(def_send_wr); + attr.cap.max_recv_wr = atoi(def_recv_wr); + attr.cap.max_send_sge = atoi(def_send_sge); + attr.cap.max_recv_sge = atoi(def_recv_sge); + attr.cap.max_inline_data = atoi(def_inline_data); + sock->inline_size = attr.cap.max_inline_data; + attr.qp_context = sock; + attr.send_cq = sock->sec->cq; + attr.recv_cq = sock->rec->cq; + attr.srq = NULL; + attr.qp_type = IBV_QPT_RC; + attr.sq_sig_all = 1; + + return rdma_create_qp(sock->id, sock->rec->ec.domain->pd, &attr) ? 
-errno : 0; +} + +static int ibv_msg_socket_bind(fid_t fid, struct fi_resource *fids, int nfids) +{ + struct ibv_msg_socket *sock; + struct ibv_ec *ec; + int i, ret; + + sock = container_of(fid, struct ibv_msg_socket, socket_fid.fid); + for (i = 0; i < nfids; i++) { + if (fids[i].fid->fclass != FID_CLASS_EC) + return -EINVAL; + + ec = container_of(fids[i].fid, struct ibv_ec, fid.fid); + if (fids[i].flags & FI_RECV) { + if (sock->rec) + return -EINVAL; + sock->rec = container_of(ec, struct ibv_ec_comp, ec); + } + if (fids[i].flags & FI_SEND) { + if (sock->sec) + return -EINVAL; + sock->sec = container_of(ec, struct ibv_ec_comp, ec); + } + if (ec->ec_domain == FI_EC_DOMAIN_CM) { + sock->cm_ec = container_of(ec, struct ibv_ec_cm, ec); + ret = rdma_migrate_id(sock->id, sock->cm_ec->channel); + if (ret) + return -errno; + } + } + + if (sock->sec && sock->rec && !sock->id->qp) { + ret = ibv_msg_socket_create_qp(sock); + if (ret) + return ret; + } + + return 0; +} + +static ssize_t ibv_msg_socket_recvmem(fid_t fid, void *buf, size_t len, + uint64_t mem_desc, void *context) +{ + struct ibv_msg_socket *sock; + struct ibv_recv_wr wr, *bad; + struct ibv_sge sge; + + sge.addr = (uintptr_t) buf; + sge.length = (uint32_t) len; + sge.lkey = (uint32_t) mem_desc; + + wr.wr_id = (uintptr_t) context; + wr.next = NULL; + wr.sg_list = &sge; + wr.num_sge = 1; + + sock = container_of(fid, struct ibv_msg_socket, socket_fid.fid); + return -ibv_post_recv(sock->id->qp, &wr, &bad); +} + +static ssize_t ibv_msg_socket_sendmem(fid_t fid, const void *buf, size_t len, + uint64_t mem_desc, void *context) +{ + struct ibv_msg_socket *sock; + struct ibv_send_wr wr, *bad; + struct ibv_sge sge; + + sock = container_of(fid, struct ibv_msg_socket, socket_fid.fid); + sge.addr = (uintptr_t) buf; + sge.length = (uint32_t) len; + sge.lkey = (uint32_t) mem_desc; + + wr.wr_id = (uintptr_t) context; + wr.next = NULL; + wr.sg_list = &sge; + wr.num_sge = 1; + wr.opcode = IBV_WR_SEND; + wr.send_flags = (len <= sock->inline_size) ? IBV_SEND_INLINE : 0; + + return -ibv_post_send(sock->id->qp, &wr, &bad); +} + +static struct fi_ops_msg ibv_msg_socket_msg_ops = { + .size = sizeof(struct fi_ops_msg), + .recvmem = ibv_msg_socket_recvmem, + .sendmem = ibv_msg_socket_sendmem, +}; + +static int ibv_msg_socket_rdma_writemem(fid_t fid, const void *buf, size_t len, + uint64_t mem_desc, uint64_t addr, be64_t tag, void *context) +{ + struct ibv_msg_socket *sock; + struct ibv_send_wr wr, *bad; + struct ibv_sge sge; + + sock = container_of(fid, struct ibv_msg_socket, socket_fid.fid); + sge.addr = (uintptr_t) buf; + sge.length = (uint32_t) len; + sge.lkey = (uint32_t) mem_desc; + + wr.wr_id = (uintptr_t) context; + wr.next = NULL; + wr.sg_list = &sge; + wr.num_sge = 1; + wr.opcode = IBV_WR_RDMA_WRITE; + wr.send_flags = (len <= sock->inline_size) ? 
IBV_SEND_INLINE : 0; + wr.wr.rdma.remote_addr = addr; + wr.wr.rdma.rkey = (uint32_t) tag; + + return -ibv_post_send(sock->id->qp, &wr, &bad); +} + +static int ibv_msg_socket_rdma_readmem(fid_t fid, void *buf, size_t len, + uint64_t mem_desc, uint64_t addr, be64_t tag, void *context) +{ + struct ibv_msg_socket *sock; + struct ibv_send_wr wr, *bad; + struct ibv_sge sge; + + sge.addr = (uintptr_t) buf; + sge.length = (uint32_t) len; + sge.lkey = (uint32_t) mem_desc; + + wr.wr_id = (uintptr_t) context; + wr.next = NULL; + wr.sg_list = &sge; + wr.num_sge = 1; + wr.opcode = IBV_WR_RDMA_READ; + wr.send_flags = 0; + wr.wr.rdma.remote_addr = addr; + wr.wr.rdma.rkey = (uint32_t) tag; + + sock = container_of(fid, struct ibv_msg_socket, socket_fid.fid); + return -ibv_post_send(sock->id->qp, &wr, &bad); +} + +static struct fi_ops_rdma ibv_msg_socket_rdma_ops = { + .size = sizeof(struct fi_ops_rdma), + .writemem = ibv_msg_socket_rdma_writemem, + .readmem = ibv_msg_socket_rdma_readmem +}; + +static int ibv_msg_socket_connect(fid_t fid, const void *param, size_t paramlen) +{ + struct ibv_msg_socket *sock; + struct rdma_conn_param conn_param; + + sock = container_of(fid, struct ibv_msg_socket, socket_fid.fid); + memset(&conn_param, 0, sizeof conn_param); + conn_param.private_data = param; + conn_param.private_data_len = paramlen; + conn_param.responder_resources = RDMA_MAX_RESP_RES; + conn_param.initiator_depth = RDMA_MAX_INIT_DEPTH; + conn_param.flow_control = 1; + conn_param.retry_count = 15; + conn_param.rnr_retry_count = 7; + + return rdma_connect(sock->id, &conn_param) ? -errno : 0; +} + +static int ibv_msg_socket_listen(fid_t fid) +{ + struct ibv_msg_socket *sock; + + sock = container_of(fid, struct ibv_msg_socket, socket_fid.fid); + return rdma_listen(sock->id, 0) ? -errno : 0; +} + +static int ibv_msg_socket_accept(fid_t fid, const void *param, size_t paramlen) +{ + struct ibv_msg_socket *sock; + struct rdma_conn_param conn_param; + + sock = container_of(fid, struct ibv_msg_socket, socket_fid.fid); + memset(&conn_param, 0, sizeof conn_param); + conn_param.private_data = param; + conn_param.private_data_len = paramlen; + conn_param.responder_resources = RDMA_MAX_RESP_RES; + conn_param.initiator_depth = RDMA_MAX_INIT_DEPTH; + conn_param.flow_control = 1; + conn_param.rnr_retry_count = 7; + + return rdma_accept(sock->id, &conn_param) ? -errno : 0; +} + +static int ibv_msg_socket_reject(fid_t fid, struct fi_info *info, + const void *param, size_t paramlen) +{ + return rdma_reject(info->data, param, (uint8_t) paramlen) ? -errno : 0; +} + +static int ibv_msg_socket_shutdown(fid_t fid, uint64_t flags) +{ + struct ibv_msg_socket *sock; + sock = container_of(fid, struct ibv_msg_socket, socket_fid.fid); + return rdma_disconnect(sock->id) ? 
-errno : 0; +} + +struct fi_ops_cm ibv_msg_socket_cm_ops = { + .size = sizeof(struct fi_ops_cm), + .connect = ibv_msg_socket_connect, + .listen = ibv_msg_socket_listen, + .accept = ibv_msg_socket_accept, + .reject = ibv_msg_socket_reject, + .shutdown = ibv_msg_socket_shutdown, +}; + +static int ibv_msg_socket_close(fid_t fid) +{ + struct ibv_msg_socket *sock; + + sock = container_of(fid, struct ibv_msg_socket, socket_fid.fid); + if (sock->id) + rdma_destroy_ep(sock->id); + + free(sock); + return 0; +} + +struct fi_ops ibv_msg_socket_ops = { + .size = sizeof(struct fi_ops), + .close = ibv_msg_socket_close, + .bind = ibv_msg_socket_bind +}; + +static int ibv_socket(struct fi_info *info, fid_t *fid, void *context) +{ + struct ibv_msg_socket *sock; + int ret; + + ret = ibv_check_domain(info->domain_name); + if (ret) + return ret; + + if (!info->data || info->datalen != sizeof(sock->id)) + return -ENOSYS; + + sock = calloc(1, sizeof *sock); + if (!sock) + return -ENOMEM; + + sock->id = info->data; + sock->id->context = &sock->socket_fid.fid; + info->data = NULL; + info->datalen = 0; + + sock->socket_fid.fid.fclass = FID_CLASS_SOCKET; + sock->socket_fid.fid.size = sizeof(struct fid_socket); + sock->socket_fid.fid.context = context; + sock->socket_fid.fid.ops = &ibv_msg_socket_ops; + sock->socket_fid.ops = NULL; + sock->socket_fid.msg = &ibv_msg_socket_msg_ops; + sock->socket_fid.cm = &ibv_msg_socket_cm_ops; + sock->socket_fid.rdma = &ibv_msg_socket_rdma_ops; + + *fid = &sock->socket_fid.fid; + return 0; +} + +static int ibv_poll_fd(int fd) +{ + struct pollfd fds; + + fds.fd = fd; + fds.events = POLLIN; + return poll(&fds, 1, -1) < 0 ? -errno : 0; +} + +static ssize_t ibv_ec_cm_readerr(fid_t fid, void *buf, size_t len, uint64_t flags) +{ + struct ibv_ec_cm *ec; + struct fi_ec_err_entry *entry; + + ec = container_of(fid, struct ibv_ec_cm, ec.fid.fid); + if (!ec->err.err) + return 0; + + if (len < sizeof(*entry)) + return -EINVAL; + + entry = (struct fi_ec_err_entry *) buf; + *entry = ec->err; + ec->err.err = 0; + ec->err.prov_errno = 0; + return sizeof(*entry); +} + +static struct fi_info * ibv_ec_cm_getinfo(struct rdma_cm_event *event) +{ + struct fi_info *fi; + + fi = calloc(1, sizeof *fi); + if (!fi) + return NULL; + + fi->size = sizeof *fi; + fi->type = FID_MSG; + if (event->id->verbs->device->transport_type == IBV_TRANSPORT_IWARP) + fi->protocol = FI_PROTO_IWARP | FI_PROTO_RDMA; + else + fi->protocol = FI_PROTO_IB_RC | FI_PROTO_RDMA; +// fi->sa_family = rdma_get_local_addr(event->id)->sa_family; + + fi->src_addrlen = rdma_addrlen(rdma_get_local_addr(event->id)); + if (!(fi->src_addr = malloc(fi->src_addrlen))) + goto err; + memcpy(fi->src_addr, rdma_get_local_addr(event->id), fi->src_addrlen); + + fi->dst_addrlen = rdma_addrlen(rdma_get_peer_addr(event->id)); + if (!(fi->dst_addr = malloc(fi->dst_addrlen))) + goto err; + memcpy(fi->dst_addr, rdma_get_peer_addr(event->id), fi->dst_addrlen); + + if (!(fi->domain_name = malloc(FI_NAME_MAX))) + goto err; + strcpy(fi->domain_name, IBV_PREFIX "/"); + strcpy(&fi->domain_name[sizeof(IBV_PREFIX)], event->id->verbs->device->name); + + fi->datalen = sizeof event->id; + fi->data = event->id; + return fi; +err: + fi_freeinfo(fi); + return NULL; +} + +static ssize_t ibv_ec_cm_process_event(struct ibv_ec_cm *ec, + struct rdma_cm_event *event, struct fi_ec_cm_entry *entry, size_t len) +{ + fid_t fid; + size_t datalen; + + fid = event->id->context; + switch (event->event) { +// case RDMA_CM_EVENT_ADDR_RESOLVED: +// return 0; +// case 
RDMA_CM_EVENT_ROUTE_RESOLVED: +// return 0; + case RDMA_CM_EVENT_CONNECT_REQUEST: + rdma_migrate_id(event->id, NULL); + entry->event = FI_CONNREQ; + entry->info = ibv_ec_cm_getinfo(event); + if (!entry->info) { + rdma_destroy_id(event->id); + return 0; + } + break; + case RDMA_CM_EVENT_ESTABLISHED: + entry->event = FI_CONNECTED; + entry->info = NULL; + break; + case RDMA_CM_EVENT_DISCONNECTED: + entry->event = FI_SHUTDOWN; + entry->info = NULL; + break; + case RDMA_CM_EVENT_ADDR_ERROR: + case RDMA_CM_EVENT_ROUTE_ERROR: + case RDMA_CM_EVENT_CONNECT_ERROR: + case RDMA_CM_EVENT_UNREACHABLE: + ec->err.fid_context = fid->context; + ec->err.err = event->status; + return -EIO; + case RDMA_CM_EVENT_REJECTED: + ec->err.fid_context = fid->context; + ec->err.err = ECONNREFUSED; + ec->err.prov_errno = event->status; + return -EIO; + case RDMA_CM_EVENT_DEVICE_REMOVAL: + ec->err.fid_context = fid->context; + ec->err.err = ENODEV; + return -EIO; + case RDMA_CM_EVENT_ADDR_CHANGE: + ec->err.fid_context = fid->context; + ec->err.err = EADDRNOTAVAIL; + return -EIO; + default: + return 0; + } + + entry->fid_context = fid->context; + entry->flags = 0; + datalen = min(len - sizeof(*entry), event->param.conn.private_data_len); + if (datalen) + memcpy(entry->data, event->param.conn.private_data, datalen); + return sizeof(*entry) + datalen; +} + +static ssize_t ibv_ec_cm_read_data(fid_t fid, void *buf, size_t len) +{ + struct ibv_ec_cm *ec; + struct fi_ec_cm_entry *entry; + struct rdma_cm_event *event; + size_t left; + ssize_t ret = -EINVAL; + + ec = container_of(fid, struct ibv_ec_cm, ec.fid.fid); + entry = (struct fi_ec_cm_entry *) buf; + if (ec->err.err) + return -EIO; + + for (left = len; left >= sizeof(*entry); ) { + ret = rdma_get_cm_event(ec->channel, &event); + if (!ret) { + ret = ibv_ec_cm_process_event(ec, event, entry, left); + rdma_ack_cm_event(event); + if (ret < 0) + break; + else if (!ret) + continue; + + left -= ret; + entry = ((void *) entry) + ret; + } else if (errno == EAGAIN) { + if (left < len) + return len - left; + + if (ec->flags & FI_NONBLOCK) + return 0; + + ibv_poll_fd(ec->channel->fd); + } else { + ret = -errno; + break; + } + } + + return (left < len) ? 
len - left : ret; +} + +static const char * ibv_ec_cm_strerror(fid_t fid, int prov_errno, void *prov_data, + void *buf, size_t len) +{ + if (buf && len) + strncpy(buf, strerror(prov_errno), len); + return strerror(prov_errno); +} + +struct fi_ops_ec ibv_ec_cm_data_ops = { + .size = sizeof(struct fi_ops_ec), + .read = ibv_ec_cm_read_data, + .readfrom = NULL, + .readerr = ibv_ec_cm_readerr, + .write = NULL, + .reset = NULL, + .strerror = ibv_ec_cm_strerror +}; + +static int ibv_ec_cm_close(fid_t fid) +{ + struct ibv_ec_cm *ec; + + ec = container_of(fid, struct ibv_ec_cm, ec.fid.fid); + if (ec->channel) + rdma_destroy_event_channel(ec->channel); + + free(ec); + return 0; +} + +struct fi_ops ibv_ec_cm_ops = { + .size = sizeof(struct fi_ops), + .close = ibv_ec_cm_close, +}; + +static int ibv_ec_cm_open(fid_t fid, struct fi_ec_attr *attr, fid_t *ec, void *context) +{ + struct ibv_ec_cm *vec; + long flags = 0; + int ret; + + if (attr->type != FI_EC_QUEUE || attr->format != FI_EC_FORMAT_CM) + return -ENOSYS; + + vec = calloc(1, sizeof *vec); + if (!vec) + return -ENOMEM; + + vec->ec.domain = container_of(fid, struct ibv_domain, domain_fid.fid); + + switch (attr->wait_obj) { + case FI_EC_WAIT_FD: + vec->channel = rdma_create_event_channel(); + if (!vec->channel) { + ret = -errno; + goto err1; + } + fcntl(vec->channel->fd, F_GETFL, &flags); + ret = fcntl(vec->channel->fd, F_SETFL, flags | O_NONBLOCK); + if (ret) { + ret = -errno; + goto err2; + } + break; + case FI_EC_WAIT_NONE: + vec->flags = O_NONBLOCK; + break; + default: + return -ENOSYS; + } + + vec->flags = attr->flags; + vec->ec.fid.fid.fclass = FID_CLASS_EC; + vec->ec.fid.fid.size = sizeof(struct fid_ec); + vec->ec.fid.fid.context = context; + vec->ec.fid.fid.ops = &ibv_ec_cm_ops; + vec->ec.fid.ops = &ibv_ec_cm_data_ops; + + *ec = &vec->ec.fid.fid; + return 0; +err2: + if (vec->channel) + rdma_destroy_event_channel(vec->channel); +err1: + free(vec); + return ret; +} + +static int ibv_ec_comp_reset(fid_t fid, void *cond) +{ + struct ibv_ec_comp *ec; + struct ibv_cq *cq; + void *context; + int ret; + + ec = container_of(fid, struct ibv_ec_comp, ec.fid.fid); + ret = ibv_get_cq_event(ec->channel, &cq , &context); + if (!ret) + ibv_ack_cq_events(cq, 1); + + return -ibv_req_notify_cq(ec->cq, (ec->flags & FI_SIGNAL) ? 
1 : 0); +} + +static ssize_t ibv_ec_comp_readerr(fid_t fid, void *buf, size_t len, uint64_t flags) +{ + struct ibv_ec_comp *ec; + struct fi_ec_err_entry *entry; + + ec = container_of(fid, struct ibv_ec_comp, ec.fid.fid); + if (!ec->wc.status) + return 0; + + if (len < sizeof(*entry)) + return -EINVAL; + + entry = (struct fi_ec_err_entry *) buf; + entry->fid_context = NULL; /* TODO: return qp context from wc */ + entry->op_context = (void *) (uintptr_t) ec->wc.wr_id; + entry->flags = 0; + entry->err = EIO; + entry->prov_errno = ec->wc.status; + entry->data = ec->wc.vendor_err; + entry->prov_data = NULL; + + ec->wc.status = 0; + return sizeof(*entry); +} + +static ssize_t ibv_ec_comp_read(fid_t fid, void *buf, size_t len) +{ + struct ibv_ec_comp *ec; + struct fi_ec_entry *entry; + size_t left; + int reset = 1, ret = -EINVAL; + + ec = container_of(fid, struct ibv_ec_comp, ec.fid.fid); + entry = (struct fi_ec_entry *) buf; + if (ec->wc.status) + return -EIO; + + for (left = len; left >= sizeof(*entry); ) { + ret = ibv_poll_cq(ec->cq, 1, &ec->wc); + if (ret > 0) { + if (ec->wc.status) { + ret = -EIO; + break; + } + + entry->op_context = (void *) (uintptr_t) ec->wc.wr_id; + left -= sizeof(*entry); + entry = entry + 1; + } else if (ret == 0) { + if (left < len) + return len - left; + + if (reset && (ec->flags & FI_AUTO_RESET)) { + ibv_ec_comp_reset(fid, NULL); + reset = 0; + continue; + } + + if (ec->flags & FI_NONBLOCK) + return 0; + + ibv_poll_fd(ec->channel->fd); + } else { + break; + } + } + + return (left < len) ? len - left : ret; +} + +static ssize_t ibv_ec_comp_read_data(fid_t fid, void *buf, size_t len) +{ + struct ibv_ec_comp *ec; + struct fi_ec_data_entry *entry; + size_t left; + int reset = 1, ret = -EINVAL; + + ec = container_of(fid, struct ibv_ec_comp, ec.fid.fid); + entry = (struct fi_ec_data_entry *) buf; + if (ec->wc.status) + return -EIO; + + for (left = len; left >= sizeof(*entry); ) { + ret = ibv_poll_cq(ec->cq, 1, &ec->wc); + if (ret > 0) { + if (ec->wc.status) { + ret = -EIO; + break; + } + + entry->op_context = (void *) (uintptr_t) ec->wc.wr_id; + if (ec->wc.wc_flags & IBV_WC_WITH_IMM) { + entry->flags = FI_IMM; + entry->data = ec->wc.imm_data; + } + if (ec->wc.opcode & IBV_WC_RECV) + entry->len = ec->wc.byte_len; + left -= sizeof(*entry); + entry = entry + 1; + } else if (ret == 0) { + if (left < len) + return len - left; + + if (reset && (ec->flags & FI_AUTO_RESET)) { + ibv_ec_comp_reset(fid, NULL); + reset = 0; + continue; + } + + if (ec->flags & FI_NONBLOCK) + return 0; + + ibv_poll_fd(ec->channel->fd); + } else { + break; + } + } + + return (left < len) ? 
len - left : ret; +} + +static const char * ibv_ec_comp_strerror(fid_t fid, int prov_errno, void *prov_data, + void *buf, size_t len) +{ + if (buf && len) + strncpy(buf, ibv_wc_status_str(prov_errno), len); + return ibv_wc_status_str(prov_errno); +} + +struct fi_ops_ec ibv_ec_comp_context_ops = { + .size = sizeof(struct fi_ops_ec), + .read = ibv_ec_comp_read, + .readerr = ibv_ec_comp_readerr, + .reset = ibv_ec_comp_reset, + .strerror = ibv_ec_comp_strerror +}; + +struct fi_ops_ec ibv_ec_comp_data_ops = { + .size = sizeof(struct fi_ops_ec), + .read = ibv_ec_comp_read_data, + .readerr = ibv_ec_comp_readerr, + .reset = ibv_ec_comp_reset, + .strerror = ibv_ec_comp_strerror +}; + +static int ibv_ec_comp_close(fid_t fid) +{ + struct ibv_ec_comp *ec; + int ret; + + ec = container_of(fid, struct ibv_ec_comp, ec.fid.fid); + if (ec->cq) { + ret = ibv_destroy_cq(ec->cq); + if (ret) + return -ret; + ec->cq = NULL; + } + if (ec->channel) + ibv_destroy_comp_channel(ec->channel); + + free(ec); + return 0; +} + +struct fi_ops ibv_ec_comp_ops = { + .size = sizeof(struct fi_ops), + .close = ibv_ec_comp_close, +}; + +static int ibv_ec_comp_open(fid_t fid, struct fi_ec_attr *attr, fid_t *ec, void *context) +{ + struct ibv_ec_comp *vec; + long flags = 0; + int ret; + + if (attr->type != FI_EC_QUEUE || attr->wait_cond != FI_EC_COND_NONE) + return -ENOSYS; + + vec = calloc(1, sizeof *vec); + if (!vec) + return -ENOMEM; + + vec->ec.domain = container_of(fid, struct ibv_domain, domain_fid.fid); + + switch (attr->wait_obj) { + case FI_EC_WAIT_FD: + vec->channel = ibv_create_comp_channel(vec->ec.domain->verbs); + if (!vec->channel) { + ret = -errno; + goto err1; + } + fcntl(vec->channel->fd, F_GETFL, &flags); + ret = fcntl(vec->channel->fd, F_SETFL, flags | O_NONBLOCK); + if (ret) { + ret = -errno; + goto err1; + } + break; + case FI_EC_WAIT_NONE: + vec->flags = FI_NONBLOCK; + break; + default: + return -ENOSYS; + } + + vec->cq = ibv_create_cq(vec->ec.domain->verbs, attr->size, vec, + vec->channel, attr->signaling_vector); + if (!vec->cq) { + ret = -errno; + goto err2; + } + + vec->flags |= attr->flags; + vec->ec.fid.fid.fclass = FID_CLASS_EC; + vec->ec.fid.fid.size = sizeof(struct fid_ec); + vec->ec.fid.fid.context = context; + vec->ec.fid.fid.ops = &ibv_ec_comp_ops; + + switch (attr->format) { + case FI_EC_FORMAT_CONTEXT: + vec->ec.fid.ops = &ibv_ec_comp_context_ops; + break; + case FI_EC_FORMAT_DATA: + vec->ec.fid.ops = &ibv_ec_comp_data_ops; + break; + default: + ret = -ENOSYS; + goto err3; + } + + *ec = &vec->ec.fid.fid; + return 0; + +err3: + ibv_destroy_cq(vec->cq); +err2: + if (vec->channel) + ibv_destroy_comp_channel(vec->channel); +err1: + free(vec); + return ret; +} + +static int ibv_ec_open(fid_t fid, struct fi_ec_attr *attr, fid_t *ec, void *context) +{ + struct ibv_ec *vec; + int ret; + + switch (attr->domain) { + case FI_EC_DOMAIN_GENERAL: + return -ENOSYS; + case FI_EC_DOMAIN_COMP: + ret = ibv_ec_comp_open(fid, attr, ec, context); + break; + case FI_EC_DOMAIN_CM: + ret = ibv_ec_cm_open(fid, attr, ec, context); + break; + case FI_EC_DOMAIN_AV: + return -ENOSYS; + default: + return -ENOSYS; + } + if (ret) + return ret; + + vec = container_of(*ec, struct ibv_ec, fid); + vec->ec_domain = attr->domain; + + if (attr->flags & FI_AUTO_RESET && vec->fid.ops->reset) + fi_ec_reset(*ec, attr->cond); + + return 0; +} + +static int ibv_mr_close(fid_t fid) +{ + struct ibv_mem_desc *mr; + int ret; + + mr = container_of(fid, struct ibv_mem_desc, mr_fid.fid); + ret = -ibv_dereg_mr(mr->mr); + if (!ret) + free(mr); + 
return ret; +} + +struct fi_ops ibv_mr_ops = { + .size = sizeof(struct fi_ops), + .close = ibv_mr_close +}; + +static int ibv_mr_reg(fid_t fid, const void *buf, size_t len, fid_t *mr, + uint64_t flags, void *context) +{ + struct ibv_mem_desc *md; + int access; + + md = calloc(1, sizeof *md); + if (!md) + return -ENOMEM; + + md->domain = container_of(fid, struct ibv_domain, domain_fid.fid); + md->mr_fid.fid.fclass = FID_CLASS_MR; + md->mr_fid.fid.size = sizeof(struct fid_mr); + md->mr_fid.fid.context = context; + md->mr_fid.fid.ops = &ibv_mr_ops; + + access = IBV_ACCESS_LOCAL_WRITE; + if (flags & FI_READ) + access |= IBV_ACCESS_REMOTE_READ; + if (flags & FI_WRITE) + access |= IBV_ACCESS_REMOTE_WRITE; + md->mr = ibv_reg_mr(md->domain->pd, (void *) buf, len, access); + if (!md->mr) + goto err; + + md->mr_fid.mem_desc = md->mr->lkey; + md->mr_fid.key = md->mr->rkey; + *mr = &md->mr_fid.fid; + return 0; + +err: + free(md); + return -errno; +} + +static int ibv_close(fid_t fid) +{ + struct ibv_domain *domain; + int ret; + + domain = container_of(fid, struct ibv_domain, domain_fid.fid); + if (domain->pd) { + ret = ibv_dealloc_pd(domain->pd); + if (ret) + return -ret; + domain->pd = NULL; + } + + free(domain); + return 0; +} + +static int ibv_open_device_by_name(struct ibv_domain *domain, const char *name) +{ + struct ibv_context **dev_list; + int i, ret = -ENODEV; + + name = name + sizeof(IBV_PREFIX); + dev_list = rdma_get_devices(NULL); + if (!dev_list) + return -errno; + + for (i = 0; dev_list[i]; i++) { + if (!strcmp(name, ibv_get_device_name(dev_list[i]->device))) { + domain->verbs = dev_list[i]; + ret = 0; + break; + } + } + rdma_free_devices(dev_list); + return ret; +} + +struct fi_ops ibv_fid_ops = { + .size = sizeof(struct fi_ops), + .close = ibv_close, +}; + +struct fi_ops_domain ibv_domain_ops = { + .size = sizeof(struct fi_ops_domain), + .mr_reg = ibv_mr_reg, + .ec_open = ibv_ec_open +}; + +static int ibv_open(const char *name, struct fi_info *info, uint64_t flags, + fid_t *fid, void *context) +{ + struct ibv_domain *domain; + const char *domain_name; + int ret; + + domain_name = name ? name : info->domain_name; + ret = ibv_check_domain(domain_name); + if (ret) + return ret; + + domain = calloc(1, sizeof *domain); + if (!domain) + return -ENOMEM; + + if (strcmp(domain_name + sizeof(IBV_PREFIX), "local")) { + ret = ibv_open_device_by_name(domain, domain_name); + if (ret) + goto err; + + domain->pd = ibv_alloc_pd(domain->verbs); + if (!domain->pd) { + ret = -errno; + goto err; + } + } + + domain->domain_fid.fid.fclass = FID_CLASS_RESOURCE_DOMAIN; + domain->domain_fid.fid.size = sizeof(struct fid_domain); + domain->domain_fid.fid.context = context; + domain->domain_fid.fid.ops = &ibv_fid_ops; + domain->domain_fid.ops = &ibv_domain_ops; + + *fid = &domain->domain_fid.fid; + return 0; +err: + free(domain); + return ret; +} + +struct fi_ops_prov ibv_ops = { + .size = sizeof(struct fi_ops_prov), + .getinfo = ibv_getinfo, + .freeinfo = ibv_freeinfo, + .socket = ibv_socket, + .open = ibv_open +}; + + +void ibv_ini(void) +{ + fi_register(&ibv_ops); +} + +void ibv_fini(void) +{ +} diff --git a/prov/ibverbs/src/ibverbs.h b/prov/ibverbs/src/ibverbs.h new file mode 100644 index 00000000000..e3db32d04fe --- /dev/null +++ b/prov/ibverbs/src/ibverbs.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef IB_VERBS_H +#define IB_VERBS_H + +#include <pthread.h> + +#include <infiniband/driver.h> +#include <rdma/fi_uverbs.h> +#include "fi.h" + +#define HIDDEN __attribute__((visibility ("hidden"))) + +#define DEFAULT_ABI "IBVERBS_1.1" + +#ifdef HAVE_SYMVER_SUPPORT +# define symver(name, api, ver) \ + asm(".symver " #name "," #api "@" #ver) +# define default_symver(name, api) \ + asm(".symver " #name "," #api "@@" DEFAULT_ABI) +#else +# define symver(name, api, ver) +# define default_symver(name, api) \ + extern __typeof(name) api __attribute__((alias(#name))) +#endif /* HAVE_SYMVER_SUPPORT */ + +extern HIDDEN int abi_ver; + +HIDDEN int ibverbs_init(struct ibv_device ***list); + +#endif /* IB_VERBS_H */ diff --git a/prov/ibverbs/src/init.c b/prov/ibverbs/src/init.c new file mode 100644 index 00000000000..4eefb9a796a --- /dev/null +++ b/prov/ibverbs/src/init.c @@ -0,0 +1,473 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2006 Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#if HAVE_CONFIG_H +# include <config.h> +#endif /* HAVE_CONFIG_H */ + +#include <stdlib.h> +#include <string.h> +#include <glob.h> +#include <stdio.h> +#include <dlfcn.h> +#include <unistd.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <sys/time.h> +#include <sys/resource.h> +#include <dirent.h> +#include <errno.h> + +#include "ibverbs.h" +#include "fi.h" + +#define IBV_CONFIG_DIR SYSCONFDIR "/libibverbs.d" + +HIDDEN int abi_ver; + +struct ibv_sysfs_dev { + char sysfs_name[IBV_SYSFS_NAME_MAX]; + char ibdev_name[IBV_SYSFS_NAME_MAX]; + char sysfs_path[IBV_SYSFS_PATH_MAX]; + char ibdev_path[IBV_SYSFS_PATH_MAX]; + struct ibv_sysfs_dev *next; + int abi_ver; + int have_driver; +}; + +struct ibv_driver_name { + char *name; + struct ibv_driver_name *next; +}; + +struct ibv_driver { + const char *name; + ibv_driver_init_func init_func; + struct ibv_driver *next; +}; + +static struct ibv_sysfs_dev *sysfs_dev_list; +static struct ibv_driver_name *driver_name_list; +static struct ibv_driver *head_driver, *tail_driver; + +static int find_sysfs_devs(void) +{ + struct uv_dev *udev; + struct ibv_sysfs_dev *sysfs_dev; + char value[8]; + int ret= 0; + + for (udev = udev_head; udev; udev = udev->next) { + sysfs_dev = calloc(1, sizeof *sysfs_dev); + if (!sysfs_dev) { + ret = ENOMEM; + break; + } + + strcpy(sysfs_dev->sysfs_name, udev->sysfs_name); + strcpy(sysfs_dev->sysfs_path, udev->sysfs_path); + strcpy(sysfs_dev->ibdev_name, udev->dev_name); + strcpy(sysfs_dev->ibdev_path, udev->dev_path); + if (fi_read_file(sysfs_dev->sysfs_path, "abi_version", + value, sizeof value) > 0) + sysfs_dev->abi_ver = strtol(value, NULL, 10); + + sysfs_dev->next = sysfs_dev_list; + sysfs_dev_list = sysfs_dev; + } + + return ret; +} + +void ibv_register_driver(const char *name, ibv_driver_init_func init_func) +{ + struct ibv_driver *driver; + + driver = malloc(sizeof *driver); + if (!driver) { + fprintf(stderr, "ibverbs: warning: couldn't allocate driver for %s\n", name); + return; + } + + driver->name = name; + driver->init_func = init_func; + driver->next = NULL; + + if (tail_driver) + tail_driver->next = driver; + else + head_driver = driver; + tail_driver = driver; +} + +static void load_driver(const char *name) +{ + char *so_name; + void *dlhandle; + +#define __IBV_QUOTE(x) #x +#define IBV_QUOTE(x) __IBV_QUOTE(x) + + if (asprintf(&so_name, + name[0] == '/' ? + "%s-" IBV_QUOTE(IBV_DEVICE_LIBRARY_EXTENSION) ".so" : + "lib%s-" IBV_QUOTE(IBV_DEVICE_LIBRARY_EXTENSION) ".so", + name) < 0) { + fprintf(stderr, "ibverbs: warning: couldn't load driver '%s'.\n", + name); + return; + } + + dlhandle = dlopen(so_name, RTLD_NOW); + if (!dlhandle) { + fprintf(stderr, "ibverbs: warning: couldn't load driver '%s': %s\n", + name, dlerror()); + goto out; + } + +out: + free(so_name); +} + +static void load_drivers(void) +{ + struct ibv_driver_name *name, *next_name; + const char *env; + char *list, *env_name; + + /* + * Only use drivers passed in through the calling user's + * environment if we're not running setuid. 
+ */ + if (getuid() == geteuid()) { + if ((env = getenv("RDMAV_DRIVERS"))) { + list = strdupa(env); + while ((env_name = strsep(&list, ":;"))) + load_driver(env_name); + } else if ((env = getenv("IBV_DRIVERS"))) { + list = strdupa(env); + while ((env_name = strsep(&list, ":;"))) + load_driver(env_name); + } + } + + for (name = driver_name_list, next_name = name ? name->next : NULL; + name; + name = next_name, next_name = name ? name->next : NULL) { + load_driver(name->name); + free(name->name); + free(name); + } +} + +static void read_config_file(const char *path) +{ + FILE *conf; + char *line = NULL; + char *config; + char *field; + size_t buflen = 0; + ssize_t len; + + conf = fopen(path, "r"); + if (!conf) { + fprintf(stderr, "ibverbs: warning: couldn't read config file %s.\n", + path); + return; + } + + while ((len = getline(&line, &buflen, conf)) != -1) { + config = line + strspn(line, "\t "); + if (config[0] == '\n' || config[0] == '#') + continue; + + field = strsep(&config, "\n\t "); + + if (strcmp(field, "driver") == 0 && config != NULL) { + struct ibv_driver_name *driver_name; + + config += strspn(config, "\t "); + field = strsep(&config, "\n\t "); + + driver_name = malloc(sizeof *driver_name); + if (!driver_name) { + fprintf(stderr, "ibverbs: warning: couldn't allocate " + "driver name '%s'.\n", field); + continue; + } + + driver_name->name = strdup(field); + if (!driver_name->name) { + fprintf(stderr, "ibverbs: warning: couldn't allocate " + "driver name '%s'.\n", field); + free(driver_name); + continue; + } + + driver_name->next = driver_name_list; + driver_name_list = driver_name; + } else + fprintf(stderr, "ibverbs: warning: ignoring bad config directive " + "'%s' in file '%s'.\n", field, path); + } + + if (line) + free(line); + fclose(conf); +} + +static void read_config(void) +{ + DIR *conf_dir; + struct dirent *dent; + char *path; + + conf_dir = opendir(IBV_CONFIG_DIR); + if (!conf_dir) { + fprintf(stderr, "ibverbs: warning: couldn't open config directory '%s'.\n", + IBV_CONFIG_DIR); + return; + } + + while ((dent = readdir(conf_dir))) { + struct stat buf; + + if (asprintf(&path, "%s/%s", IBV_CONFIG_DIR, dent->d_name) < 0) { + fprintf(stderr, "ibverbs: warning: couldn't read config file %s/%s.\n", + IBV_CONFIG_DIR, dent->d_name); + goto out; + } + + if (stat(path, &buf)) { + fprintf(stderr, "ibverbs: warning: couldn't stat config file '%s'.\n", + path); + goto next; + } + + if (!S_ISREG(buf.st_mode)) + goto next; + + read_config_file(path); +next: + free(path); + } + +out: + closedir(conf_dir); +} + +static struct ibv_device *try_driver(struct ibv_driver *driver, + struct ibv_sysfs_dev *sysfs_dev) +{ + struct ibv_device *dev; + char value[8]; + + dev = driver->init_func(sysfs_dev->sysfs_path, sysfs_dev->abi_ver); + if (!dev) + return NULL; + + if (fi_read_file(sysfs_dev->ibdev_path, "node_type", value, sizeof value) < 0) { + fprintf(stderr, "ibverbs: warning: no node_type attr under %s.\n", + sysfs_dev->ibdev_path); + dev->node_type = IBV_NODE_UNKNOWN; + } else { + dev->node_type = strtol(value, NULL, 10); + if (dev->node_type < IBV_NODE_CA || dev->node_type > IBV_NODE_RNIC) + dev->node_type = IBV_NODE_UNKNOWN; + } + + switch (dev->node_type) { + case IBV_NODE_CA: + case IBV_NODE_SWITCH: + case IBV_NODE_ROUTER: + dev->transport_type = IBV_TRANSPORT_IB; + break; + case IBV_NODE_RNIC: + dev->transport_type = IBV_TRANSPORT_IWARP; + break; + default: + dev->transport_type = IBV_TRANSPORT_UNKNOWN; + break; + } + + strcpy(dev->dev_name, sysfs_dev->sysfs_name); + 
strcpy(dev->dev_path, sysfs_dev->sysfs_path); + strcpy(dev->name, sysfs_dev->ibdev_name); + strcpy(dev->ibdev_path, sysfs_dev->ibdev_path); + + return dev; +} + +static struct ibv_device *try_drivers(struct ibv_sysfs_dev *sysfs_dev) +{ + struct ibv_driver *driver; + struct ibv_device *dev; + + for (driver = head_driver; driver; driver = driver->next) { + dev = try_driver(driver, sysfs_dev); + if (dev) + return dev; + } + + return NULL; +} + +static void check_memlock_limit(void) +{ + struct rlimit rlim; + + if (!geteuid()) + return; + + if (getrlimit(RLIMIT_MEMLOCK, &rlim)) { + fprintf(stderr, "ibverbs: warning: getrlimit(RLIMIT_MEMLOCK) failed."); + return; + } + + if (rlim.rlim_cur <= 32768) + fprintf(stderr, "ibverbs: warning: RLIMIT_MEMLOCK is %lu bytes.\n" + " This will severely limit memory registrations.\n", + rlim.rlim_cur); +} + +static void add_device(struct ibv_device *dev, + struct ibv_device ***dev_list, + int *num_devices, + int *list_size) +{ + struct ibv_device **new_list; + + if (*list_size <= *num_devices) { + *list_size = *list_size ? *list_size * 2 : 1; + new_list = realloc(*dev_list, *list_size * sizeof (struct ibv_device *)); + if (!new_list) + return; + *dev_list = new_list; + } + + (*dev_list)[(*num_devices)++] = dev; +} + +HIDDEN int ibverbs_init(struct ibv_device ***list) +{ + const char *sysfs_path; + struct ibv_sysfs_dev *sysfs_dev, *next_dev; + struct ibv_device *device; + int num_devices = 0; + int list_size = 0; + int statically_linked = 0; + int no_driver = 0; + int ret; + + fi_init(); /* temporary until we have a real provider */ + *list = NULL; + + if (getenv("RDMAV_FORK_SAFE") || getenv("IBV_FORK_SAFE")) + if (ibv_fork_init()) + fprintf(stderr, "ibverbs: warning: fork()-safety requested " + "but init failed\n"); + + sysfs_path = fi_sysfs_path(); + if (!sysfs_path) + return -ENOSYS; + + check_memlock_limit(); + + read_config(); + + ret = find_sysfs_devs(); + if (ret) + return -ret; + + for (sysfs_dev = sysfs_dev_list; sysfs_dev; sysfs_dev = sysfs_dev->next) { + device = try_drivers(sysfs_dev); + if (device) { + add_device(device, list, &num_devices, &list_size); + sysfs_dev->have_driver = 1; + } else + no_driver = 1; + } + + if (!no_driver) + goto out; + + /* + * Check if we can dlopen() ourselves. If this fails, + * libibverbs is probably statically linked into the + * executable, and we should just give up, since trying to + * dlopen() a driver module will fail spectacularly (loading a + * driver .so will bring in dynamic copies of libibverbs and + * libdl to go along with the static copies the executable + * has, which quickly leads to a crash. + */ + { + void *hand = dlopen(NULL, RTLD_NOW); + if (!hand) { + fprintf(stderr, "ibverbs: warning: dlopen(NULL) failed, " + "assuming static linking.\n"); + statically_linked = 1; + goto out; + } + dlclose(hand); + } + + load_drivers(); + + for (sysfs_dev = sysfs_dev_list; sysfs_dev; sysfs_dev = sysfs_dev->next) { + if (sysfs_dev->have_driver) + continue; + + device = try_drivers(sysfs_dev); + if (device) { + add_device(device, list, &num_devices, &list_size); + sysfs_dev->have_driver = 1; + } + } + +out: + for (sysfs_dev = sysfs_dev_list, + next_dev = sysfs_dev ? sysfs_dev->next : NULL; + sysfs_dev; + sysfs_dev = next_dev, next_dev = sysfs_dev ? 
sysfs_dev->next : NULL) { + if (!sysfs_dev->have_driver) { + fprintf(stderr, "ibverbs: warning: no userspace device-specific " + "driver found for %s\n", sysfs_dev->sysfs_path); + if (statically_linked) + fprintf(stderr, " When linking libibverbs statically, " + "driver must be statically linked too.\n"); + } + free(sysfs_dev); + } + + return num_devices; +} diff --git a/prov/ibverbs/src/marshall.c b/prov/ibverbs/src/marshall.c new file mode 100644 index 00000000000..dc576059404 --- /dev/null +++ b/prov/ibverbs/src/marshall.c @@ -0,0 +1,144 @@ +/* + * Copyright (c) 2005 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#if HAVE_CONFIG_H +# include <config.h> +#endif /* HAVE_CONFIG_H */ + +#include <string.h> + +#include <infiniband/marshall.h> +#include <rdma/fi_ucma.h> + + +void ibv_copy_ah_attr_from_kern(struct ibv_ah_attr *dst, + struct ibv_kern_ah_attr *src) +{ + memcpy(&dst->grh.dgid, src->grh.dgid, sizeof dst->grh.dgid); + dst->grh.flow_label = src->grh.flow_label; + dst->grh.sgid_index = src->grh.sgid_index; + dst->grh.hop_limit = src->grh.hop_limit; + dst->grh.traffic_class = src->grh.traffic_class; + + dst->dlid = src->dlid; + dst->sl = src->sl; + dst->src_path_bits = src->src_path_bits; + dst->static_rate = src->static_rate; + dst->is_global = src->is_global; + dst->port_num = src->port_num; +} + +void ibv_copy_qp_attr_from_kern(struct ibv_qp_attr *dst, + struct ibv_kern_qp_attr *src) +{ + dst->cur_qp_state = src->cur_qp_state; + dst->path_mtu = src->path_mtu; + dst->path_mig_state = src->path_mig_state; + dst->qkey = src->qkey; + dst->rq_psn = src->rq_psn; + dst->sq_psn = src->sq_psn; + dst->dest_qp_num = src->dest_qp_num; + dst->qp_access_flags = src->qp_access_flags; + + dst->cap.max_send_wr = src->max_send_wr; + dst->cap.max_recv_wr = src->max_recv_wr; + dst->cap.max_send_sge = src->max_send_sge; + dst->cap.max_recv_sge = src->max_recv_sge; + dst->cap.max_inline_data = src->max_inline_data; + + ibv_copy_ah_attr_from_kern(&dst->ah_attr, &src->ah_attr); + ibv_copy_ah_attr_from_kern(&dst->alt_ah_attr, &src->alt_ah_attr); + + dst->pkey_index = src->pkey_index; + dst->alt_pkey_index = src->alt_pkey_index; + dst->en_sqd_async_notify = src->en_sqd_async_notify; + dst->sq_draining = src->sq_draining; + dst->max_rd_atomic = src->max_rd_atomic; + dst->max_dest_rd_atomic = src->max_dest_rd_atomic; + dst->min_rnr_timer = src->min_rnr_timer; + dst->port_num = src->port_num; + dst->timeout = src->timeout; + dst->retry_cnt = src->retry_cnt; + dst->rnr_retry = src->rnr_retry; + dst->alt_port_num = src->alt_port_num; + dst->alt_timeout = src->alt_timeout; +} + +void ibv_copy_path_rec_from_kern(struct ibv_sa_path_rec *dst, + struct ibv_kern_path_rec *src) +{ + memcpy(&dst->dgid, src->dgid, sizeof dst->dgid); + memcpy(&dst->sgid, src->sgid, sizeof dst->sgid); + + dst->dlid = src->dlid; + dst->slid = src->slid; + dst->raw_traffic = src->raw_traffic; + dst->flow_label = src->flow_label; + dst->hop_limit = src->hop_limit; + dst->traffic_class = src->traffic_class; + dst->reversible = src->reversible; + dst->numb_path = src->numb_path; + dst->pkey = src->pkey; + dst->sl = src->sl; + dst->mtu_selector = src->mtu_selector; + dst->mtu = src->mtu; + dst->rate_selector = src->rate_selector; + dst->rate = src->rate; + dst->packet_life_time = src->packet_life_time; + dst->preference = src->preference; + dst->packet_life_time_selector = src->packet_life_time_selector; +} + +void ibv_copy_path_rec_to_kern(struct ibv_kern_path_rec *dst, + struct ibv_sa_path_rec *src) +{ + memcpy(dst->dgid, &src->dgid, sizeof src->dgid); + memcpy(dst->sgid, &src->sgid, sizeof src->sgid); + + dst->dlid = src->dlid; + dst->slid = src->slid; + dst->raw_traffic = src->raw_traffic; + dst->flow_label = src->flow_label; + dst->hop_limit = src->hop_limit; + dst->traffic_class = src->traffic_class; + dst->reversible = src->reversible; + dst->numb_path = src->numb_path; + dst->pkey = src->pkey; + dst->sl = src->sl; + dst->mtu_selector = src->mtu_selector; + dst->mtu = src->mtu; + dst->rate_selector = src->rate_selector; + dst->rate = src->rate; + dst->packet_life_time = src->packet_life_time; + dst->preference = src->preference; + 
dst->packet_life_time_selector = src->packet_life_time_selector; +} diff --git a/prov/ibverbs/src/memory.c b/prov/ibverbs/src/memory.c new file mode 100644 index 00000000000..7d97e5541b2 --- /dev/null +++ b/prov/ibverbs/src/memory.c @@ -0,0 +1,719 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2006 Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#if HAVE_CONFIG_H +# include <config.h> +#endif /* HAVE_CONFIG_H */ + +#include <errno.h> +#include <sys/mman.h> +#include <unistd.h> +#include <stdlib.h> +#include <stdint.h> +#include <stdio.h> +#include <string.h> +#include <dirent.h> +#include <limits.h> +#include <inttypes.h> + +#include "ibverbs.h" + +/* + * Most distro's headers don't have these yet. 
+ */ +#ifndef MADV_DONTFORK +#define MADV_DONTFORK 10 +#endif + +#ifndef MADV_DOFORK +#define MADV_DOFORK 11 +#endif + +struct ibv_mem_node { + enum { + IBV_RED, + IBV_BLACK + } color; + struct ibv_mem_node *parent; + struct ibv_mem_node *left, *right; + uintptr_t start, end; + int refcnt; +}; + +static struct ibv_mem_node *mm_root; +static pthread_mutex_t mm_mutex = PTHREAD_MUTEX_INITIALIZER; +static int page_size; +static int huge_page_enabled; +static int too_late; + +static unsigned long smaps_page_size(FILE *file) +{ + int n; + unsigned long size = page_size; + char buf[1024]; + + while (fgets(buf, sizeof(buf), file) != NULL) { + if (!strstr(buf, "KernelPageSize:")) + continue; + + n = sscanf(buf, "%*s %lu", &size); + if (n < 1) + continue; + + /* page size is printed in Kb */ + size = size * 1024; + + break; + } + + return size; +} + +static unsigned long get_page_size(void *base) +{ + unsigned long ret = page_size; + pid_t pid; + FILE *file; + char buf[1024]; + + pid = getpid(); + snprintf(buf, sizeof(buf), "/proc/%d/smaps", pid); + + file = fopen(buf, "r"); + if (!file) + goto out; + + while (fgets(buf, sizeof(buf), file) != NULL) { + int n; + uintptr_t range_start, range_end; + + n = sscanf(buf, "%" SCNxPTR "-%" SCNxPTR, &range_start, &range_end); + + if (n < 2) + continue; + + if ((uintptr_t) base >= range_start && (uintptr_t) base < range_end) { + ret = smaps_page_size(file); + break; + } + } + + fclose(file); + +out: + return ret; +} + +int ibv_fork_init(void) +{ + void *tmp, *tmp_aligned; + int ret; + unsigned long size; + + if (mm_root) + return 0; + + if (too_late) + return EINVAL; + + page_size = sysconf(_SC_PAGESIZE); + if (page_size < 0) + return errno; + + if (posix_memalign(&tmp, page_size, page_size)) + return ENOMEM; + + if (getenv("RDMAV_HUGEPAGES_SAFE")) + huge_page_enabled = 1; + else + huge_page_enabled = 0; + + if (huge_page_enabled) { + size = get_page_size(tmp); + tmp_aligned = (void *) ((uintptr_t) tmp & ~(size - 1)); + } else { + size = page_size; + tmp_aligned = tmp; + } + + ret = madvise(tmp_aligned, size, MADV_DONTFORK) || + madvise(tmp_aligned, size, MADV_DOFORK); + + free(tmp); + + if (ret) + return ENOSYS; + + mm_root = malloc(sizeof *mm_root); + if (!mm_root) + return ENOMEM; + + mm_root->parent = NULL; + mm_root->left = NULL; + mm_root->right = NULL; + mm_root->color = IBV_BLACK; + mm_root->start = 0; + mm_root->end = UINTPTR_MAX; + mm_root->refcnt = 0; + + return 0; +} + +static struct ibv_mem_node *__mm_prev(struct ibv_mem_node *node) +{ + if (node->left) { + node = node->left; + while (node->right) + node = node->right; + } else { + while (node->parent && node == node->parent->left) + node = node->parent; + + node = node->parent; + } + + return node; +} + +static struct ibv_mem_node *__mm_next(struct ibv_mem_node *node) +{ + if (node->right) { + node = node->right; + while (node->left) + node = node->left; + } else { + while (node->parent && node == node->parent->right) + node = node->parent; + + node = node->parent; + } + + return node; +} + +static void __mm_rotate_right(struct ibv_mem_node *node) +{ + struct ibv_mem_node *tmp; + + tmp = node->left; + + node->left = tmp->right; + if (node->left) + node->left->parent = node; + + if (node->parent) { + if (node->parent->right == node) + node->parent->right = tmp; + else + node->parent->left = tmp; + } else + mm_root = tmp; + + tmp->parent = node->parent; + + tmp->right = node; + node->parent = tmp; +} + +static void __mm_rotate_left(struct ibv_mem_node *node) +{ + struct ibv_mem_node *tmp; + + tmp = 
node->right; + + node->right = tmp->left; + if (node->right) + node->right->parent = node; + + if (node->parent) { + if (node->parent->right == node) + node->parent->right = tmp; + else + node->parent->left = tmp; + } else + mm_root = tmp; + + tmp->parent = node->parent; + + tmp->left = node; + node->parent = tmp; +} + +#if 0 +static int verify(struct ibv_mem_node *node) +{ + int hl, hr; + + if (!node) + return 1; + + hl = verify(node->left); + hr = verify(node->left); + + if (!hl || !hr) + return 0; + if (hl != hr) + return 0; + + if (node->color == IBV_RED) { + if (node->left && node->left->color != IBV_BLACK) + return 0; + if (node->right && node->right->color != IBV_BLACK) + return 0; + return hl; + } + + return hl + 1; +} +#endif + +static void __mm_add_rebalance(struct ibv_mem_node *node) +{ + struct ibv_mem_node *parent, *gp, *uncle; + + while (node->parent && node->parent->color == IBV_RED) { + parent = node->parent; + gp = node->parent->parent; + + if (parent == gp->left) { + uncle = gp->right; + + if (uncle && uncle->color == IBV_RED) { + parent->color = IBV_BLACK; + uncle->color = IBV_BLACK; + gp->color = IBV_RED; + + node = gp; + } else { + if (node == parent->right) { + __mm_rotate_left(parent); + node = parent; + parent = node->parent; + } + + parent->color = IBV_BLACK; + gp->color = IBV_RED; + + __mm_rotate_right(gp); + } + } else { + uncle = gp->left; + + if (uncle && uncle->color == IBV_RED) { + parent->color = IBV_BLACK; + uncle->color = IBV_BLACK; + gp->color = IBV_RED; + + node = gp; + } else { + if (node == parent->left) { + __mm_rotate_right(parent); + node = parent; + parent = node->parent; + } + + parent->color = IBV_BLACK; + gp->color = IBV_RED; + + __mm_rotate_left(gp); + } + } + } + + mm_root->color = IBV_BLACK; +} + +static void __mm_add(struct ibv_mem_node *new) +{ + struct ibv_mem_node *node, *parent = NULL; + + node = mm_root; + while (node) { + parent = node; + if (node->start < new->start) + node = node->right; + else + node = node->left; + } + + if (parent->start < new->start) + parent->right = new; + else + parent->left = new; + + new->parent = parent; + new->left = NULL; + new->right = NULL; + + new->color = IBV_RED; + __mm_add_rebalance(new); +} + +static void __mm_remove(struct ibv_mem_node *node) +{ + struct ibv_mem_node *child, *parent, *sib, *tmp; + int nodecol; + + if (node->left && node->right) { + tmp = node->left; + while (tmp->right) + tmp = tmp->right; + + nodecol = tmp->color; + child = tmp->left; + tmp->color = node->color; + + if (tmp->parent != node) { + parent = tmp->parent; + parent->right = tmp->left; + if (tmp->left) + tmp->left->parent = parent; + + tmp->left = node->left; + node->left->parent = tmp; + } else + parent = tmp; + + tmp->right = node->right; + node->right->parent = tmp; + + tmp->parent = node->parent; + if (node->parent) { + if (node->parent->left == node) + node->parent->left = tmp; + else + node->parent->right = tmp; + } else + mm_root = tmp; + } else { + nodecol = node->color; + + child = node->left ? 
node->left : node->right; + parent = node->parent; + + if (child) + child->parent = parent; + if (parent) { + if (parent->left == node) + parent->left = child; + else + parent->right = child; + } else + mm_root = child; + } + + free(node); + + if (nodecol == IBV_RED) + return; + + while ((!child || child->color == IBV_BLACK) && child != mm_root) { + if (parent->left == child) { + sib = parent->right; + + if (sib->color == IBV_RED) { + parent->color = IBV_RED; + sib->color = IBV_BLACK; + __mm_rotate_left(parent); + sib = parent->right; + } + + if ((!sib->left || sib->left->color == IBV_BLACK) && + (!sib->right || sib->right->color == IBV_BLACK)) { + sib->color = IBV_RED; + child = parent; + parent = child->parent; + } else { + if (!sib->right || sib->right->color == IBV_BLACK) { + if (sib->left) + sib->left->color = IBV_BLACK; + sib->color = IBV_RED; + __mm_rotate_right(sib); + sib = parent->right; + } + + sib->color = parent->color; + parent->color = IBV_BLACK; + if (sib->right) + sib->right->color = IBV_BLACK; + __mm_rotate_left(parent); + child = mm_root; + break; + } + } else { + sib = parent->left; + + if (sib->color == IBV_RED) { + parent->color = IBV_RED; + sib->color = IBV_BLACK; + __mm_rotate_right(parent); + sib = parent->left; + } + + if ((!sib->left || sib->left->color == IBV_BLACK) && + (!sib->right || sib->right->color == IBV_BLACK)) { + sib->color = IBV_RED; + child = parent; + parent = child->parent; + } else { + if (!sib->left || sib->left->color == IBV_BLACK) { + if (sib->right) + sib->right->color = IBV_BLACK; + sib->color = IBV_RED; + __mm_rotate_left(sib); + sib = parent->left; + } + + sib->color = parent->color; + parent->color = IBV_BLACK; + if (sib->left) + sib->left->color = IBV_BLACK; + __mm_rotate_right(parent); + child = mm_root; + break; + } + } + } + + if (child) + child->color = IBV_BLACK; +} + +static struct ibv_mem_node *__mm_find_start(uintptr_t start, uintptr_t end) +{ + struct ibv_mem_node *node = mm_root; + + while (node) { + if (node->start <= start && node->end >= start) + break; + + if (node->start < start) + node = node->right; + else + node = node->left; + } + + return node; +} + +static struct ibv_mem_node *merge_ranges(struct ibv_mem_node *node, + struct ibv_mem_node *prev) +{ + prev->end = node->end; + prev->refcnt = node->refcnt; + __mm_remove(node); + + return prev; +} + +static struct ibv_mem_node *split_range(struct ibv_mem_node *node, + uintptr_t cut_line) +{ + struct ibv_mem_node *new_node = NULL; + + new_node = malloc(sizeof *new_node); + if (!new_node) + return NULL; + new_node->start = cut_line; + new_node->end = node->end; + new_node->refcnt = node->refcnt; + node->end = cut_line - 1; + __mm_add(new_node); + + return new_node; +} + +static struct ibv_mem_node *get_start_node(uintptr_t start, uintptr_t end, + int inc) +{ + struct ibv_mem_node *node, *tmp = NULL; + + node = __mm_find_start(start, end); + if (node->start < start) + node = split_range(node, start); + else { + tmp = __mm_prev(node); + if (tmp && tmp->refcnt == node->refcnt + inc) + node = merge_ranges(node, tmp); + } + return node; +} + +/* + * This function is called if madvise() fails to undo merging/splitting + * operations performed on the node. + */ +static struct ibv_mem_node *undo_node(struct ibv_mem_node *node, + uintptr_t start, int inc) +{ + struct ibv_mem_node *tmp = NULL; + + /* + * This condition can be true only if we merged this + * node with the previous one, so we need to split them. 
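+ * Splitting at 'start' restores the original boundary: the lower + * piece (the old previous node) gets 'inc' added back to recover its + * pre-merge reference count, and the undo then continues on the upper + * piece.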
+ */ + if (start > node->start) { + tmp = split_range(node, start); + if (tmp) { + node->refcnt += inc; + node = tmp; + } else + return NULL; + } + + tmp = __mm_prev(node); + if (tmp && tmp->refcnt == node->refcnt) + node = merge_ranges(node, tmp); + + tmp = __mm_next(node); + if (tmp && tmp->refcnt == node->refcnt) + node = merge_ranges(tmp, node); + + return node; +} + +static int ibv_madvise_range(void *base, size_t size, int advice) +{ + uintptr_t start, end; + struct ibv_mem_node *node, *tmp; + int inc; + int rolling_back = 0; + int ret = 0; + unsigned long range_page_size; + + if (!size) + return 0; + + if (huge_page_enabled) + range_page_size = get_page_size(base); + else + range_page_size = page_size; + + start = (uintptr_t) base & ~(range_page_size - 1); + end = ((uintptr_t) (base + size + range_page_size - 1) & + ~(range_page_size - 1)) - 1; + + pthread_mutex_lock(&mm_mutex); +again: + inc = advice == MADV_DONTFORK ? 1 : -1; + + node = get_start_node(start, end, inc); + if (!node) { + ret = -1; + goto out; + } + + while (node && node->start <= end) { + if (node->end > end) { + if (!split_range(node, end + 1)) { + ret = -1; + goto out; + } + } + + if ((inc == -1 && node->refcnt == 1) || + (inc == 1 && node->refcnt == 0)) { + /* + * If this is the first time through the loop, + * and we merged this node with the previous + * one, then we only want to do the madvise() + * on start ... node->end (rather than + * starting at node->start). + * + * Otherwise we end up doing madvise() on + * bigger region than we're being asked to, + * and that may lead to a spurious failure. + */ + if (start > node->start) + ret = madvise((void *) start, node->end - start + 1, + advice); + else + ret = madvise((void *) node->start, + node->end - node->start + 1, + advice); + if (ret) { + node = undo_node(node, start, inc); + + if (rolling_back || !node) + goto out; + + /* madvise failed, roll back previous changes */ + rolling_back = 1; + advice = advice == MADV_DONTFORK ? + MADV_DOFORK : MADV_DONTFORK; + tmp = __mm_prev(node); + if (!tmp || start > tmp->end) + goto out; + end = tmp->end; + goto again; + } + } + + node->refcnt += inc; + node = __mm_next(node); + } + + if (node) { + tmp = __mm_prev(node); + if (tmp && node->refcnt == tmp->refcnt) + node = merge_ranges(node, tmp); + } + +out: + if (rolling_back) + ret = -1; + + pthread_mutex_unlock(&mm_mutex); + + return ret; +} + +int ibv_dontfork_range(void *base, size_t size) +{ + if (mm_root) + return ibv_madvise_range(base, size, MADV_DONTFORK); + else { + too_late = 1; + return 0; + } +} + +int ibv_dofork_range(void *base, size_t size) +{ + if (mm_root) + return ibv_madvise_range(base, size, MADV_DOFORK); + else { + too_late = 1; + return 0; + } +} diff --git a/prov/ibverbs/src/verbs.c b/prov/ibverbs/src/verbs.c new file mode 100644 index 00000000000..c58108087b3 --- /dev/null +++ b/prov/ibverbs/src/verbs.c @@ -0,0 +1,534 @@ +/* + * Copyright (c) 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#if HAVE_CONFIG_H +# include <config.h> +#endif /* HAVE_CONFIG_H */ + +#include <stdio.h> +#include <netinet/in.h> +#include <unistd.h> +#include <stdlib.h> +#include <errno.h> +#include <string.h> + +#include "ibverbs.h" + +int ibv_rate_to_mult(enum ibv_rate rate) +{ + switch (rate) { + case IBV_RATE_2_5_GBPS: return 1; + case IBV_RATE_5_GBPS: return 2; + case IBV_RATE_10_GBPS: return 4; + case IBV_RATE_20_GBPS: return 8; + case IBV_RATE_30_GBPS: return 12; + case IBV_RATE_40_GBPS: return 16; + case IBV_RATE_60_GBPS: return 24; + case IBV_RATE_80_GBPS: return 32; + case IBV_RATE_120_GBPS: return 48; + default: return -1; + } +} + +enum ibv_rate mult_to_ibv_rate(int mult) +{ + switch (mult) { + case 1: return IBV_RATE_2_5_GBPS; + case 2: return IBV_RATE_5_GBPS; + case 4: return IBV_RATE_10_GBPS; + case 8: return IBV_RATE_20_GBPS; + case 12: return IBV_RATE_30_GBPS; + case 16: return IBV_RATE_40_GBPS; + case 24: return IBV_RATE_60_GBPS; + case 32: return IBV_RATE_80_GBPS; + case 48: return IBV_RATE_120_GBPS; + default: return IBV_RATE_MAX; + } +} + +int ibv_rate_to_mbps(enum ibv_rate rate) +{ + switch (rate) { + case IBV_RATE_2_5_GBPS: return 2500; + case IBV_RATE_5_GBPS: return 5000; + case IBV_RATE_10_GBPS: return 10000; + case IBV_RATE_20_GBPS: return 20000; + case IBV_RATE_30_GBPS: return 30000; + case IBV_RATE_40_GBPS: return 40000; + case IBV_RATE_60_GBPS: return 60000; + case IBV_RATE_80_GBPS: return 80000; + case IBV_RATE_120_GBPS: return 120000; + case IBV_RATE_14_GBPS: return 14062; + case IBV_RATE_56_GBPS: return 56250; + case IBV_RATE_112_GBPS: return 112500; + case IBV_RATE_168_GBPS: return 168750; + case IBV_RATE_25_GBPS: return 25781; + case IBV_RATE_100_GBPS: return 103125; + case IBV_RATE_200_GBPS: return 206250; + case IBV_RATE_300_GBPS: return 309375; + default: return -1; + } +} + +enum ibv_rate mbps_to_ibv_rate(int mbps) +{ + switch (mbps) { + case 2500: return IBV_RATE_2_5_GBPS; + case 5000: return IBV_RATE_5_GBPS; + case 10000: return IBV_RATE_10_GBPS; + case 20000: return IBV_RATE_20_GBPS; + case 30000: return IBV_RATE_30_GBPS; + case 40000: return IBV_RATE_40_GBPS; + case 60000: return IBV_RATE_60_GBPS; + case 80000: return IBV_RATE_80_GBPS; + case 120000: return IBV_RATE_120_GBPS; + case 14062: return IBV_RATE_14_GBPS; + case 56250: 
return IBV_RATE_56_GBPS; + case 112500: return IBV_RATE_112_GBPS; + case 168750: return IBV_RATE_168_GBPS; + case 25781: return IBV_RATE_25_GBPS; + case 103125: return IBV_RATE_100_GBPS; + case 206250: return IBV_RATE_200_GBPS; + case 309375: return IBV_RATE_300_GBPS; + default: return IBV_RATE_MAX; + } +} + +int ibv_query_device(struct ibv_context *context, + struct ibv_device_attr *device_attr) +{ + return context->ops.query_device(context, device_attr); +} + +int ibv_query_port(struct ibv_context *context, uint8_t port_num, + struct ibv_port_attr *port_attr) +{ + return context->ops.query_port(context, port_num, port_attr); +} + +int ibv_query_gid(struct ibv_context *context, uint8_t port_num, + int index, union ibv_gid *gid) +{ + char name[24]; + char attr[41]; + uint16_t val; + int i; + + snprintf(name, sizeof name, "ports/%d/gids/%d", port_num, index); + + if (fi_read_file(context->device->ibdev_path, name, + attr, sizeof attr) < 0) + return -1; + + for (i = 0; i < 8; ++i) { + if (sscanf(attr + i * 5, "%hx", &val) != 1) + return -1; + gid->raw[i * 2 ] = val >> 8; + gid->raw[i * 2 + 1] = val & 0xff; + } + + return 0; +} + +int ibv_query_pkey(struct ibv_context *context, uint8_t port_num, + int index, uint16_t *pkey) +{ + char name[24]; + char attr[8]; + uint16_t val; + + snprintf(name, sizeof name, "ports/%d/pkeys/%d", port_num, index); + + if (fi_read_file(context->device->ibdev_path, name, + attr, sizeof attr) < 0) + return -1; + + if (sscanf(attr, "%hx", &val) != 1) + return -1; + + *pkey = htons(val); + return 0; +} + +struct ibv_pd *ibv_alloc_pd(struct ibv_context *context) +{ + struct ibv_pd *pd; + + pd = context->ops.alloc_pd(context); + if (pd) + pd->context = context; + + return pd; +} + +int ibv_dealloc_pd(struct ibv_pd *pd) +{ + return pd->context->ops.dealloc_pd(pd); +} + +struct ibv_mr *ibv_reg_mr(struct ibv_pd *pd, void *addr, + size_t length, int access) +{ + struct ibv_mr *mr; + + if (ibv_dontfork_range(addr, length)) + return NULL; + + mr = pd->context->ops.reg_mr(pd, addr, length, access); + if (mr) { + mr->context = pd->context; + mr->pd = pd; + mr->addr = addr; + mr->length = length; + } else + ibv_dofork_range(addr, length); + + return mr; +} + +int ibv_dereg_mr(struct ibv_mr *mr) +{ + int ret; + void *addr = mr->addr; + size_t length = mr->length; + + ret = mr->context->ops.dereg_mr(mr); + if (!ret) + ibv_dofork_range(addr, length); + + return ret; +} + +struct ibv_comp_channel *ibv_create_comp_channel(struct ibv_context *context) +{ + struct ibv_comp_channel *channel; + struct ibv_create_comp_channel cmd; + struct ibv_create_comp_channel_resp resp; + int ret; + + channel = malloc(sizeof *channel); + if (!channel) + return NULL; + + ret = uv_create_comp_channel(context->uv_fid, &cmd, sizeof cmd, &resp, sizeof resp); + if (ret) { + free(channel); + return NULL; + } + + channel->context = context; + channel->fd = resp.fd; + channel->refcnt = 0; + + return channel; +} + +int ibv_destroy_comp_channel(struct ibv_comp_channel *channel) +{ + struct ibv_context *context; + int ret; + + context = channel->context; + pthread_mutex_lock(&context->mutex); + + if (channel->refcnt) { + ret = EBUSY; + goto out; + } + + close(channel->fd); + free(channel); + ret = 0; + +out: + pthread_mutex_unlock(&context->mutex); + + return ret; +} + +struct ibv_cq *ibv_create_cq(struct ibv_context *context, int cqe, void *cq_context, + struct ibv_comp_channel *channel, int comp_vector) +{ + struct ibv_cq *cq; + + pthread_mutex_lock(&context->mutex); + + cq = context->ops.create_cq(context, 
cqe, channel, comp_vector); + + if (cq) { + cq->context = context; + cq->channel = channel; + if (channel) + ++channel->refcnt; + cq->cq_context = cq_context; + cq->comp_events_completed = 0; + cq->async_events_completed = 0; + pthread_mutex_init(&cq->mutex, NULL); + pthread_cond_init(&cq->cond, NULL); + } + + pthread_mutex_unlock(&context->mutex); + + return cq; +} + +int ibv_resize_cq(struct ibv_cq *cq, int cqe) +{ + if (!cq->context->ops.resize_cq) + return ENOSYS; + + return cq->context->ops.resize_cq(cq, cqe); +} + +int ibv_destroy_cq(struct ibv_cq *cq) +{ + struct ibv_comp_channel *channel = cq->channel; + int ret; + + if (channel) + pthread_mutex_lock(&channel->context->mutex); + + ret = cq->context->ops.destroy_cq(cq); + + if (channel) { + if (!ret) + --channel->refcnt; + pthread_mutex_unlock(&channel->context->mutex); + } + + return ret; +} + +int ibv_get_cq_event(struct ibv_comp_channel *channel, + struct ibv_cq **cq, void **cq_context) +{ + struct ibv_comp_event ev; + + if (read(channel->fd, &ev, sizeof ev) != sizeof ev) + return -1; + + *cq = (struct ibv_cq *) (uintptr_t) ev.cq_handle; + *cq_context = (*cq)->cq_context; + + if ((*cq)->context->ops.cq_event) + (*cq)->context->ops.cq_event(*cq); + + return 0; +} + +void ibv_ack_cq_events(struct ibv_cq *cq, unsigned int nevents) +{ + pthread_mutex_lock(&cq->mutex); + cq->comp_events_completed += nevents; + pthread_cond_signal(&cq->cond); + pthread_mutex_unlock(&cq->mutex); +} + +struct ibv_srq *ibv_create_srq(struct ibv_pd *pd, + struct ibv_srq_init_attr *srq_init_attr) +{ + struct ibv_srq *srq; + + if (!pd->context->ops.create_srq) + return NULL; + + srq = pd->context->ops.create_srq(pd, srq_init_attr); + if (srq) { + srq->context = pd->context; + srq->srq_context = srq_init_attr->srq_context; + srq->pd = pd; + srq->events_completed = 0; + pthread_mutex_init(&srq->mutex, NULL); + pthread_cond_init(&srq->cond, NULL); + } + + return srq; +} + +int ibv_modify_srq(struct ibv_srq *srq, + struct ibv_srq_attr *srq_attr, + int srq_attr_mask) +{ + return srq->context->ops.modify_srq(srq, srq_attr, srq_attr_mask); +} + +int ibv_query_srq(struct ibv_srq *srq, struct ibv_srq_attr *srq_attr) +{ + return srq->context->ops.query_srq(srq, srq_attr); +} + +int ibv_destroy_srq(struct ibv_srq *srq) +{ + return srq->context->ops.destroy_srq(srq); +} + +struct ibv_qp *ibv_create_qp(struct ibv_pd *pd, + struct ibv_qp_init_attr *qp_init_attr) +{ + struct ibv_qp *qp = pd->context->ops.create_qp(pd, qp_init_attr); + + if (qp) { + qp->context = pd->context; + qp->qp_context = qp_init_attr->qp_context; + qp->pd = pd; + qp->send_cq = qp_init_attr->send_cq; + qp->recv_cq = qp_init_attr->recv_cq; + qp->srq = qp_init_attr->srq; + qp->qp_type = qp_init_attr->qp_type; + qp->state = IBV_QPS_RESET; + qp->events_completed = 0; + pthread_mutex_init(&qp->mutex, NULL); + pthread_cond_init(&qp->cond, NULL); + } + + return qp; +} + +int ibv_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, + int attr_mask, + struct ibv_qp_init_attr *init_attr) +{ + int ret; + + ret = qp->context->ops.query_qp(qp, attr, attr_mask, init_attr); + if (ret) + return ret; + + if (attr_mask & IBV_QP_STATE) + qp->state = attr->qp_state; + + return 0; +} + +int ibv_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, + int attr_mask) +{ + int ret; + + ret = qp->context->ops.modify_qp(qp, attr, attr_mask); + if (ret) + return ret; + + if (attr_mask & IBV_QP_STATE) + qp->state = attr->qp_state; + + return 0; +} + +int ibv_destroy_qp(struct ibv_qp *qp) +{ + return 
qp->context->ops.destroy_qp(qp); +} + +struct ibv_ah *ibv_create_ah(struct ibv_pd *pd, struct ibv_ah_attr *attr) +{ + struct ibv_ah *ah = pd->context->ops.create_ah(pd, attr); + + if (ah) { + ah->context = pd->context; + ah->pd = pd; + } + + return ah; +} + +static int ibv_find_gid_index(struct ibv_context *context, uint8_t port_num, + union ibv_gid *gid) +{ + union ibv_gid sgid; + int i = 0, ret; + + do { + ret = ibv_query_gid(context, port_num, i++, &sgid); + } while (!ret && memcmp(&sgid, gid, sizeof *gid)); + + return ret ? ret : i - 1; +} + +int ibv_init_ah_from_wc(struct ibv_context *context, uint8_t port_num, + struct ibv_wc *wc, struct ibv_grh *grh, + struct ibv_ah_attr *ah_attr) +{ + uint32_t flow_class; + int ret; + + memset(ah_attr, 0, sizeof *ah_attr); + ah_attr->dlid = wc->slid; + ah_attr->sl = wc->sl; + ah_attr->src_path_bits = wc->dlid_path_bits; + ah_attr->port_num = port_num; + + if (wc->wc_flags & IBV_WC_GRH) { + ah_attr->is_global = 1; + ah_attr->grh.dgid = grh->sgid; + + ret = ibv_find_gid_index(context, port_num, &grh->dgid); + if (ret < 0) + return ret; + + ah_attr->grh.sgid_index = (uint8_t) ret; + flow_class = ntohl(grh->version_tclass_flow); + ah_attr->grh.flow_label = flow_class & 0xFFFFF; + ah_attr->grh.hop_limit = grh->hop_limit; + ah_attr->grh.traffic_class = (flow_class >> 20) & 0xFF; + } + return 0; +} + +struct ibv_ah *ibv_create_ah_from_wc(struct ibv_pd *pd, struct ibv_wc *wc, + struct ibv_grh *grh, uint8_t port_num) +{ + struct ibv_ah_attr ah_attr; + int ret; + + ret = ibv_init_ah_from_wc(pd->context, port_num, wc, grh, &ah_attr); + if (ret) + return NULL; + + return ibv_create_ah(pd, &ah_attr); +} + +int ibv_destroy_ah(struct ibv_ah *ah) +{ + return ah->context->ops.destroy_ah(ah); +} + +int ibv_attach_mcast(struct ibv_qp *qp, const union ibv_gid *gid, uint16_t lid) +{ + return qp->context->ops.attach_mcast(qp, gid, lid); +} + +int ibv_detach_mcast(struct ibv_qp *qp, const union ibv_gid *gid, uint16_t lid) +{ + return qp->context->ops.detach_mcast(qp, gid, lid); +} diff --git a/prov/mlx4/AUTHORS b/prov/mlx4/AUTHORS new file mode 100644 index 00000000000..ffe1800452f --- /dev/null +++ b/prov/mlx4/AUTHORS @@ -0,0 +1 @@ +Roland Dreier <rolandd@cisco.com> diff --git a/prov/mlx4/COPYING b/prov/mlx4/COPYING new file mode 100644 index 00000000000..add3d1990bc --- /dev/null +++ b/prov/mlx4/COPYING @@ -0,0 +1,378 @@ +This software is available to you under a choice of one of two +licenses. You may choose to be licensed under the terms of the the +OpenIB.org BSD license or the GNU General Public License (GPL) Version +2, both included below. + +Copyright (c) 2007 Cisco, Inc. All rights reserved. + +================================================================== + + OpenIB.org BSD license + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE +COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + +================================================================== + + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc. + 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Library General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. 
This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. 
+ +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. 
Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. 
If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) year name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 
+ This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + <signature of Ty Coon>, 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Library General +Public License instead of this License. diff --git a/prov/mlx4/src/buf.c b/prov/mlx4/src/buf.c new file mode 100644 index 00000000000..3e8ec9a17e5 --- /dev/null +++ b/prov/mlx4/src/buf.c @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2006, 2007 Cisco, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#if HAVE_CONFIG_H +# include <config.h> +#endif /* HAVE_CONFIG_H */ + +#include <stdlib.h> +#include <errno.h> +#include <sys/mman.h> + +#include "mlx4.h" + + +int mlx4_alloc_buf(struct mlx4_buf *buf, size_t size, int page_size) +{ + int ret; + + buf->length = align(size, page_size); + buf->buf = mmap(NULL, buf->length, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (buf->buf == MAP_FAILED) + return errno; + + ret = ibv_dontfork_range(buf->buf, size); + if (ret) + munmap(buf->buf, buf->length); + + return ret; +} + +void mlx4_free_buf(struct mlx4_buf *buf) +{ + ibv_dofork_range(buf->buf, buf->length); + munmap(buf->buf, buf->length); +} diff --git a/prov/mlx4/src/cq.c b/prov/mlx4/src/cq.c new file mode 100644 index 00000000000..18447c48fbc --- /dev/null +++ b/prov/mlx4/src/cq.c @@ -0,0 +1,480 @@ +/* + * Copyright (c) 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2006, 2007 Cisco Systems. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#if HAVE_CONFIG_H +# include <config.h> +#endif /* HAVE_CONFIG_H */ + +#include <stdio.h> +#include <stdlib.h> +#include <pthread.h> +#include <netinet/in.h> +#include <string.h> + +#include <infiniband/opcode.h> + +#include "mlx4.h" +#include "doorbell.h" + +enum { + MLX4_CQ_DOORBELL = 0x20 +}; + +enum { + CQ_OK = 0, + CQ_EMPTY = -1, + CQ_POLL_ERR = -2 +}; + +#define MLX4_CQ_DB_REQ_NOT_SOL (1 << 24) +#define MLX4_CQ_DB_REQ_NOT (2 << 24) + +enum { + MLX4_CQE_VLAN_PRESENT_MASK = 1 << 29, + MLX4_CQE_QPN_MASK = 0xffffff, +}; + +enum { + MLX4_CQE_OWNER_MASK = 0x80, + MLX4_CQE_IS_SEND_MASK = 0x40, + MLX4_CQE_OPCODE_MASK = 0x1f +}; + +enum { + MLX4_CQE_SYNDROME_LOCAL_LENGTH_ERR = 0x01, + MLX4_CQE_SYNDROME_LOCAL_QP_OP_ERR = 0x02, + MLX4_CQE_SYNDROME_LOCAL_PROT_ERR = 0x04, + MLX4_CQE_SYNDROME_WR_FLUSH_ERR = 0x05, + MLX4_CQE_SYNDROME_MW_BIND_ERR = 0x06, + MLX4_CQE_SYNDROME_BAD_RESP_ERR = 0x10, + MLX4_CQE_SYNDROME_LOCAL_ACCESS_ERR = 0x11, + MLX4_CQE_SYNDROME_REMOTE_INVAL_REQ_ERR = 0x12, + MLX4_CQE_SYNDROME_REMOTE_ACCESS_ERR = 0x13, + MLX4_CQE_SYNDROME_REMOTE_OP_ERR = 0x14, + MLX4_CQE_SYNDROME_TRANSPORT_RETRY_EXC_ERR = 0x15, + MLX4_CQE_SYNDROME_RNR_RETRY_EXC_ERR = 0x16, + MLX4_CQE_SYNDROME_REMOTE_ABORTED_ERR = 0x22, +}; + +struct mlx4_err_cqe { + uint32_t vlan_my_qpn; + uint32_t reserved1[5]; + uint16_t wqe_index; + uint8_t vendor_err; + uint8_t syndrome; + uint8_t reserved2[3]; + uint8_t owner_sr_opcode; +}; + +static struct mlx4_cqe *get_cqe(struct mlx4_cq *cq, int entry) +{ + return cq->buf.buf + entry * cq->cqe_size; +} + +static void *get_sw_cqe(struct mlx4_cq *cq, int n) +{ + struct mlx4_cqe *cqe = get_cqe(cq, n & cq->ibv_cq.cqe); + struct mlx4_cqe *tcqe = cq->cqe_size == 64 ? cqe + 1 : cqe; + + return (!!(tcqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK) ^ + !!(n & (cq->ibv_cq.cqe + 1))) ? 
NULL : cqe; +} + +static struct mlx4_cqe *next_cqe_sw(struct mlx4_cq *cq) +{ + return get_sw_cqe(cq, cq->cons_index); +} + +static void update_cons_index(struct mlx4_cq *cq) +{ + *cq->set_ci_db = htonl(cq->cons_index & 0xffffff); +} + +static void mlx4_handle_error_cqe(struct mlx4_err_cqe *cqe, struct ibv_wc *wc) +{ + if (cqe->syndrome == MLX4_CQE_SYNDROME_LOCAL_QP_OP_ERR) + printf(PFX "local QP operation err " + "(QPN %06x, WQE index %x, vendor syndrome %02x, " + "opcode = %02x)\n", + htonl(cqe->vlan_my_qpn), htonl(cqe->wqe_index), + cqe->vendor_err, + cqe->owner_sr_opcode & ~MLX4_CQE_OWNER_MASK); + + switch (cqe->syndrome) { + case MLX4_CQE_SYNDROME_LOCAL_LENGTH_ERR: + wc->status = IBV_WC_LOC_LEN_ERR; + break; + case MLX4_CQE_SYNDROME_LOCAL_QP_OP_ERR: + wc->status = IBV_WC_LOC_QP_OP_ERR; + break; + case MLX4_CQE_SYNDROME_LOCAL_PROT_ERR: + wc->status = IBV_WC_LOC_PROT_ERR; + break; + case MLX4_CQE_SYNDROME_WR_FLUSH_ERR: + wc->status = IBV_WC_WR_FLUSH_ERR; + break; + case MLX4_CQE_SYNDROME_MW_BIND_ERR: + wc->status = IBV_WC_MW_BIND_ERR; + break; + case MLX4_CQE_SYNDROME_BAD_RESP_ERR: + wc->status = IBV_WC_BAD_RESP_ERR; + break; + case MLX4_CQE_SYNDROME_LOCAL_ACCESS_ERR: + wc->status = IBV_WC_LOC_ACCESS_ERR; + break; + case MLX4_CQE_SYNDROME_REMOTE_INVAL_REQ_ERR: + wc->status = IBV_WC_REM_INV_REQ_ERR; + break; + case MLX4_CQE_SYNDROME_REMOTE_ACCESS_ERR: + wc->status = IBV_WC_REM_ACCESS_ERR; + break; + case MLX4_CQE_SYNDROME_REMOTE_OP_ERR: + wc->status = IBV_WC_REM_OP_ERR; + break; + case MLX4_CQE_SYNDROME_TRANSPORT_RETRY_EXC_ERR: + wc->status = IBV_WC_RETRY_EXC_ERR; + break; + case MLX4_CQE_SYNDROME_RNR_RETRY_EXC_ERR: + wc->status = IBV_WC_RNR_RETRY_EXC_ERR; + break; + case MLX4_CQE_SYNDROME_REMOTE_ABORTED_ERR: + wc->status = IBV_WC_REM_ABORT_ERR; + break; + default: + wc->status = IBV_WC_GENERAL_ERR; + break; + } + + wc->vendor_err = cqe->vendor_err; +} + +static int mlx4_poll_one(struct mlx4_cq *cq, + struct mlx4_qp **cur_qp, + struct ibv_wc *wc) +{ + struct mlx4_wq *wq; + struct mlx4_cqe *cqe; + struct mlx4_srq *srq; + uint32_t qpn; + uint32_t g_mlpath_rqpn; + uint16_t wqe_index; + int is_error; + int is_send; + + cqe = next_cqe_sw(cq); + if (!cqe) + return CQ_EMPTY; + + if (cq->cqe_size == 64) + ++cqe; + + ++cq->cons_index; + + VALGRIND_MAKE_MEM_DEFINED(cqe, sizeof *cqe); + + /* + * Make sure we read CQ entry contents after we've checked the + * ownership bit. + */ + rmb(); + + qpn = ntohl(cqe->vlan_my_qpn) & MLX4_CQE_QPN_MASK; + + is_send = cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK; + is_error = (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) == + MLX4_CQE_OPCODE_ERROR; + + if (!*cur_qp || + (qpn != (*cur_qp)->ibv_qp.qp_num)) { + /* + * We do not have to take the QP table lock here, + * because CQs will be locked while QPs are removed + * from the table. 
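+ * (The caller, mlx4_poll_cq(), already holds cq->lock at this point.)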
+ */ + *cur_qp = mlx4_find_qp(to_mctx(cq->ibv_cq.context), qpn); + if (!*cur_qp) + return CQ_POLL_ERR; + } + + wc->qp_num = (*cur_qp)->ibv_qp.qp_num; + + if (is_send) { + wq = &(*cur_qp)->sq; + wqe_index = ntohs(cqe->wqe_index); + wq->tail += (uint16_t) (wqe_index - (uint16_t) wq->tail); + wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)]; + ++wq->tail; + } else if ((*cur_qp)->ibv_qp.srq) { + srq = to_msrq((*cur_qp)->ibv_qp.srq); + wqe_index = htons(cqe->wqe_index); + wc->wr_id = srq->wrid[wqe_index]; + mlx4_free_srq_wqe(srq, wqe_index); + } else { + wq = &(*cur_qp)->rq; + wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)]; + ++wq->tail; + } + + if (is_error) { + mlx4_handle_error_cqe((struct mlx4_err_cqe *) cqe, wc); + return CQ_OK; + } + + wc->status = IBV_WC_SUCCESS; + + if (is_send) { + wc->wc_flags = 0; + switch (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) { + case MLX4_OPCODE_RDMA_WRITE_IMM: + wc->wc_flags |= IBV_WC_WITH_IMM; + case MLX4_OPCODE_RDMA_WRITE: + wc->opcode = IBV_WC_RDMA_WRITE; + break; + case MLX4_OPCODE_SEND_IMM: + wc->wc_flags |= IBV_WC_WITH_IMM; + case MLX4_OPCODE_SEND: + wc->opcode = IBV_WC_SEND; + break; + case MLX4_OPCODE_RDMA_READ: + wc->opcode = IBV_WC_RDMA_READ; + wc->byte_len = ntohl(cqe->byte_cnt); + break; + case MLX4_OPCODE_ATOMIC_CS: + wc->opcode = IBV_WC_COMP_SWAP; + wc->byte_len = 8; + break; + case MLX4_OPCODE_ATOMIC_FA: + wc->opcode = IBV_WC_FETCH_ADD; + wc->byte_len = 8; + break; + case MLX4_OPCODE_BIND_MW: + wc->opcode = IBV_WC_BIND_MW; + break; + default: + /* assume it's a send completion */ + wc->opcode = IBV_WC_SEND; + break; + } + } else { + wc->byte_len = ntohl(cqe->byte_cnt); + + switch (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) { + case MLX4_RECV_OPCODE_RDMA_WRITE_IMM: + wc->opcode = IBV_WC_RECV_RDMA_WITH_IMM; + wc->wc_flags = IBV_WC_WITH_IMM; + wc->imm_data = cqe->immed_rss_invalid; + break; + case MLX4_RECV_OPCODE_SEND: + wc->opcode = IBV_WC_RECV; + wc->wc_flags = 0; + break; + case MLX4_RECV_OPCODE_SEND_IMM: + wc->opcode = IBV_WC_RECV; + wc->wc_flags = IBV_WC_WITH_IMM; + wc->imm_data = cqe->immed_rss_invalid; + break; + } + + wc->slid = ntohs(cqe->rlid); + g_mlpath_rqpn = ntohl(cqe->g_mlpath_rqpn); + wc->src_qp = g_mlpath_rqpn & 0xffffff; + wc->dlid_path_bits = (g_mlpath_rqpn >> 24) & 0x7f; + wc->wc_flags |= g_mlpath_rqpn & 0x80000000 ? IBV_WC_GRH : 0; + wc->pkey_index = ntohl(cqe->immed_rss_invalid) & 0x7f; + if ((*cur_qp)->link_layer == IBV_LINK_LAYER_ETHERNET) + wc->sl = ntohs(cqe->sl_vid) >> 13; + else + wc->sl = ntohs(cqe->sl_vid) >> 12; + } + + return CQ_OK; +} + +int mlx4_poll_cq(struct ibv_cq *ibcq, int ne, struct ibv_wc *wc) +{ + struct mlx4_cq *cq = to_mcq(ibcq); + struct mlx4_qp *qp = NULL; + int npolled; + int err = CQ_OK; + + pthread_spin_lock(&cq->lock); + + for (npolled = 0; npolled < ne; ++npolled) { + err = mlx4_poll_one(cq, &qp, wc + npolled); + if (err != CQ_OK) + break; + } + + if (npolled || err == CQ_POLL_ERR) + update_cons_index(cq); + + pthread_spin_unlock(&cq->lock); + + return err == CQ_POLL_ERR ? err : npolled; +} + +int mlx4_arm_cq(struct ibv_cq *ibvcq, int solicited) +{ + struct mlx4_cq *cq = to_mcq(ibvcq); + uint32_t doorbell[2]; + uint32_t sn; + uint32_t ci; + uint32_t cmd; + + sn = cq->arm_sn & 3; + ci = cq->cons_index & 0xffffff; + cmd = solicited ? MLX4_CQ_DB_REQ_NOT_SOL : MLX4_CQ_DB_REQ_NOT; + + *cq->arm_db = htonl(sn << 28 | cmd | ci); + + /* + * Make sure that the doorbell record in host memory is + * written before ringing the doorbell via PCI MMIO. 
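+ * The wmb() below is the write barrier that enforces this ordering + * before mlx4_write64() performs the MMIO write.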
+ */ + wmb(); + + doorbell[0] = htonl(sn << 28 | cmd | cq->cqn); + doorbell[1] = htonl(ci); + + mlx4_write64(doorbell, to_mctx(ibvcq->context), MLX4_CQ_DOORBELL); + + return 0; +} + +void mlx4_cq_event(struct ibv_cq *cq) +{ + to_mcq(cq)->arm_sn++; +} + +void __mlx4_cq_clean(struct mlx4_cq *cq, uint32_t qpn, struct mlx4_srq *srq) +{ + struct mlx4_cqe *cqe, *dest; + uint32_t prod_index; + uint8_t owner_bit; + int nfreed = 0; + int cqe_inc = cq->cqe_size == 64 ? 1 : 0; + + /* + * First we need to find the current producer index, so we + * know where to start cleaning from. It doesn't matter if HW + * adds new entries after this loop -- the QP we're worried + * about is already in RESET, so the new entries won't come + * from our QP and therefore don't need to be checked. + */ + for (prod_index = cq->cons_index; get_sw_cqe(cq, prod_index); ++prod_index) + if (prod_index == cq->cons_index + cq->ibv_cq.cqe) + break; + + /* + * Now sweep backwards through the CQ, removing CQ entries + * that match our QP by copying older entries on top of them. + */ + while ((int) --prod_index - (int) cq->cons_index >= 0) { + cqe = get_cqe(cq, prod_index & cq->ibv_cq.cqe); + cqe += cqe_inc; + if ((ntohl(cqe->vlan_my_qpn) & MLX4_CQE_QPN_MASK) == qpn) { + if (srq && !(cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK)) + mlx4_free_srq_wqe(srq, ntohs(cqe->wqe_index)); + ++nfreed; + } else if (nfreed) { + dest = get_cqe(cq, (prod_index + nfreed) & cq->ibv_cq.cqe); + dest += cqe_inc; + owner_bit = dest->owner_sr_opcode & MLX4_CQE_OWNER_MASK; + memcpy(dest, cqe, sizeof *cqe); + dest->owner_sr_opcode = owner_bit | + (dest->owner_sr_opcode & ~MLX4_CQE_OWNER_MASK); + } + } + + if (nfreed) { + cq->cons_index += nfreed; + /* + * Make sure update of buffer contents is done before + * updating consumer index. + */ + wmb(); + update_cons_index(cq); + } +} + +void mlx4_cq_clean(struct mlx4_cq *cq, uint32_t qpn, struct mlx4_srq *srq) +{ + pthread_spin_lock(&cq->lock); + __mlx4_cq_clean(cq, qpn, srq); + pthread_spin_unlock(&cq->lock); +} + +int mlx4_get_outstanding_cqes(struct mlx4_cq *cq) +{ + uint32_t i; + + for (i = cq->cons_index; get_sw_cqe(cq, (i & cq->ibv_cq.cqe)); ++i) + ; + + return i - cq->cons_index; +} + +void mlx4_cq_resize_copy_cqes(struct mlx4_cq *cq, void *buf, int old_cqe) +{ + struct mlx4_cqe *cqe; + int i; + int cqe_inc = cq->cqe_size == 64 ? 1 : 0; + + i = cq->cons_index; + cqe = get_cqe(cq, (i & old_cqe)); + cqe += cqe_inc; + + while ((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) != MLX4_CQE_OPCODE_RESIZE) { + cqe->owner_sr_opcode = (cqe->owner_sr_opcode & ~MLX4_CQE_OWNER_MASK) | + (((i + 1) & (cq->ibv_cq.cqe + 1)) ? MLX4_CQE_OWNER_MASK : 0); + memcpy(buf + ((i + 1) & cq->ibv_cq.cqe) * cq->cqe_size, + cqe - cqe_inc, cq->cqe_size); + ++i; + cqe = get_cqe(cq, (i & old_cqe)); + cqe += cqe_inc; + } + + ++cq->cons_index; +} + +int mlx4_alloc_cq_buf(struct mlx4_device *dev, struct mlx4_buf *buf, int nent, + int entry_size) +{ + if (mlx4_alloc_buf(buf, align(nent * entry_size, dev->page_size), + dev->page_size)) + return -1; + memset(buf->buf, 0, nent * entry_size); + + return 0; +} diff --git a/prov/mlx4/src/dbrec.c b/prov/mlx4/src/dbrec.c new file mode 100644 index 00000000000..02ef237b392 --- /dev/null +++ b/prov/mlx4/src/dbrec.c @@ -0,0 +1,154 @@ +/* + * Copyright (c) 2005 Topspin Communications. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#if HAVE_CONFIG_H +# include <config.h> +#endif /* HAVE_CONFIG_H */ + +#include <stdlib.h> +#include <netinet/in.h> +#include <pthread.h> +#include <string.h> + +#include "mlx4.h" + +struct mlx4_db_page { + struct mlx4_db_page *prev, *next; + struct mlx4_buf buf; + int num_db; + int use_cnt; + unsigned long free[0]; +}; + +static const int db_size[] = { + [MLX4_DB_TYPE_CQ] = 8, + [MLX4_DB_TYPE_RQ] = 4, +}; + +static struct mlx4_db_page *__add_page(struct mlx4_context *context, + enum mlx4_db_type type) +{ + struct mlx4_db_page *page; + int ps = to_mdev(context->ibv_ctx.device)->page_size; + int pp; + int i; + + pp = ps / db_size[type]; + + page = malloc(sizeof *page + pp / 8); + if (!page) + return NULL; + + if (mlx4_alloc_buf(&page->buf, ps, ps)) { + free(page); + return NULL; + } + + page->num_db = pp; + page->use_cnt = 0; + for (i = 0; i < pp / (sizeof (long) * 8); ++i) + page->free[i] = ~0; + + page->prev = NULL; + page->next = context->db_list[type]; + context->db_list[type] = page; + if (page->next) + page->next->prev = page; + + return page; +} + +uint32_t *mlx4_alloc_db(struct mlx4_context *context, enum mlx4_db_type type) +{ + struct mlx4_db_page *page; + uint32_t *db = NULL; + int i, j; + + pthread_mutex_lock(&context->db_list_mutex); + + for (page = context->db_list[type]; page; page = page->next) + if (page->use_cnt < page->num_db) + goto found; + + page = __add_page(context, type); + if (!page) + goto out; + +found: + ++page->use_cnt; + + for (i = 0; !page->free[i]; ++i) + /* nothing */; + + j = ffsl(page->free[i]); + page->free[i] &= ~(1UL << (j - 1)); + db = page->buf.buf + (i * 8 * sizeof (long) + (j - 1)) * db_size[type]; + +out: + pthread_mutex_unlock(&context->db_list_mutex); + + return db; +} + +void mlx4_free_db(struct mlx4_context *context, enum mlx4_db_type type, uint32_t *db) +{ + struct mlx4_db_page *page; + uintptr_t ps = to_mdev(context->ibv_ctx.device)->page_size; + int i; + + pthread_mutex_lock(&context->db_list_mutex); + + for (page = context->db_list[type]; page; page = page->next) + if (((uintptr_t) db & ~(ps - 1)) == (uintptr_t) page->buf.buf) + break; + + if (!page) + goto out; + + i = ((void *) db - page->buf.buf) / db_size[type]; + page->free[i / (8 * sizeof (long))] |= 1UL << (i % (8 * sizeof (long))); + + if 
(!--page->use_cnt) { + if (page->prev) + page->prev->next = page->next; + else + context->db_list[type] = page->next; + if (page->next) + page->next->prev = page->prev; + + mlx4_free_buf(&page->buf); + free(page); + } + +out: + pthread_mutex_unlock(&context->db_list_mutex); +} diff --git a/prov/mlx4/src/doorbell.h b/prov/mlx4/src/doorbell.h new file mode 100644 index 00000000000..3171e76976a --- /dev/null +++ b/prov/mlx4/src/doorbell.h @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2007 Cisco, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef DOORBELL_H +#define DOORBELL_H + +#if SIZEOF_LONG == 8 + +#if __BYTE_ORDER == __LITTLE_ENDIAN +# define MLX4_PAIR_TO_64(val) ((uint64_t) val[1] << 32 | val[0]) +#elif __BYTE_ORDER == __BIG_ENDIAN +# define MLX4_PAIR_TO_64(val) ((uint64_t) val[0] << 32 | val[1]) +#else +# error __BYTE_ORDER not defined +#endif + +static inline void mlx4_write64(uint32_t val[2], struct mlx4_context *ctx, int offset) +{ + *(volatile uint64_t *) (ctx->uar + offset) = MLX4_PAIR_TO_64(val); +} + +#else + +static inline void mlx4_write64(uint32_t val[2], struct mlx4_context *ctx, int offset) +{ + pthread_spin_lock(&ctx->uar_lock); + *(volatile uint32_t *) (ctx->uar + offset) = val[0]; + *(volatile uint32_t *) (ctx->uar + offset + 4) = val[1]; + pthread_spin_unlock(&ctx->uar_lock); +} + +#endif + +#endif /* DOORBELL_H */ diff --git a/prov/mlx4/src/mlx4-abi.h b/prov/mlx4/src/mlx4-abi.h new file mode 100644 index 00000000000..3bb3e6f2e65 --- /dev/null +++ b/prov/mlx4/src/mlx4-abi.h @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2007 Cisco, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MLX4_ABI_H +#define MLX4_ABI_H + +#include <rdma/fi_uverbs.h> + +#define MLX4_UVERBS_MIN_ABI_VERSION 2 +#define MLX4_UVERBS_MAX_ABI_VERSION 4 + +#define MLX4_UVERBS_NO_DEV_CAPS_ABI_VERSION 3 + +enum { + MLX4_USER_DEV_CAP_64B_CQE = 1L << 0 +}; + +struct mlx4_alloc_ucontext_resp_v3 { + struct ibv_get_context_resp ibv_resp; + __u32 qp_tab_size; + __u16 bf_reg_size; + __u16 bf_regs_per_page; +}; + +struct mlx4_alloc_ucontext_resp { + struct ibv_get_context_resp ibv_resp; + __u32 dev_caps; + __u32 qp_tab_size; + __u16 bf_reg_size; + __u16 bf_regs_per_page; + __u32 cqe_size; +}; + +struct mlx4_alloc_pd_resp { + struct ibv_alloc_pd_resp ibv_resp; + __u32 pdn; + __u32 reserved; +}; + +struct mlx4_create_cq { + struct ibv_create_cq ibv_cmd; + __u64 buf_addr; + __u64 db_addr; +}; + +struct mlx4_create_cq_resp { + struct ibv_create_cq_resp ibv_resp; + __u32 cqn; + __u32 reserved; +}; + +struct mlx4_resize_cq { + struct ibv_resize_cq ibv_cmd; + __u64 buf_addr; +}; + +struct mlx4_create_srq { + struct ibv_create_srq ibv_cmd; + __u64 buf_addr; + __u64 db_addr; +}; + +struct mlx4_create_srq_resp { + struct ibv_create_srq_resp ibv_resp; + __u32 srqn; + __u32 reserved; +}; + +struct mlx4_create_qp { + struct ibv_create_qp ibv_cmd; + __u64 buf_addr; + __u64 db_addr; + __u8 log_sq_bb_count; + __u8 log_sq_stride; + __u8 sq_no_prefetch; /* was reserved in ABI 2 */ + __u8 reserved[5]; +}; + +#endif /* MLX4_ABI_H */ diff --git a/prov/mlx4/src/mlx4.c b/prov/mlx4/src/mlx4.c new file mode 100644 index 00000000000..5e68070f1f5 --- /dev/null +++ b/prov/mlx4/src/mlx4.c @@ -0,0 +1,276 @@ +/* + * Copyright (c) 2007 Cisco, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#if HAVE_CONFIG_H +# include <config.h> +#endif /* HAVE_CONFIG_H */ + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <errno.h> +#include <sys/mman.h> +#include <pthread.h> +#include <string.h> + +#include "mlx4.h" +#include "mlx4-abi.h" + +#ifndef PCI_VENDOR_ID_MELLANOX +#define PCI_VENDOR_ID_MELLANOX 0x15b3 +#endif + +#define HCA(v, d) \ + { .vendor = PCI_VENDOR_ID_##v, \ + .device = d } + +struct { + unsigned vendor; + unsigned device; +} hca_table[] = { + HCA(MELLANOX, 0x6340), /* MT25408 "Hermon" SDR */ + HCA(MELLANOX, 0x634a), /* MT25408 "Hermon" DDR */ + HCA(MELLANOX, 0x6354), /* MT25408 "Hermon" QDR */ + HCA(MELLANOX, 0x6732), /* MT25408 "Hermon" DDR PCIe gen2 */ + HCA(MELLANOX, 0x673c), /* MT25408 "Hermon" QDR PCIe gen2 */ + HCA(MELLANOX, 0x6368), /* MT25408 "Hermon" EN 10GigE */ + HCA(MELLANOX, 0x6750), /* MT25408 "Hermon" EN 10GigE PCIe gen2 */ + HCA(MELLANOX, 0x6372), /* MT25458 ConnectX EN 10GBASE-T 10GigE */ + HCA(MELLANOX, 0x675a), /* MT25458 ConnectX EN 10GBASE-T+Gen2 10GigE */ + HCA(MELLANOX, 0x6764), /* MT26468 ConnectX EN 10GigE PCIe gen2*/ + HCA(MELLANOX, 0x6746), /* MT26438 ConnectX EN 40GigE PCIe gen2 5GT/s */ + HCA(MELLANOX, 0x676e), /* MT26478 ConnectX2 40GigE PCIe gen2 */ + HCA(MELLANOX, 0x1002), /* MT25400 Family [ConnectX-2 Virtual Function] */ + HCA(MELLANOX, 0x1003), /* MT27500 Family [ConnectX-3] */ + HCA(MELLANOX, 0x1004), /* MT27500 Family [ConnectX-3 Virtual Function] */ + HCA(MELLANOX, 0x1005), /* MT27510 Family */ + HCA(MELLANOX, 0x1006), /* MT27511 Family */ + HCA(MELLANOX, 0x1007), /* MT27520 Family */ + HCA(MELLANOX, 0x1008), /* MT27521 Family */ + HCA(MELLANOX, 0x1009), /* MT27530 Family */ + HCA(MELLANOX, 0x100a), /* MT27531 Family */ + HCA(MELLANOX, 0x100b), /* MT27540 Family */ + HCA(MELLANOX, 0x100c), /* MT27541 Family */ + HCA(MELLANOX, 0x100d), /* MT27550 Family */ + HCA(MELLANOX, 0x100e), /* MT27551 Family */ + HCA(MELLANOX, 0x100f), /* MT27560 Family */ + HCA(MELLANOX, 0x1010), /* MT27561 Family */ +}; + +static struct ibv_context_ops mlx4_ctx_ops = { + .query_device = mlx4_query_device, + .query_port = mlx4_query_port, + .alloc_pd = mlx4_alloc_pd, + .dealloc_pd = mlx4_free_pd, + .reg_mr = mlx4_reg_mr, + .dereg_mr = mlx4_dereg_mr, + .create_cq = mlx4_create_cq, + .poll_cq = mlx4_poll_cq, + .req_notify_cq = mlx4_arm_cq, + .cq_event = mlx4_cq_event, + .resize_cq = mlx4_resize_cq, + .destroy_cq = mlx4_destroy_cq, + .create_srq = mlx4_create_srq, + .modify_srq = mlx4_modify_srq, + .query_srq = mlx4_query_srq, + .destroy_srq = mlx4_destroy_srq, + .post_srq_recv = mlx4_post_srq_recv, + .create_qp = mlx4_create_qp, + .query_qp = mlx4_query_qp, + .modify_qp = mlx4_modify_qp, + .destroy_qp = mlx4_destroy_qp, + .post_send = mlx4_post_send, + .post_recv = mlx4_post_recv, + .create_ah = mlx4_create_ah, + .destroy_ah = mlx4_destroy_ah, + .attach_mcast = ibv_cmd_attach_mcast, + .detach_mcast = ibv_cmd_detach_mcast +}; + +static struct ibv_context *mlx4_alloc_context(struct ibv_device *ibdev, fid_t fid) +{ + struct mlx4_context *context; + struct ibv_get_context cmd; + struct mlx4_alloc_ucontext_resp resp; + int i; + struct mlx4_alloc_ucontext_resp_v3 resp_v3; + __u16 bf_reg_size; + struct mlx4_device *dev = to_mdev(ibdev); + struct fid_uverbs *uv; + + 
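/* Pick the v3 or current ABI response layout based on the device ABI version, then map the device UAR page and, when bf_reg_size is non-zero, the BlueFlame page. */ +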
context = calloc(1, sizeof *context); + if (!context) + return NULL; + + context->ibv_ctx.uv_fid = fid; + uv = container_of(fid, struct fid_uverbs, fid); + + if (dev->abi_version <= MLX4_UVERBS_NO_DEV_CAPS_ABI_VERSION) { + if (ibv_cmd_get_context(&context->ibv_ctx, &cmd, sizeof cmd, + &resp_v3.ibv_resp, sizeof resp_v3)) + goto err_free; + + context->num_qps = resp_v3.qp_tab_size; + bf_reg_size = resp_v3.bf_reg_size; + context->cqe_size = sizeof (struct mlx4_cqe); + } else { + if (ibv_cmd_get_context(&context->ibv_ctx, &cmd, sizeof cmd, + &resp.ibv_resp, sizeof resp)) + goto err_free; + + context->num_qps = resp.qp_tab_size; + bf_reg_size = resp.bf_reg_size; + if (resp.dev_caps & MLX4_USER_DEV_CAP_64B_CQE) + context->cqe_size = resp.cqe_size; + else + context->cqe_size = sizeof (struct mlx4_cqe); + } + + context->qp_table_shift = ffs(context->num_qps) - 1 - MLX4_QP_TABLE_BITS; + context->qp_table_mask = (1 << context->qp_table_shift) - 1; + + pthread_mutex_init(&context->qp_table_mutex, NULL); + for (i = 0; i < MLX4_QP_TABLE_SIZE; ++i) + context->qp_table[i].refcnt = 0; + + for (i = 0; i < MLX4_NUM_DB_TYPE; ++i) + context->db_list[i] = NULL; + + pthread_mutex_init(&context->db_list_mutex, NULL); + + context->uar = mmap(NULL, to_mdev(ibdev)->page_size, PROT_WRITE, + MAP_SHARED, uv->fd, 0); + if (context->uar == MAP_FAILED) + goto err_free; + + if (bf_reg_size) { + context->bf_page = mmap(NULL, to_mdev(ibdev)->page_size, + PROT_WRITE, MAP_SHARED, uv->fd, + to_mdev(ibdev)->page_size); + if (context->bf_page == MAP_FAILED) { + fprintf(stderr, PFX "Warning: BlueFlame available, " + "but failed to mmap() BlueFlame page.\n"); + context->bf_page = NULL; + context->bf_buf_size = 0; + } else { + context->bf_buf_size = bf_reg_size / 2; + context->bf_offset = 0; + pthread_spin_init(&context->bf_lock, PTHREAD_PROCESS_PRIVATE); + } + } else { + context->bf_page = NULL; + context->bf_buf_size = 0; + } + + pthread_spin_init(&context->uar_lock, PTHREAD_PROCESS_PRIVATE); + + context->ibv_ctx.ops = mlx4_ctx_ops; + + return &context->ibv_ctx; + +err_free: + free(context); + return NULL; +} + +static void mlx4_free_context(struct ibv_context *ibctx) +{ + struct mlx4_context *context = to_mctx(ibctx); + + munmap(context->uar, to_mdev(ibctx->device)->page_size); + if (context->bf_page) + munmap(context->bf_page, to_mdev(ibctx->device)->page_size); + free(context); +} + +static struct ibv_device_ops mlx4_dev_ops = { + .alloc_context = mlx4_alloc_context, + .free_context = mlx4_free_context +}; + +static struct ibv_device *mlx4_driver_init(const char *uverbs_sys_path, int abi_version) +{ + char value[8]; + struct mlx4_device *dev; + unsigned vendor, device; + int i; + + if (ibv_read_sysfs_file(uverbs_sys_path, "device/vendor", + value, sizeof value) < 0) + return NULL; + vendor = strtol(value, NULL, 16); + + if (ibv_read_sysfs_file(uverbs_sys_path, "device/device", + value, sizeof value) < 0) + return NULL; + device = strtol(value, NULL, 16); + + for (i = 0; i < sizeof hca_table / sizeof hca_table[0]; ++i) + if (vendor == hca_table[i].vendor && + device == hca_table[i].device) + goto found; + + return NULL; + +found: + if (abi_version < MLX4_UVERBS_MIN_ABI_VERSION || + abi_version > MLX4_UVERBS_MAX_ABI_VERSION) { + fprintf(stderr, PFX "Fatal: ABI version %d of %s is not supported " + "(min supported %d, max supported %d)\n", + abi_version, uverbs_sys_path, + MLX4_UVERBS_MIN_ABI_VERSION, + MLX4_UVERBS_MAX_ABI_VERSION); + return NULL; + } + + dev = malloc(sizeof *dev); + if (!dev) { + fprintf(stderr, PFX "Fatal: 
couldn't allocate device for %s\n", + uverbs_sys_path); + return NULL; + } + + dev->ibv_dev.ops = mlx4_dev_ops; + dev->page_size = sysconf(_SC_PAGESIZE); + dev->abi_version = abi_version; + + return &dev->ibv_dev; +} + +void mlx4_ini(void) +{ + ibv_register_driver("mlx4", mlx4_driver_init); +} + +void mlx4_fini(void) +{ +} diff --git a/prov/mlx4/src/mlx4.h b/prov/mlx4/src/mlx4.h new file mode 100644 index 00000000000..61ba7a11e47 --- /dev/null +++ b/prov/mlx4/src/mlx4.h @@ -0,0 +1,350 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005, 2006, 2007 Cisco Systems. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef MLX4_H +#define MLX4_H + +#include <stddef.h> + +#include <fi.h> +#include <infiniband/driver.h> +#include <rdma/fabric.h> +#include <rdma/fi_arch.h> + + +#define HIDDEN __attribute__((visibility ("hidden"))) + + +#ifndef HAVE_IBV_QPT_RAW_PACKET +#define IBV_QPT_RAW_PACKET 8 +#endif + +enum { + MLX4_STAT_RATE_OFFSET = 5 +}; + +enum { + MLX4_QP_TABLE_BITS = 8, + MLX4_QP_TABLE_SIZE = 1 << MLX4_QP_TABLE_BITS, + MLX4_QP_TABLE_MASK = MLX4_QP_TABLE_SIZE - 1 +}; + +enum mlx4_db_type { + MLX4_DB_TYPE_CQ, + MLX4_DB_TYPE_RQ, + MLX4_NUM_DB_TYPE +}; + +enum { + MLX4_OPCODE_NOP = 0x00, + MLX4_OPCODE_SEND_INVAL = 0x01, + MLX4_OPCODE_RDMA_WRITE = 0x08, + MLX4_OPCODE_RDMA_WRITE_IMM = 0x09, + MLX4_OPCODE_SEND = 0x0a, + MLX4_OPCODE_SEND_IMM = 0x0b, + MLX4_OPCODE_LSO = 0x0e, + MLX4_OPCODE_RDMA_READ = 0x10, + MLX4_OPCODE_ATOMIC_CS = 0x11, + MLX4_OPCODE_ATOMIC_FA = 0x12, + MLX4_OPCODE_MASKED_ATOMIC_CS = 0x14, + MLX4_OPCODE_MASKED_ATOMIC_FA = 0x15, + MLX4_OPCODE_BIND_MW = 0x18, + MLX4_OPCODE_FMR = 0x19, + MLX4_OPCODE_LOCAL_INVAL = 0x1b, + MLX4_OPCODE_CONFIG_CMD = 0x1f, + + MLX4_RECV_OPCODE_RDMA_WRITE_IMM = 0x00, + MLX4_RECV_OPCODE_SEND = 0x01, + MLX4_RECV_OPCODE_SEND_IMM = 0x02, + MLX4_RECV_OPCODE_SEND_INVAL = 0x03, + + MLX4_CQE_OPCODE_ERROR = 0x1e, + MLX4_CQE_OPCODE_RESIZE = 0x16, +}; + +struct mlx4_device { + struct ibv_device ibv_dev; + int page_size; + int abi_version; +}; + +struct mlx4_db_page; + +struct mlx4_context { + struct ibv_context ibv_ctx; + + void *uar; + pthread_spinlock_t uar_lock; + + void *bf_page; + int bf_buf_size; + int bf_offset; + pthread_spinlock_t bf_lock; + + struct { + struct mlx4_qp **table; + int refcnt; + } qp_table[MLX4_QP_TABLE_SIZE]; + pthread_mutex_t qp_table_mutex; + int num_qps; + int qp_table_shift; + int qp_table_mask; + + struct mlx4_db_page *db_list[MLX4_NUM_DB_TYPE]; + pthread_mutex_t db_list_mutex; + int cqe_size; +}; + +struct mlx4_buf { + void *buf; + size_t length; +}; + +struct mlx4_pd { + struct ibv_pd ibv_pd; + uint32_t pdn; +}; + +struct mlx4_cq { + struct ibv_cq ibv_cq; + struct mlx4_buf buf; + struct mlx4_buf resize_buf; + pthread_spinlock_t lock; + uint32_t cqn; + uint32_t cons_index; + uint32_t *set_ci_db; + uint32_t *arm_db; + int arm_sn; + int cqe_size; +}; + +struct mlx4_srq { + struct ibv_srq ibv_srq; + struct mlx4_buf buf; + pthread_spinlock_t lock; + uint64_t *wrid; + uint32_t srqn; + int max; + int max_gs; + int wqe_shift; + int head; + int tail; + uint32_t *db; + uint16_t counter; +}; + +struct mlx4_wq { + uint64_t *wrid; + pthread_spinlock_t lock; + int wqe_cnt; + int max_post; + unsigned head; + unsigned tail; + int max_gs; + int wqe_shift; + int offset; +}; + +struct mlx4_qp { + struct ibv_qp ibv_qp; + struct mlx4_buf buf; + int max_inline_data; + int buf_size; + + uint32_t doorbell_qpn; + uint32_t sq_signal_bits; + int sq_spare_wqes; + struct mlx4_wq sq; + + uint32_t *db; + struct mlx4_wq rq; + + uint8_t link_layer; +}; + +struct mlx4_av { + uint32_t port_pd; + uint8_t reserved1; + uint8_t g_slid; + uint16_t dlid; + uint8_t reserved2; + uint8_t gid_index; + uint8_t stat_rate; + uint8_t hop_limit; + uint32_t sl_tclass_flowlabel; + uint8_t dgid[16]; +}; + +struct mlx4_ah { + struct ibv_ah ibv_ah; + struct mlx4_av av; + uint16_t vlan; + uint8_t mac[6]; +}; + +struct mlx4_cqe { + uint32_t vlan_my_qpn; + uint32_t immed_rss_invalid; + uint32_t g_mlpath_rqpn; + uint8_t sl_vid; + uint8_t reserved1; + uint16_t rlid; + uint32_t reserved2; + uint32_t byte_cnt; + uint16_t wqe_index; + uint16_t checksum; + uint8_t reserved3[3]; + uint8_t 
owner_sr_opcode; +}; + +static inline unsigned long align(unsigned long val, unsigned long align) +{ + return (val + align - 1) & ~(align - 1); +} + +#define to_mxxx(xxx, type) \ + ((struct mlx4_##type *) \ + ((void *) ib##xxx - offsetof(struct mlx4_##type, ibv_##xxx))) + +static inline struct mlx4_device *to_mdev(struct ibv_device *ibdev) +{ + return to_mxxx(dev, device); +} + +static inline struct mlx4_context *to_mctx(struct ibv_context *ibctx) +{ + return to_mxxx(ctx, context); +} + +static inline struct mlx4_pd *to_mpd(struct ibv_pd *ibpd) +{ + return to_mxxx(pd, pd); +} + +static inline struct mlx4_cq *to_mcq(struct ibv_cq *ibcq) +{ + return to_mxxx(cq, cq); +} + +static inline struct mlx4_srq *to_msrq(struct ibv_srq *ibsrq) +{ + return to_mxxx(srq, srq); +} + +static inline struct mlx4_qp *to_mqp(struct ibv_qp *ibqp) +{ + return to_mxxx(qp, qp); +} + +static inline struct mlx4_ah *to_mah(struct ibv_ah *ibah) +{ + return to_mxxx(ah, ah); +} + +int mlx4_alloc_buf(struct mlx4_buf *buf, size_t size, int page_size); +void mlx4_free_buf(struct mlx4_buf *buf); + +uint32_t *mlx4_alloc_db(struct mlx4_context *context, enum mlx4_db_type type); +void mlx4_free_db(struct mlx4_context *context, enum mlx4_db_type type, uint32_t *db); + +int mlx4_query_device(struct ibv_context *context, + struct ibv_device_attr *attr); +int mlx4_query_port(struct ibv_context *context, uint8_t port, + struct ibv_port_attr *attr); + +struct ibv_pd *mlx4_alloc_pd(struct ibv_context *context); +int mlx4_free_pd(struct ibv_pd *pd); + +struct ibv_mr *mlx4_reg_mr(struct ibv_pd *pd, void *addr, + size_t length, int access); +int mlx4_dereg_mr(struct ibv_mr *mr); + +struct ibv_cq *mlx4_create_cq(struct ibv_context *context, int cqe, + struct ibv_comp_channel *channel, + int comp_vector); +int mlx4_alloc_cq_buf(struct mlx4_device *dev, struct mlx4_buf *buf, int nent, + int entry_size); +int mlx4_resize_cq(struct ibv_cq *cq, int cqe); +int mlx4_destroy_cq(struct ibv_cq *cq); +int mlx4_poll_cq(struct ibv_cq *cq, int ne, struct ibv_wc *wc); +int mlx4_arm_cq(struct ibv_cq *cq, int solicited); +void mlx4_cq_event(struct ibv_cq *cq); +void __mlx4_cq_clean(struct mlx4_cq *cq, uint32_t qpn, struct mlx4_srq *srq); +void mlx4_cq_clean(struct mlx4_cq *cq, uint32_t qpn, struct mlx4_srq *srq); +int mlx4_get_outstanding_cqes(struct mlx4_cq *cq); +void mlx4_cq_resize_copy_cqes(struct mlx4_cq *cq, void *buf, int new_cqe); + +struct ibv_srq *mlx4_create_srq(struct ibv_pd *pd, + struct ibv_srq_init_attr *attr); +int mlx4_modify_srq(struct ibv_srq *srq, + struct ibv_srq_attr *attr, + int mask); +int mlx4_query_srq(struct ibv_srq *srq, + struct ibv_srq_attr *attr); +int mlx4_destroy_srq(struct ibv_srq *srq); +int mlx4_alloc_srq_buf(struct ibv_pd *pd, struct ibv_srq_attr *attr, + struct mlx4_srq *srq); +void mlx4_free_srq_wqe(struct mlx4_srq *srq, int ind); +int mlx4_post_srq_recv(struct ibv_srq *ibsrq, + struct ibv_recv_wr *wr, + struct ibv_recv_wr **bad_wr); + +struct ibv_qp *mlx4_create_qp(struct ibv_pd *pd, struct ibv_qp_init_attr *attr); +int mlx4_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, + int attr_mask, + struct ibv_qp_init_attr *init_attr); +int mlx4_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, + int attr_mask); +int mlx4_destroy_qp(struct ibv_qp *qp); +void mlx4_init_qp_indices(struct mlx4_qp *qp); +void mlx4_qp_init_sq_ownership(struct mlx4_qp *qp); +int mlx4_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr, + struct ibv_send_wr **bad_wr); +int mlx4_post_recv(struct ibv_qp *ibqp, struct ibv_recv_wr *wr, + 
struct ibv_recv_wr **bad_wr); +void mlx4_calc_sq_wqe_size(struct ibv_qp_cap *cap, enum ibv_qp_type type, + struct mlx4_qp *qp); +int mlx4_alloc_qp_buf(struct ibv_pd *pd, struct ibv_qp_cap *cap, + enum ibv_qp_type type, struct mlx4_qp *qp); +void mlx4_set_sq_sizes(struct mlx4_qp *qp, struct ibv_qp_cap *cap, + enum ibv_qp_type type); +struct mlx4_qp *mlx4_find_qp(struct mlx4_context *ctx, uint32_t qpn); +int mlx4_store_qp(struct mlx4_context *ctx, uint32_t qpn, struct mlx4_qp *qp); +void mlx4_clear_qp(struct mlx4_context *ctx, uint32_t qpn); +struct ibv_ah *mlx4_create_ah(struct ibv_pd *pd, struct ibv_ah_attr *attr); +int mlx4_destroy_ah(struct ibv_ah *ah); +int mlx4_alloc_av(struct mlx4_pd *pd, struct ibv_ah_attr *attr, + struct mlx4_ah *ah); +void mlx4_free_av(struct mlx4_ah *ah); + +#endif /* MLX4_H */ diff --git a/prov/mlx4/src/mlx4_verbs.c b/prov/mlx4/src/mlx4_verbs.c new file mode 100644 index 00000000000..7c5ee531498 --- /dev/null +++ b/prov/mlx4/src/mlx4_verbs.c @@ -0,0 +1,741 @@ +/* + * Copyright (c) 2007 Cisco, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#if HAVE_CONFIG_H +# include <config.h> +#endif /* HAVE_CONFIG_H */ + +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <pthread.h> +#include <errno.h> +#include <netinet/in.h> + +#include "mlx4.h" +#include "mlx4-abi.h" +#include "wqe.h" + +int mlx4_query_device(struct ibv_context *context, struct ibv_device_attr *attr) +{ + struct ibv_query_device cmd; + uint64_t raw_fw_ver; + unsigned major, minor, sub_minor; + int ret; + + ret = ibv_cmd_query_device(context, attr, &raw_fw_ver, &cmd, sizeof cmd); + if (ret) + return ret; + + major = (raw_fw_ver >> 32) & 0xffff; + minor = (raw_fw_ver >> 16) & 0xffff; + sub_minor = raw_fw_ver & 0xffff; + + snprintf(attr->fw_ver, sizeof attr->fw_ver, + "%d.%d.%03d", major, minor, sub_minor); + + return 0; +} + +int mlx4_query_port(struct ibv_context *context, uint8_t port, + struct ibv_port_attr *attr) +{ + struct ibv_query_port cmd; + + return ibv_cmd_query_port(context, port, attr, &cmd, sizeof cmd); +} + +struct ibv_pd *mlx4_alloc_pd(struct ibv_context *context) +{ + struct ibv_alloc_pd cmd; + struct mlx4_alloc_pd_resp resp; + struct mlx4_pd *pd; + + pd = malloc(sizeof *pd); + if (!pd) + return NULL; + + if (ibv_cmd_alloc_pd(context, &pd->ibv_pd, &cmd, sizeof cmd, + &resp.ibv_resp, sizeof resp)) { + free(pd); + return NULL; + } + + pd->pdn = resp.pdn; + + return &pd->ibv_pd; +} + +int mlx4_free_pd(struct ibv_pd *pd) +{ + int ret; + + ret = ibv_cmd_dealloc_pd(pd); + if (ret) + return ret; + + free(to_mpd(pd)); + return 0; +} + +struct ibv_mr *mlx4_reg_mr(struct ibv_pd *pd, void *addr, size_t length, + int access) +{ + struct ibv_mr *mr; + struct ibv_reg_mr cmd; + int ret; + + mr = malloc(sizeof *mr); + if (!mr) + return NULL; + +#ifdef IBV_CMD_REG_MR_HAS_RESP_PARAMS + { + struct ibv_reg_mr_resp resp; + + ret = ibv_cmd_reg_mr(pd, addr, length, (uintptr_t) addr, + access, mr, &cmd, sizeof cmd, + &resp, sizeof resp); + } +#else + ret = ibv_cmd_reg_mr(pd, addr, length, (uintptr_t) addr, access, mr, + &cmd, sizeof cmd); +#endif + if (ret) { + free(mr); + return NULL; + } + + return mr; +} + +int mlx4_dereg_mr(struct ibv_mr *mr) +{ + int ret; + + ret = ibv_cmd_dereg_mr(mr); + if (ret) + return ret; + + free(mr); + return 0; +} + +static int align_queue_size(int req) +{ + int nent; + + for (nent = 1; nent < req; nent <<= 1) + ; /* nothing */ + + return nent; +} + +struct ibv_cq *mlx4_create_cq(struct ibv_context *context, int cqe, + struct ibv_comp_channel *channel, + int comp_vector) +{ + struct mlx4_create_cq cmd; + struct mlx4_create_cq_resp resp; + struct mlx4_cq *cq; + int ret; + struct mlx4_context *mctx = to_mctx(context); + + /* Sanity check CQ size before proceeding */ + if (cqe > 0x3fffff) + return NULL; + + cq = malloc(sizeof *cq); + if (!cq) + return NULL; + + cq->cons_index = 0; + + if (pthread_spin_init(&cq->lock, PTHREAD_PROCESS_PRIVATE)) + goto err; + + cqe = align_queue_size(cqe + 1); + + if (mlx4_alloc_cq_buf(to_mdev(context->device), &cq->buf, cqe, mctx->cqe_size)) + goto err; + + cq->cqe_size = mctx->cqe_size; + cq->set_ci_db = mlx4_alloc_db(to_mctx(context), MLX4_DB_TYPE_CQ); + if (!cq->set_ci_db) + goto err_buf; + + cq->arm_db = cq->set_ci_db + 1; + *cq->arm_db = 0; + cq->arm_sn = 1; + *cq->set_ci_db = 0; + + cmd.buf_addr = (uintptr_t) cq->buf.buf; + cmd.db_addr = (uintptr_t) cq->set_ci_db; + + ret = ibv_cmd_create_cq(context, cqe - 1, channel, comp_vector, + &cq->ibv_cq, &cmd.ibv_cmd, sizeof cmd, + &resp.ibv_resp, sizeof resp); + if (ret) + goto err_db; + + cq->cqn = resp.cqn; + + return &cq->ibv_cq; + 
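+ /* Error unwind: release the doorbell record, the CQ buffer, and the CQ structure in reverse order of allocation. */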
+err_db: + mlx4_free_db(to_mctx(context), MLX4_DB_TYPE_CQ, cq->set_ci_db); + +err_buf: + mlx4_free_buf(&cq->buf); + +err: + free(cq); + + return NULL; +} + +int mlx4_resize_cq(struct ibv_cq *ibcq, int cqe) +{ + struct mlx4_cq *cq = to_mcq(ibcq); + struct mlx4_resize_cq cmd; + struct mlx4_buf buf; + int old_cqe, outst_cqe, ret; + + /* Sanity check CQ size before proceeding */ + if (cqe > 0x3fffff) + return EINVAL; + + pthread_spin_lock(&cq->lock); + + cqe = align_queue_size(cqe + 1); + if (cqe == ibcq->cqe + 1) { + ret = 0; + goto out; + } + + /* Can't be smaller then the number of outstanding CQEs */ + outst_cqe = mlx4_get_outstanding_cqes(cq); + if (cqe < outst_cqe + 1) { + ret = 0; + goto out; + } + + ret = mlx4_alloc_cq_buf(to_mdev(ibcq->context->device), &buf, cqe, cq->cqe_size); + if (ret) + goto out; + + old_cqe = ibcq->cqe; + cmd.buf_addr = (uintptr_t) buf.buf; + +#ifdef IBV_CMD_RESIZE_CQ_HAS_RESP_PARAMS + { + struct ibv_resize_cq_resp resp; + ret = ibv_cmd_resize_cq(ibcq, cqe - 1, &cmd.ibv_cmd, sizeof cmd, + &resp, sizeof resp); + } +#else + ret = ibv_cmd_resize_cq(ibcq, cqe - 1, &cmd.ibv_cmd, sizeof cmd); +#endif + if (ret) { + mlx4_free_buf(&buf); + goto out; + } + + mlx4_cq_resize_copy_cqes(cq, buf.buf, old_cqe); + + mlx4_free_buf(&cq->buf); + cq->buf = buf; + +out: + pthread_spin_unlock(&cq->lock); + return ret; +} + +int mlx4_destroy_cq(struct ibv_cq *cq) +{ + int ret; + + ret = ibv_cmd_destroy_cq(cq); + if (ret) + return ret; + + mlx4_free_db(to_mctx(cq->context), MLX4_DB_TYPE_CQ, to_mcq(cq)->set_ci_db); + mlx4_free_buf(&to_mcq(cq)->buf); + free(to_mcq(cq)); + + return 0; +} + +struct ibv_srq *mlx4_create_srq(struct ibv_pd *pd, + struct ibv_srq_init_attr *attr) +{ + struct mlx4_create_srq cmd; + struct mlx4_create_srq_resp resp; + struct mlx4_srq *srq; + int ret; + + /* Sanity check SRQ size before proceeding */ + if (attr->attr.max_wr > 1 << 16 || attr->attr.max_sge > 64) + return NULL; + + srq = malloc(sizeof *srq); + if (!srq) + return NULL; + + if (pthread_spin_init(&srq->lock, PTHREAD_PROCESS_PRIVATE)) + goto err; + + srq->max = align_queue_size(attr->attr.max_wr + 1); + srq->max_gs = attr->attr.max_sge; + srq->counter = 0; + + if (mlx4_alloc_srq_buf(pd, &attr->attr, srq)) + goto err; + + srq->db = mlx4_alloc_db(to_mctx(pd->context), MLX4_DB_TYPE_RQ); + if (!srq->db) + goto err_free; + + *srq->db = 0; + + cmd.buf_addr = (uintptr_t) srq->buf.buf; + cmd.db_addr = (uintptr_t) srq->db; + + ret = ibv_cmd_create_srq(pd, &srq->ibv_srq, attr, + &cmd.ibv_cmd, sizeof cmd, + &resp.ibv_resp, sizeof resp); + if (ret) + goto err_db; + + srq->srqn = resp.srqn; + + return &srq->ibv_srq; + +err_db: + mlx4_free_db(to_mctx(pd->context), MLX4_DB_TYPE_RQ, srq->db); + +err_free: + free(srq->wrid); + mlx4_free_buf(&srq->buf); + +err: + free(srq); + + return NULL; +} + +int mlx4_modify_srq(struct ibv_srq *srq, + struct ibv_srq_attr *attr, + int attr_mask) +{ + struct ibv_modify_srq cmd; + + return ibv_cmd_modify_srq(srq, attr, attr_mask, &cmd, sizeof cmd); +} + +int mlx4_query_srq(struct ibv_srq *srq, + struct ibv_srq_attr *attr) +{ + struct ibv_query_srq cmd; + + return ibv_cmd_query_srq(srq, attr, &cmd, sizeof cmd); +} + +int mlx4_destroy_srq(struct ibv_srq *srq) +{ + int ret; + + ret = ibv_cmd_destroy_srq(srq); + if (ret) + return ret; + + mlx4_free_db(to_mctx(srq->context), MLX4_DB_TYPE_RQ, to_msrq(srq)->db); + mlx4_free_buf(&to_msrq(srq)->buf); + free(to_msrq(srq)->wrid); + free(to_msrq(srq)); + + return 0; +} + +struct ibv_qp *mlx4_create_qp(struct ibv_pd *pd, struct ibv_qp_init_attr 
*attr) +{ + struct mlx4_create_qp cmd; + struct ibv_create_qp_resp resp; + struct mlx4_qp *qp; + int ret; + + /* Sanity check QP size before proceeding */ + if (attr->cap.max_send_wr > 65536 || + attr->cap.max_recv_wr > 65536 || + attr->cap.max_send_sge > 64 || + attr->cap.max_recv_sge > 64 || + attr->cap.max_inline_data > 1024) + return NULL; + + qp = malloc(sizeof *qp); + if (!qp) + return NULL; + + mlx4_calc_sq_wqe_size(&attr->cap, attr->qp_type, qp); + + /* + * We need to leave 2 KB + 1 WQE of headroom in the SQ to + * allow HW to prefetch. + */ + qp->sq_spare_wqes = (2048 >> qp->sq.wqe_shift) + 1; + qp->sq.wqe_cnt = align_queue_size(attr->cap.max_send_wr + qp->sq_spare_wqes); + qp->rq.wqe_cnt = align_queue_size(attr->cap.max_recv_wr); + + if (attr->srq) + attr->cap.max_recv_wr = qp->rq.wqe_cnt = 0; + else { + if (attr->cap.max_recv_sge < 1) + attr->cap.max_recv_sge = 1; + if (attr->cap.max_recv_wr < 1) + attr->cap.max_recv_wr = 1; + } + + if (mlx4_alloc_qp_buf(pd, &attr->cap, attr->qp_type, qp)) + goto err; + + mlx4_init_qp_indices(qp); + + if (pthread_spin_init(&qp->sq.lock, PTHREAD_PROCESS_PRIVATE) || + pthread_spin_init(&qp->rq.lock, PTHREAD_PROCESS_PRIVATE)) + goto err_free; + + if (!attr->srq) { + qp->db = mlx4_alloc_db(to_mctx(pd->context), MLX4_DB_TYPE_RQ); + if (!qp->db) + goto err_free; + + *qp->db = 0; + } + + cmd.buf_addr = (uintptr_t) qp->buf.buf; + if (attr->srq) + cmd.db_addr = 0; + else + cmd.db_addr = (uintptr_t) qp->db; + cmd.log_sq_stride = qp->sq.wqe_shift; + for (cmd.log_sq_bb_count = 0; + qp->sq.wqe_cnt > 1 << cmd.log_sq_bb_count; + ++cmd.log_sq_bb_count) + ; /* nothing */ + cmd.sq_no_prefetch = 0; /* OK for ABI 2: just a reserved field */ + memset(cmd.reserved, 0, sizeof cmd.reserved); + + pthread_mutex_lock(&to_mctx(pd->context)->qp_table_mutex); + + ret = ibv_cmd_create_qp(pd, &qp->ibv_qp, attr, &cmd.ibv_cmd, sizeof cmd, + &resp, sizeof resp); + if (ret) + goto err_rq_db; + + ret = mlx4_store_qp(to_mctx(pd->context), qp->ibv_qp.qp_num, qp); + if (ret) + goto err_destroy; + pthread_mutex_unlock(&to_mctx(pd->context)->qp_table_mutex); + + qp->rq.wqe_cnt = qp->rq.max_post = attr->cap.max_recv_wr; + qp->rq.max_gs = attr->cap.max_recv_sge; + mlx4_set_sq_sizes(qp, &attr->cap, attr->qp_type); + + qp->doorbell_qpn = htonl(qp->ibv_qp.qp_num << 8); + if (attr->sq_sig_all) + qp->sq_signal_bits = htonl(MLX4_WQE_CTRL_CQ_UPDATE); + else + qp->sq_signal_bits = 0; + + return &qp->ibv_qp; + +err_destroy: + ibv_cmd_destroy_qp(&qp->ibv_qp); + +err_rq_db: + pthread_mutex_unlock(&to_mctx(pd->context)->qp_table_mutex); + if (!attr->srq) + mlx4_free_db(to_mctx(pd->context), MLX4_DB_TYPE_RQ, qp->db); + +err_free: + free(qp->sq.wrid); + if (qp->rq.wqe_cnt) + free(qp->rq.wrid); + mlx4_free_buf(&qp->buf); + +err: + free(qp); + + return NULL; +} + +int mlx4_query_qp(struct ibv_qp *ibqp, struct ibv_qp_attr *attr, + int attr_mask, + struct ibv_qp_init_attr *init_attr) +{ + struct ibv_query_qp cmd; + struct mlx4_qp *qp = to_mqp(ibqp); + int ret; + + ret = ibv_cmd_query_qp(ibqp, attr, attr_mask, init_attr, &cmd, sizeof cmd); + if (ret) + return ret; + + init_attr->cap.max_send_wr = qp->sq.max_post; + init_attr->cap.max_send_sge = qp->sq.max_gs; + init_attr->cap.max_inline_data = qp->max_inline_data; + + attr->cap = init_attr->cap; + + return 0; +} + +int mlx4_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, + int attr_mask) +{ + struct ibv_modify_qp cmd; + struct ibv_port_attr port_attr; + struct mlx4_qp *mqp = to_mqp(qp); + int ret; + + if (attr_mask & IBV_QP_PORT) { + ret = 
ibv_query_port(qp->pd->context, attr->port_num, + &port_attr); + if (ret) + return ret; + mqp->link_layer = port_attr.link_layer; + } + + if (qp->state == IBV_QPS_RESET && + attr_mask & IBV_QP_STATE && + attr->qp_state == IBV_QPS_INIT) { + mlx4_qp_init_sq_ownership(to_mqp(qp)); + } + + ret = ibv_cmd_modify_qp(qp, attr, attr_mask, &cmd, sizeof cmd); + + if (!ret && + (attr_mask & IBV_QP_STATE) && + attr->qp_state == IBV_QPS_RESET) { + mlx4_cq_clean(to_mcq(qp->recv_cq), qp->qp_num, + qp->srq ? to_msrq(qp->srq) : NULL); + if (qp->send_cq != qp->recv_cq) + mlx4_cq_clean(to_mcq(qp->send_cq), qp->qp_num, NULL); + + mlx4_init_qp_indices(to_mqp(qp)); + if (!qp->srq) + *to_mqp(qp)->db = 0; + } + + return ret; +} + +static void mlx4_lock_cqs(struct ibv_qp *qp) +{ + struct mlx4_cq *send_cq = to_mcq(qp->send_cq); + struct mlx4_cq *recv_cq = to_mcq(qp->recv_cq); + + if (send_cq == recv_cq) + pthread_spin_lock(&send_cq->lock); + else if (send_cq->cqn < recv_cq->cqn) { + pthread_spin_lock(&send_cq->lock); + pthread_spin_lock(&recv_cq->lock); + } else { + pthread_spin_lock(&recv_cq->lock); + pthread_spin_lock(&send_cq->lock); + } +} + +static void mlx4_unlock_cqs(struct ibv_qp *qp) +{ + struct mlx4_cq *send_cq = to_mcq(qp->send_cq); + struct mlx4_cq *recv_cq = to_mcq(qp->recv_cq); + + if (send_cq == recv_cq) + pthread_spin_unlock(&send_cq->lock); + else if (send_cq->cqn < recv_cq->cqn) { + pthread_spin_unlock(&recv_cq->lock); + pthread_spin_unlock(&send_cq->lock); + } else { + pthread_spin_unlock(&send_cq->lock); + pthread_spin_unlock(&recv_cq->lock); + } +} + +int mlx4_destroy_qp(struct ibv_qp *ibqp) +{ + struct mlx4_qp *qp = to_mqp(ibqp); + int ret; + + pthread_mutex_lock(&to_mctx(ibqp->context)->qp_table_mutex); + ret = ibv_cmd_destroy_qp(ibqp); + if (ret) { + pthread_mutex_unlock(&to_mctx(ibqp->context)->qp_table_mutex); + return ret; + } + + mlx4_lock_cqs(ibqp); + + __mlx4_cq_clean(to_mcq(ibqp->recv_cq), ibqp->qp_num, + ibqp->srq ? to_msrq(ibqp->srq) : NULL); + if (ibqp->send_cq != ibqp->recv_cq) + __mlx4_cq_clean(to_mcq(ibqp->send_cq), ibqp->qp_num, NULL); + + mlx4_clear_qp(to_mctx(ibqp->context), ibqp->qp_num); + + mlx4_unlock_cqs(ibqp); + pthread_mutex_unlock(&to_mctx(ibqp->context)->qp_table_mutex); + + if (!ibqp->srq) + mlx4_free_db(to_mctx(ibqp->context), MLX4_DB_TYPE_RQ, qp->db); + free(qp->sq.wrid); + if (qp->rq.wqe_cnt) + free(qp->rq.wrid); + mlx4_free_buf(&qp->buf); + free(qp); + + return 0; +} + +static int link_local_gid(const union ibv_gid *gid) +{ + uint32_t hi = *(uint32_t *)(gid->raw); + uint32_t lo = *(uint32_t *)(gid->raw + 4); + if (hi == htonl(0xfe800000) && lo == 0) + return 1; + + return 0; +} + +static int is_multicast_gid(const union ibv_gid *gid) +{ + return gid->raw[0] == 0xff; +} + +static uint16_t get_vlan_id(union ibv_gid *gid) +{ + uint16_t vid; + vid = gid->raw[11] << 8 | gid->raw[12]; + return vid < 0x1000 ? 
vid : 0xffff; +} + +static int mlx4_resolve_grh_to_l2(struct ibv_pd *pd, struct mlx4_ah *ah, + struct ibv_ah_attr *attr) +{ + int err, i; + uint16_t vid; + union ibv_gid sgid; + + if (link_local_gid(&attr->grh.dgid)) { + memcpy(ah->mac, &attr->grh.dgid.raw[8], 3); + memcpy(ah->mac + 3, &attr->grh.dgid.raw[13], 3); + ah->mac[0] ^= 2; + + vid = get_vlan_id(&attr->grh.dgid); + } else if (is_multicast_gid(&attr->grh.dgid)) { + ah->mac[0] = 0x33; + ah->mac[1] = 0x33; + for (i = 2; i < 6; ++i) + ah->mac[i] = attr->grh.dgid.raw[i + 10]; + + err = ibv_query_gid(pd->context, attr->port_num, + attr->grh.sgid_index, &sgid); + if (err) + return err; + + ah->av.dlid = htons(0xc000); + ah->av.port_pd |= htonl(1 << 31); + + vid = get_vlan_id(&sgid); + } else + return 1; + + if (vid != 0xffff) { + ah->av.port_pd |= htonl(1 << 29); + ah->vlan = vid | ((attr->sl & 7) << 13); + } + + return 0; +} + +struct ibv_ah *mlx4_create_ah(struct ibv_pd *pd, struct ibv_ah_attr *attr) +{ + struct mlx4_ah *ah; + struct ibv_port_attr port_attr; + + if (ibv_query_port(pd->context, attr->port_num, &port_attr)) + return NULL; + + ah = malloc(sizeof *ah); + if (!ah) + return NULL; + + memset(&ah->av, 0, sizeof ah->av); + + ah->av.port_pd = htonl(to_mpd(pd)->pdn | (attr->port_num << 24)); + + if (port_attr.link_layer != IBV_LINK_LAYER_ETHERNET) { + ah->av.g_slid = attr->src_path_bits; + ah->av.dlid = htons(attr->dlid); + ah->av.sl_tclass_flowlabel = htonl(attr->sl << 28); + } else + ah->av.sl_tclass_flowlabel = htonl(attr->sl << 29); + + if (attr->static_rate) { + ah->av.stat_rate = attr->static_rate + MLX4_STAT_RATE_OFFSET; + /* XXX check rate cap? */ + } + if (attr->is_global) { + ah->av.g_slid |= 0x80; + ah->av.gid_index = attr->grh.sgid_index; + ah->av.hop_limit = attr->grh.hop_limit; + ah->av.sl_tclass_flowlabel |= + htonl((attr->grh.traffic_class << 20) | + attr->grh.flow_label); + memcpy(ah->av.dgid, attr->grh.dgid.raw, 16); + } + + if (port_attr.link_layer == IBV_LINK_LAYER_ETHERNET) + if (mlx4_resolve_grh_to_l2(pd, ah, attr)) { + free(ah); + return NULL; + } + + return &ah->ibv_ah; +} + +int mlx4_destroy_ah(struct ibv_ah *ah) +{ + free(to_mah(ah)); + + return 0; +} diff --git a/prov/mlx4/src/qp.c b/prov/mlx4/src/qp.c new file mode 100644 index 00000000000..11c750b4c9a --- /dev/null +++ b/prov/mlx4/src/qp.c @@ -0,0 +1,702 @@ +/* + * Copyright (c) 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2007 Cisco, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#if HAVE_CONFIG_H +# include <config.h> +#endif /* HAVE_CONFIG_H */ + +#include <stdlib.h> +#include <netinet/in.h> +#include <pthread.h> +#include <string.h> +#include <errno.h> + +#include "mlx4.h" +#include "doorbell.h" +#include "wqe.h" + +static const uint32_t mlx4_ib_opcode[] = { + [IBV_WR_SEND] = MLX4_OPCODE_SEND, + [IBV_WR_SEND_WITH_IMM] = MLX4_OPCODE_SEND_IMM, + [IBV_WR_RDMA_WRITE] = MLX4_OPCODE_RDMA_WRITE, + [IBV_WR_RDMA_WRITE_WITH_IMM] = MLX4_OPCODE_RDMA_WRITE_IMM, + [IBV_WR_RDMA_READ] = MLX4_OPCODE_RDMA_READ, + [IBV_WR_ATOMIC_CMP_AND_SWP] = MLX4_OPCODE_ATOMIC_CS, + [IBV_WR_ATOMIC_FETCH_AND_ADD] = MLX4_OPCODE_ATOMIC_FA, +}; + +static void *get_recv_wqe(struct mlx4_qp *qp, int n) +{ + return qp->buf.buf + qp->rq.offset + (n << qp->rq.wqe_shift); +} + +static void *get_send_wqe(struct mlx4_qp *qp, int n) +{ + return qp->buf.buf + qp->sq.offset + (n << qp->sq.wqe_shift); +} + +/* + * Stamp a SQ WQE so that it is invalid if prefetched by marking the + * first four bytes of every 64 byte chunk with 0xffffffff, except for + * the very first chunk of the WQE. + */ +static void stamp_send_wqe(struct mlx4_qp *qp, int n) +{ + uint32_t *wqe = get_send_wqe(qp, n); + int i; + int ds = (((struct mlx4_wqe_ctrl_seg *)wqe)->fence_size & 0x3f) << 2; + + for (i = 16; i < ds; i += 16) + wqe[i] = 0xffffffff; +} + +void mlx4_init_qp_indices(struct mlx4_qp *qp) +{ + qp->sq.head = 0; + qp->sq.tail = 0; + qp->rq.head = 0; + qp->rq.tail = 0; +} + +void mlx4_qp_init_sq_ownership(struct mlx4_qp *qp) +{ + struct mlx4_wqe_ctrl_seg *ctrl; + int i; + + for (i = 0; i < qp->sq.wqe_cnt; ++i) { + ctrl = get_send_wqe(qp, i); + ctrl->owner_opcode = htonl(1 << 31); + ctrl->fence_size = 1 << (qp->sq.wqe_shift - 4); + + stamp_send_wqe(qp, i); + } +} + +static int wq_overflow(struct mlx4_wq *wq, int nreq, struct mlx4_cq *cq) +{ + unsigned cur; + + cur = wq->head - wq->tail; + if (cur + nreq < wq->max_post) + return 0; + + pthread_spin_lock(&cq->lock); + cur = wq->head - wq->tail; + pthread_spin_unlock(&cq->lock); + + return cur + nreq >= wq->max_post; +} + +static inline void set_raddr_seg(struct mlx4_wqe_raddr_seg *rseg, + uint64_t remote_addr, uint32_t rkey) +{ + rseg->raddr = htonll(remote_addr); + rseg->rkey = htonl(rkey); + rseg->reserved = 0; +} + +static void set_atomic_seg(struct mlx4_wqe_atomic_seg *aseg, struct ibv_send_wr *wr) +{ + if (wr->opcode == IBV_WR_ATOMIC_CMP_AND_SWP) { + aseg->swap_add = htonll(wr->wr.atomic.swap); + aseg->compare = htonll(wr->wr.atomic.compare_add); + } else { + aseg->swap_add = htonll(wr->wr.atomic.compare_add); + aseg->compare = 0; + } + +} + +static void set_datagram_seg(struct mlx4_wqe_datagram_seg *dseg, + struct ibv_send_wr *wr) +{ + memcpy(dseg->av, &to_mah(wr->wr.ud.ah)->av, sizeof (struct mlx4_av)); + dseg->dqpn = htonl(wr->wr.ud.remote_qpn); + dseg->qkey = htonl(wr->wr.ud.remote_qkey); + dseg->vlan = htons(to_mah(wr->wr.ud.ah)->vlan); + memcpy(dseg->mac, to_mah(wr->wr.ud.ah)->mac, 6); +} + +static void __set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ibv_sge *sg) +{ + 
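/* Receive-path variant used by mlx4_post_recv(); the send path uses set_data_seg() below, which issues a write barrier before byte_count so the HCA prefetcher never sees a valid count with stale data. */ +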
dseg->byte_count = htonl(sg->length); + dseg->lkey = htonl(sg->lkey); + dseg->addr = htonll(sg->addr); +} + +static void set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ibv_sge *sg) +{ + dseg->lkey = htonl(sg->lkey); + dseg->addr = htonll(sg->addr); + + /* + * Need a barrier here before writing the byte_count field to + * make sure that all the data is visible before the + * byte_count field is set. Otherwise, if the segment begins + * a new cacheline, the HCA prefetcher could grab the 64-byte + * chunk and get a valid (!= * 0xffffffff) byte count but + * stale data, and end up sending the wrong data. + */ + wmb(); + + dseg->byte_count = htonl(sg->length); +} + +/* + * Avoid using memcpy() to copy to BlueFlame page, since memcpy() + * implementations may use move-string-buffer assembler instructions, + * which do not guarantee order of copying. + */ +static void mlx4_bf_copy(unsigned long *dst, unsigned long *src, unsigned bytecnt) +{ + while (bytecnt > 0) { + *dst++ = *src++; + *dst++ = *src++; + bytecnt -= 2 * sizeof (long); + } +} + +int mlx4_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr, + struct ibv_send_wr **bad_wr) +{ + struct mlx4_context *ctx; + struct mlx4_qp *qp = to_mqp(ibqp); + void *wqe; + struct mlx4_wqe_ctrl_seg *ctrl; + int ind; + int nreq; + int inl = 0; + int ret = 0; + int size; + int i; + + pthread_spin_lock(&qp->sq.lock); + + /* XXX check that state is OK to post send */ + + ind = qp->sq.head; + + for (nreq = 0; wr; ++nreq, wr = wr->next) { + if (wq_overflow(&qp->sq, nreq, to_mcq(qp->ibv_qp.send_cq))) { + ret = ENOMEM; + *bad_wr = wr; + goto out; + } + + if (wr->num_sge > qp->sq.max_gs) { + ret = ENOMEM; + *bad_wr = wr; + goto out; + } + + if (wr->opcode >= sizeof mlx4_ib_opcode / sizeof mlx4_ib_opcode[0]) { + ret = EINVAL; + *bad_wr = wr; + goto out; + } + + ctrl = wqe = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1)); + qp->sq.wrid[ind & (qp->sq.wqe_cnt - 1)] = wr->wr_id; + + ctrl->srcrb_flags = + (wr->send_flags & IBV_SEND_SIGNALED ? + htonl(MLX4_WQE_CTRL_CQ_UPDATE) : 0) | + (wr->send_flags & IBV_SEND_SOLICITED ? 
+ htonl(MLX4_WQE_CTRL_SOLICIT) : 0) | + qp->sq_signal_bits; + + if (wr->opcode == IBV_WR_SEND_WITH_IMM || + wr->opcode == IBV_WR_RDMA_WRITE_WITH_IMM) + ctrl->imm = wr->imm_data; + else + ctrl->imm = 0; + + wqe += sizeof *ctrl; + size = sizeof *ctrl / 16; + + switch (ibqp->qp_type) { + case IBV_QPT_RC: + case IBV_QPT_UC: + switch (wr->opcode) { + case IBV_WR_ATOMIC_CMP_AND_SWP: + case IBV_WR_ATOMIC_FETCH_AND_ADD: + set_raddr_seg(wqe, wr->wr.atomic.remote_addr, + wr->wr.atomic.rkey); + wqe += sizeof (struct mlx4_wqe_raddr_seg); + + set_atomic_seg(wqe, wr); + wqe += sizeof (struct mlx4_wqe_atomic_seg); + size += (sizeof (struct mlx4_wqe_raddr_seg) + + sizeof (struct mlx4_wqe_atomic_seg)) / 16; + + break; + + case IBV_WR_RDMA_READ: + inl = 1; + /* fall through */ + case IBV_WR_RDMA_WRITE: + case IBV_WR_RDMA_WRITE_WITH_IMM: + if (!wr->num_sge) + inl = 1; + set_raddr_seg(wqe, wr->wr.rdma.remote_addr, + wr->wr.rdma.rkey); + wqe += sizeof (struct mlx4_wqe_raddr_seg); + size += sizeof (struct mlx4_wqe_raddr_seg) / 16; + + break; + + default: + /* No extra segments required for sends */ + break; + } + break; + + case IBV_QPT_UD: + set_datagram_seg(wqe, wr); + wqe += sizeof (struct mlx4_wqe_datagram_seg); + size += sizeof (struct mlx4_wqe_datagram_seg) / 16; + break; + + case IBV_QPT_RAW_PACKET: + /* For raw eth, the MLX4_WQE_CTRL_SOLICIT flag is used + * to indicate that no icrc should be calculated */ + ctrl->srcrb_flags |= htonl(MLX4_WQE_CTRL_SOLICIT); + break; + + default: + break; + } + + if (wr->send_flags & IBV_SEND_INLINE && wr->num_sge) { + struct mlx4_wqe_inline_seg *seg; + void *addr; + int len, seg_len; + int num_seg; + int off, to_copy; + + inl = 0; + + seg = wqe; + wqe += sizeof *seg; + off = ((uintptr_t) wqe) & (MLX4_INLINE_ALIGN - 1); + num_seg = 0; + seg_len = 0; + + for (i = 0; i < wr->num_sge; ++i) { + addr = (void *) (uintptr_t) wr->sg_list[i].addr; + len = wr->sg_list[i].length; + inl += len; + + if (inl > qp->max_inline_data) { + inl = 0; + ret = ENOMEM; + *bad_wr = wr; + goto out; + } + + while (len >= MLX4_INLINE_ALIGN - off) { + to_copy = MLX4_INLINE_ALIGN - off; + memcpy(wqe, addr, to_copy); + len -= to_copy; + wqe += to_copy; + addr += to_copy; + seg_len += to_copy; + wmb(); /* see comment below */ + seg->byte_count = htonl(MLX4_INLINE_SEG | seg_len); + seg_len = 0; + seg = wqe; + wqe += sizeof *seg; + off = sizeof *seg; + ++num_seg; + } + + memcpy(wqe, addr, len); + wqe += len; + seg_len += len; + off += len; + } + + if (seg_len) { + ++num_seg; + /* + * Need a barrier here to make sure + * all the data is visible before the + * byte_count field is set. Otherwise + * the HCA prefetcher could grab the + * 64-byte chunk with this inline + * segment and get a valid (!= + * 0xffffffff) byte count but stale + * data, and end up sending the wrong + * data. + */ + wmb(); + seg->byte_count = htonl(MLX4_INLINE_SEG | seg_len); + } + + size += (inl + num_seg * sizeof * seg + 15) / 16; + } else { + struct mlx4_wqe_data_seg *seg = wqe; + + for (i = wr->num_sge - 1; i >= 0 ; --i) + set_data_seg(seg + i, wr->sg_list + i); + + size += wr->num_sge * (sizeof *seg / 16); + } + + ctrl->fence_size = (wr->send_flags & IBV_SEND_FENCE ? + MLX4_WQE_CTRL_FENCE : 0) | size; + + /* + * Make sure descriptor is fully written before + * setting ownership bit (because HW can start + * executing as soon as we do). + */ + wmb(); + + ctrl->owner_opcode = htonl(mlx4_ib_opcode[wr->opcode]) | + (ind & qp->sq.wqe_cnt ? 
htonl(1 << 31) : 0); + + /* + * We can improve latency by not stamping the last + * send queue WQE until after ringing the doorbell, so + * only stamp here if there are still more WQEs to post. + */ + if (wr->next) + stamp_send_wqe(qp, (ind + qp->sq_spare_wqes) & + (qp->sq.wqe_cnt - 1)); + + ++ind; + } + +out: + ctx = to_mctx(ibqp->context); + + if (nreq == 1 && inl && size > 1 && size <= ctx->bf_buf_size / 16) { + ctrl->owner_opcode |= htonl((qp->sq.head & 0xffff) << 8); + *(uint32_t *) ctrl->reserved |= qp->doorbell_qpn; + /* + * Make sure that descriptor is written to memory + * before writing to BlueFlame page. + */ + wmb(); + + ++qp->sq.head; + + pthread_spin_lock(&ctx->bf_lock); + + mlx4_bf_copy(ctx->bf_page + ctx->bf_offset, (unsigned long *) ctrl, + align(size * 16, 64)); + wc_wmb(); + + ctx->bf_offset ^= ctx->bf_buf_size; + + pthread_spin_unlock(&ctx->bf_lock); + } else if (nreq) { + qp->sq.head += nreq; + + /* + * Make sure that descriptors are written before + * doorbell record. + */ + wmb(); + + *(uint32_t *) (ctx->uar + MLX4_SEND_DOORBELL) = qp->doorbell_qpn; + } + + if (nreq) + stamp_send_wqe(qp, (ind + qp->sq_spare_wqes - 1) & + (qp->sq.wqe_cnt - 1)); + + pthread_spin_unlock(&qp->sq.lock); + + return ret; +} + +int mlx4_post_recv(struct ibv_qp *ibqp, struct ibv_recv_wr *wr, + struct ibv_recv_wr **bad_wr) +{ + struct mlx4_qp *qp = to_mqp(ibqp); + struct mlx4_wqe_data_seg *scat; + int ret = 0; + int nreq; + int ind; + int i; + + pthread_spin_lock(&qp->rq.lock); + + /* XXX check that state is OK to post receive */ + + ind = qp->rq.head & (qp->rq.wqe_cnt - 1); + + for (nreq = 0; wr; ++nreq, wr = wr->next) { + if (wq_overflow(&qp->rq, nreq, to_mcq(qp->ibv_qp.recv_cq))) { + ret = ENOMEM; + *bad_wr = wr; + goto out; + } + + if (wr->num_sge > qp->rq.max_gs) { + ret = ENOMEM; + *bad_wr = wr; + goto out; + } + + scat = get_recv_wqe(qp, ind); + + for (i = 0; i < wr->num_sge; ++i) + __set_data_seg(scat + i, wr->sg_list + i); + + if (i < qp->rq.max_gs) { + scat[i].byte_count = 0; + scat[i].lkey = htonl(MLX4_INVALID_LKEY); + scat[i].addr = 0; + } + + qp->rq.wrid[ind] = wr->wr_id; + + ind = (ind + 1) & (qp->rq.wqe_cnt - 1); + } + +out: + if (nreq) { + qp->rq.head += nreq; + + /* + * Make sure that descriptors are written before + * doorbell record. + */ + wmb(); + + *qp->db = htonl(qp->rq.head & 0xffff); + } + + pthread_spin_unlock(&qp->rq.lock); + + return ret; +} + +static int num_inline_segs(int data, enum ibv_qp_type type) +{ + /* + * Inline data segments are not allowed to cross 64 byte + * boundaries. For UD QPs, the data segments always start + * aligned to 64 bytes (16 byte control segment + 48 byte + * datagram segment); for other QPs, there will be a 16 byte + * control segment and possibly a 16 byte remote address + * segment, so in the worst case there will be only 32 bytes + * available for the first data segment. 
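+ * For a rough worked example, assuming the 4-byte inline segment header from wqe.h: each 64-byte chunk then holds 60 bytes of payload, so 120 bytes of inline data on a UD QP requires two inline segments.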
+ */ + if (type == IBV_QPT_UD) + data += (sizeof (struct mlx4_wqe_ctrl_seg) + + sizeof (struct mlx4_wqe_datagram_seg)) % + MLX4_INLINE_ALIGN; + else + data += (sizeof (struct mlx4_wqe_ctrl_seg) + + sizeof (struct mlx4_wqe_raddr_seg)) % + MLX4_INLINE_ALIGN; + + return (data + MLX4_INLINE_ALIGN - sizeof (struct mlx4_wqe_inline_seg) - 1) / + (MLX4_INLINE_ALIGN - sizeof (struct mlx4_wqe_inline_seg)); +} + +void mlx4_calc_sq_wqe_size(struct ibv_qp_cap *cap, enum ibv_qp_type type, + struct mlx4_qp *qp) +{ + int size; + int max_sq_sge; + + max_sq_sge = align(cap->max_inline_data + + num_inline_segs(cap->max_inline_data, type) * + sizeof (struct mlx4_wqe_inline_seg), + sizeof (struct mlx4_wqe_data_seg)) / + sizeof (struct mlx4_wqe_data_seg); + if (max_sq_sge < cap->max_send_sge) + max_sq_sge = cap->max_send_sge; + + size = max_sq_sge * sizeof (struct mlx4_wqe_data_seg); + switch (type) { + case IBV_QPT_UD: + size += sizeof (struct mlx4_wqe_datagram_seg); + break; + + case IBV_QPT_UC: + size += sizeof (struct mlx4_wqe_raddr_seg); + break; + + case IBV_QPT_RC: + size += sizeof (struct mlx4_wqe_raddr_seg); + /* + * An atomic op will require an atomic segment, a + * remote address segment and one scatter entry. + */ + if (size < (sizeof (struct mlx4_wqe_atomic_seg) + + sizeof (struct mlx4_wqe_raddr_seg) + + sizeof (struct mlx4_wqe_data_seg))) + size = (sizeof (struct mlx4_wqe_atomic_seg) + + sizeof (struct mlx4_wqe_raddr_seg) + + sizeof (struct mlx4_wqe_data_seg)); + break; + + default: + break; + } + + /* Make sure that we have enough space for a bind request */ + if (size < sizeof (struct mlx4_wqe_bind_seg)) + size = sizeof (struct mlx4_wqe_bind_seg); + + size += sizeof (struct mlx4_wqe_ctrl_seg); + + for (qp->sq.wqe_shift = 6; 1 << qp->sq.wqe_shift < size; + qp->sq.wqe_shift++) + ; /* nothing */ +} + +int mlx4_alloc_qp_buf(struct ibv_pd *pd, struct ibv_qp_cap *cap, + enum ibv_qp_type type, struct mlx4_qp *qp) +{ + qp->rq.max_gs = cap->max_recv_sge; + + qp->sq.wrid = malloc(qp->sq.wqe_cnt * sizeof (uint64_t)); + if (!qp->sq.wrid) + return -1; + + if (qp->rq.wqe_cnt) { + qp->rq.wrid = malloc(qp->rq.wqe_cnt * sizeof (uint64_t)); + if (!qp->rq.wrid) { + free(qp->sq.wrid); + return -1; + } + } + + for (qp->rq.wqe_shift = 4; + 1 << qp->rq.wqe_shift < qp->rq.max_gs * sizeof (struct mlx4_wqe_data_seg); + qp->rq.wqe_shift++) + ; /* nothing */ + + qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) + + (qp->sq.wqe_cnt << qp->sq.wqe_shift); + if (qp->rq.wqe_shift > qp->sq.wqe_shift) { + qp->rq.offset = 0; + qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift; + } else { + qp->rq.offset = qp->sq.wqe_cnt << qp->sq.wqe_shift; + qp->sq.offset = 0; + } + + if (mlx4_alloc_buf(&qp->buf, + align(qp->buf_size, to_mdev(pd->context->device)->page_size), + to_mdev(pd->context->device)->page_size)) { + free(qp->sq.wrid); + free(qp->rq.wrid); + return -1; + } + + memset(qp->buf.buf, 0, qp->buf_size); + + return 0; +} + +void mlx4_set_sq_sizes(struct mlx4_qp *qp, struct ibv_qp_cap *cap, + enum ibv_qp_type type) +{ + int wqe_size; + + wqe_size = (1 << qp->sq.wqe_shift) - sizeof (struct mlx4_wqe_ctrl_seg); + switch (type) { + case IBV_QPT_UD: + wqe_size -= sizeof (struct mlx4_wqe_datagram_seg); + break; + + case IBV_QPT_UC: + case IBV_QPT_RC: + wqe_size -= sizeof (struct mlx4_wqe_raddr_seg); + break; + + default: + break; + } + + qp->sq.max_gs = wqe_size / sizeof (struct mlx4_wqe_data_seg); + cap->max_send_sge = qp->sq.max_gs; + qp->sq.max_post = qp->sq.wqe_cnt - qp->sq_spare_wqes; + cap->max_send_wr = qp->sq.max_post; + + /* 
+ * Inline data segments can't cross a 64 byte boundary. So + * subtract off one segment header for each 64-byte chunk, + * taking into account the fact that wqe_size will be 32 mod + * 64 for non-UD QPs. + */ + qp->max_inline_data = wqe_size - + sizeof (struct mlx4_wqe_inline_seg) * + (align(wqe_size, MLX4_INLINE_ALIGN) / MLX4_INLINE_ALIGN); + cap->max_inline_data = qp->max_inline_data; +} + +struct mlx4_qp *mlx4_find_qp(struct mlx4_context *ctx, uint32_t qpn) +{ + int tind = (qpn & (ctx->num_qps - 1)) >> ctx->qp_table_shift; + + if (ctx->qp_table[tind].refcnt) + return ctx->qp_table[tind].table[qpn & ctx->qp_table_mask]; + else + return NULL; +} + +int mlx4_store_qp(struct mlx4_context *ctx, uint32_t qpn, struct mlx4_qp *qp) +{ + int tind = (qpn & (ctx->num_qps - 1)) >> ctx->qp_table_shift; + + if (!ctx->qp_table[tind].refcnt) { + ctx->qp_table[tind].table = calloc(ctx->qp_table_mask + 1, + sizeof (struct mlx4_qp *)); + if (!ctx->qp_table[tind].table) + return -1; + } + + ++ctx->qp_table[tind].refcnt; + ctx->qp_table[tind].table[qpn & ctx->qp_table_mask] = qp; + return 0; +} + +void mlx4_clear_qp(struct mlx4_context *ctx, uint32_t qpn) +{ + int tind = (qpn & (ctx->num_qps - 1)) >> ctx->qp_table_shift; + + if (!--ctx->qp_table[tind].refcnt) + free(ctx->qp_table[tind].table); + else + ctx->qp_table[tind].table[qpn & ctx->qp_table_mask] = NULL; +} diff --git a/prov/mlx4/src/srq.c b/prov/mlx4/src/srq.c new file mode 100644 index 00000000000..f1d12402701 --- /dev/null +++ b/prov/mlx4/src/srq.c @@ -0,0 +1,175 @@ +/* + * Copyright (c) 2007 Cisco, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#if HAVE_CONFIG_H +# include <config.h> +#endif /* HAVE_CONFIG_H */ + +#include <stdlib.h> +#include <netinet/in.h> +#include <pthread.h> +#include <string.h> + +#include "mlx4.h" +#include "doorbell.h" +#include "wqe.h" + +static void *get_wqe(struct mlx4_srq *srq, int n) +{ + return srq->buf.buf + (n << srq->wqe_shift); +} + +void mlx4_free_srq_wqe(struct mlx4_srq *srq, int ind) +{ + struct mlx4_wqe_srq_next_seg *next; + + pthread_spin_lock(&srq->lock); + + next = get_wqe(srq, srq->tail); + next->next_wqe_index = htons(ind); + srq->tail = ind; + + pthread_spin_unlock(&srq->lock); +} + +int mlx4_post_srq_recv(struct ibv_srq *ibsrq, + struct ibv_recv_wr *wr, + struct ibv_recv_wr **bad_wr) +{ + struct mlx4_srq *srq = to_msrq(ibsrq); + struct mlx4_wqe_srq_next_seg *next; + struct mlx4_wqe_data_seg *scat; + int err = 0; + int nreq; + int i; + + pthread_spin_lock(&srq->lock); + + for (nreq = 0; wr; ++nreq, wr = wr->next) { + if (wr->num_sge > srq->max_gs) { + err = -1; + *bad_wr = wr; + break; + } + + if (srq->head == srq->tail) { + /* SRQ is full*/ + err = -1; + *bad_wr = wr; + break; + } + + srq->wrid[srq->head] = wr->wr_id; + + next = get_wqe(srq, srq->head); + srq->head = ntohs(next->next_wqe_index); + scat = (struct mlx4_wqe_data_seg *) (next + 1); + + for (i = 0; i < wr->num_sge; ++i) { + scat[i].byte_count = htonl(wr->sg_list[i].length); + scat[i].lkey = htonl(wr->sg_list[i].lkey); + scat[i].addr = htonll(wr->sg_list[i].addr); + } + + if (i < srq->max_gs) { + scat[i].byte_count = 0; + scat[i].lkey = htonl(MLX4_INVALID_LKEY); + scat[i].addr = 0; + } + } + + if (nreq) { + srq->counter += nreq; + + /* + * Make sure that descriptors are written before + * we write doorbell record. + */ + wmb(); + + *srq->db = htonl(srq->counter); + } + + pthread_spin_unlock(&srq->lock); + + return err; +} + +int mlx4_alloc_srq_buf(struct ibv_pd *pd, struct ibv_srq_attr *attr, + struct mlx4_srq *srq) +{ + struct mlx4_wqe_srq_next_seg *next; + struct mlx4_wqe_data_seg *scatter; + int size; + int buf_size; + int i; + + srq->wrid = malloc(srq->max * sizeof (uint64_t)); + if (!srq->wrid) + return -1; + + size = sizeof (struct mlx4_wqe_srq_next_seg) + + srq->max_gs * sizeof (struct mlx4_wqe_data_seg); + + for (srq->wqe_shift = 5; 1 << srq->wqe_shift < size; ++srq->wqe_shift) + ; /* nothing */ + + buf_size = srq->max << srq->wqe_shift; + + if (mlx4_alloc_buf(&srq->buf, buf_size, + to_mdev(pd->context->device)->page_size)) { + free(srq->wrid); + return -1; + } + + memset(srq->buf.buf, 0, buf_size); + + /* + * Now initialize the SRQ buffer so that all of the WQEs are + * linked into the list of free WQEs. + */ + + for (i = 0; i < srq->max; ++i) { + next = get_wqe(srq, i); + next->next_wqe_index = htons((i + 1) & (srq->max - 1)); + + for (scatter = (void *) (next + 1); + (void *) scatter < (void *) next + (1 << srq->wqe_shift); + ++scatter) + scatter->lkey = htonl(MLX4_INVALID_LKEY); + } + + srq->head = 0; + srq->tail = srq->max - 1; + + return 0; +} diff --git a/prov/mlx4/src/wqe.h b/prov/mlx4/src/wqe.h new file mode 100644 index 00000000000..bbd22bad225 --- /dev/null +++ b/prov/mlx4/src/wqe.h @@ -0,0 +1,121 @@ +/* + * Copyright (c) 2007 Cisco, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef WQE_H +#define WQE_H + +enum { + MLX4_SEND_DOORBELL = 0x14, +}; + +enum { + MLX4_WQE_CTRL_FENCE = 1 << 6, + MLX4_WQE_CTRL_CQ_UPDATE = 3 << 2, + MLX4_WQE_CTRL_SOLICIT = 1 << 1, +}; + +enum { + MLX4_INLINE_SEG = 1 << 31, + MLX4_INLINE_ALIGN = 64, +}; + +enum { + MLX4_INVALID_LKEY = 0x100, +}; + +struct mlx4_wqe_ctrl_seg { + uint32_t owner_opcode; + uint8_t reserved[3]; + uint8_t fence_size; + /* + * High 24 bits are SRC remote buffer; low 8 bits are flags: + * [7] SO (strong ordering) + * [5] TCP/UDP checksum + * [4] IP checksum + * [3:2] C (generate completion queue entry) + * [1] SE (solicited event) + * [0] FL (force loopback) + */ + uint32_t srcrb_flags; + /* + * imm is immediate data for send/RDMA write w/ immediate; + * also invalidation key for send with invalidate; input + * modifier for WQEs on CCQs. + */ + uint32_t imm; +}; + +struct mlx4_wqe_datagram_seg { + uint32_t av[8]; + uint32_t dqpn; + uint32_t qkey; + uint16_t vlan; + uint8_t mac[6]; +}; + +struct mlx4_wqe_data_seg { + uint32_t byte_count; + uint32_t lkey; + uint64_t addr; +}; + +struct mlx4_wqe_inline_seg { + uint32_t byte_count; +}; + +struct mlx4_wqe_srq_next_seg { + uint16_t reserved1; + uint16_t next_wqe_index; + uint32_t reserved2[3]; +}; + +struct mlx4_wqe_raddr_seg { + uint64_t raddr; + uint32_t rkey; + uint32_t reserved; +}; + +struct mlx4_wqe_atomic_seg { + uint64_t swap_add; + uint64_t compare; +}; + +struct mlx4_wqe_bind_seg { + uint32_t flags1; + uint32_t flags2; + uint32_t new_rkey; + uint32_t lkey; + uint64_t addr; + uint64_t length; +}; + +#endif /* WQE_H */ diff --git a/prov/psm/AUTHORS b/prov/psm/AUTHORS new file mode 100644 index 00000000000..e104c6e5b95 --- /dev/null +++ b/prov/psm/AUTHORS @@ -0,0 +1 @@ +Jianxin Xiong <jianxin.xiong@intel.com> diff --git a/prov/psm/COPYING b/prov/psm/COPYING new file mode 100644 index 00000000000..ee1a79ffabf --- /dev/null +++ b/prov/psm/COPYING @@ -0,0 +1,378 @@ +This software is available to you under a choice of one of two +licenses. You may choose to be licensed under the terms of the the +OpenIB.org BSD license or the GNU General Public License (GPL) Version +2, both included below. + +Copyright (c) 2004 Topspin Communications. All rights reserved. 
+ +================================================================== + + OpenIB.org BSD license + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + +================================================================== + + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc. + 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Library General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. 
+ + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. 
+ + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. 
+ +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. 
If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. 
+ + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) year name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + <signature of Ty Coon>, 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Library General +Public License instead of this License. 
diff --git a/prov/psm/src/psmx.h b/prov/psm/src/psmx.h new file mode 100644 index 00000000000..4fadc90b558 --- /dev/null +++ b/prov/psm/src/psmx.h @@ -0,0 +1,91 @@ +#ifndef _FI_PSM_H +#define _FI_PSM_H + +#ifdef __cplusplus +extern "C" { +#endif + +#if HAVE_CONFIG_H +#include <config.h> +#endif + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <errno.h> +#include <unistd.h> +#include <fcntl.h> +#include <pthread.h> +#include <sys/types.h> +#include <sys/socket.h> +#include <netdb.h> +#include <rdma/fabric.h> +#include <rdma/fi_prov.h> +#include <rdma/fi_domain.h> +#include <rdma/fi_socket.h> +#include <rdma/fi_tagged.h> +#include <rdma/fi_cm.h> +#include <rdma/fi_errno.h> +#include <psm.h> +#include <psm_mq.h> + +#define PFX "libfabric:psm" + +#define PSMX_TIME_OUT 120 + +struct psmx_fid_domain { + struct fid_domain domain; + psm_ep_t psm_ep; + psm_epid_t psm_epid; + psm_mq_t psm_mq; + pthread_t ns_thread; + int ns_port; +}; + +struct psmx_fid_ec { + struct fid_ec ec; + struct psmx_fid_domain *domain; + int type; + int format; +}; + +struct psmx_fid_av { + struct fid_av av; + struct psmx_fid_domain *domain; + int type; + int format; + size_t addrlen; +}; + +struct psmx_fid_socket { + struct fid_socket socket; + struct psmx_fid_domain *domain; + struct psmx_fid_ec *ec; + struct psmx_fid_av *av; + uint64_t flags; +}; + +extern struct fi_ops_cm psmx_cm_ops; +extern struct fi_ops_tagged psmx_tagged_ops; + +void psmx_ini(void); +void psmx_fini(void); + +int psmx_domain_open(const char *name, struct fi_info *info, uint64_t flags, + fid_t *fid, void *context); +int psmx_sock_open(struct fi_info *info, fid_t *fid, void *context); +int psmx_ec_open(fid_t fid, struct fi_ec_attr *attr, fid_t *ec, void *context); +int psmx_av_open(fid_t fid, struct fi_av_attr *attr, fid_t *av, void *context); + +void *psmx_name_server(void *args); +void *psmx_resolve_name(char *servername, psm_uuid_t uuid); +void psmx_string_to_uuid(char *s, psm_uuid_t uuid); +int psmx_uuid_to_port(psm_uuid_t uuid); +int psmx_errno(int err); + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/prov/psm/src/psmx_av.c b/prov/psm/src/psmx_av.c new file mode 100644 index 00000000000..d14f22fe038 --- /dev/null +++ b/prov/psm/src/psmx_av.c @@ -0,0 +1,145 @@ +/* + * Copyright (c) 2013 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenFabrics.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "psmx.h" + +static int psmx_av_insert(fid_t fid, const void *addr, size_t count, + void **fi_addr, uint64_t flags) +{ + struct psmx_fid_av *fid_av; + psm_error_t *errors; + int err; + + fid_av = container_of(fid, struct psmx_fid_av, av.fid); + + errors = (psm_error_t *) calloc(count, sizeof *errors); + if (!errors) + return -ENOMEM; + + err = psm_ep_connect(fid_av->domain->psm_ep, count, + (psm_epid_t *) addr, NULL, errors, + (psm_epaddr_t *) fi_addr, 30*1e9); + + free(errors); + + return psmx_errno(err); +} + +static int psmx_av_remove(fid_t fid, void *fi_addr, size_t count, + uint64_t flags) +{ + struct psmx_fid_av *fid_av; + int err = PSM_OK; + fid_av = container_of(fid, struct psmx_fid_av, av.fid); + + return psmx_errno(err); +} + +static int psmx_av_close(fid_t fid) +{ + struct psmx_fid_av *fid_av; + fid_av = container_of(fid, struct psmx_fid_av, av.fid); + free(fid_av); + return 0; +} + +static int psmx_av_bind(fid_t fid, struct fi_resource *fids, int nfids) +{ + /* no need to bind an EQ since insert/remove is synchronous */ + return 0; +} + +static int psmx_av_sync(fid_t fid, uint64_t flags, void *context) +{ + /* no-op since insert/remove is synchronous */ + return 0; +} + +static int psmx_av_control(fid_t fid, int command, void *arg) +{ + return -ENOSYS; +} + +static struct fi_ops psmx_fi_ops = { + .size = sizeof(struct fi_ops), + .close = psmx_av_close, + .bind = psmx_av_bind, + .sync = psmx_av_sync, + .control = psmx_av_control, +}; + +static struct fi_ops_av psmx_av_ops = { + .size = sizeof(struct fi_ops_av), + .insert = psmx_av_insert, + .remove = psmx_av_remove, +}; + +int psmx_av_open(fid_t fid, struct fi_av_attr *attr, fid_t *av, void *context) +{ + struct psmx_fid_domain *fid_domain; + struct psmx_fid_av *fid_av; + + fid_domain = container_of(fid, struct psmx_fid_domain, domain.fid); + + if (attr) { + if ((attr->av_mask & FI_AV_ATTR_TYPE) && + attr->type != FI_AV_MAP) + return -ENOSYS; + + if ((attr->av_mask & FI_AV_ATTR_ADDR_FORMAT) && + attr->addr_format != FI_ADDR) + return -ENOSYS; + + if ((attr->av_mask & FI_AV_ATTR_ADDRLEN) && + attr->addrlen != sizeof(psm_epaddr_t)) + return -ENOSYS; + } + + fid_av = (struct psmx_fid_av *) calloc(1, sizeof *fid_av); + if (!fid_av) + return -ENOMEM; + + fid_av->domain = fid_domain; + fid_av->type = FI_AV_MAP; + fid_av->format = FI_ADDR; + fid_av->addrlen = sizeof(psm_epaddr_t); + + fid_av->av.fid.size = sizeof(struct fid_av); + fid_av->av.fid.fclass = FID_CLASS_AV; + fid_av->av.fid.context = context; + fid_av->av.fid.ops = &psmx_fi_ops; + fid_av->av.ops = &psmx_av_ops; + + *av = &fid_av->av.fid; + return 0; +} + diff --git a/prov/psm/src/psmx_cm.c b/prov/psm/src/psmx_cm.c new file mode 100644 index 00000000000..c43ddb7faff --- /dev/null +++ b/prov/psm/src/psmx_cm.c @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2013 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenFabrics.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "psmx.h" + +static int psmx_cm_getname(fid_t fid, void *addr, size_t *addrlen) +{ + struct psmx_fid_socket *fid_socket; + + fid_socket = container_of(fid, struct psmx_fid_socket, socket.fid); + if (!fid_socket->domain) + return -EBADF; + + if (*addrlen < sizeof(psm_epid_t)) + return -FI_ETOOSMALL; + + *(psm_epid_t *)addr = fid_socket->domain->psm_epid; + *addrlen = sizeof(psm_epid_t); + + return 0; +} + +static int psmx_cm_getpeer(fid_t fid, void *addr, size_t *addrlen) +{ + return -ENOSYS; +} + +static int psmx_cm_connect(fid_t fid, const void *param, size_t paramlen) +{ + return -ENOSYS; +} + +static int psmx_cm_listen(fid_t fid) +{ + return -ENOSYS; +} + +static int psmx_cm_accept(fid_t fid, const void *param, size_t paramlen) +{ + return -ENOSYS; +} + +static int psmx_cm_reject(fid_t fid, struct fi_info *info, const void *param, + size_t paramlen) +{ + return -ENOSYS; +} + +static int psmx_cm_shutdown(fid_t fid, uint64_t flags) +{ + return -ENOSYS; +} + +static int psmx_cm_join(fid_t fid, void *addr, void **fi_addr, uint64_t flags) +{ + return -ENOSYS; +} + +static int psmx_cm_leave(fid_t fid, void *addr, void *fi_addr, uint64_t flags) +{ + return -ENOSYS; +} + +struct fi_ops_cm psmx_cm_ops = { + .size = sizeof(struct fi_ops_cm), + .getname = psmx_cm_getname, + .getpeer = psmx_cm_getpeer, + .connect = psmx_cm_connect, + .listen = psmx_cm_listen, + .accept = psmx_cm_accept, + .reject = psmx_cm_reject, + .shutdown = psmx_cm_shutdown, + .join = psmx_cm_join, + .leave = psmx_cm_leave, +}; + diff --git a/prov/psm/src/psmx_domain.c b/prov/psm/src/psmx_domain.c new file mode 100644 index 00000000000..7d5c95c3e85 --- /dev/null +++ b/prov/psm/src/psmx_domain.c @@ -0,0 +1,177 @@ +/* + * Copyright (c) 2013 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenFabrics.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "psmx.h" + +static int psmx_domain_close(fid_t fid) +{ + struct psmx_fid_domain *fid_domain; + int err; + + fid_domain = container_of(fid, struct psmx_fid_domain, domain.fid); + + if (fid_domain->ns_thread) { + pthread_cancel(fid_domain->ns_thread); + pthread_join(fid_domain->ns_thread, NULL); + } + + psm_mq_finalize(fid_domain->psm_mq); + + err = psm_ep_close(fid_domain->psm_ep, PSM_EP_CLOSE_GRACEFUL, + (int64_t) PSMX_TIME_OUT * 1000000000LL); + if (err != PSM_OK) + psm_ep_close(fid_domain->psm_ep, PSM_EP_CLOSE_FORCE, 0); + + free(fid_domain); + + return 0; +} + +static int psmx_domain_bind(fid_t fid, struct fi_resource *fids, int nfids) +{ + return -ENOSYS; +} + +static int psmx_domain_sync(fid_t fid, uint64_t flags, void *context) +{ + return -ENOSYS; +} + +static int psmx_domain_control(fid_t fid, int command, void *arg) +{ + return -ENOSYS; +} + +static int psmx_domain_query(fid_t fid, struct fi_domain_attr *attr, size_t *attrlen) +{ + return -ENOSYS; +} + +static int psmx_progress(fid_t fid) +{ + return -ENOSYS; +} + +static int psmx_mr_reg(fid_t fid, const void *buf, size_t len, fid_t *mr, + uint64_t flags, void *context) +{ + return -ENOSYS; +} + +static int psmx_mr_regv(fid_t fid, const struct iovec *iov, size_t count, + fid_t *mr, uint64_t flags, void *context) +{ + return -ENOSYS; +} + +static struct fi_ops psmx_fi_ops = { + .size = sizeof(struct fi_ops), + .close = psmx_domain_close, + .bind = psmx_domain_bind, + .sync = psmx_domain_sync, + .control = psmx_domain_control, +}; + +static struct fi_ops_domain psmx_domain_ops = { + .size = sizeof(struct fi_ops_domain), + .progress = psmx_progress, + .query = psmx_domain_query, + .av_open = psmx_av_open, + .ec_open = psmx_ec_open, + .mr_reg = psmx_mr_reg, + .mr_regv = psmx_mr_regv, +}; + +int psmx_domain_open(const char *name, struct fi_info *info, uint64_t flags, + fid_t *fid, void *context) +{ + struct psmx_fid_domain *fid_domain; + int err = -ENOMEM; + char *s; + + if (name && strncmp(name, "psm", 3)) + return -EINVAL; + + fid_domain = (struct psmx_fid_domain *) calloc(1, sizeof *fid_domain); + if (!fid_domain) + goto err_out; + + fid_domain->domain.fid.size = sizeof(struct fid_domain); + fid_domain->domain.fid.fclass = FID_CLASS_RESOURCE_DOMAIN; + fid_domain->domain.fid.context = 
context; + fid_domain->domain.fid.ops = &psmx_fi_ops; + fid_domain->domain.ops = &psmx_domain_ops; + + err = psm_ep_open(info->auth_key, NULL, + &fid_domain->psm_ep, &fid_domain->psm_epid); + if (err != PSM_OK) { + fprintf(stderr, "%s: psm_ep_open returns %d, errno=%d\n", + __func__, err, errno); + err = psmx_errno(err); + goto err_out_free_domain; + } + + err = psm_mq_init(fid_domain->psm_ep, PSM_MQ_ORDERMASK_ALL, + NULL, 0, &fid_domain->psm_mq); + if (err != PSM_OK) { + fprintf(stderr, "%s: psm_mq_init returns %d, errno=%d\n", + __func__, err, errno); + err = psmx_errno(err); + goto err_out_close_ep; + } + + fid_domain->ns_port = psmx_uuid_to_port(info->auth_key); + + s = getenv("SFI_PSM_NAME_SERVER"); + if (s && (!strcasecmp(s, "yes") || !strcasecmp(s, "on") || !strcmp(s, "1"))) + err = pthread_create(&fid_domain->ns_thread, NULL, psmx_name_server, (void *)fid_domain); + else + err = -1; + + if (err) + fid_domain->ns_thread = 0; + + *fid = &fid_domain->domain.fid; + return 0; + +err_out_close_ep: + if (psm_ep_close(fid_domain->psm_ep, PSM_EP_CLOSE_GRACEFUL, + (int64_t) PSMX_TIME_OUT * 1000000000LL) != PSM_OK) + psm_ep_close(fid_domain->psm_ep, PSM_EP_CLOSE_FORCE, 0); + +err_out_free_domain: + free(fid_domain); + +err_out: + return err; +} + diff --git a/prov/psm/src/psmx_ec.c b/prov/psm/src/psmx_ec.c new file mode 100644 index 00000000000..dfd53fb1692 --- /dev/null +++ b/prov/psm/src/psmx_ec.c @@ -0,0 +1,205 @@ +/* + * Copyright (c) 2013 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenFabrics.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "psmx.h" + +static struct fi_ec_err_entry error_ece; +static int error_state = 0; + +static ssize_t psmx_ec_readfrom(fid_t fid, void *buf, size_t len, + void *src_addr, size_t *addrlen) +{ + struct psmx_fid_ec *fid_ec; + psm_mq_req_t psm_req; + psm_mq_status_t psm_status; + struct fi_ec_tagged_entry *ece; + int err; + + fid_ec = container_of(fid, struct psmx_fid_ec, ec.fid); + assert(fid_ec->domain); + assert(fid_ec->format == FI_EC_FORMAT_TAGGED); + + if (len < sizeof *ece) + return -FI_ETOOSMALL; + + err = psm_mq_ipeek(fid_ec->domain->psm_mq, &psm_req, NULL); + if (err == PSM_OK) { + err = psm_mq_test(&psm_req, &psm_status); + + if (psm_status.error_code) { + error_ece.fid_context = fid_ec->ec.fid.context; + error_ece.op_context = psm_status.context; + error_ece.flags = 0; + error_ece.err = psmx_errno(psm_status.error_code); + error_ece.prov_errno = psm_status.error_code; + error_ece.data = 0; + error_ece.prov_data = NULL; + error_state = 1; + return error_ece.err; + } + + ece = (struct fi_ec_tagged_entry *) buf; + ece->op_context = psm_status.context; + ece->flags = 0; + ece->len = psm_status.nbytes; + ece->data = 0; + ece->tag = psm_status.msg_tag; + ece->olen = psm_status.msg_length; + + return 1; + } else if (err == PSM_MQ_NO_COMPLETIONS) { + return 0; + } else { + return -1; + } +} + +static ssize_t psmx_ec_read(fid_t fid, void *buf, size_t len) +{ + return psmx_ec_readfrom(fid, buf, len, NULL, NULL); +} + +static ssize_t psmx_ec_readerr(fid_t fid, void *buf, size_t len, uint64_t flags) +{ + if (len < sizeof(error_ece)) + return -FI_ETOOSMALL; + + *(struct fi_ec_err_entry *)buf = error_ece; + error_state = 0; + + return 0; +} + +static ssize_t psmx_ec_write(fid_t fid, void *buf, size_t len) +{ + return -ENOSYS; +} + +static int psmx_ec_reset(fid_t fid, void *cond) +{ + return -ENOSYS; +} + +static ssize_t psmx_ec_condread(fid_t fid, void *buf, size_t len, void *cond) +{ + return -ENOSYS; +} + +static ssize_t psmx_ec_condreadfrom(fid_t fid, void *buf, size_t len, + void *src_addr, size_t *addrlen, void *cond) +{ + return -ENOSYS; +} + +static const char *psmx_ec_strerror(fid_t fid, int prov_errno, void *prov_data, + void *buf, size_t len) +{ + return psm_error_get_string(prov_errno); +} + +static int psmx_ec_close(fid_t fid) +{ + struct psmx_fid_ec *fid_ec; + + fid_ec = container_of(fid, struct psmx_fid_ec, ec.fid); + free(fid_ec); + + return 0; +} + +static int psmx_ec_bind(fid_t fid, struct fi_resource *fids, int nfids) +{ + return -ENOSYS; +} + +static int psmx_ec_sync(fid_t fid, uint64_t flags, void *context) +{ + return -ENOSYS; +} + +static int psmx_ec_control(fid_t fid, int command, void *arg) +{ + return -ENOSYS; +} + +static struct fi_ops psmx_fi_ops = { + .size = sizeof(struct fi_ops), + .close = psmx_ec_close, + .bind = psmx_ec_bind, + .sync = psmx_ec_sync, + .control = psmx_ec_control, +}; + +static struct fi_ops_ec psmx_ec_ops = { + .size = sizeof(struct fi_ops_ec), + .read = psmx_ec_read, + .readfrom = psmx_ec_readfrom, + .readerr = psmx_ec_readerr, + .write = psmx_ec_write, + .reset = psmx_ec_reset, + .condread = psmx_ec_condread, + .condreadfrom = psmx_ec_condreadfrom, + .strerror = psmx_ec_strerror, +}; + +int psmx_ec_open(fid_t fid, struct fi_ec_attr *attr, fid_t *ec, void *context) +{ + struct psmx_fid_domain *fid_domain; + struct psmx_fid_ec *fid_ec; + + if (attr->domain != FI_EC_DOMAIN_GENERAL && attr->domain != FI_EC_DOMAIN_COMP) + return -ENOSYS; + + if (attr->type != FI_EC_QUEUE) + return -ENOSYS; + + if (attr->format != FI_EC_FORMAT_TAGGED && 
attr->format != FI_EC_FORMAT_UNSPEC) + return -ENOSYS; + + fid_domain = container_of(fid, struct psmx_fid_domain, domain.fid); + fid_ec = (struct psmx_fid_ec *) calloc(1, sizeof *fid_ec); + if (!fid_ec) + return -ENOMEM; + + fid_ec->domain = fid_domain; + fid_ec->type = FI_EC_QUEUE; + fid_ec->format = FI_EC_FORMAT_TAGGED; + fid_ec->ec.fid.size = sizeof(struct fid_ec); + fid_ec->ec.fid.fclass = FID_CLASS_EC; + fid_ec->ec.fid.context = context; + fid_ec->ec.fid.ops = &psmx_fi_ops; + fid_ec->ec.ops = &psmx_ec_ops; + + *ec = &fid_ec->ec.fid; + return 0; +} + diff --git a/prov/psm/src/psmx_init.c b/prov/psm/src/psmx_init.c new file mode 100644 index 00000000000..ad817a8b069 --- /dev/null +++ b/prov/psm/src/psmx_init.c @@ -0,0 +1,163 @@ +/* + * Copyright (c) 2013 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenFabrics.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "psmx.h" + +static int psmx_getinfo(char *node, char *service, struct fi_info *hints, + struct fi_info **info) +{ + struct fi_info *psmx_info; + uint64_t supported_flags = FI_NONBLOCK|FI_ACK|FI_EXCL|FI_BUFFERED_RECV|FI_CANCEL; + uint64_t default_flags = FI_NONBLOCK; + uint64_t flags = 0; + void *dst_addr = NULL; + void *uuid; + char *s; + + uuid = calloc(1, sizeof(psm_uuid_t)); + if (!uuid) + return -ENOMEM; + + s = getenv("SFI_PSM_UUID"); + if (s) + psmx_string_to_uuid(s, uuid); + + if (node) + dst_addr = psmx_resolve_name(node, uuid); + + if (service) { + /* FIXME: check service */ + } + + if (hints) { + switch (hints->type & FID_TYPE_MASK) { + case FID_UNSPEC: + case FID_RDM: + break; + default: + *info = NULL; + return -ENODATA; + } + + switch (hints->protocol & FI_PROTO_MASK) { + case FI_PROTO_UNSPEC: + if (hints->protocol & FI_PROTO_TAGGED) + break; + /* fall through */ + default: + *info = NULL; + return -ENODATA; + } + + flags = hints->flags; + if ((flags & supported_flags) != flags) { + *info = NULL; + return -ENODATA; + } + + if (hints->domain_name && strncmp(hints->domain_name, "psm", 3)) { + *info = NULL; + return -ENODATA; + } + + /* FIXME: check other fields of hints */ + } + + psmx_info = calloc(1, sizeof *psmx_info); + if (!psmx_info) { + free(uuid); + return -ENOMEM; + } + + psmx_info->next = NULL; + psmx_info->size = sizeof(*psmx_info); + psmx_info->flags = flags | default_flags; + psmx_info->type = FID_RDM; + psmx_info->protocol = FI_PROTO_TAGGED; + psmx_info->iov_format = FI_IOTAGGED; /* FIXME: or FI_IOTAGGEDV? */ + psmx_info->addr_format = FI_ADDR; + psmx_info->info_addr_format = FI_ADDR; + psmx_info->src_addrlen = 0; + psmx_info->dst_addrlen = sizeof(psm_epid_t); + psmx_info->src_addr = NULL; + psmx_info->dst_addr = dst_addr; + psmx_info->auth_keylen = sizeof(psm_uuid_t); + psmx_info->auth_key = uuid; + psmx_info->shared_fd = -1; + psmx_info->domain_name = strdup("psm"); + psmx_info->datalen = 0; + psmx_info->data = NULL; + + *info = psmx_info; + + return 0; +} + +static struct fi_ops_prov psmx_ops = { + .size = sizeof(struct fi_ops_prov), + .getinfo = psmx_getinfo, + .freeinfo = NULL, + .socket = psmx_sock_open, + .open = psmx_domain_open +}; + +void psmx_ini(void) +{ + int major, minor; + int err; + + psm_error_register_handler(NULL, PSM_ERRHANDLER_NO_HANDLER); + + major = PSM_VERNO_MAJOR; + minor = PSM_VERNO_MINOR; + + err = psm_init(&major, &minor); + if (err != PSM_OK) { + fprintf(stderr, "%s: psm_init failed: %s\n", __func__, + psm_error_get_string(err)); + return; + } + + if (major > PSM_VERNO_MAJOR) { + fprintf(stderr, "%s: PSM loaded an unexpected/unsupported version %d.%d\n", + __func__, major, minor); + return; + } + + fi_register(&psmx_ops); +} + +void psmx_fini(void) +{ + psm_finalize(); +} + diff --git a/prov/psm/src/psmx_sock.c b/prov/psm/src/psmx_sock.c new file mode 100644 index 00000000000..5202b1d7ddd --- /dev/null +++ b/prov/psm/src/psmx_sock.c @@ -0,0 +1,177 @@ +/* + * Copyright (c) 2013 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenFabrics.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "psmx.h" + +static ssize_t psmx_sock_cancel(fid_t fid, struct fi_context *context) +{ + struct psmx_fid_socket *fid_socket; + int err; + + fid_socket = container_of(fid, struct psmx_fid_socket, socket.fid); + if (!fid_socket->domain) + return -EBADF; + + if (!context) + return -EINVAL; + + if (context->internal[0] == NULL) + return 0; + + err = psm_mq_cancel((psm_mq_req_t *)&context->internal[0]); + return psmx_errno(err); +} + +static int psmx_sock_getopt(fid_t fid, int level, int optname, + void *optval, size_t *optlen) +{ + return -ENOSYS; +} + +static int psmx_sock_setopt(fid_t fid, int level, int optname, + const void *optval, size_t optlen) +{ + return -ENOSYS; +} + +static int psmx_sock_close(fid_t fid) +{ + struct psmx_fid_socket *fid_socket; + + fid_socket = container_of(fid, struct psmx_fid_socket, socket.fid); + free(fid_socket); + + return 0; +} + +static int psmx_sock_bind(fid_t fid, struct fi_resource *ress, int nress) +{ + int i; + struct psmx_fid_socket *fid_socket; + struct psmx_fid_domain *domain; + struct psmx_fid_av *av; + struct psmx_fid_ec *ec; + + fid_socket = container_of(fid, struct psmx_fid_socket, socket.fid); + + for (i=0; i<nress; i++) { + if (!ress[i].fid) + return -EINVAL; + switch (ress[i].fid->fclass) { + case FID_CLASS_RESOURCE_DOMAIN: + domain = container_of(ress[i].fid, + struct psmx_fid_domain, domain.fid); + if (fid_socket->domain && fid_socket->domain != domain) + return -EEXIST; + fid_socket->domain = domain; + break; + + case FID_CLASS_EC: + /* TODO: check ress flags for send/recv EQs */ + ec = container_of(ress[i].fid, + struct psmx_fid_ec, ec.fid); + if (fid_socket->ec && fid_socket->ec != ec) + return -EEXIST; + if (fid_socket->domain && fid_socket->domain != ec->domain) + return -EINVAL; + fid_socket->ec = ec; + fid_socket->domain = ec->domain; + break; + + case FID_CLASS_AV: + av = container_of(ress[i].fid, + struct psmx_fid_av, av.fid); + if (fid_socket->av && fid_socket->av != av) + return -EEXIST; + if (fid_socket->domain && fid_socket->domain != av->domain) + return -EINVAL; + fid_socket->av = av; + fid_socket->domain = av->domain; + break; + + default: + return -ENOSYS; + } + } + + return 0; +} + +static int psmx_sock_sync(fid_t fid, uint64_t flags, void *context) +{ + return -ENOSYS; +} + +static 
int psmx_sock_control(fid_t fid, int command, void *arg) +{ + return -ENOSYS; +} + +static struct fi_ops psmx_fi_ops = { + .size = sizeof(struct fi_ops), + .close = psmx_sock_close, + .bind = psmx_sock_bind, + .sync = psmx_sock_sync, + .control = psmx_sock_control, +}; + +static struct fi_ops_sock psmx_sock_ops = { + .size = sizeof(struct fi_ops_sock), + .cancel = psmx_sock_cancel, + .getopt = psmx_sock_getopt, + .setopt = psmx_sock_setopt, +}; + +int psmx_sock_open(struct fi_info *info, fid_t *fid, void *context) +{ + struct psmx_fid_socket *fid_socket; + + fid_socket = (struct psmx_fid_socket *) calloc(1, sizeof *fid_socket); + if (!fid_socket) + return -ENOMEM; + + fid_socket->socket.fid.size = sizeof(struct fid_socket); + fid_socket->socket.fid.fclass = FID_CLASS_SOCKET; + fid_socket->socket.fid.context = context; + fid_socket->socket.fid.ops = &psmx_fi_ops; + fid_socket->socket.ops = &psmx_sock_ops; + fid_socket->socket.cm = &psmx_cm_ops; + fid_socket->socket.tagged = &psmx_tagged_ops; + + if (info) + fid_socket->flags = info->flags; + + *fid = &fid_socket->socket.fid; + + return 0; +} + diff --git a/prov/psm/src/psmx_tagged.c b/prov/psm/src/psmx_tagged.c new file mode 100644 index 00000000000..a94b672673b --- /dev/null +++ b/prov/psm/src/psmx_tagged.c @@ -0,0 +1,173 @@ +/* + * Copyright (c) 2013 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenFabrics.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "psmx.h" + +static ssize_t psmx_tagged_recv(fid_t fid, void *buf, size_t len, + be64_t tag, be64_t mask, void *context) +{ + return -ENOSYS; +} + +static ssize_t psmx_tagged_recvv(fid_t fid, const void *iov, size_t len, + be64_t tag, be64_t mask, void *context) +{ + return -ENOSYS; +} + +static ssize_t psmx_tagged_recvfrom(fid_t fid, void *buf, size_t len, + const void *src_addr, + be64_t tag, be64_t mask, void *context) +{ + struct psmx_fid_socket *fid_socket; + psm_mq_req_t psm_req; + int err; + + fid_socket = container_of(fid, struct psmx_fid_socket, socket.fid); + assert(fid_socket->domain); + + err = psm_mq_irecv(fid_socket->domain->psm_mq, tag, ~mask, 0, /* flags */ + buf, len, context, &psm_req); + if (err != PSM_OK) + return psmx_errno(err); + + if (fid_socket->flags & (FI_BUFFERED_RECV | FI_CANCEL)) + ((struct fi_context *)context)->internal[0] = psm_req; + + return 0; +} + +static ssize_t psmx_tagged_recvmsg(fid_t fid, const struct fi_msg_tagged *msg, + uint64_t flags) +{ + return -ENOSYS; +} + +static ssize_t psmx_tagged_send(fid_t fid, const void *buf, size_t len, + be64_t tag, void *context) +{ + return -ENOSYS; +} + +static ssize_t psmx_tagged_sendv(fid_t fid, const void *iov, size_t len, + be64_t tag, void *context) +{ + return -ENOSYS; +} + +static ssize_t psmx_tagged_sendto(fid_t fid, const void *buf, size_t len, + const void *dest_addr, + be64_t tag, void *context) +{ + struct psmx_fid_socket *fid_socket; + int nonblocking; + int send_flag; + psm_epaddr_t psm_epaddr; + psm_mq_req_t psm_req; + int err; + int flags; + + fid_socket = container_of(fid, struct psmx_fid_socket, socket.fid); + assert(fid_socket->domain); + + psm_epaddr = (psm_epaddr_t) dest_addr; + + flags = fid_socket->flags; + + nonblocking = !!(flags & FI_NONBLOCK); + send_flag = (flags & FI_ACK) ? 
PSM_MQ_FLAG_SENDSYNC : 0; + + if (nonblocking) { + err = psm_mq_isend(fid_socket->domain->psm_mq, psm_epaddr, + send_flag, tag, buf, len, context, &psm_req); + + if (flags & (FI_BUFFERED_RECV | FI_CANCEL)) + ((struct fi_context *)context)->internal[0] = NULL; + /* send cannot be canceled */ + return 0; + } else { + err = psm_mq_send(fid_socket->domain->psm_mq, psm_epaddr, + send_flag, tag, buf, len); + if (err == PSM_OK) + return len; + else + return psmx_errno(err); + } +} + +static ssize_t psmx_tagged_sendmsg(fid_t fid, const struct fi_msg_tagged *msg, + uint64_t flags) +{ + return -ENOSYS; +} + +static ssize_t psmx_tagged_search(fid_t fid, be64_t *tag, be64_t mask, + uint64_t flags, void *src_addr, + size_t *src_addrlen, size_t *len, + void *context) +{ + struct psmx_fid_socket *fid_socket; + psm_mq_status_t psm_status; + int err; + + fid_socket = container_of(fid, struct psmx_fid_socket, socket.fid); + assert(fid_socket->domain); + + err = psm_mq_iprobe(fid_socket->domain->psm_mq, *tag, ~mask, &psm_status); + switch (err) { + case PSM_OK: + *tag = psm_status.msg_tag; + *len = psm_status.msg_length; + /* FIXME: fill in src_addr and src_addrlen */ + return 1; + + case PSM_MQ_NO_COMPLETIONS: + return -FI_ENOMSG; + + default: + return psmx_errno(err); + } +} + +struct fi_ops_tagged psmx_tagged_ops = { + .size = sizeof(struct fi_ops_tagged), + .recv = psmx_tagged_recv, + .recvv = psmx_tagged_recvv, + .recvfrom = psmx_tagged_recvfrom, + .recvmsg = psmx_tagged_recvmsg, + .send = psmx_tagged_send, + .sendv = psmx_tagged_sendv, + .sendto = psmx_tagged_sendto, + .sendmsg = psmx_tagged_sendmsg, + .search = psmx_tagged_search, +}; + diff --git a/prov/psm/src/psmx_util.c b/prov/psm/src/psmx_util.c new file mode 100644 index 00000000000..41d71f26885 --- /dev/null +++ b/prov/psm/src/psmx_util.c @@ -0,0 +1,270 @@ +/* + * Copyright (c) 2013 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenFabrics.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "psmx.h" + +void psmx_string_to_uuid(char *s, psm_uuid_t uuid) +{ + int n; + + n = sscanf(s, + "%2hhx%2hhx%2hhx%2hhx-" + "%2hhx%2hhx-%2hhx%2hhx-%2hhx%2hhx-" + "%2hhx%2hhx%2hhx%2hhx%2hhx%2hhx", + &uuid[0], &uuid[1], &uuid[2], &uuid[3], + &uuid[4], &uuid[5], &uuid[6], &uuid[7], &uuid[8], &uuid[9], + &uuid[10], &uuid[11], &uuid[12], &uuid[13], &uuid[14], &uuid[15]); + + if (n != 16) { + fprintf(stderr, "%s: wrong uuid format: %s\n", __func__, s); + fprintf(stderr, "%s: correct uuid format is: " + "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx\n", + __func__); + } +} + +int psmx_uuid_to_port(psm_uuid_t uuid) +{ + uint16_t port; + uint16_t *u = (uint16_t *)uuid; + + port = u[0] + u[1] + u[2] + u[3] + u[4] + u[5] + u[6] + u[7]; + if (port < 4096) + port += 4096; + + return (int)port; +} + +static void psmx_name_server_cleanup(void *args) +{ + close((int)(uintptr_t)args); +} + +/************************************************************* + * A simple name resolution mechanism for client-server style + * applications. The server side has to run first. The client + * side then passes the server name as the first parameter + * of fi_getinfo call and the resulting provider info should + * have the transport address of the server in the dst_addr + * field. Both side has to use the same UUID. + *************************************************************/ +void *psmx_name_server(void *args) +{ + struct psmx_fid_domain *fid_domain; + struct addrinfo hints = { + .ai_flags = AI_PASSIVE, + .ai_family = AF_UNSPEC, + .ai_socktype = SOCK_STREAM + }; + struct addrinfo *res, *p; + char *service; + int listenfd = -1, connfd; + int port; + int n; + + fid_domain = args; + port = fid_domain->ns_port; + + pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, NULL); + + if (asprintf(&service, "%d", port) < 0) + return NULL; + + n = getaddrinfo(NULL, service, &hints, &res); + if (n < 0) { + fprintf(stderr, "%s: port %d: %s\n", __func__, port, gai_strerror(n)); + free(service); + return NULL; + } + + for (p=res; p; p=p->ai_next) { + listenfd = socket(p->ai_family, p->ai_socktype, p->ai_protocol); + if (listenfd >= 0) { + n = 1; + setsockopt(listenfd, SOL_SOCKET, SO_REUSEADDR, &n, sizeof(n)); + if (!bind(listenfd, p->ai_addr, p->ai_addrlen)) + break; + close(listenfd); + listenfd = -1; + } + } + + freeaddrinfo(res); + free(service); + + if (listenfd < 0) { + fprintf(stderr, "%s: couldn't listen to port %d\n", __func__, port); + return NULL; + } + + listen(listenfd, 256); + + pthread_cleanup_push(psmx_name_server_cleanup, (void *)(uintptr_t)listenfd); + { + pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL); + pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, NULL); + + while (1) { + connfd = accept(listenfd, NULL, 0); + if (connfd >= 0) { + pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, NULL); + write(connfd, &fid_domain->psm_epid, sizeof(psm_epid_t)); + close(connfd); + pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL); + } + } + } + pthread_cleanup_pop(1); + + return NULL; +} + +void *psmx_resolve_name(char *servername, psm_uuid_t uuid) +{ + struct addrinfo hints = { + .ai_family = AF_UNSPEC, + .ai_socktype = SOCK_STREAM + }; + struct addrinfo *res, *p; + char *service; + void *dst_addr; + int sockfd = -1; + int port; + int n; + + port = psmx_uuid_to_port(uuid); + + if (asprintf(&service, "%d", port) < 0) + return NULL; + + n = getaddrinfo(servername, service, &hints, &res); + if (n < 0) { + fprintf(stderr, "%s:(%s:%d):%s\n", __func__, servername, port, gai_strerror(n)); + free(service); + return NULL; + } + + 
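+	/*
+	 * Client side of the name service described above: connect to the
+	 * server's listener (the port is derived from the shared UUID) and
+	 * read back its psm_epid_t. The buffer returned here is what ends
+	 * up in the dst_addr field of the fi_info returned by fi_getinfo.
+	 */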
for (p = res; p; p = p->ai_next) { + sockfd = socket(p->ai_family, p->ai_socktype, p->ai_protocol); + if (sockfd >= 0) { + if (!connect(sockfd, p->ai_addr, p->ai_addrlen)) + break; + close(sockfd); + sockfd = -1; + } + } + + freeaddrinfo(res); + free(service); + + if (sockfd < 0) { + fprintf(stderr, "%s: couldn't connect to %s:%d\n", __func__, servername, port); + return NULL; + } + + dst_addr = calloc(1,sizeof(*dst_addr)); + if (!dst_addr) { + close(sockfd); + return NULL; + } + + if (read(sockfd, dst_addr, sizeof(psm_epid_t)) != sizeof(psm_epid_t)) { + perror(__func__); + free(dst_addr); + close(sockfd); + return NULL; + } + + close(sockfd); + + return dst_addr; +} + +static int psmx_errno_table[PSM_ERROR_LAST] = { + 0, /* PSM_OK = 0 */ + 0, /* PSM_OK_NO_PROGRESS = 1 */ + -FI_EOTHER, + -FI_EINVAL, /* PSM_PARAM_ERR = 3 */ + -FI_ENOMEM, /* PSM_NO_MEMORY = 4 */ + -FI_EBADF, /* PSM_INIT_NOT_INIT = 5 */ + -FI_EINVAL, /* PSM_INIT_BAD_API_VERSION = 6 */ + -FI_ENOSYS, /* PSM_NO_AFFINITY = 7 */ + -FI_EIO, /* PSM_INTERNAL_ERR = 8 */ + -FI_EINVAL, /* PSM_SHMEM_SEGMENT_ERR = 9 */ + -FI_EACCES, /* PSM_OPT_READONLY = 10 */ + -FI_ETIMEDOUT, /* PSM_TIMEOUT = 11 */ + -FI_EMFILE, /* PSM_TOO_MANY_ENDPOINTS = 12 */ + -FI_ESHUTDOWN, /* PSM_IS_FINALIZED = 13 */ + -FI_EOTHER, -FI_EOTHER, -FI_EOTHER, -FI_EOTHER, -FI_EOTHER, -FI_EOTHER, + -FI_ESHUTDOWN, /* PSM_EP_WAS_CLOSED = 20 */ + -FI_ENODEV, /* PSM_EP_NO_DEVICE = 21 */ + -FI_ENOENT, /* PSM_EP_UNIT_NOT_FOUND = 22 */ + -FI_EIO, /* PSM_EP_DEVICE_FAILURE = 23 */ + -FI_ETIMEDOUT, /* PSM_EP_CLOSE_TIMEOUT = 24 */ + -FI_ENOENT, /* PSM_EP_NO_PORTS_AVAIL = 25 */ + -FI_ENETDOWN, /* PSM_EP_NO_NETWORK = 26 */ + -FI_EINVAL, /* PSM_EP_INVALID_UUID_KEY = 27 */ + -FI_ENOSPC, /* PSM_EP_NO_RESOURCES = 28 */ + -FI_EOTHER, -FI_EOTHER, -FI_EOTHER, -FI_EOTHER, -FI_EOTHER, -FI_EOTHER, + -FI_EOTHER, -FI_EOTHER, -FI_EOTHER, -FI_EOTHER, -FI_EOTHER, + -FI_EBADF, /* PSM_EPID_UNKNOWN = 40 */ + -FI_ENETUNREACH,/* PSM_EPID_UNREACHABLE = 41 */ + -FI_EOTHER, + -FI_EINVAL, /* PSM_EPID_INVALID_NODE = 43 */ + -FI_EINVAL, /* PSM_EPID_INVALID_MTU = 44 */ + -FI_EINVAL, /* PSM_EPID_INVALID_UUID_KEY = 45 */ + -FI_EINVAL, /* PSM_EPID_INVALID_VERSION = 46 */ + -FI_EINVAL, /* PSM_EPID_INVALID_CONNECT = 47 */ + -FI_EISCONN, /* PSM_EPID_ALREADY_CONNECTED = 48 */ + -FI_EIO, /* PSM_EPID_NETWORK_ERROR = 49 */ + -FI_EINVAL, /* PSM_EPID_INVALID_PKEY = 50 */ + -FI_ENETUNREACH,/* PSM_EPID_PATH_RESOLUTION = 51 */ + -FI_EOTHER, -FI_EOTHER, -FI_EOTHER, -FI_EOTHER, -FI_EOTHER, -FI_EOTHER, + -FI_EOTHER, -FI_EOTHER, + -FI_EAGAIN, /* PSM_MQ_NO_COMPLETIONS = 60 */ + -FI_EMSGSIZE, /* PSM_MQ_TRUNCATION = 61 */ + -FI_EOTHER, -FI_EOTHER, -FI_EOTHER, -FI_EOTHER, -FI_EOTHER, -FI_EOTHER, + -FI_EOTHER, -FI_EOTHER, + -FI_EINVAL, /* PSM_AM_INVALID_REPLY = 70 */ + -FI_EOTHER, -FI_EOTHER, -FI_EOTHER, -FI_EOTHER, -FI_EOTHER, -FI_EOTHER, + -FI_EOTHER, -FI_EOTHER, -FI_EOTHER + /* PSM_ERROR_LAST = 80 */ +}; + +int psmx_errno(int err) +{ + if (err >= 0 && err < PSM_ERROR_LAST) + return psmx_errno_table[err]; + else + return -FI_EOTHER; +} + diff --git a/prov/rdmacm/AUTHORS b/prov/rdmacm/AUTHORS new file mode 100644 index 00000000000..f76b870b4db --- /dev/null +++ b/prov/rdmacm/AUTHORS @@ -0,0 +1 @@ +Sean Hefty <sean.hefty@intel.com> diff --git a/prov/rdmacm/COPYING b/prov/rdmacm/COPYING new file mode 100644 index 00000000000..39f3831585f --- /dev/null +++ b/prov/rdmacm/COPYING @@ -0,0 +1,378 @@ +This software is available to you under a choice of one of two +licenses. 
You may choose to be licensed under the terms of the the +OpenIB.org BSD license or the GNU General Public License (GPL) Version +2, both included below. + +Copyright (c) 2005 Intel Corporation. All rights reserved. + +================================================================== + + OpenIB.org BSD license + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + +================================================================== + + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc. + 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Library General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. 
+ + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. 
+ + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. 
+ +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. 
If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. 
+ + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) year name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + <signature of Ty Coon>, 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Library General +Public License instead of this License. diff --git a/prov/rdmacm/examples/common.c b/prov/rdmacm/examples/common.c new file mode 100644 index 00000000000..2d10ea1262d --- /dev/null +++ b/prov/rdmacm/examples/common.c @@ -0,0 +1,168 @@ +/* + * Copyright (c) 2005-2006,2012 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * $Id$ + */ + +#include <stdlib.h> +#include <string.h> +#include <stdio.h> +#include <errno.h> +#include <sys/types.h> +#include <netinet/in.h> +#include <sys/socket.h> +#include <netdb.h> +#include <byteswap.h> + +#include <rdma/rdma_cma.h> +#include "common.h" + +int use_rs = 1; + +/* + * rdma_getaddrinfo is not exported by libfabric at this time + */ +//int get_rdma_addr(char *src, char *dst, char *port, +// struct rdma_addrinfo *hints, struct rdma_addrinfo **rai) +//{ +// struct rdma_addrinfo rai_hints, *res; +// int ret; +// +// if (hints->ai_flags & RAI_PASSIVE) +// return rdma_getaddrinfo(src, port, hints, rai); +// +// rai_hints = *hints; +// if (src) { +// rai_hints.ai_flags |= RAI_PASSIVE; +// ret = rdma_getaddrinfo(src, NULL, &rai_hints, &res); +// if (ret) +// return ret; +// +// rai_hints.ai_src_addr = res->ai_src_addr; +// rai_hints.ai_src_len = res->ai_src_len; +// rai_hints.ai_flags &= ~RAI_PASSIVE; +// } +// +// ret = rdma_getaddrinfo(dst, port, &rai_hints, rai); +// if (src) +// rdma_freeaddrinfo(res); +// +// return ret; +//} + +void size_str(char *str, size_t ssize, long long size) +{ + long long base, fraction = 0; + char mag; + + if (size >= (1 << 30)) { + base = 1 << 30; + mag = 'g'; + } else if (size >= (1 << 20)) { + base = 1 << 20; + mag = 'm'; + } else if (size >= (1 << 10)) { + base = 1 << 10; + mag = 'k'; + } else { + base = 1; + mag = '\0'; + } + + if (size / base < 10) + fraction = (size % base) * 10 / base; + if (fraction) { + snprintf(str, ssize, "%lld.%lld%c", size / base, fraction, mag); + } else { + snprintf(str, ssize, "%lld%c", size / base, mag); + } +} + +void cnt_str(char *str, size_t ssize, long long cnt) +{ + if (cnt >= 1000000000) + snprintf(str, ssize, "%lldb", cnt / 1000000000); + else if (cnt >= 1000000) + snprintf(str, ssize, "%lldm", cnt / 1000000); + else if (cnt >= 1000) + snprintf(str, ssize, "%lldk", cnt / 1000); + else + snprintf(str, ssize, "%lld", cnt); +} + +int size_to_count(int size) +{ + if (size >= (1 << 20)) + return 100; + else if (size >= (1 << 16)) + return 1000; + else if (size >= (1 << 10)) + return 10000; + else + return 100000; +} + +void format_buf(void *buf, int size) +{ + uint8_t *array = buf; + static uint8_t data; + int i; + + for (i = 0; i < size; i++) + array[i] = data++; +} + +int verify_buf(void *buf, int size) +{ + static long long total_bytes; + uint8_t *array = buf; + static uint8_t data; + int i; + + for (i = 0; i < size; i++, total_bytes++) { + if (array[i] != data++) { + printf("data verification failed byte %lld\n", total_bytes); + return -1; + } + } + return 0; +} + +int do_poll(struct pollfd *fds, int timeout) +{ + int ret; + + do { + ret = rs_poll(fds, 1, timeout); + } while (!ret); + + return ret == 1 ? 0 : ret; +} diff --git a/prov/rdmacm/examples/common.h b/prov/rdmacm/examples/common.h new file mode 100644 index 00000000000..f7511f03969 --- /dev/null +++ b/prov/rdmacm/examples/common.h @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2005-2012 Intel Corporation. All rights reserved. 
+ * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * $Id$ + */ + +#include <stdlib.h> +#include <sys/types.h> +#include <byteswap.h> +#include <poll.h> + +#include <rdma/rdma_cma.h> +#include <rdma/rsocket.h> +#include <infiniband/ib.h> + +#if __BYTE_ORDER == __BIG_ENDIAN +static inline uint64_t cpu_to_be64(uint64_t x) { return x; } +static inline uint32_t cpu_to_be32(uint32_t x) { return x; } +#else +static inline uint64_t cpu_to_be64(uint64_t x) { return bswap_64(x); } +static inline uint32_t cpu_to_be32(uint32_t x) { return bswap_32(x); } +#endif + +extern int use_rs; + +#define rs_socket(f,t,p) use_rs ? rsocket(f,t,p) : socket(f,t,p) +#define rs_bind(s,a,l) use_rs ? rbind(s,a,l) : bind(s,a,l) +#define rs_listen(s,b) use_rs ? rlisten(s,b) : listen(s,b) +#define rs_connect(s,a,l) use_rs ? rconnect(s,a,l) : connect(s,a,l) +#define rs_accept(s,a,l) use_rs ? raccept(s,a,l) : accept(s,a,l) +#define rs_shutdown(s,h) use_rs ? rshutdown(s,h) : shutdown(s,h) +#define rs_close(s) use_rs ? rclose(s) : close(s) +#define rs_recv(s,b,l,f) use_rs ? rrecv(s,b,l,f) : recv(s,b,l,f) +#define rs_send(s,b,l,f) use_rs ? rsend(s,b,l,f) : send(s,b,l,f) +#define rs_recvfrom(s,b,l,f,a,al) \ + use_rs ? rrecvfrom(s,b,l,f,a,al) : recvfrom(s,b,l,f,a,al) +#define rs_sendto(s,b,l,f,a,al) \ + use_rs ? rsendto(s,b,l,f,a,al) : sendto(s,b,l,f,a,al) +#define rs_poll(f,n,t) use_rs ? rpoll(f,n,t) : poll(f,n,t) +#define rs_fcntl(s,c,p) use_rs ? rfcntl(s,c,p) : fcntl(s,c,p) +#define rs_setsockopt(s,l,n,v,ol) \ + use_rs ? rsetsockopt(s,l,n,v,ol) : setsockopt(s,l,n,v,ol) +#define rs_getsockopt(s,l,n,v,ol) \ + use_rs ? 
rgetsockopt(s,l,n,v,ol) : getsockopt(s,l,n,v,ol) + +union socket_addr { + struct sockaddr sa; + struct sockaddr_in sin; + struct sockaddr_in6 sin6; +}; + +enum rs_optimization { + opt_mixed, + opt_latency, + opt_bandwidth +}; + +int get_rdma_addr(char *src, char *dst, char *port, + struct rdma_addrinfo *hints, struct rdma_addrinfo **rai); + +void size_str(char *str, size_t ssize, long long size); +void cnt_str(char *str, size_t ssize, long long cnt); +int size_to_count(int size); +void format_buf(void *buf, int size); +int verify_buf(void *buf, int size); +int do_poll(struct pollfd *fds, int timeout); diff --git a/prov/rdmacm/examples/rcopy.c b/prov/rdmacm/examples/rcopy.c new file mode 100644 index 00000000000..152acef2359 --- /dev/null +++ b/prov/rdmacm/examples/rcopy.c @@ -0,0 +1,628 @@ +/* + * Copyright (c) 2011 Intel Corporation. All rights reserved. + * + * This software is available to you under the OpenIB.org BSD license + * below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AWV + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <strings.h> +#include <errno.h> +#include <getopt.h> +#include <arpa/inet.h> +#include <sys/mman.h> +#include <sys/types.h> +#include <sys/socket.h> +#include <sys/time.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <netdb.h> +#include <unistd.h> + +#include <rdma/rsocket.h> + +union rsocket_address { + struct sockaddr sa; + struct sockaddr_in sin; + struct sockaddr_in6 sin6; + struct sockaddr_storage storage; +}; + +static char *port = "7427"; +static char *dst_addr; +static char *dst_file; +static char *src_file; +static struct timeval start, end; +//static void buf[1024 * 1024]; +static uint64_t bytes; +int fd; +void *file_addr; + +enum { + CMD_NOOP, + CMD_OPEN, + CMD_CLOSE, + CMD_WRITE, + CMD_RESP = 0x80, +}; + +/* TODO: handle byte swapping */ +struct msg_hdr { + uint8_t version; + uint8_t command; + uint16_t len; + uint32_t data; + uint64_t id; +}; + +struct msg_open { + struct msg_hdr hdr; + char path[0]; +}; + +struct msg_write { + struct msg_hdr hdr; + uint64_t size; +}; + +static void show_perf(void) +{ + float usec; + + usec = (end.tv_sec - start.tv_sec) * 1000000 + (end.tv_usec - start.tv_usec); + + printf("%lld bytes in %.2f seconds = %.2f Gb/sec\n", + (long long) bytes, usec / 1000000., (bytes * 8) / (1000. 
* usec)); +} + +static char *_ntop(union rsocket_address *rsa) +{ + static char addr[32]; + + switch (rsa->sa.sa_family) { + case AF_INET: + inet_ntop(AF_INET, &rsa->sin.sin_addr, addr, sizeof addr); + break; + case AF_INET6: + inet_ntop(AF_INET6, &rsa->sin6.sin6_addr, addr, sizeof addr); + break; + default: + addr[0] = '\0'; + break; + } + + return addr; +} + +static size_t _recv(int rs, char *msg, size_t len) +{ + size_t ret, offset; + + for (offset = 0; offset < len; offset += ret) { + ret = rrecv(rs, msg + offset, len - offset, 0); + if (ret <= 0) + return ret; + } + + return len; +} + +static int msg_recv_hdr(int rs, struct msg_hdr *hdr) +{ + int ret; + + ret = _recv(rs, (char *) hdr, sizeof *hdr); + if (ret != sizeof *hdr) + return -1; + + if (hdr->version || hdr->len < sizeof *hdr) { + printf("invalid version %d or length %d\n", + hdr->version, hdr->len); + return -1; + } + + return sizeof *hdr; +} + +static int msg_get_resp(int rs, struct msg_hdr *msg, uint8_t cmd) +{ + int ret; + + ret = msg_recv_hdr(rs, msg); + if (ret != sizeof *msg) + return ret; + + if ((msg->len != sizeof *msg) || (msg->command != (cmd | CMD_RESP))) { + printf("invalid length %d or bad command response %x:%x\n", + msg->len, msg->command, cmd | CMD_RESP); + return -1; + } + + return msg->data; +} + +static void msg_send_resp(int rs, struct msg_hdr *msg, uint32_t status) +{ + struct msg_hdr resp; + + resp.version = 0; + resp.command = msg->command | CMD_RESP; + resp.len = sizeof resp; + resp.data = status; + resp.id = msg->id; + rsend(rs, (char *) &resp, sizeof resp, 0); +} + +static int server_listen(void) +{ + struct addrinfo hints, *res; + int ret, rs; + + memset(&hints, 0, sizeof hints); + hints.ai_flags = RAI_PASSIVE; + ret = getaddrinfo(NULL, port, &hints, &res); + if (ret) { + perror("getaddrinfo failed\n"); + return ret; + } + + rs = rsocket(res->ai_family, res->ai_socktype, res->ai_protocol); + if (rs < 0) { + perror("rsocket failed\n"); + ret = rs; + goto free; + } + + ret = 1; + ret = rsetsockopt(rs, SOL_SOCKET, SO_REUSEADDR, &ret, sizeof ret); + if (ret) { + perror("rsetsockopt failed"); + goto close; + } + + ret = rbind(rs, res->ai_addr, res->ai_addrlen); + if (ret) { + perror("rbind failed"); + goto close; + } + + ret = rlisten(rs, 1); + if (ret) { + perror("rlisten failed"); + goto close; + } + + ret = rs; + goto free; + +close: + rclose(rs); +free: + freeaddrinfo(res); + return ret; +} + +static int server_open(int rs, struct msg_hdr *msg) +{ + char *path = NULL; + int ret, len; + + printf("opening: "); + fflush(NULL); + if (file_addr || fd > 0) { + printf("cannot open another file\n"); + ret = EBUSY; + goto out; + } + + len = msg->len - sizeof *msg; + path = malloc(len); + if (!path) { + printf("cannot allocate path name\n"); + ret = ENOMEM; + goto out; + } + + ret = _recv(rs, path, len); + if (ret != len) { + printf("error receiving path\n"); + goto out; + } + + printf("%s, ", path); + fflush(NULL); + fd = open(path, O_RDWR | O_CREAT | O_TRUNC, msg->data); + if (fd < 0) { + printf("unable to open destination file\n"); + ret = errno; + } + + ret = 0; +out: + if (path) + free(path); + + msg_send_resp(rs, msg, ret); + return ret; +} + +static void server_close(int rs, struct msg_hdr *msg) +{ + printf("closing..."); + fflush(NULL); + msg_send_resp(rs, msg, 0); + + if (file_addr) { + munmap(file_addr, bytes); + file_addr = 0; + } + + if (fd > 0) { + close(fd); + fd = 0; + } + printf("done\n"); +} + +static int server_write(int rs, struct msg_hdr *msg) +{ + size_t len; + int ret; + + 
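+	/*
+	 * CMD_WRITE handler: the message header has already been consumed by
+	 * server_process(), so read the 64-bit transfer size that follows,
+	 * grow the destination file to that size, map it writable, and
+	 * receive the file contents directly into the mapping before
+	 * acknowledging the command with msg_send_resp().
+	 */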
printf("transferring"); + fflush(NULL); + if (fd <= 0) { + printf("...file not opened\n"); + ret = EINVAL; + goto out; + } + + if (msg->len != sizeof(struct msg_write)) { + printf("...invalid message length %d\n", msg->len); + ret = EINVAL; + goto out; + } + + ret = _recv(rs, (char *) &bytes, sizeof bytes); + if (ret != sizeof bytes) + goto out; + + ret = ftruncate(fd, bytes); + if (ret) + goto out; + + file_addr = mmap(NULL, bytes, PROT_WRITE, MAP_SHARED, fd, 0); + if (file_addr == (void *) -1) { + printf("...error mapping file\n"); + ret = errno; + goto out; + } + + printf("...%lld bytes...", (long long) bytes); + fflush(NULL); + len = _recv(rs, file_addr, bytes); + if (len != bytes) { + printf("...error receiving data\n"); + ret = (int) len; + } +out: + msg_send_resp(rs, msg, ret); + return ret; +} + +static void server_process(int rs) +{ + struct msg_hdr msg; + int ret; + + do { + ret = msg_recv_hdr(rs, &msg); + if (ret != sizeof msg) + break; + + switch (msg.command) { + case CMD_OPEN: + ret = server_open(rs, &msg); + break; + case CMD_CLOSE: + server_close(rs, &msg); + ret = 0; + break; + case CMD_WRITE: + ret = server_write(rs, &msg); + break; + default: + msg_send_resp(rs, &msg, EINVAL); + ret = -1; + break; + } + + } while (!ret); +} + +static int server_run(void) +{ + int lrs, rs; + union rsocket_address rsa; + socklen_t len; + + lrs = server_listen(); + if (lrs < 0) + return lrs; + + while (1) { + len = sizeof rsa; + printf("waiting for connection..."); + fflush(NULL); + rs = raccept(lrs, &rsa.sa, &len); + + printf("client: %s\n", _ntop(&rsa)); + server_process(rs); + + rshutdown(rs, SHUT_RDWR); + rclose(rs); + } + return 0; +} + +static int client_connect(void) +{ + struct addrinfo *res; + int ret, rs; + + ret = getaddrinfo(dst_addr, port, NULL, &res); + if (ret) { + perror("getaddrinfo failed\n"); + return ret; + } + + rs = rsocket(res->ai_family, res->ai_socktype, res->ai_protocol); + if (rs < 0) { + perror("rsocket failed\n"); + goto free; + } + + ret = rconnect(rs, res->ai_addr, res->ai_addrlen); + if (ret) { + perror("rconnect failed\n"); + rclose(rs); + rs = ret; + } + +free: + freeaddrinfo(res); + return rs; +} + +static int client_open(int rs) +{ + struct msg_open *msg; + struct stat stats; + uint32_t len; + int ret; + + printf("opening..."); + fflush(NULL); + fd = open(src_file, O_RDONLY); + if (fd < 0) + return fd; + + ret = fstat(fd, &stats); + if (ret < 0) + goto err1; + + bytes = (uint64_t) stats.st_size; + file_addr = mmap(NULL, bytes, PROT_READ, MAP_SHARED, fd, 0); + if (file_addr == (void *) -1) { + ret = errno; + goto err1; + } + + len = (((uint32_t) strlen(dst_file)) + 8) & 0xFFFFFFF8; + msg = calloc(1, sizeof(*msg) + len); + if (!msg) { + ret = -1; + goto err2; + } + + msg->hdr.command = CMD_OPEN; + msg->hdr.len = sizeof(*msg) + len; + msg->hdr.data = (uint32_t) stats.st_mode; + strcpy(msg->path, dst_file); + ret = rsend(rs, msg, msg->hdr.len, 0); + if (ret != msg->hdr.len) + goto err3; + + ret = msg_get_resp(rs, &msg->hdr, CMD_OPEN); + if (ret) + goto err3; + + return 0; + +err3: + free(msg); +err2: + munmap(file_addr, bytes); +err1: + close(fd); + return ret; +} + +static int client_start_write(int rs) +{ + struct msg_write msg; + int ret; + + printf("transferring"); + fflush(NULL); + memset(&msg, 0, sizeof msg); + msg.hdr.command = CMD_WRITE; + msg.hdr.len = sizeof(msg); + msg.size = bytes; + + ret = rsend(rs, &msg, sizeof msg, 0); + if (ret != msg.hdr.len) + return ret; + + return 0; +} + +static int client_close(int rs) +{ + struct msg_hdr msg; + int 
ret; + + printf("closing..."); + fflush(NULL); + memset(&msg, 0, sizeof msg); + msg.command = CMD_CLOSE; + msg.len = sizeof msg; + ret = rsend(rs, (char *) &msg, msg.len, 0); + if (ret != msg.len) + goto out; + + ret = msg_get_resp(rs, &msg, CMD_CLOSE); + if (ret) + goto out; + + printf("done\n"); +out: + munmap(file_addr, bytes); + close(fd); + return ret; +} + +static int client_run(void) +{ + struct msg_hdr ack; + int ret, rs; + size_t len; + + rs = client_connect(); + if (rs < 0) + return rs; + + ret = client_open(rs); + if (ret) + goto shutdown; + + ret = client_start_write(rs); + if (ret) + goto close; + + printf("..."); + fflush(NULL); + gettimeofday(&start, NULL); + len = rsend(rs, file_addr, bytes, 0); + if (len == bytes) + ret = msg_get_resp(rs, &ack, CMD_WRITE); + else + ret = (int) len; + + gettimeofday(&end, NULL); + +close: + client_close(rs); +shutdown: + rshutdown(rs, SHUT_RDWR); + rclose(rs); + if (!ret) + show_perf(); + return ret; +} + +static void show_usage(char *program) +{ + printf("usage 1: %s [options]\n", program); + printf("\t starts the server application\n"); + printf("\t[-p port_number]\n"); + printf("usage 2: %s source server[:destination] [options]\n", program); + printf("\t source - file name and path\n"); + printf("\t server - name or address\n"); + printf("\t destination - file name and path\n"); + printf("\t[-p port_number]\n"); + exit(1); +} + +static void server_opts(int argc, char **argv) +{ + int op; + + while ((op = getopt(argc, argv, "p:")) != -1) { + switch (op) { + case 'p': + port = optarg; + break; + default: + show_usage(argv[0]); + } + } +} + +static void client_opts(int argc, char **argv) +{ + int op; + + if (argc < 3) + show_usage(argv[0]); + + src_file = argv[1]; + dst_addr = argv[2]; + dst_file = strchr(dst_addr, ':'); + if (dst_file) { + *dst_file = '\0'; + dst_file++; + } + if (!dst_file) + dst_file = src_file; + + while ((op = getopt(argc, argv, "p:")) != -1) { + switch (op) { + case 'p': + port = optarg; + break; + default: + show_usage(argv[0]); + } + } + +} + +int main(int argc, char **argv) +{ + int ret; + + if (argc == 1 || argv[1][0] == '-') { + server_opts(argc, argv); + ret = server_run(); + } else { + client_opts(argc, argv); + ret = client_run(); + } + + return ret; +} diff --git a/prov/rdmacm/examples/riostream.c b/prov/rdmacm/examples/riostream.c new file mode 100644 index 00000000000..a1d36718aed --- /dev/null +++ b/prov/rdmacm/examples/riostream.c @@ -0,0 +1,639 @@ +/* + * Copyright (c) 2011-2012 Intel Corporation. All rights reserved. + * + * This software is available to you under the OpenIB.org BSD license + * below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AWV + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <strings.h> +#include <errno.h> +#include <getopt.h> +#include <sys/types.h> +#include <sys/socket.h> +#include <sys/time.h> +#include <sys/wait.h> +#include <netdb.h> +#include <fcntl.h> +#include <unistd.h> +#include <netinet/in.h> +#include <netinet/tcp.h> + +#include <rdma/rdma_cma.h> +#include <rdma/rsocket.h> +#include "common.h" + +struct test_size_param { + int size; + int option; +}; + +static struct test_size_param test_size[] = { + { 1 << 6, 0 }, + { 1 << 7, 1 }, { (1 << 7) + (1 << 6), 1}, + { 1 << 8, 1 }, { (1 << 8) + (1 << 7), 1}, + { 1 << 9, 1 }, { (1 << 9) + (1 << 8), 1}, + { 1 << 10, 1 }, { (1 << 10) + (1 << 9), 1}, + { 1 << 11, 1 }, { (1 << 11) + (1 << 10), 1}, + { 1 << 12, 0 }, { (1 << 12) + (1 << 11), 1}, + { 1 << 13, 1 }, { (1 << 13) + (1 << 12), 1}, + { 1 << 14, 1 }, { (1 << 14) + (1 << 13), 1}, + { 1 << 15, 1 }, { (1 << 15) + (1 << 14), 1}, + { 1 << 16, 0 }, { (1 << 16) + (1 << 15), 1}, + { 1 << 17, 1 }, { (1 << 17) + (1 << 16), 1}, + { 1 << 18, 1 }, { (1 << 18) + (1 << 17), 1}, + { 1 << 19, 1 }, { (1 << 19) + (1 << 18), 1}, + { 1 << 20, 0 }, { (1 << 20) + (1 << 19), 1}, + { 1 << 21, 1 }, { (1 << 21) + (1 << 20), 1}, + { 1 << 22, 1 }, { (1 << 22) + (1 << 21), 1}, +}; +#define TEST_CNT (sizeof test_size / sizeof test_size[0]) + +static int rs, lrs; +static int use_async; +static int verify; +static int flags = MSG_DONTWAIT; +static int poll_timeout = 0; +static int custom; +static enum rs_optimization optimization; +static int size_option; +static int iterations = 1; +static int transfer_size = 1000; +static int transfer_count = 1000; +static int buffer_size; +static char test_name[10] = "custom"; +static char *port = "7471"; +static char *dst_addr; +static char *src_addr; +static struct timeval start, end; +static void *buf; +static volatile uint8_t *poll_byte; + +static void show_perf(void) +{ + char str[32]; + float usec; + long long bytes; + + usec = (end.tv_sec - start.tv_sec) * 1000000 + (end.tv_usec - start.tv_usec); + bytes = (long long) iterations * transfer_count * transfer_size * 2; + + /* name size transfers iterations bytes seconds Gb/sec usec/xfer */ + printf("%-10s", test_name); + size_str(str, sizeof str, transfer_size); + printf("%-8s", str); + cnt_str(str, sizeof str, transfer_count); + printf("%-8s", str); + cnt_str(str, sizeof str, iterations); + printf("%-8s", str); + size_str(str, sizeof str, bytes); + printf("%-8s", str); + printf("%8.2fs%10.2f%11.2f\n", + usec / 1000000., (bytes * 8) / (1000. 
* usec), + (usec / iterations) / (transfer_count * 2)); +} + +static void init_latency_test(int size) +{ + char sstr[5]; + + size_str(sstr, sizeof sstr, size); + snprintf(test_name, sizeof test_name, "%s_lat", sstr); + transfer_count = 1; + transfer_size = size; + iterations = size_to_count(transfer_size); +} + +static void init_bandwidth_test(int size) +{ + char sstr[5]; + + size_str(sstr, sizeof sstr, size); + snprintf(test_name, sizeof test_name, "%s_bw", sstr); + iterations = 1; + transfer_size = size; + transfer_count = size_to_count(transfer_size); +} + +static int send_msg(int size) +{ + struct pollfd fds; + int offset, ret; + + if (verify) + format_buf(buf, size); + + if (use_async) { + fds.fd = rs; + fds.events = POLLOUT; + } + + for (offset = 0; offset < size; ) { + if (use_async) { + ret = do_poll(&fds, poll_timeout); + if (ret) + return ret; + } + + ret = rsend(rs, buf + offset, size - offset, flags); + if (ret > 0) { + offset += ret; + } else if (errno != EWOULDBLOCK && errno != EAGAIN) { + perror("rsend"); + return ret; + } + } + + return 0; +} + +static int send_xfer(int size) +{ + struct pollfd fds; + int offset, ret; + + if (verify) + format_buf(buf, size - 1); + + if (use_async) { + fds.fd = rs; + fds.events = POLLOUT; + } + + for (offset = 0; offset < size; ) { + if (use_async) { + ret = do_poll(&fds, poll_timeout); + if (ret) + return ret; + } + + ret = riowrite(rs, buf + offset, size - offset, offset, flags); + if (ret > 0) { + offset += ret; + } else if (errno != EWOULDBLOCK && errno != EAGAIN) { + perror("riowrite"); + return ret; + } + } + + return 0; +} + +static int recv_msg(int size) +{ + struct pollfd fds; + int offset, ret; + + if (use_async) { + fds.fd = rs; + fds.events = POLLIN; + } + + for (offset = 0; offset < size; ) { + if (use_async) { + ret = do_poll(&fds, poll_timeout); + if (ret) + return ret; + } + + ret = rrecv(rs, buf + offset, size - offset, flags); + if (ret > 0) { + offset += ret; + } else if (errno != EWOULDBLOCK && errno != EAGAIN) { + perror("rrecv"); + return ret; + } + } + + if (verify) { + ret = verify_buf(buf, size); + if (ret) + return ret; + } + + return 0; +} + +static int recv_xfer(int size, uint8_t marker) +{ + int ret; + + while (*poll_byte != marker) + ; + + if (verify) { + ret = verify_buf(buf, size - 1); + if (ret) + return ret; + } + + return 0; +} + +static int sync_test(void) +{ + int ret; + + ret = dst_addr ? send_msg(16) : recv_msg(16); + if (ret) + return ret; + + return dst_addr ? 
recv_msg(16) : send_msg(16); +} + +static int run_test(void) +{ + int ret, i, t; + off_t offset; + uint8_t marker = 0; + + poll_byte = buf + transfer_size - 1; + *poll_byte = -1; + offset = riomap(rs, buf, transfer_size, PROT_WRITE, 0, 0); + if (offset == -1) { + perror("riomap"); + ret = -1; + goto out; + } + ret = sync_test(); + if (ret) + goto out; + + gettimeofday(&start, NULL); + for (i = 0; i < iterations; i++) { + if (dst_addr) { + for (t = 0; t < transfer_count - 1; t++) { + ret = send_xfer(transfer_size); + if (ret) + goto out; + } + *poll_byte = (uint8_t) marker++; + ret = send_xfer(transfer_size); + if (ret) + goto out; + + ret = recv_xfer(transfer_size, marker++); + } else { + ret = recv_xfer(transfer_size, marker++); + if (ret) + goto out; + + for (t = 0; t < transfer_count - 1; t++) { + ret = send_xfer(transfer_size); + if (ret) + goto out; + } + *poll_byte = (uint8_t) marker++; + ret = send_xfer(transfer_size); + } + if (ret) + goto out; + } + gettimeofday(&end, NULL); + show_perf(); + ret = riounmap(rs, buf, transfer_size); + +out: + return ret; +} + +static void set_options(int rs) +{ + int val; + + if (buffer_size) { + rsetsockopt(rs, SOL_SOCKET, SO_SNDBUF, (void *) &buffer_size, + sizeof buffer_size); + rsetsockopt(rs, SOL_SOCKET, SO_RCVBUF, (void *) &buffer_size, + sizeof buffer_size); + } else { + val = 1 << 19; + rsetsockopt(rs, SOL_SOCKET, SO_SNDBUF, (void *) &val, sizeof val); + rsetsockopt(rs, SOL_SOCKET, SO_RCVBUF, (void *) &val, sizeof val); + } + + val = 1; + rsetsockopt(rs, IPPROTO_TCP, TCP_NODELAY, (void *) &val, sizeof(val)); + rsetsockopt(rs, SOL_RDMA, RDMA_IOMAPSIZE, (void *) &val, sizeof val); + + if (flags & MSG_DONTWAIT) + rfcntl(rs, F_SETFL, O_NONBLOCK); + + /* Inline size based on experimental data */ + if (optimization == opt_latency) { + val = 384; + rsetsockopt(rs, SOL_RDMA, RDMA_INLINE, &val, sizeof val); + } else if (optimization == opt_bandwidth) { + val = 0; + rsetsockopt(rs, SOL_RDMA, RDMA_INLINE, &val, sizeof val); + } +} + +static int server_listen(void) +{ + struct addrinfo hints, *res; + int val, ret; + + memset(&hints, 0, sizeof hints); + hints.ai_flags = AI_PASSIVE; + ret = getaddrinfo(src_addr, port, &hints, &res); + if (ret) { + perror("getaddrinfo"); + return ret; + } + + lrs = rsocket(res->ai_family, res->ai_socktype, res->ai_protocol); + if (lrs < 0) { + perror("rsocket"); + ret = lrs; + goto free; + } + + val = 1; + ret = rsetsockopt(lrs, SOL_SOCKET, SO_REUSEADDR, &val, sizeof val); + if (ret) { + perror("rsetsockopt SO_REUSEADDR"); + goto close; + } + + ret = rbind(lrs, res->ai_addr, res->ai_addrlen); + if (ret) { + perror("rbind"); + goto close; + } + + ret = rlisten(lrs, 1); + if (ret) + perror("rlisten"); + +close: + if (ret) + rclose(lrs); +free: + freeaddrinfo(res); + return ret; +} + +static int server_connect(void) +{ + struct pollfd fds; + int ret = 0; + + set_options(lrs); + do { + if (use_async) { + fds.fd = lrs; + fds.events = POLLIN; + + ret = do_poll(&fds, poll_timeout); + if (ret) { + perror("rpoll"); + return ret; + } + } + + rs = raccept(lrs, NULL, 0); + } while (rs < 0 && (errno == EAGAIN || errno == EWOULDBLOCK)); + if (rs < 0) { + perror("raccept"); + return rs; + } + + set_options(rs); + return ret; +} + +static int client_connect(void) +{ + struct addrinfo *res; + struct pollfd fds; + int ret, err; + socklen_t len; + + ret = getaddrinfo(dst_addr, port, NULL, &res); + if (ret) { + perror("getaddrinfo"); + return ret; + } + + rs = rsocket(res->ai_family, res->ai_socktype, res->ai_protocol); + if (rs < 0) { + 
perror("rsocket"); + ret = rs; + goto free; + } + + set_options(rs); + /* TODO: bind client to src_addr */ + + ret = rconnect(rs, res->ai_addr, res->ai_addrlen); + if (ret && (errno != EINPROGRESS)) { + perror("rconnect"); + goto close; + } + + if (ret && (errno == EINPROGRESS)) { + fds.fd = rs; + fds.events = POLLOUT; + ret = do_poll(&fds, poll_timeout); + if (ret) + goto close; + + len = sizeof err; + ret = rgetsockopt(rs, SOL_SOCKET, SO_ERROR, &err, &len); + if (ret) + goto close; + if (err) { + ret = -1; + errno = err; + perror("async rconnect"); + } + } + +close: + if (ret) + rclose(rs); +free: + freeaddrinfo(res); + return ret; +} + +static int run(void) +{ + int i, ret = 0; + + buf = malloc(!custom ? test_size[TEST_CNT - 1].size : transfer_size); + if (!buf) { + perror("malloc"); + return -1; + } + + if (!dst_addr) { + ret = server_listen(); + if (ret) + goto free; + } + + printf("%-10s%-8s%-8s%-8s%-8s%8s %10s%13s\n", + "name", "bytes", "xfers", "iters", "total", "time", "Gb/sec", "usec/xfer"); + if (!custom) { + optimization = opt_latency; + ret = dst_addr ? client_connect() : server_connect(); + if (ret) + goto free; + + for (i = 0; i < TEST_CNT; i++) { + if (test_size[i].option > size_option) + continue; + init_latency_test(test_size[i].size); + run_test(); + } + rshutdown(rs, SHUT_RDWR); + rclose(rs); + + optimization = opt_bandwidth; + ret = dst_addr ? client_connect() : server_connect(); + if (ret) + goto free; + for (i = 0; i < TEST_CNT; i++) { + if (test_size[i].option > size_option) + continue; + init_bandwidth_test(test_size[i].size); + run_test(); + } + } else { + ret = dst_addr ? client_connect() : server_connect(); + if (ret) + goto free; + + ret = run_test(); + } + + rshutdown(rs, SHUT_RDWR); + rclose(rs); +free: + free(buf); + return ret; +} + +static int set_test_opt(char *optarg) +{ + if (strlen(optarg) == 1) { + switch (optarg[0]) { + case 'a': + use_async = 1; + break; + case 'b': + flags = (flags & ~MSG_DONTWAIT) | MSG_WAITALL; + break; + case 'n': + flags |= MSG_DONTWAIT; + break; + case 'v': + verify = 1; + break; + default: + return -1; + } + } else { + if (!strncasecmp("async", optarg, 5)) { + use_async = 1; + } else if (!strncasecmp("block", optarg, 5)) { + flags = (flags & ~MSG_DONTWAIT) | MSG_WAITALL; + } else if (!strncasecmp("nonblock", optarg, 8)) { + flags |= MSG_DONTWAIT; + } else if (!strncasecmp("verify", optarg, 6)) { + verify = 1; + } else { + return -1; + } + } + return 0; +} + +int main(int argc, char **argv) +{ + int op, ret; + + while ((op = getopt(argc, argv, "s:b:B:I:C:S:p:T:")) != -1) { + switch (op) { + case 's': + dst_addr = optarg; + break; + case 'b': + src_addr = optarg; + break; + case 'B': + buffer_size = atoi(optarg); + break; + case 'I': + custom = 1; + iterations = atoi(optarg); + break; + case 'C': + custom = 1; + transfer_count = atoi(optarg); + break; + case 'S': + if (!strncasecmp("all", optarg, 3)) { + size_option = 1; + } else { + custom = 1; + transfer_size = atoi(optarg); + } + break; + case 'p': + port = optarg; + break; + case 'T': + if (!set_test_opt(optarg)) + break; + /* invalid option - fall through */ + default: + printf("usage: %s\n", argv[0]); + printf("\t[-s server_address]\n"); + printf("\t[-b bind_address]\n"); + printf("\t[-B buffer_size]\n"); + printf("\t[-I iterations]\n"); + printf("\t[-C transfer_count]\n"); + printf("\t[-S transfer_size or all]\n"); + printf("\t[-p port_number]\n"); + printf("\t[-T test_option]\n"); + printf("\t a|async - asynchronous operation (use poll)\n"); + printf("\t b|blocking - 
use blocking calls\n"); + printf("\t n|nonblocking - use nonblocking calls\n"); + printf("\t v|verify - verify data\n"); + exit(1); + } + } + + if (!(flags & MSG_DONTWAIT)) + poll_timeout = -1; + + ret = run(); + return ret; +} diff --git a/prov/rdmacm/examples/rstream.c b/prov/rdmacm/examples/rstream.c new file mode 100644 index 00000000000..e94e8807048 --- /dev/null +++ b/prov/rdmacm/examples/rstream.c @@ -0,0 +1,609 @@ +/* + * Copyright (c) 2011-2012 Intel Corporation. All rights reserved. + * + * This software is available to you under the OpenIB.org BSD license + * below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AWV + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <strings.h> +#include <errno.h> +#include <getopt.h> +#include <sys/types.h> +#include <sys/socket.h> +#include <sys/time.h> +#include <sys/wait.h> +#include <netdb.h> +#include <fcntl.h> +#include <unistd.h> +#include <netinet/in.h> +#include <netinet/tcp.h> + +#include <rdma/rsocket.h> +#include "common.h" + +struct test_size_param { + int size; + int option; +}; + +static struct test_size_param test_size[] = { + { 1 << 6, 0 }, + { 1 << 7, 1 }, { (1 << 7) + (1 << 6), 1}, + { 1 << 8, 1 }, { (1 << 8) + (1 << 7), 1}, + { 1 << 9, 1 }, { (1 << 9) + (1 << 8), 1}, + { 1 << 10, 1 }, { (1 << 10) + (1 << 9), 1}, + { 1 << 11, 1 }, { (1 << 11) + (1 << 10), 1}, + { 1 << 12, 0 }, { (1 << 12) + (1 << 11), 1}, + { 1 << 13, 1 }, { (1 << 13) + (1 << 12), 1}, + { 1 << 14, 1 }, { (1 << 14) + (1 << 13), 1}, + { 1 << 15, 1 }, { (1 << 15) + (1 << 14), 1}, + { 1 << 16, 0 }, { (1 << 16) + (1 << 15), 1}, + { 1 << 17, 1 }, { (1 << 17) + (1 << 16), 1}, + { 1 << 18, 1 }, { (1 << 18) + (1 << 17), 1}, + { 1 << 19, 1 }, { (1 << 19) + (1 << 18), 1}, + { 1 << 20, 0 }, { (1 << 20) + (1 << 19), 1}, + { 1 << 21, 1 }, { (1 << 21) + (1 << 20), 1}, + { 1 << 22, 1 }, { (1 << 22) + (1 << 21), 1}, +}; +#define TEST_CNT (sizeof test_size / sizeof test_size[0]) + +static int rs, lrs; +static int use_async; +static int verify; +static int flags = MSG_DONTWAIT; +static int poll_timeout = 0; +static int custom; +static int use_fork; +static pid_t fork_pid; +static enum rs_optimization optimization; +static int size_option; +static int iterations = 1; +static int transfer_size = 1000; +static int transfer_count = 1000; +static int buffer_size; +static char test_name[10] = "custom"; +static char *port = "7471"; +static char *dst_addr; +static char *src_addr; +static struct timeval start, end; +static void *buf; +static struct addrinfo ai_hints; + +static void show_perf(void) +{ + 
char str[32]; + float usec; + long long bytes; + + usec = (end.tv_sec - start.tv_sec) * 1000000 + (end.tv_usec - start.tv_usec); + bytes = (long long) iterations * transfer_count * transfer_size * 2; + + /* name size transfers iterations bytes seconds Gb/sec usec/xfer */ + printf("%-10s", test_name); + size_str(str, sizeof str, transfer_size); + printf("%-8s", str); + cnt_str(str, sizeof str, transfer_count); + printf("%-8s", str); + cnt_str(str, sizeof str, iterations); + printf("%-8s", str); + size_str(str, sizeof str, bytes); + printf("%-8s", str); + printf("%8.2fs%10.2f%11.2f\n", + usec / 1000000., (bytes * 8) / (1000. * usec), + (usec / iterations) / (transfer_count * 2)); +} + +static void init_latency_test(int size) +{ + char sstr[5]; + + size_str(sstr, sizeof sstr, size); + snprintf(test_name, sizeof test_name, "%s_lat", sstr); + transfer_count = 1; + transfer_size = size; + iterations = size_to_count(transfer_size); +} + +static void init_bandwidth_test(int size) +{ + char sstr[5]; + + size_str(sstr, sizeof sstr, size); + snprintf(test_name, sizeof test_name, "%s_bw", sstr); + iterations = 1; + transfer_size = size; + transfer_count = size_to_count(transfer_size); +} + +static int send_xfer(int size) +{ + struct pollfd fds; + int offset, ret; + + if (verify) + format_buf(buf, size); + + if (use_async) { + fds.fd = rs; + fds.events = POLLOUT; + } + + for (offset = 0; offset < size; ) { + if (use_async) { + ret = do_poll(&fds, poll_timeout); + if (ret) + return ret; + } + + ret = rs_send(rs, buf + offset, size - offset, flags); + if (ret > 0) { + offset += ret; + } else if (errno != EWOULDBLOCK && errno != EAGAIN) { + perror("rsend"); + return ret; + } + } + + return 0; +} + +static int recv_xfer(int size) +{ + struct pollfd fds; + int offset, ret; + + if (use_async) { + fds.fd = rs; + fds.events = POLLIN; + } + + for (offset = 0; offset < size; ) { + if (use_async) { + ret = do_poll(&fds, poll_timeout); + if (ret) + return ret; + } + + ret = rs_recv(rs, buf + offset, size - offset, flags); + if (ret > 0) { + offset += ret; + } else if (errno != EWOULDBLOCK && errno != EAGAIN) { + perror("rrecv"); + return ret; + } + } + + if (verify) { + ret = verify_buf(buf, size); + if (ret) + return ret; + } + + return 0; +} + +static int sync_test(void) +{ + int ret; + + ret = dst_addr ? send_xfer(16) : recv_xfer(16); + if (ret) + return ret; + + return dst_addr ? recv_xfer(16) : send_xfer(16); +} + +static int run_test(void) +{ + int ret, i, t; + + ret = sync_test(); + if (ret) + goto out; + + gettimeofday(&start, NULL); + for (i = 0; i < iterations; i++) { + for (t = 0; t < transfer_count; t++) { + ret = dst_addr ? send_xfer(transfer_size) : + recv_xfer(transfer_size); + if (ret) + goto out; + } + + for (t = 0; t < transfer_count; t++) { + ret = dst_addr ? 
recv_xfer(transfer_size) : + send_xfer(transfer_size); + if (ret) + goto out; + } + } + gettimeofday(&end, NULL); + show_perf(); + ret = 0; + +out: + return ret; +} + +static void set_options(int rs) +{ + int val; + + if (buffer_size) { + rs_setsockopt(rs, SOL_SOCKET, SO_SNDBUF, (void *) &buffer_size, + sizeof buffer_size); + rs_setsockopt(rs, SOL_SOCKET, SO_RCVBUF, (void *) &buffer_size, + sizeof buffer_size); + } else { + val = 1 << 19; + rs_setsockopt(rs, SOL_SOCKET, SO_SNDBUF, (void *) &val, sizeof val); + rs_setsockopt(rs, SOL_SOCKET, SO_RCVBUF, (void *) &val, sizeof val); + } + + val = 1; + rs_setsockopt(rs, IPPROTO_TCP, TCP_NODELAY, (void *) &val, sizeof(val)); + + if (flags & MSG_DONTWAIT) + rs_fcntl(rs, F_SETFL, O_NONBLOCK); + + if (use_rs) { + /* Inline size based on experimental data */ + if (optimization == opt_latency) { + val = 384; + rs_setsockopt(rs, SOL_RDMA, RDMA_INLINE, &val, sizeof val); + } else if (optimization == opt_bandwidth) { + val = 0; + rs_setsockopt(rs, SOL_RDMA, RDMA_INLINE, &val, sizeof val); + } + } +} + +static int server_listen(void) +{ + struct addrinfo *ai; + int val, ret; + + ai_hints.ai_flags |= AI_PASSIVE; + ret = getaddrinfo(src_addr, port, &ai_hints, &ai); + if (ret) { + perror("getaddrinfo"); + return ret; + } + + lrs = rs_socket(ai->ai_family, SOCK_STREAM, 0); + if (lrs < 0) { + perror("rsocket"); + ret = lrs; + goto free; + } + + val = 1; + ret = rs_setsockopt(lrs, SOL_SOCKET, SO_REUSEADDR, &val, sizeof val); + if (ret) { + perror("rsetsockopt SO_REUSEADDR"); + goto close; + } + + ret = rs_bind(lrs, ai->ai_addr, ai->ai_addrlen); + if (ret) { + perror("rbind"); + goto close; + } + + ret = rs_listen(lrs, 1); + if (ret) + perror("rlisten"); + +close: + if (ret) + rs_close(lrs); +free: + freeaddrinfo(ai); + return ret; +} + +static int server_connect(void) +{ + struct pollfd fds; + int ret = 0; + + set_options(lrs); + do { + if (use_async) { + fds.fd = lrs; + fds.events = POLLIN; + + ret = do_poll(&fds, poll_timeout); + if (ret) { + perror("rpoll"); + return ret; + } + } + + rs = rs_accept(lrs, NULL, 0); + } while (rs < 0 && (errno == EAGAIN || errno == EWOULDBLOCK)); + if (rs < 0) { + perror("raccept"); + return rs; + } + + if (use_fork) + fork_pid = fork(); + if (!fork_pid) + set_options(rs); + return ret; +} + +static int client_connect(void) +{ + struct addrinfo *ai; + struct pollfd fds; + int ret, err; + socklen_t len; + + ret = getaddrinfo(dst_addr, port, &ai_hints, &ai); + if (ret) { + perror("getaddrinfo"); + return ret; + } + + rs = rs_socket(ai->ai_family, SOCK_STREAM, 0); + if (rs < 0) { + perror("rsocket"); + ret = rs; + goto free; + } + + set_options(rs); + /* TODO: bind client to src_addr */ + + ret = rs_connect(rs, ai->ai_addr, ai->ai_addrlen); + if (ret && (errno != EINPROGRESS)) { + perror("rconnect"); + goto close; + } + + if (ret && (errno == EINPROGRESS)) { + fds.fd = rs; + fds.events = POLLOUT; + ret = do_poll(&fds, poll_timeout); + if (ret) + goto close; + + len = sizeof err; + ret = rs_getsockopt(rs, SOL_SOCKET, SO_ERROR, &err, &len); + if (ret) + goto close; + if (err) { + ret = -1; + errno = err; + perror("async rconnect"); + } + } + +close: + if (ret) + rs_close(rs); +free: + freeaddrinfo(ai); + return ret; +} + +static int run(void) +{ + int i, ret = 0; + + buf = malloc(!custom ? 
test_size[TEST_CNT - 1].size : transfer_size); + if (!buf) { + perror("malloc"); + return -1; + } + + if (!dst_addr) { + ret = server_listen(); + if (ret) + goto free; + } + + printf("%-10s%-8s%-8s%-8s%-8s%8s %10s%13s\n", + "name", "bytes", "xfers", "iters", "total", "time", "Gb/sec", "usec/xfer"); + if (!custom) { + optimization = opt_latency; + ret = dst_addr ? client_connect() : server_connect(); + if (ret) + goto free; + + for (i = 0; i < TEST_CNT && !fork_pid; i++) { + if (test_size[i].option > size_option) + continue; + init_latency_test(test_size[i].size); + run_test(); + } + if (fork_pid) + wait(NULL); + else + rs_shutdown(rs, SHUT_RDWR); + rs_close(rs); + + if (!dst_addr && use_fork && !fork_pid) + goto free; + + optimization = opt_bandwidth; + ret = dst_addr ? client_connect() : server_connect(); + if (ret) + goto free; + for (i = 0; i < TEST_CNT && !fork_pid; i++) { + if (test_size[i].option > size_option) + continue; + init_bandwidth_test(test_size[i].size); + run_test(); + } + } else { + ret = dst_addr ? client_connect() : server_connect(); + if (ret) + goto free; + + if (!fork_pid) + ret = run_test(); + } + + if (fork_pid) + wait(NULL); + else + rs_shutdown(rs, SHUT_RDWR); + rs_close(rs); +free: + free(buf); + return ret; +} + +static int set_test_opt(char *optarg) +{ + if (strlen(optarg) == 1) { + switch (optarg[0]) { + case 's': + use_rs = 0; + break; + case 'a': + use_async = 1; + break; + case 'b': + flags = (flags & ~MSG_DONTWAIT) | MSG_WAITALL; + break; + case 'f': + use_fork = 1; + use_rs = 0; + break; + case 'n': + flags |= MSG_DONTWAIT; + break; +// case 'r': +// use_rgai = 1; +// break; + case 'v': + verify = 1; + break; + default: + return -1; + } + } else { + if (!strncasecmp("socket", optarg, 6)) { + use_rs = 0; + } else if (!strncasecmp("async", optarg, 5)) { + use_async = 1; + } else if (!strncasecmp("block", optarg, 5)) { + flags = (flags & ~MSG_DONTWAIT) | MSG_WAITALL; + } else if (!strncasecmp("nonblock", optarg, 8)) { + flags |= MSG_DONTWAIT; +// } else if (strncasecmp("resolve", optarg, 7)) { +// use_rgai = 1; + } else if (!strncasecmp("verify", optarg, 6)) { + verify = 1; + } else if (!strncasecmp("fork", optarg, 4)) { + use_fork = 1; + use_rs = 0; + } else { + return -1; + } + } + return 0; +} + +int main(int argc, char **argv) +{ + int op, ret; + + ai_hints.ai_socktype = SOCK_STREAM; + while ((op = getopt(argc, argv, "s:b:f:B:I:C:S:p:T:")) != -1) { + switch (op) { + case 's': + dst_addr = optarg; + break; + case 'b': + src_addr = optarg; + break; + case 'f': + if (!strncasecmp("ip", optarg, 2)) { + ai_hints.ai_flags = AI_NUMERICHOST; + } + break; + case 'B': + buffer_size = atoi(optarg); + break; + case 'I': + custom = 1; + iterations = atoi(optarg); + break; + case 'C': + custom = 1; + transfer_count = atoi(optarg); + break; + case 'S': + if (!strncasecmp("all", optarg, 3)) { + size_option = 1; + } else { + custom = 1; + transfer_size = atoi(optarg); + } + break; + case 'p': + port = optarg; + break; + case 'T': + if (!set_test_opt(optarg)) + break; + /* invalid option - fall through */ + default: + printf("usage: %s\n", argv[0]); + printf("\t[-s server_address]\n"); + printf("\t[-b bind_address]\n"); + printf("\t[-f address_format]\n"); + printf("\t name, ip, ipv6, or gid\n"); + printf("\t[-B buffer_size]\n"); + printf("\t[-I iterations]\n"); + printf("\t[-C transfer_count]\n"); + printf("\t[-S transfer_size or all]\n"); + printf("\t[-p port_number]\n"); + printf("\t[-T test_option]\n"); + printf("\t s|sockets - use standard tcp/ip sockets\n"); + 
printf("\t a|async - asynchronous operation (use poll)\n"); + printf("\t b|blocking - use blocking calls\n"); + printf("\t f|fork - fork server processing\n"); + printf("\t n|nonblocking - use nonblocking calls\n"); + printf("\t r|resolve - use rdma cm to resolve address\n"); + printf("\t v|verify - verify data\n"); + exit(1); + } + } + + if (!(flags & MSG_DONTWAIT)) + poll_timeout = -1; + + ret = run(); + return ret; +} diff --git a/prov/rdmacm/examples/udpong.c b/prov/rdmacm/examples/udpong.c new file mode 100644 index 00000000000..af8deb9ee8b --- /dev/null +++ b/prov/rdmacm/examples/udpong.c @@ -0,0 +1,568 @@ +/* + * Copyright (c) 2012 Intel Corporation. All rights reserved. + * + * This software is available to you under the OpenIB.org BSD license + * below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AWV + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <strings.h> +#include <errno.h> +#include <getopt.h> +#include <sys/types.h> +#include <sys/socket.h> +#include <sys/time.h> +#include <sys/wait.h> +#include <netdb.h> +#include <fcntl.h> +#include <unistd.h> +#include <netinet/in.h> +#include <netinet/tcp.h> +#include <arpa/inet.h> + +#include <rdma/rdma_cma.h> +#include <rdma/rsocket.h> +#include "common.h" + +static int test_size[] = { + (1 << 6), + (1 << 7), ((1 << 7) + (1 << 6)), + (1 << 8), ((1 << 8) + (1 << 7)), + (1 << 9), ((1 << 9) + (1 << 8)), + (1 << 10), ((1 << 10) + (1 << 9)), +}; +#define TEST_CNT (sizeof test_size / sizeof test_size[0]) + +enum { + msg_op_login, + msg_op_start, + msg_op_data, + msg_op_echo, + msg_op_end +}; + +struct message { + uint8_t op; + uint8_t id; + uint8_t seqno; + uint8_t reserved; + uint32_t data; + uint8_t buf[2048]; +}; + +#define CTRL_MSG_SIZE 16 + +struct client { + uint64_t recvcnt; +}; + +static struct client clients[256]; +static uint8_t id; + +static int rs; +static int use_async; +static int flags = MSG_DONTWAIT; +static int poll_timeout; +static int custom; +static int echo; +static int transfer_size = 1000; +static int transfer_count = 1000; +static int buffer_size; +static char test_name[10] = "custom"; +static char *port = "7174"; +static char *dst_addr; +static char *src_addr; +static union socket_addr addr; +static socklen_t addrlen; +static struct timeval start, end; +static struct message msg; + +static void show_perf(void) +{ + char str[32]; + float usec; + long long bytes; + int transfers; + + usec = (end.tv_sec - start.tv_sec) * 1000000 + (end.tv_usec - start.tv_usec); + transfers = echo ? 
transfer_count * 2 : ntohl(msg.data); + bytes = (long long) transfers * transfer_size; + + /* name size transfers bytes seconds Gb/sec usec/xfer */ + printf("%-10s", test_name); + size_str(str, sizeof str, transfer_size); + printf("%-8s", str); + cnt_str(str, sizeof str, transfers); + printf("%-8s", str); + size_str(str, sizeof str, bytes); + printf("%-8s", str); + printf("%8.2fs%10.2f%11.2f\n", + usec / 1000000., (bytes * 8) / (1000. * usec), + (usec / transfers)); +} + +static void init_latency_test(int size) +{ + char sstr[5]; + + size_str(sstr, sizeof sstr, size); + snprintf(test_name, sizeof test_name, "%s_lat", sstr); + transfer_size = size; + transfer_count = size_to_count(transfer_size) / 10; + echo = 1; +} + +static void init_bandwidth_test(int size) +{ + char sstr[5]; + + size_str(sstr, sizeof sstr, size); + snprintf(test_name, sizeof test_name, "%s_bw", sstr); + transfer_size = size; + transfer_count = size_to_count(transfer_size); + echo = 0; +} + +static void set_options(int rs) +{ + int val; + + if (buffer_size) { + rs_setsockopt(rs, SOL_SOCKET, SO_SNDBUF, (void *) &buffer_size, + sizeof buffer_size); + rs_setsockopt(rs, SOL_SOCKET, SO_RCVBUF, (void *) &buffer_size, + sizeof buffer_size); + } else { + val = 1 << 19; + rs_setsockopt(rs, SOL_SOCKET, SO_SNDBUF, (void *) &val, sizeof val); + rs_setsockopt(rs, SOL_SOCKET, SO_RCVBUF, (void *) &val, sizeof val); + } + + if (flags & MSG_DONTWAIT) + rs_fcntl(rs, F_SETFL, O_NONBLOCK); +} + +static ssize_t svr_send(struct message *msg, size_t size, + union socket_addr *addr, socklen_t addrlen) +{ + struct pollfd fds; + ssize_t ret; + + if (use_async) { + fds.fd = rs; + fds.events = POLLOUT; + } + + do { + if (use_async) { + ret = do_poll(&fds, poll_timeout); + if (ret) + return ret; + } + + ret = rs_sendto(rs, msg, size, flags, &addr->sa, addrlen); + } while (ret < 0 && (errno == EWOULDBLOCK || errno == EAGAIN)); + + if (ret < 0) + perror("rsend"); + + return ret; +} + +static ssize_t svr_recv(struct message *msg, size_t size, + union socket_addr *addr, socklen_t *addrlen) +{ + struct pollfd fds; + ssize_t ret; + + if (use_async) { + fds.fd = rs; + fds.events = POLLIN; + } + + do { + if (use_async) { + ret = do_poll(&fds, poll_timeout); + if (ret) + return ret; + } + + ret = rs_recvfrom(rs, msg, size, flags, &addr->sa, addrlen); + } while (ret < 0 && (errno == EWOULDBLOCK || errno == EAGAIN)); + + if (ret < 0) + perror("rrecv"); + + return ret; +} + +static int svr_process(struct message *msg, size_t size, + union socket_addr *addr, socklen_t addrlen) +{ + char str[64]; + ssize_t ret; + + switch (msg->op) { + case msg_op_login: + if (addr->sa.sa_family == AF_INET) { + printf("client login from %s\n", + inet_ntop(AF_INET, &addr->sin.sin_addr.s_addr, + str, sizeof str)); + } else { + printf("client login from %s\n", + inet_ntop(AF_INET6, &addr->sin6.sin6_addr.s6_addr, + str, sizeof str)); + } + msg->id = id++; + /* fall through */ + case msg_op_start: + memset(&clients[msg->id], 0, sizeof clients[msg->id]); + break; + case msg_op_echo: + clients[msg->id].recvcnt++; + break; + case msg_op_end: + msg->data = htonl(clients[msg->id].recvcnt); + break; + default: + clients[msg->id].recvcnt++; + return 0; + } + + ret = svr_send(msg, size, addr, addrlen); + return (ret == size) ? 
0 : (int) ret; +} + +static int svr_bind(void) +{ + struct addrinfo hints, *res; + int ret; + + memset(&hints, 0, sizeof hints); + hints.ai_socktype = SOCK_DGRAM; + ret = getaddrinfo(src_addr, port, &hints, &res); + if (ret) { + perror("getaddrinfo"); + return ret; + } + + rs = rs_socket(res->ai_family, res->ai_socktype, res->ai_protocol); + if (rs < 0) { + perror("rsocket"); + ret = rs; + goto out; + } + + set_options(rs); + ret = rs_bind(rs, res->ai_addr, res->ai_addrlen); + if (ret) { + perror("rbind"); + rs_close(rs); + } + +out: + free(res); + return ret; +} + +static int svr_run(void) +{ + size_t len; + int ret; + + ret = svr_bind(); + while (!ret) { + addrlen = sizeof addr; + len = svr_recv(&msg, sizeof msg, &addr, &addrlen); + if (len < 0) + return len; + + ret = svr_process(&msg, len, &addr, addrlen); + } + return ret; +} + +static ssize_t client_send(struct message *msg, size_t size) +{ + struct pollfd fds; + int ret; + + if (use_async) { + fds.fd = rs; + fds.events = POLLOUT; + } + + do { + if (use_async) { + ret = do_poll(&fds, poll_timeout); + if (ret) + return ret; + } + + ret = rs_send(rs, msg, size, flags); + } while (ret < 0 && (errno == EWOULDBLOCK || errno == EAGAIN)); + + if (ret < 0) + perror("rsend"); + + return ret; +} + +static ssize_t client_recv(struct message *msg, size_t size, int timeout) +{ + struct pollfd fds; + int ret; + + if (timeout) { + fds.fd = rs; + fds.events = POLLIN; + + ret = rs_poll(&fds, 1, timeout); + if (ret <= 0) + return ret; + } + + ret = rs_recv(rs, msg, size, flags | MSG_DONTWAIT); + if (ret < 0 && (errno == EWOULDBLOCK || errno == EAGAIN)) + perror("rrecv"); + + return ret; +} + +static int client_send_recv(struct message *msg, size_t size, int timeout) +{ + static uint8_t seqno; + int ret; + + msg->seqno = seqno; + do { + ret = client_send(msg, size); + if (ret != size) + return ret; + + ret = client_recv(msg, size, timeout); + } while (ret <= 0 || msg->seqno != seqno); + + seqno++; + return ret; +} + +static int run_test(void) +{ + int ret, i; + + msg.op = msg_op_start; + ret = client_send_recv(&msg, CTRL_MSG_SIZE, 1000); + if (ret != CTRL_MSG_SIZE) + goto out; + + msg.op = echo ? msg_op_echo : msg_op_data; + gettimeofday(&start, NULL); + for (i = 0; i < transfer_count; i++) { + ret = echo ? 
client_send_recv(&msg, transfer_size, 1) : + client_send(&msg, transfer_size); + if (ret != transfer_size) + goto out; + } + + msg.op = msg_op_end; + ret = client_send_recv(&msg, CTRL_MSG_SIZE, 1); + if (ret != CTRL_MSG_SIZE) + goto out; + + gettimeofday(&end, NULL); + show_perf(); + ret = 0; + +out: + return ret; +} + +static int client_connect(void) +{ + struct addrinfo hints, *res; + int ret; + + memset(&hints, 0, sizeof hints); + hints.ai_socktype = SOCK_DGRAM; + ret = getaddrinfo(dst_addr, port, &hints, &res); + if (ret) { + perror("getaddrinfo"); + return ret; + } + + rs = rs_socket(res->ai_family, res->ai_socktype, res->ai_protocol); + if (rs < 0) { + perror("rsocket"); + ret = rs; + goto out; + } + + set_options(rs); + ret = rs_connect(rs, res->ai_addr, res->ai_addrlen); + if (ret) { + perror("rconnect"); + rs_close(rs); + } + + msg.op = msg_op_login; + ret = client_send_recv(&msg, CTRL_MSG_SIZE, 1000); + if (ret == CTRL_MSG_SIZE) + ret = 0; + +out: + freeaddrinfo(res); + return ret; +} + +static int client_run(void) +{ + int i, ret; + + printf("%-10s%-8s%-8s%-8s%8s %10s%13s\n", + "name", "bytes", "xfers", "total", "time", "Gb/sec", "usec/xfer"); + + ret = client_connect(); + if (ret) + return ret; + + if (!custom) { + for (i = 0; i < TEST_CNT; i++) { + init_latency_test(test_size[i]); + run_test(); + } + for (i = 0; i < TEST_CNT; i++) { + init_bandwidth_test(test_size[i]); + run_test(); + } + } else { + run_test(); + } + rs_close(rs); + + return ret; +} + +static int set_test_opt(char *optarg) +{ + if (strlen(optarg) == 1) { + switch (optarg[0]) { + case 's': + use_rs = 0; + break; + case 'a': + use_async = 1; + break; + case 'b': + flags = 0; + break; + case 'n': + flags = MSG_DONTWAIT; + break; + case 'e': + echo = 1; + break; + default: + return -1; + } + } else { + if (!strncasecmp("socket", optarg, 6)) { + use_rs = 0; + } else if (!strncasecmp("async", optarg, 5)) { + use_async = 1; + } else if (!strncasecmp("block", optarg, 5)) { + flags = 0; + } else if (!strncasecmp("nonblock", optarg, 8)) { + flags = MSG_DONTWAIT; + } else if (!strncasecmp("echo", optarg, 4)) { + echo = 1; + } else { + return -1; + } + } + return 0; +} + +int main(int argc, char **argv) +{ + int op, ret; + + while ((op = getopt(argc, argv, "s:b:B:C:S:p:T:")) != -1) { + switch (op) { + case 's': + dst_addr = optarg; + break; + case 'b': + src_addr = optarg; + break; + case 'B': + buffer_size = atoi(optarg); + break; + case 'C': + custom = 1; + transfer_count = atoi(optarg); + break; + case 'S': + custom = 1; + transfer_size = atoi(optarg); + if (transfer_size < CTRL_MSG_SIZE) { + printf("size must be at least %d bytes\n", + CTRL_MSG_SIZE); + exit(1); + } + break; + case 'p': + port = optarg; + break; + case 'T': + if (!set_test_opt(optarg)) + break; + /* invalid option - fall through */ + default: + printf("usage: %s\n", argv[0]); + printf("\t[-s server_address]\n"); + printf("\t[-b bind_address]\n"); + printf("\t[-B buffer_size]\n"); + printf("\t[-C transfer_count]\n"); + printf("\t[-S transfer_size]\n"); + printf("\t[-p port_number]\n"); + printf("\t[-T test_option]\n"); + printf("\t s|sockets - use standard tcp/ip sockets\n"); + printf("\t a|async - asynchronous operation (use poll)\n"); + printf("\t b|blocking - use blocking calls\n"); + printf("\t n|nonblocking - use nonblocking calls\n"); + printf("\t e|echo - server echoes all messages\n"); + exit(1); + } + } + + if (flags) + poll_timeout = -1; + + ret = dst_addr ? 
client_run() : svr_run(); + return ret; +} diff --git a/prov/rdmacm/include/rdma/rdma_cma.h b/prov/rdmacm/include/rdma/rdma_cma.h new file mode 100644 index 00000000000..a5a9150c618 --- /dev/null +++ b/prov/rdmacm/include/rdma/rdma_cma.h @@ -0,0 +1,684 @@ +/* + * Copyright (c) 2005 Voltaire Inc. All rights reserved. + * Copyright (c) 2005-2012 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#if !defined(RDMA_CMA_H) +#define RDMA_CMA_H + +#include <netinet/in.h> +#include <sys/socket.h> +#include <infiniband/verbs.h> +#include <rdma/fi_ucma.h> + + +#ifdef __cplusplus +extern "C" { +#endif + + +/* + * Upon receiving a device removal event, users must destroy the associated + * RDMA identifier and release all resources allocated with the device. + */ +enum rdma_cm_event_type { + RDMA_CM_EVENT_ADDR_RESOLVED, + RDMA_CM_EVENT_ADDR_ERROR, + RDMA_CM_EVENT_ROUTE_RESOLVED, + RDMA_CM_EVENT_ROUTE_ERROR, + RDMA_CM_EVENT_CONNECT_REQUEST, + RDMA_CM_EVENT_CONNECT_RESPONSE, + RDMA_CM_EVENT_CONNECT_ERROR, + RDMA_CM_EVENT_UNREACHABLE, + RDMA_CM_EVENT_REJECTED, + RDMA_CM_EVENT_ESTABLISHED, + RDMA_CM_EVENT_DISCONNECTED, + RDMA_CM_EVENT_DEVICE_REMOVAL, + RDMA_CM_EVENT_MULTICAST_JOIN, + RDMA_CM_EVENT_MULTICAST_ERROR, + RDMA_CM_EVENT_ADDR_CHANGE, + RDMA_CM_EVENT_TIMEWAIT_EXIT +}; + +enum rdma_port_space { + RDMA_PS_IPOIB = 0x0002, + RDMA_PS_TCP = 0x0106, + RDMA_PS_UDP = 0x0111, + RDMA_PS_IB = 0x013F, +}; + +#define RDMA_IB_IP_PS_MASK 0xFFFFFFFFFFFF0000ULL +#define RDMA_IB_IP_PORT_MASK 0x000000000000FFFFULL +#define RDMA_IB_IP_PS_TCP 0x0000000001060000ULL +#define RDMA_IB_IP_PS_UDP 0x0000000001110000ULL +#define RDMA_IB_PS_IB 0x00000000013F0000ULL + +/* + * Global qkey value for UDP QPs and multicast groups created via the + * RDMA CM. 
+ */ +#define RDMA_UDP_QKEY 0x01234567 + +struct rdma_ib_addr { + union ibv_gid sgid; + union ibv_gid dgid; + uint16_t pkey; +}; + +struct rdma_addr { + union { + struct sockaddr src_addr; + struct sockaddr_in src_sin; + struct sockaddr_in6 src_sin6; + struct sockaddr_storage src_storage; + }; + union { + struct sockaddr dst_addr; + struct sockaddr_in dst_sin; + struct sockaddr_in6 dst_sin6; + struct sockaddr_storage dst_storage; + }; + union { + struct rdma_ib_addr ibaddr; + } addr; +}; + +struct rdma_route { + struct rdma_addr addr; + struct ibv_sa_path_rec *path_rec; + int num_paths; +}; + +struct rdma_event_channel { + int fd; + fid_t fid; +}; + +struct rdma_cm_id { + struct ibv_context *verbs; + struct rdma_event_channel *channel; + void *context; + struct ibv_qp *qp; + struct rdma_route route; + enum rdma_port_space ps; + uint8_t port_num; + struct rdma_cm_event *event; + struct ibv_comp_channel *send_cq_channel; + struct ibv_cq *send_cq; + struct ibv_comp_channel *recv_cq_channel; + struct ibv_cq *recv_cq; + struct ibv_srq *srq; + struct ibv_pd *pd; + enum ibv_qp_type qp_type; +}; + +enum { + RDMA_MAX_RESP_RES = 0xFF, + RDMA_MAX_INIT_DEPTH = 0xFF +}; + +struct rdma_conn_param { + const void *private_data; + uint8_t private_data_len; + uint8_t responder_resources; + uint8_t initiator_depth; + uint8_t flow_control; + uint8_t retry_count; /* ignored when accepting */ + uint8_t rnr_retry_count; + /* Fields below ignored if a QP is created on the rdma_cm_id. */ + uint8_t srq; + uint32_t qp_num; +}; + +struct rdma_ud_param { + const void *private_data; + uint8_t private_data_len; + struct ibv_ah_attr ah_attr; + uint32_t qp_num; + uint32_t qkey; +}; + +struct rdma_cm_event { + struct rdma_cm_id *id; + struct rdma_cm_id *listen_id; + enum rdma_cm_event_type event; + int status; + union { + struct rdma_conn_param conn; + struct rdma_ud_param ud; + } param; +}; + +#define RAI_PASSIVE 0x00000001 +#define RAI_NUMERICHOST 0x00000002 +#define RAI_NOROUTE 0x00000004 +#define RAI_FAMILY 0x00000008 + +struct rdma_addrinfo { + int ai_flags; + int ai_family; + int ai_qp_type; + int ai_port_space; + socklen_t ai_src_len; + socklen_t ai_dst_len; + struct sockaddr *ai_src_addr; + struct sockaddr *ai_dst_addr; + char *ai_src_canonname; + char *ai_dst_canonname; + size_t ai_route_len; + void *ai_route; + size_t ai_connect_len; + void *ai_connect; + struct rdma_addrinfo *ai_next; +}; + +/** + * rdma_create_event_channel - Open a channel used to report communication events. + * Description: + * Asynchronous events are reported to users through event channels. Each + * event channel maps to a file descriptor. + * Notes: + * All created event channels must be destroyed by calling + * rdma_destroy_event_channel. Users should call rdma_get_cm_event to + * retrieve events on an event channel. + * See also: + * rdma_get_cm_event, rdma_destroy_event_channel + */ +struct rdma_event_channel *rdma_create_event_channel(void); + +/** + * rdma_destroy_event_channel - Close an event communication channel. + * @channel: The communication channel to destroy. + * Description: + * Release all resources associated with an event channel and closes the + * associated file descriptor. + * Notes: + * All rdma_cm_id's associated with the event channel must be destroyed, + * and all returned events must be acked before calling this function. 
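 + * A minimal sketch of the expected ordering (illustrative only; the calls
 + * shown are the ones declared in this header):
 + *   channel = rdma_create_event_channel();
 + *   rdma_create_id(channel, &id, NULL, RDMA_PS_TCP);
 + *   ...connect, transfer, and ack any events from rdma_get_cm_event()...
 + *   rdma_destroy_id(id);
 + *   rdma_destroy_event_channel(channel);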
+ * See also: + * rdma_create_event_channel, rdma_get_cm_event, rdma_ack_cm_event + */ +void rdma_destroy_event_channel(struct rdma_event_channel *channel); + +/** + * rdma_create_id - Allocate a communication identifier. + * @channel: The communication channel that events associated with the + * allocated rdma_cm_id will be reported on. + * @id: A reference where the allocated communication identifier will be + * returned. + * @context: User specified context associated with the rdma_cm_id. + * @ps: RDMA port space. + * Description: + * Creates an identifier that is used to track communication information. + * Notes: + * Rdma_cm_id's are conceptually equivalent to a socket for RDMA + * communication. The difference is that RDMA communication requires + * explicitly binding to a specified RDMA device before communication + * can occur, and most operations are asynchronous in nature. Communication + * events on an rdma_cm_id are reported through the associated event + * channel. Users must release the rdma_cm_id by calling rdma_destroy_id. + * See also: + * rdma_create_event_channel, rdma_destroy_id, rdma_get_devices, + * rdma_bind_addr, rdma_resolve_addr, rdma_connect, rdma_listen, + */ +int rdma_create_id(struct rdma_event_channel *channel, + struct rdma_cm_id **id, void *context, + enum rdma_port_space ps); + +/** + * rdma_create_ep - Allocate a communication identifier and qp. + * @id: A reference where the allocated communication identifier will be + * returned. + * @res: Result from rdma_getaddrinfo, which specifies the source and + * destination addresses, plus optional routing and connection information. + * @pd: Optional protection domain. This parameter is ignored if qp_init_attr + * is NULL. + * @qp_init_attr: Optional attributes for a QP created on the rdma_cm_id. + * Description: + * Create an identifier and option QP used for communication. + * Notes: + * If qp_init_attr is provided, then a queue pair will be allocated and + * associated with the rdma_cm_id. If a pd is provided, the QP will be + * created on that PD. Otherwise, the QP will be allocated on a default + * PD. + * The rdma_cm_id will be set to use synchronous operations (connect, + * listen, and get_request). To convert to asynchronous operation, the + * rdma_cm_id should be migrated to a user allocated event channel. + * See also: + * rdma_create_id, rdma_create_qp, rdma_migrate_id, rdma_connect, + * rdma_listen + */ +int rdma_create_ep(struct rdma_cm_id **id, struct rdma_addrinfo *res, + struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr); + +/** + * rdma_destroy_ep - Deallocates a communication identifier and qp. + * @id: The communication identifer to destroy. + * Description: + * Destroys the specified rdma_cm_id and any associated QP created + * on that id. + * See also: + * rdma_create_ep + */ +void rdma_destroy_ep(struct rdma_cm_id *id); + +/** + * rdma_destroy_id - Release a communication identifier. + * @id: The communication identifier to destroy. + * Description: + * Destroys the specified rdma_cm_id and cancels any outstanding + * asynchronous operation. + * Notes: + * Users must free any associated QP with the rdma_cm_id before + * calling this routine and ack an related events. + * See also: + * rdma_create_id, rdma_destroy_qp, rdma_ack_cm_event + */ +int rdma_destroy_id(struct rdma_cm_id *id); + +/** + * rdma_bind_addr - Bind an RDMA identifier to a source address. + * @id: RDMA identifier. + * @addr: Local address information. Wildcard values are permitted. 
+ * Description: + * Associates a source address with an rdma_cm_id. The address may be + * wildcarded. If binding to a specific local address, the rdma_cm_id + * will also be bound to a local RDMA device. + * Notes: + * Typically, this routine is called before calling rdma_listen to bind + * to a specific port number, but it may also be called on the active side + * of a connection before calling rdma_resolve_addr to bind to a specific + * address. + * See also: + * rdma_create_id, rdma_listen, rdma_resolve_addr, rdma_create_qp + */ +int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr); + +/** + * rdma_resolve_addr - Resolve destination and optional source addresses. + * @id: RDMA identifier. + * @src_addr: Source address information. This parameter may be NULL. + * @dst_addr: Destination address information. + * @timeout_ms: Time to wait for resolution to complete. + * Description: + * Resolve destination and optional source addresses from IP addresses + * to an RDMA address. If successful, the specified rdma_cm_id will + * be bound to a local device. + * Notes: + * This call is used to map a given destination IP address to a usable RDMA + * address. If a source address is given, the rdma_cm_id is bound to that + * address, the same as if rdma_bind_addr were called. If no source + * address is given, and the rdma_cm_id has not yet been bound to a device, + * then the rdma_cm_id will be bound to a source address based on the + * local routing tables. After this call, the rdma_cm_id will be bound to + * an RDMA device. This call is typically made from the active side of a + * connection before calling rdma_resolve_route and rdma_connect. + * See also: + * rdma_create_id, rdma_resolve_route, rdma_connect, rdma_create_qp, + * rdma_get_cm_event, rdma_bind_addr + */ +int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, + struct sockaddr *dst_addr, int timeout_ms); + +/** + * rdma_resolve_route - Resolve the route information needed to establish a connection. + * @id: RDMA identifier. + * @timeout_ms: Time to wait for resolution to complete. + * Description: + * Resolves an RDMA route to the destination address in order to establish + * a connection. The destination address must have already been resolved + * by calling rdma_resolve_addr. + * Notes: + * This is called on the client side of a connection after calling + * rdma_resolve_addr, but before calling rdma_connect. + * See also: + * rdma_resolve_addr, rdma_connect, rdma_get_cm_event + */ +int rdma_resolve_route(struct rdma_cm_id *id, int timeout_ms); + +/** + * rdma_create_qp - Allocate a QP. + * @id: RDMA identifier. + * @pd: Optional protection domain for the QP. + * @qp_init_attr: initial QP attributes. + * Description: + * Allocate a QP associated with the specified rdma_cm_id and transition it + * for sending and receiving. + * Notes: + * The rdma_cm_id must be bound to a local RDMA device before calling this + * function, and the protection domain must be for that same device. + * QPs allocated to an rdma_cm_id are automatically transitioned by the + * librdmacm through their states. After being allocated, the QP will be + * ready to handle posting of receives. If the QP is unconnected, it will + * be ready to post sends. + * If pd is NULL, then the QP will be allocated using a default protection + * domain associated with the underlying RDMA device. 
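 + * A minimal sketch (the attribute values are illustrative, not recommended
 + * defaults):
 + *   struct ibv_qp_init_attr attr;
 + *   memset(&attr, 0, sizeof attr);
 + *   attr.cap.max_send_wr = attr.cap.max_recv_wr = 16;
 + *   attr.cap.max_send_sge = attr.cap.max_recv_sge = 1;
 + *   attr.qp_type = IBV_QPT_RC;
 + *   if (rdma_create_qp(id, NULL, &attr))
 + *           perror("rdma_create_qp");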
+ * See also: + * rdma_bind_addr, rdma_resolve_addr, rdma_destroy_qp, ibv_create_qp, + * ibv_modify_qp + */ +int rdma_create_qp(struct rdma_cm_id *id, struct ibv_pd *pd, + struct ibv_qp_init_attr *qp_init_attr); + +/** + * rdma_destroy_qp - Deallocate a QP. + * @id: RDMA identifier. + * Description: + * Destroy a QP allocated on the rdma_cm_id. + * Notes: + * Users must destroy any QP associated with an rdma_cm_id before + * destroying the ID. + * See also: + * rdma_create_qp, rdma_destroy_id, ibv_destroy_qp + */ +void rdma_destroy_qp(struct rdma_cm_id *id); + +/** + * rdma_connect - Initiate an active connection request. + * @id: RDMA identifier. + * @conn_param: optional connection parameters. + * Description: + * For a connected rdma_cm_id, this call initiates a connection request + * to a remote destination. For an unconnected rdma_cm_id, it initiates + * a lookup of the remote QP providing the datagram service. + * Notes: + * Users must have resolved a route to the destination address + * by having called rdma_resolve_route before calling this routine. + * A user may override the default connection parameters and exchange + * private data as part of the connection by using the conn_param parameter. + * See also: + * rdma_resolve_route, rdma_disconnect, rdma_listen, rdma_get_cm_event + */ +int rdma_connect(struct rdma_cm_id *id, struct rdma_conn_param *conn_param); + +/** + * rdma_listen - Listen for incoming connection requests. + * @id: RDMA identifier. + * @backlog: backlog of incoming connection requests. + * Description: + * Initiates a listen for incoming connection requests or datagram service + * lookup. The listen will be restricted to the locally bound source + * address. + * Notes: + * Users must have bound the rdma_cm_id to a local address by calling + * rdma_bind_addr before calling this routine. If the rdma_cm_id is + * bound to a specific IP address, the listen will be restricted to that + * address and the associated RDMA device. If the rdma_cm_id is bound + * to an RDMA port number only, the listen will occur across all RDMA + * devices. + * See also: + * rdma_bind_addr, rdma_connect, rdma_accept, rdma_reject, rdma_get_cm_event + */ +int rdma_listen(struct rdma_cm_id *id, int backlog); + +/** + * rdma_get_request + */ +int rdma_get_request(struct rdma_cm_id *listen, struct rdma_cm_id **id); + +/** + * rdma_accept - Called to accept a connection request. + * @id: Connection identifier associated with the request. + * @conn_param: Optional information needed to establish the connection. + * Description: + * Called from the listening side to accept a connection or datagram + * service lookup request. + * Notes: + * Unlike the socket accept routine, rdma_accept is not called on a + * listening rdma_cm_id. Instead, after calling rdma_listen, the user + * waits for a connection request event to occur. Connection request + * events give the user a newly created rdma_cm_id, similar to a new + * socket, but the rdma_cm_id is bound to a specific RDMA device. + * rdma_accept is called on the new rdma_cm_id. + * A user may override the default connection parameters and exchange + * private data as part of the connection by using the conn_param parameter. + * See also: + * rdma_listen, rdma_reject, rdma_get_cm_event + */ +int rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param); + +/** + * rdma_reject - Called to reject a connection request. + * @id: Connection identifier associated with the request. 
+ * @private_data: Optional private data to send with the reject message. + * @private_data_len: Size of the private_data to send, in bytes. + * Description: + * Called from the listening side to reject a connection or datagram + * service lookup request. + * Notes: + * After receiving a connection request event, a user may call rdma_reject + * to reject the request. If the underlying RDMA transport supports + * private data in the reject message, the specified data will be passed to + * the remote side. + * See also: + * rdma_listen, rdma_accept, rdma_get_cm_event + */ +int rdma_reject(struct rdma_cm_id *id, const void *private_data, + uint8_t private_data_len); + +/** + * rdma_notify - Notifies the librdmacm of an asynchronous event. + * @id: RDMA identifier. + * @event: Asynchronous event. + * Description: + * Used to notify the librdmacm of asynchronous events that have occurred + * on a QP associated with the rdma_cm_id. + * Notes: + * Asynchronous events that occur on a QP are reported through the user's + * device event handler. This routine is used to notify the librdmacm of + * communication events. In most cases, use of this routine is not + * necessary, however if connection establishment is done out of band + * (such as done through Infiniband), it's possible to receive data on a + * QP that is not yet considered connected. This routine forces the + * connection into an established state in this case in order to handle + * the rare situation where the connection never forms on its own. + * Events that should be reported to the CM are: IB_EVENT_COMM_EST. + * See also: + * rdma_connect, rdma_accept, rdma_listen + */ +int rdma_notify(struct rdma_cm_id *id, enum ibv_event_type event); + +/** + * rdma_disconnect - This function disconnects a connection. + * @id: RDMA identifier. + * Description: + * Disconnects a connection and transitions any associated QP to the + * error state. + * See also: + * rdma_connect, rdma_listen, rdma_accept + */ +int rdma_disconnect(struct rdma_cm_id *id); + +/** + * rdma_join_multicast - Joins a multicast group. + * @id: Communication identifier associated with the request. + * @addr: Multicast address identifying the group to join. + * @context: User-defined context associated with the join request. + * Description: + * Joins a multicast group and attaches an associated QP to the group. + * Notes: + * Before joining a multicast group, the rdma_cm_id must be bound to + * an RDMA device by calling rdma_bind_addr or rdma_resolve_addr. Use of + * rdma_resolve_addr requires the local routing tables to resolve the + * multicast address to an RDMA device. The user must call + * rdma_leave_multicast to leave the multicast group and release any + * multicast resources. The context is returned to the user through + * the private_data field in the rdma_cm_event. + * See also: + * rdma_leave_multicast, rdma_bind_addr, rdma_resolve_addr, rdma_create_qp + */ +int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr, + void *context); + +/** + * rdma_leave_multicast - Leaves a multicast group. + * @id: Communication identifier associated with the request. + * @addr: Multicast address identifying the group to leave. + * Description: + * Leaves a multicast group and detaches an associated QP from the group. + * Notes: + * Calling this function before a group has been fully joined results in + * canceling the join operation. 
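 + * A typical join/leave sequence might look like this (sketch; mcast_addr is
 + * an illustrative sockaddr resolved by the caller):
 + *   rdma_join_multicast(id, mcast_addr, NULL);
 + *   ...wait for RDMA_CM_EVENT_MULTICAST_JOIN, exchange datagrams...
 + *   rdma_leave_multicast(id, mcast_addr);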
Users should be aware that messages + * received from the multicast group may still be queued for + * completion processing immediately after leaving a multicast group. + * Destroying an rdma_cm_id will automatically leave all multicast groups. + * See also: + * rdma_join_multicast, rdma_destroy_qp + */ +int rdma_leave_multicast(struct rdma_cm_id *id, struct sockaddr *addr); + +/** + * rdma_get_cm_event - Retrieves the next pending communication event. + * @channel: Event channel to check for events. + * @event: Allocated information about the next communication event. + * Description: + * Retrieves a communication event. If no events are pending, by default, + * the call will block until an event is received. + * Notes: + * The default synchronous behavior of this routine can be changed by + * modifying the file descriptor associated with the given channel. All + * events that are reported must be acknowledged by calling rdma_ack_cm_event. + * Destruction of an rdma_cm_id will block until related events have been + * acknowledged. + * See also: + * rdma_ack_cm_event, rdma_create_event_channel, rdma_event_str + */ +int rdma_get_cm_event(struct rdma_event_channel *channel, + struct rdma_cm_event **event); + +/** + * rdma_ack_cm_event - Free a communication event. + * @event: Event to be released. + * Description: + * All events which are allocated by rdma_get_cm_event must be released; + * there should be a one-to-one correspondence between successful gets + * and acks. + * See also: + * rdma_get_cm_event, rdma_destroy_id + */ +int rdma_ack_cm_event(struct rdma_cm_event *event); + +uint16_t rdma_get_src_port(struct rdma_cm_id *id); +uint16_t rdma_get_dst_port(struct rdma_cm_id *id); + +static inline struct sockaddr *rdma_get_local_addr(struct rdma_cm_id *id) +{ + return &id->route.addr.src_addr; +} + +static inline struct sockaddr *rdma_get_peer_addr(struct rdma_cm_id *id) +{ + return &id->route.addr.dst_addr; +} + +/** + * rdma_get_devices - Get list of RDMA devices currently available. + * @num_devices: If non-NULL, set to the number of devices returned. + * Description: + * Return a NULL-terminated array of opened RDMA devices. Callers can use + * this routine to allocate resources on specific RDMA devices that will be + * shared across multiple rdma_cm_id's. + * Notes: + * The returned array must be released by calling rdma_free_devices. Devices + * remain open while the librdmacm is loaded. + * See also: + * rdma_free_devices + */ +struct ibv_context **rdma_get_devices(int *num_devices); + +/** + * rdma_free_devices - Frees the list of devices returned by rdma_get_devices. + * @list: List of devices returned from rdma_get_devices. + * Description: + * Frees the device array returned by rdma_get_devices. + * See also: + * rdma_get_devices + */ +void rdma_free_devices(struct ibv_context **list); + +/** + * rdma_event_str - Returns a string representation of an rdma cm event. + * @event: Asynchronous event. + * Description: + * Returns a string representation of an asynchronous event. + * See also: + * rdma_get_cm_event + */ +const char *rdma_event_str(enum rdma_cm_event_type event); + +/* Option levels */ +enum { + RDMA_OPTION_ID = 0, + RDMA_OPTION_IB = 1 +}; + +/* Option details */ +enum { + RDMA_OPTION_ID_TOS = 0, /* uint8_t: RFC 2474 */ + RDMA_OPTION_ID_REUSEADDR = 1, /* int: ~SO_REUSEADDR */ + RDMA_OPTION_ID_AFONLY = 2, /* int: ~IPV6_V6ONLY */ + RDMA_OPTION_IB_PATH = 1 /* struct ibv_path_data[] */ +}; + +/** + * rdma_set_option - Set options for an rdma_cm_id.
+ * @id: Communication identifier to set option for. + * @level: Protocol level of the option to set. + * @optname: Name of the option to set. + * @optval: Reference to the option data. + * @optlen: The size of the %optval buffer. + */ +int rdma_set_option(struct rdma_cm_id *id, int level, int optname, + void *optval, size_t optlen); + +/** + * rdma_migrate_id - Move an rdma_cm_id to a new event channel. + * @id: Communication identifier to migrate. + * @channel: New event channel for rdma_cm_id events. + */ +int rdma_migrate_id(struct rdma_cm_id *id, struct rdma_event_channel *channel); + +/** + * rdma_getaddrinfo - RDMA address and route resolution service. + */ +int rdma_getaddrinfo(char *node, char *service, + struct rdma_addrinfo *hints, + struct rdma_addrinfo **res); + +void rdma_freeaddrinfo(struct rdma_addrinfo *res); + +int rdma_addrlen(struct sockaddr *addr); + +#ifdef __cplusplus +} +#endif + +#endif /* RDMA_CMA_H */ diff --git a/prov/rdmacm/include/rdma/rdma_verbs.h b/prov/rdmacm/include/rdma/rdma_verbs.h new file mode 100644 index 00000000000..198c6a595b1 --- /dev/null +++ b/prov/rdmacm/include/rdma/rdma_verbs.h @@ -0,0 +1,316 @@ +/* + * Copyright (c) 2010-2011 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#if !defined(RDMA_VERBS_H) +#define RDMA_VERBS_H + +#include <assert.h> +#include <infiniband/verbs.h> +#include <rdma/rdma_cma.h> +#include <errno.h> + +#ifdef __cplusplus +extern "C" { +#endif + +static inline int rdma_seterrno(int ret) +{ + if (ret) { + errno = ret; + ret = -1; + } + return ret; +} + +/* + * Shared receive queues. + */ +int rdma_create_srq(struct rdma_cm_id *id, struct ibv_pd *pd, + struct ibv_srq_init_attr *attr); + +void rdma_destroy_srq(struct rdma_cm_id *id); + + +/* + * Memory registration helpers. 
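+ *
+ * These inline wrappers call ibv_reg_mr() on the protection domain bound to
+ * the rdma_cm_id (id->pd): rdma_reg_msgs() requests IBV_ACCESS_LOCAL_WRITE
+ * only (send/receive buffers), while rdma_reg_read() and rdma_reg_write()
+ * additionally grant the remote peer read or write access.  rdma_dereg_mr()
+ * releases the registration, mapping any error to -1 with errno set.
+ *
+ * Illustrative sketch, not part of the API contract; the buffer name below
+ * is hypothetical:
+ *
+ *	struct ibv_mr *mr = rdma_reg_msgs(id, buf, sizeof(buf));
+ *	if (!mr)
+ *		return -1;			errno set by ibv_reg_mr()
+ *	rdma_post_recv(id, NULL, buf, sizeof(buf), mr);
+ *	...
+ *	rdma_dereg_mr(mr);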
+ */ +static inline struct ibv_mr * +rdma_reg_msgs(struct rdma_cm_id *id, void *addr, size_t length) +{ + return ibv_reg_mr(id->pd, addr, length, IBV_ACCESS_LOCAL_WRITE); +} + +static inline struct ibv_mr * +rdma_reg_read(struct rdma_cm_id *id, void *addr, size_t length) +{ + return ibv_reg_mr(id->pd, addr, length, IBV_ACCESS_LOCAL_WRITE | + IBV_ACCESS_REMOTE_READ); +} + +static inline struct ibv_mr * +rdma_reg_write(struct rdma_cm_id *id, void *addr, size_t length) +{ + return ibv_reg_mr(id->pd, addr, length, IBV_ACCESS_LOCAL_WRITE | + IBV_ACCESS_REMOTE_WRITE); +} + +static inline int +rdma_dereg_mr(struct ibv_mr *mr) +{ + return rdma_seterrno(ibv_dereg_mr(mr)); +} + + +/* + * Vectored send, receive, and RDMA operations. + * Support multiple scatter-gather entries. + */ +static inline int +rdma_post_recvv(struct rdma_cm_id *id, void *context, struct ibv_sge *sgl, + int nsge) +{ + struct ibv_recv_wr wr, *bad; + + wr.wr_id = (uintptr_t) context; + wr.next = NULL; + wr.sg_list = sgl; + wr.num_sge = nsge; + + if (id->srq) + return rdma_seterrno(ibv_post_srq_recv(id->srq, &wr, &bad)); + else + return rdma_seterrno(ibv_post_recv(id->qp, &wr, &bad)); +} + +static inline int +rdma_post_sendv(struct rdma_cm_id *id, void *context, struct ibv_sge *sgl, + int nsge, int flags) +{ + struct ibv_send_wr wr, *bad; + + wr.wr_id = (uintptr_t) context; + wr.next = NULL; + wr.sg_list = sgl; + wr.num_sge = nsge; + wr.opcode = IBV_WR_SEND; + wr.send_flags = flags; + + return rdma_seterrno(ibv_post_send(id->qp, &wr, &bad)); +} + +static inline int +rdma_post_readv(struct rdma_cm_id *id, void *context, struct ibv_sge *sgl, + int nsge, int flags, uint64_t remote_addr, uint32_t rkey) +{ + struct ibv_send_wr wr, *bad; + + wr.wr_id = (uintptr_t) context; + wr.next = NULL; + wr.sg_list = sgl; + wr.num_sge = nsge; + wr.opcode = IBV_WR_RDMA_READ; + wr.send_flags = flags; + wr.wr.rdma.remote_addr = remote_addr; + wr.wr.rdma.rkey = rkey; + + return rdma_seterrno(ibv_post_send(id->qp, &wr, &bad)); +} + +static inline int +rdma_post_writev(struct rdma_cm_id *id, void *context, struct ibv_sge *sgl, + int nsge, int flags, uint64_t remote_addr, uint32_t rkey) +{ + struct ibv_send_wr wr, *bad; + + wr.wr_id = (uintptr_t) context; + wr.next = NULL; + wr.sg_list = sgl; + wr.num_sge = nsge; + wr.opcode = IBV_WR_RDMA_WRITE; + wr.send_flags = flags; + wr.wr.rdma.remote_addr = remote_addr; + wr.wr.rdma.rkey = rkey; + + return rdma_seterrno(ibv_post_send(id->qp, &wr, &bad)); +} + +/* + * Simple send, receive, and RDMA calls. + */ +static inline int +rdma_post_recv(struct rdma_cm_id *id, void *context, void *addr, + size_t length, struct ibv_mr *mr) +{ + struct ibv_sge sge; + + assert((addr >= mr->addr) && + (((uint8_t *) addr + length) <= ((uint8_t *) mr->addr + mr->length))); + sge.addr = (uint64_t) (uintptr_t) addr; + sge.length = (uint32_t) length; + sge.lkey = mr->lkey; + + return rdma_post_recvv(id, context, &sge, 1); +} + +static inline int +rdma_post_send(struct rdma_cm_id *id, void *context, void *addr, + size_t length, struct ibv_mr *mr, int flags) +{ + struct ibv_sge sge; + + sge.addr = (uint64_t) (uintptr_t) addr; + sge.length = (uint32_t) length; + sge.lkey = mr ? 
mr->lkey : 0; + + return rdma_post_sendv(id, context, &sge, 1, flags); +} + +static inline int +rdma_post_read(struct rdma_cm_id *id, void *context, void *addr, + size_t length, struct ibv_mr *mr, int flags, + uint64_t remote_addr, uint32_t rkey) +{ + struct ibv_sge sge; + + sge.addr = (uint64_t) (uintptr_t) addr; + sge.length = (uint32_t) length; + sge.lkey = mr->lkey; + + return rdma_post_readv(id, context, &sge, 1, flags, remote_addr, rkey); +} + +static inline int +rdma_post_write(struct rdma_cm_id *id, void *context, void *addr, + size_t length, struct ibv_mr *mr, int flags, + uint64_t remote_addr, uint32_t rkey) +{ + struct ibv_sge sge; + + sge.addr = (uint64_t) (uintptr_t) addr; + sge.length = (uint32_t) length; + sge.lkey = mr ? mr->lkey : 0; + + return rdma_post_writev(id, context, &sge, 1, flags, remote_addr, rkey); +} + +static inline int +rdma_post_ud_send(struct rdma_cm_id *id, void *context, void *addr, + size_t length, struct ibv_mr *mr, int flags, + struct ibv_ah *ah, uint32_t remote_qpn) +{ + struct ibv_send_wr wr, *bad; + struct ibv_sge sge; + + sge.addr = (uint64_t) (uintptr_t) addr; + sge.length = (uint32_t) length; + sge.lkey = mr ? mr->lkey : 0; + + wr.wr_id = (uintptr_t) context; + wr.next = NULL; + wr.sg_list = &sge; + wr.num_sge = 1; + wr.opcode = IBV_WR_SEND; + wr.send_flags = flags; + wr.wr.ud.ah = ah; + wr.wr.ud.remote_qpn = remote_qpn; + wr.wr.ud.remote_qkey = RDMA_UDP_QKEY; + + return rdma_seterrno(ibv_post_send(id->qp, &wr, &bad)); +} + +static inline int +rdma_get_send_comp(struct rdma_cm_id *id, struct ibv_wc *wc) +{ + struct ibv_cq *cq; + void *context; + int ret; + + do { + ret = ibv_poll_cq(id->send_cq, 1, wc); + if (ret) + break; + + ret = ibv_req_notify_cq(id->send_cq, 0); + if (ret) + return rdma_seterrno(ret); + + ret = ibv_poll_cq(id->send_cq, 1, wc); + if (ret) + break; + + ret = ibv_get_cq_event(id->send_cq_channel, &cq, &context); + if (ret) + return ret; + + assert(cq == id->send_cq && context == id); + ibv_ack_cq_events(id->send_cq, 1); + } while (1); + + return (ret < 0) ? rdma_seterrno(ret) : ret; +} + +static inline int +rdma_get_recv_comp(struct rdma_cm_id *id, struct ibv_wc *wc) +{ + struct ibv_cq *cq; + void *context; + int ret; + + do { + ret = ibv_poll_cq(id->recv_cq, 1, wc); + if (ret) + break; + + ret = ibv_req_notify_cq(id->recv_cq, 0); + if (ret) + return rdma_seterrno(ret); + + ret = ibv_poll_cq(id->recv_cq, 1, wc); + if (ret) + break; + + ret = ibv_get_cq_event(id->recv_cq_channel, &cq, &context); + if (ret) + return ret; + + assert(cq == id->recv_cq && context == id); + ibv_ack_cq_events(id->recv_cq, 1); + } while (1); + + return (ret < 0) ? rdma_seterrno(ret) : ret; +} + +#ifdef __cplusplus +} +#endif + +#endif /* RDMA_CMA_H */ diff --git a/prov/rdmacm/include/rdma/rsocket.h b/prov/rdmacm/include/rdma/rsocket.h new file mode 100644 index 00000000000..efd0db58bf9 --- /dev/null +++ b/prov/rdmacm/include/rdma/rsocket.h @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2011-2012 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#if !defined(RSOCKET_H) +#define RSOCKET_H + +#include <infiniband/verbs.h> +#include <rdma/rdma_cma.h> +#include <sys/socket.h> +#include <errno.h> +#include <poll.h> +#include <sys/select.h> +#include <sys/mman.h> + +#ifdef __cplusplus +extern "C" { +#endif + +int rsocket(int domain, int type, int protocol); +int rbind(int socket, const struct sockaddr *addr, socklen_t addrlen); +int rlisten(int socket, int backlog); +int raccept(int socket, struct sockaddr *addr, socklen_t *addrlen); +int rconnect(int socket, const struct sockaddr *addr, socklen_t addrlen); +int rshutdown(int socket, int how); +int rclose(int socket); + +ssize_t rrecv(int socket, void *buf, size_t len, int flags); +ssize_t rrecvfrom(int socket, void *buf, size_t len, int flags, + struct sockaddr *src_addr, socklen_t *addrlen); +ssize_t rrecvmsg(int socket, struct msghdr *msg, int flags); +ssize_t rsend(int socket, const void *buf, size_t len, int flags); +ssize_t rsendto(int socket, const void *buf, size_t len, int flags, + const struct sockaddr *dest_addr, socklen_t addrlen); +ssize_t rsendmsg(int socket, const struct msghdr *msg, int flags); +ssize_t rread(int socket, void *buf, size_t count); +ssize_t rreadv(int socket, const struct iovec *iov, int iovcnt); +ssize_t rwrite(int socket, const void *buf, size_t count); +ssize_t rwritev(int socket, const struct iovec *iov, int iovcnt); + +int rpoll(struct pollfd *fds, nfds_t nfds, int timeout); +int rselect(int nfds, fd_set *readfds, fd_set *writefds, + fd_set *exceptfds, struct timeval *timeout); + +int rgetpeername(int socket, struct sockaddr *addr, socklen_t *addrlen); +int rgetsockname(int socket, struct sockaddr *addr, socklen_t *addrlen); + +#define SOL_RDMA 0x10000 +enum { + RDMA_SQSIZE, + RDMA_RQSIZE, + RDMA_INLINE, + RDMA_IOMAPSIZE, + RDMA_ROUTE +}; + +int rsetsockopt(int socket, int level, int optname, + const void *optval, socklen_t optlen); +int rgetsockopt(int socket, int level, int optname, + void *optval, socklen_t *optlen); +int rfcntl(int socket, int cmd, ... 
/* arg */ ); + +off_t riomap(int socket, void *buf, size_t len, int prot, int flags, off_t offset); +int riounmap(int socket, void *buf, size_t len); +size_t riowrite(int socket, const void *buf, size_t count, off_t offset, int flags); + +#ifdef __cplusplus +} +#endif + +#endif /* RSOCKET_H */ diff --git a/prov/rdmacm/src/acm.c b/prov/rdmacm/src/acm.c new file mode 100644 index 00000000000..3dc26bfa984 --- /dev/null +++ b/prov/rdmacm/src/acm.c @@ -0,0 +1,439 @@ +/* + * Copyright (c) 2010-2012 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#if HAVE_CONFIG_H +# include <config.h> +#endif /* HAVE_CONFIG_H */ + +#include <stdio.h> +#include <inttypes.h> +#include <sys/types.h> +#include <sys/socket.h> +#include <netdb.h> +#include <unistd.h> + +#include "cma.h" +#include <rdma/rdma_cma.h> +#include <infiniband/ib.h> +#include <rdma/fi_ucma.h> + +#define ACM_VERSION 1 + +#define ACM_OP_RESOLVE 0x01 +#define ACM_OP_ACK 0x80 + +#define ACM_STATUS_SUCCESS 0 +#define ACM_STATUS_ENOMEM 1 +#define ACM_STATUS_EINVAL 2 +#define ACM_STATUS_ENODATA 3 +#define ACM_STATUS_ENOTCONN 5 +#define ACM_STATUS_ETIMEDOUT 6 +#define ACM_STATUS_ESRCADDR 7 +#define ACM_STATUS_ESRCTYPE 8 +#define ACM_STATUS_EDESTADDR 9 +#define ACM_STATUS_EDESTTYPE 10 + +#define ACM_FLAGS_NODELAY (1<<30) + +#define ACM_MSG_HDR_LENGTH 16 +#define ACM_MAX_ADDRESS 64 +#define ACM_MSG_EP_LENGTH 72 +#define ACM_MSG_DATA_LENGTH (ACM_MSG_EP_LENGTH * 8) + +struct acm_hdr { + uint8_t version; + uint8_t opcode; + uint8_t status; + uint8_t data[3]; + uint16_t length; + uint64_t tid; +}; + +#define ACM_EP_INFO_NAME 0x0001 +#define ACM_EP_INFO_ADDRESS_IP 0x0002 +#define ACM_EP_INFO_ADDRESS_IP6 0x0003 +#define ACM_EP_INFO_PATH 0x0010 + +union acm_ep_info { + uint8_t addr[ACM_MAX_ADDRESS]; + uint8_t name[ACM_MAX_ADDRESS]; + struct ibv_path_record path; +}; + +#define ACM_EP_FLAG_SOURCE (1<<0) +#define ACM_EP_FLAG_DEST (1<<1) + +struct acm_ep_addr_data { + uint32_t flags; + uint16_t type; + uint16_t reserved; + union acm_ep_info info; +}; + +struct acm_resolve_msg { + struct acm_hdr hdr; + struct acm_ep_addr_data data[0]; +}; + +struct acm_msg { + struct acm_hdr hdr; + union{ + uint8_t data[ACM_MSG_DATA_LENGTH]; + struct acm_ep_addr_data resolve_data[0]; + }; +}; + +static pthread_mutex_t acm_lock = PTHREAD_MUTEX_INITIALIZER; +static int sock = -1; +static uint16_t server_port; + +static int ucma_set_server_port(void) +{ + FILE *f; + + if ((f = fopen("/var/run/ibacm.port", "r"))) { + fscanf(f, "%" SCNu16, &server_port); + fclose(f); + } + return server_port; +} + +void ucma_ib_init(void) +{ + struct sockaddr_in addr; + static int init; + int ret; + + if (init) + return; + + pthread_mutex_lock(&acm_lock); + if (!ucma_set_server_port()) + goto out; + + sock = socket(AF_INET, SOCK_STREAM | SOCK_CLOEXEC, IPPROTO_TCP); + if (sock < 0) + goto out; + + memset(&addr, 0, sizeof addr); + addr.sin_family = AF_INET; + addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK); + addr.sin_port = htons(server_port); + ret = connect(sock, (struct sockaddr *) &addr, sizeof(addr)); + if (ret) { + close(sock); + sock = -1; + } +out: + init = 1; + pthread_mutex_unlock(&acm_lock); +} + +void ucma_ib_cleanup(void) +{ + if (sock >= 0) { + shutdown(sock, SHUT_RDWR); + close(sock); + } +} + +static int ucma_ib_set_addr(struct rdma_addrinfo *ib_rai, + struct rdma_addrinfo *rai) +{ + struct sockaddr_ib *src, *dst; + struct ibv_path_record *path; + + src = calloc(1, sizeof *src); + if (!src) + return ERR(ENOMEM); + + dst = calloc(1, sizeof *dst); + if (!dst) { + free(src); + return ERR(ENOMEM); + } + + path = &((struct ibv_path_data *) ib_rai->ai_route)->path; + + src->sib_family = AF_IB; + src->sib_pkey = path->pkey; + src->sib_flowinfo = htonl(ntohl(path->flowlabel_hoplimit) >> 8); + memcpy(&src->sib_addr, &path->sgid, 16); + ucma_set_sid(ib_rai->ai_port_space, rai->ai_src_addr, src); + + dst->sib_family = AF_IB; + dst->sib_pkey = path->pkey; + dst->sib_flowinfo = htonl(ntohl(path->flowlabel_hoplimit) >> 8); + memcpy(&dst->sib_addr, &path->dgid, 16); + ucma_set_sid(ib_rai->ai_port_space, rai->ai_dst_addr, dst); + + 
ib_rai->ai_src_addr = (struct sockaddr *) src; + ib_rai->ai_src_len = sizeof(*src); + + ib_rai->ai_dst_addr = (struct sockaddr *) dst; + ib_rai->ai_dst_len = sizeof(*dst); + + return 0; +} + +static int ucma_ib_set_connect(struct rdma_addrinfo *ib_rai, + struct rdma_addrinfo *rai) +{ + struct ib_connect_hdr *hdr; + + if (rai->ai_family == AF_IB) + return 0; + + hdr = calloc(1, sizeof *hdr); + if (!hdr) + return ERR(ENOMEM); + + if (rai->ai_family == AF_INET) { + hdr->ip_version = 4 << 4; + memcpy(&hdr->cma_src_ip4, + &((struct sockaddr_in *) rai->ai_src_addr)->sin_addr, 4); + memcpy(&hdr->cma_dst_ip4, + &((struct sockaddr_in *) rai->ai_dst_addr)->sin_addr, 4); + } else { + hdr->ip_version = 6 << 4; + memcpy(&hdr->cma_src_ip6, + &((struct sockaddr_in6 *) rai->ai_src_addr)->sin6_addr, 16); + memcpy(&hdr->cma_dst_ip6, + &((struct sockaddr_in6 *) rai->ai_dst_addr)->sin6_addr, 16); + } + + ib_rai->ai_connect = hdr; + ib_rai->ai_connect_len = sizeof(*hdr); + return 0; +} + +static void ucma_resolve_af_ib(struct rdma_addrinfo **rai) +{ + struct rdma_addrinfo *ib_rai; + + ib_rai = calloc(1, sizeof(*ib_rai)); + if (!ib_rai) + return; + + ib_rai->ai_flags = (*rai)->ai_flags; + ib_rai->ai_family = AF_IB; + ib_rai->ai_qp_type = (*rai)->ai_qp_type; + ib_rai->ai_port_space = (*rai)->ai_port_space; + + ib_rai->ai_route = calloc(1, (*rai)->ai_route_len); + if (!ib_rai->ai_route) + goto err; + + memcpy(ib_rai->ai_route, (*rai)->ai_route, (*rai)->ai_route_len); + ib_rai->ai_route_len = (*rai)->ai_route_len; + + if ((*rai)->ai_src_canonname) { + ib_rai->ai_src_canonname = strdup((*rai)->ai_src_canonname); + if (!ib_rai->ai_src_canonname) + goto err; + } + + if ((*rai)->ai_dst_canonname) { + ib_rai->ai_dst_canonname = strdup((*rai)->ai_dst_canonname); + if (!ib_rai->ai_dst_canonname) + goto err; + } + + if (ucma_ib_set_connect(ib_rai, *rai)) + goto err; + + if (ucma_ib_set_addr(ib_rai, *rai)) + goto err; + + ib_rai->ai_next = *rai; + *rai = ib_rai; + return; + +err: + rdma_freeaddrinfo(ib_rai); +} + +static void ucma_ib_save_resp(struct rdma_addrinfo *rai, struct acm_msg *msg) +{ + struct acm_ep_addr_data *ep_data; + struct ibv_path_data *path_data = NULL; + struct sockaddr_in *sin; + struct sockaddr_in6 *sin6; + int i, cnt, path_cnt = 0; + + cnt = (msg->hdr.length - ACM_MSG_HDR_LENGTH) / ACM_MSG_EP_LENGTH; + for (i = 0; i < cnt; i++) { + ep_data = &msg->resolve_data[i]; + switch (ep_data->type) { + case ACM_EP_INFO_PATH: + ep_data->type = 0; + if (!path_data) + path_data = (struct ibv_path_data *) ep_data; + path_cnt++; + break; + case ACM_EP_INFO_ADDRESS_IP: + if (!(ep_data->flags & ACM_EP_FLAG_SOURCE) || rai->ai_src_len) + break; + + sin = calloc(1, sizeof(*sin)); + if (!sin) + break; + + sin->sin_family = AF_INET; + memcpy(&sin->sin_addr, &ep_data->info.addr, 4); + rai->ai_src_len = sizeof(*sin); + rai->ai_src_addr = (struct sockaddr *) sin; + break; + case ACM_EP_INFO_ADDRESS_IP6: + if (!(ep_data->flags & ACM_EP_FLAG_SOURCE) || rai->ai_src_len) + break; + + sin6 = calloc(1, sizeof(*sin6)); + if (!sin6) + break; + + sin6->sin6_family = AF_INET6; + memcpy(&sin6->sin6_addr, &ep_data->info.addr, 16); + rai->ai_src_len = sizeof(*sin6); + rai->ai_src_addr = (struct sockaddr *) sin6; + break; + default: + break; + } + } + + rai->ai_route = calloc(path_cnt, sizeof(*path_data)); + if (rai->ai_route) { + memcpy(rai->ai_route, path_data, path_cnt * sizeof(*path_data)); + rai->ai_route_len = path_cnt * sizeof(*path_data); + } +} + +static void ucma_set_ep_addr(struct acm_ep_addr_data *data, struct sockaddr *addr) +{ 
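+	/*
+	 * Copy the caller's IPv4 or IPv6 address into the ACM endpoint
+	 * record and tag it with the matching ACM_EP_INFO_* type; the
+	 * source/destination flags are filled in by the caller
+	 * (ucma_ib_resolve) before this helper is invoked.
+	 */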
+ if (addr->sa_family == AF_INET) { + data->type = ACM_EP_INFO_ADDRESS_IP; + memcpy(data->info.addr, &((struct sockaddr_in *) addr)->sin_addr, 4); + } else { + data->type = ACM_EP_INFO_ADDRESS_IP6; + memcpy(data->info.addr, &((struct sockaddr_in6 *) addr)->sin6_addr, 16); + } +} + +static int ucma_inet_addr(struct sockaddr *addr, socklen_t len) +{ + return len && addr && (addr->sa_family == AF_INET || + addr->sa_family == AF_INET6); +} + +static int ucma_ib_addr(struct sockaddr *addr, socklen_t len) +{ + return len && addr && (addr->sa_family == AF_IB); +} + +void ucma_ib_resolve(struct rdma_addrinfo **rai, struct rdma_addrinfo *hints) +{ + struct acm_msg msg; + struct acm_ep_addr_data *data; + int ret; + + ucma_ib_init(); + if (sock < 0) + return; + + memset(&msg, 0, sizeof msg); + msg.hdr.version = ACM_VERSION; + msg.hdr.opcode = ACM_OP_RESOLVE; + msg.hdr.length = ACM_MSG_HDR_LENGTH; + + data = &msg.resolve_data[0]; + if (ucma_inet_addr((*rai)->ai_src_addr, (*rai)->ai_src_len)) { + data->flags = ACM_EP_FLAG_SOURCE; + ucma_set_ep_addr(data, (*rai)->ai_src_addr); + data++; + msg.hdr.length += ACM_MSG_EP_LENGTH; + } + + if (ucma_inet_addr((*rai)->ai_dst_addr, (*rai)->ai_dst_len)) { + data->flags = ACM_EP_FLAG_DEST; + if (hints->ai_flags & (RAI_NUMERICHOST | RAI_NOROUTE)) + data->flags |= ACM_FLAGS_NODELAY; + ucma_set_ep_addr(data, (*rai)->ai_dst_addr); + data++; + msg.hdr.length += ACM_MSG_EP_LENGTH; + } + + if (hints->ai_route_len || + ucma_ib_addr((*rai)->ai_src_addr, (*rai)->ai_src_len) || + ucma_ib_addr((*rai)->ai_dst_addr, (*rai)->ai_dst_len)) { + struct ibv_path_record *path; + + if (hints->ai_route_len == sizeof(struct ibv_path_record)) + path = (struct ibv_path_record *) hints->ai_route; + else if (hints->ai_route_len == sizeof(struct ibv_path_data)) + path = &((struct ibv_path_data *) hints->ai_route)->path; + else + path = NULL; + + if (path) + memcpy(&data->info.path, path, sizeof(*path)); + + if (ucma_ib_addr((*rai)->ai_src_addr, (*rai)->ai_src_len)) { + memcpy(&data->info.path.sgid, + &((struct sockaddr_ib *) (*rai)->ai_src_addr)->sib_addr, 16); + } + if (ucma_ib_addr((*rai)->ai_dst_addr, (*rai)->ai_dst_len)) { + memcpy(&data->info.path.dgid, + &((struct sockaddr_ib *) (*rai)->ai_dst_addr)->sib_addr, 16); + } + data->type = ACM_EP_INFO_PATH; + data++; + msg.hdr.length += ACM_MSG_EP_LENGTH; + } + + pthread_mutex_lock(&acm_lock); + ret = send(sock, (char *) &msg, msg.hdr.length, 0); + if (ret != msg.hdr.length) { + pthread_mutex_unlock(&acm_lock); + return; + } + + ret = recv(sock, (char *) &msg, sizeof msg, 0); + pthread_mutex_unlock(&acm_lock); + if (ret < ACM_MSG_HDR_LENGTH || ret != msg.hdr.length || msg.hdr.status) + return; + + ucma_ib_save_resp(*rai, &msg); + + if (af_ib_support && !(hints->ai_flags & RAI_ROUTEONLY) && (*rai)->ai_route_len) + ucma_resolve_af_ib(rai); +} diff --git a/prov/rdmacm/src/addrinfo.c b/prov/rdmacm/src/addrinfo.c new file mode 100644 index 00000000000..68eaddd3497 --- /dev/null +++ b/prov/rdmacm/src/addrinfo.c @@ -0,0 +1,327 @@ +/* + * Copyright (c) 2010 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * $Id: cm.c 3453 2005-09-15 21:43:21Z sean.hefty $ + */ + +#if HAVE_CONFIG_H +# include <config.h> +#endif /* HAVE_CONFIG_H */ + +#include <sys/types.h> +#include <sys/socket.h> +#include <netdb.h> +#include <unistd.h> + +#include "cma.h" +#include <rdma/rdma_cma.h> +#include <infiniband/ib.h> + +#ifdef IBV_XRC_OPS +#define RDMA_QPT_XRC_SEND IBV_QPT_XRC_SEND +#define RDMA_QPT_XRC_RECV IBV_QPT_XRC_RECV +#else +#define RDMA_QPT_XRC_SEND 9 +#define RDMA_QPT_XRC_RECV 10 +#endif + +struct rdma_addrinfo nohints; + +static void ucma_convert_to_ai(struct addrinfo *ai, struct rdma_addrinfo *rai) +{ + memset(ai, 0, sizeof *ai); + if (rai->ai_flags & RAI_PASSIVE) + ai->ai_flags = AI_PASSIVE; + if (rai->ai_flags & RAI_NUMERICHOST) + ai->ai_flags |= AI_NUMERICHOST; + if (rai->ai_family != AF_IB) + ai->ai_family = rai->ai_family; + + switch (rai->ai_qp_type) { + case IBV_QPT_RC: + case IBV_QPT_UC: + case RDMA_QPT_XRC_SEND: + case RDMA_QPT_XRC_RECV: + ai->ai_socktype = SOCK_STREAM; + break; + case IBV_QPT_UD: + ai->ai_socktype = SOCK_DGRAM; + break; + } + + switch (rai->ai_port_space) { + case RDMA_PS_TCP: + ai->ai_protocol = IPPROTO_TCP; + break; + case RDMA_PS_IPOIB: + case RDMA_PS_UDP: + ai->ai_protocol = IPPROTO_UDP; + break; + case RDMA_PS_IB: + if (ai->ai_socktype == SOCK_STREAM) + ai->ai_protocol = IPPROTO_TCP; + else if (ai->ai_socktype == SOCK_DGRAM) + ai->ai_protocol = IPPROTO_UDP; + break; + } + + if (rai->ai_flags & RAI_PASSIVE) { + ai->ai_addrlen = rai->ai_src_len; + ai->ai_addr = rai->ai_src_addr; + } else { + ai->ai_addrlen = rai->ai_dst_len; + ai->ai_addr = rai->ai_dst_addr; + } + ai->ai_canonname = rai->ai_dst_canonname; + ai->ai_next = NULL; +} + +static int ucma_copy_addr(struct sockaddr **dst, socklen_t *dst_len, + struct sockaddr *src, socklen_t src_len) +{ + *dst = malloc(src_len); + if (!(*dst)) + return ERR(ENOMEM); + + memcpy(*dst, src, src_len); + *dst_len = src_len; + return 0; +} + +void ucma_set_sid(enum rdma_port_space ps, struct sockaddr *addr, + struct sockaddr_ib *sib) +{ + uint16_t port; + + port = addr ? 
ucma_get_port(addr) : 0; + sib->sib_sid = htonll(((uint64_t) ps << 16) + ntohs(port)); + + if (ps) + sib->sib_sid_mask = htonll(RDMA_IB_IP_PS_MASK); + if (port) + sib->sib_sid_mask |= htonll(RDMA_IB_IP_PORT_MASK); +} + +static int ucma_convert_in6(int ps, struct sockaddr_ib **dst, socklen_t *dst_len, + struct sockaddr_in6 *src, socklen_t src_len) +{ + *dst = calloc(1, sizeof(struct sockaddr_ib)); + if (!(*dst)) + return ERR(ENOMEM); + + (*dst)->sib_family = AF_IB; + (*dst)->sib_pkey = 0xFFFF; + (*dst)->sib_flowinfo = src->sin6_flowinfo; + ib_addr_set(&(*dst)->sib_addr, src->sin6_addr.s6_addr32[0], + src->sin6_addr.s6_addr32[1], src->sin6_addr.s6_addr32[2], + src->sin6_addr.s6_addr32[3]); + ucma_set_sid(ps, (struct sockaddr *) src, *dst); + (*dst)->sib_scope_id = src->sin6_scope_id; + + *dst_len = sizeof(struct sockaddr_ib); + return 0; +} + +static int ucma_convert_to_rai(struct rdma_addrinfo *rai, + struct rdma_addrinfo *hints, struct addrinfo *ai) +{ + int ret; + + if (hints->ai_qp_type) { + rai->ai_qp_type = hints->ai_qp_type; + } else { + switch (ai->ai_socktype) { + case SOCK_STREAM: + rai->ai_qp_type = IBV_QPT_RC; + break; + case SOCK_DGRAM: + rai->ai_qp_type = IBV_QPT_UD; + break; + } + } + + if (hints->ai_port_space) { + rai->ai_port_space = hints->ai_port_space; + } else { + switch (ai->ai_protocol) { + case IPPROTO_TCP: + rai->ai_port_space = RDMA_PS_TCP; + break; + case IPPROTO_UDP: + rai->ai_port_space = RDMA_PS_UDP; + break; + } + } + + if (ai->ai_flags & AI_PASSIVE) { + rai->ai_flags = RAI_PASSIVE; + if (ai->ai_canonname) + rai->ai_src_canonname = strdup(ai->ai_canonname); + + if ((hints->ai_flags & RAI_FAMILY) && (hints->ai_family == AF_IB) && + (hints->ai_flags & RAI_NUMERICHOST)) { + rai->ai_family = AF_IB; + ret = ucma_convert_in6(rai->ai_port_space, + (struct sockaddr_ib **) &rai->ai_src_addr, + &rai->ai_src_len, + (struct sockaddr_in6 *) ai->ai_addr, + ai->ai_addrlen); + } else { + rai->ai_family = ai->ai_family; + ret = ucma_copy_addr(&rai->ai_src_addr, &rai->ai_src_len, + ai->ai_addr, ai->ai_addrlen); + } + } else { + if (ai->ai_canonname) + rai->ai_dst_canonname = strdup(ai->ai_canonname); + + if ((hints->ai_flags & RAI_FAMILY) && (hints->ai_family == AF_IB) && + (hints->ai_flags & RAI_NUMERICHOST)) { + rai->ai_family = AF_IB; + ret = ucma_convert_in6(rai->ai_port_space, + (struct sockaddr_ib **) &rai->ai_dst_addr, + &rai->ai_dst_len, + (struct sockaddr_in6 *) ai->ai_addr, + ai->ai_addrlen); + } else { + rai->ai_family = ai->ai_family; + ret = ucma_copy_addr(&rai->ai_dst_addr, &rai->ai_dst_len, + ai->ai_addr, ai->ai_addrlen); + } + } + return ret; +} + +static int ucma_getaddrinfo(char *node, char *service, + struct rdma_addrinfo *hints, + struct rdma_addrinfo *rai) +{ + struct addrinfo ai_hints; + struct addrinfo *ai; + int ret; + + if (hints != &nohints) { + ucma_convert_to_ai(&ai_hints, hints); + ret = getaddrinfo(node, service, &ai_hints, &ai); + } else { + ret = getaddrinfo(node, service, NULL, &ai); + } + if (ret) + return ret; + + ret = ucma_convert_to_rai(rai, hints, ai); + freeaddrinfo(ai); + return ret; +} + +int rdma_getaddrinfo(char *node, char *service, + struct rdma_addrinfo *hints, + struct rdma_addrinfo **res) +{ + struct rdma_addrinfo *rai; + int ret; + + if (!service && !node && !hints) + return ERR(EINVAL); + + ret = ucma_init(); + if (ret) + return ret; + + rai = calloc(1, sizeof(*rai)); + if (!rai) + return ERR(ENOMEM); + + if (!hints) + hints = &nohints; + + if (node || service) { + ret = ucma_getaddrinfo(node, service, hints, rai); + } else { + 
rai->ai_flags = hints->ai_flags; + rai->ai_family = hints->ai_family; + rai->ai_qp_type = hints->ai_qp_type; + rai->ai_port_space = hints->ai_port_space; + if (hints->ai_dst_len) { + ret = ucma_copy_addr(&rai->ai_dst_addr, &rai->ai_dst_len, + hints->ai_dst_addr, hints->ai_dst_len); + } + } + if (ret) + goto err; + + if (!rai->ai_src_len && hints->ai_src_len) { + ret = ucma_copy_addr(&rai->ai_src_addr, &rai->ai_src_len, + hints->ai_src_addr, hints->ai_src_len); + if (ret) + goto err; + } + + if (!(rai->ai_flags & RAI_PASSIVE)) + ucma_ib_resolve(&rai, hints); + + *res = rai; + return 0; + +err: + rdma_freeaddrinfo(rai); + return ret; +} + +void rdma_freeaddrinfo(struct rdma_addrinfo *res) +{ + struct rdma_addrinfo *rai; + + while (res) { + rai = res; + res = res->ai_next; + + if (rai->ai_connect) + free(rai->ai_connect); + + if (rai->ai_route) + free(rai->ai_route); + + if (rai->ai_src_canonname) + free(rai->ai_src_canonname); + + if (rai->ai_dst_canonname) + free(rai->ai_dst_canonname); + + if (rai->ai_src_addr) + free(rai->ai_src_addr); + + if (rai->ai_dst_addr) + free(rai->ai_dst_addr); + + free(rai); + } +} diff --git a/prov/rdmacm/src/cma.c b/prov/rdmacm/src/cma.c new file mode 100644 index 00000000000..b79f73c3397 --- /dev/null +++ b/prov/rdmacm/src/cma.c @@ -0,0 +1,2210 @@ +/* + * Copyright (c) 2005-2012 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#if HAVE_CONFIG_H +# include <config.h> +#endif /* HAVE_CONFIG_H */ + +#include <stdlib.h> +#include <string.h> +#include <glob.h> +#include <stdio.h> +#include <fcntl.h> +#include <errno.h> +#include <stdint.h> +#include <poll.h> +#include <unistd.h> +#include <pthread.h> +#include <endian.h> +#include <byteswap.h> +#include <stddef.h> +#include <netdb.h> +#include <syslog.h> + +#include "cma.h" +#include "indexer.h" +#include <infiniband/driver.h> +#include <infiniband/marshall.h> +#include <rdma/rdma_cma.h> +#include <rdma/rdma_verbs.h> +#include <infiniband/ib.h> +#include <fi.h> +#include <rdma/fi_ucma.h> + + +#define CMA_INIT_CMD(req, req_size, op) \ + memset(req, 0, req_size) +#define CMA_INIT_CMD_RESP(req, req_size, op, resp, resp_size) \ + memset(req, 0, req_size) + +struct cma_device { + struct ibv_context *verbs; + struct ibv_pd *pd; + uint64_t guid; + int port_cnt; + int refcnt; + int max_qpsize; + uint8_t max_initiator_depth; + uint8_t max_responder_resources; +}; + +struct cma_id_private { + struct rdma_cm_id id; + struct cma_device *cma_dev; + void *connect; + size_t connect_len; + int events_completed; + int connect_error; + int sync; + pthread_cond_t cond; + pthread_mutex_t mut; + uint32_t handle; + struct cma_multicast *mc_list; + struct ibv_qp_init_attr *qp_init_attr; + uint8_t initiator_depth; + uint8_t responder_resources; +}; + +struct cma_multicast { + struct cma_multicast *next; + struct cma_id_private *id_priv; + void *context; + int events_completed; + pthread_cond_t cond; + uint32_t handle; + union ibv_gid mgid; + uint16_t mlid; + struct sockaddr_storage addr; +}; + +struct cma_event { + struct rdma_cm_event event; + uint8_t private_data[RDMA_MAX_PRIVATE_DATA]; + struct cma_id_private *id_priv; + struct cma_multicast *mc; +}; + +static struct cma_device *cma_dev_array; +static int cma_dev_cnt; +static pthread_mutex_t mut = PTHREAD_MUTEX_INITIALIZER; +int af_ib_support; +static struct index_map ucma_idm; +static fastlock_t idm_lock; + +static void ucma_cleanup(void) +{ + ucma_ib_cleanup(); + + if (cma_dev_cnt) { + while (cma_dev_cnt--) { + if (cma_dev_array[cma_dev_cnt].refcnt) + ibv_dealloc_pd(cma_dev_array[cma_dev_cnt].pd); + ibv_close_device(cma_dev_array[cma_dev_cnt].verbs); + } + + fastlock_destroy(&idm_lock); + free(cma_dev_array); + cma_dev_cnt = 0; + } +} + +/* + * This function is called holding the mutex lock + * cma_dev_cnt must be set before calling this function to + * ensure that the lock is not acquired recursively. 
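+ * (ucma_set_af_ib_support() calls rdma_create_id(), which re-enters
+ * _ucma_init(); the early cma_dev_cnt check there returns before the
+ * mutex would be taken a second time.)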
+ */ +static void ucma_set_af_ib_support(void) +{ + struct rdma_cm_id *id; + struct sockaddr_ib sib; + int ret; + + ret = rdma_create_id(NULL, &id, NULL, RDMA_PS_IB); + if (ret) + return; + + memset(&sib, 0, sizeof sib); + sib.sib_family = AF_IB; + sib.sib_sid = htonll(RDMA_IB_IP_PS_TCP); + sib.sib_sid_mask = htonll(RDMA_IB_IP_PS_MASK); + af_ib_support = 1; + ret = rdma_bind_addr(id, (struct sockaddr *) &sib); + af_ib_support = !ret; + + rdma_destroy_id(id); +} + +int _ucma_init(void) +{ + struct ibv_device **dev_list = NULL; + struct cma_device *cma_dev; + struct ibv_device_attr attr; + int i, ret, dev_cnt; + + /* Quick check without lock to see if we're already initialized */ + if (cma_dev_cnt) + return 0; + + pthread_mutex_lock(&mut); + if (cma_dev_cnt) { + pthread_mutex_unlock(&mut); + return 0; + } + + fastlock_init(&idm_lock); + dev_list = ibv_get_device_list(&dev_cnt); + if (!dev_list) { + fprintf(stderr, "rdmacm: fatal: unable to get RDMA device list\n"); + ret = ERR(ENODEV); + goto err1; + } + + if (!dev_cnt) { + fprintf(stderr, "rdmacm: fatal: no RDMA devices found\n"); + ret = ERR(ENODEV); + goto err2; + } + + cma_dev_array = calloc(dev_cnt, sizeof *cma_dev); + if (!cma_dev_array) { + ret = ERR(ENOMEM); + goto err2; + } + + for (i = 0; dev_list[i];) { + cma_dev = &cma_dev_array[i]; + + cma_dev->guid = ibv_get_device_guid(dev_list[i]); + cma_dev->verbs = ibv_open_device(dev_list[i]); + if (!cma_dev->verbs) { + fprintf(stderr, "rdmacm: fatal: unable to open RDMA device\n"); + ret = ERR(ENODEV); + goto err3; + } + + i++; + ret = ibv_query_device(cma_dev->verbs, &attr); + if (ret) { + fprintf(stderr, "rdmacm: fatal: unable to query RDMA device\n"); + ret = ERR(ret); + goto err3; + } + + cma_dev->port_cnt = attr.phys_port_cnt; + cma_dev->max_qpsize = attr.max_qp_wr; + cma_dev->max_initiator_depth = (uint8_t) attr.max_qp_init_rd_atom; + cma_dev->max_responder_resources = (uint8_t) attr.max_qp_rd_atom; + } + + cma_dev_cnt = dev_cnt; + ucma_set_af_ib_support(); + pthread_mutex_unlock(&mut); + ibv_free_device_list(dev_list); + return 0; + +err3: + while (i--) + ibv_close_device(cma_dev_array[i].verbs); + free(cma_dev_array); +err2: + ibv_free_device_list(dev_list); +err1: + fastlock_destroy(&idm_lock); + pthread_mutex_unlock(&mut); + return ret; +} + +struct ibv_context **rdma_get_devices(int *num_devices) +{ + struct ibv_context **devs = NULL; + int i; + + if (_ucma_init()) + goto out; + + devs = malloc(sizeof *devs * (cma_dev_cnt + 1)); + if (!devs) + goto out; + + for (i = 0; i < cma_dev_cnt; i++) + devs[i] = cma_dev_array[i].verbs; + devs[i] = NULL; +out: + if (num_devices) + *num_devices = devs ? 
cma_dev_cnt : 0; + return devs; +} + +void rdma_free_devices(struct ibv_context **list) +{ + free(list); +} + +void rdma_cm_ini(void) +{ +} + +void rdma_cm_fini(void) +{ + ucma_cleanup(); +} + +struct rdma_event_channel *rdma_create_event_channel(void) +{ + struct rdma_event_channel *channel; + struct fid_ucma *ucma; + int ret; + + if (_ucma_init()) + return NULL; + + channel = malloc(sizeof *channel); + if (!channel) + return NULL; + + ret = fi_open("ucma", NULL, 0, &channel->fid, channel); + if (ret) { + fprintf(stderr, "rdmacm: fatal: unable to open /dev/infiniband/rdma_cm\n"); + goto err; + } + + ucma = container_of(channel->fid, struct fid_ucma, fid); + channel->fd = ucma->fd; + return channel; +err: + free(channel); + return NULL; +} + +void rdma_destroy_event_channel(struct rdma_event_channel *channel) +{ + fi_close(channel->fid); + free(channel); +} + +static int ucma_get_device(struct cma_id_private *id_priv, uint64_t guid) +{ + struct cma_device *cma_dev; + int i, ret = 0; + + for (i = 0; i < cma_dev_cnt; i++) { + cma_dev = &cma_dev_array[i]; + if (cma_dev->guid == guid) + goto match; + } + + return ERR(ENODEV); +match: + pthread_mutex_lock(&mut); + if (!cma_dev->refcnt++) { + cma_dev->pd = ibv_alloc_pd(cma_dev_array[i].verbs); + if (!cma_dev->pd) { + cma_dev->refcnt--; + ret = ERR(ENOMEM); + goto out; + } + } + id_priv->cma_dev = cma_dev; + id_priv->id.verbs = cma_dev->verbs; + id_priv->id.pd = cma_dev->pd; +out: + pthread_mutex_unlock(&mut); + return ret; +} + +static void ucma_put_device(struct cma_device *cma_dev) +{ + pthread_mutex_lock(&mut); + if (!--cma_dev->refcnt) + ibv_dealloc_pd(cma_dev->pd); + pthread_mutex_unlock(&mut); +} + +static void ucma_insert_id(struct cma_id_private *id_priv) +{ + fastlock_acquire(&idm_lock); + idm_set(&ucma_idm, id_priv->handle, id_priv); + fastlock_release(&idm_lock); +} + +static void ucma_remove_id(struct cma_id_private *id_priv) +{ + if (id_priv->handle <= IDX_MAX_INDEX) + idm_clear(&ucma_idm, id_priv->handle); +} + +static struct cma_id_private *ucma_lookup_id(int handle) +{ + return idm_lookup(&ucma_idm, handle); +} + +static void ucma_free_id(struct cma_id_private *id_priv) +{ + ucma_remove_id(id_priv); + if (id_priv->cma_dev) + ucma_put_device(id_priv->cma_dev); + pthread_cond_destroy(&id_priv->cond); + pthread_mutex_destroy(&id_priv->mut); + if (id_priv->id.route.path_rec) + free(id_priv->id.route.path_rec); + + if (id_priv->sync) + rdma_destroy_event_channel(id_priv->id.channel); + if (id_priv->connect_len) + free(id_priv->connect); + free(id_priv); +} + +static struct cma_id_private *ucma_alloc_id(struct rdma_event_channel *channel, + void *context, + enum rdma_port_space ps, + enum ibv_qp_type qp_type) +{ + struct cma_id_private *id_priv; + + id_priv = calloc(1, sizeof *id_priv); + if (!id_priv) + return NULL; + + id_priv->id.context = context; + id_priv->id.ps = ps; + id_priv->id.qp_type = qp_type; + id_priv->handle = 0xFFFFFFFF; + + if (!channel) { + id_priv->id.channel = rdma_create_event_channel(); + if (!id_priv->id.channel) + goto err; + id_priv->sync = 1; + } else { + id_priv->id.channel = channel; + } + + pthread_mutex_init(&id_priv->mut, NULL); + if (pthread_cond_init(&id_priv->cond, NULL)) + goto err; + + return id_priv; + +err: ucma_free_id(id_priv); + return NULL; +} + +static int rdma_create_id2(struct rdma_event_channel *channel, + struct rdma_cm_id **id, void *context, + enum rdma_port_space ps, enum ibv_qp_type qp_type) +{ + struct ucma_abi_create_id_resp resp; + struct ucma_abi_create_id cmd; + struct 
cma_id_private *id_priv; + int ret; + + ret = _ucma_init(); + if (ret) + return ret; + + id_priv = ucma_alloc_id(channel, context, ps, qp_type); + if (!id_priv) + return ERR(ENOMEM); + + CMA_INIT_CMD_RESP(&cmd, sizeof cmd, CREATE_ID, &resp, sizeof resp); + cmd.uid = (uintptr_t) id_priv; + cmd.ps = ps; + cmd.qp_type = qp_type; + + ret = ucma_create_id(id_priv->id.channel->fid, + &cmd, sizeof cmd, &resp, sizeof resp); + if (ret) { + ret = ERR(-ret); + goto err; + } + + id_priv->handle = resp.id; + ucma_insert_id(id_priv); + *id = &id_priv->id; + return 0; + +err: + ucma_free_id(id_priv); + return ret; +} + +int rdma_create_id(struct rdma_event_channel *channel, + struct rdma_cm_id **id, void *context, + enum rdma_port_space ps) +{ + enum ibv_qp_type qp_type; + + qp_type = (ps == RDMA_PS_IPOIB || ps == RDMA_PS_UDP) ? + IBV_QPT_UD : IBV_QPT_RC; + return rdma_create_id2(channel, id, context, ps, qp_type); +} + +static int ucma_destroy_kern_id(fid_t fid, uint32_t handle) +{ + struct ucma_abi_destroy_id_resp resp; + struct ucma_abi_destroy_id cmd; + int ret; + + CMA_INIT_CMD_RESP(&cmd, sizeof cmd, DESTROY_ID, &resp, sizeof resp); + cmd.id = handle; + + ret = ucma_destroy_id(fid, &cmd, sizeof cmd, &resp, sizeof resp); + if (ret) + return ERR(-ret); + + return resp.events_reported; +} + +int rdma_destroy_id(struct rdma_cm_id *id) +{ + struct cma_id_private *id_priv; + int ret; + + id_priv = container_of(id, struct cma_id_private, id); + ret = ucma_destroy_kern_id(id->channel->fid, id_priv->handle); + if (ret < 0) + return ret; + + if (id_priv->id.event) + rdma_ack_cm_event(id_priv->id.event); + + pthread_mutex_lock(&id_priv->mut); + while (id_priv->events_completed < ret) + pthread_cond_wait(&id_priv->cond, &id_priv->mut); + pthread_mutex_unlock(&id_priv->mut); + + ucma_free_id(id_priv); + return 0; +} + +int rdma_addrlen(struct sockaddr *addr) +{ + if (!addr) + return 0; + + switch (addr->sa_family) { + case PF_INET: + return sizeof(struct sockaddr_in); + case PF_INET6: + return sizeof(struct sockaddr_in6); + case PF_IB: + return af_ib_support ? 
sizeof(struct sockaddr_ib) : 0; + default: + return 0; + } +} + +static int ucma_query_addr(struct rdma_cm_id *id) +{ + struct ucma_abi_query_addr_resp resp; + struct ucma_abi_query cmd; + struct cma_id_private *id_priv; + int ret; + + CMA_INIT_CMD_RESP(&cmd, sizeof cmd, QUERY, &resp, sizeof resp); + id_priv = container_of(id, struct cma_id_private, id); + cmd.id = id_priv->handle; + cmd.option = UCMA_QUERY_ADDR; + + ret = ucma_query(id->channel->fid, &cmd, sizeof cmd, + &resp, sizeof resp); + if (ret) + return ERR(-ret); + + memcpy(&id->route.addr.src_addr, &resp.src_addr, resp.src_size); + memcpy(&id->route.addr.dst_addr, &resp.dst_addr, resp.dst_size); + + if (!id_priv->cma_dev && resp.node_guid) { + ret = ucma_get_device(id_priv, resp.node_guid); + if (ret) + return ret; + id->port_num = resp.port_num; + id->route.addr.addr.ibaddr.pkey = resp.pkey; + } + + return 0; +} + +static int ucma_query_gid(struct rdma_cm_id *id) +{ + struct ucma_abi_query_addr_resp resp; + struct ucma_abi_query cmd; + struct cma_id_private *id_priv; + struct sockaddr_ib *sib; + int ret; + + CMA_INIT_CMD_RESP(&cmd, sizeof cmd, QUERY, &resp, sizeof resp); + id_priv = container_of(id, struct cma_id_private, id); + cmd.id = id_priv->handle; + cmd.option = UCMA_QUERY_GID; + + ret = ucma_query(id->channel->fid, &cmd, sizeof cmd, + &resp, sizeof resp); + if (ret) + return ERR(-ret); + + sib = (struct sockaddr_ib *) &resp.src_addr; + memcpy(id->route.addr.addr.ibaddr.sgid.raw, sib->sib_addr.sib_raw, + sizeof id->route.addr.addr.ibaddr.sgid); + + sib = (struct sockaddr_ib *) &resp.dst_addr; + memcpy(id->route.addr.addr.ibaddr.dgid.raw, sib->sib_addr.sib_raw, + sizeof id->route.addr.addr.ibaddr.dgid); + + return 0; +} + +static void ucma_convert_path(struct ibv_path_data *path_data, + struct ibv_sa_path_rec *sa_path) +{ + uint32_t fl_hop; + + memcpy(&sa_path->dgid, path_data->path.dgid, 16); + memcpy(&sa_path->sgid, path_data->path.sgid, 16); + sa_path->dlid = path_data->path.dlid; + sa_path->slid = path_data->path.slid; + sa_path->raw_traffic = 0; + + fl_hop = ntohl(path_data->path.flowlabel_hoplimit); + sa_path->flow_label = htonl(fl_hop >> 8); + sa_path->hop_limit = (uint8_t) fl_hop; + + sa_path->traffic_class = path_data->path.tclass; + sa_path->reversible = path_data->path.reversible_numpath >> 7; + sa_path->numb_path = 1; + sa_path->pkey = path_data->path.pkey; + sa_path->sl = ntohs(path_data->path.qosclass_sl) & 0xF; + sa_path->mtu_selector = 1; + sa_path->mtu = path_data->path.mtu & 0x1F; + sa_path->rate_selector = 1; + sa_path->rate = path_data->path.rate & 0x1F; + sa_path->packet_life_time_selector = 1; + sa_path->packet_life_time = path_data->path.packetlifetime & 0x1F; + + sa_path->preference = (uint8_t) path_data->flags; +} + +static int ucma_query_path(struct rdma_cm_id *id) +{ + struct ucma_abi_query_path_resp *resp; + struct ucma_abi_query cmd; + struct cma_id_private *id_priv; + int ret, i, size; + + size = sizeof(*resp) + sizeof(struct ibv_path_data) * 6; + resp = alloca(size); + if (!resp) + return ERR(ENOMEM); + + CMA_INIT_CMD_RESP(&cmd, sizeof cmd, QUERY, resp, size); + id_priv = container_of(id, struct cma_id_private, id); + cmd.id = id_priv->handle; + cmd.option = UCMA_QUERY_PATH; + + ret = ucma_query(id->channel->fid, &cmd, sizeof cmd, resp, size); + if (ret) + return ERR(-ret); + + if (resp->num_paths) { + id->route.path_rec = malloc(sizeof(*id->route.path_rec) * + resp->num_paths); + if (!id->route.path_rec) + return ERR(ENOMEM); + + id->route.num_paths = resp->num_paths; + for (i = 0; i < 
resp->num_paths; i++) + ucma_convert_path(&resp->path_data[i], &id->route.path_rec[i]); + } + + return 0; +} + +static int _ucma_query_route(struct rdma_cm_id *id) +{ + struct ucma_abi_query_route_resp resp; + struct ucma_abi_query cmd; + struct cma_id_private *id_priv; + int ret, i; + + CMA_INIT_CMD_RESP(&cmd, sizeof cmd, QUERY_ROUTE, &resp, sizeof resp); + id_priv = container_of(id, struct cma_id_private, id); + cmd.id = id_priv->handle; + + ret = ucma_query_route(id->channel->fid, &cmd, sizeof cmd, + &resp, sizeof resp); + if (ret) + return ERR(-ret); + + if (resp.num_paths) { + id->route.path_rec = malloc(sizeof *id->route.path_rec * + resp.num_paths); + if (!id->route.path_rec) + return ERR(ENOMEM); + + id->route.num_paths = resp.num_paths; + for (i = 0; i < resp.num_paths; i++) + ibv_copy_path_rec_from_kern(&id->route.path_rec[i], + &resp.ib_route[i]); + } + + memcpy(id->route.addr.addr.ibaddr.sgid.raw, resp.ib_route[0].sgid, + sizeof id->route.addr.addr.ibaddr.sgid); + memcpy(id->route.addr.addr.ibaddr.dgid.raw, resp.ib_route[0].dgid, + sizeof id->route.addr.addr.ibaddr.dgid); + id->route.addr.addr.ibaddr.pkey = resp.ib_route[0].pkey; + memcpy(&id->route.addr.src_addr, &resp.src_addr, + sizeof resp.src_addr); + memcpy(&id->route.addr.dst_addr, &resp.dst_addr, + sizeof resp.dst_addr); + + if (!id_priv->cma_dev && resp.node_guid) { + ret = ucma_get_device(id_priv, resp.node_guid); + if (ret) + return ret; + id_priv->id.port_num = resp.port_num; + } + + return 0; +} + +static int rdma_bind_addr2(struct rdma_cm_id *id, struct sockaddr *addr, + socklen_t addrlen) +{ + struct ucma_abi_bind cmd; + struct cma_id_private *id_priv; + int ret; + + CMA_INIT_CMD(&cmd, sizeof cmd, BIND); + id_priv = container_of(id, struct cma_id_private, id); + cmd.id = id_priv->handle; + cmd.addr_size = addrlen; + memcpy(&cmd.addr, addr, addrlen); + + ret = ucma_bind(id->channel->fid, &cmd, sizeof cmd); + if (ret) + return ERR(-ret); + + return ucma_query_addr(id); +} + +int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr) +{ + struct ucma_abi_bind_ip cmd; + struct cma_id_private *id_priv; + int ret, addrlen; + + addrlen = rdma_addrlen(addr); + if (!addrlen) + return ERR(EINVAL); + + if (af_ib_support) + return rdma_bind_addr2(id, addr, addrlen); + + CMA_INIT_CMD(&cmd, sizeof cmd, BIND_IP); + id_priv = container_of(id, struct cma_id_private, id); + cmd.id = id_priv->handle; + memcpy(&cmd.addr, addr, addrlen); + + ret = ucma_bind_ip(id->channel->fid, &cmd, sizeof cmd); + if (ret) + return ERR(-ret); + + return _ucma_query_route(id); +} + +int ucma_complete(struct rdma_cm_id *id) +{ + struct cma_id_private *id_priv; + int ret; + + id_priv = container_of(id, struct cma_id_private, id); + if (!id_priv->sync) + return 0; + + if (id_priv->id.event) { + rdma_ack_cm_event(id_priv->id.event); + id_priv->id.event = NULL; + } + + ret = rdma_get_cm_event(id_priv->id.channel, &id_priv->id.event); + if (ret) + return ret; + + if (id_priv->id.event->status) { + if (id_priv->id.event->event == RDMA_CM_EVENT_REJECTED) + ret = ERR(ECONNREFUSED); + else if (id_priv->id.event->status < 0) + ret = ERR(-id_priv->id.event->status); + else + ret = ERR(-id_priv->id.event->status); + } + return ret; +} + +static int rdma_resolve_addr2(struct rdma_cm_id *id, struct sockaddr *src_addr, + socklen_t src_len, struct sockaddr *dst_addr, + socklen_t dst_len, int timeout_ms) +{ + struct ucma_abi_resolve_addr cmd; + struct cma_id_private *id_priv; + int ret; + + CMA_INIT_CMD(&cmd, sizeof cmd, RESOLVE_ADDR); + id_priv = 
container_of(id, struct cma_id_private, id); + cmd.id = id_priv->handle; + if ((cmd.src_size = src_len)) + memcpy(&cmd.src_addr, src_addr, src_len); + memcpy(&cmd.dst_addr, dst_addr, dst_len); + cmd.dst_size = dst_len; + cmd.timeout_ms = timeout_ms; + + ret = ucma_resolve_addr(id->channel->fid, &cmd, sizeof cmd); + if (ret) + return ERR(-ret); + + memcpy(&id->route.addr.dst_addr, dst_addr, dst_len); + return ucma_complete(id); +} + +int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, + struct sockaddr *dst_addr, int timeout_ms) +{ + struct ucma_abi_resolve_ip cmd; + struct cma_id_private *id_priv; + int ret, dst_len, src_len; + + dst_len = rdma_addrlen(dst_addr); + if (!dst_len) + return ERR(EINVAL); + + src_len = rdma_addrlen(src_addr); + if (src_addr && !src_len) + return ERR(EINVAL); + + if (af_ib_support) + return rdma_resolve_addr2(id, src_addr, src_len, dst_addr, + dst_len, timeout_ms); + + CMA_INIT_CMD(&cmd, sizeof cmd, RESOLVE_IP); + id_priv = container_of(id, struct cma_id_private, id); + cmd.id = id_priv->handle; + if (src_addr) + memcpy(&cmd.src_addr, src_addr, src_len); + memcpy(&cmd.dst_addr, dst_addr, dst_len); + cmd.timeout_ms = timeout_ms; + + ret = ucma_resolve_ip(id->channel->fid, &cmd, sizeof cmd); + if (ret) + return ERR(-ret); + + memcpy(&id->route.addr.dst_addr, dst_addr, dst_len); + return ucma_complete(id); +} + +static int ucma_set_ib_route(struct rdma_cm_id *id) +{ + struct rdma_addrinfo hint, *rai; + int ret; + + memset(&hint, 0, sizeof hint); + hint.ai_flags = RAI_ROUTEONLY; + hint.ai_family = id->route.addr.src_addr.sa_family; + hint.ai_src_len = rdma_addrlen((struct sockaddr *) &id->route.addr.src_addr); + hint.ai_src_addr = &id->route.addr.src_addr; + hint.ai_dst_len = rdma_addrlen((struct sockaddr *) &id->route.addr.dst_addr); + hint.ai_dst_addr = &id->route.addr.dst_addr; + + ret = rdma_getaddrinfo(NULL, NULL, &hint, &rai); + if (ret) + return ret; + + if (rai->ai_route_len) + ret = rdma_set_option(id, RDMA_OPTION_IB, RDMA_OPTION_IB_PATH, + rai->ai_route, rai->ai_route_len); + else + ret = -1; + + rdma_freeaddrinfo(rai); + return ret; +} + +int rdma_resolve_route(struct rdma_cm_id *id, int timeout_ms) +{ + struct ucma_abi_resolve_route cmd; + struct cma_id_private *id_priv; + int ret; + + id_priv = container_of(id, struct cma_id_private, id); + if (id->verbs->device->transport_type == IBV_TRANSPORT_IB) { + ret = ucma_set_ib_route(id); + if (!ret) + goto out; + } + + CMA_INIT_CMD(&cmd, sizeof cmd, RESOLVE_ROUTE); + cmd.id = id_priv->handle; + cmd.timeout_ms = timeout_ms; + + ret = ucma_resolve_route(id->channel->fid, &cmd, sizeof cmd); + if (ret) + return ERR(-ret); + +out: + return ucma_complete(id); +} + +static int ucma_is_ud_qp(enum ibv_qp_type qp_type) +{ + return (qp_type == IBV_QPT_UD); +} + +static int rdma_init_qp_attr(struct rdma_cm_id *id, struct ibv_qp_attr *qp_attr, + int *qp_attr_mask) +{ + struct ucma_abi_init_qp_attr cmd; + struct ibv_kern_qp_attr resp; + struct cma_id_private *id_priv; + int ret; + + CMA_INIT_CMD_RESP(&cmd, sizeof cmd, INIT_QP_ATTR, &resp, sizeof resp); + id_priv = container_of(id, struct cma_id_private, id); + cmd.id = id_priv->handle; + cmd.qp_state = qp_attr->qp_state; + + ret = ucma_init_qp_attr(id->channel->fid, &cmd, sizeof cmd, + &resp, sizeof resp); + if (ret) + return ERR(-ret); + + ibv_copy_qp_attr_from_kern(qp_attr, &resp); + *qp_attr_mask = resp.qp_attr_mask; + return 0; +} + +static int ucma_modify_qp_rtr(struct rdma_cm_id *id, uint8_t resp_res) +{ + struct ibv_qp_attr qp_attr; + int 
qp_attr_mask, ret; + + if (!id->qp) + return ERR(EINVAL); + + /* Need to update QP attributes from default values. */ + qp_attr.qp_state = IBV_QPS_INIT; + ret = rdma_init_qp_attr(id, &qp_attr, &qp_attr_mask); + if (ret) + return ret; + + ret = ibv_modify_qp(id->qp, &qp_attr, qp_attr_mask); + if (ret) + return ERR(ret); + + qp_attr.qp_state = IBV_QPS_RTR; + ret = rdma_init_qp_attr(id, &qp_attr, &qp_attr_mask); + if (ret) + return ret; + + if (resp_res != RDMA_MAX_RESP_RES) + qp_attr.max_dest_rd_atomic = resp_res; + return rdma_seterrno(ibv_modify_qp(id->qp, &qp_attr, qp_attr_mask)); +} + +static int ucma_modify_qp_rts(struct rdma_cm_id *id, uint8_t init_depth) +{ + struct ibv_qp_attr qp_attr; + int qp_attr_mask, ret; + + qp_attr.qp_state = IBV_QPS_RTS; + ret = rdma_init_qp_attr(id, &qp_attr, &qp_attr_mask); + if (ret) + return ret; + + if (init_depth != RDMA_MAX_INIT_DEPTH) + qp_attr.max_rd_atomic = init_depth; + return rdma_seterrno(ibv_modify_qp(id->qp, &qp_attr, qp_attr_mask)); +} + +static int ucma_modify_qp_sqd(struct rdma_cm_id *id) +{ + struct ibv_qp_attr qp_attr; + + if (!id->qp) + return 0; + + qp_attr.qp_state = IBV_QPS_SQD; + return rdma_seterrno(ibv_modify_qp(id->qp, &qp_attr, IBV_QP_STATE)); +} + +static int ucma_modify_qp_err(struct rdma_cm_id *id) +{ + struct ibv_qp_attr qp_attr; + + if (!id->qp) + return 0; + + qp_attr.qp_state = IBV_QPS_ERR; + return rdma_seterrno(ibv_modify_qp(id->qp, &qp_attr, IBV_QP_STATE)); +} + +static int ucma_init_conn_qp(struct cma_id_private *id_priv, struct ibv_qp *qp) +{ + struct ibv_qp_attr qp_attr; + int qp_attr_mask, ret; + + qp_attr.qp_state = IBV_QPS_INIT; + ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask); + if (ret) + return ret; + + return rdma_seterrno(ibv_modify_qp(qp, &qp_attr, qp_attr_mask)); +} + +static int ucma_init_ud_qp(struct cma_id_private *id_priv, struct ibv_qp *qp) +{ + struct ibv_qp_attr qp_attr; + int qp_attr_mask, ret; + + qp_attr.qp_state = IBV_QPS_INIT; + ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask); + if (ret) + return ret; + + ret = ibv_modify_qp(qp, &qp_attr, qp_attr_mask); + if (ret) + return ERR(ret); + + qp_attr.qp_state = IBV_QPS_RTR; + ret = ibv_modify_qp(qp, &qp_attr, IBV_QP_STATE); + if (ret) + return ERR(ret); + + qp_attr.qp_state = IBV_QPS_RTS; + qp_attr.sq_psn = 0; + ret = ibv_modify_qp(qp, &qp_attr, IBV_QP_STATE | IBV_QP_SQ_PSN); + return rdma_seterrno(ret); +} + +static void ucma_destroy_cqs(struct rdma_cm_id *id) +{ + if (id->recv_cq) + ibv_destroy_cq(id->recv_cq); + + if (id->recv_cq_channel) + ibv_destroy_comp_channel(id->recv_cq_channel); + + if (id->send_cq && (id->send_cq != id->recv_cq)) + ibv_destroy_cq(id->send_cq); + + if (id->send_cq_channel && (id->send_cq_channel != id->recv_cq_channel)) + ibv_destroy_comp_channel(id->send_cq_channel); +} + +static int ucma_create_cqs(struct rdma_cm_id *id, uint32_t send_size, uint32_t recv_size) +{ + if (recv_size) { + id->recv_cq_channel = ibv_create_comp_channel(id->verbs); + if (!id->recv_cq_channel) + goto err; + + id->recv_cq = ibv_create_cq(id->verbs, recv_size, + id, id->recv_cq_channel, 0); + if (!id->recv_cq) + goto err; + } + + if (send_size) { + id->send_cq_channel = ibv_create_comp_channel(id->verbs); + if (!id->send_cq_channel) + goto err; + + id->send_cq = ibv_create_cq(id->verbs, send_size, + id, id->send_cq_channel, 0); + if (!id->send_cq) + goto err; + } + + return 0; +err: + ucma_destroy_cqs(id); + return ERR(ENOMEM); +} + +int rdma_create_srq(struct rdma_cm_id *id, struct ibv_pd *pd, + struct 
ibv_srq_init_attr *attr) +{ + struct ibv_srq *srq; + int ret; + + if (!pd) + pd = id->pd; + +#ifdef IBV_XRC_OPS + if (attr->srq_type == IBV_SRQT_XRC) { + if (!attr->ext.xrc.cq) { + ret = ucma_create_cqs(id, 0, attr->attr.max_wr); + if (ret) + return ret; + + attr->ext.xrc.cq = id->recv_cq; + } + } + + srq = ibv_create_xsrq(pd, attr); +#else + srq = ibv_create_srq(pd, attr); +#endif + if (!srq) { + ret = -1; + goto err; + } + + id->pd = pd; + id->srq = srq; + return 0; +err: + ucma_destroy_cqs(id); + return ret; +} + +void rdma_destroy_srq(struct rdma_cm_id *id) +{ + ibv_destroy_srq(id->srq); + if (!id->qp) + ucma_destroy_cqs(id); + id->srq = NULL; +} + +int rdma_create_qp(struct rdma_cm_id *id, struct ibv_pd *pd, + struct ibv_qp_init_attr *qp_init_attr) +{ + struct cma_id_private *id_priv; + struct ibv_qp *qp; + int ret; + + if (id->qp) + return ERR(EINVAL); + + id_priv = container_of(id, struct cma_id_private, id); + if (!pd) + pd = id->pd; + else if (id->verbs != pd->context) + return ERR(EINVAL); + + ret = ucma_create_cqs(id, qp_init_attr->send_cq ? 0 : qp_init_attr->cap.max_send_wr, + qp_init_attr->recv_cq ? 0 : qp_init_attr->cap.max_recv_wr); + if (ret) + return ret; + + if (!qp_init_attr->send_cq) + qp_init_attr->send_cq = id->send_cq; + if (!qp_init_attr->recv_cq) + qp_init_attr->recv_cq = id->recv_cq; + qp = ibv_create_qp(pd, qp_init_attr); + if (!qp) { + ret = ERR(ENOMEM); + goto err1; + } + + if (ucma_is_ud_qp(id->qp_type)) + ret = ucma_init_ud_qp(id_priv, qp); + else + ret = ucma_init_conn_qp(id_priv, qp); + if (ret) + goto err2; + + id->pd = pd; + id->qp = qp; + return 0; +err2: + ibv_destroy_qp(qp); +err1: + ucma_destroy_cqs(id); + return ret; +} + +void rdma_destroy_qp(struct rdma_cm_id *id) +{ + ibv_destroy_qp(id->qp); + ucma_destroy_cqs(id); + id->qp = NULL; +} + +static int ucma_valid_param(struct cma_id_private *id_priv, + struct rdma_conn_param *param) +{ + if (id_priv->id.ps != RDMA_PS_TCP) + return 0; + + if (!id_priv->id.qp && !param) + goto err; + + if (!param) + return 0; + + if ((param->responder_resources != RDMA_MAX_RESP_RES) && + (param->responder_resources > id_priv->cma_dev->max_responder_resources)) + goto err; + + if ((param->initiator_depth != RDMA_MAX_INIT_DEPTH) && + (param->initiator_depth > id_priv->cma_dev->max_initiator_depth)) + goto err; + + return 0; +err: + return ERR(EINVAL); +} + +static void ucma_copy_conn_param_to_kern(struct cma_id_private *id_priv, + struct ucma_abi_conn_param *dst, + struct rdma_conn_param *src, + uint32_t qp_num, uint8_t srq) +{ + dst->qp_num = qp_num; + dst->srq = srq; + dst->responder_resources = id_priv->responder_resources; + dst->initiator_depth = id_priv->initiator_depth; + dst->valid = 1; + + if (id_priv->connect_len) { + memcpy(dst->private_data, id_priv->connect, id_priv->connect_len); + dst->private_data_len = id_priv->connect_len; + } + + if (src) { + dst->flow_control = src->flow_control; + dst->retry_count = src->retry_count; + dst->rnr_retry_count = src->rnr_retry_count; + + if (src->private_data && src->private_data_len) { + memcpy(dst->private_data + dst->private_data_len, + src->private_data, src->private_data_len); + dst->private_data_len += src->private_data_len; + } + } else { + dst->retry_count = 7; + dst->rnr_retry_count = 7; + } +} + +int rdma_connect(struct rdma_cm_id *id, struct rdma_conn_param *conn_param) +{ + struct ucma_abi_connect cmd; + struct cma_id_private *id_priv; + int ret; + + id_priv = container_of(id, struct cma_id_private, id); + ret = ucma_valid_param(id_priv, conn_param); + if 
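/*
 * Illustrative sketch, not part of the patch: creating a connected QP on a
 * resolved id. Leaving send_cq/recv_cq NULL lets rdma_create_qp() allocate
 * per-id CQs sized from the caps below (see ucma_create_cqs() above), and a
 * NULL pd selects the id's default protection domain. The capacity values
 * are arbitrary examples.
 */
#include <string.h>
#include <rdma/rdma_cma.h>

static int setup_qp(struct rdma_cm_id *id)
{
	struct ibv_qp_init_attr attr;

	memset(&attr, 0, sizeof attr);
	attr.cap.max_send_wr = 16;
	attr.cap.max_recv_wr = 16;
	attr.cap.max_send_sge = 1;
	attr.cap.max_recv_sge = 1;
	attr.qp_type = IBV_QPT_RC;
	attr.sq_sig_all = 1;

	return rdma_create_qp(id, NULL, &attr);
}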
(ret) + return ret; + + if (conn_param && conn_param->initiator_depth != RDMA_MAX_INIT_DEPTH) + id_priv->initiator_depth = conn_param->initiator_depth; + else + id_priv->initiator_depth = id_priv->cma_dev->max_initiator_depth; + if (conn_param && conn_param->responder_resources != RDMA_MAX_RESP_RES) + id_priv->responder_resources = conn_param->responder_resources; + else + id_priv->responder_resources = id_priv->cma_dev->max_responder_resources; + + CMA_INIT_CMD(&cmd, sizeof cmd, CONNECT); + cmd.id = id_priv->handle; + if (id->qp) { + ucma_copy_conn_param_to_kern(id_priv, &cmd.conn_param, + conn_param, id->qp->qp_num, + (id->qp->srq != NULL)); + } else if (conn_param) { + ucma_copy_conn_param_to_kern(id_priv, &cmd.conn_param, + conn_param, conn_param->qp_num, + conn_param->srq); + } else { + ucma_copy_conn_param_to_kern(id_priv, &cmd.conn_param, + conn_param, 0, 0); + } + + ret = ucma_connect(id->channel->fid, &cmd, sizeof cmd); + if (ret) + return ERR(-ret); + + if (id_priv->connect_len) { + free(id_priv->connect); + id_priv->connect_len = 0; + } + + return ucma_complete(id); +} + +int rdma_listen(struct rdma_cm_id *id, int backlog) +{ + struct ucma_abi_listen cmd; + struct cma_id_private *id_priv; + int ret; + + CMA_INIT_CMD(&cmd, sizeof cmd, LISTEN); + id_priv = container_of(id, struct cma_id_private, id); + cmd.id = id_priv->handle; + cmd.backlog = backlog; + + ret = ucma_listen(id->channel->fid, &cmd, sizeof cmd); + if (ret) + return ERR(-ret); + + if (af_ib_support) + return ucma_query_addr(id); + else + return _ucma_query_route(id); +} + +int rdma_get_request(struct rdma_cm_id *listen, struct rdma_cm_id **id) +{ + struct cma_id_private *id_priv; + struct rdma_cm_event *event; + int ret; + + id_priv = container_of(listen, struct cma_id_private, id); + if (!id_priv->sync) + return ERR(EINVAL); + + if (listen->event) { + rdma_ack_cm_event(listen->event); + listen->event = NULL; + } + + ret = rdma_get_cm_event(listen->channel, &event); + if (ret) + return ret; + + if (event->status) { + ret = ERR(event->status); + goto err; + } + + if (event->event != RDMA_CM_EVENT_CONNECT_REQUEST) { + ret = ERR(EINVAL); + goto err; + } + + if (id_priv->qp_init_attr) { + struct ibv_qp_init_attr attr; + + attr = *id_priv->qp_init_attr; + ret = rdma_create_qp(event->id, listen->pd, &attr); + if (ret) + goto err; + } + + *id = event->id; + (*id)->event = event; + return 0; + +err: + listen->event = event; + return ret; +} + +int rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param) +{ + struct ucma_abi_accept cmd; + struct cma_id_private *id_priv; + int ret; + + id_priv = container_of(id, struct cma_id_private, id); + ret = ucma_valid_param(id_priv, conn_param); + if (ret) + return ret; + + if (!conn_param || conn_param->initiator_depth == RDMA_MAX_INIT_DEPTH) { + id_priv->initiator_depth = min(id_priv->initiator_depth, + id_priv->cma_dev->max_initiator_depth); + } else { + id_priv->initiator_depth = conn_param->initiator_depth; + } + if (!conn_param || conn_param->responder_resources == RDMA_MAX_RESP_RES) { + id_priv->responder_resources = min(id_priv->responder_resources, + id_priv->cma_dev->max_responder_resources); + } else { + id_priv->responder_resources = conn_param->responder_resources; + } + + if (!ucma_is_ud_qp(id->qp_type)) { + ret = ucma_modify_qp_rtr(id, id_priv->responder_resources); + if (ret) + return ret; + + ret = ucma_modify_qp_rts(id, id_priv->initiator_depth); + if (ret) + return ret; + } + + CMA_INIT_CMD(&cmd, sizeof cmd, ACCEPT); + cmd.id = id_priv->handle; + cmd.uid 
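/*
 * Illustrative sketch, not part of the patch: the synchronous passive side
 * matching the listen/get_request/accept path above. rdma_get_request()
 * requires the listening id to be in synchronous (NULL event channel) mode,
 * as checked above, and for RDMA_PS_TCP a QP must exist on the new id before
 * rdma_accept() with a NULL conn_param is legal. Backlog and QP sizes are
 * arbitrary examples.
 */
#include <string.h>
#include <rdma/rdma_cma.h>

static int serve_one(struct rdma_cm_id *listen_id)
{
	struct ibv_qp_init_attr attr;
	struct rdma_cm_id *conn;
	int ret;

	ret = rdma_listen(listen_id, 4);
	if (ret)
		return ret;

	ret = rdma_get_request(listen_id, &conn);
	if (ret)
		return ret;

	memset(&attr, 0, sizeof attr);
	attr.cap.max_send_wr = attr.cap.max_recv_wr = 8;
	attr.cap.max_send_sge = attr.cap.max_recv_sge = 1;
	attr.qp_type = IBV_QPT_RC;

	ret = rdma_create_qp(conn, NULL, &attr);
	if (!ret)
		ret = rdma_accept(conn, NULL);
	if (ret)
		rdma_destroy_id(conn);
	return ret;
}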
= (uintptr_t) id_priv; + if (id->qp) + ucma_copy_conn_param_to_kern(id_priv, &cmd.conn_param, + conn_param, id->qp->qp_num, + (id->qp->srq != NULL)); + else + ucma_copy_conn_param_to_kern(id_priv, &cmd.conn_param, + conn_param, conn_param->qp_num, + conn_param->srq); + + ret = ucma_accept(id->channel->fid, &cmd, sizeof cmd); + if (ret) { + ucma_modify_qp_err(id); + return ERR(-ret); + } + + if (ucma_is_ud_qp(id->qp_type)) + return 0; + + return ucma_complete(id); +} + +int rdma_reject(struct rdma_cm_id *id, const void *private_data, + uint8_t private_data_len) +{ + struct ucma_abi_reject cmd; + struct cma_id_private *id_priv; + int ret; + + CMA_INIT_CMD(&cmd, sizeof cmd, REJECT); + + id_priv = container_of(id, struct cma_id_private, id); + cmd.id = id_priv->handle; + if (private_data && private_data_len) { + memcpy(cmd.private_data, private_data, private_data_len); + cmd.private_data_len = private_data_len; + } + + ret = ucma_reject(id->channel->fid, &cmd, sizeof cmd); + if (ret) + return ERR(-ret); + + return 0; +} + +int rdma_notify(struct rdma_cm_id *id, enum ibv_event_type event) +{ + struct ucma_abi_notify cmd; + struct cma_id_private *id_priv; + int ret; + + CMA_INIT_CMD(&cmd, sizeof cmd, NOTIFY); + + id_priv = container_of(id, struct cma_id_private, id); + cmd.id = id_priv->handle; + cmd.event = event; + ret = ucma_notify(id->channel->fid, &cmd, sizeof cmd); + if (ret) + return ERR(-ret); + + return 0; +} + +int rdma_disconnect(struct rdma_cm_id *id) +{ + struct ucma_abi_disconnect cmd; + struct cma_id_private *id_priv; + int ret; + + switch (id->verbs->device->transport_type) { + case IBV_TRANSPORT_IB: + ret = ucma_modify_qp_err(id); + break; + case IBV_TRANSPORT_IWARP: + ret = ucma_modify_qp_sqd(id); + break; + default: + ret = ERR(EINVAL); + } + if (ret) + return ret; + + CMA_INIT_CMD(&cmd, sizeof cmd, DISCONNECT); + id_priv = container_of(id, struct cma_id_private, id); + cmd.id = id_priv->handle; + + ret = ucma_disconnect(id->channel->fid, &cmd, sizeof cmd); + if (ret) + return ERR(-ret); + + return ucma_complete(id); +} + +static int rdma_join_multicast2(struct rdma_cm_id *id, struct sockaddr *addr, + socklen_t addrlen, void *context) +{ + struct ucma_abi_create_id_resp resp; + struct cma_id_private *id_priv; + struct cma_multicast *mc, **pos; + int ret; + + id_priv = container_of(id, struct cma_id_private, id); + mc = calloc(1, sizeof *mc); + if (!mc) + return ERR(ENOMEM); + + mc->context = context; + mc->id_priv = id_priv; + memcpy(&mc->addr, addr, addrlen); + if (pthread_cond_init(&mc->cond, NULL)) { + ret = -1; + goto err1; + } + + pthread_mutex_lock(&id_priv->mut); + mc->next = id_priv->mc_list; + id_priv->mc_list = mc; + pthread_mutex_unlock(&id_priv->mut); + + if (af_ib_support) { + struct ucma_abi_join_mcast cmd; + + CMA_INIT_CMD_RESP(&cmd, sizeof cmd, JOIN_MCAST, &resp, sizeof resp); + cmd.id = id_priv->handle; + memcpy(&cmd.addr, addr, addrlen); + cmd.addr_size = addrlen; + cmd.uid = (uintptr_t) mc; + cmd.reserved = 0; + + ret = ucma_join_mcast(id->channel->fid, &cmd, sizeof cmd, + &resp, sizeof resp); + if (ret) { + return ERR(-ret); + goto err2; + } + } else { + struct ucma_abi_join_ip_mcast cmd; + + CMA_INIT_CMD_RESP(&cmd, sizeof cmd, JOIN_IP_MCAST, &resp, sizeof resp); + cmd.id = id_priv->handle; + memcpy(&cmd.addr, addr, addrlen); + cmd.uid = (uintptr_t) mc; + + ret = ucma_join_ip_mcast(id->channel->fid, &cmd, sizeof cmd, + &resp, sizeof resp); + if (ret) { + return ERR(-ret); + goto err2; + } + } + + VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp); + + mc->handle 
= resp.id; + return ucma_complete(id); + +err2: + pthread_mutex_lock(&id_priv->mut); + for (pos = &id_priv->mc_list; *pos != mc; pos = &(*pos)->next) + ; + *pos = mc->next; + pthread_mutex_unlock(&id_priv->mut); +err1: + free(mc); + return ret; +} + +int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr, + void *context) +{ + int addrlen; + + addrlen = rdma_addrlen(addr); + if (!addrlen) + return ERR(EINVAL); + + return rdma_join_multicast2(id, addr, addrlen, context); +} + +int rdma_leave_multicast(struct rdma_cm_id *id, struct sockaddr *addr) +{ + struct ucma_abi_destroy_id cmd; + struct ucma_abi_destroy_id_resp resp; + struct cma_id_private *id_priv; + struct cma_multicast *mc, **pos; + int ret, addrlen; + + addrlen = rdma_addrlen(addr); + if (!addrlen) + return ERR(EINVAL); + + id_priv = container_of(id, struct cma_id_private, id); + pthread_mutex_lock(&id_priv->mut); + for (pos = &id_priv->mc_list; *pos; pos = &(*pos)->next) + if (!memcmp(&(*pos)->addr, addr, addrlen)) + break; + + mc = *pos; + if (*pos) + *pos = mc->next; + pthread_mutex_unlock(&id_priv->mut); + if (!mc) + return ERR(EADDRNOTAVAIL); + + if (id->qp) + ibv_detach_mcast(id->qp, &mc->mgid, mc->mlid); + + CMA_INIT_CMD_RESP(&cmd, sizeof cmd, LEAVE_MCAST, &resp, sizeof resp); + cmd.id = mc->handle; + + ret = ucma_leave_mcast(id->channel->fid, &cmd, sizeof cmd, &resp, sizeof resp); + if (ret) { + return ERR(-ret); + goto free; + } + + VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp); + + pthread_mutex_lock(&id_priv->mut); + while (mc->events_completed < resp.events_reported) + pthread_cond_wait(&mc->cond, &id_priv->mut); + pthread_mutex_unlock(&id_priv->mut); + + ret = 0; +free: + free(mc); + return ret; +} + +static void ucma_complete_event(struct cma_id_private *id_priv) +{ + pthread_mutex_lock(&id_priv->mut); + id_priv->events_completed++; + pthread_cond_signal(&id_priv->cond); + pthread_mutex_unlock(&id_priv->mut); +} + +static void ucma_complete_mc_event(struct cma_multicast *mc) +{ + pthread_mutex_lock(&mc->id_priv->mut); + mc->events_completed++; + pthread_cond_signal(&mc->cond); + mc->id_priv->events_completed++; + pthread_cond_signal(&mc->id_priv->cond); + pthread_mutex_unlock(&mc->id_priv->mut); +} + +int rdma_ack_cm_event(struct rdma_cm_event *event) +{ + struct cma_event *evt; + + if (!event) + return ERR(EINVAL); + + evt = container_of(event, struct cma_event, event); + + if (evt->mc) + ucma_complete_mc_event(evt->mc); + else + ucma_complete_event(evt->id_priv); + free(evt); + return 0; +} + +static void ucma_process_addr_resolved(struct cma_event *evt) +{ + if (af_ib_support) { + evt->event.status = ucma_query_addr(&evt->id_priv->id); + if (!evt->event.status && + evt->id_priv->id.verbs->device->transport_type == IBV_TRANSPORT_IB) + evt->event.status = ucma_query_gid(&evt->id_priv->id); + } else { + evt->event.status = _ucma_query_route(&evt->id_priv->id); + } + + if (evt->event.status) + evt->event.event = RDMA_CM_EVENT_ADDR_ERROR; +} + +static void ucma_process_route_resolved(struct cma_event *evt) +{ + if (evt->id_priv->id.verbs->device->transport_type != IBV_TRANSPORT_IB) + return; + + if (af_ib_support) + evt->event.status = ucma_query_path(&evt->id_priv->id); + else + evt->event.status = _ucma_query_route(&evt->id_priv->id); + + if (evt->event.status) + evt->event.event = RDMA_CM_EVENT_ROUTE_ERROR; +} + +static int ucma_query_req_info(struct rdma_cm_id *id) +{ + int ret; + + if (!af_ib_support) + return _ucma_query_route(id); + + ret = ucma_query_addr(id); + if (ret) + return ret; + + ret = 
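/*
 * Illustrative sketch, not part of the patch: joining and leaving a
 * multicast group through the paths above. The id is assumed to be a UD id
 * whose group address was already resolved with rdma_resolve_addr(); in
 * synchronous mode rdma_join_multicast() returns only after the
 * RDMA_CM_EVENT_MULTICAST_JOIN event has been processed.
 */
#include <rdma/rdma_cma.h>

static int join_group(struct rdma_cm_id *id, struct sockaddr *mc_addr)
{
	int ret;

	ret = rdma_join_multicast(id, mc_addr, NULL);
	if (ret)
		return ret;

	/* ... post receives and send to the group here ... */

	return rdma_leave_multicast(id, mc_addr);
}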
ucma_query_gid(id); + if (ret) + return ret; + + ret = ucma_query_path(id); + if (ret) + return ret; + + return 0; +} + +static int ucma_process_conn_req(struct cma_event *evt, + uint32_t handle) +{ + struct cma_id_private *id_priv; + int ret; + + id_priv = ucma_alloc_id(evt->id_priv->id.channel, + evt->id_priv->id.context, evt->id_priv->id.ps, + evt->id_priv->id.qp_type); + if (!id_priv) { + ucma_destroy_kern_id(evt->id_priv->id.channel->fid, handle); + ret = ERR(ENOMEM); + goto err1; + } + + evt->event.listen_id = &evt->id_priv->id; + evt->event.id = &id_priv->id; + id_priv->handle = handle; + ucma_insert_id(id_priv); + id_priv->initiator_depth = evt->event.param.conn.initiator_depth; + id_priv->responder_resources = evt->event.param.conn.responder_resources; + + if (evt->id_priv->sync) { + ret = rdma_migrate_id(&id_priv->id, NULL); + if (ret) + goto err2; + } + + ret = ucma_query_req_info(&id_priv->id); + if (ret) + goto err2; + + return 0; + +err2: + rdma_destroy_id(&id_priv->id); +err1: + ucma_complete_event(evt->id_priv); + return ret; +} + +static int ucma_process_conn_resp(struct cma_id_private *id_priv) +{ + struct ucma_abi_accept cmd; + int ret; + + ret = ucma_modify_qp_rtr(&id_priv->id, RDMA_MAX_RESP_RES); + if (ret) + goto err; + + ret = ucma_modify_qp_rts(&id_priv->id, RDMA_MAX_INIT_DEPTH); + if (ret) + goto err; + + CMA_INIT_CMD(&cmd, sizeof cmd, ACCEPT); + cmd.id = id_priv->handle; + + ret = ucma_accept(id_priv->id.channel->fid, &cmd, sizeof cmd); + if (ret) { + return ERR(-ret); + goto err; + } + + return 0; +err: + ucma_modify_qp_err(&id_priv->id); + return ret; +} + +static int ucma_process_join(struct cma_event *evt) +{ + evt->mc->mgid = evt->event.param.ud.ah_attr.grh.dgid; + evt->mc->mlid = evt->event.param.ud.ah_attr.dlid; + + if (!evt->id_priv->id.qp) + return 0; + + return rdma_seterrno(ibv_attach_mcast(evt->id_priv->id.qp, + &evt->mc->mgid, evt->mc->mlid)); +} + +static void ucma_copy_conn_event(struct cma_event *event, + struct ucma_abi_conn_param *src) +{ + struct rdma_conn_param *dst = &event->event.param.conn; + + dst->private_data_len = src->private_data_len; + if (src->private_data_len) { + dst->private_data = &event->private_data; + memcpy(&event->private_data, src->private_data, + src->private_data_len); + } + + dst->responder_resources = src->responder_resources; + dst->initiator_depth = src->initiator_depth; + dst->flow_control = src->flow_control; + dst->retry_count = src->retry_count; + dst->rnr_retry_count = src->rnr_retry_count; + dst->srq = src->srq; + dst->qp_num = src->qp_num; +} + +static void ucma_copy_ud_event(struct cma_event *event, + struct ucma_abi_ud_param *src) +{ + struct rdma_ud_param *dst = &event->event.param.ud; + + dst->private_data_len = src->private_data_len; + if (src->private_data_len) { + dst->private_data = &event->private_data; + memcpy(&event->private_data, src->private_data, + src->private_data_len); + } + + ibv_copy_ah_attr_from_kern(&dst->ah_attr, &src->ah_attr); + dst->qp_num = src->qp_num; + dst->qkey = src->qkey; +} + +int rdma_get_cm_event(struct rdma_event_channel *channel, + struct rdma_cm_event **event) +{ + struct ucma_abi_event_resp resp; + struct ucma_abi_get_event cmd; + struct cma_event *evt; + int ret; + + ret = _ucma_init(); + if (ret) + return ret; + + if (!event) + return ERR(EINVAL); + + evt = malloc(sizeof *evt); + if (!evt) + return ERR(ENOMEM); + +retry: + memset(evt, 0, sizeof *evt); + CMA_INIT_CMD_RESP(&cmd, sizeof cmd, GET_EVENT, &resp, sizeof resp); + ret = ucma_get_event(channel->fid, &cmd, sizeof 
cmd, &resp, sizeof resp); + if (ret) { + free(evt); + return ERR(-ret); + } + + VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp); + + evt->event.event = resp.event; + /* + * We should have a non-zero uid, except for connection requests. + * But a bug in older kernels can report a uid 0. Work-around this + * issue by looking up the cma_id based on the kernel's id when the + * uid is 0 and we're processing a connection established event. + * In all other cases, if the uid is 0, we discard the event, like + * the kernel should have done. + */ + if (resp.uid) { + evt->id_priv = (void *) (uintptr_t) resp.uid; + } else { + evt->id_priv = ucma_lookup_id(resp.id); + if (!evt->id_priv) { + syslog(LOG_WARNING, "rdmacm: warning: discarding unmatched " + "event - rdma_destroy_id may hang.\n"); + goto retry; + } + if (resp.event != RDMA_CM_EVENT_ESTABLISHED) { + ucma_complete_event(evt->id_priv); + goto retry; + } + } + evt->event.id = &evt->id_priv->id; + evt->event.status = resp.status; + + switch (resp.event) { + case RDMA_CM_EVENT_ADDR_RESOLVED: + ucma_process_addr_resolved(evt); + break; + case RDMA_CM_EVENT_ROUTE_RESOLVED: + ucma_process_route_resolved(evt); + break; + case RDMA_CM_EVENT_CONNECT_REQUEST: + evt->id_priv = (void *) (uintptr_t) resp.uid; + if (ucma_is_ud_qp(evt->id_priv->id.qp_type)) + ucma_copy_ud_event(evt, &resp.param.ud); + else + ucma_copy_conn_event(evt, &resp.param.conn); + + ret = ucma_process_conn_req(evt, resp.id); + if (ret) + goto retry; + break; + case RDMA_CM_EVENT_CONNECT_RESPONSE: + ucma_copy_conn_event(evt, &resp.param.conn); + evt->event.status = ucma_process_conn_resp(evt->id_priv); + if (!evt->event.status) + evt->event.event = RDMA_CM_EVENT_ESTABLISHED; + else { + evt->event.event = RDMA_CM_EVENT_CONNECT_ERROR; + evt->id_priv->connect_error = 1; + } + break; + case RDMA_CM_EVENT_ESTABLISHED: + if (ucma_is_ud_qp(evt->id_priv->id.qp_type)) { + ucma_copy_ud_event(evt, &resp.param.ud); + break; + } + + ucma_copy_conn_event(evt, &resp.param.conn); + break; + case RDMA_CM_EVENT_REJECTED: + if (evt->id_priv->connect_error) { + ucma_complete_event(evt->id_priv); + goto retry; + } + ucma_copy_conn_event(evt, &resp.param.conn); + ucma_modify_qp_err(evt->event.id); + break; + case RDMA_CM_EVENT_DISCONNECTED: + if (evt->id_priv->connect_error) { + ucma_complete_event(evt->id_priv); + goto retry; + } + ucma_copy_conn_event(evt, &resp.param.conn); + break; + case RDMA_CM_EVENT_MULTICAST_JOIN: + evt->mc = (void *) (uintptr_t) resp.uid; + evt->id_priv = evt->mc->id_priv; + evt->event.id = &evt->id_priv->id; + ucma_copy_ud_event(evt, &resp.param.ud); + evt->event.param.ud.private_data = evt->mc->context; + evt->event.status = ucma_process_join(evt); + if (evt->event.status) + evt->event.event = RDMA_CM_EVENT_MULTICAST_ERROR; + break; + case RDMA_CM_EVENT_MULTICAST_ERROR: + evt->mc = (void *) (uintptr_t) resp.uid; + evt->id_priv = evt->mc->id_priv; + evt->event.id = &evt->id_priv->id; + evt->event.param.ud.private_data = evt->mc->context; + break; + default: + evt->id_priv = (void *) (uintptr_t) resp.uid; + evt->event.id = &evt->id_priv->id; + evt->event.status = resp.status; + if (ucma_is_ud_qp(evt->id_priv->id.qp_type)) + ucma_copy_ud_event(evt, &resp.param.ud); + else + ucma_copy_conn_event(evt, &resp.param.conn); + break; + } + + *event = &evt->event; + return 0; +} + +const char *rdma_event_str(enum rdma_cm_event_type event) +{ + switch (event) { + case RDMA_CM_EVENT_ADDR_RESOLVED: + return "RDMA_CM_EVENT_ADDR_RESOLVED"; + case RDMA_CM_EVENT_ADDR_ERROR: + return 
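/*
 * Illustrative sketch, not part of the patch: a minimal asynchronous loop
 * over the event dispatcher above. Every event returned by
 * rdma_get_cm_event() must be released with rdma_ack_cm_event(); the
 * events_completed counters only advance on ack, and calls such as
 * rdma_leave_multicast() or rdma_migrate_id() wait for them to catch up.
 */
#include <stdio.h>
#include <rdma/rdma_cma.h>

static int drain_events(struct rdma_event_channel *channel)
{
	struct rdma_cm_event *event;
	int done = 0;

	while (!done && !rdma_get_cm_event(channel, &event)) {
		switch (event->event) {
		case RDMA_CM_EVENT_ESTABLISHED:
			printf("connected\n");
			break;
		case RDMA_CM_EVENT_DISCONNECTED:
			done = 1;
			break;
		default:
			printf("%s (status %d)\n",
			       rdma_event_str(event->event), event->status);
			break;
		}
		rdma_ack_cm_event(event);
	}
	return done ? 0 : -1;
}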
"RDMA_CM_EVENT_ADDR_ERROR"; + case RDMA_CM_EVENT_ROUTE_RESOLVED: + return "RDMA_CM_EVENT_ROUTE_RESOLVED"; + case RDMA_CM_EVENT_ROUTE_ERROR: + return "RDMA_CM_EVENT_ROUTE_ERROR"; + case RDMA_CM_EVENT_CONNECT_REQUEST: + return "RDMA_CM_EVENT_CONNECT_REQUEST"; + case RDMA_CM_EVENT_CONNECT_RESPONSE: + return "RDMA_CM_EVENT_CONNECT_RESPONSE"; + case RDMA_CM_EVENT_CONNECT_ERROR: + return "RDMA_CM_EVENT_CONNECT_ERROR"; + case RDMA_CM_EVENT_UNREACHABLE: + return "RDMA_CM_EVENT_UNREACHABLE"; + case RDMA_CM_EVENT_REJECTED: + return "RDMA_CM_EVENT_REJECTED"; + case RDMA_CM_EVENT_ESTABLISHED: + return "RDMA_CM_EVENT_ESTABLISHED"; + case RDMA_CM_EVENT_DISCONNECTED: + return "RDMA_CM_EVENT_DISCONNECTED"; + case RDMA_CM_EVENT_DEVICE_REMOVAL: + return "RDMA_CM_EVENT_DEVICE_REMOVAL"; + case RDMA_CM_EVENT_MULTICAST_JOIN: + return "RDMA_CM_EVENT_MULTICAST_JOIN"; + case RDMA_CM_EVENT_MULTICAST_ERROR: + return "RDMA_CM_EVENT_MULTICAST_ERROR"; + case RDMA_CM_EVENT_ADDR_CHANGE: + return "RDMA_CM_EVENT_ADDR_CHANGE"; + case RDMA_CM_EVENT_TIMEWAIT_EXIT: + return "RDMA_CM_EVENT_TIMEWAIT_EXIT"; + default: + return "UNKNOWN EVENT"; + } +} + +int rdma_set_option(struct rdma_cm_id *id, int level, int optname, + void *optval, size_t optlen) +{ + struct ucma_abi_set_option cmd; + struct cma_id_private *id_priv; + int ret; + + CMA_INIT_CMD(&cmd, sizeof cmd, SET_OPTION); + id_priv = container_of(id, struct cma_id_private, id); + cmd.id = id_priv->handle; + cmd.optval = (uintptr_t) optval; + cmd.level = level; + cmd.optname = optname; + cmd.optlen = optlen; + + ret = ucma_set_option(id->channel->fid, &cmd, sizeof cmd); + if (ret) + return ERR(-ret); + + return 0; +} + +int rdma_migrate_id(struct rdma_cm_id *id, struct rdma_event_channel *channel) +{ + struct ucma_abi_migrate_resp resp; + struct ucma_abi_migrate_id cmd; + struct cma_id_private *id_priv; + int ret, sync; + + id_priv = container_of(id, struct cma_id_private, id); + if (id_priv->sync && !channel) + return ERR(EINVAL); + + if ((sync = (channel == NULL))) { + channel = rdma_create_event_channel(); + if (!channel) + return -1; + } + + CMA_INIT_CMD_RESP(&cmd, sizeof cmd, MIGRATE_ID, &resp, sizeof resp); + cmd.id = id_priv->handle; + cmd.fd = id->channel->fd; + + ret = ucma_migrate_id(channel->fid, &cmd, sizeof cmd, &resp, sizeof resp); + if (ret) { + if (sync) + rdma_destroy_event_channel(channel); + return ERR(-ret); + } + + VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp); + + if (id_priv->sync) { + if (id->event) { + rdma_ack_cm_event(id->event); + id->event = NULL; + } + rdma_destroy_event_channel(id->channel); + } + + /* + * Eventually if we want to support migrating channels while events are + * being processed on the current channel, we need to block here while + * there are any outstanding events on the current channel for this id + * to prevent the user from processing events for this id on the old + * channel after this call returns. 
+ */ + pthread_mutex_lock(&id_priv->mut); + id_priv->sync = sync; + id->channel = channel; + while (id_priv->events_completed < resp.events_reported) + pthread_cond_wait(&id_priv->cond, &id_priv->mut); + pthread_mutex_unlock(&id_priv->mut); + + return 0; +} + +static int ucma_passive_ep(struct rdma_cm_id *id, struct rdma_addrinfo *res, + struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr) +{ + struct cma_id_private *id_priv; + int ret; + + if (af_ib_support) + ret = rdma_bind_addr2(id, res->ai_src_addr, res->ai_src_len); + else + ret = rdma_bind_addr(id, res->ai_src_addr); + if (ret) + return ret; + + id_priv = container_of(id, struct cma_id_private, id); + if (pd) + id->pd = pd; + + if (qp_init_attr) { + id_priv->qp_init_attr = malloc(sizeof *qp_init_attr); + if (!id_priv->qp_init_attr) + return ERR(ENOMEM); + + *id_priv->qp_init_attr = *qp_init_attr; + id_priv->qp_init_attr->qp_type = res->ai_qp_type; + } + + return 0; +} + +int rdma_create_ep(struct rdma_cm_id **id, struct rdma_addrinfo *res, + struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr) +{ + struct rdma_cm_id *cm_id; + struct cma_id_private *id_priv; + int ret; + + ret = rdma_create_id2(NULL, &cm_id, NULL, res->ai_port_space, res->ai_qp_type); + if (ret) + return ret; + + if (res->ai_flags & RAI_PASSIVE) { + ret = ucma_passive_ep(cm_id, res, pd, qp_init_attr); + if (ret) + goto err; + goto out; + } + + if (af_ib_support) + ret = rdma_resolve_addr2(cm_id, res->ai_src_addr, res->ai_src_len, + res->ai_dst_addr, res->ai_dst_len, 2000); + else + ret = rdma_resolve_addr(cm_id, res->ai_src_addr, res->ai_dst_addr, 2000); + if (ret) + goto err; + + if (res->ai_route_len) { + ret = rdma_set_option(cm_id, RDMA_OPTION_IB, RDMA_OPTION_IB_PATH, + res->ai_route, res->ai_route_len); + if (!ret) + ret = ucma_complete(cm_id); + } else { + ret = rdma_resolve_route(cm_id, 2000); + } + if (ret) + goto err; + + if (qp_init_attr) { + qp_init_attr->qp_type = res->ai_qp_type; + ret = rdma_create_qp(cm_id, pd, qp_init_attr); + if (ret) + goto err; + } + + if (res->ai_connect_len) { + id_priv = container_of(cm_id, struct cma_id_private, id); + id_priv->connect = malloc(res->ai_connect_len); + if (!id_priv->connect) { + ret = ERR(ENOMEM); + goto err; + } + memcpy(id_priv->connect, res->ai_connect, res->ai_connect_len); + id_priv->connect_len = res->ai_connect_len; + } + +out: + *id = cm_id; + return 0; + +err: + rdma_destroy_ep(cm_id); + return ret; +} + +void rdma_destroy_ep(struct rdma_cm_id *id) +{ + struct cma_id_private *id_priv; + + if (id->qp) + rdma_destroy_qp(id); + + if (id->srq) + rdma_destroy_srq(id); + + id_priv = container_of(id, struct cma_id_private, id); + if (id_priv->qp_init_attr) + free(id_priv->qp_init_attr); + + rdma_destroy_id(id); +} + +int ucma_max_qpsize(struct rdma_cm_id *id) +{ + struct cma_id_private *id_priv; + int i, max_size = 0; + + id_priv = container_of(id, struct cma_id_private, id); + if (id && id_priv->cma_dev) { + max_size = id_priv->cma_dev->max_qpsize; + } else { + _ucma_init(); + for (i = 0; i < cma_dev_cnt; i++) { + if (!max_size || max_size > cma_dev_array[i].max_qpsize) + max_size = cma_dev_array[i].max_qpsize; + } + } + return max_size; +} + +uint16_t ucma_get_port(struct sockaddr *addr) +{ + switch (addr->sa_family) { + case AF_INET: + return ((struct sockaddr_in *) addr)->sin_port; + case AF_INET6: + return ((struct sockaddr_in6 *) addr)->sin6_port; + case AF_IB: + return htons((uint16_t) ntohll(((struct sockaddr_ib *) addr)->sib_sid)); + default: + return 0; + } +} + +uint16_t 
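/*
 * Illustrative sketch, not part of the patch: the higher-level endpoint path
 * implemented by rdma_create_ep() above, which combines id creation,
 * address/route resolution (or bind when RAI_PASSIVE is set) and optional QP
 * creation. The node name, service string and QP sizes are placeholders.
 */
#include <string.h>
#include <rdma/rdma_cma.h>

static int open_active_ep(struct rdma_cm_id **id)
{
	struct rdma_addrinfo hints, *res;
	struct ibv_qp_init_attr attr;
	int ret;

	memset(&hints, 0, sizeof hints);
	hints.ai_port_space = RDMA_PS_TCP;

	ret = rdma_getaddrinfo("server.example.com", "7471", &hints, &res);
	if (ret)
		return ret;

	memset(&attr, 0, sizeof attr);
	attr.cap.max_send_wr = attr.cap.max_recv_wr = 8;
	attr.cap.max_send_sge = attr.cap.max_recv_sge = 1;

	ret = rdma_create_ep(id, res, NULL, &attr);
	rdma_freeaddrinfo(res);
	return ret;
}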
rdma_get_src_port(struct rdma_cm_id *id) +{ + return ucma_get_port(&id->route.addr.src_addr); +} + +uint16_t rdma_get_dst_port(struct rdma_cm_id *id) +{ + return ucma_get_port(&id->route.addr.dst_addr); +} + diff --git a/prov/rdmacm/src/cma.h b/prov/rdmacm/src/cma.h new file mode 100644 index 00000000000..97c7cfdf998 --- /dev/null +++ b/prov/rdmacm/src/cma.h @@ -0,0 +1,155 @@ +/* + * Copyright (c) 2005-2012 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#if !defined(CMA_H) +#define CMA_H + +#if HAVE_CONFIG_H +# include <config.h> +#endif /* HAVE_CONFIG_H */ + +#include <stdlib.h> +#include <errno.h> +#include <endian.h> +#include <byteswap.h> +#include <semaphore.h> + +#include <rdma/rdma_cma.h> +#include <infiniband/ib.h> +#include <rdma/fabric.h> +#include <fi.h> + + +/* + * Fast synchronization for low contention locking. 
+ */ +#if DEFINE_ATOMICS +#define fastlock_t pthread_mutex_t +#define fastlock_init(lock) pthread_mutex_init(lock, NULL) +#define fastlock_destroy(lock) pthread_mutex_destroy(lock) +#define fastlock_acquire(lock) pthread_mutex_lock(lock) +#define fastlock_release(lock) pthread_mutex_unlock(lock) + +typedef struct { pthread_mutex_t mut; int val; } atomic_t; +static inline int atomic_inc(atomic_t *atomic) +{ + int v; + + pthread_mutex_lock(&atomic->mut); + v = ++(atomic->val); + pthread_mutex_unlock(&atomic->mut); + return v; +} +static inline int atomic_dec(atomic_t *atomic) +{ + int v; + + pthread_mutex_lock(&atomic->mut); + v = --(atomic->val); + pthread_mutex_unlock(&atomic->mut); + return v; +} +static inline void atomic_init(atomic_t *atomic) +{ + pthread_mutex_init(&atomic->mut, NULL); + atomic->val = 0; +} +#else +typedef struct { + sem_t sem; + volatile int cnt; +} fastlock_t; +static inline void fastlock_init(fastlock_t *lock) +{ + sem_init(&lock->sem, 0, 0); + lock->cnt = 0; +} +static inline void fastlock_destroy(fastlock_t *lock) +{ + sem_destroy(&lock->sem); +} +static inline void fastlock_acquire(fastlock_t *lock) +{ + if (__sync_add_and_fetch(&lock->cnt, 1) > 1) + sem_wait(&lock->sem); +} +static inline void fastlock_release(fastlock_t *lock) +{ + if (__sync_sub_and_fetch(&lock->cnt, 1) > 0) + sem_post(&lock->sem); +} + +typedef struct { volatile int val; } atomic_t; +#define atomic_inc(v) (__sync_add_and_fetch(&(v)->val, 1)) +#define atomic_dec(v) (__sync_sub_and_fetch(&(v)->val, 1)) +#define atomic_init(v) ((v)->val = 0) +#endif /* DEFINE_ATOMICS */ +#define atomic_get(v) ((v)->val) +#define atomic_set(v, s) ((v)->val = s) + +uint16_t ucma_get_port(struct sockaddr *addr); +void ucma_set_sid(enum rdma_port_space ps, struct sockaddr *addr, + struct sockaddr_ib *sib); +int ucma_max_qpsize(struct rdma_cm_id *id); +int ucma_complete(struct rdma_cm_id *id); + +static inline int ERR(int err) +{ + errno = err; + return -1; +} + +int ucma_init(); +extern int af_ib_support; + +#define RAI_ROUTEONLY 0x01000000 + +void ucma_ib_init(); +void ucma_ib_cleanup(); +void ucma_ib_resolve(struct rdma_addrinfo **rai, struct rdma_addrinfo *hints); + +struct ib_connect_hdr { + uint8_t cma_version; + uint8_t ip_version; /* IP version: 7:4 */ + uint16_t port; + uint32_t src_addr[4]; + uint32_t dst_addr[4]; +#define cma_src_ip4 src_addr[3] +#define cma_src_ip6 src_addr[0] +#define cma_dst_ip4 dst_addr[3] +#define cma_dst_ip6 dst_addr[0] +}; + +#define RS_CONF_DIR RDMA_CONF_DIR "/rsocket" + +#endif /* CMA_H */ diff --git a/prov/rdmacm/src/indexer.c b/prov/rdmacm/src/indexer.c new file mode 100644 index 00000000000..c8e8bce53ce --- /dev/null +++ b/prov/rdmacm/src/indexer.c @@ -0,0 +1,166 @@ +/* + * Copyright (c) 2011 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
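/*
 * Illustrative sketch, not part of the patch: using the fastlock above. The
 * uncontended acquire/release path is a single atomic increment/decrement;
 * only a contended acquire falls back to sem_wait(), which is why the
 * semaphore is initialized to 0. The counter and its lock are example names,
 * and cma.h above is assumed to be included.
 */
#include "cma.h"

static fastlock_t counter_lock;
static int counter;

static void counter_setup(void)
{
	fastlock_init(&counter_lock);
}

static int counter_bump(void)
{
	int v;

	fastlock_acquire(&counter_lock);
	v = ++counter;
	fastlock_release(&counter_lock);
	return v;
}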
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#if HAVE_CONFIG_H +# include <config.h> +#endif /* HAVE_CONFIG_H */ + +#include <sys/types.h> +#include <stdlib.h> + +#include "indexer.h" +#include "cma.h" + +/* + * Indexer - to find a structure given an index + * + * We store pointers using a double lookup and return an index to the + * user which is then used to retrieve the pointer. The upper bits of + * the index are itself an index into an array of memory allocations. + * The lower bits specify the offset into the allocated memory where + * the pointer is stored. + * + * This allows us to adjust the number of pointers stored by the index + * list without taking a lock during data lookups. + */ + +static int idx_grow(struct indexer *idx) +{ + union idx_entry *entry; + int i, start_index; + + if (idx->size >= IDX_ARRAY_SIZE) + goto nomem; + + idx->array[idx->size] = calloc(IDX_ENTRY_SIZE, sizeof(union idx_entry)); + if (!idx->array[idx->size]) + goto nomem; + + entry = idx->array[idx->size]; + start_index = idx->size << IDX_ENTRY_BITS; + entry[IDX_ENTRY_SIZE - 1].next = idx->free_list; + + for (i = IDX_ENTRY_SIZE - 2; i >= 0; i--) + entry[i].next = start_index + i + 1; + + /* Index 0 is reserved */ + if (start_index == 0) + start_index++; + idx->free_list = start_index; + idx->size++; + return start_index; + +nomem: + errno = ENOMEM; + return -1; +} + +int idx_insert(struct indexer *idx, void *item) +{ + union idx_entry *entry; + int index; + + if ((index = idx->free_list) == 0) { + if ((index = idx_grow(idx)) <= 0) + return index; + } + + entry = idx->array[idx_array_index(index)]; + idx->free_list = entry[idx_entry_index(index)].next; + entry[idx_entry_index(index)].item = item; + return index; +} + +void *idx_remove(struct indexer *idx, int index) +{ + union idx_entry *entry; + void *item; + + entry = idx->array[idx_array_index(index)]; + item = entry[idx_entry_index(index)].item; + entry[idx_entry_index(index)].next = idx->free_list; + idx->free_list = index; + return item; +} + +void idx_replace(struct indexer *idx, int index, void *item) +{ + union idx_entry *entry; + + entry = idx->array[idx_array_index(index)]; + entry[idx_entry_index(index)].item = item; +} + + +static int idm_grow(struct index_map *idm, int index) +{ + idm->array[idx_array_index(index)] = calloc(IDX_ENTRY_SIZE, sizeof(void *)); + if (!idm->array[idx_array_index(index)]) + goto nomem; + + return index; + +nomem: + errno = ENOMEM; + return -1; +} + +int idm_set(struct index_map *idm, int index, void *item) +{ + void **entry; + + if (index > IDX_MAX_INDEX) { + errno = ENOMEM; + return -1; + } + + if (!idm->array[idx_array_index(index)]) { + if (idm_grow(idm, index) < 0) + return -1; + } + + entry = idm->array[idx_array_index(index)]; + entry[idx_entry_index(index)] = item; + return index; +} + +void *idm_clear(struct index_map *idm, int 
index) +{ + void **entry; + void *item; + + entry = idm->array[idx_array_index(index)]; + item = entry[idx_entry_index(index)]; + entry[idx_entry_index(index)] = NULL; + return item; +} diff --git a/prov/rdmacm/src/indexer.h b/prov/rdmacm/src/indexer.h new file mode 100644 index 00000000000..0c5f3882673 --- /dev/null +++ b/prov/rdmacm/src/indexer.h @@ -0,0 +1,144 @@ +/* + * Copyright (c) 2011 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#if !defined(INDEXER_H) +#define INDEXER_H + +#if HAVE_CONFIG_H +# include <config.h> +#endif /* HAVE_CONFIG_H */ + +#include <sys/types.h> + +/* + * Indexer - to find a structure given an index. Synchronization + * must be provided by the caller. Caller must initialize the + * indexer by setting free_list and size to 0. + */ + +union idx_entry { + void *item; + int next; +}; + +#define IDX_INDEX_BITS 16 +#define IDX_ENTRY_BITS 10 +#define IDX_ENTRY_SIZE (1 << IDX_ENTRY_BITS) +#define IDX_ARRAY_SIZE (1 << (IDX_INDEX_BITS - IDX_ENTRY_BITS)) +#define IDX_MAX_INDEX ((1 << IDX_INDEX_BITS) - 1) + +struct indexer +{ + union idx_entry *array[IDX_ARRAY_SIZE]; + int free_list; + int size; +}; + +#define idx_array_index(index) (index >> IDX_ENTRY_BITS) +#define idx_entry_index(index) (index & (IDX_ENTRY_SIZE - 1)) + +int idx_insert(struct indexer *idx, void *item); +void *idx_remove(struct indexer *idx, int index); +void idx_replace(struct indexer *idx, int index, void *item); + +static inline void *idx_at(struct indexer *idx, int index) +{ + return (idx->array[idx_array_index(index)] + idx_entry_index(index))->item; +} + +/* + * Index map - associates a structure with an index. Synchronization + * must be provided by the caller. Caller must initialize the + * index map by setting it to 0. 
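/*
 * Illustrative sketch, not part of the patch: using the indexer above. A
 * zero-initialized struct indexer is ready to use; idx_insert() returns a
 * small integer whose upper bits select one calloc'd block in idx->array and
 * whose low IDX_ENTRY_BITS select the slot inside it, so idx_at() lookups
 * need no lock. The item and names here are examples only.
 */
#include <stdio.h>
#include "indexer.h"

static struct indexer idx;	/* free_list and size start at 0 */

static void indexer_demo(void)
{
	char *item = "example";
	int index;

	index = idx_insert(&idx, item);
	if (index < 0)
		return;

	printf("block %d, slot %d -> %s\n", idx_array_index(index),
	       idx_entry_index(index), (char *) idx_at(&idx, index));

	idx_remove(&idx, index);
}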
+ */ + +struct index_map +{ + void **array[IDX_ARRAY_SIZE]; +}; + +int idm_set(struct index_map *idm, int index, void *item); +void *idm_clear(struct index_map *idm, int index); + +static inline void *idm_at(struct index_map *idm, int index) +{ + void **entry; + entry = idm->array[idx_array_index(index)]; + return entry[idx_entry_index(index)]; +} + +static inline void *idm_lookup(struct index_map *idm, int index) +{ + return ((index <= IDX_MAX_INDEX) && idm->array[idx_array_index(index)]) ? + idm_at(idm, index) : NULL; +} + +typedef struct _dlist_entry { + struct _dlist_entry *next; + struct _dlist_entry *prev; +} dlist_entry; + +static inline void dlist_init(dlist_entry *head) +{ + head->next = head; + head->prev = head; +} + +static inline int dlist_empty(dlist_entry *head) +{ + return head->next == head; +} + +static inline void dlist_insert_after(dlist_entry *item, dlist_entry *head) +{ + item->next = head->next; + item->prev = head; + head->next->prev = item; + head->next = item; +} + +static inline void dlist_insert_before(dlist_entry *item, dlist_entry *head) +{ + dlist_insert_after(item, head->prev); +} + +#define dlist_insert_head dlist_insert_after +#define dlist_insert_tail dlist_insert_before + +static inline void dlist_remove(dlist_entry *item) +{ + item->prev->next = item->next; + item->next->prev = item->prev; +} + +#endif /* INDEXER_H */ diff --git a/prov/rdmacm/src/preload.c b/prov/rdmacm/src/preload.c new file mode 100644 index 00000000000..fb2149bf467 --- /dev/null +++ b/prov/rdmacm/src/preload.c @@ -0,0 +1,1057 @@ +/* + * Copyright (c) 2011-2012 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +#if HAVE_CONFIG_H +# include <config.h> +#endif /* HAVE_CONFIG_H */ + +#include <sys/types.h> +#include <sys/socket.h> +#include <sys/stat.h> +#include <sys/mman.h> +#include <sys/sendfile.h> +#include <stdarg.h> +#include <dlfcn.h> +#include <netdb.h> +#include <unistd.h> +#include <fcntl.h> +#include <string.h> +#include <netinet/in.h> +#include <netinet/tcp.h> +#include <unistd.h> +#include <semaphore.h> + +#include <rdma/rdma_cma.h> +#include <rdma/rdma_verbs.h> +#include <rdma/rsocket.h> +#include "cma.h" +#include "indexer.h" + +struct socket_calls { + int (*socket)(int domain, int type, int protocol); + int (*bind)(int socket, const struct sockaddr *addr, socklen_t addrlen); + int (*listen)(int socket, int backlog); + int (*accept)(int socket, struct sockaddr *addr, socklen_t *addrlen); + int (*connect)(int socket, const struct sockaddr *addr, socklen_t addrlen); + ssize_t (*recv)(int socket, void *buf, size_t len, int flags); + ssize_t (*recvfrom)(int socket, void *buf, size_t len, int flags, + struct sockaddr *src_addr, socklen_t *addrlen); + ssize_t (*recvmsg)(int socket, struct msghdr *msg, int flags); + ssize_t (*read)(int socket, void *buf, size_t count); + ssize_t (*readv)(int socket, const struct iovec *iov, int iovcnt); + ssize_t (*send)(int socket, const void *buf, size_t len, int flags); + ssize_t (*sendto)(int socket, const void *buf, size_t len, int flags, + const struct sockaddr *dest_addr, socklen_t addrlen); + ssize_t (*sendmsg)(int socket, const struct msghdr *msg, int flags); + ssize_t (*write)(int socket, const void *buf, size_t count); + ssize_t (*writev)(int socket, const struct iovec *iov, int iovcnt); + int (*poll)(struct pollfd *fds, nfds_t nfds, int timeout); + int (*shutdown)(int socket, int how); + int (*close)(int socket); + int (*getpeername)(int socket, struct sockaddr *addr, socklen_t *addrlen); + int (*getsockname)(int socket, struct sockaddr *addr, socklen_t *addrlen); + int (*setsockopt)(int socket, int level, int optname, + const void *optval, socklen_t optlen); + int (*getsockopt)(int socket, int level, int optname, + void *optval, socklen_t *optlen); + int (*fcntl)(int socket, int cmd, ... 
/* arg */); + int (*dup2)(int oldfd, int newfd); + ssize_t (*sendfile)(int out_fd, int in_fd, off_t *offset, size_t count); + int (*fxstat)(int ver, int fd, struct stat *buf); +}; + +static struct socket_calls real; +static struct socket_calls rs; + +static struct index_map idm; +static pthread_mutex_t mut = PTHREAD_MUTEX_INITIALIZER; + +static int sq_size; +static int rq_size; +static int sq_inline; +static int fork_support; + +enum fd_type { + fd_normal, + fd_rsocket +}; + +enum fd_fork_state { + fd_ready, + fd_fork, + fd_fork_listen, + fd_fork_active, + fd_fork_passive +}; + +struct fd_info { + enum fd_type type; + enum fd_fork_state state; + int fd; + int dupfd; + atomic_t refcnt; +}; + +static int fd_open(void) +{ + struct fd_info *fdi; + int ret, index; + + fdi = calloc(1, sizeof *fdi); + if (!fdi) + return ERR(ENOMEM); + + index = open("/dev/null", O_RDONLY); + if (index < 0) { + ret = index; + goto err1; + } + + fdi->dupfd = -1; + atomic_init(&fdi->refcnt); + atomic_set(&fdi->refcnt, 1); + pthread_mutex_lock(&mut); + ret = idm_set(&idm, index, fdi); + pthread_mutex_unlock(&mut); + if (ret < 0) + goto err2; + + return index; + +err2: + real.close(index); +err1: + free(fdi); + return ret; +} + +static void fd_store(int index, int fd, enum fd_type type, enum fd_fork_state state) +{ + struct fd_info *fdi; + + fdi = idm_at(&idm, index); + fdi->fd = fd; + fdi->type = type; + fdi->state = state; +} + +static inline enum fd_type fd_get(int index, int *fd) +{ + struct fd_info *fdi; + + fdi = idm_lookup(&idm, index); + if (fdi) { + *fd = fdi->fd; + return fdi->type; + + } else { + *fd = index; + return fd_normal; + } +} + +static inline int fd_getd(int index) +{ + struct fd_info *fdi; + + fdi = idm_lookup(&idm, index); + return fdi ? fdi->fd : index; +} + +static inline enum fd_fork_state fd_gets(int index) +{ + struct fd_info *fdi; + + fdi = idm_lookup(&idm, index); + return fdi ? fdi->state : fd_ready; +} + +static inline enum fd_type fd_gett(int index) +{ + struct fd_info *fdi; + + fdi = idm_lookup(&idm, index); + return fdi ? 
fdi->type : fd_normal; +} + +static enum fd_type fd_close(int index, int *fd) +{ + struct fd_info *fdi; + enum fd_type type; + + fdi = idm_lookup(&idm, index); + if (fdi) { + idm_clear(&idm, index); + *fd = fdi->fd; + type = fdi->type; + real.close(index); + free(fdi); + } else { + *fd = index; + type = fd_normal; + } + return type; +} + +void getenv_options(void) +{ + char *var; + + var = getenv("RS_SQ_SIZE"); + if (var) + sq_size = atoi(var); + + var = getenv("RS_RQ_SIZE"); + if (var) + rq_size = atoi(var); + + var = getenv("RS_INLINE"); + if (var) + sq_inline = atoi(var); + + var = getenv("RDMAV_FORK_SAFE"); + if (var) + fork_support = atoi(var); +} + +static void init_preload(void) +{ + static int init; + + /* Quick check without lock */ + if (init) + return; + + pthread_mutex_lock(&mut); + if (init) + goto out; + + real.socket = dlsym(RTLD_NEXT, "socket"); + real.bind = dlsym(RTLD_NEXT, "bind"); + real.listen = dlsym(RTLD_NEXT, "listen"); + real.accept = dlsym(RTLD_NEXT, "accept"); + real.connect = dlsym(RTLD_NEXT, "connect"); + real.recv = dlsym(RTLD_NEXT, "recv"); + real.recvfrom = dlsym(RTLD_NEXT, "recvfrom"); + real.recvmsg = dlsym(RTLD_NEXT, "recvmsg"); + real.read = dlsym(RTLD_NEXT, "read"); + real.readv = dlsym(RTLD_NEXT, "readv"); + real.send = dlsym(RTLD_NEXT, "send"); + real.sendto = dlsym(RTLD_NEXT, "sendto"); + real.sendmsg = dlsym(RTLD_NEXT, "sendmsg"); + real.write = dlsym(RTLD_NEXT, "write"); + real.writev = dlsym(RTLD_NEXT, "writev"); + real.poll = dlsym(RTLD_NEXT, "poll"); + real.shutdown = dlsym(RTLD_NEXT, "shutdown"); + real.close = dlsym(RTLD_NEXT, "close"); + real.getpeername = dlsym(RTLD_NEXT, "getpeername"); + real.getsockname = dlsym(RTLD_NEXT, "getsockname"); + real.setsockopt = dlsym(RTLD_NEXT, "setsockopt"); + real.getsockopt = dlsym(RTLD_NEXT, "getsockopt"); + real.fcntl = dlsym(RTLD_NEXT, "fcntl"); + real.dup2 = dlsym(RTLD_NEXT, "dup2"); + real.sendfile = dlsym(RTLD_NEXT, "sendfile"); + real.fxstat = dlsym(RTLD_NEXT, "__fxstat"); + + rs.socket = dlsym(RTLD_DEFAULT, "rsocket"); + rs.bind = dlsym(RTLD_DEFAULT, "rbind"); + rs.listen = dlsym(RTLD_DEFAULT, "rlisten"); + rs.accept = dlsym(RTLD_DEFAULT, "raccept"); + rs.connect = dlsym(RTLD_DEFAULT, "rconnect"); + rs.recv = dlsym(RTLD_DEFAULT, "rrecv"); + rs.recvfrom = dlsym(RTLD_DEFAULT, "rrecvfrom"); + rs.recvmsg = dlsym(RTLD_DEFAULT, "rrecvmsg"); + rs.read = dlsym(RTLD_DEFAULT, "rread"); + rs.readv = dlsym(RTLD_DEFAULT, "rreadv"); + rs.send = dlsym(RTLD_DEFAULT, "rsend"); + rs.sendto = dlsym(RTLD_DEFAULT, "rsendto"); + rs.sendmsg = dlsym(RTLD_DEFAULT, "rsendmsg"); + rs.write = dlsym(RTLD_DEFAULT, "rwrite"); + rs.writev = dlsym(RTLD_DEFAULT, "rwritev"); + rs.poll = dlsym(RTLD_DEFAULT, "rpoll"); + rs.shutdown = dlsym(RTLD_DEFAULT, "rshutdown"); + rs.close = dlsym(RTLD_DEFAULT, "rclose"); + rs.getpeername = dlsym(RTLD_DEFAULT, "rgetpeername"); + rs.getsockname = dlsym(RTLD_DEFAULT, "rgetsockname"); + rs.setsockopt = dlsym(RTLD_DEFAULT, "rsetsockopt"); + rs.getsockopt = dlsym(RTLD_DEFAULT, "rgetsockopt"); + rs.fcntl = dlsym(RTLD_DEFAULT, "rfcntl"); + + getenv_options(); + init = 1; +out: + pthread_mutex_unlock(&mut); +} + +/* + * We currently only handle copying a few common values. 
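/*
 * Illustrative sketch, not part of the patch: the interception pattern that
 * init_preload() above relies on. The preload library, loaded through
 * LD_PRELOAD, defines the libc symbol itself and forwards to the next
 * definition in link order via dlsym(RTLD_NEXT, ...). This stand-alone
 * example wraps close() only; the real code routes rsockets to rclose().
 */
#define _GNU_SOURCE
#include <dlfcn.h>
#include <unistd.h>

static int (*forward_close)(int fd);

int close(int fd)
{
	if (!forward_close)
		forward_close = (int (*)(int)) dlsym(RTLD_NEXT, "close");

	/* a real wrapper would check whether fd maps to an rsocket here */
	return forward_close(fd);
}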
+ */ +static int copysockopts(int dfd, int sfd, struct socket_calls *dapi, + struct socket_calls *sapi) +{ + socklen_t len; + int param, ret; + + ret = sapi->fcntl(sfd, F_GETFL); + if (ret > 0) + ret = dapi->fcntl(dfd, F_SETFL, ret); + if (ret) + return ret; + + len = sizeof param; + ret = sapi->getsockopt(sfd, SOL_SOCKET, SO_REUSEADDR, &param, &len); + if (param && !ret) + ret = dapi->setsockopt(dfd, SOL_SOCKET, SO_REUSEADDR, &param, len); + if (ret) + return ret; + + len = sizeof param; + ret = sapi->getsockopt(sfd, IPPROTO_TCP, TCP_NODELAY, &param, &len); + if (param && !ret) + ret = dapi->setsockopt(dfd, IPPROTO_TCP, TCP_NODELAY, &param, len); + if (ret) + return ret; + + return 0; +} + +/* + * Convert between an rsocket and a normal socket. + */ +static int transpose_socket(int socket, enum fd_type new_type) +{ + socklen_t len = 0; + int sfd, dfd, param, ret; + struct socket_calls *sapi, *dapi; + + sfd = fd_getd(socket); + if (new_type == fd_rsocket) { + dapi = &rs; + sapi = &real; + } else { + dapi = &real; + sapi = &rs; + } + + ret = sapi->getsockname(sfd, NULL, &len); + if (ret) + return ret; + + param = (len == sizeof(struct sockaddr_in6)) ? PF_INET6 : PF_INET; + dfd = dapi->socket(param, SOCK_STREAM, 0); + if (dfd < 0) + return dfd; + + ret = copysockopts(dfd, sfd, dapi, sapi); + if (ret) + goto err; + + fd_store(socket, dfd, new_type, fd_ready); + return dfd; + +err: + dapi->close(dfd); + return ret; +} + +/* + * Use defaults on failure. + */ +void set_rsocket_options(int rsocket) +{ + if (sq_size) + rsetsockopt(rsocket, SOL_RDMA, RDMA_SQSIZE, &sq_size, sizeof sq_size); + + if (rq_size) + rsetsockopt(rsocket, SOL_RDMA, RDMA_RQSIZE, &rq_size, sizeof rq_size); + + if (sq_inline) + rsetsockopt(rsocket, SOL_RDMA, RDMA_INLINE, &sq_inline, sizeof sq_inline); +} + +int socket(int domain, int type, int protocol) +{ + static __thread int recursive; + int index, ret; + + if (recursive) + goto real; + + init_preload(); + index = fd_open(); + if (index < 0) + return index; + + if (fork_support && (domain == PF_INET || domain == PF_INET6) && + (type == SOCK_STREAM) && (!protocol || protocol == IPPROTO_TCP)) { + ret = real.socket(domain, type, protocol); + if (ret < 0) + return ret; + fd_store(index, ret, fd_normal, fd_fork); + return index; + } + + recursive = 1; + ret = rsocket(domain, type, protocol); + recursive = 0; + if (ret >= 0) { + fd_store(index, ret, fd_rsocket, fd_ready); + set_rsocket_options(ret); + return index; + } + fd_close(index, &ret); +real: + return real.socket(domain, type, protocol); +} + +int bind(int socket, const struct sockaddr *addr, socklen_t addrlen) +{ + int fd; + return (fd_get(socket, &fd) == fd_rsocket) ?
+ rbind(fd, addr, addrlen) : real.bind(fd, addr, addrlen); +} + +int listen(int socket, int backlog) +{ + int fd, ret; + if (fd_get(socket, &fd) == fd_rsocket) { + ret = rlisten(fd, backlog); + } else { + ret = real.listen(fd, backlog); + if (!ret && fd_gets(socket) == fd_fork) + fd_store(socket, fd, fd_normal, fd_fork_listen); + } + return ret; +} + +int accept(int socket, struct sockaddr *addr, socklen_t *addrlen) +{ + int fd, index, ret; + + if (fd_get(socket, &fd) == fd_rsocket) { + index = fd_open(); + if (index < 0) + return index; + + ret = raccept(fd, addr, addrlen); + if (ret < 0) { + fd_close(index, &fd); + return ret; + } + + fd_store(index, ret, fd_rsocket, fd_ready); + return index; + } else if (fd_gets(socket) == fd_fork_listen) { + index = fd_open(); + if (index < 0) + return index; + + ret = real.accept(fd, addr, addrlen); + if (ret < 0) { + fd_close(index, &fd); + return ret; + } + + fd_store(index, ret, fd_normal, fd_fork_passive); + return index; + } else { + return real.accept(fd, addr, addrlen); + } +} + +/* + * We can't fork RDMA connections and pass them from the parent to the child + * process. Instead, we need to establish the RDMA connection after calling + * fork. To do this, we delay establishing the RDMA connection until we try + * to send/receive on the server side. + */ +static void fork_active(int socket) +{ + struct sockaddr_storage addr; + int sfd, dfd, ret; + socklen_t len; + uint32_t msg; + long flags; + + sfd = fd_getd(socket); + + flags = real.fcntl(sfd, F_GETFL); + real.fcntl(sfd, F_SETFL, 0); + ret = real.recv(sfd, &msg, sizeof msg, MSG_PEEK); + real.fcntl(sfd, F_SETFL, flags); + if ((ret != sizeof msg) || msg) + goto err1; + + len = sizeof addr; + ret = real.getpeername(sfd, (struct sockaddr *) &addr, &len); + if (ret) + goto err1; + + dfd = rsocket(addr.ss_family, SOCK_STREAM, 0); + if (dfd < 0) + goto err1; + + ret = rconnect(dfd, (struct sockaddr *) &addr, len); + if (ret) + goto err2; + + set_rsocket_options(dfd); + copysockopts(dfd, sfd, &rs, &real); + real.shutdown(sfd, SHUT_RDWR); + real.close(sfd); + fd_store(socket, dfd, fd_rsocket, fd_ready); + return; + +err2: + rclose(dfd); +err1: + fd_store(socket, sfd, fd_normal, fd_ready); +} + +/* + * The server will start listening for the new connection, then send a + * message to the active side when the listen is ready. This does leave + * fork unsupported in the following case: the server is nonblocking and + * calls select/poll waiting to receive data from the client. 
+ */ +static void fork_passive(int socket) +{ + struct sockaddr_in6 sin6; + sem_t *sem; + int lfd, sfd, dfd, ret, param; + socklen_t len; + uint32_t msg; + + sfd = fd_getd(socket); + + len = sizeof sin6; + ret = real.getsockname(sfd, (struct sockaddr *) &sin6, &len); + if (ret) + goto out; + sin6.sin6_flowinfo = sin6.sin6_scope_id = 0; + memset(&sin6.sin6_addr, 0, sizeof sin6.sin6_addr); + + sem = sem_open("/rsocket_fork", O_CREAT | O_RDWR, + S_IRWXU | S_IRWXG, 1); + if (sem == SEM_FAILED) { + ret = -1; + goto out; + } + + lfd = rsocket(sin6.sin6_family, SOCK_STREAM, 0); + if (lfd < 0) { + ret = lfd; + goto sclose; + } + + param = 1; + rsetsockopt(lfd, SOL_SOCKET, SO_REUSEADDR, &param, sizeof param); + + sem_wait(sem); + ret = rbind(lfd, (struct sockaddr *) &sin6, sizeof sin6); + if (ret) + goto lclose; + + ret = rlisten(lfd, 1); + if (ret) + goto lclose; + + msg = 0; + len = real.write(sfd, &msg, sizeof msg); + if (len != sizeof msg) + goto lclose; + + dfd = raccept(lfd, NULL, NULL); + if (dfd < 0) { + ret = dfd; + goto lclose; + } + + set_rsocket_options(dfd); + copysockopts(dfd, sfd, &rs, &real); + real.shutdown(sfd, SHUT_RDWR); + real.close(sfd); + fd_store(socket, dfd, fd_rsocket, fd_ready); + +lclose: + rclose(lfd); + sem_post(sem); +sclose: + sem_close(sem); +out: + if (ret) + fd_store(socket, sfd, fd_normal, fd_ready); +} + +static inline enum fd_type fd_fork_get(int index, int *fd) +{ + struct fd_info *fdi; + + fdi = idm_lookup(&idm, index); + if (fdi) { + if (fdi->state == fd_fork_passive) + fork_passive(index); + else if (fdi->state == fd_fork_active) + fork_active(index); + *fd = fdi->fd; + return fdi->type; + + } else { + *fd = index; + return fd_normal; + } +} + +int connect(int socket, const struct sockaddr *addr, socklen_t addrlen) +{ + int fd, ret; + + if (fd_get(socket, &fd) == fd_rsocket) { + ret = rconnect(fd, addr, addrlen); + if (!ret || errno == EINPROGRESS) + return ret; + + ret = transpose_socket(socket, fd_normal); + if (ret < 0) + return ret; + + rclose(fd); + fd = ret; + } else if (fd_gets(socket) == fd_fork) { + fd_store(socket, fd, fd_normal, fd_fork_active); + } + + return real.connect(fd, addr, addrlen); +} + +ssize_t recv(int socket, void *buf, size_t len, int flags) +{ + int fd; + return (fd_fork_get(socket, &fd) == fd_rsocket) ? + rrecv(fd, buf, len, flags) : real.recv(fd, buf, len, flags); +} + +ssize_t recvfrom(int socket, void *buf, size_t len, int flags, + struct sockaddr *src_addr, socklen_t *addrlen) +{ + int fd; + return (fd_fork_get(socket, &fd) == fd_rsocket) ? + rrecvfrom(fd, buf, len, flags, src_addr, addrlen) : + real.recvfrom(fd, buf, len, flags, src_addr, addrlen); +} + +ssize_t recvmsg(int socket, struct msghdr *msg, int flags) +{ + int fd; + return (fd_fork_get(socket, &fd) == fd_rsocket) ? + rrecvmsg(fd, msg, flags) : real.recvmsg(fd, msg, flags); +} + +ssize_t read(int socket, void *buf, size_t count) +{ + int fd; + init_preload(); + return (fd_fork_get(socket, &fd) == fd_rsocket) ? + rread(fd, buf, count) : real.read(fd, buf, count); +} + +ssize_t readv(int socket, const struct iovec *iov, int iovcnt) +{ + int fd; + init_preload(); + return (fd_fork_get(socket, &fd) == fd_rsocket) ? + rreadv(fd, iov, iovcnt) : real.readv(fd, iov, iovcnt); +} + +ssize_t send(int socket, const void *buf, size_t len, int flags) +{ + int fd; + return (fd_fork_get(socket, &fd) == fd_rsocket) ?
+ rsend(fd, buf, len, flags) : real.send(fd, buf, len, flags); +} + +ssize_t sendto(int socket, const void *buf, size_t len, int flags, + const struct sockaddr *dest_addr, socklen_t addrlen) +{ + int fd; + return (fd_fork_get(socket, &fd) == fd_rsocket) ? + rsendto(fd, buf, len, flags, dest_addr, addrlen) : + real.sendto(fd, buf, len, flags, dest_addr, addrlen); +} + +ssize_t sendmsg(int socket, const struct msghdr *msg, int flags) +{ + int fd; + return (fd_fork_get(socket, &fd) == fd_rsocket) ? + rsendmsg(fd, msg, flags) : real.sendmsg(fd, msg, flags); +} + +ssize_t write(int socket, const void *buf, size_t count) +{ + int fd; + init_preload(); + return (fd_fork_get(socket, &fd) == fd_rsocket) ? + rwrite(fd, buf, count) : real.write(fd, buf, count); +} + +ssize_t writev(int socket, const struct iovec *iov, int iovcnt) +{ + int fd; + init_preload(); + return (fd_fork_get(socket, &fd) == fd_rsocket) ? + rwritev(fd, iov, iovcnt) : real.writev(fd, iov, iovcnt); +} + +static struct pollfd *fds_alloc(nfds_t nfds) +{ + static __thread struct pollfd *rfds; + static __thread nfds_t rnfds; + + if (nfds > rnfds) { + if (rfds) + free(rfds); + + rfds = malloc(sizeof *rfds * nfds); + rnfds = rfds ? nfds : 0; + } + + return rfds; +} + +int poll(struct pollfd *fds, nfds_t nfds, int timeout) +{ + struct pollfd *rfds; + int i, ret; + + init_preload(); + for (i = 0; i < nfds; i++) { + if (fd_gett(fds[i].fd) == fd_rsocket) + goto use_rpoll; + } + + return real.poll(fds, nfds, timeout); + +use_rpoll: + rfds = fds_alloc(nfds); + if (!rfds) + return ERR(ENOMEM); + + for (i = 0; i < nfds; i++) { + rfds[i].fd = fd_getd(fds[i].fd); + rfds[i].events = fds[i].events; + rfds[i].revents = 0; + } + + ret = rpoll(rfds, nfds, timeout); + + for (i = 0; i < nfds; i++) + fds[i].revents = rfds[i].revents; + + return ret; +} + +static void select_to_rpoll(struct pollfd *fds, int *nfds, + fd_set *readfds, fd_set *writefds, fd_set *exceptfds) +{ + int fd, events, i = 0; + + for (fd = 0; fd < *nfds; fd++) { + events = (readfds && FD_ISSET(fd, readfds)) ? POLLIN : 0; + if (writefds && FD_ISSET(fd, writefds)) + events |= POLLOUT; + + if (events || (exceptfds && FD_ISSET(fd, exceptfds))) { + fds[i].fd = fd_getd(fd); + fds[i++].events = events; + } + } + + *nfds = i; +} + +static int rpoll_to_select(struct pollfd *fds, int nfds, + fd_set *readfds, fd_set *writefds, fd_set *exceptfds) +{ + int fd, rfd, i, cnt = 0; + + for (i = 0, fd = 0; i < nfds; fd++) { + rfd = fd_getd(fd); + if (rfd != fds[i].fd) + continue; + + if (readfds && (fds[i].revents & POLLIN)) { + FD_SET(fd, readfds); + cnt++; + } + + if (writefds && (fds[i].revents & POLLOUT)) { + FD_SET(fd, writefds); + cnt++; + } + + if (exceptfds && (fds[i].revents & ~(POLLIN | POLLOUT))) { + FD_SET(fd, exceptfds); + cnt++; + } + i++; + } + + return cnt; +} + +static int rs_convert_timeout(struct timeval *timeout) +{ + return !timeout ? 
-1 : timeout->tv_sec * 1000 + timeout->tv_usec / 1000; +} + +int select(int nfds, fd_set *readfds, fd_set *writefds, + fd_set *exceptfds, struct timeval *timeout) +{ + struct pollfd *fds; + int ret; + + fds = fds_alloc(nfds); + if (!fds) + return ERR(ENOMEM); + + select_to_rpoll(fds, &nfds, readfds, writefds, exceptfds); + ret = rpoll(fds, nfds, rs_convert_timeout(timeout)); + + if (readfds) + FD_ZERO(readfds); + if (writefds) + FD_ZERO(writefds); + if (exceptfds) + FD_ZERO(exceptfds); + + if (ret > 0) + ret = rpoll_to_select(fds, nfds, readfds, writefds, exceptfds); + + return ret; +} + +int shutdown(int socket, int how) +{ + int fd; + return (fd_get(socket, &fd) == fd_rsocket) ? + rshutdown(fd, how) : real.shutdown(fd, how); +} + +int close(int socket) +{ + struct fd_info *fdi; + int ret; + + init_preload(); + fdi = idm_lookup(&idm, socket); + if (!fdi) + return real.close(socket); + + if (fdi->dupfd != -1) { + ret = close(fdi->dupfd); + if (ret) + return ret; + } + + if (atomic_dec(&fdi->refcnt)) + return 0; + + idm_clear(&idm, socket); + real.close(socket); + ret = (fdi->type == fd_rsocket) ? rclose(fdi->fd) : real.close(fdi->fd); + free(fdi); + return ret; +} + +int getpeername(int socket, struct sockaddr *addr, socklen_t *addrlen) +{ + int fd; + return (fd_get(socket, &fd) == fd_rsocket) ? + rgetpeername(fd, addr, addrlen) : + real.getpeername(fd, addr, addrlen); +} + +int getsockname(int socket, struct sockaddr *addr, socklen_t *addrlen) +{ + int fd; + init_preload(); + return (fd_get(socket, &fd) == fd_rsocket) ? + rgetsockname(fd, addr, addrlen) : + real.getsockname(fd, addr, addrlen); +} + +int setsockopt(int socket, int level, int optname, + const void *optval, socklen_t optlen) +{ + int fd; + return (fd_get(socket, &fd) == fd_rsocket) ? + rsetsockopt(fd, level, optname, optval, optlen) : + real.setsockopt(fd, level, optname, optval, optlen); +} + +int getsockopt(int socket, int level, int optname, + void *optval, socklen_t *optlen) +{ + int fd; + return (fd_get(socket, &fd) == fd_rsocket) ? + rgetsockopt(fd, level, optname, optval, optlen) : + real.getsockopt(fd, level, optname, optval, optlen); +} + +int fcntl(int socket, int cmd, ... /* arg */) +{ + va_list args; + long lparam; + void *pparam; + int fd, ret; + + init_preload(); + va_start(args, cmd); + switch (cmd) { + case F_GETFD: + case F_GETFL: + case F_GETOWN: + case F_GETSIG: + case F_GETLEASE: + ret = (fd_get(socket, &fd) == fd_rsocket) ? + rfcntl(fd, cmd) : real.fcntl(fd, cmd); + break; + case F_DUPFD: + /*case F_DUPFD_CLOEXEC:*/ + case F_SETFD: + case F_SETFL: + case F_SETOWN: + case F_SETSIG: + case F_SETLEASE: + case F_NOTIFY: + lparam = va_arg(args, long); + ret = (fd_get(socket, &fd) == fd_rsocket) ? + rfcntl(fd, cmd, lparam) : real.fcntl(fd, cmd, lparam); + break; + default: + pparam = va_arg(args, void *); + ret = (fd_get(socket, &fd) == fd_rsocket) ? 
+ rfcntl(fd, cmd, pparam) : real.fcntl(fd, cmd, pparam); + break; + } + va_end(args); + return ret; +} + +/* + * dup2 is not thread safe + */ +int dup2(int oldfd, int newfd) +{ + struct fd_info *oldfdi, *newfdi; + int ret; + + init_preload(); + oldfdi = idm_lookup(&idm, oldfd); + if (oldfdi) { + if (oldfdi->state == fd_fork_passive) + fork_passive(oldfd); + else if (oldfdi->state == fd_fork_active) + fork_active(oldfd); + } + + newfdi = idm_lookup(&idm, newfd); + if (newfdi) { + /* newfd cannot have been dup'ed directly */ + if (atomic_get(&newfdi->refcnt) > 1) + return ERR(EBUSY); + close(newfd); + } + + ret = real.dup2(oldfd, newfd); + if (!oldfdi || ret != newfd) + return ret; + + newfdi = calloc(1, sizeof *newfdi); + if (!newfdi) { + close(newfd); + return ERR(ENOMEM); + } + + pthread_mutex_lock(&mut); + idm_set(&idm, newfd, newfdi); + pthread_mutex_unlock(&mut); + + newfdi->fd = oldfdi->fd; + newfdi->type = oldfdi->type; + if (oldfdi->dupfd != -1) { + newfdi->dupfd = oldfdi->dupfd; + oldfdi = idm_lookup(&idm, oldfdi->dupfd); + } else { + newfdi->dupfd = oldfd; + } + atomic_init(&newfdi->refcnt); + atomic_set(&newfdi->refcnt, 1); + atomic_inc(&oldfdi->refcnt); + return newfd; +} + +ssize_t sendfile(int out_fd, int in_fd, off_t *offset, size_t count) +{ + void *file_addr; + int fd; + size_t ret; + + if (fd_get(out_fd, &fd) != fd_rsocket) + return real.sendfile(fd, in_fd, offset, count); + + file_addr = mmap(NULL, count, PROT_READ, 0, in_fd, offset ? *offset : 0); + if (file_addr == (void *) -1) + return -1; + + ret = rwrite(fd, file_addr, count); + if ((ret > 0) && offset) + lseek(in_fd, ret, SEEK_CUR); + munmap(file_addr, count); + return ret; +} + +int __fxstat(int ver, int socket, struct stat *buf) +{ + int fd, ret; + + init_preload(); + if (fd_get(socket, &fd) == fd_rsocket) { + ret = real.fxstat(ver, socket, buf); + if (!ret) + buf->st_mode = (buf->st_mode & ~S_IFMT) | __S_IFSOCK; + } else { + ret = real.fxstat(ver, fd, buf); + } + return ret; +} diff --git a/prov/rdmacm/src/rsocket.c b/prov/rdmacm/src/rsocket.c new file mode 100644 index 00000000000..e5595687db0 --- /dev/null +++ b/prov/rdmacm/src/rsocket.c @@ -0,0 +1,3970 @@ +/* + * Copyright (c) 2008-2013 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#if HAVE_CONFIG_H +# include <config.h> +#endif /* HAVE_CONFIG_H */ + +#include <sys/types.h> +#include <sys/socket.h> +#include <sys/time.h> +#include <stdarg.h> +#include <netdb.h> +#include <unistd.h> +#include <fcntl.h> +#include <stdio.h> +#include <stddef.h> +#include <string.h> +#include <netinet/in.h> +#include <netinet/tcp.h> +#include <sys/epoll.h> +#include <search.h> + +#include <rdma/rdma_cma.h> +#include <rdma/rdma_verbs.h> +#include <rdma/rsocket.h> +#include "cma.h" +#include "indexer.h" + +#define RS_OLAP_START_SIZE 2048 +#define RS_MAX_TRANSFER 65536 +#define RS_SNDLOWAT 2048 +#define RS_QP_MAX_SIZE 0xFFFE +#define RS_QP_CTRL_SIZE 4 +#define RS_CONN_RETRIES 6 +#define RS_SGL_SIZE 2 +static struct index_map idm; +static pthread_mutex_t mut = PTHREAD_MUTEX_INITIALIZER; + +struct rsocket; + +enum { + RS_SVC_DGRAM = 1 << 0 +}; + +struct rs_svc_msg { + uint32_t svcs; + uint32_t status; + struct rsocket *rs; +}; + +static pthread_t svc_id; +static int svc_sock[2]; +static int svc_cnt; +static int svc_size; +static struct rsocket **svc_rss; +static struct pollfd *svc_fds; +static uint8_t svc_buf[RS_SNDLOWAT]; +static void *rs_svc_run(void *arg); + +static uint16_t def_iomap_size = 0; +static uint16_t def_inline = 64; +static uint16_t def_sqsize = 384; +static uint16_t def_rqsize = 384; +static uint32_t def_mem = (1 << 17); +static uint32_t def_wmem = (1 << 17); +static uint32_t polling_time = 10; + +/* + * Immediate data format is determined by the upper bits + * bit 31: message type, 0 - data, 1 - control + * bit 30: buffers updated, 0 - target, 1 - direct-receive + * bit 29: more data, 0 - end of transfer, 1 - more data available + * + * for data transfers: + * bits [28:0]: bytes transferred + * for control messages: + * SGL, CTRL + * bits [28-0]: receive credits granted + * IOMAP_SGL + * bits [28-16]: reserved, bits [15-0]: index + */ + +enum { + RS_OP_DATA, + RS_OP_RSVD_DATA_MORE, + RS_OP_WRITE, /* opcode is not transmitted over the network */ + RS_OP_RSVD_DRA_MORE, + RS_OP_SGL, + RS_OP_RSVD, + RS_OP_IOMAP_SGL, + RS_OP_CTRL +}; +#define rs_msg_set(op, data) ((op << 29) | (uint32_t) (data)) +#define rs_msg_op(imm_data) (imm_data >> 29) +#define rs_msg_data(imm_data) (imm_data & 0x1FFFFFFF) +#define RS_MSG_SIZE sizeof(uint32_t) + +#define RS_WR_ID_FLAG_RECV (((uint64_t) 1) << 63) +#define rs_send_wr_id(data) ((uint64_t) data) +#define rs_recv_wr_id(data) (RS_WR_ID_FLAG_RECV | (uint64_t) data) +#define rs_wr_is_recv(wr_id) (wr_id & RS_WR_ID_FLAG_RECV) +#define rs_wr_data(wr_id) ((uint32_t) wr_id) + +enum { + RS_CTRL_DISCONNECT, + RS_CTRL_SHUTDOWN +}; + +struct rs_msg { + uint32_t op; + uint32_t data; +}; + +struct ds_qp; + +struct ds_rmsg { + struct ds_qp *qp; + uint32_t offset; + uint32_t length; +}; + +struct ds_smsg { + struct ds_smsg *next; +}; + +struct rs_sge { + uint64_t addr; + uint32_t key; + uint32_t length; +}; + +struct rs_iomap { + uint64_t offset; + struct rs_sge sge; +}; + +struct rs_iomap_mr { + uint64_t offset; + struct ibv_mr *mr; + dlist_entry entry; + atomic_t refcnt; + int index; /* -1 if mapping is local and not in iomap_list */ +}; + +#define RS_MIN_INLINE (sizeof(struct rs_sge)) +#define rs_host_is_net() (1 == htonl(1)) +#define RS_CONN_FLAG_NET (1 << 0) +#define 
RS_CONN_FLAG_IOMAP (1 << 1) + +struct rs_conn_data { + uint8_t version; + uint8_t flags; + uint16_t credits; + uint8_t reserved[3]; + uint8_t target_iomap_size; + struct rs_sge target_sgl; + struct rs_sge data_buf; +}; + +struct rs_conn_private_data { + union { + struct rs_conn_data conn_data; + struct { + struct ib_connect_hdr ib_hdr; + struct rs_conn_data conn_data; + } af_ib; + }; +}; + +/* + * rsocket states are ordered as passive, connecting, connected, disconnected. + */ +enum rs_state { + rs_init, + rs_bound = 0x0001, + rs_listening = 0x0002, + rs_opening = 0x0004, + rs_resolving_addr = rs_opening | 0x0010, + rs_resolving_route = rs_opening | 0x0020, + rs_connecting = rs_opening | 0x0040, + rs_accepting = rs_opening | 0x0080, + rs_connected = 0x0100, + rs_writable = 0x0200, + rs_readable = 0x0400, + rs_connect_rdwr = rs_connected | rs_readable | rs_writable, + rs_connect_error = 0x0800, + rs_disconnected = 0x1000, + rs_error = 0x2000, +}; + +#define RS_OPT_SWAP_SGL (1 << 0) +/* + * iWarp does not support RDMA write with immediate data. For iWarp, we + * transfer rsocket messages as inline sends. + */ +#define RS_OPT_MSG_SEND (1 << 1) + +union socket_addr { + struct sockaddr sa; + struct sockaddr_in sin; + struct sockaddr_in6 sin6; +}; + +struct ds_header { + uint8_t version; + uint8_t length; + uint16_t port; + union { + uint32_t ipv4; + struct { + uint32_t flowinfo; + uint8_t addr[16]; + } ipv6; + } addr; +}; + +#define DS_IPV4_HDR_LEN 8 +#define DS_IPV6_HDR_LEN 24 + +struct ds_dest { + union socket_addr addr; /* must be first */ + struct ds_qp *qp; + struct ibv_ah *ah; + uint32_t qpn; +}; + +struct ds_qp { + dlist_entry list; + struct rsocket *rs; + struct rdma_cm_id *cm_id; + struct ds_header hdr; + struct ds_dest dest; + + struct ibv_mr *smr; + struct ibv_mr *rmr; + uint8_t *rbuf; + + int cq_armed; +}; + +struct rsocket { + int type; + int index; + fastlock_t slock; + fastlock_t rlock; + fastlock_t cq_lock; + fastlock_t cq_wait_lock; + fastlock_t map_lock; /* acquire slock first if needed */ + + union { + /* data stream */ + struct { + struct rdma_cm_id *cm_id; + uint64_t tcp_opts; + + int ctrl_avail; + uint16_t sseq_no; + uint16_t sseq_comp; + uint16_t rseq_no; + uint16_t rseq_comp; + + int remote_sge; + struct rs_sge remote_sgl; + struct rs_sge remote_iomap; + + struct ibv_mr *target_mr; + int target_sge; + int target_iomap_size; + void *target_buffer_list; + volatile struct rs_sge *target_sgl; + struct rs_iomap *target_iomap; + + int rbuf_msg_index; + int rbuf_bytes_avail; + int rbuf_free_offset; + int rbuf_offset; + struct ibv_mr *rmr; + uint8_t *rbuf; + + int sbuf_bytes_avail; + struct ibv_mr *smr; + struct ibv_sge ssgl[2]; + }; + /* datagram */ + struct { + struct ds_qp *qp_list; + void *dest_map; + struct ds_dest *conn_dest; + + int udp_sock; + int epfd; + int rqe_avail; + struct ds_smsg *smsg_free; + }; + }; + + int svcs; + int opts; + long fd_flags; + uint64_t so_opts; + uint64_t ipv6_opts; + void *optval; + size_t optlen; + int state; + int cq_armed; + int retries; + int err; + + int sqe_avail; + uint32_t sbuf_size; + uint16_t sq_size; + uint16_t sq_inline; + + uint32_t rbuf_size; + uint16_t rq_size; + int rmsg_head; + int rmsg_tail; + union { + struct rs_msg *rmsg; + struct ds_rmsg *dmsg; + }; + + uint8_t *sbuf; + struct rs_iomap_mr *remote_iomappings; + dlist_entry iomap_list; + dlist_entry iomap_queue; + int iomap_pending; +}; + +#define DS_UDP_TAG 0x55555555 + +struct ds_udp_header { + uint32_t tag; + uint8_t version; + uint8_t op; + uint8_t length; + uint8_t 
reserved; + uint32_t qpn; /* lower 8-bits reserved */ + union { + uint32_t ipv4; + uint8_t ipv6[16]; + } addr; +}; + +#define DS_UDP_IPV4_HDR_LEN 16 +#define DS_UDP_IPV6_HDR_LEN 28 + +#define ds_next_qp(qp) container_of((qp)->list.next, struct ds_qp, list) + +static void ds_insert_qp(struct rsocket *rs, struct ds_qp *qp) +{ + if (!rs->qp_list) + dlist_init(&qp->list); + else + dlist_insert_head(&qp->list, &rs->qp_list->list); + rs->qp_list = qp; +} + +static void ds_remove_qp(struct rsocket *rs, struct ds_qp *qp) +{ + if (qp->list.next != &qp->list) { + rs->qp_list = ds_next_qp(qp); + dlist_remove(&qp->list); + } else { + rs->qp_list = NULL; + } +} + +static int rs_modify_svcs(struct rsocket *rs, int svcs) +{ + struct rs_svc_msg msg; + int ret; + + pthread_mutex_lock(&mut); + if (!svc_cnt) { + ret = socketpair(AF_UNIX, SOCK_STREAM, 0, svc_sock); + if (ret) + goto unlock; + + ret = pthread_create(&svc_id, NULL, rs_svc_run, NULL); + if (ret) { + ret = ERR(ret); + goto closepair; + } + } + + msg.svcs = svcs; + msg.status = EINVAL; + msg.rs = rs; + write(svc_sock[0], &msg, sizeof msg); + read(svc_sock[0], &msg, sizeof msg); + ret = rdma_seterrno(msg.status); + if (svc_cnt) + goto unlock; + + pthread_join(svc_id, NULL); +closepair: + close(svc_sock[0]); + close(svc_sock[1]); +unlock: + pthread_mutex_unlock(&mut); + return ret; +} + +static int ds_compare_addr(const void *dst1, const void *dst2) +{ + const struct sockaddr *sa1, *sa2; + size_t len; + + sa1 = (const struct sockaddr *) dst1; + sa2 = (const struct sockaddr *) dst2; + + len = (sa1->sa_family == AF_INET6 && sa2->sa_family == AF_INET6) ? + sizeof(struct sockaddr_in6) : sizeof(struct sockaddr_in); + return memcmp(dst1, dst2, len); +} + +static int rs_value_to_scale(int value, int bits) +{ + return value <= (1 << (bits - 1)) ? + value : (1 << (bits - 1)) | (value >> bits); +} + +static int rs_scale_to_value(int value, int bits) +{ + return value <= (1 << (bits - 1)) ? 
+ value : (value & ~(1 << (bits - 1))) << bits; +} + +void rs_configure(void) +{ + FILE *f; + static int init; + + if (init) + return; + + pthread_mutex_lock(&mut); + if (init) + goto out; + + if (ucma_init()) + goto out; + ucma_ib_init(); + + if ((f = fopen(RS_CONF_DIR "/polling_time", "r"))) { + (void) fscanf(f, "%u", &polling_time); + fclose(f); + } + + if ((f = fopen(RS_CONF_DIR "/inline_default", "r"))) { + (void) fscanf(f, "%hu", &def_inline); + fclose(f); + + if (def_inline < RS_MIN_INLINE) + def_inline = RS_MIN_INLINE; + } + + if ((f = fopen(RS_CONF_DIR "/sqsize_default", "r"))) { + (void) fscanf(f, "%hu", &def_sqsize); + fclose(f); + } + + if ((f = fopen(RS_CONF_DIR "/rqsize_default", "r"))) { + (void) fscanf(f, "%hu", &def_rqsize); + fclose(f); + } + + if ((f = fopen(RS_CONF_DIR "/mem_default", "r"))) { + (void) fscanf(f, "%u", &def_mem); + fclose(f); + + if (def_mem < 1) + def_mem = 1; + } + + if ((f = fopen(RS_CONF_DIR "/wmem_default", "r"))) { + (void) fscanf(f, "%u", &def_wmem); + fclose(f); + if (def_wmem < RS_SNDLOWAT) + def_wmem = RS_SNDLOWAT << 1; + } + + if ((f = fopen(RS_CONF_DIR "/iomap_size", "r"))) { + (void) fscanf(f, "%hu", &def_iomap_size); + fclose(f); + + /* round to supported values */ + def_iomap_size = (uint8_t) rs_value_to_scale( + (uint16_t) rs_scale_to_value(def_iomap_size, 8), 8); + } + init = 1; +out: + pthread_mutex_unlock(&mut); +} + +static int rs_insert(struct rsocket *rs, int index) +{ + pthread_mutex_lock(&mut); + rs->index = idm_set(&idm, index, rs); + pthread_mutex_unlock(&mut); + return rs->index; +} + +static void rs_remove(struct rsocket *rs) +{ + pthread_mutex_lock(&mut); + idm_clear(&idm, rs->index); + pthread_mutex_unlock(&mut); +} + +static struct rsocket *rs_alloc(struct rsocket *inherited_rs, int type) +{ + struct rsocket *rs; + + rs = calloc(1, sizeof *rs); + if (!rs) + return NULL; + + rs->type = type; + rs->index = -1; + if (type == SOCK_DGRAM) { + rs->udp_sock = -1; + rs->epfd = -1; + } + + if (inherited_rs) { + rs->sbuf_size = inherited_rs->sbuf_size; + rs->rbuf_size = inherited_rs->rbuf_size; + rs->sq_inline = inherited_rs->sq_inline; + rs->sq_size = inherited_rs->sq_size; + rs->rq_size = inherited_rs->rq_size; + if (type == SOCK_STREAM) { + rs->ctrl_avail = inherited_rs->ctrl_avail; + rs->target_iomap_size = inherited_rs->target_iomap_size; + } + } else { + rs->sbuf_size = def_wmem; + rs->rbuf_size = def_mem; + rs->sq_inline = def_inline; + rs->sq_size = def_sqsize; + rs->rq_size = def_rqsize; + if (type == SOCK_STREAM) { + rs->ctrl_avail = RS_QP_CTRL_SIZE; + rs->target_iomap_size = def_iomap_size; + } + } + fastlock_init(&rs->slock); + fastlock_init(&rs->rlock); + fastlock_init(&rs->cq_lock); + fastlock_init(&rs->cq_wait_lock); + fastlock_init(&rs->map_lock); + dlist_init(&rs->iomap_list); + dlist_init(&rs->iomap_queue); + return rs; +} + +static int rs_set_nonblocking(struct rsocket *rs, long arg) +{ + struct ds_qp *qp; + int ret = 0; + + if (rs->type == SOCK_STREAM) { + if (rs->cm_id->recv_cq_channel) + ret = fcntl(rs->cm_id->recv_cq_channel->fd, F_SETFL, arg); + + if (!ret && rs->state < rs_connected) + ret = fcntl(rs->cm_id->channel->fd, F_SETFL, arg); + } else { + ret = fcntl(rs->epfd, F_SETFL, arg); + if (!ret && rs->qp_list) { + qp = rs->qp_list; + do { + ret = fcntl(qp->cm_id->recv_cq_channel->fd, + F_SETFL, arg); + qp = ds_next_qp(qp); + } while (qp != rs->qp_list && !ret); + } + } + + return ret; +} + +static void rs_set_qp_size(struct rsocket *rs) +{ + uint16_t max_size; + + max_size = min(ucma_max_qpsize(rs->cm_id), 
RS_QP_MAX_SIZE); + + if (rs->sq_size > max_size) + rs->sq_size = max_size; + else if (rs->sq_size < 4) + rs->sq_size = 4; + if (rs->sq_size <= (RS_QP_CTRL_SIZE << 2)) + rs->ctrl_avail = 2; + + if (rs->rq_size > max_size) + rs->rq_size = max_size; + else if (rs->rq_size < 4) + rs->rq_size = 4; +} + +static void ds_set_qp_size(struct rsocket *rs) +{ + uint16_t max_size; + + max_size = min(ucma_max_qpsize(NULL), RS_QP_MAX_SIZE); + + if (rs->sq_size > max_size) + rs->sq_size = max_size; + if (rs->rq_size > max_size) + rs->rq_size = max_size; + + if (rs->rq_size > (rs->rbuf_size / RS_SNDLOWAT)) + rs->rq_size = rs->rbuf_size / RS_SNDLOWAT; + else + rs->rbuf_size = rs->rq_size * RS_SNDLOWAT; + + if (rs->sq_size > (rs->sbuf_size / RS_SNDLOWAT)) + rs->sq_size = rs->sbuf_size / RS_SNDLOWAT; + else + rs->sbuf_size = rs->sq_size * RS_SNDLOWAT; +} + +static int rs_init_bufs(struct rsocket *rs) +{ + uint32_t rbuf_msg_size; + size_t len; + + rs->rmsg = calloc(rs->rq_size + 1, sizeof(*rs->rmsg)); + if (!rs->rmsg) + return ERR(ENOMEM); + + rs->sbuf = calloc(rs->sbuf_size, sizeof(*rs->sbuf)); + if (!rs->sbuf) + return ERR(ENOMEM); + + rs->smr = rdma_reg_msgs(rs->cm_id, rs->sbuf, rs->sbuf_size); + if (!rs->smr) + return -1; + + len = sizeof(*rs->target_sgl) * RS_SGL_SIZE + + sizeof(*rs->target_iomap) * rs->target_iomap_size; + rs->target_buffer_list = malloc(len); + if (!rs->target_buffer_list) + return ERR(ENOMEM); + + rs->target_mr = rdma_reg_write(rs->cm_id, rs->target_buffer_list, len); + if (!rs->target_mr) + return -1; + + memset(rs->target_buffer_list, 0, len); + rs->target_sgl = rs->target_buffer_list; + if (rs->target_iomap_size) + rs->target_iomap = (struct rs_iomap *) (rs->target_sgl + RS_SGL_SIZE); + + rbuf_msg_size = rs->rbuf_size; + if (rs->opts & RS_OPT_MSG_SEND) + rbuf_msg_size += rs->rq_size * RS_MSG_SIZE; + rs->rbuf = calloc(rbuf_msg_size, 1); + if (!rs->rbuf) + return ERR(ENOMEM); + + rs->rmr = rdma_reg_write(rs->cm_id, rs->rbuf, rbuf_msg_size); + if (!rs->rmr) + return -1; + + rs->ssgl[0].addr = rs->ssgl[1].addr = (uintptr_t) rs->sbuf; + rs->sbuf_bytes_avail = rs->sbuf_size; + rs->ssgl[0].lkey = rs->ssgl[1].lkey = rs->smr->lkey; + + rs->rbuf_free_offset = rs->rbuf_size >> 1; + rs->rbuf_bytes_avail = rs->rbuf_size >> 1; + rs->sqe_avail = rs->sq_size - rs->ctrl_avail; + rs->rseq_comp = rs->rq_size >> 1; + return 0; +} + +static int ds_init_bufs(struct ds_qp *qp) +{ + qp->rbuf = calloc(qp->rs->rbuf_size + sizeof(struct ibv_grh), 1); + if (!qp->rbuf) + return ERR(ENOMEM); + + qp->smr = rdma_reg_msgs(qp->cm_id, qp->rs->sbuf, qp->rs->sbuf_size); + if (!qp->smr) + return -1; + + qp->rmr = rdma_reg_msgs(qp->cm_id, qp->rbuf, qp->rs->rbuf_size + + sizeof(struct ibv_grh)); + if (!qp->rmr) + return -1; + + return 0; +} + +/* + * If a user is waiting on a datagram rsocket through poll or select, then + * we need the first completion to generate an event on the related epoll fd + * in order to signal the user. 
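+ * (A completion generates an event on the completion channel only if the
+ * CQ was armed with ibv_req_notify_cq() beforehand, and it is that channel
+ * fd which the epoll set used by the datagram code watches.)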
We arm the CQ on creation for this purpose + */ +static int rs_create_cq(struct rsocket *rs, struct rdma_cm_id *cm_id) +{ + cm_id->recv_cq_channel = ibv_create_comp_channel(cm_id->verbs); + if (!cm_id->recv_cq_channel) + return -1; + + cm_id->recv_cq = ibv_create_cq(cm_id->verbs, rs->sq_size + rs->rq_size, + cm_id, cm_id->recv_cq_channel, 0); + if (!cm_id->recv_cq) + goto err1; + + if (rs->fd_flags & O_NONBLOCK) { + if (fcntl(cm_id->recv_cq_channel->fd, F_SETFL, O_NONBLOCK)) + goto err2; + } + + ibv_req_notify_cq(cm_id->recv_cq, 0); + cm_id->send_cq_channel = cm_id->recv_cq_channel; + cm_id->send_cq = cm_id->recv_cq; + return 0; + +err2: + ibv_destroy_cq(cm_id->recv_cq); + cm_id->recv_cq = NULL; +err1: + ibv_destroy_comp_channel(cm_id->recv_cq_channel); + cm_id->recv_cq_channel = NULL; + return -1; +} + +static inline int rs_post_recv(struct rsocket *rs) +{ + struct ibv_recv_wr wr, *bad; + struct ibv_sge sge; + + wr.next = NULL; + if (!(rs->opts & RS_OPT_MSG_SEND)) { + wr.wr_id = rs_recv_wr_id(0); + wr.sg_list = NULL; + wr.num_sge = 0; + } else { + wr.wr_id = rs_recv_wr_id(rs->rbuf_msg_index); + sge.addr = (uintptr_t) rs->rbuf + rs->rbuf_size + + (rs->rbuf_msg_index * RS_MSG_SIZE); + sge.length = RS_MSG_SIZE; + sge.lkey = rs->rmr->lkey; + + wr.sg_list = &sge; + wr.num_sge = 1; + if(++rs->rbuf_msg_index == rs->rq_size) + rs->rbuf_msg_index = 0; + } + + return rdma_seterrno(ibv_post_recv(rs->cm_id->qp, &wr, &bad)); +} + +static inline int ds_post_recv(struct rsocket *rs, struct ds_qp *qp, uint32_t offset) +{ + struct ibv_recv_wr wr, *bad; + struct ibv_sge sge[2]; + + sge[0].addr = (uintptr_t) qp->rbuf + rs->rbuf_size; + sge[0].length = sizeof(struct ibv_grh); + sge[0].lkey = qp->rmr->lkey; + sge[1].addr = (uintptr_t) qp->rbuf + offset; + sge[1].length = RS_SNDLOWAT; + sge[1].lkey = qp->rmr->lkey; + + wr.wr_id = rs_recv_wr_id(offset); + wr.next = NULL; + wr.sg_list = sge; + wr.num_sge = 2; + + return rdma_seterrno(ibv_post_recv(qp->cm_id->qp, &wr, &bad)); +} + +static int rs_create_ep(struct rsocket *rs) +{ + struct ibv_qp_init_attr qp_attr; + int i, ret; + + rs_set_qp_size(rs); + if (rs->cm_id->verbs->device->transport_type == IBV_TRANSPORT_IWARP) + rs->opts |= RS_OPT_MSG_SEND; + ret = rs_init_bufs(rs); + if (ret) + return ret; + + ret = rs_create_cq(rs, rs->cm_id); + if (ret) + return ret; + + memset(&qp_attr, 0, sizeof qp_attr); + qp_attr.qp_context = rs; + qp_attr.send_cq = rs->cm_id->send_cq; + qp_attr.recv_cq = rs->cm_id->recv_cq; + qp_attr.qp_type = IBV_QPT_RC; + qp_attr.sq_sig_all = 1; + qp_attr.cap.max_send_wr = rs->sq_size; + qp_attr.cap.max_recv_wr = rs->rq_size; + qp_attr.cap.max_send_sge = 2; + qp_attr.cap.max_recv_sge = 1; + qp_attr.cap.max_inline_data = rs->sq_inline; + + ret = rdma_create_qp(rs->cm_id, NULL, &qp_attr); + if (ret) + return ret; + + for (i = 0; i < rs->rq_size; i++) { + ret = rs_post_recv(rs); + if (ret) + return ret; + } + return 0; +} + +static void rs_release_iomap_mr(struct rs_iomap_mr *iomr) +{ + if (atomic_dec(&iomr->refcnt)) + return; + + dlist_remove(&iomr->entry); + ibv_dereg_mr(iomr->mr); + if (iomr->index >= 0) + iomr->mr = NULL; + else + free(iomr); +} + +static void rs_free_iomappings(struct rsocket *rs) +{ + struct rs_iomap_mr *iomr; + + while (!dlist_empty(&rs->iomap_list)) { + iomr = container_of(rs->iomap_list.next, + struct rs_iomap_mr, entry); + riounmap(rs->index, iomr->mr->addr, iomr->mr->length); + } + while (!dlist_empty(&rs->iomap_queue)) { + iomr = container_of(rs->iomap_queue.next, + struct rs_iomap_mr, entry); + riounmap(rs->index, 
iomr->mr->addr, iomr->mr->length); + } +} + +static void ds_free_qp(struct ds_qp *qp) +{ + if (qp->smr) + rdma_dereg_mr(qp->smr); + + if (qp->rbuf) { + if (qp->rmr) + rdma_dereg_mr(qp->rmr); + free(qp->rbuf); + } + + if (qp->cm_id) { + if (qp->cm_id->qp) { + tdelete(&qp->dest.addr, &qp->rs->dest_map, ds_compare_addr); + epoll_ctl(qp->rs->epfd, EPOLL_CTL_DEL, + qp->cm_id->recv_cq_channel->fd, NULL); + rdma_destroy_qp(qp->cm_id); + } + rdma_destroy_id(qp->cm_id); + } + + free(qp); +} + +static void ds_free(struct rsocket *rs) +{ + struct ds_qp *qp; + + if (rs->udp_sock >= 0) + close(rs->udp_sock); + + if (rs->index >= 0) + rs_remove(rs); + + if (rs->dmsg) + free(rs->dmsg); + + while ((qp = rs->qp_list)) { + ds_remove_qp(rs, qp); + ds_free_qp(qp); + } + + if (rs->epfd >= 0) + close(rs->epfd); + + if (rs->sbuf) + free(rs->sbuf); + + tdestroy(rs->dest_map, free); + fastlock_destroy(&rs->map_lock); + fastlock_destroy(&rs->cq_wait_lock); + fastlock_destroy(&rs->cq_lock); + fastlock_destroy(&rs->rlock); + fastlock_destroy(&rs->slock); + free(rs); +} + +static void rs_free(struct rsocket *rs) +{ + if (rs->type == SOCK_DGRAM) { + ds_free(rs); + return; + } + + if (rs->index >= 0) + rs_remove(rs); + + if (rs->rmsg) + free(rs->rmsg); + + if (rs->sbuf) { + if (rs->smr) + rdma_dereg_mr(rs->smr); + free(rs->sbuf); + } + + if (rs->rbuf) { + if (rs->rmr) + rdma_dereg_mr(rs->rmr); + free(rs->rbuf); + } + + if (rs->target_buffer_list) { + if (rs->target_mr) + rdma_dereg_mr(rs->target_mr); + free(rs->target_buffer_list); + } + + if (rs->cm_id) { + rs_free_iomappings(rs); + if (rs->cm_id->qp) + rdma_destroy_qp(rs->cm_id); + rdma_destroy_id(rs->cm_id); + } + + fastlock_destroy(&rs->map_lock); + fastlock_destroy(&rs->cq_wait_lock); + fastlock_destroy(&rs->cq_lock); + fastlock_destroy(&rs->rlock); + fastlock_destroy(&rs->slock); + free(rs); +} + +static size_t rs_conn_data_offset(struct rsocket *rs) +{ + return (rs->cm_id->route.addr.src_addr.sa_family == AF_IB) ? + sizeof(struct ib_connect_hdr) : 0; +} + +static void rs_format_conn_data(struct rsocket *rs, struct rs_conn_data *conn) +{ + conn->version = 1; + conn->flags = RS_CONN_FLAG_IOMAP | + (rs_host_is_net() ? 
RS_CONN_FLAG_NET : 0); + conn->credits = htons(rs->rq_size); + memset(conn->reserved, 0, sizeof conn->reserved); + conn->target_iomap_size = (uint8_t) rs_value_to_scale(rs->target_iomap_size, 8); + + conn->target_sgl.addr = htonll((uintptr_t) rs->target_sgl); + conn->target_sgl.length = htonl(RS_SGL_SIZE); + conn->target_sgl.key = htonl(rs->target_mr->rkey); + + conn->data_buf.addr = htonll((uintptr_t) rs->rbuf); + conn->data_buf.length = htonl(rs->rbuf_size >> 1); + conn->data_buf.key = htonl(rs->rmr->rkey); +} + +static void rs_save_conn_data(struct rsocket *rs, struct rs_conn_data *conn) +{ + rs->remote_sgl.addr = ntohll(conn->target_sgl.addr); + rs->remote_sgl.length = ntohl(conn->target_sgl.length); + rs->remote_sgl.key = ntohl(conn->target_sgl.key); + rs->remote_sge = 1; + if ((rs_host_is_net() && !(conn->flags & RS_CONN_FLAG_NET)) || + (!rs_host_is_net() && (conn->flags & RS_CONN_FLAG_NET))) + rs->opts = RS_OPT_SWAP_SGL; + + if (conn->flags & RS_CONN_FLAG_IOMAP) { + rs->remote_iomap.addr = rs->remote_sgl.addr + + sizeof(rs->remote_sgl) * rs->remote_sgl.length; + rs->remote_iomap.length = rs_scale_to_value(conn->target_iomap_size, 8); + rs->remote_iomap.key = rs->remote_sgl.key; + } + + rs->target_sgl[0].addr = ntohll(conn->data_buf.addr); + rs->target_sgl[0].length = ntohl(conn->data_buf.length); + rs->target_sgl[0].key = ntohl(conn->data_buf.key); + + rs->sseq_comp = ntohs(conn->credits); +} + +static int ds_init(struct rsocket *rs, int domain) +{ + rs->udp_sock = socket(domain, SOCK_DGRAM, 0); + if (rs->udp_sock < 0) + return rs->udp_sock; + + rs->epfd = epoll_create(2); + if (rs->epfd < 0) + return rs->epfd; + + return 0; +} + +static int ds_init_ep(struct rsocket *rs) +{ + struct ds_smsg *msg; + int i, ret; + + ds_set_qp_size(rs); + + rs->sbuf = calloc(rs->sq_size, RS_SNDLOWAT); + if (!rs->sbuf) + return ERR(ENOMEM); + + rs->dmsg = calloc(rs->rq_size + 1, sizeof(*rs->dmsg)); + if (!rs->dmsg) + return ERR(ENOMEM); + + rs->sqe_avail = rs->sq_size; + rs->rqe_avail = rs->rq_size; + + rs->smsg_free = (struct ds_smsg *) rs->sbuf; + msg = rs->smsg_free; + for (i = 0; i < rs->sq_size - 1; i++) { + msg->next = (void *) msg + RS_SNDLOWAT; + msg = msg->next; + } + msg->next = NULL; + + ret = rs_modify_svcs(rs, RS_SVC_DGRAM); + if (ret) + return ret; + + rs->state = rs_readable | rs_writable; + return 0; +} + +int rsocket(int domain, int type, int protocol) +{ + struct rsocket *rs; + int index, ret; + + if ((domain != AF_INET && domain != AF_INET6 && domain != AF_IB) || + ((type != SOCK_STREAM) && (type != SOCK_DGRAM)) || + (type == SOCK_STREAM && protocol && protocol != IPPROTO_TCP) || + (type == SOCK_DGRAM && protocol && protocol != IPPROTO_UDP)) + return ERR(ENOTSUP); + + rs_configure(); + rs = rs_alloc(NULL, type); + if (!rs) + return ERR(ENOMEM); + + if (type == SOCK_STREAM) { + ret = rdma_create_id(NULL, &rs->cm_id, rs, RDMA_PS_TCP); + if (ret) + goto err; + + rs->cm_id->route.addr.src_addr.sa_family = domain; + index = rs->cm_id->channel->fd; + } else { + ret = ds_init(rs, domain); + if (ret) + goto err; + + index = rs->udp_sock; + } + + ret = rs_insert(rs, index); + if (ret < 0) + goto err; + + return rs->index; + +err: + rs_free(rs); + return ret; +} + +int rbind(int socket, const struct sockaddr *addr, socklen_t addrlen) +{ + struct rsocket *rs; + int ret; + + rs = idm_at(&idm, socket); + if (rs->type == SOCK_STREAM) { + ret = rdma_bind_addr(rs->cm_id, (struct sockaddr *) addr); + if (!ret) + rs->state = rs_bound; + } else { + if (rs->state == rs_init) { + ret = ds_init_ep(rs); + 
if (ret) + return ret; + } + ret = bind(rs->udp_sock, addr, addrlen); + } + return ret; +} + +int rlisten(int socket, int backlog) +{ + struct rsocket *rs; + int ret; + + rs = idm_at(&idm, socket); + ret = rdma_listen(rs->cm_id, backlog); + if (!ret) + rs->state = rs_listening; + return ret; +} + +/* + * Nonblocking is usually not inherited between sockets, but we need to + * inherit it here to establish the connection only. This is needed to + * prevent rdma_accept from blocking until the remote side finishes + * establishing the connection. If we were to allow rdma_accept to block, + * then a single thread cannot establish a connection with itself, or + * two threads which try to connect to each other can deadlock trying to + * form a connection. + * + * Data transfers on the new socket remain blocking unless the user + * specifies otherwise through rfcntl. + */ +int raccept(int socket, struct sockaddr *addr, socklen_t *addrlen) +{ + struct rsocket *rs, *new_rs; + struct rdma_conn_param param; + struct rs_conn_data *creq, cresp; + int ret; + + rs = idm_at(&idm, socket); + new_rs = rs_alloc(rs, rs->type); + if (!new_rs) + return ERR(ENOMEM); + + ret = rdma_get_request(rs->cm_id, &new_rs->cm_id); + if (ret) + goto err; + + ret = rs_insert(new_rs, new_rs->cm_id->channel->fd); + if (ret < 0) + goto err; + + creq = (struct rs_conn_data *) + (new_rs->cm_id->event->param.conn.private_data + rs_conn_data_offset(rs)); + if (creq->version != 1) { + ret = ERR(ENOTSUP); + goto err; + } + + if (rs->fd_flags & O_NONBLOCK) + fcntl(new_rs->cm_id->channel->fd, F_SETFL, O_NONBLOCK); + + ret = rs_create_ep(new_rs); + if (ret) + goto err; + + rs_save_conn_data(new_rs, creq); + param = new_rs->cm_id->event->param.conn; + rs_format_conn_data(new_rs, &cresp); + param.private_data = &cresp; + param.private_data_len = sizeof cresp; + ret = rdma_accept(new_rs->cm_id, &param); + if (!ret) + new_rs->state = rs_connect_rdwr; + else if (errno == EAGAIN || errno == EWOULDBLOCK) + new_rs->state = rs_accepting; + else + goto err; + + if (addr && addrlen) + rgetpeername(new_rs->index, addr, addrlen); + return new_rs->index; + +err: + rs_free(new_rs); + return ret; +} + +static int rs_do_connect(struct rsocket *rs) +{ + struct rdma_conn_param param; + struct rs_conn_private_data cdata; + struct rs_conn_data *creq, *cresp; + int to, ret; + + switch (rs->state) { + case rs_init: + case rs_bound: +resolve_addr: + to = 1000 << rs->retries++; + ret = rdma_resolve_addr(rs->cm_id, NULL, + &rs->cm_id->route.addr.dst_addr, to); + if (!ret) + goto resolve_route; + if (errno == EAGAIN || errno == EWOULDBLOCK) + rs->state = rs_resolving_addr; + break; + case rs_resolving_addr: + ret = ucma_complete(rs->cm_id); + if (ret) { + if (errno == ETIMEDOUT && rs->retries <= RS_CONN_RETRIES) + goto resolve_addr; + break; + } + + rs->retries = 0; +resolve_route: + to = 1000 << rs->retries++; + if (rs->optval) { + ret = rdma_set_option(rs->cm_id, RDMA_OPTION_IB, + RDMA_OPTION_IB_PATH, rs->optval, + rs->optlen); + free(rs->optval); + rs->optval = NULL; + if (!ret) { + rs->state = rs_resolving_route; + goto resolving_route; + } + } else { + ret = rdma_resolve_route(rs->cm_id, to); + if (!ret) + goto do_connect; + } + if (errno == EAGAIN || errno == EWOULDBLOCK) + rs->state = rs_resolving_route; + break; + case rs_resolving_route: +resolving_route: + ret = ucma_complete(rs->cm_id); + if (ret) { + if (errno == ETIMEDOUT && rs->retries <= RS_CONN_RETRIES) + goto resolve_route; + break; + } +do_connect: + ret = rs_create_ep(rs); + if (ret) + break; +
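+ /* Build the connection request: rs_format_conn_data() advertises this
+ * side's receive buffer, target SGL and initial credits; the result is
+ * carried to the passive side as CM private data on rdma_connect(). */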
memset(&param, 0, sizeof param); + creq = (void *) &cdata + rs_conn_data_offset(rs); + rs_format_conn_data(rs, creq); + param.private_data = (void *) creq - rs_conn_data_offset(rs); + param.private_data_len = sizeof(*creq) + rs_conn_data_offset(rs); + param.flow_control = 1; + param.retry_count = 7; + param.rnr_retry_count = 7; + /* work-around: iWarp issues RDMA read during connection */ + if (rs->opts & RS_OPT_MSG_SEND) + param.initiator_depth = 1; + rs->retries = 0; + + ret = rdma_connect(rs->cm_id, &param); + if (!ret) + goto connected; + if (errno == EAGAIN || errno == EWOULDBLOCK) + rs->state = rs_connecting; + break; + case rs_connecting: + ret = ucma_complete(rs->cm_id); + if (ret) + break; +connected: + cresp = (struct rs_conn_data *) rs->cm_id->event->param.conn.private_data; + if (cresp->version != 1) { + ret = ERR(ENOTSUP); + break; + } + + rs_save_conn_data(rs, cresp); + rs->state = rs_connect_rdwr; + break; + case rs_accepting: + if (!(rs->fd_flags & O_NONBLOCK)) + fcntl(rs->cm_id->channel->fd, F_SETFL, 0); + + ret = ucma_complete(rs->cm_id); + if (ret) + break; + + rs->state = rs_connect_rdwr; + break; + default: + ret = ERR(EINVAL); + break; + } + + if (ret) { + if (errno == EAGAIN || errno == EWOULDBLOCK) { + errno = EINPROGRESS; + } else { + rs->state = rs_connect_error; + rs->err = errno; + } + } + return ret; +} + +static int rs_any_addr(const union socket_addr *addr) +{ + if (addr->sa.sa_family == AF_INET) { + return (addr->sin.sin_addr.s_addr == INADDR_ANY || + addr->sin.sin_addr.s_addr == INADDR_LOOPBACK); + } else { + return (!memcmp(&addr->sin6.sin6_addr, &in6addr_any, 16) || + !memcmp(&addr->sin6.sin6_addr, &in6addr_loopback, 16)); + } +} + +static int ds_get_src_addr(struct rsocket *rs, + const struct sockaddr *dest_addr, socklen_t dest_len, + union socket_addr *src_addr, socklen_t *src_len) +{ + int sock, ret; + uint16_t port; + + *src_len = sizeof *src_addr; + ret = getsockname(rs->udp_sock, &src_addr->sa, src_len); + if (ret || !rs_any_addr(src_addr)) + return ret; + + port = src_addr->sin.sin_port; + sock = socket(dest_addr->sa_family, SOCK_DGRAM, 0); + if (sock < 0) + return sock; + + ret = connect(sock, dest_addr, dest_len); + if (ret) + goto out; + + *src_len = sizeof *src_addr; + ret = getsockname(sock, &src_addr->sa, src_len); + src_addr->sin.sin_port = port; +out: + close(sock); + return ret; +} + +static void ds_format_hdr(struct ds_header *hdr, union socket_addr *addr) +{ + if (addr->sa.sa_family == AF_INET) { + hdr->version = 4; + hdr->length = DS_IPV4_HDR_LEN; + hdr->port = addr->sin.sin_port; + hdr->addr.ipv4 = addr->sin.sin_addr.s_addr; + } else { + hdr->version = 6; + hdr->length = DS_IPV6_HDR_LEN; + hdr->port = addr->sin6.sin6_port; + hdr->addr.ipv6.flowinfo= addr->sin6.sin6_flowinfo; + memcpy(&hdr->addr.ipv6.addr, &addr->sin6.sin6_addr, 16); + } +} + +static int ds_add_qp_dest(struct ds_qp *qp, union socket_addr *addr, + socklen_t addrlen) +{ + struct ibv_port_attr port_attr; + struct ibv_ah_attr attr; + int ret; + + memcpy(&qp->dest.addr, addr, addrlen); + qp->dest.qp = qp; + qp->dest.qpn = qp->cm_id->qp->qp_num; + + ret = ibv_query_port(qp->cm_id->verbs, qp->cm_id->port_num, &port_attr); + if (ret) + return ret; + + memset(&attr, 0, sizeof attr); + attr.dlid = port_attr.lid; + attr.port_num = qp->cm_id->port_num; + qp->dest.ah = ibv_create_ah(qp->cm_id->pd, &attr); + if (!qp->dest.ah) + return ERR(ENOMEM); + + tsearch(&qp->dest.addr, &qp->rs->dest_map, ds_compare_addr); + return 0; +} + +static int ds_create_qp(struct rsocket *rs, union socket_addr
*src_addr, + socklen_t addrlen, struct ds_qp **new_qp) +{ + struct ds_qp *qp; + struct ibv_qp_init_attr qp_attr; + struct epoll_event event; + int i, ret; + + qp = calloc(1, sizeof(*qp)); + if (!qp) + return ERR(ENOMEM); + + qp->rs = rs; + ret = rdma_create_id(NULL, &qp->cm_id, qp, RDMA_PS_UDP); + if (ret) + goto err; + + ds_format_hdr(&qp->hdr, src_addr); + ret = rdma_bind_addr(qp->cm_id, &src_addr->sa); + if (ret) + goto err; + + ret = ds_init_bufs(qp); + if (ret) + goto err; + + ret = rs_create_cq(rs, qp->cm_id); + if (ret) + goto err; + + memset(&qp_attr, 0, sizeof qp_attr); + qp_attr.qp_context = qp; + qp_attr.send_cq = qp->cm_id->send_cq; + qp_attr.recv_cq = qp->cm_id->recv_cq; + qp_attr.qp_type = IBV_QPT_UD; + qp_attr.sq_sig_all = 1; + qp_attr.cap.max_send_wr = rs->sq_size; + qp_attr.cap.max_recv_wr = rs->rq_size; + qp_attr.cap.max_send_sge = 1; + qp_attr.cap.max_recv_sge = 2; + qp_attr.cap.max_inline_data = rs->sq_inline; + ret = rdma_create_qp(qp->cm_id, NULL, &qp_attr); + if (ret) + goto err; + + ret = ds_add_qp_dest(qp, src_addr, addrlen); + if (ret) + goto err; + + event.events = EPOLLIN; + event.data.ptr = qp; + ret = epoll_ctl(rs->epfd, EPOLL_CTL_ADD, + qp->cm_id->recv_cq_channel->fd, &event); + if (ret) + goto err; + + for (i = 0; i < rs->rq_size; i++) { + ret = ds_post_recv(rs, qp, i * RS_SNDLOWAT); + if (ret) + goto err; + } + + ds_insert_qp(rs, qp); + *new_qp = qp; + return 0; +err: + ds_free_qp(qp); + return ret; +} + +static int ds_get_qp(struct rsocket *rs, union socket_addr *src_addr, + socklen_t addrlen, struct ds_qp **qp) +{ + if (rs->qp_list) { + *qp = rs->qp_list; + do { + if (!ds_compare_addr(rdma_get_local_addr((*qp)->cm_id), + src_addr)) + return 0; + + *qp = ds_next_qp(*qp); + } while (*qp != rs->qp_list); + } + + return ds_create_qp(rs, src_addr, addrlen, qp); +} + +static int ds_get_dest(struct rsocket *rs, const struct sockaddr *addr, + socklen_t addrlen, struct ds_dest **dest) +{ + union socket_addr src_addr; + socklen_t src_len; + struct ds_qp *qp; + struct ds_dest **tdest, *new_dest; + int ret = 0; + + fastlock_acquire(&rs->map_lock); + tdest = tfind(addr, &rs->dest_map, ds_compare_addr); + if (tdest) + goto found; + + ret = ds_get_src_addr(rs, addr, addrlen, &src_addr, &src_len); + if (ret) + goto out; + + ret = ds_get_qp(rs, &src_addr, src_len, &qp); + if (ret) + goto out; + + tdest = tfind(addr, &rs->dest_map, ds_compare_addr); + if (!tdest) { + new_dest = calloc(1, sizeof(*new_dest)); + if (!new_dest) { + ret = ERR(ENOMEM); + goto out; + } + + memcpy(&new_dest->addr, addr, addrlen); + new_dest->qp = qp; + tdest = tsearch(&new_dest->addr, &rs->dest_map, ds_compare_addr); + } + +found: + *dest = *tdest; +out: + fastlock_release(&rs->map_lock); + return ret; +} + +int rconnect(int socket, const struct sockaddr *addr, socklen_t addrlen) +{ + struct rsocket *rs; + int ret; + + rs = idm_at(&idm, socket); + if (rs->type == SOCK_STREAM) { + memcpy(&rs->cm_id->route.addr.dst_addr, addr, addrlen); + ret = rs_do_connect(rs); + } else { + if (rs->state == rs_init) { + ret = ds_init_ep(rs); + if (ret) + return ret; + } + + fastlock_acquire(&rs->slock); + ret = connect(rs->udp_sock, addr, addrlen); + if (!ret) + ret = ds_get_dest(rs, addr, addrlen, &rs->conn_dest); + fastlock_release(&rs->slock); + } + return ret; +} + +static int rs_post_msg(struct rsocket *rs, uint32_t msg) +{ + struct ibv_send_wr wr, *bad; + struct ibv_sge sge; + + wr.wr_id = rs_send_wr_id(msg); + wr.next = NULL; + if (!(rs->opts & RS_OPT_MSG_SEND)) { + wr.sg_list = NULL; + wr.num_sge = 0; + 
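+ /* Zero-length RDMA write: no SGE is attached, the message itself is
+ * carried in the 32-bit immediate data (rs_msg_set() format above). */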
wr.opcode = IBV_WR_RDMA_WRITE_WITH_IMM; + wr.send_flags = 0; + wr.imm_data = htonl(msg); + } else { + sge.addr = (uintptr_t) &msg; + sge.lkey = 0; + sge.length = sizeof msg; + wr.sg_list = &sge; + wr.num_sge = 1; + wr.opcode = IBV_WR_SEND; + wr.send_flags = IBV_SEND_INLINE; + } + + return rdma_seterrno(ibv_post_send(rs->cm_id->qp, &wr, &bad)); +} + +static int rs_post_write(struct rsocket *rs, + struct ibv_sge *sgl, int nsge, + uint32_t wr_data, int flags, + uint64_t addr, uint32_t rkey) +{ + struct ibv_send_wr wr, *bad; + + wr.wr_id = rs_send_wr_id(wr_data); + wr.next = NULL; + wr.sg_list = sgl; + wr.num_sge = nsge; + wr.opcode = IBV_WR_RDMA_WRITE; + wr.send_flags = flags; + wr.wr.rdma.remote_addr = addr; + wr.wr.rdma.rkey = rkey; + + return rdma_seterrno(ibv_post_send(rs->cm_id->qp, &wr, &bad)); +} + +static int rs_post_write_msg(struct rsocket *rs, + struct ibv_sge *sgl, int nsge, + uint32_t msg, int flags, + uint64_t addr, uint32_t rkey) +{ + struct ibv_send_wr wr, *bad; + int ret; + + if (!(rs->opts & RS_OPT_MSG_SEND)) { + wr.wr_id = rs_send_wr_id(msg); + wr.next = NULL; + wr.sg_list = sgl; + wr.num_sge = nsge; + wr.opcode = IBV_WR_RDMA_WRITE_WITH_IMM; + wr.send_flags = flags; + wr.imm_data = htonl(msg); + wr.wr.rdma.remote_addr = addr; + wr.wr.rdma.rkey = rkey; + + return rdma_seterrno(ibv_post_send(rs->cm_id->qp, &wr, &bad)); + } else { + ret = rs_post_write(rs, sgl, nsge, msg, flags, addr, rkey); + if (!ret) + ret = rs_post_msg(rs, msg); + return ret; + } +} + +static int ds_post_send(struct rsocket *rs, struct ibv_sge *sge, + uint32_t wr_data) +{ + struct ibv_send_wr wr, *bad; + + wr.wr_id = rs_send_wr_id(wr_data); + wr.next = NULL; + wr.sg_list = sge; + wr.num_sge = 1; + wr.opcode = IBV_WR_SEND; + wr.send_flags = (sge->length <= rs->sq_inline) ? IBV_SEND_INLINE : 0; + wr.wr.ud.ah = rs->conn_dest->ah; + wr.wr.ud.remote_qpn = rs->conn_dest->qpn; + wr.wr.ud.remote_qkey = RDMA_UDP_QKEY; + + return rdma_seterrno(ibv_post_send(rs->conn_dest->qp->cm_id->qp, &wr, &bad)); +} + +/* + * Update target SGE before sending data. Otherwise the remote side may + * update the entry before we do. 
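+ * (The peer refills target_sgl entries by RDMA-writing fresh SGEs into
+ * this array, just as rs_send_credits() does toward the peer, so the
+ * local addr/length bookkeeping below must finish before the write is
+ * posted.)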
+ */ +static int rs_write_data(struct rsocket *rs, + struct ibv_sge *sgl, int nsge, + uint32_t length, int flags) +{ + uint64_t addr; + uint32_t rkey; + + rs->sseq_no++; + rs->sqe_avail--; + if (rs->opts & RS_OPT_MSG_SEND) + rs->sqe_avail--; + rs->sbuf_bytes_avail -= length; + + addr = rs->target_sgl[rs->target_sge].addr; + rkey = rs->target_sgl[rs->target_sge].key; + + rs->target_sgl[rs->target_sge].addr += length; + rs->target_sgl[rs->target_sge].length -= length; + + if (!rs->target_sgl[rs->target_sge].length) { + if (++rs->target_sge == RS_SGL_SIZE) + rs->target_sge = 0; + } + + return rs_post_write_msg(rs, sgl, nsge, rs_msg_set(RS_OP_DATA, length), + flags, addr, rkey); +} + +static int rs_write_direct(struct rsocket *rs, struct rs_iomap *iom, uint64_t offset, + struct ibv_sge *sgl, int nsge, uint32_t length, int flags) +{ + uint64_t addr; + + rs->sqe_avail--; + rs->sbuf_bytes_avail -= length; + + addr = iom->sge.addr + offset - iom->offset; + return rs_post_write(rs, sgl, nsge, rs_msg_set(RS_OP_WRITE, length), + flags, addr, iom->sge.key); +} + +static int rs_write_iomap(struct rsocket *rs, struct rs_iomap_mr *iomr, + struct ibv_sge *sgl, int nsge, int flags) +{ + uint64_t addr; + + rs->sseq_no++; + rs->sqe_avail--; + if (rs->opts & RS_OPT_MSG_SEND) + rs->sqe_avail--; + rs->sbuf_bytes_avail -= sizeof(struct rs_iomap); + + addr = rs->remote_iomap.addr + iomr->index * sizeof(struct rs_iomap); + return rs_post_write_msg(rs, sgl, nsge, rs_msg_set(RS_OP_IOMAP_SGL, iomr->index), + flags, addr, rs->remote_iomap.key); +} + +static uint32_t rs_sbuf_left(struct rsocket *rs) +{ + return (uint32_t) (((uint64_t) (uintptr_t) &rs->sbuf[rs->sbuf_size]) - + rs->ssgl[0].addr); +} + +static void rs_send_credits(struct rsocket *rs) +{ + struct ibv_sge ibsge; + struct rs_sge sge; + + rs->ctrl_avail--; + rs->rseq_comp = rs->rseq_no + (rs->rq_size >> 1); + if (rs->rbuf_bytes_avail >= (rs->rbuf_size >> 1)) { + if (rs->opts & RS_OPT_MSG_SEND) + rs->ctrl_avail--; + + if (!(rs->opts & RS_OPT_SWAP_SGL)) { + sge.addr = (uintptr_t) &rs->rbuf[rs->rbuf_free_offset]; + sge.key = rs->rmr->rkey; + sge.length = rs->rbuf_size >> 1; + } else { + sge.addr = bswap_64((uintptr_t) &rs->rbuf[rs->rbuf_free_offset]); + sge.key = bswap_32(rs->rmr->rkey); + sge.length = bswap_32(rs->rbuf_size >> 1); + } + + ibsge.addr = (uintptr_t) &sge; + ibsge.lkey = 0; + ibsge.length = sizeof(sge); + + rs_post_write_msg(rs, &ibsge, 1, + rs_msg_set(RS_OP_SGL, rs->rseq_no + rs->rq_size), + IBV_SEND_INLINE, + rs->remote_sgl.addr + + rs->remote_sge * sizeof(struct rs_sge), + rs->remote_sgl.key); + + rs->rbuf_bytes_avail -= rs->rbuf_size >> 1; + rs->rbuf_free_offset += rs->rbuf_size >> 1; + if (rs->rbuf_free_offset >= rs->rbuf_size) + rs->rbuf_free_offset = 0; + if (++rs->remote_sge == rs->remote_sgl.length) + rs->remote_sge = 0; + } else { + rs_post_msg(rs, rs_msg_set(RS_OP_SGL, rs->rseq_no + rs->rq_size)); + } +} + +static int rs_give_credits(struct rsocket *rs) +{ + if (!(rs->opts & RS_OPT_MSG_SEND)) { + return ((rs->rbuf_bytes_avail >= (rs->rbuf_size >> 1)) || + ((short) ((short) rs->rseq_no - (short) rs->rseq_comp) >= 0)) && + rs->ctrl_avail && (rs->state & rs_connected); + } else { + return ((rs->rbuf_bytes_avail >= (rs->rbuf_size >> 1)) || + ((short) ((short) rs->rseq_no - (short) rs->rseq_comp) >= 0)) && + (rs->ctrl_avail > 1) && (rs->state & rs_connected); + } +} + +static void rs_update_credits(struct rsocket *rs) +{ + if (rs_give_credits(rs)) + rs_send_credits(rs); +} + +static int rs_poll_cq(struct rsocket *rs) +{ + struct ibv_wc wc; + 
uint32_t msg; + int ret, rcnt = 0; + + while ((ret = ibv_poll_cq(rs->cm_id->recv_cq, 1, &wc)) > 0) { + if (rs_wr_is_recv(wc.wr_id)) { + if (wc.status != IBV_WC_SUCCESS) + continue; + rcnt++; + + if (wc.wc_flags & IBV_WC_WITH_IMM) { + msg = ntohl(wc.imm_data); + } else { + msg = ((uint32_t *) (rs->rbuf + rs->rbuf_size)) + [rs_wr_data(wc.wr_id)]; + + } + switch (rs_msg_op(msg)) { + case RS_OP_SGL: + rs->sseq_comp = (uint16_t) rs_msg_data(msg); + break; + case RS_OP_IOMAP_SGL: + /* The iomap was updated, that's nice to know. */ + break; + case RS_OP_CTRL: + if (rs_msg_data(msg) == RS_CTRL_DISCONNECT) { + rs->state = rs_disconnected; + return 0; + } else if (rs_msg_data(msg) == RS_CTRL_SHUTDOWN) { + if (rs->state & rs_writable) { + rs->state &= ~rs_readable; + } else { + rs->state = rs_disconnected; + return 0; + } + } + break; + case RS_OP_WRITE: + /* We really shouldn't be here. */ + break; + default: + rs->rmsg[rs->rmsg_tail].op = rs_msg_op(msg); + rs->rmsg[rs->rmsg_tail].data = rs_msg_data(msg); + if (++rs->rmsg_tail == rs->rq_size + 1) + rs->rmsg_tail = 0; + break; + } + } else { + switch (rs_msg_op(rs_wr_data(wc.wr_id))) { + case RS_OP_SGL: + rs->ctrl_avail++; + break; + case RS_OP_CTRL: + rs->ctrl_avail++; + if (rs_msg_data(rs_wr_data(wc.wr_id)) == RS_CTRL_DISCONNECT) + rs->state = rs_disconnected; + break; + case RS_OP_IOMAP_SGL: + rs->sqe_avail++; + rs->sbuf_bytes_avail += sizeof(struct rs_iomap); + break; + default: + rs->sqe_avail++; + rs->sbuf_bytes_avail += rs_msg_data(rs_wr_data(wc.wr_id)); + break; + } + if (wc.status != IBV_WC_SUCCESS && (rs->state & rs_connected)) { + rs->state = rs_error; + rs->err = EIO; + } + } + } + + if (rs->state & rs_connected) { + while (!ret && rcnt--) + ret = rs_post_recv(rs); + + if (ret) { + rs->state = rs_error; + rs->err = errno; + } + } + return ret; +} + +static int rs_get_cq_event(struct rsocket *rs) +{ + struct ibv_cq *cq; + void *context; + int ret; + + if (!rs->cq_armed) + return 0; + + ret = ibv_get_cq_event(rs->cm_id->recv_cq_channel, &cq, &context); + if (!ret) { + ibv_ack_cq_events(rs->cm_id->recv_cq, 1); + rs->cq_armed = 0; + } else if (errno != EAGAIN) { + rs->state = rs_error; + } + + return ret; +} + +/* + * Although we serialize rsend and rrecv calls with respect to themselves, + * both calls may run simultaneously and need to poll the CQ for completions. + * We need to serialize access to the CQ, but rsend and rrecv need to + * allow each other to make forward progress. + * + * For example, rsend may need to wait for credits from the remote side, + * which could be stalled until the remote process calls rrecv. This should + * not block rrecv from receiving data from the remote side however. + * + * We handle this by using two locks. The cq_lock protects against polling + * the CQ and processing completions. The cq_wait_lock serializes access to + * waiting on the CQ. 
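+ * The resulting sequence in rs_process_cq() below is: take cq_lock and
+ * poll; if blocking is needed, take cq_wait_lock, drop cq_lock so the
+ * other caller may poll, wait in rs_get_cq_event(), then drop
+ * cq_wait_lock and reacquire cq_lock.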
+ */ +static int rs_process_cq(struct rsocket *rs, int nonblock, int (*test)(struct rsocket *rs)) +{ + int ret; + + fastlock_acquire(&rs->cq_lock); + do { + rs_update_credits(rs); + ret = rs_poll_cq(rs); + if (test(rs)) { + ret = 0; + break; + } else if (ret) { + break; + } else if (nonblock) { + ret = ERR(EWOULDBLOCK); + } else if (!rs->cq_armed) { + ibv_req_notify_cq(rs->cm_id->recv_cq, 0); + rs->cq_armed = 1; + } else { + rs_update_credits(rs); + fastlock_acquire(&rs->cq_wait_lock); + fastlock_release(&rs->cq_lock); + + ret = rs_get_cq_event(rs); + fastlock_release(&rs->cq_wait_lock); + fastlock_acquire(&rs->cq_lock); + } + } while (!ret); + + rs_update_credits(rs); + fastlock_release(&rs->cq_lock); + return ret; +} + +static int rs_get_comp(struct rsocket *rs, int nonblock, int (*test)(struct rsocket *rs)) +{ + struct timeval s, e; + uint32_t poll_time = 0; + int ret; + + do { + ret = rs_process_cq(rs, 1, test); + if (!ret || nonblock || errno != EWOULDBLOCK) + return ret; + + if (!poll_time) + gettimeofday(&s, NULL); + + gettimeofday(&e, NULL); + poll_time = (e.tv_sec - s.tv_sec) * 1000000 + + (e.tv_usec - s.tv_usec) + 1; + } while (poll_time <= polling_time); + + ret = rs_process_cq(rs, 0, test); + return ret; +} + +static int ds_valid_recv(struct ds_qp *qp, struct ibv_wc *wc) +{ + struct ds_header *hdr; + + hdr = (struct ds_header *) (qp->rbuf + rs_wr_data(wc->wr_id)); + return ((wc->byte_len >= sizeof(struct ibv_grh) + DS_IPV4_HDR_LEN) && + ((hdr->version == 4 && hdr->length == DS_IPV4_HDR_LEN) || + (hdr->version == 6 && hdr->length == DS_IPV6_HDR_LEN))); +} + +/* + * Poll all CQs associated with a datagram rsocket. We need to drop any + * received messages that we do not have room to store. To limit drops, + * we only poll if we have room to store the receive or we need a send + * buffer. To ensure fairness, we poll the CQs round robin, remembering + * where we left off. 
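+ * (rs->qp_list marks where polling resumes; ds_poll_cqs() stores its
+ * current position back into rs->qp_list when it returns early because a
+ * send buffer freed up while no receive entries remain available.)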
+ */ +static void ds_poll_cqs(struct rsocket *rs) +{ + struct ds_qp *qp; + struct ds_smsg *smsg; + struct ds_rmsg *rmsg; + struct ibv_wc wc; + int ret, cnt; + + if (!(qp = rs->qp_list)) + return; + + do { + cnt = 0; + do { + ret = ibv_poll_cq(qp->cm_id->recv_cq, 1, &wc); + if (ret <= 0) { + qp = ds_next_qp(qp); + continue; + } + + if (rs_wr_is_recv(wc.wr_id)) { + if (rs->rqe_avail && wc.status == IBV_WC_SUCCESS && + ds_valid_recv(qp, &wc)) { + rs->rqe_avail--; + rmsg = &rs->dmsg[rs->rmsg_tail]; + rmsg->qp = qp; + rmsg->offset = rs_wr_data(wc.wr_id); + rmsg->length = wc.byte_len - sizeof(struct ibv_grh); + if (++rs->rmsg_tail == rs->rq_size + 1) + rs->rmsg_tail = 0; + } else { + ds_post_recv(rs, qp, rs_wr_data(wc.wr_id)); + } + } else { + smsg = (struct ds_smsg *) (rs->sbuf + rs_wr_data(wc.wr_id)); + smsg->next = rs->smsg_free; + rs->smsg_free = smsg; + rs->sqe_avail++; + } + + qp = ds_next_qp(qp); + if (!rs->rqe_avail && rs->sqe_avail) { + rs->qp_list = qp; + return; + } + cnt++; + } while (qp != rs->qp_list); + } while (cnt); +} + +static void ds_req_notify_cqs(struct rsocket *rs) +{ + struct ds_qp *qp; + + if (!(qp = rs->qp_list)) + return; + + do { + if (!qp->cq_armed) { + ibv_req_notify_cq(qp->cm_id->recv_cq, 0); + qp->cq_armed = 1; + } + qp = ds_next_qp(qp); + } while (qp != rs->qp_list); +} + +static int ds_get_cq_event(struct rsocket *rs) +{ + struct epoll_event event; + struct ds_qp *qp; + struct ibv_cq *cq; + void *context; + int ret; + + if (!rs->cq_armed) + return 0; + + ret = epoll_wait(rs->epfd, &event, 1, -1); + if (ret <= 0) + return ret; + + qp = event.data.ptr; + ret = ibv_get_cq_event(qp->cm_id->recv_cq_channel, &cq, &context); + if (!ret) { + ibv_ack_cq_events(qp->cm_id->recv_cq, 1); + qp->cq_armed = 0; + rs->cq_armed = 0; + } + + return ret; +} + +static int ds_process_cqs(struct rsocket *rs, int nonblock, int (*test)(struct rsocket *rs)) +{ + int ret = 0; + + fastlock_acquire(&rs->cq_lock); + do { + ds_poll_cqs(rs); + if (test(rs)) { + ret = 0; + break; + } else if (nonblock) { + ret = ERR(EWOULDBLOCK); + } else if (!rs->cq_armed) { + ds_req_notify_cqs(rs); + rs->cq_armed = 1; + } else { + fastlock_acquire(&rs->cq_wait_lock); + fastlock_release(&rs->cq_lock); + + ret = ds_get_cq_event(rs); + fastlock_release(&rs->cq_wait_lock); + fastlock_acquire(&rs->cq_lock); + } + } while (!ret); + + fastlock_release(&rs->cq_lock); + return ret; +} + +static int ds_get_comp(struct rsocket *rs, int nonblock, int (*test)(struct rsocket *rs)) +{ + struct timeval s, e; + uint32_t poll_time = 0; + int ret; + + do { + ret = ds_process_cqs(rs, 1, test); + if (!ret || nonblock || errno != EWOULDBLOCK) + return ret; + + if (!poll_time) + gettimeofday(&s, NULL); + + gettimeofday(&e, NULL); + poll_time = (e.tv_sec - s.tv_sec) * 1000000 + + (e.tv_usec - s.tv_usec) + 1; + } while (poll_time <= polling_time); + + ret = ds_process_cqs(rs, 0, test); + return ret; +} + +static int rs_nonblocking(struct rsocket *rs, int flags) +{ + return (rs->fd_flags & O_NONBLOCK) || (flags & MSG_DONTWAIT); +} + +static int rs_is_cq_armed(struct rsocket *rs) +{ + return rs->cq_armed; +} + +static int rs_poll_all(struct rsocket *rs) +{ + return 1; +} + +/* + * We use hardware flow control to prevent over running the remote + * receive queue. However, data transfers still require space in + * the remote rmsg queue, or we risk losing notification that data + * has been transfered. + * + * Be careful with race conditions in the check below. The target SGL + * may be updated by a remote RDMA write. 
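+ * (This is why target_sgl is declared volatile in struct rsocket: the
+ * peer refills these entries with an RDMA write, the same way
+ * rs_send_credits() updates the peer's copy.)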
+ */ +static int rs_can_send(struct rsocket *rs) +{ + if (!(rs->opts & RS_OPT_MSG_SEND)) { + return rs->sqe_avail && (rs->sbuf_bytes_avail >= RS_SNDLOWAT) && + (rs->sseq_no != rs->sseq_comp) && + (rs->target_sgl[rs->target_sge].length != 0); + } else { + return (rs->sqe_avail >= 2) && (rs->sbuf_bytes_avail >= RS_SNDLOWAT) && + (rs->sseq_no != rs->sseq_comp) && + (rs->target_sgl[rs->target_sge].length != 0); + } +} + +static int ds_can_send(struct rsocket *rs) +{ + return rs->sqe_avail; +} + +static int ds_all_sends_done(struct rsocket *rs) +{ + return rs->sqe_avail == rs->sq_size; +} + +static int rs_conn_can_send(struct rsocket *rs) +{ + return rs_can_send(rs) || !(rs->state & rs_writable); +} + +static int rs_conn_can_send_ctrl(struct rsocket *rs) +{ + return rs->ctrl_avail || !(rs->state & rs_connected); +} + +static int rs_have_rdata(struct rsocket *rs) +{ + return (rs->rmsg_head != rs->rmsg_tail); +} + +static int rs_conn_have_rdata(struct rsocket *rs) +{ + return rs_have_rdata(rs) || !(rs->state & rs_readable); +} + +static int rs_conn_all_sends_done(struct rsocket *rs) +{ + return ((rs->sqe_avail + rs->ctrl_avail) == rs->sq_size) || + !(rs->state & rs_connected); +} + +static void ds_set_src(struct sockaddr *addr, socklen_t *addrlen, + struct ds_header *hdr) +{ + union socket_addr sa; + + memset(&sa, 0, sizeof sa); + if (hdr->version == 4) { + if (*addrlen > sizeof(sa.sin)) + *addrlen = sizeof(sa.sin); + + sa.sin.sin_family = AF_INET; + sa.sin.sin_port = hdr->port; + sa.sin.sin_addr.s_addr = hdr->addr.ipv4; + } else { + if (*addrlen > sizeof(sa.sin6)) + *addrlen = sizeof(sa.sin6); + + sa.sin6.sin6_family = AF_INET6; + sa.sin6.sin6_port = hdr->port; + sa.sin6.sin6_flowinfo = hdr->addr.ipv6.flowinfo; + memcpy(&sa.sin6.sin6_addr, &hdr->addr.ipv6.addr, 16); + } + memcpy(addr, &sa, *addrlen); +} + +static ssize_t ds_recvfrom(struct rsocket *rs, void *buf, size_t len, int flags, + struct sockaddr *src_addr, socklen_t *addrlen) +{ + struct ds_rmsg *rmsg; + struct ds_header *hdr; + int ret; + + if (!(rs->state & rs_readable)) + return ERR(EINVAL); + + if (!rs_have_rdata(rs)) { + ret = ds_get_comp(rs, rs_nonblocking(rs, flags), + rs_have_rdata); + if (ret) + return ret; + } + + rmsg = &rs->dmsg[rs->rmsg_head]; + hdr = (struct ds_header *) (rmsg->qp->rbuf + rmsg->offset); + if (len > rmsg->length - hdr->length) + len = rmsg->length - hdr->length; + + memcpy(buf, (void *) hdr + hdr->length, len); + if (addrlen) + ds_set_src(src_addr, addrlen, hdr); + + if (!(flags & MSG_PEEK)) { + ds_post_recv(rs, rmsg->qp, rmsg->offset); + if (++rs->rmsg_head == rs->rq_size + 1) + rs->rmsg_head = 0; + rs->rqe_avail++; + } + + return len; +} + +static ssize_t rs_peek(struct rsocket *rs, void *buf, size_t len) +{ + size_t left = len; + uint32_t end_size, rsize; + int rmsg_head, rbuf_offset; + + rmsg_head = rs->rmsg_head; + rbuf_offset = rs->rbuf_offset; + + for (; left && (rmsg_head != rs->rmsg_tail); left -= rsize) { + if (left < rs->rmsg[rmsg_head].data) { + rsize = left; + } else { + rsize = rs->rmsg[rmsg_head].data; + if (++rmsg_head == rs->rq_size + 1) + rmsg_head = 0; + } + + end_size = rs->rbuf_size - rbuf_offset; + if (rsize > end_size) { + memcpy(buf, &rs->rbuf[rbuf_offset], end_size); + rbuf_offset = 0; + buf += end_size; + rsize -= end_size; + left -= end_size; + } + memcpy(buf, &rs->rbuf[rbuf_offset], rsize); + rbuf_offset += rsize; + buf += rsize; + } + + return len - left; +} + +/* + * Continue to receive any queued data even if the remote side has disconnected. 
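rs_peek() above and the copy loop in rrecv() below both pull data out of a circular receive buffer, splitting the memcpy when the requested region wraps past the end of the buffer. The same idea, reduced to a standalone helper with hypothetical names, looks like this:

#include <stdint.h>
#include <string.h>

/* Sketch only: copy len bytes out of a circular buffer of ring_size bytes
 * starting at offset, splitting the copy when it wraps around the end of
 * the buffer.  Returns the updated offset. */
static size_t ring_copy_out(void *dst, const uint8_t *ring, size_t ring_size,
                            size_t offset, size_t len)
{
        size_t end = ring_size - offset;

        if (len > end) {
                memcpy(dst, ring + offset, end);
                dst = (uint8_t *) dst + end;
                len -= end;
                offset = 0;
        }
        memcpy(dst, ring + offset, len);
        return offset + len;
}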
+ */ +ssize_t rrecv(int socket, void *buf, size_t len, int flags) +{ + struct rsocket *rs; + size_t left = len; + uint32_t end_size, rsize; + int ret; + + rs = idm_at(&idm, socket); + if (rs->type == SOCK_DGRAM) { + fastlock_acquire(&rs->rlock); + ret = ds_recvfrom(rs, buf, len, flags, NULL, 0); + fastlock_release(&rs->rlock); + return ret; + } + + if (rs->state & rs_opening) { + ret = rs_do_connect(rs); + if (ret) { + if (errno == EINPROGRESS) + errno = EAGAIN; + return ret; + } + } + fastlock_acquire(&rs->rlock); + do { + if (!rs_have_rdata(rs)) { + ret = rs_get_comp(rs, rs_nonblocking(rs, flags), + rs_conn_have_rdata); + if (ret) + break; + } + + ret = 0; + if (flags & MSG_PEEK) { + left = len - rs_peek(rs, buf, left); + break; + } + + for (; left && rs_have_rdata(rs); left -= rsize) { + if (left < rs->rmsg[rs->rmsg_head].data) { + rsize = left; + rs->rmsg[rs->rmsg_head].data -= left; + } else { + rs->rseq_no++; + rsize = rs->rmsg[rs->rmsg_head].data; + if (++rs->rmsg_head == rs->rq_size + 1) + rs->rmsg_head = 0; + } + + end_size = rs->rbuf_size - rs->rbuf_offset; + if (rsize > end_size) { + memcpy(buf, &rs->rbuf[rs->rbuf_offset], end_size); + rs->rbuf_offset = 0; + buf += end_size; + rsize -= end_size; + left -= end_size; + rs->rbuf_bytes_avail += end_size; + } + memcpy(buf, &rs->rbuf[rs->rbuf_offset], rsize); + rs->rbuf_offset += rsize; + buf += rsize; + rs->rbuf_bytes_avail += rsize; + } + + } while (left && (flags & MSG_WAITALL) && (rs->state & rs_readable)); + + fastlock_release(&rs->rlock); + return ret ? ret : len - left; +} + +ssize_t rrecvfrom(int socket, void *buf, size_t len, int flags, + struct sockaddr *src_addr, socklen_t *addrlen) +{ + struct rsocket *rs; + int ret; + + rs = idm_at(&idm, socket); + if (rs->type == SOCK_DGRAM) { + fastlock_acquire(&rs->rlock); + ret = ds_recvfrom(rs, buf, len, flags, src_addr, addrlen); + fastlock_release(&rs->rlock); + return ret; + } + + ret = rrecv(socket, buf, len, flags); + if (ret > 0 && src_addr) + rgetpeername(socket, src_addr, addrlen); + + return ret; +} + +/* + * Simple, straightforward implementation for now that only tries to fill + * in the first vector. 
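A short usage sketch for the receive entry points defined above: rrecv() behaves like recv(2), so a caller that only wants whatever is already buffered can pass MSG_DONTWAIT and treat EWOULDBLOCK/EAGAIN as "nothing queued right now". The caller below is hypothetical and only illustrates the flag and return-value handling.

#include <errno.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <rdma/rsocket.h>

/* Sketch only: read whatever is currently buffered, up to len bytes. */
static ssize_t drain_rsocket(int rsock, void *buf, size_t len)
{
        ssize_t n, total = 0;

        while ((size_t) total < len) {
                n = rrecv(rsock, (char *) buf + total, len - total, MSG_DONTWAIT);
                if (n > 0) {
                        total += n;
                        continue;
                }
                if (n == 0)
                        break;          /* remote side has finished sending */
                if (errno == EWOULDBLOCK || errno == EAGAIN)
                        break;          /* nothing more queued right now */
                return n;               /* real error */
        }
        return total;
}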
+ */ +static ssize_t rrecvv(int socket, const struct iovec *iov, int iovcnt, int flags) +{ + return rrecv(socket, iov[0].iov_base, iov[0].iov_len, flags); +} + +ssize_t rrecvmsg(int socket, struct msghdr *msg, int flags) +{ + if (msg->msg_control && msg->msg_controllen) + return ERR(ENOTSUP); + + return rrecvv(socket, msg->msg_iov, (int) msg->msg_iovlen, msg->msg_flags); +} + +ssize_t rread(int socket, void *buf, size_t count) +{ + return rrecv(socket, buf, count, 0); +} + +ssize_t rreadv(int socket, const struct iovec *iov, int iovcnt) +{ + return rrecvv(socket, iov, iovcnt, 0); +} + +static int rs_send_iomaps(struct rsocket *rs, int flags) +{ + struct rs_iomap_mr *iomr; + struct ibv_sge sge; + struct rs_iomap iom; + int ret; + + fastlock_acquire(&rs->map_lock); + while (!dlist_empty(&rs->iomap_queue)) { + if (!rs_can_send(rs)) { + ret = rs_get_comp(rs, rs_nonblocking(rs, flags), + rs_conn_can_send); + if (ret) + break; + if (!(rs->state & rs_writable)) { + ret = ERR(ECONNRESET); + break; + } + } + + iomr = container_of(rs->iomap_queue.next, struct rs_iomap_mr, entry); + if (!(rs->opts & RS_OPT_SWAP_SGL)) { + iom.offset = iomr->offset; + iom.sge.addr = (uintptr_t) iomr->mr->addr; + iom.sge.length = iomr->mr->length; + iom.sge.key = iomr->mr->rkey; + } else { + iom.offset = bswap_64(iomr->offset); + iom.sge.addr = bswap_64((uintptr_t) iomr->mr->addr); + iom.sge.length = bswap_32(iomr->mr->length); + iom.sge.key = bswap_32(iomr->mr->rkey); + } + + if (rs->sq_inline >= sizeof iom) { + sge.addr = (uintptr_t) &iom; + sge.length = sizeof iom; + sge.lkey = 0; + ret = rs_write_iomap(rs, iomr, &sge, 1, IBV_SEND_INLINE); + } else if (rs_sbuf_left(rs) >= sizeof iom) { + memcpy((void *) (uintptr_t) rs->ssgl[0].addr, &iom, sizeof iom); + rs->ssgl[0].length = sizeof iom; + ret = rs_write_iomap(rs, iomr, rs->ssgl, 1, 0); + if (rs_sbuf_left(rs) > sizeof iom) + rs->ssgl[0].addr += sizeof iom; + else + rs->ssgl[0].addr = (uintptr_t) rs->sbuf; + } else { + rs->ssgl[0].length = rs_sbuf_left(rs); + memcpy((void *) (uintptr_t) rs->ssgl[0].addr, &iom, + rs->ssgl[0].length); + rs->ssgl[1].length = sizeof iom - rs->ssgl[0].length; + memcpy(rs->sbuf, ((void *) &iom) + rs->ssgl[0].length, + rs->ssgl[1].length); + ret = rs_write_iomap(rs, iomr, rs->ssgl, 2, 0); + rs->ssgl[0].addr = (uintptr_t) rs->sbuf + rs->ssgl[1].length; + } + dlist_remove(&iomr->entry); + dlist_insert_tail(&iomr->entry, &rs->iomap_list); + if (ret) + break; + } + + rs->iomap_pending = !dlist_empty(&rs->iomap_queue); + fastlock_release(&rs->map_lock); + return ret; +} + +static ssize_t ds_sendv_udp(struct rsocket *rs, const struct iovec *iov, + int iovcnt, int flags, uint8_t op) +{ + struct ds_udp_header hdr; + struct msghdr msg; + struct iovec miov[8]; + ssize_t ret; + + if (iovcnt > 8) + return ERR(ENOTSUP); + + hdr.tag = htonl(DS_UDP_TAG); + hdr.version = rs->conn_dest->qp->hdr.version; + hdr.op = op; + hdr.reserved = 0; + hdr.qpn = htonl(rs->conn_dest->qp->cm_id->qp->qp_num & 0xFFFFFF); + if (rs->conn_dest->qp->hdr.version == 4) { + hdr.length = DS_UDP_IPV4_HDR_LEN; + hdr.addr.ipv4 = rs->conn_dest->qp->hdr.addr.ipv4; + } else { + hdr.length = DS_UDP_IPV6_HDR_LEN; + memcpy(hdr.addr.ipv6, &rs->conn_dest->qp->hdr.addr.ipv6, 16); + } + + miov[0].iov_base = &hdr; + miov[0].iov_len = hdr.length; + if (iov && iovcnt) + memcpy(&miov[1], iov, sizeof *iov * iovcnt); + + memset(&msg, 0, sizeof msg); + msg.msg_name = &rs->conn_dest->addr; + msg.msg_namelen = rdma_addrlen(&rs->conn_dest->addr.sa); + msg.msg_iov = miov; + msg.msg_iovlen = iovcnt + 1; + ret 
= sendmsg(rs->udp_sock, &msg, flags); + return ret > 0 ? ret - hdr.length : ret; +} + +static ssize_t ds_send_udp(struct rsocket *rs, const void *buf, size_t len, + int flags, uint8_t op) +{ + struct iovec iov; + if (buf && len) { + iov.iov_base = (void *) buf; + iov.iov_len = len; + return ds_sendv_udp(rs, &iov, 1, flags, op); + } else { + return ds_sendv_udp(rs, NULL, 0, flags, op); + } +} + +static ssize_t dsend(struct rsocket *rs, const void *buf, size_t len, int flags) +{ + struct ds_smsg *msg; + struct ibv_sge sge; + uint64_t offset; + int ret = 0; + + if (!rs->conn_dest->ah) + return ds_send_udp(rs, buf, len, flags, RS_OP_DATA); + + if (!ds_can_send(rs)) { + ret = ds_get_comp(rs, rs_nonblocking(rs, flags), ds_can_send); + if (ret) + return ret; + } + + msg = rs->smsg_free; + rs->smsg_free = msg->next; + rs->sqe_avail--; + + memcpy((void *) msg, &rs->conn_dest->qp->hdr, rs->conn_dest->qp->hdr.length); + memcpy((void *) msg + rs->conn_dest->qp->hdr.length, buf, len); + sge.addr = (uintptr_t) msg; + sge.length = rs->conn_dest->qp->hdr.length + len; + sge.lkey = rs->conn_dest->qp->smr->lkey; + offset = (uint8_t *) msg - rs->sbuf; + + ret = ds_post_send(rs, &sge, offset); + return ret ? ret : len; +} + +/* + * We overlap sending the data, by posting a small work request immediately, + * then increasing the size of the send on each iteration. + */ +ssize_t rsend(int socket, const void *buf, size_t len, int flags) +{ + struct rsocket *rs; + struct ibv_sge sge; + size_t left = len; + uint32_t xfer_size, olen = RS_OLAP_START_SIZE; + int ret = 0; + + rs = idm_at(&idm, socket); + if (rs->type == SOCK_DGRAM) { + fastlock_acquire(&rs->slock); + ret = dsend(rs, buf, len, flags); + fastlock_release(&rs->slock); + return ret; + } + + if (rs->state & rs_opening) { + ret = rs_do_connect(rs); + if (ret) { + if (errno == EINPROGRESS) + errno = EAGAIN; + return ret; + } + } + + fastlock_acquire(&rs->slock); + if (rs->iomap_pending) { + ret = rs_send_iomaps(rs, flags); + if (ret) + goto out; + } + for (; left; left -= xfer_size, buf += xfer_size) { + if (!rs_can_send(rs)) { + ret = rs_get_comp(rs, rs_nonblocking(rs, flags), + rs_conn_can_send); + if (ret) + break; + if (!(rs->state & rs_writable)) { + ret = ERR(ECONNRESET); + break; + } + } + + if (olen < left) { + xfer_size = olen; + if (olen < RS_MAX_TRANSFER) + olen <<= 1; + } else { + xfer_size = left; + } + + if (xfer_size > rs->sbuf_bytes_avail) + xfer_size = rs->sbuf_bytes_avail; + if (xfer_size > rs->target_sgl[rs->target_sge].length) + xfer_size = rs->target_sgl[rs->target_sge].length; + + if (xfer_size <= rs->sq_inline) { + sge.addr = (uintptr_t) buf; + sge.length = xfer_size; + sge.lkey = 0; + ret = rs_write_data(rs, &sge, 1, xfer_size, IBV_SEND_INLINE); + } else if (xfer_size <= rs_sbuf_left(rs)) { + memcpy((void *) (uintptr_t) rs->ssgl[0].addr, buf, xfer_size); + rs->ssgl[0].length = xfer_size; + ret = rs_write_data(rs, rs->ssgl, 1, xfer_size, 0); + if (xfer_size < rs_sbuf_left(rs)) + rs->ssgl[0].addr += xfer_size; + else + rs->ssgl[0].addr = (uintptr_t) rs->sbuf; + } else { + rs->ssgl[0].length = rs_sbuf_left(rs); + memcpy((void *) (uintptr_t) rs->ssgl[0].addr, buf, + rs->ssgl[0].length); + rs->ssgl[1].length = xfer_size - rs->ssgl[0].length; + memcpy(rs->sbuf, buf + rs->ssgl[0].length, rs->ssgl[1].length); + ret = rs_write_data(rs, rs->ssgl, 2, xfer_size, 0); + rs->ssgl[0].addr = (uintptr_t) rs->sbuf + rs->ssgl[1].length; + } + if (ret) + break; + } +out: + fastlock_release(&rs->slock); + + return (ret && left == len) ? 
ret : len - left; +} + +ssize_t rsendto(int socket, const void *buf, size_t len, int flags, + const struct sockaddr *dest_addr, socklen_t addrlen) +{ + struct rsocket *rs; + int ret; + + rs = idm_at(&idm, socket); + if (rs->type == SOCK_STREAM) { + if (dest_addr || addrlen) + return ERR(EISCONN); + + return rsend(socket, buf, len, flags); + } + + if (rs->state == rs_init) { + ret = ds_init_ep(rs); + if (ret) + return ret; + } + + fastlock_acquire(&rs->slock); + if (!rs->conn_dest || ds_compare_addr(dest_addr, &rs->conn_dest->addr)) { + ret = ds_get_dest(rs, dest_addr, addrlen, &rs->conn_dest); + if (ret) + goto out; + } + + ret = dsend(rs, buf, len, flags); +out: + fastlock_release(&rs->slock); + return ret; +} + +static void rs_copy_iov(void *dst, const struct iovec **iov, size_t *offset, size_t len) +{ + size_t size; + + while (len) { + size = (*iov)->iov_len - *offset; + if (size > len) { + memcpy (dst, (*iov)->iov_base + *offset, len); + *offset += len; + break; + } + + memcpy(dst, (*iov)->iov_base + *offset, size); + len -= size; + dst += size; + (*iov)++; + *offset = 0; + } +} + +static ssize_t rsendv(int socket, const struct iovec *iov, int iovcnt, int flags) +{ + struct rsocket *rs; + const struct iovec *cur_iov; + size_t left, len, offset = 0; + uint32_t xfer_size, olen = RS_OLAP_START_SIZE; + int i, ret = 0; + + rs = idm_at(&idm, socket); + if (rs->state & rs_opening) { + ret = rs_do_connect(rs); + if (ret) { + if (errno == EINPROGRESS) + errno = EAGAIN; + return ret; + } + } + + cur_iov = iov; + len = iov[0].iov_len; + for (i = 1; i < iovcnt; i++) + len += iov[i].iov_len; + left = len; + + fastlock_acquire(&rs->slock); + if (rs->iomap_pending) { + ret = rs_send_iomaps(rs, flags); + if (ret) + goto out; + } + for (; left; left -= xfer_size) { + if (!rs_can_send(rs)) { + ret = rs_get_comp(rs, rs_nonblocking(rs, flags), + rs_conn_can_send); + if (ret) + break; + if (!(rs->state & rs_writable)) { + ret = ERR(ECONNRESET); + break; + } + } + + if (olen < left) { + xfer_size = olen; + if (olen < RS_MAX_TRANSFER) + olen <<= 1; + } else { + xfer_size = left; + } + + if (xfer_size > rs->sbuf_bytes_avail) + xfer_size = rs->sbuf_bytes_avail; + if (xfer_size > rs->target_sgl[rs->target_sge].length) + xfer_size = rs->target_sgl[rs->target_sge].length; + + if (xfer_size <= rs_sbuf_left(rs)) { + rs_copy_iov((void *) (uintptr_t) rs->ssgl[0].addr, + &cur_iov, &offset, xfer_size); + rs->ssgl[0].length = xfer_size; + ret = rs_write_data(rs, rs->ssgl, 1, xfer_size, + xfer_size <= rs->sq_inline ? IBV_SEND_INLINE : 0); + if (xfer_size < rs_sbuf_left(rs)) + rs->ssgl[0].addr += xfer_size; + else + rs->ssgl[0].addr = (uintptr_t) rs->sbuf; + } else { + rs->ssgl[0].length = rs_sbuf_left(rs); + rs_copy_iov((void *) (uintptr_t) rs->ssgl[0].addr, &cur_iov, + &offset, rs->ssgl[0].length); + rs->ssgl[1].length = xfer_size - rs->ssgl[0].length; + rs_copy_iov(rs->sbuf, &cur_iov, &offset, rs->ssgl[1].length); + ret = rs_write_data(rs, rs->ssgl, 2, xfer_size, + xfer_size <= rs->sq_inline ? IBV_SEND_INLINE : 0); + rs->ssgl[0].addr = (uintptr_t) rs->sbuf + rs->ssgl[1].length; + } + if (ret) + break; + } +out: + fastlock_release(&rs->slock); + + return (ret && left == len) ? 
ret : len - left; +} + +ssize_t rsendmsg(int socket, const struct msghdr *msg, int flags) +{ + if (msg->msg_control && msg->msg_controllen) + return ERR(ENOTSUP); + + return rsendv(socket, msg->msg_iov, (int) msg->msg_iovlen, flags); +} + +ssize_t rwrite(int socket, const void *buf, size_t count) +{ + return rsend(socket, buf, count, 0); +} + +ssize_t rwritev(int socket, const struct iovec *iov, int iovcnt) +{ + return rsendv(socket, iov, iovcnt, 0); +} + +static struct pollfd *rs_fds_alloc(nfds_t nfds) +{ + static __thread struct pollfd *rfds; + static __thread nfds_t rnfds; + + if (nfds > rnfds) { + if (rfds) + free(rfds); + + rfds = malloc(sizeof *rfds * nfds); + rnfds = rfds ? nfds : 0; + } + + return rfds; +} + +static int rs_poll_rs(struct rsocket *rs, int events, + int nonblock, int (*test)(struct rsocket *rs)) +{ + struct pollfd fds; + short revents; + int ret; + +check_cq: + if ((rs->type == SOCK_STREAM) && ((rs->state & rs_connected) || + (rs->state == rs_disconnected) || (rs->state & rs_error))) { + rs_process_cq(rs, nonblock, test); + + revents = 0; + if ((events & POLLIN) && rs_conn_have_rdata(rs)) + revents |= POLLIN; + if ((events & POLLOUT) && rs_can_send(rs)) + revents |= POLLOUT; + if (!(rs->state & rs_connected)) { + if (rs->state == rs_disconnected) + revents |= POLLHUP; + else + revents |= POLLERR; + } + + return revents; + } else if (rs->type == SOCK_DGRAM) { + ds_process_cqs(rs, nonblock, test); + + revents = 0; + if ((events & POLLIN) && rs_have_rdata(rs)) + revents |= POLLIN; + if ((events & POLLOUT) && ds_can_send(rs)) + revents |= POLLOUT; + + return revents; + } + + if (rs->state == rs_listening) { + fds.fd = rs->cm_id->channel->fd; + fds.events = events; + fds.revents = 0; + poll(&fds, 1, 0); + return fds.revents; + } + + if (rs->state & rs_opening) { + ret = rs_do_connect(rs); + if (ret) { + if (errno == EINPROGRESS) { + errno = 0; + return 0; + } else { + return POLLOUT; + } + } + goto check_cq; + } + + if (rs->state == rs_connect_error) + return (rs->err && events & POLLOUT) ? 
POLLOUT : 0; + + return 0; +} + +static int rs_poll_check(struct pollfd *fds, nfds_t nfds) +{ + struct rsocket *rs; + int i, cnt = 0; + + for (i = 0; i < nfds; i++) { + rs = idm_lookup(&idm, fds[i].fd); + if (rs) + fds[i].revents = rs_poll_rs(rs, fds[i].events, 1, rs_poll_all); + else + poll(&fds[i], 1, 0); + + if (fds[i].revents) + cnt++; + } + return cnt; +} + +static int rs_poll_arm(struct pollfd *rfds, struct pollfd *fds, nfds_t nfds) +{ + struct rsocket *rs; + int i; + + for (i = 0; i < nfds; i++) { + rs = idm_lookup(&idm, fds[i].fd); + if (rs) { + fds[i].revents = rs_poll_rs(rs, fds[i].events, 0, rs_is_cq_armed); + if (fds[i].revents) + return 1; + + if (rs->type == SOCK_STREAM) { + if (rs->state >= rs_connected) + rfds[i].fd = rs->cm_id->recv_cq_channel->fd; + else + rfds[i].fd = rs->cm_id->channel->fd; + } else { + rfds[i].fd = rs->epfd; + } + rfds[i].events = POLLIN; + } else { + rfds[i].fd = fds[i].fd; + rfds[i].events = fds[i].events; + } + rfds[i].revents = 0; + } + return 0; +} + +static int rs_poll_events(struct pollfd *rfds, struct pollfd *fds, nfds_t nfds) +{ + struct rsocket *rs; + int i, cnt = 0; + + for (i = 0; i < nfds; i++) { + if (!rfds[i].revents) + continue; + + rs = idm_lookup(&idm, fds[i].fd); + if (rs) { + fastlock_acquire(&rs->cq_wait_lock); + if (rs->type == SOCK_STREAM) + rs_get_cq_event(rs); + else + ds_get_cq_event(rs); + fastlock_release(&rs->cq_wait_lock); + fds[i].revents = rs_poll_rs(rs, fds[i].events, 1, rs_poll_all); + } else { + fds[i].revents = rfds[i].revents; + } + if (fds[i].revents) + cnt++; + } + return cnt; +} + +/* + * We need to poll *all* fd's that the user specifies at least once. + * Note that we may receive events on an rsocket that may not be reported + * to the user (e.g. connection events or credit updates). Process those + * events, then return to polling until we find ones of interest. 
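Since rpoll() below mirrors poll(2), taking the same struct pollfd array and timeout and accepting a mix of rsockets and regular fds, a typical caller looks exactly like a poll-based accept loop. A hedged usage sketch, with a hypothetical wrapper name:

#include <poll.h>
#include <rdma/rsocket.h>

/* Sketch only: wait up to timeout_ms for a connection request on a
 * listening rsocket, then accept it. */
static int wait_and_accept(int listen_rsock, int timeout_ms)
{
        struct pollfd fds;

        fds.fd = listen_rsock;
        fds.events = POLLIN;
        fds.revents = 0;

        if (rpoll(&fds, 1, timeout_ms) <= 0)
                return -1;              /* timeout or error */

        return raccept(listen_rsock, NULL, NULL);
}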
+ */ +int rpoll(struct pollfd *fds, nfds_t nfds, int timeout) +{ + struct timeval s, e; + struct pollfd *rfds; + uint32_t poll_time = 0; + int ret; + + do { + ret = rs_poll_check(fds, nfds); + if (ret || !timeout) + return ret; + + if (!poll_time) + gettimeofday(&s, NULL); + + gettimeofday(&e, NULL); + poll_time = (e.tv_sec - s.tv_sec) * 1000000 + + (e.tv_usec - s.tv_usec) + 1; + } while (poll_time <= polling_time); + + rfds = rs_fds_alloc(nfds); + if (!rfds) + return ERR(ENOMEM); + + do { + ret = rs_poll_arm(rfds, fds, nfds); + if (ret) + break; + + ret = poll(rfds, nfds, timeout); + if (ret <= 0) + break; + + ret = rs_poll_events(rfds, fds, nfds); + } while (!ret); + + return ret; +} + +static struct pollfd * +rs_select_to_poll(int *nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds) +{ + struct pollfd *fds; + int fd, i = 0; + + fds = calloc(*nfds, sizeof *fds); + if (!fds) + return NULL; + + for (fd = 0; fd < *nfds; fd++) { + if (readfds && FD_ISSET(fd, readfds)) { + fds[i].fd = fd; + fds[i].events = POLLIN; + } + + if (writefds && FD_ISSET(fd, writefds)) { + fds[i].fd = fd; + fds[i].events |= POLLOUT; + } + + if (exceptfds && FD_ISSET(fd, exceptfds)) + fds[i].fd = fd; + + if (fds[i].fd) + i++; + } + + *nfds = i; + return fds; +} + +static int +rs_poll_to_select(int nfds, struct pollfd *fds, fd_set *readfds, + fd_set *writefds, fd_set *exceptfds) +{ + int i, cnt = 0; + + for (i = 0; i < nfds; i++) { + if (readfds && (fds[i].revents & (POLLIN | POLLHUP))) { + FD_SET(fds[i].fd, readfds); + cnt++; + } + + if (writefds && (fds[i].revents & POLLOUT)) { + FD_SET(fds[i].fd, writefds); + cnt++; + } + + if (exceptfds && (fds[i].revents & ~(POLLIN | POLLOUT))) { + FD_SET(fds[i].fd, exceptfds); + cnt++; + } + } + return cnt; +} + +static int rs_convert_timeout(struct timeval *timeout) +{ + return !timeout ? -1 : + timeout->tv_sec * 1000 + timeout->tv_usec / 1000; +} + +int rselect(int nfds, fd_set *readfds, fd_set *writefds, + fd_set *exceptfds, struct timeval *timeout) +{ + struct pollfd *fds; + int ret; + + fds = rs_select_to_poll(&nfds, readfds, writefds, exceptfds); + if (!fds) + return ERR(ENOMEM); + + ret = rpoll(fds, nfds, rs_convert_timeout(timeout)); + + if (readfds) + FD_ZERO(readfds); + if (writefds) + FD_ZERO(writefds); + if (exceptfds) + FD_ZERO(exceptfds); + + if (ret > 0) + ret = rs_poll_to_select(nfds, fds, readfds, writefds, exceptfds); + + free(fds); + return ret; +} + +/* + * For graceful disconnect, notify the remote side that we're + * disconnecting and wait until all outstanding sends complete, provided + * that the remote side has not sent a disconnect message. + */ +int rshutdown(int socket, int how) +{ + struct rsocket *rs; + int ctrl, ret = 0; + + rs = idm_at(&idm, socket); + if (rs->fd_flags & O_NONBLOCK) + rs_set_nonblocking(rs, 0); + + if (rs->state & rs_connected) { + if (how == SHUT_RDWR) { + ctrl = RS_CTRL_DISCONNECT; + rs->state &= ~(rs_readable | rs_writable); + } else if (how == SHUT_WR) { + rs->state &= ~rs_writable; + ctrl = (rs->state & rs_readable) ? 
+ RS_CTRL_SHUTDOWN : RS_CTRL_DISCONNECT; + } else { + rs->state &= ~rs_readable; + if (rs->state & rs_writable) + goto out; + ctrl = RS_CTRL_DISCONNECT; + } + if (!rs->ctrl_avail) { + ret = rs_process_cq(rs, 0, rs_conn_can_send_ctrl); + if (ret) + goto out; + } + + if ((rs->state & rs_connected) && rs->ctrl_avail) { + rs->ctrl_avail--; + ret = rs_post_msg(rs, rs_msg_set(RS_OP_CTRL, ctrl)); + } + } + + if (rs->state & rs_connected) + rs_process_cq(rs, 0, rs_conn_all_sends_done); + +out: + if ((rs->fd_flags & O_NONBLOCK) && (rs->state & rs_connected)) + rs_set_nonblocking(rs, rs->fd_flags); + + if (rs->state & rs_disconnected) { + /* Generate event by flushing receives to unblock rpoll */ + ibv_req_notify_cq(rs->cm_id->recv_cq, 0); + rdma_disconnect(rs->cm_id); + } + + return ret; +} + +static void ds_shutdown(struct rsocket *rs) +{ + if (rs->svcs) + rs_modify_svcs(rs, 0); + + if (rs->fd_flags & O_NONBLOCK) + rs_set_nonblocking(rs, 0); + + rs->state &= ~(rs_readable | rs_writable); + ds_process_cqs(rs, 0, ds_all_sends_done); + + if (rs->fd_flags & O_NONBLOCK) + rs_set_nonblocking(rs, rs->fd_flags); +} + +int rclose(int socket) +{ + struct rsocket *rs; + + rs = idm_at(&idm, socket); + if (rs->type == SOCK_STREAM) { + if (rs->state & rs_connected) + rshutdown(socket, SHUT_RDWR); + } else { + ds_shutdown(rs); + } + + rs_free(rs); + return 0; +} + +static void rs_copy_addr(struct sockaddr *dst, struct sockaddr *src, socklen_t *len) +{ + socklen_t size; + + if (src->sa_family == AF_INET) { + size = min(*len, sizeof(struct sockaddr_in)); + *len = sizeof(struct sockaddr_in); + } else { + size = min(*len, sizeof(struct sockaddr_in6)); + *len = sizeof(struct sockaddr_in6); + } + memcpy(dst, src, size); +} + +int rgetpeername(int socket, struct sockaddr *addr, socklen_t *addrlen) +{ + struct rsocket *rs; + + rs = idm_at(&idm, socket); + if (rs->type == SOCK_STREAM) { + rs_copy_addr(addr, rdma_get_peer_addr(rs->cm_id), addrlen); + return 0; + } else { + return getpeername(rs->udp_sock, addr, addrlen); + } +} + +int rgetsockname(int socket, struct sockaddr *addr, socklen_t *addrlen) +{ + struct rsocket *rs; + + rs = idm_at(&idm, socket); + if (rs->type == SOCK_STREAM) { + rs_copy_addr(addr, rdma_get_local_addr(rs->cm_id), addrlen); + return 0; + } else { + return getsockname(rs->udp_sock, addr, addrlen); + } +} + +int rsetsockopt(int socket, int level, int optname, + const void *optval, socklen_t optlen) +{ + struct rsocket *rs; + int ret, opt_on = 0; + uint64_t *opts = NULL; + + ret = ERR(ENOTSUP); + rs = idm_at(&idm, socket); + if (rs->type == SOCK_DGRAM && level != SOL_RDMA) { + ret = setsockopt(rs->udp_sock, level, optname, optval, optlen); + if (ret) + return ret; + } + + switch (level) { + case SOL_SOCKET: + opts = &rs->so_opts; + switch (optname) { + case SO_REUSEADDR: + if (rs->type == SOCK_STREAM) { + ret = rdma_set_option(rs->cm_id, RDMA_OPTION_ID, + RDMA_OPTION_ID_REUSEADDR, + (void *) optval, optlen); + if (ret && ((errno == ENOSYS) || ((rs->state != rs_init) && + rs->cm_id->context && + (rs->cm_id->verbs->device->transport_type == IBV_TRANSPORT_IB)))) + ret = 0; + } + opt_on = *(int *) optval; + break; + case SO_RCVBUF: + if ((rs->type == SOCK_STREAM && !rs->rbuf) || + (rs->type == SOCK_DGRAM && !rs->qp_list)) + rs->rbuf_size = (*(uint32_t *) optval) << 1; + ret = 0; + break; + case SO_SNDBUF: + if (!rs->sbuf) + rs->sbuf_size = (*(uint32_t *) optval) << 1; + if (rs->sbuf_size < RS_SNDLOWAT) + rs->sbuf_size = RS_SNDLOWAT << 1; + ret = 0; + break; + case SO_LINGER: + /* Invert value so 
default so_opt = 0 is on */ + opt_on = !((struct linger *) optval)->l_onoff; + ret = 0; + break; + case SO_KEEPALIVE: + opt_on = *(int *) optval; + ret = 0; + break; + case SO_OOBINLINE: + opt_on = *(int *) optval; + ret = 0; + break; + default: + break; + } + break; + case IPPROTO_TCP: + opts = &rs->tcp_opts; + switch (optname) { + case TCP_NODELAY: + opt_on = *(int *) optval; + ret = 0; + break; + case TCP_MAXSEG: + ret = 0; + break; + default: + break; + } + break; + case IPPROTO_IPV6: + opts = &rs->ipv6_opts; + switch (optname) { + case IPV6_V6ONLY: + if (rs->type == SOCK_STREAM) { + ret = rdma_set_option(rs->cm_id, RDMA_OPTION_ID, + RDMA_OPTION_ID_AFONLY, + (void *) optval, optlen); + } + opt_on = *(int *) optval; + break; + default: + break; + } + break; + case SOL_RDMA: + if (rs->state >= rs_opening) { + ret = ERR(EINVAL); + break; + } + + switch (optname) { + case RDMA_SQSIZE: + rs->sq_size = min((*(uint32_t *) optval), RS_QP_MAX_SIZE); + ret = 0; + break; + case RDMA_RQSIZE: + rs->rq_size = min((*(uint32_t *) optval), RS_QP_MAX_SIZE); + ret = 0; + break; + case RDMA_INLINE: + rs->sq_inline = min(*(uint32_t *) optval, RS_QP_MAX_SIZE); + if (rs->sq_inline < RS_MIN_INLINE) + rs->sq_inline = RS_MIN_INLINE; + ret = 0; + break; + case RDMA_IOMAPSIZE: + rs->target_iomap_size = (uint16_t) rs_scale_to_value( + (uint8_t) rs_value_to_scale(*(int *) optval, 8), 8); + ret = 0; + break; + case RDMA_ROUTE: + if ((rs->optval = calloc(optlen, 1))) { + memcpy(rs->optval, optval, optlen); + rs->optlen = optlen; + ret = 0; + } else { + ret = ERR(ENOMEM); + } + break; + default: + break; + } + break; + default: + break; + } + + if (!ret && opts) { + if (opt_on) + *opts |= (1 << optname); + else + *opts &= ~(1 << optname); + } + + return ret; +} + +int rgetsockopt(int socket, int level, int optname, + void *optval, socklen_t *optlen) +{ + struct rsocket *rs; + int ret = 0; + + rs = idm_at(&idm, socket); + switch (level) { + case SOL_SOCKET: + switch (optname) { + case SO_REUSEADDR: + case SO_KEEPALIVE: + case SO_OOBINLINE: + *((int *) optval) = !!(rs->so_opts & (1 << optname)); + *optlen = sizeof(int); + break; + case SO_RCVBUF: + *((int *) optval) = rs->rbuf_size; + *optlen = sizeof(int); + break; + case SO_SNDBUF: + *((int *) optval) = rs->sbuf_size; + *optlen = sizeof(int); + break; + case SO_LINGER: + /* Value is inverted so default so_opt = 0 is on */ + ((struct linger *) optval)->l_onoff = + !(rs->so_opts & (1 << optname)); + ((struct linger *) optval)->l_linger = 0; + *optlen = sizeof(struct linger); + break; + case SO_ERROR: + *((int *) optval) = rs->err; + *optlen = sizeof(int); + rs->err = 0; + break; + default: + ret = ENOTSUP; + break; + } + break; + case IPPROTO_TCP: + switch (optname) { + case TCP_NODELAY: + *((int *) optval) = !!(rs->tcp_opts & (1 << optname)); + *optlen = sizeof(int); + break; + case TCP_MAXSEG: + *((int *) optval) = (rs->cm_id && rs->cm_id->route.num_paths) ? 
+ 1 << (7 + rs->cm_id->route.path_rec->mtu) : + 2048; + *optlen = sizeof(int); + break; + default: + ret = ENOTSUP; + break; + } + break; + case IPPROTO_IPV6: + switch (optname) { + case IPV6_V6ONLY: + *((int *) optval) = !!(rs->ipv6_opts & (1 << optname)); + *optlen = sizeof(int); + break; + default: + ret = ENOTSUP; + break; + } + break; + case SOL_RDMA: + switch (optname) { + case RDMA_SQSIZE: + *((int *) optval) = rs->sq_size; + *optlen = sizeof(int); + break; + case RDMA_RQSIZE: + *((int *) optval) = rs->rq_size; + *optlen = sizeof(int); + break; + case RDMA_INLINE: + *((int *) optval) = rs->sq_inline; + *optlen = sizeof(int); + break; + case RDMA_IOMAPSIZE: + *((int *) optval) = rs->target_iomap_size; + *optlen = sizeof(int); + break; + default: + ret = ENOTSUP; + break; + } + break; + default: + ret = ENOTSUP; + break; + } + + return rdma_seterrno(ret); +} + +int rfcntl(int socket, int cmd, ... /* arg */ ) +{ + struct rsocket *rs; + va_list args; + long param; + int ret = 0; + + rs = idm_at(&idm, socket); + va_start(args, cmd); + switch (cmd) { + case F_GETFL: + ret = (int) rs->fd_flags; + break; + case F_SETFL: + param = va_arg(args, long); + if (param & O_NONBLOCK) + ret = rs_set_nonblocking(rs, O_NONBLOCK); + + if (!ret) + rs->fd_flags |= param; + break; + default: + ret = ERR(ENOTSUP); + break; + } + va_end(args); + return ret; +} + +static struct rs_iomap_mr *rs_get_iomap_mr(struct rsocket *rs) +{ + int i; + + if (!rs->remote_iomappings) { + rs->remote_iomappings = calloc(rs->remote_iomap.length, + sizeof(*rs->remote_iomappings)); + if (!rs->remote_iomappings) + return NULL; + + for (i = 0; i < rs->remote_iomap.length; i++) + rs->remote_iomappings[i].index = i; + } + + for (i = 0; i < rs->remote_iomap.length; i++) { + if (!rs->remote_iomappings[i].mr) + return &rs->remote_iomappings[i]; + } + return NULL; +} + +/* + * If an offset is given, we map to it. If offset is -1, then we map the + * offset to the address of buf. We do not check for conflicts, which must + * be fixed at some point. 
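riomap() below and riowrite() further down form a pair: the target maps a buffer for remote writes (passing offset -1 means "use the buffer address as the offset"), shares that offset with its peer out of band, and the peer then places data directly into the mapped region, bypassing the target's receive path. A usage sketch with hypothetical helper names:

#include <sys/types.h>
#include <sys/mman.h>
#include <rdma/rsocket.h>

/* Sketch only, target side: expose buf for direct remote writes and
 * return the offset the peer must use (here, the buffer address). */
static off_t expose_buffer(int rsock, void *buf, size_t len)
{
        return riomap(rsock, buf, len, PROT_WRITE, 0, -1);
}

/* Sketch only, initiator side: write len bytes straight into the peer's
 * mapped buffer at offset. */
static size_t push_buffer(int rsock, const void *buf, size_t len, off_t offset)
{
        return riowrite(rsock, buf, len, offset, 0);
}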
+ */ +off_t riomap(int socket, void *buf, size_t len, int prot, int flags, off_t offset) +{ + struct rsocket *rs; + struct rs_iomap_mr *iomr; + int access = IBV_ACCESS_LOCAL_WRITE; + + rs = idm_at(&idm, socket); + if (!rs->cm_id->pd || (prot & ~(PROT_WRITE | PROT_NONE))) + return ERR(EINVAL); + + fastlock_acquire(&rs->map_lock); + if (prot & PROT_WRITE) { + iomr = rs_get_iomap_mr(rs); + access |= IBV_ACCESS_REMOTE_WRITE; + } else { + iomr = calloc(1, sizeof *iomr); + iomr->index = -1; + } + if (!iomr) { + offset = ERR(ENOMEM); + goto out; + } + + iomr->mr = ibv_reg_mr(rs->cm_id->pd, buf, len, access); + if (!iomr->mr) { + if (iomr->index < 0) + free(iomr); + offset = -1; + goto out; + } + + if (offset == -1) + offset = (uintptr_t) buf; + iomr->offset = offset; + atomic_init(&iomr->refcnt); + atomic_set(&iomr->refcnt, 1); + + if (iomr->index >= 0) { + dlist_insert_tail(&iomr->entry, &rs->iomap_queue); + rs->iomap_pending = 1; + } else { + dlist_insert_tail(&iomr->entry, &rs->iomap_list); + } +out: + fastlock_release(&rs->map_lock); + return offset; +} + +int riounmap(int socket, void *buf, size_t len) +{ + struct rsocket *rs; + struct rs_iomap_mr *iomr; + dlist_entry *entry; + int ret = 0; + + rs = idm_at(&idm, socket); + fastlock_acquire(&rs->map_lock); + + for (entry = rs->iomap_list.next; entry != &rs->iomap_list; + entry = entry->next) { + iomr = container_of(entry, struct rs_iomap_mr, entry); + if (iomr->mr->addr == buf && iomr->mr->length == len) { + rs_release_iomap_mr(iomr); + goto out; + } + } + + for (entry = rs->iomap_queue.next; entry != &rs->iomap_queue; + entry = entry->next) { + iomr = container_of(entry, struct rs_iomap_mr, entry); + if (iomr->mr->addr == buf && iomr->mr->length == len) { + rs_release_iomap_mr(iomr); + goto out; + } + } + ret = ERR(EINVAL); +out: + fastlock_release(&rs->map_lock); + return ret; +} + +static struct rs_iomap *rs_find_iomap(struct rsocket *rs, off_t offset) +{ + int i; + + for (i = 0; i < rs->target_iomap_size; i++) { + if (offset >= rs->target_iomap[i].offset && + offset < rs->target_iomap[i].offset + rs->target_iomap[i].sge.length) + return &rs->target_iomap[i]; + } + return NULL; +} + +size_t riowrite(int socket, const void *buf, size_t count, off_t offset, int flags) +{ + struct rsocket *rs; + struct rs_iomap *iom = NULL; + struct ibv_sge sge; + size_t left = count; + uint32_t xfer_size, olen = RS_OLAP_START_SIZE; + int ret = 0; + + rs = idm_at(&idm, socket); + fastlock_acquire(&rs->slock); + if (rs->iomap_pending) { + ret = rs_send_iomaps(rs, flags); + if (ret) + goto out; + } + for (; left; left -= xfer_size, buf += xfer_size, offset += xfer_size) { + if (!iom || offset > iom->offset + iom->sge.length) { + iom = rs_find_iomap(rs, offset); + if (!iom) + break; + } + + if (!rs_can_send(rs)) { + ret = rs_get_comp(rs, rs_nonblocking(rs, flags), + rs_conn_can_send); + if (ret) + break; + if (!(rs->state & rs_writable)) { + ret = ERR(ECONNRESET); + break; + } + } + + if (olen < left) { + xfer_size = olen; + if (olen < RS_MAX_TRANSFER) + olen <<= 1; + } else { + xfer_size = left; + } + + if (xfer_size > rs->sbuf_bytes_avail) + xfer_size = rs->sbuf_bytes_avail; + if (xfer_size > iom->offset + iom->sge.length - offset) + xfer_size = iom->offset + iom->sge.length - offset; + + if (xfer_size <= rs->sq_inline) { + sge.addr = (uintptr_t) buf; + sge.length = xfer_size; + sge.lkey = 0; + ret = rs_write_direct(rs, iom, offset, &sge, 1, + xfer_size, IBV_SEND_INLINE); + } else if (xfer_size <= rs_sbuf_left(rs)) { + memcpy((void *) (uintptr_t) 
rs->ssgl[0].addr, buf, xfer_size); + rs->ssgl[0].length = xfer_size; + ret = rs_write_direct(rs, iom, offset, rs->ssgl, 1, xfer_size, 0); + if (xfer_size < rs_sbuf_left(rs)) + rs->ssgl[0].addr += xfer_size; + else + rs->ssgl[0].addr = (uintptr_t) rs->sbuf; + } else { + rs->ssgl[0].length = rs_sbuf_left(rs); + memcpy((void *) (uintptr_t) rs->ssgl[0].addr, buf, + rs->ssgl[0].length); + rs->ssgl[1].length = xfer_size - rs->ssgl[0].length; + memcpy(rs->sbuf, buf + rs->ssgl[0].length, rs->ssgl[1].length); + ret = rs_write_direct(rs, iom, offset, rs->ssgl, 2, xfer_size, 0); + rs->ssgl[0].addr = (uintptr_t) rs->sbuf + rs->ssgl[1].length; + } + if (ret) + break; + } +out: + fastlock_release(&rs->slock); + + return (ret && left == count) ? ret : count - left; +} + +static int rs_svc_grow_sets(void) +{ + struct rsocket **rss; + struct pollfd *fds; + void *set; + + set = calloc(svc_size + 2, sizeof(*rss) + sizeof(*fds)); + if (!set) + return ENOMEM; + + svc_size += 2; + rss = set; + fds = set + sizeof(*rss) * svc_size; + if (svc_cnt) { + memcpy(rss, svc_rss, sizeof(*rss) * svc_cnt); + memcpy(fds, svc_fds, sizeof(*fds) * svc_cnt); + } + + free(svc_rss); + free(svc_fds); + svc_rss = rss; + svc_fds = fds; + return 0; +} + +/* + * Index 0 is reserved for the service's communication socket. + */ +static int rs_svc_add_rs(struct rsocket *rs) +{ + int ret; + + if (svc_cnt >= svc_size - 1) { + ret = rs_svc_grow_sets(); + if (ret) + return ret; + } + + svc_rss[++svc_cnt] = rs; + svc_fds[svc_cnt].fd = rs->udp_sock; + svc_fds[svc_cnt].events = POLLIN; + svc_fds[svc_cnt].revents = 0; + return 0; +} + +static int rs_svc_rm_rs(struct rsocket *rs) +{ + int i; + + for (i = 1; i <= svc_cnt; i++) { + if (svc_rss[i] == rs) { + svc_cnt--; + svc_rss[i] = svc_rss[svc_cnt]; + svc_fds[i] = svc_fds[svc_cnt]; + return 0; + } + } + return EBADF; +} + +static void rs_svc_process_sock(void) +{ + struct rs_svc_msg msg; + + read(svc_sock[1], &msg, sizeof msg); + if (msg.svcs & RS_SVC_DGRAM) { + msg.status = rs_svc_add_rs(msg.rs); + } else if (!msg.svcs) { + msg.status = rs_svc_rm_rs(msg.rs); + } + + if (!msg.status) + msg.rs->svcs = msg.svcs; + write(svc_sock[1], &msg, sizeof msg); +} + +static uint8_t rs_svc_sgid_index(struct ds_dest *dest, union ibv_gid *sgid) +{ + union ibv_gid gid; + int i; + + for (i = 0; i < 16; i++) { + ibv_query_gid(dest->qp->cm_id->verbs, dest->qp->cm_id->port_num, + i, &gid); + if (!memcmp(sgid, &gid, sizeof gid)) + return i; + } + return 0; +} + +static uint8_t rs_svc_path_bits(struct ds_dest *dest) +{ + struct ibv_port_attr attr; + + if (!ibv_query_port(dest->qp->cm_id->verbs, dest->qp->cm_id->port_num, &attr)) + return (uint8_t) ((1 << attr.lmc) - 1); + return 0x7f; +} + +static void rs_svc_create_ah(struct rsocket *rs, struct ds_dest *dest, uint32_t qpn) +{ + union socket_addr saddr; + struct rdma_cm_id *id; + struct ibv_ah_attr attr; + int ret; + + if (dest->ah) { + fastlock_acquire(&rs->slock); + ibv_destroy_ah(dest->ah); + dest->ah = NULL; + fastlock_release(&rs->slock); + } + + ret = rdma_create_id(NULL, &id, NULL, dest->qp->cm_id->ps); + if (ret) + return; + + memcpy(&saddr, rdma_get_local_addr(dest->qp->cm_id), + rdma_addrlen(rdma_get_local_addr(dest->qp->cm_id))); + if (saddr.sa.sa_family == AF_INET) + saddr.sin.sin_port = 0; + else + saddr.sin6.sin6_port = 0; + ret = rdma_resolve_addr(id, &saddr.sa, &dest->addr.sa, 2000); + if (ret) + goto out; + + ret = rdma_resolve_route(id, 2000); + if (ret) + goto out; + + memset(&attr, 0, sizeof attr); + if (id->route.path_rec->hop_limit > 1) { + 
attr.is_global = 1; + attr.grh.dgid = id->route.path_rec->dgid; + attr.grh.flow_label = ntohl(id->route.path_rec->flow_label); + attr.grh.sgid_index = rs_svc_sgid_index(dest, &id->route.path_rec->sgid); + attr.grh.hop_limit = id->route.path_rec->hop_limit; + attr.grh.traffic_class = id->route.path_rec->traffic_class; + } + attr.dlid = ntohs(id->route.path_rec->dlid); + attr.sl = id->route.path_rec->sl; + attr.src_path_bits = id->route.path_rec->slid & rs_svc_path_bits(dest); + attr.static_rate = id->route.path_rec->rate; + attr.port_num = id->port_num; + + fastlock_acquire(&rs->slock); + dest->qpn = qpn; + dest->ah = ibv_create_ah(dest->qp->cm_id->pd, &attr); + fastlock_release(&rs->slock); +out: + rdma_destroy_id(id); +} + +static int rs_svc_valid_udp_hdr(struct ds_udp_header *udp_hdr, + union socket_addr *addr) +{ + return (udp_hdr->tag == ntohl(DS_UDP_TAG)) && + ((udp_hdr->version == 4 && addr->sa.sa_family == AF_INET && + udp_hdr->length == DS_UDP_IPV4_HDR_LEN) || + (udp_hdr->version == 6 && addr->sa.sa_family == AF_INET6 && + udp_hdr->length == DS_UDP_IPV6_HDR_LEN)); +} + +static void rs_svc_forward(struct rsocket *rs, void *buf, size_t len, + union socket_addr *src) +{ + struct ds_header hdr; + struct ds_smsg *msg; + struct ibv_sge sge; + uint64_t offset; + + if (!ds_can_send(rs)) { + if (ds_get_comp(rs, 0, ds_can_send)) + return; + } + + msg = rs->smsg_free; + rs->smsg_free = msg->next; + rs->sqe_avail--; + + ds_format_hdr(&hdr, src); + memcpy((void *) msg, &hdr, hdr.length); + memcpy((void *) msg + hdr.length, buf, len); + sge.addr = (uintptr_t) msg; + sge.length = hdr.length + len; + sge.lkey = rs->conn_dest->qp->smr->lkey; + offset = (uint8_t *) msg - rs->sbuf; + + ds_post_send(rs, &sge, offset); +} + +static void rs_svc_process_rs(struct rsocket *rs) +{ + struct ds_dest *dest, *cur_dest; + struct ds_udp_header *udp_hdr; + union socket_addr addr; + socklen_t addrlen = sizeof addr; + int len, ret; + + ret = recvfrom(rs->udp_sock, svc_buf, sizeof svc_buf, 0, &addr.sa, &addrlen); + if (ret < DS_UDP_IPV4_HDR_LEN) + return; + + udp_hdr = (struct ds_udp_header *) svc_buf; + if (!rs_svc_valid_udp_hdr(udp_hdr, &addr)) + return; + + len = ret - udp_hdr->length; + udp_hdr->tag = ntohl(udp_hdr->tag); + udp_hdr->qpn = ntohl(udp_hdr->qpn) & 0xFFFFFF; + ret = ds_get_dest(rs, &addr.sa, addrlen, &dest); + if (ret) + return; + + if (udp_hdr->op == RS_OP_DATA) { + fastlock_acquire(&rs->slock); + cur_dest = rs->conn_dest; + rs->conn_dest = dest; + ds_send_udp(rs, NULL, 0, 0, RS_OP_CTRL); + rs->conn_dest = cur_dest; + fastlock_release(&rs->slock); + } + + if (!dest->ah || (dest->qpn != udp_hdr->qpn)) + rs_svc_create_ah(rs, dest, udp_hdr->qpn); + + /* to do: handle when dest local ip address doesn't match udp ip */ + if (udp_hdr->op == RS_OP_DATA) { + fastlock_acquire(&rs->slock); + cur_dest = rs->conn_dest; + rs->conn_dest = &dest->qp->dest; + rs_svc_forward(rs, svc_buf + udp_hdr->length, len, &addr); + rs->conn_dest = cur_dest; + fastlock_release(&rs->slock); + } +} + +static void *rs_svc_run(void *arg) +{ + struct rs_svc_msg msg; + int i, ret; + + ret = rs_svc_grow_sets(); + if (ret) { + msg.status = ret; + write(svc_sock[1], &msg, sizeof msg); + return (void *) (uintptr_t) ret; + } + + svc_fds[0].fd = svc_sock[1]; + svc_fds[0].events = POLLIN; + do { + for (i = 0; i <= svc_cnt; i++) + svc_fds[i].revents = 0; + + poll(svc_fds, svc_cnt + 1, -1); + if (svc_fds[0].revents) + rs_svc_process_sock(); + + for (i = 1; i <= svc_cnt; i++) { + if (svc_fds[i].revents) + rs_svc_process_rs(svc_rss[i]); + } + } 
while (svc_cnt >= 1); + + return NULL; +} diff --git a/src/fabric.c b/src/fabric.c new file mode 100644 index 00000000000..dcb0e299392 --- /dev/null +++ b/src/fabric.c @@ -0,0 +1,306 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2006 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2013 Intel Corp., Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#if HAVE_CONFIG_H +# include <config.h> +#endif /* HAVE_CONFIG_H */ + +#include <errno.h> +#include <fcntl.h> +#include <pthread.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/types.h> +#include <unistd.h> + +#include <rdma/fabric.h> +#include <rdma/fi_arch.h> +#include <rdma/fi_atomic.h> +#include <rdma/fi_cm.h> +#include <rdma/fi_domain.h> +#include <rdma/fi_prov.h> +#include <rdma/fi_rdma.h> +#include <rdma/fi_socket.h> +#include <rdma/fi_tagged.h> +#include <rdma/fi_ucma.h> +#include <rdma/fi_umad.h> +#include <rdma/fi_uverbs.h> +#include <rdma/fi_errno.h> +#include "fi.h" + +static pthread_mutex_t mut = PTHREAD_MUTEX_INITIALIZER; +static int init; +static struct fi_prov *prov_head, *prov_tail; + + +const char *fi_sysfs_path(void) +{ + static char *sysfs_path; + char *env = NULL; + + if (sysfs_path) + return sysfs_path; + + /* + * Only follow path passed in through the calling user's + * environment if we're not running SUID. 
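The SUID guard described above is a small but easy-to-miss security pattern: only honor a path override from the environment when the real and effective IDs match, so a set-uid binary cannot be redirected to an attacker-controlled sysfs tree. A generic sketch of the same check follows; the helper name is hypothetical, and glibc's secure_getenv(3) packages a similar test where it is available.

#include <stdlib.h>
#include <unistd.h>

/* Sketch only: return an environment override only when it can be trusted. */
static const char *trusted_getenv(const char *name)
{
        if (getuid() != geteuid() || getgid() != getegid())
                return NULL;
        return getenv(name);
}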
+ */ + if (getuid() == geteuid()) + env = getenv("SYSFS_PATH"); + + if (env) { + int len; + + sysfs_path = strndup(env, FI_PATH_MAX); + len = strlen(sysfs_path); + while (len > 0 && sysfs_path[len - 1] == '/') { + --len; + sysfs_path[len] = '\0'; + } + } else { + sysfs_path = "/sys"; + } + + return sysfs_path; +} + +int fi_read_file(const char *dir, const char *file, char *buf, size_t size) +{ + char *path; + int fd, len; + + if (asprintf(&path, "%s/%s", dir, file) < 0) + return -1; + + fd = open(path, O_RDONLY); + if (fd < 0) { + free(path); + return -1; + } + + len = read(fd, buf, size); + close(fd); + free(path); + + if (len > 0 && buf[len - 1] == '\n') + buf[--len] = '\0'; + + return len; +} + +void fi_register(struct fi_ops_prov *ops) +{ + struct fi_prov *prov; + + prov = calloc(sizeof *prov, 1); + if (!prov) + return; + + prov->ops = ops; + if (prov_tail) + prov_tail->next = prov; + else + prov_head = prov; + prov_tail = prov; +} + +int ucma_init(void); +int fi_init() +{ + int ret = 0; + + pthread_mutex_lock(&mut); + if (init) + goto out; + + ret = uv_init(); + if (ret) + goto out; + + ret = ucma_init(); + if (ret) + goto out; + + init = 1; +out: + pthread_mutex_unlock(&mut); + return ret; +} + +static void __attribute__((constructor)) fi_ini(void) +{ + uv_ini(); + ibv_ini(); + ucma_ini(); + rdma_cm_ini(); + psmx_ini(); + mlx4_ini(); +} + +static void __attribute__((destructor)) fi_fini(void) +{ + mlx4_fini(); + psmx_fini(); + rdma_cm_fini(); + ucma_fini(); + ibv_fini(); + uv_fini(); +} + +int fi_getinfo(char *node, char *service, struct fi_info *hints, + struct fi_info **info) +{ + struct fi_prov *prov; + struct fi_info *tail, *cur; + int ret = -ENOSYS; + + if (!init) + fi_init(); + + *info = tail = NULL; + for (prov = prov_head; prov; prov = prov->next) { + if (!prov->ops->getinfo) + continue; + + ret = prov->ops->getinfo(node, service, hints, &cur); + if (ret) + continue; + + if (!*info) + *info = cur; + else + tail->next = cur; + for (tail = cur; tail->next; tail = tail->next) + ; + } + + return *info ? 
0 : ret; +} + +void __fi_freeinfo(struct fi_info *info) +{ + if (info->src_addr) + free(info->src_addr); + if (info->dst_addr) + free(info->dst_addr); +// if (info->src_canonname) +// free(info->src_canonname); +// if (info->dst_canonname) +// free(info->dst_canonname); + if (info->domain_name) + free(info->domain_name); + if (info->data) + free(info->data); + + free(info); +} + +void fi_freeinfo(struct fi_info *info) +{ + struct fi_prov *prov; + struct fi_info *next; + int ret; + + while (info) { + next = info->next; + for (prov = prov_head; prov && info; prov = prov->next) { + if (!prov->ops->freeinfo) + continue; + + ret = prov->ops->freeinfo(info); + if (!ret) + goto cont; + } + __fi_freeinfo(info); +cont: + info = next; + } +} + +int fi_open(char *name, struct fi_info *info, uint64_t flags, fid_t *fid, void *context) +{ + struct fi_prov *prov; + int ret = -ENOSYS; + + if (!init) + fi_init(); + + for (prov = prov_head; prov; prov = prov->next) { + if (!prov->ops->open) + continue; + + ret = prov->ops->open(name, info, flags, fid, context); + if (!ret) + break; + } + + return ret; +} + +int fi_socket(struct fi_info *info, fid_t *fid, void *context) +{ + struct fi_prov *prov; + int ret = -ENOSYS; + + if (!init) + fi_init(); + + for (prov = prov_head; prov; prov = prov->next) { + if (!prov->ops->socket) + continue; + + ret = prov->ops->socket(info, fid, context); + if (!ret) + break; + } + + return ret; +} + +#define FI_ERRNO_OFFSET 256 + +static const char *const errstr[] = { + [FI_EOTHER - FI_ERRNO_OFFSET] = "Unspecified error", + [FI_ETOOSMALL - FI_ERRNO_OFFSET] = "Provided buffer is too small" + +}; + +const char *fi_strerror(int errnum) +{ + if (errnum < FI_ERRNO_OFFSET) + return strerror(errnum); + else + return errstr[errnum - FI_ERRNO_OFFSET]; +} diff --git a/src/libfabric.map b/src/libfabric.map new file mode 100644 index 00000000000..75c1a048cd2 --- /dev/null +++ b/src/libfabric.map @@ -0,0 +1,38 @@ +FABRIC_1.0 { + global: + fi_getinfo; + fi_freeinfo; + fi_open; + fi_socket; + fi_strerror; + rsocket; + rbind; + rlisten; + raccept; + rconnect; + rshutdown; + rclose; + rrecv; + rrecvfrom; + rrecvmsg; + rsend; + rsendto; + rsendmsg; + rread; + rreadv; + rwrite; + rwritev; + rpoll; + rselect; + rgetpeername; + rgetsockname; + rsetsockopt; + rgetsockopt; + rfcntl; + rpoll; + rselect; + riomap; + riounmap; + riowrite; + local: *; +}; diff --git a/src/ucma.c b/src/ucma.c new file mode 100644 index 00000000000..1fc368bd419 --- /dev/null +++ b/src/ucma.c @@ -0,0 +1,497 @@ +/* + * Copyright (c) 2005-2012 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#if HAVE_CONFIG_H +# include <config.h> +#endif /* HAVE_CONFIG_H */ + +#include <stdlib.h> +#include <string.h> +#include <glob.h> +#include <stdio.h> +#include <fcntl.h> +#include <errno.h> +#include <stdint.h> +#include <poll.h> +#include <unistd.h> +#include <pthread.h> +#include <endian.h> +#include <byteswap.h> +#include <stddef.h> +#include <netdb.h> +#include <syslog.h> + +#include <rdma/fabric.h> +#include <rdma/fi_prov.h> +#include <rdma/fi_ucma.h> +#include "fi.h" + + +static int ucma_abi_ver = RDMA_USER_CM_MAX_ABI_VERSION; + +#define UCMA_INIT_CMD(req, req_size, op) \ +do { \ + (req)->cmd = UCMA_CMD_##op; \ + (req)->in = (req_size) - sizeof(struct ucma_abi_cmd_hdr); \ + (req)->out = 0; \ +} while (0) + +#define UCMA_INIT_CMD_RESP(req, req_size, op, resp, resp_size) \ +do { \ + (req)->cmd = UCMA_CMD_##op; \ + (req)->in = (req_size) - sizeof(struct ucma_abi_cmd_hdr); \ + (req)->out = (resp_size); \ + (req)->response = (uintptr_t) (resp); \ +} while (0) + +static int ucma_open(const char *name, struct fi_info *info, uint64_t flags, + fid_t *fid, void *context); + +static struct fi_ops_prov ucma_prov_ops = { + .size = sizeof(struct fi_ops_prov), + .getinfo = NULL, + .freeinfo = NULL, + .socket = NULL, + .open = ucma_open +}; + + +static int ucma_abi_version(void) +{ + char value[8]; + + if ((fi_read_file(fi_sysfs_path(), "class/misc/rdma_cm/abi_version", + value, sizeof value) < 0) && + (fi_read_file(fi_sysfs_path(), "class/infiniband_ucma/abi_version", + value, sizeof value) < 0)) { + return -ENOSYS; + } + + ucma_abi_ver = strtol(value, NULL, 10); + if (ucma_abi_ver < RDMA_USER_CM_MIN_ABI_VERSION || + ucma_abi_ver > RDMA_USER_CM_MAX_ABI_VERSION) { + fprintf(stderr, PFX "ucma kernel ABI version %d not supported (%d).\n", + ucma_abi_ver, RDMA_USER_CM_MAX_ABI_VERSION); + return -ENOSYS; + } + + return 0; +} + +int ucma_init(void) +{ + return ucma_abi_version(); +} + +void ucma_ini(void) +{ + fi_register(&ucma_prov_ops); +} + +void ucma_fini(void) +{ +} + +static int __ucma_create_id(fid_t fid, + struct ucma_abi_create_id *cmd, size_t cmd_size, + struct ucma_abi_create_id_resp *resp, size_t resp_size) +{ + struct fid_ucma *ucma; + + ucma = container_of(fid, struct fid_ucma, fid); + UCMA_INIT_CMD_RESP(cmd, cmd_size, CREATE_ID, resp, resp_size); + if (write(ucma->fd, cmd, cmd_size) != cmd_size) + return -errno; + + VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp); + return 0; +} + +static int __ucma_destroy_id(fid_t fid, + struct ucma_abi_destroy_id *cmd, size_t cmd_size, + struct ucma_abi_destroy_id_resp *resp, size_t resp_size) +{ + struct fid_ucma *ucma; + + ucma = container_of(fid, struct fid_ucma, fid); + UCMA_INIT_CMD_RESP(cmd, cmd_size, DESTROY_ID, resp, resp_size); + if (write(ucma->fd, cmd, cmd_size) != cmd_size) + return -errno; + + VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp); + return 0; +} + +static int __ucma_bind_ip(fid_t fid, + struct ucma_abi_bind_ip *cmd, size_t cmd_size) +{ + struct fid_ucma *ucma; + + ucma = container_of(fid, struct fid_ucma, fid); + UCMA_INIT_CMD(cmd, 
cmd_size, BIND_IP); + if (write(ucma->fd, cmd, cmd_size) != cmd_size) + return -errno; + + return 0; +} + +static int __ucma_bind(fid_t fid, + struct ucma_abi_bind *cmd, size_t cmd_size) +{ + struct fid_ucma *ucma; + + ucma = container_of(fid, struct fid_ucma, fid); + UCMA_INIT_CMD(cmd, cmd_size, BIND); + if (write(ucma->fd, cmd, cmd_size) != cmd_size) + return -errno; + + return 0; +} +static int __ucma_resolve_ip(fid_t fid, + struct ucma_abi_resolve_ip *cmd, size_t cmd_size) +{ + struct fid_ucma *ucma; + + ucma = container_of(fid, struct fid_ucma, fid); + UCMA_INIT_CMD(cmd, cmd_size, RESOLVE_IP); + if (write(ucma->fd, cmd, cmd_size) != cmd_size) + return -errno; + + return 0; +} + +static int __ucma_resolve_addr(fid_t fid, + struct ucma_abi_resolve_addr *cmd, size_t cmd_size) +{ + struct fid_ucma *ucma; + + ucma = container_of(fid, struct fid_ucma, fid); + UCMA_INIT_CMD(cmd, cmd_size, RESOLVE_ADDR); + if (write(ucma->fd, cmd, cmd_size) != cmd_size) + return -errno; + + return 0; +} + +static int __ucma_resolve_route(fid_t fid, + struct ucma_abi_resolve_route *cmd, size_t cmd_size) +{ + struct fid_ucma *ucma; + + ucma = container_of(fid, struct fid_ucma, fid); + UCMA_INIT_CMD(cmd, cmd_size, RESOLVE_ROUTE); + if (write(ucma->fd, cmd, cmd_size) != cmd_size) + return -errno; + + return 0; +} + +static int __ucma_query_route(fid_t fid, + struct ucma_abi_query *cmd, size_t cmd_size, + struct ucma_abi_query_route_resp *resp, size_t resp_size) +{ + struct fid_ucma *ucma; + + ucma = container_of(fid, struct fid_ucma, fid); + UCMA_INIT_CMD_RESP(cmd, cmd_size, QUERY_ROUTE, resp, resp_size); + if (write(ucma->fd, cmd, cmd_size) != cmd_size) + return -errno; + + VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp); + return 0; +} + +static int __ucma_query(fid_t fid, + struct ucma_abi_query *cmd, size_t cmd_size, + void *resp, size_t resp_size) +{ + struct fid_ucma *ucma; + + ucma = container_of(fid, struct fid_ucma, fid); + UCMA_INIT_CMD_RESP(cmd, cmd_size, QUERY, resp, resp_size); + if (write(ucma->fd, cmd, cmd_size) != cmd_size) + return -errno; + + VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp); + return 0; +} + +static int __ucma_connect(fid_t fid, + struct ucma_abi_connect *cmd, size_t cmd_size) +{ + struct fid_ucma *ucma; + + ucma = container_of(fid, struct fid_ucma, fid); + UCMA_INIT_CMD(cmd, cmd_size, CONNECT); + if (write(ucma->fd, cmd, cmd_size) != cmd_size) + return -errno; + + return 0; +} + +static int __ucma_listen(fid_t fid, + struct ucma_abi_listen *cmd, size_t cmd_size) +{ + struct fid_ucma *ucma; + + ucma = container_of(fid, struct fid_ucma, fid); + UCMA_INIT_CMD(cmd, cmd_size, LISTEN); + if (write(ucma->fd, cmd, cmd_size) != cmd_size) + return -errno; + + return 0; +} + +static int __ucma_accept(fid_t fid, + struct ucma_abi_accept *cmd, size_t cmd_size) +{ + struct fid_ucma *ucma; + + ucma = container_of(fid, struct fid_ucma, fid); + UCMA_INIT_CMD(cmd, cmd_size, ACCEPT); + if (write(ucma->fd, cmd, cmd_size) != cmd_size) + return -errno; + + return 0; +} + +static int __ucma_reject(fid_t fid, + struct ucma_abi_reject *cmd, size_t cmd_size) +{ + struct fid_ucma *ucma; + + ucma = container_of(fid, struct fid_ucma, fid); + UCMA_INIT_CMD(cmd, cmd_size, REJECT); + if (write(ucma->fd, cmd, cmd_size) != cmd_size) + return -errno; + + return 0; +} + +static int __ucma_disconnect(fid_t fid, + struct ucma_abi_disconnect *cmd, size_t cmd_size) +{ + struct fid_ucma *ucma; + + ucma = container_of(fid, struct fid_ucma, fid); + UCMA_INIT_CMD(cmd, cmd_size, DISCONNECT); + if (write(ucma->fd, cmd, 
cmd_size) != cmd_size) + return -errno; + + return 0; +} + +static int __ucma_init_qp_attr(fid_t fid, + struct ucma_abi_init_qp_attr *cmd, size_t cmd_size, + struct ibv_kern_qp_attr *resp, size_t resp_size) +{ + struct fid_ucma *ucma; + + ucma = container_of(fid, struct fid_ucma, fid); + UCMA_INIT_CMD_RESP(cmd, cmd_size, INIT_QP_ATTR, resp, resp_size); + if (write(ucma->fd, cmd, cmd_size) != cmd_size) + return -errno; + + VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp); + return 0; +} + +static int __ucma_get_event(fid_t fid, + struct ucma_abi_get_event *cmd, size_t cmd_size, + struct ucma_abi_event_resp *resp, size_t resp_size) +{ + struct fid_ucma *ucma; + + ucma = container_of(fid, struct fid_ucma, fid); + UCMA_INIT_CMD_RESP(cmd, cmd_size, GET_EVENT, resp, resp_size); + if (write(ucma->fd, cmd, cmd_size) != cmd_size) + return -errno; + + VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp); + return 0; +} + +static int __ucma_set_option(fid_t fid, + struct ucma_abi_set_option *cmd, size_t cmd_size) +{ + struct fid_ucma *ucma; + + ucma = container_of(fid, struct fid_ucma, fid); + UCMA_INIT_CMD(cmd, cmd_size, SET_OPTION); + if (write(ucma->fd, cmd, cmd_size) != cmd_size) + return -errno; + + return 0; +} + +static int __ucma_notify(fid_t fid, + struct ucma_abi_notify *cmd, size_t cmd_size) +{ + struct fid_ucma *ucma; + + ucma = container_of(fid, struct fid_ucma, fid); + UCMA_INIT_CMD(cmd, cmd_size, NOTIFY); + if (write(ucma->fd, cmd, cmd_size) != cmd_size) + return -errno; + + return 0; +} + +static int __ucma_join_ip_mcast(fid_t fid, + struct ucma_abi_join_ip_mcast *cmd, size_t cmd_size, + struct ucma_abi_create_id_resp *resp, size_t resp_size) +{ + struct fid_ucma *ucma; + + ucma = container_of(fid, struct fid_ucma, fid); + UCMA_INIT_CMD_RESP(cmd, cmd_size, JOIN_IP_MCAST, resp, resp_size); + if (write(ucma->fd, cmd, cmd_size) != cmd_size) + return -errno; + + VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp); + return 0; +} + +static int __ucma_join_mcast(fid_t fid, + struct ucma_abi_join_mcast *cmd, size_t cmd_size, + struct ucma_abi_create_id_resp *resp, size_t resp_size) +{ + struct fid_ucma *ucma; + + ucma = container_of(fid, struct fid_ucma, fid); + UCMA_INIT_CMD_RESP(cmd, cmd_size, JOIN_MCAST, resp, resp_size); + if (write(ucma->fd, cmd, cmd_size) != cmd_size) + return -errno; + + VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp); + return 0; +} + +static int __ucma_leave_mcast(fid_t fid, + struct ucma_abi_destroy_id *cmd, size_t cmd_size, + struct ucma_abi_destroy_id_resp *resp, size_t resp_size) +{ + struct fid_ucma *ucma; + + ucma = container_of(fid, struct fid_ucma, fid); + UCMA_INIT_CMD_RESP(cmd, cmd_size, LEAVE_MCAST, resp, resp_size); + if (write(ucma->fd, cmd, cmd_size) != cmd_size) + return -errno; + + VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp); + return 0; +} + +static int __ucma_migrate_id(fid_t fid, + struct ucma_abi_migrate_id *cmd, size_t cmd_size, + struct ucma_abi_migrate_resp *resp, size_t resp_size) +{ + struct fid_ucma *ucma; + + ucma = container_of(fid, struct fid_ucma, fid); + UCMA_INIT_CMD_RESP(cmd, cmd_size, MIGRATE_ID, resp, resp_size); + if (write(ucma->fd, cmd, cmd_size) != cmd_size) + return -errno; + + VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp); + return 0; +} + + +static struct fi_ops_ucma ops_ucma = { + .size = sizeof(struct fi_ops_ucma), + .create_id = __ucma_create_id, + .destroy_id = __ucma_destroy_id, + .bind_ip = __ucma_bind_ip, + .bind = __ucma_bind, + .resolve_ip = __ucma_resolve_ip, + .resolve_addr = __ucma_resolve_addr, + .resolve_route = 
__ucma_resolve_route, + .query_route = __ucma_query_route, + .query = __ucma_query, + .connect = __ucma_connect, + .listen = __ucma_listen, + .accept = __ucma_accept, + .reject = __ucma_reject, + .disconnect = __ucma_disconnect, + .init_qp_attr = __ucma_init_qp_attr, + .get_event = __ucma_get_event, + .set_option = __ucma_set_option, + .notify = __ucma_notify, + .join_ip_mcast = __ucma_join_ip_mcast, + .join_mcast = __ucma_join_mcast, + .leave_mcast = __ucma_leave_mcast, + .migrate_id = __ucma_migrate_id +}; + +static int ucma_close(fid_t fid) +{ + struct fid_ucma *ucma; + + ucma = container_of(fid, struct fid_ucma, fid); + close(ucma->fd); + free(ucma); + return 0; +} + +static struct fi_ops ops_fi = { + .size = sizeof(struct fi_ops), + .close = ucma_close +}; + +static int ucma_open(const char *name, struct fi_info *info, uint64_t flags, + fid_t *fid, void *context) +{ + struct fid_ucma *ucma; + + if (!name || strcmp(FI_UCMA_INTERFACE, name)) + return -ENOSYS; + + ucma = calloc(1, sizeof(*ucma)); + if (!ucma) + return -ENOMEM; + + ucma->fd = open("/dev/infiniband/rdma_cm", O_RDWR | O_CLOEXEC); + if (ucma->fd < 0) { + free(ucma); + return -errno; + } + + ucma->fid.fclass = FID_CLASS_INTERFACE; + ucma->fid.size = sizeof(*ucma); + ucma->fid.ops = &ops_fi; + ucma->fid.context = context; + ucma->ops = &ops_ucma; + + *fid = &ucma->fid; + return 0; +} diff --git a/src/uverbs.c b/src/uverbs.c new file mode 100644 index 00000000000..e3381d335b6 --- /dev/null +++ b/src/uverbs.c @@ -0,0 +1,710 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2006 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2013 Intel Corp., Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#if HAVE_CONFIG_H +# include <config.h> +#endif /* HAVE_CONFIG_H */ + +#include <dirent.h> +#include <errno.h> +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <strings.h> +#include <unistd.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <rdma/fabric.h> +#include <rdma/fi_prov.h> +#include <rdma/fi_uverbs.h> +#include "fi.h" + + +int uv_abi_ver; +struct uv_dev *udev_head, *udev_tail; + +#define UV_INIT_CMD(cmd, size, opcode) \ + do { \ + (cmd)->command = UVERBS_CMD_##opcode; \ + (cmd)->in_words = (size) / 4; \ + (cmd)->out_words = 0; \ + } while (0) + +#define UV_INIT_CMD_RESP(cmd, size, opcode, out, outsize) \ + do { \ + (cmd)->command = UVERBS_CMD_##opcode; \ + (cmd)->in_words = (size) / 4; \ + (cmd)->out_words = (outsize) / 4; \ + (cmd)->response = (uintptr_t) (out); \ + } while (0) + +static int uv_open(const char *name, struct fi_info *info, uint64_t flags, + fid_t *fid, void *context); + +static struct fi_ops_prov uv_prov_ops = { + .size = sizeof(struct fi_ops_prov), + .getinfo = NULL, + .freeinfo = NULL, + .socket = NULL, + .open = uv_open +}; + +static int uv_abi_version(void) +{ + char value[8]; + + if (fi_read_file(fi_sysfs_path(), "class/infiniband_verbs/abi_version", + value, sizeof value) < 0) { + return -ENOSYS; + } + + uv_abi_ver = strtol(value, NULL, 10); + if (uv_abi_ver < UVERBS_MIN_ABI_VERSION || + uv_abi_ver > UVERBS_MAX_ABI_VERSION) { + fprintf(stderr, PFX "uverbs kernel ABI version %d not supported (%d).\n", + uv_abi_ver, UVERBS_MAX_ABI_VERSION); + return -ENOSYS; + } + + return 0; +} + +int uv_init(void) +{ + char class_path[FI_PATH_MAX]; + DIR *class_dir; + struct dirent *dent; + struct uv_dev *udev = NULL; + struct stat buf; + int ret; + + ret = uv_abi_version(); + if (ret) + return ret; + + snprintf(class_path, sizeof class_path, "%s/class/infiniband_verbs", + fi_sysfs_path()); + + class_dir = opendir(class_path); + if (!class_dir) + return -ENOSYS; + + while ((dent = readdir(class_dir))) { + if (dent->d_name[0] == '.') + continue; + + if (!udev) + udev = calloc(sizeof *udev, 1); + if (!udev) { + ret = -ENOMEM; + break; + } + + snprintf(udev->sysfs_path, sizeof udev->sysfs_path, + "%s/%s", class_path, dent->d_name); + + if (stat(udev->sysfs_path, &buf)) { + fprintf(stderr, PFX "Warning: couldn't stat '%s'.\n", + udev->sysfs_path); + continue; + } + + if (!S_ISDIR(buf.st_mode)) + continue; + + snprintf(udev->sysfs_name, sizeof udev->sysfs_name, "%s", dent->d_name); + + if (fi_read_file(udev->sysfs_path, "ibdev", udev->dev_name, + sizeof udev->dev_name) < 0) { + fprintf(stderr, PFX "Warning: no dev class attr for '%s'.\n", + dent->d_name); + continue; + } + + snprintf(udev->dev_path, sizeof udev->dev_path, + "%s/class/infiniband/%s", fi_sysfs_path(), udev->dev_name); + + if (udev_tail) + udev_tail->next = udev; + else + udev_head = udev; + udev_tail = udev; + udev = NULL; + } + + if (udev) + free(udev); + + closedir(class_dir); + return ret; +} + +void uv_ini(void) +{ + fi_register(&uv_prov_ops); +} + +void uv_fini(void) +{ +} + +static int __uv_get_context(fid_t fid, + struct ibv_get_context *cmd, size_t cmd_size, + struct ibv_get_context_resp *resp, size_t resp_size) +{ + struct fid_uverbs *uv; + + uv = container_of(fid, struct fid_uverbs, fid); + UV_INIT_CMD_RESP(cmd, cmd_size, GET_CONTEXT, resp, resp_size); + if (write(uv->fd, cmd, cmd_size) != cmd_size) + return -errno; + + VALGRIND_MAKE_MEM_DEFINED(resp, resp_size); + return 0; +} + 
+static int __uv_query_device(fid_t fid, + struct ibv_query_device *cmd, size_t cmd_size, + struct ibv_query_device_resp *resp, size_t resp_size) +{ + struct fid_uverbs *uv; + + uv = container_of(fid, struct fid_uverbs, fid); + UV_INIT_CMD_RESP(cmd, cmd_size, QUERY_DEVICE, resp, resp_size); + if (write(uv->fd, cmd, cmd_size) != cmd_size) + return -errno; + + VALGRIND_MAKE_MEM_DEFINED(resp, resp_size); + return 0; +} + +static int __uv_query_port(fid_t fid, + struct ibv_query_port *cmd, size_t cmd_size, + struct ibv_query_port_resp *resp, size_t resp_size) +{ + struct fid_uverbs *uv; + + uv = container_of(fid, struct fid_uverbs, fid); + UV_INIT_CMD_RESP(cmd, cmd_size, QUERY_PORT, resp, resp_size); + if (write(uv->fd, cmd, cmd_size) != cmd_size) + return -errno; + + VALGRIND_MAKE_MEM_DEFINED(resp, resp_size); + return 0; +} + +static int __uv_alloc_pd(fid_t fid, + struct ibv_alloc_pd *cmd, size_t cmd_size, + struct ibv_alloc_pd_resp *resp, size_t resp_size) +{ + struct fid_uverbs *uv; + + uv = container_of(fid, struct fid_uverbs, fid); + UV_INIT_CMD_RESP(cmd, cmd_size, ALLOC_PD, resp, resp_size); + if (write(uv->fd, cmd, cmd_size) != cmd_size) + return -errno; + + VALGRIND_MAKE_MEM_DEFINED(resp, resp_size); + return 0; +} + +static int __uv_dealloc_pd(fid_t fid, + struct ibv_dealloc_pd *cmd, size_t cmd_size) +{ + struct fid_uverbs *uv; + + uv = container_of(fid, struct fid_uverbs, fid); + UV_INIT_CMD(cmd, cmd_size, DEALLOC_PD); + if (write(uv->fd, cmd, cmd_size) != cmd_size) + return -errno; + return 0; +} + +static int __uv_open_xrcd(fid_t fid, + struct ibv_open_xrcd *cmd, size_t cmd_size, + struct ibv_open_xrcd_resp *resp, size_t resp_size) +{ + struct fid_uverbs *uv; + + uv = container_of(fid, struct fid_uverbs, fid); + UV_INIT_CMD_RESP(cmd, cmd_size, OPEN_XRCD, resp, resp_size); + if (write(uv->fd, cmd, cmd_size) != cmd_size) + return -errno; + + VALGRIND_MAKE_MEM_DEFINED(resp, resp_size); + return 0; +} + +static int __uv_close_xrcd(fid_t fid, + struct ibv_close_xrcd *cmd, size_t cmd_size) +{ + struct fid_uverbs *uv; + + uv = container_of(fid, struct fid_uverbs, fid); + UV_INIT_CMD(cmd, cmd_size, CLOSE_XRCD); + if (write(uv->fd, cmd, cmd_size) != cmd_size) + return -errno; + return 0; +} + +static int __uv_reg_mr(fid_t fid, + struct ibv_reg_mr *cmd, size_t cmd_size, + struct ibv_reg_mr_resp *resp, size_t resp_size) +{ + struct fid_uverbs *uv; + + uv = container_of(fid, struct fid_uverbs, fid); + UV_INIT_CMD_RESP(cmd, cmd_size, REG_MR, resp, resp_size); + if (write(uv->fd, cmd, cmd_size) != cmd_size) + return -errno; + + VALGRIND_MAKE_MEM_DEFINED(resp, resp_size); + return 0; +} + +static int __uv_dereg_mr(fid_t fid, + struct ibv_dereg_mr *cmd, size_t cmd_size) +{ + struct fid_uverbs *uv; + + uv = container_of(fid, struct fid_uverbs, fid); + UV_INIT_CMD(cmd, cmd_size, DEREG_MR); + if (write(uv->fd, cmd, cmd_size) != cmd_size) + return -errno; + return 0; +} + +static int __uv_create_comp_channel(fid_t fid, + struct ibv_create_comp_channel *cmd, size_t cmd_size, + struct ibv_create_comp_channel_resp *resp, size_t resp_size) +{ + struct fid_uverbs *uv; + + uv = container_of(fid, struct fid_uverbs, fid); + UV_INIT_CMD_RESP(cmd, cmd_size, CREATE_COMP_CHANNEL, resp, resp_size); + if (write(uv->fd, cmd, cmd_size) != cmd_size) + return -errno; + + VALGRIND_MAKE_MEM_DEFINED(resp, resp_size); + return 0; +} + +static int __uv_create_cq(fid_t fid, + struct ibv_create_cq *cmd, size_t cmd_size, + struct ibv_create_cq_resp *resp, size_t resp_size) +{ + struct fid_uverbs *uv; + + uv = 
container_of(fid, struct fid_uverbs, fid); + UV_INIT_CMD_RESP(cmd, cmd_size, CREATE_CQ, resp, resp_size); + if (write(uv->fd, cmd, cmd_size) != cmd_size) + return -errno; + + VALGRIND_MAKE_MEM_DEFINED(resp, resp_size); + return 0; +} + +static int __uv_poll_cq(fid_t fid, + struct ibv_poll_cq *cmd, size_t cmd_size, + struct ibv_poll_cq_resp *resp, size_t resp_size) +{ + struct fid_uverbs *uv; + + uv = container_of(fid, struct fid_uverbs, fid); + UV_INIT_CMD_RESP(cmd, cmd_size, POLL_CQ, resp, resp_size); + if (write(uv->fd, cmd, cmd_size) != cmd_size) + return -errno; + + VALGRIND_MAKE_MEM_DEFINED(resp, resp_size); + return 0; +} + +static int __uv_req_notify_cq(fid_t fid, + struct ibv_req_notify_cq *cmd, size_t cmd_size) +{ + struct fid_uverbs *uv; + + uv = container_of(fid, struct fid_uverbs, fid); + UV_INIT_CMD(cmd, cmd_size, REQ_NOTIFY_CQ); + if (write(uv->fd, cmd, cmd_size) != cmd_size) + return -errno; + return 0; +} + +static int __uv_resize_cq(fid_t fid, + struct ibv_resize_cq *cmd, size_t cmd_size, + struct ibv_resize_cq_resp *resp, size_t resp_size) +{ + struct fid_uverbs *uv; + + uv = container_of(fid, struct fid_uverbs, fid); + UV_INIT_CMD_RESP(cmd, cmd_size, RESIZE_CQ, resp, resp_size); + if (write(uv->fd, cmd, cmd_size) != cmd_size) + return -errno; + + VALGRIND_MAKE_MEM_DEFINED(resp, resp_size); + return 0; +} + +static int __uv_destroy_cq(fid_t fid, + struct ibv_destroy_cq *cmd, size_t cmd_size, + struct ibv_destroy_cq_resp *resp, size_t resp_size) +{ + struct fid_uverbs *uv; + + uv = container_of(fid, struct fid_uverbs, fid); + UV_INIT_CMD_RESP(cmd, cmd_size, DESTROY_CQ, resp, resp_size); + if (write(uv->fd, cmd, cmd_size) != cmd_size) + return -errno; + + VALGRIND_MAKE_MEM_DEFINED(resp, resp_size); + return 0; +} + +static int __uv_create_srq(fid_t fid, + struct ibv_create_srq *cmd, size_t cmd_size, + struct ibv_create_srq_resp *resp, size_t resp_size) +{ + struct fid_uverbs *uv; + + uv = container_of(fid, struct fid_uverbs, fid); + UV_INIT_CMD_RESP(cmd, cmd_size, CREATE_SRQ, resp, resp_size); + if (write(uv->fd, cmd, cmd_size) != cmd_size) + return -errno; + + VALGRIND_MAKE_MEM_DEFINED(resp, resp_size); + return 0; +} + +static int __uv_modify_srq(fid_t fid, + struct ibv_modify_srq *cmd, size_t cmd_size) +{ + struct fid_uverbs *uv; + + uv = container_of(fid, struct fid_uverbs, fid); + UV_INIT_CMD(cmd, cmd_size, MODIFY_SRQ); + if (write(uv->fd, cmd, cmd_size) != cmd_size) + return -errno; + return 0; +} + +static int __uv_query_srq(fid_t fid, + struct ibv_query_srq *cmd, size_t cmd_size, + struct ibv_query_srq_resp *resp, size_t resp_size) +{ + struct fid_uverbs *uv; + + uv = container_of(fid, struct fid_uverbs, fid); + UV_INIT_CMD_RESP(cmd, cmd_size, QUERY_SRQ, resp, resp_size); + if (write(uv->fd, cmd, cmd_size) != cmd_size) + return -errno; + + VALGRIND_MAKE_MEM_DEFINED(resp, resp_size); + return 0; +} + +static int __uv_destroy_srq(fid_t fid, + struct ibv_destroy_srq *cmd, size_t cmd_size, + struct ibv_destroy_srq_resp *resp, size_t resp_size) +{ + struct fid_uverbs *uv; + + uv = container_of(fid, struct fid_uverbs, fid); + UV_INIT_CMD_RESP(cmd, cmd_size, DESTROY_SRQ, resp, resp_size); + if (write(uv->fd, cmd, cmd_size) != cmd_size) + return -errno; + + VALGRIND_MAKE_MEM_DEFINED(resp, resp_size); + return 0; +} + +static int __uv_create_qp(fid_t fid, + struct ibv_create_qp *cmd, size_t cmd_size, + struct ibv_create_qp_resp *resp, size_t resp_size) +{ + struct fid_uverbs *uv; + + uv = container_of(fid, struct fid_uverbs, fid); + UV_INIT_CMD_RESP(cmd, cmd_size, CREATE_QP, 
resp, resp_size); + if (write(uv->fd, cmd, cmd_size) != cmd_size) + return -errno; + + VALGRIND_MAKE_MEM_DEFINED(resp, resp_size); + return 0; +} + +static int __uv_open_qp(fid_t fid, + struct ibv_open_qp *cmd, size_t cmd_size, + struct ibv_create_qp_resp *resp, size_t resp_size) +{ + struct fid_uverbs *uv; + + uv = container_of(fid, struct fid_uverbs, fid); + UV_INIT_CMD_RESP(cmd, cmd_size, OPEN_QP, resp, resp_size); + if (write(uv->fd, cmd, cmd_size) != cmd_size) + return -errno; + + VALGRIND_MAKE_MEM_DEFINED(resp, resp_size); + return 0; +} + +static int __uv_query_qp(fid_t fid, + struct ibv_query_qp *cmd, size_t cmd_size, + struct ibv_query_qp_resp *resp, size_t resp_size) +{ + struct fid_uverbs *uv; + + uv = container_of(fid, struct fid_uverbs, fid); + UV_INIT_CMD_RESP(cmd, cmd_size, QUERY_QP, resp, resp_size); + if (write(uv->fd, cmd, cmd_size) != cmd_size) + return -errno; + + VALGRIND_MAKE_MEM_DEFINED(resp, resp_size); + return 0; +} + +static int __uv_modify_qp(fid_t fid, + struct ibv_modify_qp *cmd, size_t cmd_size) +{ + struct fid_uverbs *uv; + + uv = container_of(fid, struct fid_uverbs, fid); + UV_INIT_CMD(cmd, cmd_size, MODIFY_QP); + if (write(uv->fd, cmd, cmd_size) != cmd_size) + return -errno; + return 0; +} + +static int __uv_destroy_qp(fid_t fid, + struct ibv_destroy_qp *cmd, size_t cmd_size, + struct ibv_destroy_qp_resp *resp, size_t resp_size) +{ + struct fid_uverbs *uv; + + uv = container_of(fid, struct fid_uverbs, fid); + UV_INIT_CMD_RESP(cmd, cmd_size, DESTROY_QP, resp, resp_size); + if (write(uv->fd, cmd, cmd_size) != cmd_size) + return -errno; + + VALGRIND_MAKE_MEM_DEFINED(resp, resp_size); + return 0; +} + +static int __uv_post_send(fid_t fid, + struct ibv_post_send *cmd, size_t cmd_size, + struct ibv_post_send_resp *resp, size_t resp_size) +{ + struct fid_uverbs *uv; + + uv = container_of(fid, struct fid_uverbs, fid); + UV_INIT_CMD_RESP(cmd, cmd_size, POST_SEND, resp, resp_size); + if (write(uv->fd, cmd, cmd_size) != cmd_size) + return -errno; + + VALGRIND_MAKE_MEM_DEFINED(resp, resp_size); + return 0; +} + +static int __uv_post_recv(fid_t fid, + struct ibv_post_recv *cmd, size_t cmd_size, + struct ibv_post_recv_resp *resp, size_t resp_size) +{ + struct fid_uverbs *uv; + + uv = container_of(fid, struct fid_uverbs, fid); + UV_INIT_CMD_RESP(cmd, cmd_size, POST_RECV, resp, resp_size); + if (write(uv->fd, cmd, cmd_size) != cmd_size) + return -errno; + + VALGRIND_MAKE_MEM_DEFINED(resp, resp_size); + return 0; +} + +static int __uv_post_srq_recv(fid_t fid, + struct ibv_post_srq_recv *cmd, size_t cmd_size, + struct ibv_post_srq_recv_resp *resp, size_t resp_size) +{ + struct fid_uverbs *uv; + + uv = container_of(fid, struct fid_uverbs, fid); + UV_INIT_CMD_RESP(cmd, cmd_size, POST_SRQ_RECV, resp, resp_size); + if (write(uv->fd, cmd, cmd_size) != cmd_size) + return -errno; + + VALGRIND_MAKE_MEM_DEFINED(resp, resp_size); + return 0; +} + +static int __uv_create_ah(fid_t fid, + struct ibv_create_ah *cmd, size_t cmd_size, + struct ibv_create_ah_resp *resp, size_t resp_size) +{ + struct fid_uverbs *uv; + + uv = container_of(fid, struct fid_uverbs, fid); + UV_INIT_CMD_RESP(cmd, cmd_size, CREATE_AH, resp, resp_size); + if (write(uv->fd, cmd, cmd_size) != cmd_size) + return -errno; + + VALGRIND_MAKE_MEM_DEFINED(resp, resp_size); + return 0; +} + +static int __uv_destroy_ah(fid_t fid, + struct ibv_destroy_ah *cmd, size_t cmd_size) +{ + struct fid_uverbs *uv; + + uv = container_of(fid, struct fid_uverbs, fid); + UV_INIT_CMD(cmd, cmd_size, DESTROY_AH); + if (write(uv->fd, cmd, 
cmd_size) != cmd_size) + return -errno; + return 0; +} + +static int __uv_attach_mcast(fid_t fid, + struct ibv_attach_mcast *cmd, size_t cmd_size) +{ + struct fid_uverbs *uv; + + uv = container_of(fid, struct fid_uverbs, fid); + UV_INIT_CMD(cmd, cmd_size, ATTACH_MCAST); + if (write(uv->fd, cmd, cmd_size) != cmd_size) + return -errno; + return 0; +} + +static int __uv_detach_mcast(fid_t fid, + struct ibv_detach_mcast *cmd, size_t cmd_size) +{ + struct fid_uverbs *uv; + + uv = container_of(fid, struct fid_uverbs, fid); + UV_INIT_CMD(cmd, cmd_size, DETACH_MCAST); + if (write(uv->fd, cmd, cmd_size) != cmd_size) + return -errno; + return 0; +} + +static struct fi_ops_uverbs ops_uv = { + .size = sizeof(struct fi_ops_uverbs), + .get_context = __uv_get_context, + .query_device = __uv_query_device, + .query_port = __uv_query_port, + .alloc_pd = __uv_alloc_pd, + .dealloc_pd = __uv_dealloc_pd, + .open_xrcd = __uv_open_xrcd, + .close_xrcd = __uv_close_xrcd, + .reg_mr = __uv_reg_mr, + .dereg_mr = __uv_dereg_mr, + .create_comp_channel = __uv_create_comp_channel, + .create_cq = __uv_create_cq, + .poll_cq = __uv_poll_cq, + .req_notify_cq = __uv_req_notify_cq, + .resize_cq = __uv_resize_cq, + .destroy_cq = __uv_destroy_cq, + .create_srq = __uv_create_srq, + .modify_srq = __uv_modify_srq, + .query_srq = __uv_query_srq, + .destroy_srq = __uv_destroy_srq, + .create_qp = __uv_create_qp, + .open_qp = __uv_open_qp, + .query_qp = __uv_query_qp, + .modify_qp = __uv_modify_qp, + .destroy_qp = __uv_destroy_qp, + .post_send = __uv_post_send, + .post_recv = __uv_post_recv, + .post_srq_recv = __uv_post_srq_recv, + .create_ah = __uv_create_ah, + .destroy_ah = __uv_destroy_ah, + .attach_mcast = __uv_attach_mcast, + .detach_mcast = __uv_detach_mcast +}; + +static int uv_close(fid_t fid) +{ + struct fid_uverbs *uv; + + uv = container_of(fid, struct fid_uverbs, fid); + close(uv->fd); + free(uv); + return 0; +} + +static struct fi_ops ops_fi = { + .size = sizeof(struct fi_ops), + .close = uv_close +}; + +static int uv_open(const char *name, struct fi_info *info, uint64_t flags, + fid_t *fid, void *context) +{ + struct fid_uverbs *uv; + char *dev_path; + int ret = 0; + + if (!name || strncmp(FI_UVERBS_INTERFACE "/", name, 7)) + return -ENOSYS; + + if (asprintf(&dev_path, "/dev/infiniband%s", strstr(name, "/")) < 0) + return -ENOMEM; + + uv = calloc(1, sizeof(*uv)); + if (!uv) { + ret = -ENOMEM; + goto out; + } + + uv->fd = open(dev_path, O_RDWR | O_CLOEXEC); + if (uv->fd < 0) { + ret = -errno; + free(uv); + goto out; + } + + uv->fid.fclass = FID_CLASS_INTERFACE; + uv->fid.size = sizeof(*uv); + uv->fid.ops = &ops_fi; + uv->fid.context = context; + uv->ops = &ops_uv; + + *fid = &uv->fid; +out: + free(dev_path); + return ret; +}
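
For reference, both src/ucma.c and src/uverbs.c follow the same command/response convention: a small header carries the opcode and the request/response sizes, the user address of the response buffer travels inside the request itself, and the whole command is submitted to the kernel with a single write() on the opened character device, with a short write treated as failure. The sketch below illustrates that convention in isolation; every toy_* name is hypothetical and merely stands in for the ucma_abi_*/ibv_* structures and the UCMA_INIT_CMD*/UV_INIT_CMD* macros above, not for anything defined by this patch.

#include <errno.h>
#include <stdint.h>
#include <unistd.h>

/* Fixed header at the front of every request (sizes expressed in 32-bit
 * words, matching the uverbs variant of the macros above). */
struct toy_cmd_hdr {
	uint32_t command;	/* opcode understood by the kernel ABI */
	uint16_t in_words;	/* request size */
	uint16_t out_words;	/* response size */
};

struct toy_cmd {
	struct toy_cmd_hdr hdr;
	uint64_t response;	/* user address the kernel writes back into */
	uint32_t arg;
	uint32_t reserved;
};

struct toy_resp {
	uint32_t handle;
	uint32_t status;
};

static int toy_exec(int fd, uint32_t opcode, uint32_t arg,
		    struct toy_resp *resp)
{
	struct toy_cmd cmd = {
		.hdr = {
			.command   = opcode,
			.in_words  = sizeof(cmd) / 4,
			.out_words = sizeof(*resp) / 4,
		},
		.response = (uintptr_t) resp,
		.arg = arg,
	};

	/* One blocking write() submits the command; a short write is an
	 * error, exactly as in the wrappers above. */
	if (write(fd, &cmd, sizeof(cmd)) != sizeof(cmd))
		return -errno;
	return 0;
}

Embedding the response address in the request is what lets a single blocking write() behave as a synchronous call into the kernel; the VALGRIND_MAKE_MEM_DEFINED calls in the real code then simply tell the memory checker that the kernel, rather than user code, initialized the response buffer.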
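
The other pattern repeated throughout this patch is handle embedding: each interface object places a generic fid at the start of its private structure, only the fid is returned to the caller, and every entry point (including the close paths ucma_close() and uv_close()) recovers the wrapper with container_of before touching provider state. A minimal sketch of that pattern follows; the toy_* names are hypothetical, and the local macro only stands in for the framework's own container_of helper.

#include <errno.h>
#include <stddef.h>
#include <stdlib.h>

#define toy_container_of(ptr, type, member) \
	((type *) ((char *) (ptr) - offsetof(type, member)))

struct toy_fid {
	int	fclass;		/* what kind of object this handle names */
	void	*context;	/* caller's context, echoed back to it later */
};

struct toy_obj {
	struct toy_fid	fid;	/* generic handle handed to the caller */
	int		fd;	/* provider-private state stays hidden */
};

/* Allocate the wrapper but expose only the embedded generic handle. */
static int toy_open(void *context, struct toy_fid **fid_out)
{
	struct toy_obj *obj = calloc(1, sizeof(*obj));

	if (!obj)
		return -ENOMEM;
	obj->fid.context = context;
	obj->fd = -1;
	*fid_out = &obj->fid;
	return 0;
}

/* Recover the wrapper from the handle alone, then release it. */
static int toy_close(struct toy_fid *fid)
{
	struct toy_obj *obj = toy_container_of(fid, struct toy_obj, fid);

	free(obj);
	return 0;
}

Because the fid sits at a known offset inside the wrapper, the provider never exposes its private layout to the framework, and teardown can work entirely from the handle the application holds.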