From 30ec628a0cb66f658fa00f9ffbd238b46c2d42f3 Mon Sep 17 00:00:00 2001
From: Sean Hefty <sean.hefty@intel.com>
Date: Thu, 7 Nov 2013 12:18:59 -0800
Subject: [PATCH] libfabric: Initial commit

libfabric overview

libfabric is an extensible framework for application access to fabric
services.  The framework supports multiple providers, including
providers built into the library.  The layout of the libfabric source
tree is outlined below.  Note that the library is still under
development and full functionality is missing.

include/rdma
------------
Contains header files for the framework, including the base framework
APIs in fabric.h.  Sample APIs are available for message queue
operations, RDMA operations, and tagged messages.  Proposed APIs and
objects that support communication and data transfer functionality are
found in fi_domain.h.

fabric.h - Base framework APIs
fi_domain.h - General resource management objects
fi_socket.h - Base communication object

src
---
Contains the base implementation for the framework and the kernel
supported APIs.

src/fabric - Base framework implementation
src/ucma - Interface to kernel rdma cm ABI
src/uverbs - Interface to kernel verbs ABI

examples
--------
Includes simple examples that demonstrate how an application can use
the framework and the various API sets.

examples/perf - Simple latency/bandwidth test.
examples/provinfo - Lists available provider information.

prov
----
Providers built into the libfabric library are under the prov
subdirectory.

prov/ibverbs - This is a *sample* provider that sits over libibverbs.
It is NOT meant as a real provider because of the overhead that results
from converting libfabric calls directly into libibverbs calls.  It is
intended to show how a hardware vendor can implement an optimized
version of their provider library for libfabric.

prov/mlx4 - This is a sample provider that works in conjunction with
the ibverbs provider.  It is mostly unchanged from the existing libmlx4
verbs provider.

prov/psm - This is a sample provider that sits over the Intel PSM
library.

prov/rdmacm - Incorporates the librdmacm functionality into libfabric.
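As a quick orientation, the sketch below shows the minimal query path an
application takes to discover available providers through the framework.
It is distilled from examples/provinfo.c in this patch (the zeroed hints
structure, the fi_getinfo() call, and the "domain" output line come from
that example); it is illustrative only, since the interfaces are still
under development and may change.

	#include <stdio.h>
	#include <string.h>

	#include <rdma/fabric.h>

	int main(void)
	{
		struct fi_info hints, *fi, *cur;
		int ret;

		/* Zeroed hints ask for information on all providers. */
		memset(&hints, 0, sizeof hints);

		ret = fi_getinfo(NULL, NULL, &hints, &fi);
		if (ret) {
			printf("fi_getinfo %s\n", strerror(-ret));
			return ret;
		}

		/* Each returned fi_info entry describes one provider domain. */
		for (cur = fi; cur; cur = cur->next)
			printf("domain: %s\n", cur->domain_name);

		fi_freeinfo(fi);
		return 0;
	}

The program is linked against the libfabric library built by this patch;
examples/perf.c extends the same flow by opening a domain and socket for
the selected provider and running data transfers over it.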
Signed-off-by: Sean Hefty <sean.hefty@intel.com> --- AUTHORS | 4 + COPYING | 378 ++ Makefile.am | 141 + README | 10 + autogen.sh | 9 + configure.ac | 139 + examples/perf.c | 657 ++++ examples/provinfo.c | 95 + examples/shared.c | 111 + examples/shared.h | 53 + include/fi.h | 132 + include/infiniband/ib.h | 107 + include/rdma/fabric.h | 387 ++ include/rdma/fi_arch.h | 117 + include/rdma/fi_atomic.h | 54 + include/rdma/fi_cm.h | 118 + include/rdma/fi_domain.h | 434 +++ include/rdma/fi_errno.h | 179 + include/rdma/fi_prov.h | 78 + include/rdma/fi_rdma.h | 93 + include/rdma/fi_socket.h | 187 + include/rdma/fi_tagged.h | 111 + include/rdma/fi_ucma.h | 718 ++++ include/rdma/fi_umad.h | 112 + include/rdma/fi_uverbs.h | 1289 +++++++ libfabric.spec.in | 71 + man/fi_getinfo.3 | 53 + man/fi_open.3 | 27 + man/fi_socket.3 | 30 + prov/ibverbs/AUTHORS | 4 + prov/ibverbs/COPYING | 378 ++ prov/ibverbs/include/infiniband/driver.h | 145 + prov/ibverbs/include/infiniband/marshall.h | 64 + prov/ibverbs/include/infiniband/opcode.h | 147 + prov/ibverbs/include/infiniband/verbs.h | 1158 ++++++ prov/ibverbs/src/cmd.c | 879 +++++ prov/ibverbs/src/device.c | 255 ++ prov/ibverbs/src/enum_strs.c | 128 + prov/ibverbs/src/fi_verbs.c | 1277 +++++++ prov/ibverbs/src/ibverbs.h | 62 + prov/ibverbs/src/init.c | 473 +++ prov/ibverbs/src/marshall.c | 144 + prov/ibverbs/src/memory.c | 719 ++++ prov/ibverbs/src/verbs.c | 534 +++ prov/mlx4/AUTHORS | 1 + prov/mlx4/COPYING | 378 ++ prov/mlx4/src/buf.c | 65 + prov/mlx4/src/cq.c | 480 +++ prov/mlx4/src/dbrec.c | 154 + prov/mlx4/src/doorbell.h | 63 + prov/mlx4/src/mlx4-abi.h | 108 + prov/mlx4/src/mlx4.c | 276 ++ prov/mlx4/src/mlx4.h | 350 ++ prov/mlx4/src/mlx4_verbs.c | 741 ++++ prov/mlx4/src/qp.c | 702 ++++ prov/mlx4/src/srq.c | 175 + prov/mlx4/src/wqe.h | 121 + prov/psm/AUTHORS | 1 + prov/psm/COPYING | 378 ++ prov/psm/src/psmx.h | 91 + prov/psm/src/psmx_av.c | 145 + prov/psm/src/psmx_cm.c | 105 + prov/psm/src/psmx_domain.c | 177 + prov/psm/src/psmx_ec.c | 205 + prov/psm/src/psmx_init.c | 163 + prov/psm/src/psmx_sock.c | 177 + prov/psm/src/psmx_tagged.c | 173 + prov/psm/src/psmx_util.c | 270 ++ prov/rdmacm/AUTHORS | 1 + prov/rdmacm/COPYING | 378 ++ prov/rdmacm/examples/common.c | 168 + prov/rdmacm/examples/common.h | 94 + prov/rdmacm/examples/rcopy.c | 628 ++++ prov/rdmacm/examples/riostream.c | 639 ++++ prov/rdmacm/examples/rstream.c | 609 +++ prov/rdmacm/examples/udpong.c | 568 +++ prov/rdmacm/include/rdma/rdma_cma.h | 684 ++++ prov/rdmacm/include/rdma/rdma_verbs.h | 316 ++ prov/rdmacm/include/rdma/rsocket.h | 99 + prov/rdmacm/src/acm.c | 439 +++ prov/rdmacm/src/addrinfo.c | 327 ++ prov/rdmacm/src/cma.c | 2210 +++++++++++ prov/rdmacm/src/cma.h | 155 + prov/rdmacm/src/indexer.c | 166 + prov/rdmacm/src/indexer.h | 144 + prov/rdmacm/src/preload.c | 1057 ++++++ prov/rdmacm/src/rsocket.c | 3970 ++++++++++++++++++++ src/fabric.c | 306 ++ src/libfabric.map | 38 + src/ucma.c | 497 +++ src/uverbs.c | 710 ++++ 91 files changed, 31963 insertions(+) create mode 100644 AUTHORS create mode 100644 COPYING create mode 100644 Makefile.am create mode 100644 README create mode 100755 autogen.sh create mode 100644 configure.ac create mode 100644 examples/perf.c create mode 100644 examples/provinfo.c create mode 100644 examples/shared.c create mode 100644 examples/shared.h create mode 100644 include/fi.h create mode 100644 include/infiniband/ib.h create mode 100644 include/rdma/fabric.h create mode 100644 include/rdma/fi_arch.h create mode 100644 include/rdma/fi_atomic.h create mode 100644 
include/rdma/fi_cm.h create mode 100644 include/rdma/fi_domain.h create mode 100644 include/rdma/fi_errno.h create mode 100644 include/rdma/fi_prov.h create mode 100644 include/rdma/fi_rdma.h create mode 100644 include/rdma/fi_socket.h create mode 100644 include/rdma/fi_tagged.h create mode 100644 include/rdma/fi_ucma.h create mode 100644 include/rdma/fi_umad.h create mode 100644 include/rdma/fi_uverbs.h create mode 100644 libfabric.spec.in create mode 100755 man/fi_getinfo.3 create mode 100644 man/fi_open.3 create mode 100644 man/fi_socket.3 create mode 100644 prov/ibverbs/AUTHORS create mode 100644 prov/ibverbs/COPYING create mode 100644 prov/ibverbs/include/infiniband/driver.h create mode 100644 prov/ibverbs/include/infiniband/marshall.h create mode 100644 prov/ibverbs/include/infiniband/opcode.h create mode 100644 prov/ibverbs/include/infiniband/verbs.h create mode 100644 prov/ibverbs/src/cmd.c create mode 100644 prov/ibverbs/src/device.c create mode 100644 prov/ibverbs/src/enum_strs.c create mode 100644 prov/ibverbs/src/fi_verbs.c create mode 100644 prov/ibverbs/src/ibverbs.h create mode 100644 prov/ibverbs/src/init.c create mode 100644 prov/ibverbs/src/marshall.c create mode 100644 prov/ibverbs/src/memory.c create mode 100644 prov/ibverbs/src/verbs.c create mode 100644 prov/mlx4/AUTHORS create mode 100644 prov/mlx4/COPYING create mode 100644 prov/mlx4/src/buf.c create mode 100644 prov/mlx4/src/cq.c create mode 100644 prov/mlx4/src/dbrec.c create mode 100644 prov/mlx4/src/doorbell.h create mode 100644 prov/mlx4/src/mlx4-abi.h create mode 100644 prov/mlx4/src/mlx4.c create mode 100644 prov/mlx4/src/mlx4.h create mode 100644 prov/mlx4/src/mlx4_verbs.c create mode 100644 prov/mlx4/src/qp.c create mode 100644 prov/mlx4/src/srq.c create mode 100644 prov/mlx4/src/wqe.h create mode 100644 prov/psm/AUTHORS create mode 100644 prov/psm/COPYING create mode 100644 prov/psm/src/psmx.h create mode 100644 prov/psm/src/psmx_av.c create mode 100644 prov/psm/src/psmx_cm.c create mode 100644 prov/psm/src/psmx_domain.c create mode 100644 prov/psm/src/psmx_ec.c create mode 100644 prov/psm/src/psmx_init.c create mode 100644 prov/psm/src/psmx_sock.c create mode 100644 prov/psm/src/psmx_tagged.c create mode 100644 prov/psm/src/psmx_util.c create mode 100644 prov/rdmacm/AUTHORS create mode 100644 prov/rdmacm/COPYING create mode 100644 prov/rdmacm/examples/common.c create mode 100644 prov/rdmacm/examples/common.h create mode 100644 prov/rdmacm/examples/rcopy.c create mode 100644 prov/rdmacm/examples/riostream.c create mode 100644 prov/rdmacm/examples/rstream.c create mode 100644 prov/rdmacm/examples/udpong.c create mode 100644 prov/rdmacm/include/rdma/rdma_cma.h create mode 100644 prov/rdmacm/include/rdma/rdma_verbs.h create mode 100644 prov/rdmacm/include/rdma/rsocket.h create mode 100644 prov/rdmacm/src/acm.c create mode 100644 prov/rdmacm/src/addrinfo.c create mode 100644 prov/rdmacm/src/cma.c create mode 100644 prov/rdmacm/src/cma.h create mode 100644 prov/rdmacm/src/indexer.c create mode 100644 prov/rdmacm/src/indexer.h create mode 100644 prov/rdmacm/src/preload.c create mode 100644 prov/rdmacm/src/rsocket.c create mode 100644 src/fabric.c create mode 100644 src/libfabric.map create mode 100644 src/ucma.c create mode 100644 src/uverbs.c diff --git a/AUTHORS b/AUTHORS new file mode 100644 index 00000000000..fcea3504a51 --- /dev/null +++ b/AUTHORS @@ -0,0 +1,4 @@ +Roland Dreier <roland@topspin.com> +Dotan Barak <dotanba@gmail.com> +Sean Hefty <sean.hefty@intel.com> +Michael S. 
Tsirkin <mst@mellanox.co.il> diff --git a/COPYING b/COPYING new file mode 100644 index 00000000000..39f3831585f --- /dev/null +++ b/COPYING @@ -0,0 +1,378 @@ +This software is available to you under a choice of one of two +licenses. You may choose to be licensed under the terms of the the +OpenIB.org BSD license or the GNU General Public License (GPL) Version +2, both included below. + +Copyright (c) 2005 Intel Corporation. All rights reserved. + +================================================================== + + OpenIB.org BSD license + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + +================================================================== + + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc. + 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Library General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. 
+ + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. 
+ + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. 
However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. 
Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. 
+ + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) year name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + <signature of Ty Coon>, 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Library General +Public License instead of this License. 
diff --git a/Makefile.am b/Makefile.am new file mode 100644 index 00000000000..74bb7bbe617 --- /dev/null +++ b/Makefile.am @@ -0,0 +1,141 @@ +AM_CPPFLAGS = -I$(srcdir)/include -I$(srcdir)/prov/ibverbs/include \ + -I$(srcdir)/prov/rdmacm/include + +lib_LTLIBRARIES = src/libfabric.la + +ACLOCAL_AMFLAGS = -I config +AM_CFLAGS = -g -Wall -D_GNU_SOURCE + +src_libfabric_la_CFLAGS = $(AM_CFLAGS) -DSYSCONFDIR=\"$(sysconfdir)\" -DRDMADIR=\"@rdmadir@\" + +if HAVE_LD_VERSION_SCRIPT + libfabric_version_script = -Wl,--version-script=$(srcdir)/src/libfabric.map +else + libfabric_version_script = +endif + +src_libfabric_la_SOURCES = src/fabric.c src/uverbs.c src/ucma.c \ + prov/ibverbs/src/cmd.c \ + prov/ibverbs/src/device.c \ + prov/ibverbs/src/enum_strs.c \ + prov/ibverbs/src/fi_verbs.c \ + prov/ibverbs/src/init.c \ + prov/ibverbs/src/marshall.c \ + prov/ibverbs/src/memory.c \ + prov/ibverbs/src/verbs.c \ + prov/rdmacm/src/acm.c \ + prov/rdmacm/src/addrinfo.c \ + prov/rdmacm/src/cma.c \ + prov/rdmacm/src/indexer.c \ + prov/rdmacm/src/rsocket.c \ + prov/mlx4/src/buf.c \ + prov/mlx4/src/cq.c \ + prov/mlx4/src/dbrec.c \ + prov/mlx4/src/mlx4.c \ + prov/mlx4/src/qp.c \ + prov/mlx4/src/srq.c \ + prov/mlx4/src/mlx4_verbs.c + +if HAVE_PSM +src_libfabric_la_SOURCES += prov/psm/src/psmx_init.c \ + prov/psm/src/psmx_domain.c \ + prov/psm/src/psmx_ec.c \ + prov/psm/src/psmx_av.c \ + prov/psm/src/psmx_sock.c \ + prov/psm/src/psmx_cm.c \ + prov/psm/src/psmx_tagged.c \ + prov/psm/src/psmx_util.c +endif + +src_libfabric_la_LDFLAGS = -version-info 1 -export-dynamic \ + $(libfabric_version_script) + +src_libfabric_la_DEPENDENCIES = $(srcdir)/src/libfabric.map + +bin_PROGRAMS = \ + prov/rdmacm/examples/rstream \ + prov/rdmacm/examples/rcopy \ + prov/rdmacm/examples/riostream \ + prov/rdmacm/examples/udpong \ + examples/fi_provinfo \ + examples/fi_perf + +prov_rdmacm_examples_rstream_SOURCES = \ + prov/rdmacm/examples/rstream.c \ + prov/rdmacm/examples/common.c +prov_rdmacm_examples_rstream_LDADD = \ + $(top_builddir)/src/libfabric.la +prov_rdmacm_examples_riostream_SOURCES = \ + prov/rdmacm/examples/riostream.c \ + prov/rdmacm/examples/common.c +prov_rdmacm_examples_riostream_LDADD = \ + $(top_builddir)/src/libfabric.la +prov_rdmacm_examples_rcopy_SOURCES = \ + prov/rdmacm/examples/rcopy.c +prov_rdmacm_examples_rcopy_LDADD = \ + $(top_builddir)/src/libfabric.la +prov_rdmacm_examples_udpong_SOURCES = \ + prov/rdmacm/examples/udpong.c \ + prov/rdmacm/examples/common.c +prov_rdmacm_examples_udpong_LDADD = \ + $(top_builddir)/src/libfabric.la +examples_fi_provinfo_SOURCES = \ + examples/provinfo.c \ + examples/shared.c +examples_fi_provinfo_LDADD = \ + $(top_builddir)/src/libfabric.la +examples_fi_perf_SOURCES = \ + examples/perf.c \ + examples/shared.c +examples_fi_perf_LDADD = \ + $(top_builddir)/src/libfabric.la + +libfabricincludedir = $(includedir)/rdma +infinibandincludedir = $(includedir)/infiniband + +libfabricinclude_HEADERS = $(top_srcdir)/include/rdma/fabric.h \ + $(top_srcdir)/include/rdma/fi_arch.h \ + $(top_srcdir)/include/rdma/fi_atomic.h \ + $(top_srcdir)/include/rdma/fi_cm.h \ + $(top_srcdir)/include/rdma/fi_domain.h \ + $(top_srcdir)/include/rdma/fi_prov.h \ + $(top_srcdir)/include/rdma/fi_rdma.h \ + $(top_srcdir)/include/rdma/fi_socket.h \ + $(top_srcdir)/include/rdma/fi_errno.h \ + $(top_srcdir)/include/rdma/fi_tagged.h \ + $(top_srcdir)/include/rdma/fi_ucma.h \ + $(top_srcdir)/include/rdma/fi_umad.h \ + $(top_srcdir)/include/rdma/fi_uverbs.h \ + $(top_srcdir)/prov/rdmacm/include/rdma/rsocket.h + 
+infinibandinclude_HEADERS = $(top_srcdir)/include/infiniband/ib.h + +man_MANS = man/fi_getinfo.3 man/fi_socket.3 man/fi_open.3 + +EXTRA_DIST = include/fi.h src/libfabric.map libfabric.spec.in $(man_MANS) \ + prov/ibverbs/include/infiniband/driver.h \ + prov/ibverbs/include/infiniband/marshall.h \ + prov/ibverbs/include/infiniband/opcode.h \ + prov/ibverbs/include/infiniband/sa.h \ + prov/ibverbs/include/infiniband/sa-kern-abi.h \ + prov/ibverbs/include/infiniband/verbs.h \ + prov/ibverbs/src/ibverbs.h \ + prov/rdmacm/include/rdma/rdma_cma.h \ + prov/rdmacm/include/rdma/rdma_verbs.h \ + prov/rdmacm/src/cma.h \ + prov/rdmacm/src/indexer.h \ + prov/mlx4/src/doorbell.h \ + prov/mlx4/src/mlx4.h \ + prov/mlx4/src/mlx4-abi.h \ + prov/mlx4/wqe.h \ + examples/shared.h + +dist-hook: libfabric.spec + cp libfabric.spec $(distdir) + +install-data-hook: + cd $(DESTDIR)$(mandir)/man3 && \ + $(RM) fi_freeinfo.3 && \ + $(RM) fi_close.3 && \ + $(LN_S) fi_getinfo.3 fi_freeinfo.3 && \ + $(LN_S) fi_open.3 fi_close.3 diff --git a/README b/README new file mode 100644 index 00000000000..1cebdef42c8 --- /dev/null +++ b/README @@ -0,0 +1,10 @@ +This README is for userspace RDMA fabric library. + +Building +======== +To make this directory, run: +./autogen.sh && ./configure && make && make install + +Typically the autogen and configure steps only need be done the first +time unless configure.ac or Makefile.am changes. + diff --git a/autogen.sh b/autogen.sh new file mode 100755 index 00000000000..f433312161d --- /dev/null +++ b/autogen.sh @@ -0,0 +1,9 @@ +#! /bin/sh + +set -x +test -d ./config || mkdir ./config +aclocal -I config +libtoolize --force --copy +autoheader +automake --foreign --add-missing --copy +autoconf diff --git a/configure.ac b/configure.ac new file mode 100644 index 00000000000..3bbb05a64d4 --- /dev/null +++ b/configure.ac @@ -0,0 +1,139 @@ +dnl Process this file with autoconf to produce a configure script. + +AC_PREREQ(2.57) +AC_INIT(libfabric, 0.0.1, linux-rdma@vger.kernel.org) +AC_CONFIG_SRCDIR([src/fabric.c]) +AC_CONFIG_AUX_DIR(config) +AC_CONFIG_MACRO_DIR(config) +AC_CONFIG_HEADERS(config.h) +AM_INIT_AUTOMAKE(libfabric, 0.0.1) +m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])]) + +AC_ARG_ENABLE([debug], + [AS_HELP_STRING([--enable-debug], + [Enable debugging @<:@default=no@:>@]) + ], + [CFLAGS="$CFLAGS -g -O0 -Wall"], + [enable_debug=no]) + +dnl Fix autoconf's habit of adding -g -O2 by default +AS_IF([test -z "$CFLAGS"], + [CFLAGS='-O2 -DNDEBUG -Wall']) + +AM_PROG_LIBTOOL + +AC_ARG_WITH([valgrind], + AC_HELP_STRING([--with-valgrind], + [Enable valgrind annotations - default NO])) + +if test "$with_valgrind" != "" && test "$with_valgrind" != "no"; then + AC_DEFINE([INCLUDE_VALGRIND], 1, + [Define to 1 to enable valgrind annotations]) + if test -d $with_valgrind; then + CPPFLAGS="$CPPLFAGS -I$with_valgrind/include" + fi +fi + +AC_ARG_ENABLE(libcheck, [ --disable-libcheck do not test for presence of libraries], +[ if test "$enableval" = "no"; then + disable_libcheck=yes + fi +]) + +dnl Checks for programs +AC_PROG_CC + +dnl Checks for typedefs, structures, and compiler characteristics. +AC_C_CONST +AC_CHECK_SIZEOF(long) + +dnl Checks for libraries +AC_CHECK_LIB(dl, dlsym, [], + AC_MSG_ERROR([dlsym() not found. libfabric requires libdl.])) +AC_CHECK_LIB(pthread, pthread_mutex_init, [], + AC_MSG_ERROR([pthread_mutex_init() not found. 
libfabric requires libpthread.])) + +dnl Check for gcc atomic intrinsics +AC_MSG_CHECKING(compiler support for atomics) +AC_TRY_LINK([int i = 0;], + [ return __sync_add_and_fetch(&i, 1) != __sync_sub_and_fetch(&i, 1); ], + [ AC_MSG_RESULT(yes) ], + [ + AC_MSG_RESULT(no) + AC_DEFINE(DEFINE_ATOMICS, 1, [Set to 1 to implement atomics]) + ]) + +dnl Checks for header files. +AC_HEADER_STDC + +if test "$disable_libcheck" != "yes"; then +if test "$with_valgrind" != "" && test "$with_valgrind" != "no"; then +AC_CHECK_HEADER(valgrind/memcheck.h, [], + AC_MSG_ERROR([valgrind requested but <valgrind/memcheck.h> not found.])) +fi +fi + +AC_CACHE_CHECK(whether ld accepts --version-script, ac_cv_version_script, + if test -n "`$LD --help < /dev/null 2>/dev/null | grep version-script`"; then + ac_cv_version_script=yes + else + ac_cv_version_script=no + fi) + +AM_CONDITIONAL(HAVE_LD_VERSION_SCRIPT, test "$ac_cv_version_script" = "yes") + +AC_ARG_ENABLE([psm], + [AS_HELP_STRING([--enable-psm], + [Enable PSM provider @<:@default=no@:>@]) + ], + [AC_DEFINE([HAVE_PSM], [1], [Define if PSM is enabled]) + LIBS="-lpsm_infinipath $LIBS"], + [enable_psm=no]) + +AC_ARG_WITH([psm], + [AS_HELP_STRING([--with-psm=@<:@PSM installation path@:>@], + [Provide path to PSM installation]) + ], + [AS_CASE([$with_psm], + [yes|no], [AC_DEFINE([HAVE_PSM], [1], [Define if PSM is enabled])], + [CPPFLAGS="-I$with_psm/include $CPPFLAGS" + LDFLAGS="-L$with_psm/lib64 -Wl,-rpath=$with_psm/lib64 $LDFLAGS" + LIBS="-lpsm_infinipath $LIBS" + AC_DEFINE([HAVE_PSM], [1], [Define if PSM is enabled])]) + ]) + +AC_ARG_WITH([psm-include], + [AS_HELP_STRING([--with-psm-include=@<:@PSM include path@:>@], + [Provide path to PSM include files]) + ], + [AS_CASE([$with_psm_include], + [yes|no], [AC_DEFINE([HAVE_PSM], [1], [Define if PSM is enabled])], + [CPPFLAGS="-I$with_psm_include $CPPFLAGS" + AC_DEFINE([HAVE_PSM], [1], [Define if PSM is enabled]) + ]) + ]) + +AC_ARG_WITH([psm-lib], + [AS_HELP_STRING([--with-psm-lib=@<:@PSM library path@:>@], + [Provide path to PSM library files]) + ], + [AS_CASE([$with_psm_lib], + [yes|no], [], + [LDFLAGS="-L$with_psm_lib -Wl,-rpath=$with_psm_lib $LDFLAGS" + LIBS="-lpsm_infinipath $LIBS" + AC_DEFINE([HAVE_PSM], [1], [Define if PSM is enabled]) + ]) + ]) + +AS_IF([test x"$enable_psm" = x"yes"], + [AC_CHECK_LIB(psm_infinipath, psm_init, + [AC_CHECK_HEADER([psm.h], [], + [AC_MSG_ERROR([psm.h not found. Provide the correct path to PSM with --with-psm-include (or --with-psm)])] + )], + AC_MSG_ERROR([psm_init() not found. Provide the correct path to PSM --with-psm-lib]))], + [AC_MSG_NOTICE(PSM not enabled)]) + +AM_CONDITIONAL([HAVE_PSM], [test x"$enable_psm" = x"yes"]) + +AC_CONFIG_FILES([Makefile libfabric.spec]) +AC_OUTPUT diff --git a/examples/perf.c b/examples/perf.c new file mode 100644 index 00000000000..6260aea9a51 --- /dev/null +++ b/examples/perf.c @@ -0,0 +1,657 @@ +/* + * Copyright (c) 2013 Intel Corporation. All rights reserved. + * + * This software is available to you under the OpenIB.org BSD license + * below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AWV + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <strings.h> +#include <errno.h> +#include <getopt.h> +#include <sys/types.h> +#include <sys/socket.h> +#include <sys/time.h> +#include <sys/wait.h> +#include <netdb.h> +#include <fcntl.h> +#include <unistd.h> +#include <netinet/in.h> +#include <netinet/tcp.h> + +#include <rdma/fabric.h> +#include <rdma/fi_domain.h> +#include <rdma/fi_errno.h> +#include <rdma/fi_socket.h> +#include <rdma/fi_cm.h> +#include "shared.h" + + +struct test_size_param { + int size; + int option; +}; + +static struct test_size_param test_size[] = { + { 1 << 6, 0 }, + { 1 << 7, 1 }, { (1 << 7) + (1 << 6), 1}, + { 1 << 8, 1 }, { (1 << 8) + (1 << 7), 1}, + { 1 << 9, 1 }, { (1 << 9) + (1 << 8), 1}, + { 1 << 10, 1 }, { (1 << 10) + (1 << 9), 1}, + { 1 << 11, 1 }, { (1 << 11) + (1 << 10), 1}, + { 1 << 12, 0 }, { (1 << 12) + (1 << 11), 1}, + { 1 << 13, 1 }, { (1 << 13) + (1 << 12), 1}, + { 1 << 14, 1 }, { (1 << 14) + (1 << 13), 1}, + { 1 << 15, 1 }, { (1 << 15) + (1 << 14), 1}, + { 1 << 16, 0 }, { (1 << 16) + (1 << 15), 1}, + { 1 << 17, 1 }, { (1 << 17) + (1 << 16), 1}, + { 1 << 18, 1 }, { (1 << 18) + (1 << 17), 1}, + { 1 << 19, 1 }, { (1 << 19) + (1 << 18), 1}, + { 1 << 20, 0 }, { (1 << 20) + (1 << 19), 1}, + { 1 << 21, 1 }, { (1 << 21) + (1 << 20), 1}, + { 1 << 22, 1 }, { (1 << 22) + (1 << 21), 1}, +}; +#define TEST_CNT (sizeof test_size / sizeof test_size[0]) + +enum perf_optimization { + opt_latency, + opt_bandwidth +}; + +#define SEND_CONTEXT NULL + +static int custom; +static enum perf_optimization optimization; +static int size_option; +static int iterations = 1; +static int transfer_size = 1000; +static int transfer_count = 1000; +/* TODO: make max_credits dynamic based on user input or socket size */ +static int max_credits = 128; +static int credits = 128; +static char test_name[10] = "custom"; +static struct timeval start, end; +static void *buf; +static size_t buffer_size; + +static struct fi_info hints; +static char *dst_addr, *src_addr; +static char *port = "9228"; +static fid_t lfs, ldom, lcm; +static fid_t fs, dom, mr, cq; + + +static void show_perf(void) +{ + char str[32]; + float usec; + long long bytes; + + usec = (end.tv_sec - start.tv_sec) * 1000000 + (end.tv_usec - start.tv_usec); + bytes = (long long) iterations * transfer_count * transfer_size * 2; + + /* name size transfers iterations bytes seconds Gb/sec usec/xfer */ + printf("%-10s", test_name); + size_str(str, sizeof str, transfer_size); + printf("%-8s", str); + cnt_str(str, sizeof str, transfer_count); + printf("%-8s", str); + cnt_str(str, sizeof str, iterations); + printf("%-8s", str); + size_str(str, sizeof str, bytes); + printf("%-8s", str); + printf("%8.2fs%10.2f%11.2f\n", + usec / 1000000., (bytes * 8) / (1000. 
* usec), + (usec / iterations) / (transfer_count * 2)); +} + +static void init_latency_test(int size) +{ + char sstr[5]; + + size_str(sstr, sizeof sstr, size); + snprintf(test_name, sizeof test_name, "%s_lat", sstr); + transfer_count = 1; + transfer_size = size; + iterations = size_to_count(transfer_size); +} + +static void init_bandwidth_test(int size) +{ + char sstr[5]; + + size_str(sstr, sizeof sstr, size); + snprintf(test_name, sizeof test_name, "%s_bw", sstr); + iterations = 1; + transfer_size = size; + transfer_count = size_to_count(transfer_size); +} + +static int poll_all(void) +{ + struct fi_ec_entry comp; + int ret; + + do { + ret = fi_ec_read(cq, &comp, sizeof comp); + if (ret > 0) { + if (comp.op_context == SEND_CONTEXT) + credits++; + } else if (ret < 0) { + printf("Completion queue read %d (%s)\n", ret, fi_strerror(-ret)); + return ret; + } + } while (ret); + return 0; +} + +static int send_xfer(int size) +{ + struct fi_ec_entry comp; + int ret; + + while (!credits) { + ret = fi_ec_read(cq, &comp, sizeof comp); + if (ret > 0) { + if (comp.op_context == SEND_CONTEXT) + goto post; + } else if (ret < 0) { + printf("Completion queue read %d (%s)\n", ret, fi_strerror(-ret)); + return ret; + } + } + + credits--; +post: + ret = fi_sendmem(fs, buf, size, fi_mr_desc(mr), SEND_CONTEXT); + if (ret) + printf("fi_write %d (%s)\n", ret, fi_strerror(-ret)); + + return ret; +} + +static int recv_xfer(int size) +{ + struct fi_ec_entry comp; + int ret; + + while (1) { + ret = fi_ec_read(cq, &comp, sizeof comp); + if (ret > 0) { + if (comp.op_context == SEND_CONTEXT) + credits++; + else + break; + } else if (ret < 0) { + printf("Completion queue read %d (%s)\n", ret, fi_strerror(-ret)); + return ret; + } + } + + ret = fi_recvmem(fs, buf, buffer_size, fi_mr_desc(mr), buf); + if (ret) + printf("fi_recvmem %d (%s)\n", ret, fi_strerror(-ret)); + + return ret; +} + +static int sync_test(void) +{ + int ret; + + while (credits < max_credits) + poll_all(); + + ret = dst_addr ? send_xfer(16) : recv_xfer(16); + if (ret) + return ret; + + return dst_addr ? recv_xfer(16) : send_xfer(16); +} + +static int run_test(void) +{ + int ret, i, t; + + ret = sync_test(); + if (ret) + goto out; + + gettimeofday(&start, NULL); + for (i = 0; i < iterations; i++) { + for (t = 0; t < transfer_count; t++) { + ret = dst_addr ? send_xfer(transfer_size) : + recv_xfer(transfer_size); + if (ret) + goto out; + } + + for (t = 0; t < transfer_count; t++) { + ret = dst_addr ? 
recv_xfer(transfer_size) : + send_xfer(transfer_size); + if (ret) + goto out; + } + } + gettimeofday(&end, NULL); + show_perf(); + ret = 0; + +out: + return ret; +} + +static int alloc_cm_ec(fid_t dom, fid_t *cm_ec) +{ + struct fi_ec_attr cm_attr; + int ret; + + memset(&cm_attr, 0, sizeof cm_attr); + cm_attr.ec_mask = FI_EC_ATTR_MASK_V1; + cm_attr.domain = FI_EC_DOMAIN_CM; + cm_attr.type = FI_EC_QUEUE; + cm_attr.format = FI_EC_FORMAT_CM; + cm_attr.wait_obj = FI_EC_WAIT_FD; + cm_attr.flags = FI_AUTO_RESET; + ret = fi_ec_open(dom, &cm_attr, cm_ec, NULL); + if (ret) + printf("fi_ec_open cm %s\n", fi_strerror(-ret)); + + return ret; +} + +static void free_lres(void) +{ + fi_close(lcm); + fi_close(ldom); +} + +static int alloc_lres(struct fi_info *fi) +{ + int ret; + + ret = fi_open(NULL, fi, 0, &ldom, NULL); + if (ret) { + printf("fi_open %s %s\n", fi->domain_name, fi_strerror(-ret)); + return ret; + } + + ret = alloc_cm_ec(ldom, &lcm); + if (ret) + fi_close(ldom); + + return ret; +} + +static void free_res(void) +{ + fi_mr_unreg(mr); + fi_close(cq); + fi_close(dom); + free(buf); +} + +static int alloc_res(struct fi_info *fi) +{ + struct fi_ec_attr cq_attr; + int ret; + + buffer_size = !custom ? test_size[TEST_CNT - 1].size : transfer_size; + buf = malloc(buffer_size); + if (!buf) { + perror("malloc"); + return -1; + } + + ret = fi_open(NULL, fi, 0, &dom, NULL); + if (ret) { + printf("fi_open %s %s\n", fi->domain_name, fi_strerror(-ret)); + goto err1; + } + + memset(&cq_attr, 0, sizeof cq_attr); + cq_attr.ec_mask = FI_EC_ATTR_MASK_V1; + cq_attr.domain = FI_EC_DOMAIN_COMP; + cq_attr.type = FI_EC_QUEUE; + cq_attr.format = FI_EC_FORMAT_CONTEXT; + cq_attr.wait_obj = FI_EC_WAIT_NONE; + cq_attr.size = max_credits << 1; + ret = fi_ec_open(dom, &cq_attr, &cq, NULL); + if (ret) { + printf("fi_eq_open comp %s\n", fi_strerror(-ret)); + goto err2; + } + + ret = fi_mr_reg(dom, buf, buffer_size, &mr, 0, NULL); + if (ret) { + printf("fi_mr_reg %s\n", fi_strerror(-ret)); + goto err3; + } + return 0; + +err3: + fi_close(cq); +err2: + fi_close(dom); +err1: + free(buf); + return ret; +} + +static int bind_fid(fid_t sock, fid_t res, uint64_t flags) +{ + struct fi_resource fr; + int ret; + + fr.fid = res; + fr.flags = flags; + ret = fi_bind(sock, &fr, 1); + if (ret) + printf("fi_bind %s\n", fi_strerror(-ret)); + return ret; +} + +static int bind_lres(void) +{ + return bind_fid(lfs, lcm, 0); +} + +static int bind_res(void) +{ + int ret; + + ret = bind_fid(fs, cq, FI_SEND | FI_RECV); + if (!ret) { + ret = fi_recvmem(fs, buf, buffer_size, fi_mr_desc(mr), buf); + if (ret) + printf("fi_read %d (%s)\n", ret, fi_strerror(-ret)); + } + return ret; +} + +static int server_listen(void) +{ + struct fi_info *fi; + int ret; + + hints.flags = FI_PASSIVE; + ret = fi_getinfo(src_addr, port, &hints, &fi); + if (ret) { + printf("fi_getinfo %s\n", strerror(-ret)); + return ret; + } + + ret = fi_socket(fi, &lfs, NULL); + if (ret) { + printf("fi_socket %s\n", fi_strerror(-ret)); + goto err1; + } + + ret = alloc_lres(fi); + if (ret) + goto err2; + + ret = bind_lres(); + if (ret) + goto err3; + + ret = fi_listen(lfs); + if (ret) { + printf("fi_listen %s\n", fi_strerror(-ret)); + goto err3; + } + + fi_freeinfo(fi); + return 0; +err3: + free_lres(); +err2: + fi_close(lfs); +err1: + fi_freeinfo(fi); + return ret; +} + +static int server_connect(void) +{ + struct fi_ec_cm_entry entry; + ssize_t rd; + int ret; + + rd = fi_ec_read(lcm, &entry, sizeof entry); + if (rd != sizeof entry) { + printf("fi_ec_read %zd %s\n", rd, fi_strerror((int) 
-rd)); + return (int) rd; + } + + if (entry.event != FI_CONNREQ) { + printf("Unexpected CM event %d\n", entry.event); + ret = -FI_EOTHER; + goto err1; + } + + ret = fi_socket(entry.info, &fs, NULL); + if (ret) { + printf("fi_socket for req %s\n", fi_strerror(-ret)); + goto err1; + } + + ret = alloc_res(entry.info); + if (ret) + goto err2; + + ret = bind_res(); + if (ret) + goto err3; + + ret = fi_accept(fs, NULL, 0); + if (ret) { + printf("fi_accept %s\n", fi_strerror(-ret)); + goto err3; + } + + fi_freeinfo(entry.info); + return 0; + +err3: + free_res(); +err2: + fi_close(fs); +err1: + fi_freeinfo(entry.info); + return ret; +} + +static int client_connect(void) +{ + struct fi_info *fi; + int ret; + + if (src_addr) { + ret = getaddr(src_addr, NULL, (struct sockaddr **) &hints.src_addr, + (socklen_t *) &hints.src_addrlen); + if (ret) + printf("source address error %s\n", gai_strerror(ret)); + } + + ret = fi_getinfo(dst_addr, port, &hints, &fi); + if (ret) { + printf("fi_getinfo %s\n", strerror(-ret)); + goto err1; + } + + ret = fi_socket(fi, &fs, NULL); + if (ret) { + printf("fi_socket %s\n", fi_strerror(-ret)); + goto err2; + } + + ret = alloc_res(fi); + if (ret) + goto err3; + + ret = bind_res(); + if (ret) + goto err4; + + ret = fi_connect(fs, NULL, 0); + if (ret) { + printf("fi_connect %s\n", fi_strerror(-ret)); + goto err4; + } + + if (hints.src_addr) + free(hints.src_addr); + fi_freeinfo(fi); + return 0; + +err4: + free_res(); +err3: + fi_close(fs); +err2: + fi_freeinfo(fi); +err1: + if (hints.src_addr) + free(hints.src_addr); + return ret; +} + +static int run(void) +{ + int i, ret = 0; + + if (!dst_addr) { + ret = server_listen(); + if (ret) + return ret; + } + + printf("%-10s%-8s%-8s%-8s%-8s%8s %10s%13s\n", + "name", "bytes", "xfers", "iters", "total", "time", "Gb/sec", "usec/xfer"); + if (!custom) { + optimization = opt_latency; + ret = dst_addr ? client_connect() : server_connect(); + if (ret) + return ret; + + for (i = 0; i < TEST_CNT; i++) { + if (test_size[i].option > size_option) + continue; + init_latency_test(test_size[i].size); + run_test(); + } + + /* + * disable bandwidth test until we have a correct flooding + * message protocol + fi_shutdown(fs, 0); + poll_all(); + fi_close(fs); + free_res(); + + optimization = opt_bandwidth; + ret = dst_addr ? client_connect() : server_connect(); + if (ret) + return ret; + + for (i = 0; i < TEST_CNT; i++) { + if (test_size[i].option > size_option) + continue; + init_bandwidth_test(test_size[i].size); + run_test(); + } + */ + } else { + ret = dst_addr ? 
client_connect() : server_connect(); + if (ret) + return ret; + + ret = run_test(); + } + + while (credits < max_credits) + poll_all(); + fi_shutdown(fs, 0); + fi_close(fs); + free_res(); + if (!dst_addr) + free_lres(); + return ret; +} + +int main(int argc, char **argv) +{ + int op, ret; + + while ((op = getopt(argc, argv, "d:n:p:s:C:I:S:")) != -1) { + switch (op) { + case 'd': + dst_addr = optarg; + break; + case 'n': + hints.domain_name = optarg; + break; + case 'p': + port = optarg; + break; + case 's': + src_addr = optarg; + break; + case 'C': + custom = 1; + transfer_count = atoi(optarg); + break; + case 'I': + custom = 1; + iterations = atoi(optarg); + break; + case 'S': + if (!strncasecmp("all", optarg, 3)) { + size_option = 1; + } else { + custom = 1; + transfer_size = atoi(optarg); + } + break; + default: + printf("usage: %s\n", argv[0]); + printf("\t[-d destination_address]\n"); + printf("\t[-n domain_name]\n"); + printf("\t[-p port_number]\n"); + printf("\t[-s source_address]\n"); + printf("\t[-C transfer_count]\n"); + printf("\t[-I iterations]\n"); + printf("\t[-S transfer_size or 'all']\n"); + exit(1); + } + } + + hints.type = FID_MSG; + ret = run(); + return ret; +} diff --git a/examples/provinfo.c b/examples/provinfo.c new file mode 100644 index 00000000000..3e09cc7a0fa --- /dev/null +++ b/examples/provinfo.c @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2013 Intel Corporation. All rights reserved. + * + * This software is available to you under the OpenIB.org BSD license + * below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AWV + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include <errno.h> +#include <getopt.h> +#include <netdb.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/socket.h> +#include <sys/types.h> + +#include <rdma/fabric.h> +#include "shared.h" + + +static struct fi_info hints; +static char *dst_addr; + +static int run(void) +{ + struct fi_info *fi, *cur; + int ret; + + ret = fi_getinfo(dst_addr, NULL, &hints, &fi); + if (ret) { + printf("fi_getinfo %s\n", strerror(-ret)); + return ret; + } + + for (cur = fi; cur; cur = cur->next) { + printf("domain: %s\n", cur->domain_name); + } + + return ret; +} + +int main(int argc, char **argv) +{ + int op, ret; + + while ((op = getopt(argc, argv, "d:n:s:")) != -1) { + switch (op) { + case 'd': + dst_addr = optarg; + break; + case 'n': + hints.domain_name = optarg; + break; + case 's': + ret = getaddr(optarg, NULL, (struct sockaddr **) &hints.src_addr, + (socklen_t *) &hints.src_addrlen); + if (ret) { + printf("source address error %s\n", + gai_strerror(errno)); + } + break; + default: + printf("usage: %s\n", argv[0]); + printf("\t[-d destination_address]\n"); + printf("\t[-n domain_name]\n"); + printf("\t[-s source_address]\n"); + exit(1); + } + } + + ret = run(); + return ret; +} diff --git a/examples/shared.c b/examples/shared.c new file mode 100644 index 00000000000..7925f41cf1b --- /dev/null +++ b/examples/shared.c @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2013 Intel Corporation. All rights reserved. + * + * This software is available to you under the OpenIB.org BSD license + * below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AWV + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include <errno.h> +#include <netdb.h> +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <sys/socket.h> +#include <sys/types.h> + +#include "shared.h" + + +int getaddr(char *node, char *service, struct sockaddr **addr, socklen_t *len) +{ + struct addrinfo *ai; + int ret; + + ret = getaddrinfo(node, service, NULL, &ai); + if (ret) + return ret; + + if ((*addr = malloc(ai->ai_addrlen))) { + memcpy(*addr, ai->ai_addr, ai->ai_addrlen); + *len = ai->ai_addrlen; + } else { + ret = EAI_MEMORY; + } + + freeaddrinfo(ai); + return ret; +} + +void size_str(char *str, size_t ssize, long long size) +{ + long long base, fraction = 0; + char mag; + + if (size >= (1 << 30)) { + base = 1 << 30; + mag = 'g'; + } else if (size >= (1 << 20)) { + base = 1 << 20; + mag = 'm'; + } else if (size >= (1 << 10)) { + base = 1 << 10; + mag = 'k'; + } else { + base = 1; + mag = '\0'; + } + + if (size / base < 10) + fraction = (size % base) * 10 / base; + if (fraction) { + snprintf(str, ssize, "%lld.%lld%c", size / base, fraction, mag); + } else { + snprintf(str, ssize, "%lld%c", size / base, mag); + } +} + +void cnt_str(char *str, size_t ssize, long long cnt) +{ + if (cnt >= 1000000000) + snprintf(str, ssize, "%lldb", cnt / 1000000000); + else if (cnt >= 1000000) + snprintf(str, ssize, "%lldm", cnt / 1000000); + else if (cnt >= 1000) + snprintf(str, ssize, "%lldk", cnt / 1000); + else + snprintf(str, ssize, "%lld", cnt); +} + +int size_to_count(int size) +{ + if (size >= (1 << 20)) + return 100; + else if (size >= (1 << 16)) + return 1000; + else if (size >= (1 << 10)) + return 10000; + else + return 100000; +} diff --git a/examples/shared.h b/examples/shared.h new file mode 100644 index 00000000000..1fb3660852e --- /dev/null +++ b/examples/shared.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2013 Intel Corporation. All rights reserved. + * + * This software is available to you under the OpenIB.org BSD license + * below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AWV + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef _SHARED_H_ +#define _SHARED_H_ + +#include <sys/socket.h> +#include <sys/types.h> + +#include <rdma/fabric.h> + +#ifdef __cplusplus +extern "C" { +#endif + + +int getaddr(char *node, char *service, struct sockaddr **addr, socklen_t *len); +void size_str(char *str, size_t ssize, long long size); +void cnt_str(char *str, size_t ssize, long long cnt); +int size_to_count(int size); + + +#ifdef __cplusplus +} +#endif + +#endif /* _SHARED_H_ */ diff --git a/include/fi.h b/include/fi.h new file mode 100644 index 00000000000..2c6d237d0fb --- /dev/null +++ b/include/fi.h @@ -0,0 +1,132 @@ +/* + * Copyright (c) 2013 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenFabrics.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _FI_H_ +#define _FI_H_ + +#include <endian.h> +#include <byteswap.h> +#include <rdma/fabric.h> +#include <rdma/fi_prov.h> + + +#ifdef __cplusplus +extern "C" { +#endif + +#define PFX "libfabric: " + +#ifdef INCLUDE_VALGRIND +# include <valgrind/memcheck.h> +# ifndef VALGRIND_MAKE_MEM_DEFINED +# warning "Valgrind requested, but VALGRIND_MAKE_MEM_DEFINED undefined" +# endif +#endif + +#ifndef VALGRIND_MAKE_MEM_DEFINED +# define VALGRIND_MAKE_MEM_DEFINED(addr, len) +#endif + +#if __BYTE_ORDER == __LITTLE_ENDIAN +static inline be64_t htonll(uint64_t x) { return bswap_64(x); } +static inline uint64_t ntohll(be64_t x) { return bswap_64(x); } +#else +static inline be64_t htonll(uint64_t x) { return x; } +static inline uint64_t ntohll(be64_t x) { return x; } +#endif + +#define max(a, b) ((a) > (b) ? a : b) +#define min(a, b) ((a) < (b) ? 
a : b) + +struct fi_prov { + struct fi_prov *next; + struct fi_ops_prov *ops; +}; + +struct uv_dev { + struct uv_dev *next; + char sysfs_name[FI_NAME_MAX]; + char dev_name[FI_NAME_MAX]; + char sysfs_path[FI_PATH_MAX]; + char dev_path[FI_PATH_MAX]; +}; + +extern int uv_abi_ver; +extern struct uv_dev *udev_head, *udev_tail; + +int fi_init(void); + +void uv_ini(void); +void uv_fini(void); +int uv_init(void); + +void ibv_ini(void); +void ibv_fini(void); + +void ucma_ini(void); +void ucma_fini(void); +int ucma_init(void); + +void rdma_cm_ini(void); +void rdma_cm_fini(void); + +void mlx4_ini(void); +void mlx4_fini(void); + +#ifdef HAVE_PSM +void psmx_ini(void); +void psmx_fini(void); +#else +#define psmx_ini() +#define psmx_fini() +#endif + +const char *fi_sysfs_path(void); +int fi_read_file(const char *dir, const char *file, char *buf, size_t size); +void __fi_freeinfo(struct fi_info *info); + +#define IBV_PREFIX "ibv" +#ifndef SYSCONFDIR +#define SYSCONFDIR "/etc" +#endif +#ifndef RDMADIR +#define RDMADIR "rdma" +#endif +#define RDMA_CONF_DIR SYSCONFDIR "/" RDMADIR +#define FI_CONF_DIR RDMA_CONF_DIR "/fabric" + + +#ifdef __cplusplus +} +#endif + +#endif /* _FI_H_ */ diff --git a/include/infiniband/ib.h b/include/infiniband/ib.h new file mode 100644 index 00000000000..2e5029ac29b --- /dev/null +++ b/include/infiniband/ib.h @@ -0,0 +1,107 @@ +/* + * Copyright (c) 2010 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#if !defined(_RDMA_IB_H) +#define _RDMA_IB_H + +#include <linux/types.h> +#include <string.h> + +#ifndef AF_IB +#define AF_IB 27 +#endif +#ifndef PF_IB +#define PF_IB AF_IB +#endif + +#ifndef __be16 +#define __be16 __u16 +#endif +#ifndef __be32 +#define __be32 __u32 +#endif +#ifndef __be64 +#define __be64 __u64 +#endif + +struct ib_addr { + union { + __u8 uib_addr8[16]; + __be16 uib_addr16[8]; + __be32 uib_addr32[4]; + __be64 uib_addr64[2]; + } ib_u; +#define sib_addr8 ib_u.uib_addr8 +#define sib_addr16 ib_u.uib_addr16 +#define sib_addr32 ib_u.uib_addr32 +#define sib_addr64 ib_u.uib_addr64 +#define sib_raw ib_u.uib_addr8 +#define sib_subnet_prefix ib_u.uib_addr64[0] +#define sib_interface_id ib_u.uib_addr64[1] +}; + +static inline int ib_addr_any(const struct ib_addr *a) +{ + return ((a->sib_addr64[0] | a->sib_addr64[1]) == 0); +} + +static inline int ib_addr_loopback(const struct ib_addr *a) +{ + return ((a->sib_addr32[0] | a->sib_addr32[1] | + a->sib_addr32[2] | (a->sib_addr32[3] ^ htonl(1))) == 0); +} + +static inline void ib_addr_set(struct ib_addr *addr, + __be32 w1, __be32 w2, __be32 w3, __be32 w4) +{ + addr->sib_addr32[0] = w1; + addr->sib_addr32[1] = w2; + addr->sib_addr32[2] = w3; + addr->sib_addr32[3] = w4; +} + +static inline int ib_addr_cmp(const struct ib_addr *a1, const struct ib_addr *a2) +{ + return memcmp(a1, a2, sizeof(struct ib_addr)); +} + +struct sockaddr_ib { + unsigned short int sib_family; /* AF_IB */ + __be16 sib_pkey; + __be32 sib_flowinfo; + struct ib_addr sib_addr; + __be64 sib_sid; + __be64 sib_sid_mask; + __u64 sib_scope_id; +}; + +#endif /* _RDMA_IB_H */ diff --git a/include/rdma/fabric.h b/include/rdma/fabric.h new file mode 100644 index 00000000000..2fa6c84201c --- /dev/null +++ b/include/rdma/fabric.h @@ -0,0 +1,387 @@ +/* + * Copyright (c) 2013 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenFabrics.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
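For illustration, a sockaddr_ib for the IB loopback address can be built with the helpers above; the pkey value and the zeroed service ID fields are placeholders, since real values come from the fabric.

#include <string.h>
#include <arpa/inet.h>
#include <infiniband/ib.h>

static void example_loopback_sib(struct sockaddr_ib *sib)
{
	memset(sib, 0, sizeof *sib);
	sib->sib_family = AF_IB;
	sib->sib_pkey = 0xFFFF;		/* placeholder: default partition */
	/* ::1-style layout; ib_addr_loopback(&sib->sib_addr) now returns nonzero */
	ib_addr_set(&sib->sib_addr, 0, 0, 0, htonl(1));
}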
+ */ + +#ifndef _FABRIC_H_ +#define _FABRIC_H_ + +#include <stdint.h> +#include <stddef.h> +#include <assert.h> +#include <sys/socket.h> +#include <assert.h> + +#ifdef __cplusplus +extern "C" { +#endif + + +typedef uint16_t be16_t; +typedef uint32_t be32_t; +typedef uint64_t be64_t; + +#ifndef container_of +#define container_of(ptr, type, field) \ + ((type *) ((char *)ptr - offsetof(type, field))) +#endif + +enum { + FI_PATH_MAX = 256, + FI_NAME_MAX = 64, + FI_VERSION_MAX = 64 +}; + +/* fi_info and operation flags - pass into socket ops calls. + * A user may also set these on a socket by using fcntl, which has the + * affect of applying them to all applicable operations. + */ + +/* PASSIVE - Indicates that the allocated socket will be used + * to listen for connection requests. + * fi_info + */ +#define FI_PASSIVE (1ULL << 0) +/* NUMERICHOST - The node parameter passed into fi_getinfo is a + * numeric IP address or GID. When set, name resolution is not + * performed. + * fi_info + */ +#define FI_NUMERICHOST (1ULL << 1) +/* FAMILY - If set, then the node parameter passed into fi_getinfo + * is encoded address. The format of the address is given by the + * sa_family field in fi_info. This flag is needed by providers + * in order to determine if an address is an IPv6 or GID based + * address. + * fi_info + */ +//#define FI_FAMILY (1ULL << 2) + +/* AUTO_RESET - automatically resets the event queue to generate + * a new wake-up event on the next entry. Example use: + * 1. wait on eq wait object -- poll(fd) + * 2. wait object is ready -- fd is readable + * 3. read eq to retrieve events + * 4. continue reading until read returns 0 + */ +#define FI_AUTO_RESET (1ULL << 7) + +/* fi_info type, fcntl, fi_open flags */ + +/* Reserve lower 8-bits for type selection + * fi_info type, fi_open, fcntl + */ +#define FI_NONBLOCK (1ULL << 8) +/* Reserve lower 8-bits for type selection + * fi_info type, fi_open, fcntl + */ +#define FI_SYNC (1ULL << 9) +/* EXCL - Indicates that the specified domain should not share + * resources with another opened domain. By default, resources + * associated with a resource domain are shared across all open + * calls by the same process. + * reserve lower 8-bits for type selection + * fi_info type, fi_open, fcntl + */ +#define FI_EXCL (1ULL << 10) +/* BUFFERED_RECV - If set, the provider should attempt to queue inbound + * data that arrives before a receive buffer has been posted. In the + * absence of this flag, any messages that arrive before a receive is + * posted are lost. + * When set, the user must use struct fi_context * as their per + * operation context. + * reserve lower 8-bits for type selection + * fi_info type, fi_open, fcntl + */ +/* TODO: Should buffered be its own bit */ +#define FI_BUFFERED_RECV (1ULL << 11) +/* CANCEL - Indicates that the user wants the ability to cancel + * the operation if it does not complete first. Providers use this + * to return a handle to the request, which the user may then cancel. + * Also used by search to indicate that a request should be canceled. + * fi_info type, fi_open, fcntl, data transfer ops + */ +#define FI_CANCEL (1ULL << 12) +/* SHARED_RECV - A socket created with this flag will share the same + * receive queue as other sockets created on the same domain. + * fi_info type, fi_open, fcntl + */ +/* TODO: should shared be its own bit? */ +#define FI_SHARED_RECV (1ULL << 13) +/* READ - Used to enable read access to data buffers. + */ +#define FI_READ (1ULL << 14) +/* WRITE - Used to enable write access to data buffers. 
+ */ +#define FI_WRITE (1ULL << 15) +/* RECV - Report recv completion EQs + */ +/* TODO: Use with buffered_recv / shared_recv? */ +#define FI_RECV (1ULL << 16) +/* SEND - Report send completion EQs + */ +/* TODO: Use with buffered_send? */ +#define FI_SEND (1ULL << 17) + +/* fcntl and data transfer ops */ + +#define FI_DONTWAIT FI_NONBLOCK +#define FI_PEEK (1ULL << 25) +/* ERRQUEUE - A read operation should retrieve any queued error data. + * In the case of a failure, a read operation may return an error code, + * indicating that an operation has failed and extended error data is + * available. Queued error data must be read before additional + * completions may be read. + * + * Added eq.readerr call, which should eliminate the need for this. + */ +#define FI_ERRQUEUE (1ULL << 26) +/* TRUNC - Signals that received data has been truncated. + */ +#define FI_TRUNC (1ULL << 27) +/* CTRUNC - Indicates that control data was truncated. Use case? + */ +#define FI_CTRUNC (1ULL << 28) +#define FI_ATRUNC (1ULL << 29) +/* IMM - Indicates that immediate data is available. IMM data is + * communicated to a receiver through completion data, rather than + * appearing in targeted receive buffers. + */ +#define FI_IMM (1ULL << 30) +/* NOCOMP - Indicates that no completion should be generated for the + * specified operation. + */ +#define FI_NOCOMP (1ULL << 31) +/* MORE: Indicates that additional requests are pending. Providers may + * use this to optimize access to hardware. + */ +#define FI_MORE (1ULL << 32) +/* SIGNAL - Indicates if a completion event should be generated. + */ +#define FI_SIGNAL (1ULL << 33) +/* BUFFERED_SEND - If set, the outbound data buffer should be returned + * to user immediately after the call returns, even if the operation is + * handled asynchronously. This may require that the provider copy + * the data into a local buffer and transfer out of that buffer. + */ +#define FI_BUFFERED_SEND (1ULL << 34) +/* ACK - Indicates that a completion event is not generated until the operation + * initiated is acknowledged by the remote side */ +#define FI_ACK (1ULL << 35) + +/* ERRINLINE - Error events are reported inline with other events, rather + * than through a separate error queue (see ERRQUEUE). + */ +#define FI_ERRINLINE (1ULL << 36) +/* REMOTE - Indicates remote access + */ +#define FI_REMOTE (1ULL << 37) + + +/* + * Format for 'vectored' data transfer calls: sendv, writev, etc. + */ +enum fi_iov_format { + FI_IOV, /* struct iovec */ + FI_IOMV, /* struct fi_iomv */ + FI_IOTAGGED, /* struct fi_iotagged */ + FI_IOTAGGEDV, /* struct fi_iotaggedv */ +}; + +/* + * Format for transport addresses: sendto, writeto, etc. + */ +enum fi_addr_format { + FI_ADDR, /* void * fi_addr */ + FI_AV, /* struct fi_av_addr */ + FI_ADDR_INDEX, /* size_t fi_addr */ + FI_INFO_ADDR, /* struct fi_info_addr */ + FI_SOCKADDR, /* struct sockaddr */ + FI_SOCKADDR_IN, /* struct sockaddr_in */ + FI_SOCKADDR_IN6, /* struct sockaddr_in6 */ + FI_SOCKADDR_IB, /* struct sockaddr_ib */ +}; + +struct fi_info { + struct fi_info *next; + size_t size; + uint64_t flags; + uint64_t type; + uint64_t protocol; + enum fi_iov_format iov_format; + enum fi_addr_format addr_format; + enum fi_addr_format info_addr_format; + size_t src_addrlen; + size_t dst_addrlen; + void *src_addr; + void *dst_addr; + /*char *src_canonname;*/ + /*char *dst_canonname;*/ + /* Authorization key is intended to limit communication with only + * those sockets sharing the same key. 
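A sketch of how these flags and formats are expected to combine in fi_getinfo() hints; which fields a given provider honors is still open, so the selection below is only an example.

#include <rdma/fabric.h>

static void example_hints(struct fi_info *hints)
{
	/* Listening socket; buffer unexpected messages (requires
	 * struct fi_context per operation); allow cancellation. */
	hints->flags = FI_PASSIVE | FI_BUFFERED_RECV | FI_CANCEL;
	hints->iov_format = FI_IOV;		/* struct iovec transfers */
	hints->addr_format = FI_SOCKADDR_IN;	/* sockaddr_in addressing */
}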
+ */ + size_t auth_keylen; + void *auth_key; + /* A shared_fd is intended to allow a domain to share resources + * and data with other processes that have access to the same + * shared_fd. Based on XRC work. + */ + int shared_fd; + char *domain_name; + size_t datalen; + void *data; +}; + +enum { + FID_CLASS_UNSPEC, + FID_CLASS_SOCKET, + FID_CLASS_RESOURCE_DOMAIN, + FID_CLASS_INTERFACE, + FID_CLASS_AV, + FID_CLASS_MR, + FID_CLASS_EC +}; + +/* See FI_BUFFERED_RECV, FI_CANCEL */ +struct fi_context { + void *internal[4]; +}; + +struct fid; +typedef struct fid *fid_t; + +struct fi_resource { + fid_t fid; + uint64_t flags; +}; + +struct fi_ops { + size_t size; + int (*close)(fid_t fid); + /* Associate resources with this object */ + int (*bind)(fid_t fid, struct fi_resource *fids, int nfids); + /* Operation that completes after all previous async requests complete */ + int (*sync)(fid_t fid, uint64_t flags, void *context); + /* low-level control - similar to fcntl & ioctl operations */ + int (*control)(fid_t fid, int command, void *arg); +}; + +/* All fabric interface descriptors must start with this structure */ +struct fid { + int fclass; + int size; + void *context; + struct fi_ops *ops; +}; + +#define FI_PREFIX "fi" +#define FI_DOMAIN_NAMES "domains" +#define FI_UNBOUND_NAME "local" + +int fi_getinfo(char *node, char *service, struct fi_info *hints, + struct fi_info **info); +void fi_freeinfo(struct fi_info *info); + +/* Either name or info must be provided. Providing both is allowed. */ +int fi_open(char *name, struct fi_info *info, uint64_t flags, + fid_t *fid, void *context); +/* + * Allocate a fabric socket. A fabric socket is a software construct. + */ +int fi_socket(struct fi_info *info, fid_t *fid, void *context); + +#define FI_ASSERT_CLASS(fid, f_class) assert(fid->fclass == f_class) +#define FI_ASSERT_FIELD(ptr, ftype, field) assert(ptr->size > offsetof(ftype, field)) +#define FI_ASSERT_OPS(fid, ftype, ops) FI_ASSERT_FIELD(fid, ftype, ops) +#define FI_ASSERT_OP(ops, otype, op) FI_ASSERT_FIELD(ops, otype, op) + +static inline int fi_close(fid_t fid) +{ + FI_ASSERT_OPS(fid, struct fid, ops); + FI_ASSERT_OP(fid->ops, struct fi_ops, close); + return fid->ops->close(fid); +} +#define fi_destroy(fid) fi_close(fid) + +static inline int fi_bind(fid_t fid, struct fi_resource *fids, int nfids) +{ + FI_ASSERT_OPS(fid, struct fid, ops); + FI_ASSERT_OP(fid->ops, struct fi_ops, bind); + return fid->ops->bind(fid, fids, nfids); +} + +static inline int fi_sync(fid_t fid, uint64_t flags, void *context) +{ + FI_ASSERT_OPS(fid, struct fid, ops); + FI_ASSERT_OP(fid->ops, struct fi_ops, sync); + return fid->ops->sync(fid, flags, context); +} + +/* control commands */ +enum { + FI_GETFIDFLAG, /* uint64_t flags */ + FI_SETFIDFLAG, /* uint64_t flags */ + FI_GETOPSFLAG, /* uint64_t flags */ + FI_SETOPSFLAG, /* uint64_t flags */ + + /* Duplicate a fid_t. This allows for 2 fids that refer to a single + * HW resource. Each fid may reference functions that are optimized + * for different use cases. + */ + FI_DUPFID, /* fid_t * */ + FI_GETECWAIT, /* void * wait object */ + + /* Start/stop an internal progress thread. This is only needed if the + * provider does not support active_progress, and the app does not + * want to poll for progress. + */ + FI_STARTPROGRESS, /* NULL - flags? */ + FI_STOPPROGRESS /* NULL - flags? */ +}; + +/* + * fi_control may be used to set the flags for data transfer operations. This + * is done using the FI_SETOPSFLAG command with arg a uint64_t flags value. 
The + * FI_READ, FI_WRITE, FI_SEND, FI_RECV flags indicate the type of data transfer + * that the flags should apply to, with other flags OR'ed in. + */ +static inline int fi_control(fid_t fid, int command, void *arg) +{ + FI_ASSERT_OPS(fid, struct fid, ops); + FI_ASSERT_OP(fid->ops, struct fi_ops, control); + return fid->ops->control(fid, command, arg); +} + + +#ifdef __cplusplus +} +#endif + +#endif /* _FABRIC_H_ */ diff --git a/include/rdma/fi_arch.h b/include/rdma/fi_arch.h new file mode 100644 index 00000000000..41616e2ff95 --- /dev/null +++ b/include/rdma/fi_arch.h @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2005 Topspin Communications. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _FI_ARCH_H_ +#define _FI_ARCH_H_ + +#include <stdint.h> + +/* + * Architecture-specific defines. Currently, an architecture is + * required to implement the following operations: + * + * mb() - memory barrier. No loads or stores may be reordered across + * this macro by either the compiler or the CPU. + * rmb() - read memory barrier. No loads may be reordered across this + * macro by either the compiler or the CPU. + * wmb() - write memory barrier. No stores may be reordered across + * this macro by either the compiler or the CPU. + * wc_wmb() - flush write combine buffers. No write-combined writes + * will be reordered across this macro by either the compiler or + * the CPU. + */ + +#if defined(__i386__) + +#define mb() asm volatile("lock; addl $0,0(%%esp) " ::: "memory") +#define rmb() mb() +#define wmb() asm volatile("" ::: "memory") +#define wc_wmb() mb() + +#elif defined(__x86_64__) + +/* + * Only use lfence for mb() and rmb() because we don't care about + * ordering against non-temporal stores (for now at least). 
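Pulling the fabric.h pieces together, one plausible call sequence based only on the prototypes above: query providers, allocate a socket from the first result, set default send flags with FI_SETOPSFLAG as described, and clean up. The negative-errno return convention matches examples/provinfo.c.

#include <rdma/fabric.h>

static int example_open(char *node, char *service)
{
	struct fi_info hints = { .flags = FI_PASSIVE };
	struct fi_info *info;
	uint64_t op_flags = FI_SEND | FI_BUFFERED_SEND;
	fid_t sock;
	int ret;

	ret = fi_getinfo(node, service, &hints, &info);
	if (ret)
		return ret;

	ret = fi_socket(info, &sock, NULL);
	if (!ret) {
		/* default flags for subsequent send operations */
		fi_control(sock, FI_SETOPSFLAG, &op_flags);
		fi_close(sock);
	}
	fi_freeinfo(info);
	return ret;
}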
+ */ +#define mb() asm volatile("lfence" ::: "memory") +#define rmb() mb() +#define wmb() asm volatile("" ::: "memory") +#define wc_wmb() asm volatile("sfence" ::: "memory") + +#elif defined(__PPC64__) + +#define mb() asm volatile("sync" ::: "memory") +#define rmb() asm volatile("lwsync" ::: "memory") +#define wmb() mb() +#define wc_wmb() wmb() + +#elif defined(__ia64__) + +#define mb() asm volatile("mf" ::: "memory") +#define rmb() mb() +#define wmb() mb() +#define wc_wmb() asm volatile("fwb" ::: "memory") + +#elif defined(__PPC__) + +#define mb() asm volatile("sync" ::: "memory") +#define rmb() mb() +#define wmb() mb() +#define wc_wmb() wmb() + +#elif defined(__sparc_v9__) + +#define mb() asm volatile("membar #LoadLoad | #LoadStore | #StoreStore | #StoreLoad" ::: "memory") +#define rmb() asm volatile("membar #LoadLoad" ::: "memory") +#define wmb() asm volatile("membar #StoreStore" ::: "memory") +#define wc_wmb() wmb() + +#elif defined(__sparc__) + +#define mb() asm volatile("" ::: "memory") +#define rmb() mb() +#define wmb() mb() +#define wc_wmb() wmb() + +#else + +#warning No architecture specific defines found. Using generic implementation. + +#define mb() asm volatile("" ::: "memory") +#define rmb() mb() +#define wmb() mb() +#define wc_wmb() wmb() + +#endif + +#endif /* _FI_ARCH_H_ */ diff --git a/include/rdma/fi_atomic.h b/include/rdma/fi_atomic.h new file mode 100644 index 00000000000..f5bb994d4c9 --- /dev/null +++ b/include/rdma/fi_atomic.h @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2013 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenFabrics.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _FI_ATOMIC_H_ +#define _FI_ATOMIC_H_ + +#include <rdma/fi_socket.h> + + +#ifdef __cplusplus +extern "C" { +#endif + + +struct fi_ops_atomic { + size_t size; + /* add/compare_swap */ +}; + + +#ifdef __cplusplus +} +#endif + +#endif /* _FI_ATOMIC_H_ */ diff --git a/include/rdma/fi_cm.h b/include/rdma/fi_cm.h new file mode 100644 index 00000000000..1105ad1e5c5 --- /dev/null +++ b/include/rdma/fi_cm.h @@ -0,0 +1,118 @@ +/* + * Copyright (c) 2013 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
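A toy sketch of the barrier macros above in a flag-guarded hand-off between a producer and a consumer; the shared variables are illustrative only.

#include <rdma/fi_arch.h>

static volatile int ready;
static int payload;

static void producer(int value)
{
	payload = value;
	wmb();		/* publish the payload before the flag */
	ready = 1;
}

static int consumer(void)
{
	while (!ready)
		;
	rmb();		/* do not let the payload read pass the flag read */
	return payload;
}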
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenFabrics.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _FI_CM_H_ +#define _FI_CM_H_ + +#include <rdma/fi_socket.h> + + +#ifdef __cplusplus +extern "C" { +#endif + + +struct fi_ops_cm { + size_t size; + int (*getname)(fid_t fid, void *addr, size_t *addrlen); + int (*getpeer)(fid_t fid, void *addr, size_t *addrlen); + int (*connect)(fid_t fid, const void *param, size_t paramlen); + int (*listen)(fid_t fid); + int (*accept)(fid_t fid, const void *param, size_t paramlen); + int (*reject)(fid_t fid, struct fi_info *info, + const void *param, size_t paramlen); + int (*shutdown)(fid_t fid, uint64_t flags); + int (*join)(fid_t fid, void *addr, void **fi_addr, uint64_t flags); + int (*leave)(fid_t fid, void *addr, void *fi_addr, uint64_t flags); +}; + +static inline int fi_getsockname(fid_t fid, void *addr, size_t *addrlen) +{ + struct fid_socket *sock = container_of(fid, struct fid_socket, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_SOCKET); + FI_ASSERT_OPS(fid, struct fid_socket, cm); + FI_ASSERT_OP(sock->cm, struct fi_ops_cm, getname); + return sock->cm->getname(fid, addr, addrlen); +} + +static inline int fi_listen(fid_t fid) +{ + struct fid_socket *sock = container_of(fid, struct fid_socket, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_SOCKET); + FI_ASSERT_OPS(fid, struct fid_socket, cm); + FI_ASSERT_OP(sock->cm, struct fi_ops_cm, listen); + return sock->cm->listen(fid); +} + +static inline int fi_connect(fid_t fid, const void *param, size_t paramlen) +{ + struct fid_socket *sock = container_of(fid, struct fid_socket, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_SOCKET); + FI_ASSERT_OPS(fid, struct fid_socket, cm); + FI_ASSERT_OP(sock->cm, struct fi_ops_cm, connect); + return sock->cm->connect(fid, param, paramlen); +} + +static inline int fi_accept(fid_t fid, const void *param, size_t paramlen) +{ + struct fid_socket *sock = container_of(fid, struct fid_socket, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_SOCKET); + FI_ASSERT_OPS(fid, struct fid_socket, cm); + FI_ASSERT_OP(sock->cm, struct fi_ops_cm, accept); + return sock->cm->accept(fid, param, paramlen); +} + +static inline int fi_reject(fid_t fid, struct fi_info *info, + const void *param, size_t paramlen) +{ + struct fid_socket *sock = container_of(fid, struct fid_socket, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_SOCKET); + FI_ASSERT_OPS(fid, struct fid_socket, cm); + 
FI_ASSERT_OP(sock->cm, struct fi_ops_cm, reject); + return sock->cm->reject(fid, info, param, paramlen); +} + +static inline int fi_shutdown(fid_t fid, uint64_t flags) +{ + struct fid_socket *sock = container_of(fid, struct fid_socket, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_SOCKET); + FI_ASSERT_OPS(fid, struct fid_socket, cm); + FI_ASSERT_OP(sock->cm, struct fi_ops_cm, shutdown); + return sock->cm->shutdown(fid, flags); +} + + +#ifdef __cplusplus +} +#endif + +#endif /* _FI_CM_H_ */ diff --git a/include/rdma/fi_domain.h b/include/rdma/fi_domain.h new file mode 100644 index 00000000000..152041cc2e9 --- /dev/null +++ b/include/rdma/fi_domain.h @@ -0,0 +1,434 @@ +/* + * Copyright (c) 2013 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenFabrics.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _FI_DOMAIN_H_ +#define _FI_DOMAIN_H_ + +#include <rdma/fabric.h> + + +#ifdef __cplusplus +extern "C" { +#endif + + +struct fi_iomv { + void *addr; + size_t len; + uint64_t mem_desc; +}; + +/* TODO: Will this be used? */ +struct fi_iotagged { + uint64_t itag_addr; + be64_t itag_tag; + be64_t itag_mask; +}; + +/* TODO: Will this be used? */ +struct fi_iotaggedv { + uint64_t itag_addr; + be64_t itag_tag; + be64_t itag_mask; + uint64_t itag_desc; +}; + +/* + * AV = Address Vector + * Maps and stores transport/network addresses. + */ + +struct fi_av_addr { + fid_t av; + uint64_t av_index; +}; + +enum fi_av_type { + FI_AV_MAP, + FI_AV_TABLE +}; + +enum { + FI_AV_ATTR_TYPE = 1 << 0, + FI_AV_ATTR_ADDR_FORMAT = 1 << 1, + FI_AV_ATTR_ADDRLEN = 1 << 2, + FI_AV_ATTR_SIZE = 1 << 3, + FI_AV_ATTR_FLAGS = 1 << 4, + FI_AV_ATTR_MASK_V1 = (FI_AV_ATTR_FLAGS << 1) - 1 +}; + +struct fi_av_attr { + int av_mask; + enum fi_av_type type; + enum fi_addr_format addr_format; + size_t addrlen; + size_t count; + uint64_t flags; +}; + +struct fi_ops_av { + size_t size; + int (*insert)(fid_t fid, const void *addr, size_t count, + void **fi_addr, uint64_t flags); + int (*remove)(fid_t fid, void *fi_addr, size_t count, + uint64_t flags); +}; + +struct fid_av { + struct fid fid; + struct fi_ops_av *ops; +}; + + +/* + * MR = Memory Region + * Tracks registered memory regions, primarily for remote access, + * but also for local access until we can remove that need. 
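The connection-management wrappers above split naturally into an active and a passive side; a minimal sketch follows, with event handling (waiting for FI_CONNREQ/FI_CONNECTED on an event collector) omitted and the private data purely illustrative.

#include <rdma/fi_cm.h>

static int example_client(fid_t sock)
{
	const char greeting[] = "hello";

	return fi_connect(sock, greeting, sizeof greeting);
}

static int example_server(fid_t passive_sock)
{
	/* ...retrieve the FI_CONNREQ event, open a socket for the new
	 * connection, then fi_accept() on that socket... */
	return fi_listen(passive_sock);
}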
+ */ +struct fid_mr { + struct fid fid; + uint64_t mem_desc; + be64_t key; +}; + + +/* + * EC = Event Collector + * Used to report various events and the completion of asynchronous + * operations. + */ +enum fi_ec_domain { + FI_EC_DOMAIN_GENERAL, + FI_EC_DOMAIN_COMP, + FI_EC_DOMAIN_CM, + FI_EC_DOMAIN_AV +}; + +enum fi_ec_type { + FI_EC_QUEUE, + FI_EC_COUNTER +}; + +enum fi_ec_format { + FI_EC_FORMAT_UNSPEC, + FI_EC_FORMAT_CONTEXT, + FI_EC_FORMAT_COMP, + FI_EC_FORMAT_DATA, + FI_EC_FORMAT_TAGGED, + FI_EC_FORMAT_ERR, + FI_EC_FORMAT_CM +}; + +/* Use fi_control GETECWAIT to get underlying wait object */ +enum fi_ec_wait_obj { + FI_EC_WAIT_NONE, + FI_EC_WAIT_FD +}; + +enum fi_ec_wait_cond { + FI_EC_COND_NONE, + FI_EC_COND_THRESHOLD /* size_t threshold */ +}; + +enum { + FI_EC_ATTR_DOMAIN = 1 << 0, + FI_EC_ATTR_TYPE = 1 << 1, + FI_EC_ATTR_FORMAT = 1 << 2, + FI_EC_ATTR_WAIT_OBJ = 1 << 3, + FI_EC_ATTR_WAIT_COND = 1 << 4, + FI_EC_ATTR_SIZE = 1 << 5, + FI_EC_ATTR_VECTOR = 1 << 6, + FI_EC_ATTR_FLAGS = 1 << 7, + FI_EC_ATTR_COND = 1 << 8, + FI_EC_ATTR_MASK_V1 = (FI_EC_ATTR_COND << 1) - 1 +}; + +struct fi_ec_attr { + int ec_mask; + enum fi_ec_domain domain; + enum fi_ec_type type; + enum fi_ec_format format; + enum fi_ec_wait_obj wait_obj; + enum fi_ec_wait_cond wait_cond; + size_t size; + int signaling_vector; + uint64_t flags; + /* If AUTO_RESET is enabled, and wait_cond is not NONE */ + void *cond; +}; + +struct fi_ec_entry { + void *op_context; +}; + +struct fi_ec_comp_entry { + void *op_context; + uint64_t flags; + size_t len; +}; + +struct fi_ec_data_entry { + void *op_context; + void *buf; + uint64_t flags; + size_t len; + /* data depends on operation and/or flags - e.g. immediate data */ + uint64_t data; +}; + +struct fi_ec_tagged_entry { + void *op_context; + void *buf; + uint64_t flags; + size_t len; + uint64_t data; + uint64_t tag; + size_t olen; +}; + +struct fi_ec_err_entry { + void *fid_context; + void *op_context; + uint64_t flags; + int err; + int prov_errno; + uint64_t data; + /* prov_data is available until the next time the EQ is read */ + void *prov_data; +}; + +enum fi_cm_event { + FI_CONNREQ, + FI_CONNECTED, + FI_SHUTDOWN +}; + +struct fi_ec_cm_entry { + void *fid_context; + uint64_t flags; + enum fi_cm_event event; + /* user must call fi_freeinfo to release info */ + struct fi_info *info; + /* connection data placed here, up to space provided */ + uint8_t data[0]; +}; + +struct fi_ops_ec { + size_t size; + ssize_t (*read)(fid_t fid, void *buf, size_t len); + ssize_t (*readfrom)(fid_t fid, void *buf, size_t len, + void *src_addr, size_t *addrlen); + ssize_t (*readerr)(fid_t fid, void *buf, size_t len, uint64_t flags); + ssize_t (*write)(fid_t fid, void *buf, size_t len); + int (*reset)(fid_t fid, void *cond); + ssize_t (*condread)(fid_t fid, void *buf, size_t len, void *cond); + ssize_t (*condreadfrom)(fid_t fid, void *buf, size_t len, + void *src_addr, size_t *addrlen, void *cond); + const char * (*strerror)(fid_t fid, int prov_errno, void *prov_data, + void *buf, size_t len); +}; + +struct fid_ec { + struct fid fid; + struct fi_ops_ec *ops; +}; + + +enum fi_progress { + FI_PROGRESS_AUTO, + FI_PROGRESS_INDIRECT, /* progress possible through any domain call */ + FI_PROGRESS_EXPLICIT /* user must explicitly request progress */ +}; + +/* + * The thought is that domain attributes should be relative to what it can + * provide to the applications, and is not intended as a set of available + * hardware limits. 
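A rough polling loop against the event collector interfaces above, using the fi_ec_* wrappers defined later in this header; the attribute selection and the assumption that a zero return means the queue is empty are both illustrative.

#include <stdio.h>
#include <rdma/fi_domain.h>

static int example_poll(fid_t domain)
{
	struct fi_ec_attr attr = {
		.ec_mask = FI_EC_ATTR_DOMAIN | FI_EC_ATTR_TYPE |
			   FI_EC_ATTR_FORMAT | FI_EC_ATTR_SIZE,
		.domain = FI_EC_DOMAIN_COMP,
		.type = FI_EC_QUEUE,
		.format = FI_EC_FORMAT_COMP,
		.size = 64,
	};
	struct fi_ec_comp_entry comp;
	struct fi_ec_err_entry err;
	fid_t ec;
	ssize_t ret;

	ret = fi_ec_open(domain, &attr, &ec, NULL);
	if (ret)
		return (int) ret;

	for (;;) {
		ret = fi_ec_read(ec, &comp, sizeof comp);
		if (ret > 0)
			break;		/* completion retrieved */
		if (ret < 0) {
			fi_ec_readerr(ec, &err, sizeof err, 0);
			fprintf(stderr, "ec error %d\n", err.err);
			break;
		}
		/* 0: nothing queued yet, poll again */
	}
	fi_close(ec);
	return 0;
}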
+ */ +struct fi_domain_attr { + /* Note to providers: set prov_attr to static struct */ + size_t prov_attr_size; + void *prov_attr; + size_t max_auth_key_size; + enum fi_progress progress; +}; + +struct fi_ops_domain { + size_t size; + int (*progress)(fid_t fid); + int (*query)(fid_t fid, struct fi_domain_attr *attr, size_t *attrlen); + int (*av_open)(fid_t fid, struct fi_av_attr *attr, fid_t *av, + void *context); + int (*ec_open)(fid_t fid, struct fi_ec_attr *attr, fid_t *ec, + void *context); + int (*mr_reg)(fid_t fid, const void *buf, size_t len, fid_t *mr, + uint64_t flags, void *context); + int (*mr_regv)(fid_t fid, const struct iovec *iov, size_t count, + fid_t *mr, uint64_t flags, void *context); +}; + +struct fid_domain { + struct fid fid; + struct fi_ops_domain *ops; +}; + +static inline int fi_ec_open(fid_t fid, struct fi_ec_attr *attr, fid_t *ec, + void *context) +{ + struct fid_domain *domain = container_of(fid, struct fid_domain, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_RESOURCE_DOMAIN); + FI_ASSERT_OPS(fid, struct fid_domain, ops); + FI_ASSERT_OP(domain->ops, struct fi_ops_domain, ec_open); + return domain->ops->ec_open(fid, attr, ec, context); +} + +static inline ssize_t fi_ec_read(fid_t fid, void *buf, size_t len) +{ + struct fid_ec *ec = container_of(fid, struct fid_ec, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_EC); + FI_ASSERT_OPS(fid, struct fid_ec, ops); + FI_ASSERT_OP(ec->ops, struct fi_ops_ec, read); + return ec->ops->read(fid, buf, len); +} + +static inline ssize_t fi_ec_readfrom(fid_t fid, void *buf, size_t len, + void *src_addr, size_t *addrlen) +{ + struct fid_ec *ec = container_of(fid, struct fid_ec, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_EC); + FI_ASSERT_OPS(fid, struct fid_ec, ops); + FI_ASSERT_OP(ec->ops, struct fi_ops_ec, readfrom); + return ec->ops->readfrom(fid, buf, len, src_addr, addrlen); +} + +static inline ssize_t fi_ec_readerr(fid_t fid, void *buf, size_t len, uint64_t flags) +{ + struct fid_ec *ec = container_of(fid, struct fid_ec, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_EC); + FI_ASSERT_OPS(fid, struct fid_ec, ops); + FI_ASSERT_OP(ec->ops, struct fi_ops_ec, readerr); + return ec->ops->readerr(fid, buf, len, flags); +} + +static inline int fi_ec_reset(fid_t fid, void *cond) +{ + struct fid_ec *ec = container_of(fid, struct fid_ec, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_EC); + FI_ASSERT_OPS(fid, struct fid_ec, ops); + FI_ASSERT_OP(ec->ops, struct fi_ops_ec, reset); + return ec->ops->reset(fid, cond); +} + +static inline const char * fi_ec_strerror(fid_t fid, int prov_errno, void *prov_data, + void *buf, size_t len) +{ + struct fid_ec *ec = container_of(fid, struct fid_ec, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_EC); + FI_ASSERT_OPS(fid, struct fid_ec, ops); + FI_ASSERT_OP(ec->ops, struct fi_ops_ec, strerror); + return ec->ops->strerror(fid, prov_errno, prov_data, buf, len); +} + +static inline int fi_mr_reg(fid_t fid, const void *buf, size_t len, + fid_t *mr, uint64_t flags, void *context) +{ + struct fid_domain *domain = container_of(fid, struct fid_domain, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_RESOURCE_DOMAIN); + FI_ASSERT_OPS(fid, struct fid_domain, ops); + FI_ASSERT_OP(domain->ops, struct fi_ops_domain, mr_reg); + return domain->ops->mr_reg(fid, buf, len, mr, flags, context); +} + +static inline uint64_t fi_mr_desc(fid_t fid) +{ + struct fid_mr *mr = container_of(fid, struct fid_mr, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_MR); + FI_ASSERT_FIELD(fid, struct fid_mr, mem_desc); + return mr->mem_desc; +} + +static inline be64_t fi_mr_key(fid_t fid) +{ + 
struct fid_mr *mr = container_of(fid, struct fid_mr, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_MR); + FI_ASSERT_FIELD(fid, struct fid_mr, key); + return mr->key; +} + +static inline int fi_mr_unreg(fid_t fid) +{ + FI_ASSERT_CLASS(fid, FID_CLASS_MR); + return fi_close(fid); +} + +static inline int fi_av_open(fid_t fid, struct fi_av_attr *attr, fid_t *av, + void *context) +{ + struct fid_domain *domain = container_of(fid, struct fid_domain, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_RESOURCE_DOMAIN); + FI_ASSERT_OPS(fid, struct fid_domain, ops); + FI_ASSERT_OP(domain->ops, struct fi_ops_domain, av_open); + return domain->ops->av_open(fid, attr, av, context); +} + +static inline int fi_av_map(fid_t fid, const void *addr, size_t count, + void **fi_addr, uint64_t flags) +{ + struct fid_av *av = container_of(fid, struct fid_av, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_AV); + FI_ASSERT_OPS(fid, struct fid_av, ops); + FI_ASSERT_OP(av->ops, struct fi_ops_av, insert); + return av->ops->insert(fid, addr, count, fi_addr, flags); +} + +static inline int fi_av_unmap(fid_t fid, void *fi_addr, size_t count, + uint64_t flags) +{ + struct fid_av *av = container_of(fid, struct fid_av, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_AV); + FI_ASSERT_OPS(fid, struct fid_av, ops); + FI_ASSERT_OP(av->ops, struct fi_ops_av, remove); + return av->ops->remove(fid, fi_addr, count, flags); +} + +static inline int fi_av_sync(fid_t fid, uint64_t flags, void *context) +{ + FI_ASSERT_CLASS(fid, FID_CLASS_AV); + return fi_sync(fid, flags, context); +} + + +#ifdef __cplusplus +} +#endif + +#endif /* _FI_DOMAIN_H_ */ diff --git a/include/rdma/fi_errno.h b/include/rdma/fi_errno.h new file mode 100644 index 00000000000..980f108d133 --- /dev/null +++ b/include/rdma/fi_errno.h @@ -0,0 +1,179 @@ +/* + * Copyright (c) 2013 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenFabrics.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
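Memory registration and address vectors tie into the domain the same way; a minimal sketch using the wrappers above, where domain is assumed to be an open resource domain and peers holds up to 16 sockaddr_in entries.

#include <netinet/in.h>
#include <rdma/fi_domain.h>

static int example_mr_av(fid_t domain, void *buf, size_t len,
			 struct sockaddr_in *peers, size_t npeers)
{
	struct fi_av_attr av_attr = {
		.av_mask = FI_AV_ATTR_TYPE | FI_AV_ATTR_ADDR_FORMAT,
		.type = FI_AV_MAP,
		.addr_format = FI_SOCKADDR_IN,
	};
	void *fi_addrs[16];
	fid_t mr, av;
	int ret;

	if (npeers > 16)
		npeers = 16;

	ret = fi_mr_reg(domain, buf, len, &mr, FI_READ | FI_WRITE, NULL);
	if (ret)
		return ret;
	/* fi_mr_desc(mr) and fi_mr_key(mr) feed the data transfer calls */

	ret = fi_av_open(domain, &av_attr, &av, NULL);
	if (!ret) {
		ret = fi_av_map(av, peers, npeers, fi_addrs, 0);
		fi_close(av);
	}
	fi_mr_unreg(mr);
	return ret;
}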
+ */
+
+#ifndef _FI_ERRNO_H_
+#define _FI_ERRNO_H_
+
+#include <errno.h>
+
+/* FI directly mapped errno values */
+
+#define FI_EPERM EPERM /* Operation not permitted */
+#define FI_ENOENT ENOENT /* No such file or directory */
+#define FI_ESRCH ESRCH /* No such process */
+#define FI_EINTR EINTR /* Interrupted system call */
+#define FI_EIO EIO /* I/O error */
+#define FI_ENXIO ENXIO /* No such device or address */
+#define FI_E2BIG E2BIG /* Argument list too long */
+#define FI_ENOEXEC ENOEXEC /* Exec format error */
+#define FI_EBADF EBADF /* Bad file number */
+#define FI_ECHILD ECHILD /* No child processes */
+#define FI_EAGAIN EAGAIN /* Try again */
+#define FI_ENOMEM ENOMEM /* Out of memory */
+#define FI_EACCES EACCES /* Permission denied */
+#define FI_EFAULT EFAULT /* Bad address */
+#define FI_ENOTBLK ENOTBLK /* Block device required */
+#define FI_EBUSY EBUSY /* Device or resource busy */
+#define FI_EEXIST EEXIST /* File exists */
+#define FI_EXDEV EXDEV /* Cross-device link */
+#define FI_ENODEV ENODEV /* No such device */
+#define FI_ENOTDIR ENOTDIR /* Not a directory */
+#define FI_EISDIR EISDIR /* Is a directory */
+#define FI_EINVAL EINVAL /* Invalid argument */
+#define FI_ENFILE ENFILE /* File table overflow */
+#define FI_EMFILE EMFILE /* Too many open files */
+#define FI_ENOTTY ENOTTY /* Not a typewriter */
+#define FI_ETXTBSY ETXTBSY /* Text file busy */
+#define FI_EFBIG EFBIG /* File too large */
+#define FI_ENOSPC ENOSPC /* No space left on device */
+#define FI_ESPIPE ESPIPE /* Illegal seek */
+#define FI_EROFS EROFS /* Read-only file system */
+#define FI_EMLINK EMLINK /* Too many links */
+#define FI_EPIPE EPIPE /* Broken pipe */
+#define FI_EDOM EDOM /* Math argument out of domain of func */
+#define FI_ERANGE ERANGE /* Math result not representable */
+#define FI_EDEADLK EDEADLK /* Resource deadlock would occur */
+#define FI_ENAMETOOLONG ENAMETOOLONG /* File name too long */
+#define FI_ENOLCK ENOLCK /* No record locks available */
+#define FI_ENOSYS ENOSYS /* Function not implemented */
+#define FI_ENOTEMPTY ENOTEMPTY /* Directory not empty */
+#define FI_ELOOP ELOOP /* Too many symbolic links encountered */
+#define FI_EWOULDBLOCK EWOULDBLOCK /* Operation would block */
+#define FI_ENOMSG ENOMSG /* No message of desired type */
+#define FI_EIDRM EIDRM /* Identifier removed */
+#define FI_ECHRNG ECHRNG /* Channel number out of range */
+#define FI_EL2NSYNC EL2NSYNC /* Level 2 not synchronized */
+#define FI_EL3HLT EL3HLT /* Level 3 halted */
+#define FI_EL3RST EL3RST /* Level 3 reset */
+#define FI_ELNRNG ELNRNG /* Link number out of range */
+#define FI_EUNATCH EUNATCH /* Protocol driver not attached */
+#define FI_ENOCSI ENOCSI /* No CSI structure available */
+#define FI_EL2HLT EL2HLT /* Level 2 halted */
+#define FI_EBADE EBADE /* Invalid exchange */
+#define FI_EBADR EBADR /* Invalid request descriptor */
+#define FI_EXFULL EXFULL /* Exchange full */
+#define FI_ENOANO ENOANO /* No anode */
+#define FI_EBADRQC EBADRQC /* Invalid request code */
+#define FI_EBADSLT EBADSLT /* Invalid slot */
+#define FI_EDEADLOCK EDEADLOCK /* Resource deadlock would occur */
+#define FI_EBFONT EBFONT /* Bad font file format */
+#define FI_ENOSTR ENOSTR /* Device not a stream */
+#define FI_ENODATA ENODATA /* No data available */
+#define FI_ETIME ETIME /* Timer expired */
+#define FI_ENOSR ENOSR /* Out of streams resources */
+#define FI_ENONET ENONET /* Machine is not on the network */
+#define FI_ENOPKG ENOPKG /* Package not installed */
+#define FI_EREMOTE EREMOTE
/* Object is remote */
+#define FI_ENOLINK ENOLINK /* Link has been severed */
+#define FI_EADV EADV /* Advertise error */
+#define FI_ESRMNT ESRMNT /* Srmount error */
+#define FI_ECOMM ECOMM /* Communication error on send */
+#define FI_EPROTO EPROTO /* Protocol error */
+#define FI_EMULTIHOP EMULTIHOP /* Multihop attempted */
+#define FI_EDOTDOT EDOTDOT /* RFS specific error */
+#define FI_EBADMSG EBADMSG /* Not a data message */
+#define FI_EOVERFLOW EOVERFLOW /* Value too large for defined data type */
+#define FI_ENOTUNIQ ENOTUNIQ /* Name not unique on network */
+#define FI_EBADFD EBADFD /* File descriptor in bad state */
+#define FI_EREMCHG EREMCHG /* Remote address changed */
+#define FI_ELIBACC ELIBACC /* Can not access a needed shared library */
+#define FI_ELIBBAD ELIBBAD /* Accessing a corrupted shared library */
+#define FI_ELIBSCN ELIBSCN /* .lib section in a.out corrupted */
+#define FI_ELIBMAX ELIBMAX /* Attempting to link in too many shared libraries */
+#define FI_ELIBEXEC ELIBEXEC /* Cannot exec a shared library directly */
+#define FI_EILSEQ EILSEQ /* Illegal byte sequence */
+#define FI_ERESTART ERESTART /* Interrupted system call should be restarted */
+#define FI_ESTRPIPE ESTRPIPE /* Streams pipe error */
+#define FI_EUSERS EUSERS /* Too many users */
+#define FI_ENOTSOCK ENOTSOCK /* Socket operation on non-socket */
+#define FI_EDESTADDRREQ EDESTADDRREQ /* Destination address required */
+#define FI_EMSGSIZE EMSGSIZE /* Message too long */
+#define FI_EPROTOTYPE EPROTOTYPE /* Protocol wrong type for socket */
+#define FI_ENOPROTOOPT ENOPROTOOPT /* Protocol not available */
+#define FI_EPROTONOSUPPORT EPROTONOSUPPORT /* Protocol not supported */
+#define FI_ESOCKTNOSUPPORT ESOCKTNOSUPPORT /* Socket type not supported */
+#define FI_EOPNOTSUPP EOPNOTSUPP /* Operation not supported on transport endpoint */
+#define FI_EPFNOSUPPORT EPFNOSUPPORT /* Protocol family not supported */
+#define FI_EAFNOSUPPORT EAFNOSUPPORT /* Address family not supported by protocol */
+#define FI_EADDRINUSE EADDRINUSE /* Address already in use */
+#define FI_EADDRNOTAVAIL EADDRNOTAVAIL /* Cannot assign requested address */
+#define FI_ENETDOWN ENETDOWN /* Network is down */
+#define FI_ENETUNREACH ENETUNREACH /* Network is unreachable */
+#define FI_ENETRESET ENETRESET /* Network dropped connection because of reset */
+#define FI_ECONNABORTED ECONNABORTED /* Software caused connection abort */
+#define FI_ECONNRESET ECONNRESET /* Connection reset by peer */
+#define FI_ENOBUFS ENOBUFS /* No buffer space available */
+#define FI_EISCONN EISCONN /* Transport endpoint is already connected */
+#define FI_ENOTCONN ENOTCONN /* Transport endpoint is not connected */
+#define FI_ESHUTDOWN ESHUTDOWN /* Cannot send after transport endpoint shutdown */
+#define FI_ETOOMANYREFS ETOOMANYREFS /* Too many references: cannot splice */
+#define FI_ETIMEDOUT ETIMEDOUT /* Connection timed out */
+#define FI_ECONNREFUSED ECONNREFUSED /* Connection refused */
+#define FI_EHOSTDOWN EHOSTDOWN /* Host is down */
+#define FI_EHOSTUNREACH EHOSTUNREACH /* No route to host */
+#define FI_EALREADY EALREADY /* Operation already in progress */
+#define FI_EINPROGRESS EINPROGRESS /* Operation now in progress */
+#define FI_ESTALE ESTALE /* Stale NFS file handle */
+#define FI_EUCLEAN EUCLEAN /* Structure needs cleaning */
+#define FI_ENOTNAM ENOTNAM /* Not a XENIX named type file */
+#define FI_ENAVAIL ENAVAIL /* No XENIX semaphores available */
+#define FI_EISNAM EISNAM /* Is a named type file */
+#define FI_EREMOTEIO
EREMOTEIO /* Remote I/O error */ +#define FI_EDQUOT EDQUOT /* Quota exceeded */ +#define FI_ENOMEDIUM ENOMEDIUM /* No medium found */ +#define FI_EMEDIUMTYPE EMEDIUMTYPE /* Wrong medium type */ +#define FI_ECANCELED ECANCELED /* Operation Canceled */ +#define FI_ENOKEY ENOKEY /* Required key not available */ +#define FI_EKEYEXPIRED EKEYEXPIRED /* Key has expired */ +#define FI_EKEYREVOKED EKEYREVOKED /* Key has been revoked */ +#define FI_EKEYREJECTED EKEYREJECTED /* Key was rejected by service */ +#define FI_EOWNERDEAD EOWNERDEAD /* Owner died */ +#define FI_ENOTRECOVERABLE ENOTRECOVERABLE /* State not recoverable */ + +/* FI specific return values: >= 256 */ + +#define FI_EOTHER 256 /* Unspecified error */ +#define FI_ETOOSMALL 257 /* Provided buffer is too small */ + +const char *fi_strerror(int errnum); + +#endif /* _FI_ERRNO_H_ */ diff --git a/include/rdma/fi_prov.h b/include/rdma/fi_prov.h new file mode 100644 index 00000000000..90cc52b6746 --- /dev/null +++ b/include/rdma/fi_prov.h @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005, 2006 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2005 PathScale, Inc. All rights reserved. + * Copyright (c) 2013 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenFabrics.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _FI_PROV_H_ +#define _FI_PROV_H_ + +#include <rdma/fabric.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Extension that low-level drivers should add to their .so filename + * (probably via libtool "-release" option). For example a low-level + * driver named "libfoo" should build a plug-in named "libfoo-fi.so". 
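Callers are expected to see these values as negative returns, as examples/provinfo.c already does with strerror(); a small helper along those lines:

#include <stdio.h>
#include <rdma/fi_errno.h>

static void example_report(const char *call, int ret)
{
	if (ret < 0)
		fprintf(stderr, "%s: %s (%d)\n", call, fi_strerror(-ret), -ret);
}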
+ */ +#define FI_LIB_EXTENSION fi + +struct fi_ops_prov { + size_t size; + int (*getinfo)(char *node, char *service, struct fi_info *hints, + struct fi_info **info); + int (*freeinfo)(struct fi_info *info); + int (*socket)(struct fi_info *info, fid_t *fid, void *context); + int (*open)(const char *name, struct fi_info *info, uint64_t flags, + fid_t *fid, void *context); +}; + +void fi_register(struct fi_ops_prov *ops); + +#define FI_LIB_CLASS_NAME "libfabric" + +struct fi_ops_lib { + size_t size; + size_t (*context_size)(void); + const char * (*sysfs_path)(void); + int (*read_file)(const char *dir, const char *file, + char *buf, size_t size); +}; + +#ifdef __cplusplus +} +#endif + +#endif /* _FI_PROV_H_ */ diff --git a/include/rdma/fi_rdma.h b/include/rdma/fi_rdma.h new file mode 100644 index 00000000000..27ede3537b7 --- /dev/null +++ b/include/rdma/fi_rdma.h @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2013 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenFabrics.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
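A built-in provider registers a static fi_ops_prov through fi_register(), typically from its *_ini() hook declared in include/fi.h; the "example" names and the stub getinfo below are placeholders.

#include <rdma/fabric.h>
#include <rdma/fi_errno.h>
#include <rdma/fi_prov.h>

static int example_getinfo(char *node, char *service, struct fi_info *hints,
			   struct fi_info **info)
{
	return -FI_ENOSYS;	/* stub */
}

static struct fi_ops_prov example_prov_ops = {
	.size = sizeof(struct fi_ops_prov),
	.getinfo = example_getinfo,
};

void example_ini(void)
{
	fi_register(&example_prov_ops);
}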
+ */ + +#ifndef _FI_RDMA_H_ +#define _FI_RDMA_H_ + +#include <rdma/fi_socket.h> + + +#ifdef __cplusplus +extern "C" { +#endif + + +struct fi_rdma_iov { + uint64_t addr; + size_t len; + be64_t key; +}; + +struct fi_msg_rdma { + const void *msg_iov; + size_t iov_count; + const void *addr; + const struct fi_rdma_iov *rdma_iov; + size_t rdma_iov_count; + void *context; + uint64_t data; +}; + +struct fi_ops_rdma { + size_t size; + int (*read)(fid_t fid, void *buf, size_t len, uint64_t addr, + be64_t key, void *context); + int (*readmem)(fid_t fid, void *buf, size_t len, uint64_t mem_desc, + uint64_t addr, be64_t key, void *context); + int (*readv)(fid_t fid, const void *iov, size_t count, uint64_t addr, + be64_t key, void *context); + int (*readfrom)(fid_t fid, void *buf, size_t len, const void *src_addr, + uint64_t addr, be64_t key, void *context); + int (*readmemfrom)(fid_t fid, void *buf, size_t len, uint64_t mem_desc, + const void *src_addr, uint64_t addr, be64_t key, + void *context); + int (*readmsg)(fid_t fid, const struct fi_msg_rdma *msg, uint64_t flags); + int (*write)(fid_t fid, const void *buf, size_t len, uint64_t addr, + be64_t key, void *context); + int (*writemem)(fid_t fid, const void *buf, size_t len, uint64_t mem_desc, + uint64_t addr, be64_t key, void *context); + int (*writev)(fid_t fid, const void *iov, size_t count, uint64_t addr, + be64_t key, void *context); + int (*writememto)(fid_t fid, const void *buf, size_t len, uint64_t mem_desc, + const void *dst_addr, uint64_t addr, be64_t key, + void *context); + int (*writeto)(fid_t fid, const void *buf, size_t len, const void *dst_addr, + uint64_t addr, be64_t key, void *context); + int (*writemsg)(fid_t fid, const struct fi_msg_rdma *msg, uint64_t flags); +}; + + +#ifdef __cplusplus +} +#endif + +#endif /* _FI_RDMA_H_ */ diff --git a/include/rdma/fi_socket.h b/include/rdma/fi_socket.h new file mode 100644 index 00000000000..544a6ddcb59 --- /dev/null +++ b/include/rdma/fi_socket.h @@ -0,0 +1,187 @@ +/* + * Copyright (c) 2013 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenFabrics.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
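A sketch of a one-sided write through the operation table above; the rdma pointer would come from the socket's fi_ops_rdma member (defined in fi_socket.h below), and addr/key describe a remote region advertised by the peer.

#include <rdma/fi_rdma.h>

static int example_rdma_write(fid_t sock, struct fi_ops_rdma *rdma,
			      const void *buf, size_t len,
			      uint64_t addr, be64_t key, void *context)
{
	return rdma->write(sock, buf, len, addr, key, context);
}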
+ */ + +#ifndef _FI_SOCKET_H_ +#define _FI_SOCKET_H_ + +#include <sys/socket.h> +#include <rdma/fabric.h> +#include <stddef.h> + + +#ifdef __cplusplus +extern "C" { +#endif + + +enum fid_type { + FID_UNSPEC, + FID_MSG, /* pick a better name */ + FID_STREAM, + FID_DGRAM, + FID_RAW, + FID_RDM, + FID_PACKET, + FID_MAX +}; + +#define FID_TYPE_MASK 0xFF + +enum fi_proto { + FI_PROTO_UNSPEC, + FI_PROTO_IB_RC, + FI_PROTO_IWARP, + FI_PROTO_IB_UC, + FI_PROTO_IB_UD, + FI_PROTO_IB_XRC, + FI_PROTO_RAW, + FI_PROTO_MAX +}; + +#define FI_PROTO_MASK 0xFF +#define FI_PROTO_MSG (1ULL << 8) +#define FI_PROTO_RDMA (1ULL << 9) +#define FI_PROTO_TAGGED (1ULL << 10) +#define FI_PROTO_ATOMICS (1ULL << 11) +#define FI_PROTO_MULTICAST (1ULL << 12) /* multicast uses MSG ops */ +/*#define FI_PROTO_COLLECTIVES (1ULL << 13)*/ + +struct fi_msg { + const void *msg_iov; + size_t iov_count; + const void *addr; + void *context; + uint64_t data; +}; + +struct fi_ops_sock { + size_t size; + ssize_t (*cancel)(fid_t fid, struct fi_context *context); + /* syncto? (fid_t fid, void *addr, uint64_t flags, void *context); */ + int (*getopt)(fid_t fid, int level, int optname, + void *optval, size_t *optlen); + int (*setopt)(fid_t fid, int level, int optname, + const void *optval, size_t optlen); +}; + +struct fi_ops_msg { + size_t size; + ssize_t (*recv)(fid_t fid, void *buf, size_t len, void *context); + ssize_t (*recvmem)(fid_t fid, void *buf, size_t len, uint64_t mem_desc, + void *context); + ssize_t (*recvv)(fid_t fid, const void *iov, size_t count, void *context); + ssize_t (*recvfrom)(fid_t fid, void *buf, size_t len, + const void *src_addr, void *context); + ssize_t (*recvmemfrom)(fid_t fid, void *buf, size_t len, uint64_t mem_desc, + const void *src_addr, void *context); + ssize_t (*recvmsg)(fid_t fid, const struct fi_msg *msg, uint64_t flags); + ssize_t (*send)(fid_t fid, const void *buf, size_t len, void *context); + ssize_t (*sendmem)(fid_t fid, const void *buf, size_t len, + uint64_t mem_desc, void *context); + ssize_t (*sendv)(fid_t fid, const void *iov, size_t count, void *context); + ssize_t (*sendto)(fid_t fid, const void *buf, size_t len, + const void *dest_addr, void *context); + ssize_t (*sendmemto)(fid_t fid, const void *buf, size_t len, uint64_t mem_desc, + const void *dest_addr, void *context); + ssize_t (*sendmsg)(fid_t fid, const struct fi_msg *msg, uint64_t flags); +}; + +struct fi_ops_cm; +struct fi_ops_rdma; +struct fi_ops_tagged; +/* struct fi_ops_atomic; */ +/* struct fi_ops_collectives; */ + +/* + * Calls which modify the properties of a socket (control, setopt, bind, ...) + * must be serialized against all other operations. Those calls may modify the + * operations referenced by a socket in order to optimize the data transfer code + * paths. + * + * A provider may allocate the minimal size structure needed to support the + * ops requested by the user. 
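+ *
+ * For illustration only (a sketch of one possible provider strategy,
+ * not a requirement of this API): a provider could publish a minimal
+ * set of message ops when the socket is created, then switch the
+ * socket to an optimized ops vector from inside a serialized property
+ * call such as bind.  The names below are hypothetical and the
+ * argument list is abbreviated:
+ *
+ *	static struct fi_ops_msg generic_msg_ops;	(hypothetical)
+ *	static struct fi_ops_msg inline_msg_ops;	(hypothetical)
+ *
+ *	int sample_prov_bind(fid_t fid)
+ *	{
+ *		struct fid_socket *sock;
+ *
+ *		sock = container_of(fid, struct fid_socket, fid);
+ *		sock->msg = &inline_msg_ops;	(swap in fast-path ops)
+ *		return 0;
+ *	}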
+ */ +struct fid_socket { + struct fid fid; + struct fi_ops_sock *ops; + struct fi_ops_msg *msg; + struct fi_ops_cm *cm; + struct fi_ops_rdma *rdma; + struct fi_ops_tagged *tagged; + /* struct fi_ops_atomics *atomic; */ +}; + +static inline ssize_t fi_cancel(fid_t fid, struct fi_context *context) +{ + struct fid_socket *sock = container_of(fid, struct fid_socket, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_SOCKET); + FI_ASSERT_OPS(fid, struct fid_socket, ops); + FI_ASSERT_OP(sock->ops, struct fi_ops_sock, cancel); + return sock->ops->cancel(fid, context); +} + +static inline ssize_t fi_setsockopt(fid_t fid, int level, int optname, + const void *optval, size_t optlen) +{ + struct fid_socket *sock = container_of(fid, struct fid_socket, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_SOCKET); + FI_ASSERT_OPS(fid, struct fid_socket, ops); + FI_ASSERT_OP(sock->ops, struct fi_ops_sock, setopt); + return sock->ops->setopt(fid, level, optname, optval, optlen); +} + +static inline ssize_t fi_recvmem(fid_t fid, void *buf, size_t len, + uint64_t mem_desc, void *context) +{ + struct fid_socket *sock = container_of(fid, struct fid_socket, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_SOCKET); + FI_ASSERT_OPS(fid, struct fid_socket, msg); + FI_ASSERT_OP(sock->msg, struct fi_ops_msg, recvmem); + return sock->msg->recvmem(fid, buf, len, mem_desc, context); +} + +static inline ssize_t fi_sendmem(fid_t fid, void *buf, size_t len, + uint64_t mem_desc, void *context) +{ + struct fid_socket *sock = container_of(fid, struct fid_socket, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_SOCKET); + FI_ASSERT_OPS(fid, struct fid_socket, msg); + FI_ASSERT_OP(sock->msg, struct fi_ops_msg, sendmem); + return sock->msg->sendmem(fid, buf, len, mem_desc, context); +} + + +#ifdef __cplusplus +} +#endif + +#endif /* _FI_SOCKET_H_ */ diff --git a/include/rdma/fi_tagged.h b/include/rdma/fi_tagged.h new file mode 100644 index 00000000000..b1631d84079 --- /dev/null +++ b/include/rdma/fi_tagged.h @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2013 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenFabrics.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef _FI_TAGGED_H_ +#define _FI_TAGGED_H_ + +#include <assert.h> +#include <rdma/fi_socket.h> + + +#ifdef __cplusplus +extern "C" { +#endif + +struct fi_msg_tagged { + const void *msg_iov; + size_t iov_count; + const void *addr; + be64_t tag; + be64_t mask; + void *context; + uint64_t data; +}; + +struct fi_ops_tagged { + size_t size; + ssize_t (*recv)(fid_t fid, void *buf, size_t len, + be64_t tag, be64_t mask, void *context); + ssize_t (*recvv)(fid_t fid, const void *iov, size_t count, + be64_t tag, be64_t mask, void *context); + ssize_t (*recvfrom)(fid_t fid, void *buf, size_t len, const void *src_addr, + be64_t tag, be64_t mask, void *context); + ssize_t (*recvmsg)(fid_t fid, const struct fi_msg_tagged *msg, uint64_t flags); + ssize_t (*send)(fid_t fid, const void *buf, size_t len, be64_t tag, + void *context); + ssize_t (*sendv)(fid_t fid, const void *iov, size_t count, be64_t tag, + void *context); + ssize_t (*sendto)(fid_t fid, const void *buf, size_t len, + const void *dest_addr, be64_t tag, void *context); + ssize_t (*sendmsg)(fid_t fid, const struct fi_msg_tagged *msg, uint64_t flags); + ssize_t (*search)(fid_t fid, be64_t *tag, be64_t mask, uint64_t flags, + void *src_addr, size_t *src_addrlen, size_t *len, void *context); +}; + +static inline ssize_t +fi_tsendto(fid_t fid, const void *buf, size_t len, + const void *dest_addr, be64_t tag, void *context) +{ + struct fid_socket *sock = container_of(fid, struct fid_socket, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_SOCKET); + FI_ASSERT_OPS(fid, struct fid_socket, tagged); + FI_ASSERT_OP(sock->tagged, struct fi_ops_tagged, sendto); + return sock->tagged->sendto(fid, buf, len, dest_addr, tag, context); +} + +static inline ssize_t +fi_trecvfrom(fid_t fid, void *buf, size_t len, const void *src_addr, + be64_t tag, be64_t mask, void *context) +{ + struct fid_socket *sock = container_of(fid, struct fid_socket, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_SOCKET); + FI_ASSERT_OPS(fid, struct fid_socket, tagged); + FI_ASSERT_OP(sock->tagged, struct fi_ops_tagged, recvfrom); + return sock->tagged->recvfrom(fid, buf, len, src_addr, tag, mask, context); +} + +static inline ssize_t +fi_tsearch(fid_t fid, be64_t *tag, be64_t mask, uint64_t flags, + void *src_addr, size_t *src_addrlen, size_t *len, void *context) +{ + struct fid_socket *sock = container_of(fid, struct fid_socket, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_SOCKET); + FI_ASSERT_OPS(fid, struct fid_socket, tagged); + FI_ASSERT_OP(sock->tagged, struct fi_ops_tagged, search); + return sock->tagged->search(fid, tag, mask, flags, src_addr, src_addrlen, len, context); +} + +#ifdef __cplusplus +} +#endif + +#endif /* _FI_TAGGED_H_ */ diff --git a/include/rdma/fi_ucma.h b/include/rdma/fi_ucma.h new file mode 100644 index 00000000000..36d2b8a5e5a --- /dev/null +++ b/include/rdma/fi_ucma.h @@ -0,0 +1,718 @@ +/* + * Copyright (c) 2005-2013 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _FI_UCMA_H_ +#define _FI_UCMA_H_ + +#include <linux/types.h> +#include <sys/socket.h> +#include <netinet/in.h> + +#include <rdma/fabric.h> +#include <rdma/fi_uverbs.h> + +#ifdef __cplusplus +extern "C" { +#endif + + +struct ibv_kern_path_rec { + __u8 dgid[16]; + __u8 sgid[16]; + __u16 dlid; + __u16 slid; + __u32 raw_traffic; + __u32 flow_label; + __u32 reversible; + __u32 mtu; + __u16 pkey; + __u8 hop_limit; + __u8 traffic_class; + __u8 numb_path; + __u8 sl; + __u8 mtu_selector; + __u8 rate_selector; + __u8 rate; + __u8 packet_life_time_selector; + __u8 packet_life_time; + __u8 preference; +}; + +#define IBV_PATH_RECORD_REVERSIBLE 0x80 + +struct ibv_path_record { + uint64_t service_id; + uint8_t dgid[16]; + uint8_t sgid[16]; + uint16_t dlid; + uint16_t slid; + uint32_t flowlabel_hoplimit; /* resv-31:28 flow label-27:8 hop limit-7:0*/ + uint8_t tclass; + uint8_t reversible_numpath; /* reversible-7:7 num path-6:0 */ + uint16_t pkey; + uint16_t qosclass_sl; /* qos class-15:4 sl-3:0 */ + uint8_t mtu; /* mtu selector-7:6 mtu-5:0 */ + uint8_t rate; /* rate selector-7:6 rate-5:0 */ + uint8_t packetlifetime; /* lifetime selector-7:6 lifetime-5:0 */ + uint8_t preference; + uint8_t reserved[6]; +}; + +#define IBV_PATH_FLAG_GMP (1<<0) +#define IBV_PATH_FLAG_PRIMARY (1<<1) +#define IBV_PATH_FLAG_ALTERNATE (1<<2) +#define IBV_PATH_FLAG_OUTBOUND (1<<3) +#define IBV_PATH_FLAG_INBOUND (1<<4) +#define IBV_PATH_FLAG_INBOUND_REVERSE (1<<5) +#define IBV_PATH_FLAG_BIDIRECTIONAL (IBV_PATH_FLAG_OUTBOUND | \ + IBV_PATH_FLAG_INBOUND_REVERSE) + +struct ibv_path_data { + uint32_t flags; + uint32_t reserved; + struct ibv_path_record path; +}; + + +/* + * This file must be kept in sync with the kernel's version of rdma_user_cm.h + */ + +#define RDMA_USER_CM_MIN_ABI_VERSION 4 +#define RDMA_USER_CM_MAX_ABI_VERSION 4 + +#define RDMA_MAX_PRIVATE_DATA 256 + +enum { + UCMA_CMD_CREATE_ID, + UCMA_CMD_DESTROY_ID, + UCMA_CMD_BIND_IP, + UCMA_CMD_RESOLVE_IP, + UCMA_CMD_RESOLVE_ROUTE, + UCMA_CMD_QUERY_ROUTE, + UCMA_CMD_CONNECT, + UCMA_CMD_LISTEN, + UCMA_CMD_ACCEPT, + UCMA_CMD_REJECT, + UCMA_CMD_DISCONNECT, + UCMA_CMD_INIT_QP_ATTR, + UCMA_CMD_GET_EVENT, + UCMA_CMD_GET_OPTION, /* unused */ + UCMA_CMD_SET_OPTION, + UCMA_CMD_NOTIFY, + UCMA_CMD_JOIN_IP_MCAST, + UCMA_CMD_LEAVE_MCAST, + UCMA_CMD_MIGRATE_ID, + UCMA_CMD_QUERY, + UCMA_CMD_BIND, + UCMA_CMD_RESOLVE_ADDR, + UCMA_CMD_JOIN_MCAST +}; + +struct ucma_abi_cmd_hdr { + __u32 cmd; + __u16 in; + __u16 out; +}; + +struct ucma_abi_create_id { + __u32 cmd; + __u16 in; + __u16 out; + __u64 uid; + __u64 response; + __u16 ps; + __u8 qp_type; + __u8 reserved[5]; +}; + +struct ucma_abi_create_id_resp { + __u32 id; +}; + +struct ucma_abi_destroy_id { + __u32 cmd; + __u16 in; + __u16 out; + __u64 response; + __u32 id; + __u32 reserved; +}; + +struct 
ucma_abi_destroy_id_resp { + __u32 events_reported; +}; + +struct ucma_abi_bind_ip { + __u32 cmd; + __u16 in; + __u16 out; + __u64 response; + struct sockaddr_in6 addr; + __u32 id; +}; + +struct ucma_abi_bind { + __u32 cmd; + __u16 in; + __u16 out; + __u32 id; + __u16 addr_size; + __u16 reserved; + struct sockaddr_storage addr; +}; + +struct ucma_abi_resolve_ip { + __u32 cmd; + __u16 in; + __u16 out; + struct sockaddr_in6 src_addr; + struct sockaddr_in6 dst_addr; + __u32 id; + __u32 timeout_ms; +}; + +struct ucma_abi_resolve_addr { + __u32 cmd; + __u16 in; + __u16 out; + __u32 id; + __u32 timeout_ms; + __u16 src_size; + __u16 dst_size; + __u32 reserved; + struct sockaddr_storage src_addr; + struct sockaddr_storage dst_addr; +}; + +struct ucma_abi_resolve_route { + __u32 cmd; + __u16 in; + __u16 out; + __u32 id; + __u32 timeout_ms; +}; + +enum { + UCMA_QUERY_ADDR, + UCMA_QUERY_PATH, + UCMA_QUERY_GID +}; + +struct ucma_abi_query { + __u32 cmd; + __u16 in; + __u16 out; + __u64 response; + __u32 id; + __u32 option; +}; + +struct ucma_abi_query_route_resp { + __u64 node_guid; + struct ibv_kern_path_rec ib_route[2]; + struct sockaddr_in6 src_addr; + struct sockaddr_in6 dst_addr; + __u32 num_paths; + __u8 port_num; + __u8 reserved[3]; +}; + +struct ucma_abi_query_addr_resp { + __u64 node_guid; + __u8 port_num; + __u8 reserved; + __u16 pkey; + __u16 src_size; + __u16 dst_size; + struct sockaddr_storage src_addr; + struct sockaddr_storage dst_addr; +}; + +struct ucma_abi_query_path_resp { + __u32 num_paths; + __u32 reserved; + struct ibv_path_data path_data[0]; +}; + +struct ucma_abi_conn_param { + __u32 qp_num; + __u32 reserved; + __u8 private_data[RDMA_MAX_PRIVATE_DATA]; + __u8 private_data_len; + __u8 srq; + __u8 responder_resources; + __u8 initiator_depth; + __u8 flow_control; + __u8 retry_count; + __u8 rnr_retry_count; + __u8 valid; +}; + +struct ucma_abi_ud_param { + __u32 qp_num; + __u32 qkey; + struct ibv_kern_ah_attr ah_attr; + __u8 private_data[RDMA_MAX_PRIVATE_DATA]; + __u8 private_data_len; + __u8 reserved[7]; + __u8 reserved2[4]; /* Round to 8-byte boundary to support 32/64 */ +}; + +struct ucma_abi_connect { + __u32 cmd; + __u16 in; + __u16 out; + struct ucma_abi_conn_param conn_param; + __u32 id; + __u32 reserved; +}; + +struct ucma_abi_listen { + __u32 cmd; + __u16 in; + __u16 out; + __u32 id; + __u32 backlog; +}; + +struct ucma_abi_accept { + __u32 cmd; + __u16 in; + __u16 out; + __u64 uid; + struct ucma_abi_conn_param conn_param; + __u32 id; + __u32 reserved; +}; + +struct ucma_abi_reject { + __u32 cmd; + __u16 in; + __u16 out; + __u32 id; + __u8 private_data_len; + __u8 reserved[3]; + __u8 private_data[RDMA_MAX_PRIVATE_DATA]; +}; + +struct ucma_abi_disconnect { + __u32 cmd; + __u16 in; + __u16 out; + __u32 id; +}; + +struct ucma_abi_init_qp_attr { + __u32 cmd; + __u16 in; + __u16 out; + __u64 response; + __u32 id; + __u32 qp_state; +}; + +struct ucma_abi_notify { + __u32 cmd; + __u16 in; + __u16 out; + __u32 id; + __u32 event; +}; + +struct ucma_abi_join_ip_mcast { + __u32 cmd; + __u16 in; + __u16 out; + __u64 response; /* ucma_abi_create_id_resp */ + __u64 uid; + struct sockaddr_in6 addr; + __u32 id; +}; + +struct ucma_abi_join_mcast { + __u32 cmd; + __u16 in; + __u16 out; + __u64 response; /* rdma_ucma_create_id_resp */ + __u64 uid; + __u32 id; + __u16 addr_size; + __u16 reserved; + struct sockaddr_storage addr; +}; + +struct ucma_abi_get_event { + __u32 cmd; + __u16 in; + __u16 out; + __u64 response; +}; + +struct ucma_abi_event_resp { + __u64 uid; + __u32 id; + __u32 event; + 
__u32 status; + union { + struct ucma_abi_conn_param conn; + struct ucma_abi_ud_param ud; + } param; +}; + +struct ucma_abi_set_option { + __u32 cmd; + __u16 in; + __u16 out; + __u64 optval; + __u32 id; + __u32 level; + __u32 optname; + __u32 optlen; +}; + +struct ucma_abi_migrate_id { + __u32 cmd; + __u16 in; + __u16 out; + __u64 response; + __u32 id; + __u32 fd; +}; + +struct ucma_abi_migrate_resp { + __u32 events_reported; +}; + + +struct fi_ops_ucma { + size_t size; + int (*create_id)(fid_t fid, + struct ucma_abi_create_id *cmd, size_t cmd_size, + struct ucma_abi_create_id_resp *resp, size_t resp_size); + int (*destroy_id)(fid_t fid, + struct ucma_abi_destroy_id *cmd, size_t cmd_size, + struct ucma_abi_destroy_id_resp *resp, size_t resp_size); + int (*bind_ip)(fid_t fid, + struct ucma_abi_bind_ip *cmd, size_t cmd_size); + int (*bind)(fid_t fid, + struct ucma_abi_bind *cmd, size_t cmd_size); + int (*resolve_ip)(fid_t fid, + struct ucma_abi_resolve_ip *cmd, size_t cmd_size); + int (*resolve_addr)(fid_t fid, + struct ucma_abi_resolve_addr *cmd, size_t cmd_size); + int (*resolve_route)(fid_t fid, + struct ucma_abi_resolve_route *cmd, size_t cmd_size); + int (*query_route)(fid_t fid, + struct ucma_abi_query *cmd, size_t cmd_size, + struct ucma_abi_query_route_resp *resp, size_t resp_size); + int (*query)(fid_t fid, + struct ucma_abi_query *cmd, size_t cmd_size, + void *resp, size_t resp_size); + int (*connect)(fid_t fid, + struct ucma_abi_connect *cmd, size_t cmd_size); + int (*listen)(fid_t fid, + struct ucma_abi_listen *cmd, size_t cmd_size); + int (*accept)(fid_t fid, + struct ucma_abi_accept *cmd, size_t cmd_size); + int (*reject)(fid_t fid, + struct ucma_abi_reject *cmd, size_t cmd_size); + int (*disconnect)(fid_t fid, + struct ucma_abi_disconnect *cmd, size_t cmd_size); + int (*init_qp_attr)(fid_t fid, + struct ucma_abi_init_qp_attr *cmd, size_t cmd_size, + struct ibv_kern_qp_attr *resp, size_t resp_size); + int (*get_event)(fid_t fid, + struct ucma_abi_get_event *cmd, size_t cmd_size, + struct ucma_abi_event_resp *resp, size_t resp_size); + int (*set_option)(fid_t fid, + struct ucma_abi_set_option *cmd, size_t cmd_size); + int (*notify)(fid_t fid, + struct ucma_abi_notify *cmd, size_t cmd_size); + int (*join_ip_mcast)(fid_t fid, + struct ucma_abi_join_ip_mcast *cmd, size_t cmd_size, + struct ucma_abi_create_id_resp *resp, size_t resp_size); + int (*join_mcast)(fid_t fid, + struct ucma_abi_join_mcast *cmd, size_t cmd_size, + struct ucma_abi_create_id_resp *resp, size_t resp_size); + int (*leave_mcast)(fid_t fid, + struct ucma_abi_destroy_id *cmd, size_t cmd_size, + struct ucma_abi_destroy_id_resp *resp, size_t resp_size); + int (*migrate_id)(fid_t fid, + struct ucma_abi_migrate_id *cmd, size_t cmd_size, + struct ucma_abi_migrate_resp *resp, size_t resp_size); +}; + +#define FI_UCMA_INTERFACE "ucma" + +struct fid_ucma { + struct fid fid; + int fd; + struct fi_ops_ucma *ops; + +}; + +static inline int ucma_create_id(fid_t fid, + struct ucma_abi_create_id *cmd, size_t cmd_size, + struct ucma_abi_create_id_resp *resp, size_t resp_size) +{ + struct fid_ucma *ucma = container_of(fid, struct fid_ucma, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + FI_ASSERT_OPS(fid, struct fid_ucma, ops); + FI_ASSERT_OP(ucma->ops, struct fi_ops_ucma, create_id); + return ucma->ops->create_id(fid, cmd, cmd_size, resp, resp_size); +} + +static inline int ucma_destroy_id(fid_t fid, + struct ucma_abi_destroy_id *cmd, size_t cmd_size, + struct ucma_abi_destroy_id_resp *resp, size_t resp_size) +{ + struct 
fid_ucma *ucma = container_of(fid, struct fid_ucma, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + FI_ASSERT_OPS(fid, struct fid_ucma, ops); + FI_ASSERT_OP(ucma->ops, struct fi_ops_ucma, destroy_id); + return ucma->ops->destroy_id(fid, cmd, cmd_size, resp, resp_size); +} + +static inline int ucma_bind_ip(fid_t fid, + struct ucma_abi_bind_ip *cmd, size_t cmd_size) +{ + struct fid_ucma *ucma = container_of(fid, struct fid_ucma, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + FI_ASSERT_OPS(fid, struct fid_ucma, ops); + FI_ASSERT_OP(ucma->ops, struct fi_ops_ucma, bind_ip); + return ucma->ops->bind_ip(fid, cmd, cmd_size); +} + +static inline int ucma_bind(fid_t fid, + struct ucma_abi_bind *cmd, size_t cmd_size) +{ + struct fid_ucma *ucma = container_of(fid, struct fid_ucma, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + FI_ASSERT_OPS(fid, struct fid_ucma, ops); + FI_ASSERT_OP(ucma->ops, struct fi_ops_ucma, bind); + return ucma->ops->bind(fid, cmd, cmd_size); +} + +static inline int ucma_resolve_ip(fid_t fid, + struct ucma_abi_resolve_ip *cmd, size_t cmd_size) +{ + struct fid_ucma *ucma = container_of(fid, struct fid_ucma, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + FI_ASSERT_OPS(fid, struct fid_ucma, ops); + FI_ASSERT_OP(ucma->ops, struct fi_ops_ucma, resolve_ip); + return ucma->ops->resolve_ip(fid, cmd, cmd_size); +} + +static inline int ucma_resolve_addr(fid_t fid, + struct ucma_abi_resolve_addr *cmd, size_t cmd_size) +{ + struct fid_ucma *ucma = container_of(fid, struct fid_ucma, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + FI_ASSERT_OPS(fid, struct fid_ucma, ops); + FI_ASSERT_OP(ucma->ops, struct fi_ops_ucma, resolve_addr); + return ucma->ops->resolve_addr(fid, cmd, cmd_size); +} + +static inline int ucma_resolve_route(fid_t fid, + struct ucma_abi_resolve_route *cmd, size_t cmd_size) +{ + struct fid_ucma *ucma = container_of(fid, struct fid_ucma, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + FI_ASSERT_OPS(fid, struct fid_ucma, ops); + FI_ASSERT_OP(ucma->ops, struct fi_ops_ucma, resolve_route); + return ucma->ops->resolve_route(fid, cmd, cmd_size); +} + +static inline int ucma_query_route(fid_t fid, + struct ucma_abi_query *cmd, size_t cmd_size, + struct ucma_abi_query_route_resp *resp, size_t resp_size) +{ + struct fid_ucma *ucma = container_of(fid, struct fid_ucma, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + FI_ASSERT_OPS(fid, struct fid_ucma, ops); + FI_ASSERT_OP(ucma->ops, struct fi_ops_ucma, query_route); + return ucma->ops->query_route(fid, cmd, cmd_size, resp, resp_size); +} + +static inline int ucma_query(fid_t fid, + struct ucma_abi_query *cmd, size_t cmd_size, + void *resp, size_t resp_size) +{ + struct fid_ucma *ucma = container_of(fid, struct fid_ucma, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + FI_ASSERT_OPS(fid, struct fid_ucma, ops); + FI_ASSERT_OP(ucma->ops, struct fi_ops_ucma, query); + return ucma->ops->query(fid, cmd, cmd_size, resp, resp_size); +} + +static inline int ucma_connect(fid_t fid, + struct ucma_abi_connect *cmd, size_t cmd_size) +{ + struct fid_ucma *ucma = container_of(fid, struct fid_ucma, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + FI_ASSERT_OPS(fid, struct fid_ucma, ops); + FI_ASSERT_OP(ucma->ops, struct fi_ops_ucma, connect); + return ucma->ops->connect(fid, cmd, cmd_size); +} + +static inline int ucma_listen(fid_t fid, + struct ucma_abi_listen *cmd, size_t cmd_size) +{ + struct fid_ucma *ucma = container_of(fid, struct fid_ucma, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + 
FI_ASSERT_OPS(fid, struct fid_ucma, ops); + FI_ASSERT_OP(ucma->ops, struct fi_ops_ucma, listen); + return ucma->ops->listen(fid, cmd, cmd_size); +} + +static inline int ucma_accept(fid_t fid, + struct ucma_abi_accept *cmd, size_t cmd_size) +{ + struct fid_ucma *ucma = container_of(fid, struct fid_ucma, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + FI_ASSERT_OPS(fid, struct fid_ucma, ops); + FI_ASSERT_OP(ucma->ops, struct fi_ops_ucma, accept); + return ucma->ops->accept(fid, cmd, cmd_size); +} + +static inline int ucma_reject(fid_t fid, + struct ucma_abi_reject *cmd, size_t cmd_size) +{ + struct fid_ucma *ucma = container_of(fid, struct fid_ucma, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + FI_ASSERT_OPS(fid, struct fid_ucma, ops); + FI_ASSERT_OP(ucma->ops, struct fi_ops_ucma, reject); + return ucma->ops->reject(fid, cmd, cmd_size); +} + +static inline int ucma_disconnect(fid_t fid, + struct ucma_abi_disconnect *cmd, size_t cmd_size) +{ + struct fid_ucma *ucma = container_of(fid, struct fid_ucma, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + FI_ASSERT_OPS(fid, struct fid_ucma, ops); + FI_ASSERT_OP(ucma->ops, struct fi_ops_ucma, disconnect); + return ucma->ops->disconnect(fid, cmd, cmd_size); +} + +static inline int ucma_init_qp_attr(fid_t fid, + struct ucma_abi_init_qp_attr *cmd, size_t cmd_size, + struct ibv_kern_qp_attr *resp, size_t resp_size) +{ + struct fid_ucma *ucma = container_of(fid, struct fid_ucma, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + FI_ASSERT_OPS(fid, struct fid_ucma, ops); + FI_ASSERT_OP(ucma->ops, struct fi_ops_ucma, init_qp_attr); + return ucma->ops->init_qp_attr(fid, cmd, cmd_size, resp, resp_size); +} + +static inline int ucma_get_event(fid_t fid, + struct ucma_abi_get_event *cmd, size_t cmd_size, + struct ucma_abi_event_resp *resp, size_t resp_size) +{ + struct fid_ucma *ucma = container_of(fid, struct fid_ucma, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + FI_ASSERT_OPS(fid, struct fid_ucma, ops); + FI_ASSERT_OP(ucma->ops, struct fi_ops_ucma, get_event); + return ucma->ops->get_event(fid, cmd, cmd_size, resp, resp_size); +} + +static inline int ucma_set_option(fid_t fid, + struct ucma_abi_set_option *cmd, size_t cmd_size) +{ + struct fid_ucma *ucma = container_of(fid, struct fid_ucma, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + FI_ASSERT_OPS(fid, struct fid_ucma, ops); + FI_ASSERT_OP(ucma->ops, struct fi_ops_ucma, set_option); + return ucma->ops->set_option(fid, cmd, cmd_size); +} + +static inline int ucma_notify(fid_t fid, + struct ucma_abi_notify *cmd, size_t cmd_size) +{ + struct fid_ucma *ucma = container_of(fid, struct fid_ucma, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + FI_ASSERT_OPS(fid, struct fid_ucma, ops); + FI_ASSERT_OP(ucma->ops, struct fi_ops_ucma, notify); + return ucma->ops->notify(fid, cmd, cmd_size); +} + +static inline int ucma_join_ip_mcast(fid_t fid, + struct ucma_abi_join_ip_mcast *cmd, size_t cmd_size, + struct ucma_abi_create_id_resp *resp, size_t resp_size) +{ + struct fid_ucma *ucma = container_of(fid, struct fid_ucma, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + FI_ASSERT_OPS(fid, struct fid_ucma, ops); + FI_ASSERT_OP(ucma->ops, struct fi_ops_ucma, join_ip_mcast); + return ucma->ops->join_ip_mcast(fid, cmd, cmd_size, resp, resp_size); +} + +static inline int ucma_join_mcast(fid_t fid, + struct ucma_abi_join_mcast *cmd, size_t cmd_size, + struct ucma_abi_create_id_resp *resp, size_t resp_size) +{ + struct fid_ucma *ucma = container_of(fid, struct fid_ucma, fid); + 
FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + FI_ASSERT_OPS(fid, struct fid_ucma, ops); + FI_ASSERT_OP(ucma->ops, struct fi_ops_ucma, join_mcast); + return ucma->ops->join_mcast(fid, cmd, cmd_size, resp, resp_size); +} + +static inline int ucma_leave_mcast(fid_t fid, + struct ucma_abi_destroy_id *cmd, size_t cmd_size, + struct ucma_abi_destroy_id_resp *resp, size_t resp_size) +{ + struct fid_ucma *ucma = container_of(fid, struct fid_ucma, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + FI_ASSERT_OPS(fid, struct fid_ucma, ops); + FI_ASSERT_OP(ucma->ops, struct fi_ops_ucma, leave_mcast); + return ucma->ops->leave_mcast(fid, cmd, cmd_size, resp, resp_size); +} + +static inline int ucma_migrate_id(fid_t fid, + struct ucma_abi_migrate_id *cmd, size_t cmd_size, + struct ucma_abi_migrate_resp *resp, size_t resp_size) +{ + struct fid_ucma *ucma = container_of(fid, struct fid_ucma, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + FI_ASSERT_OPS(fid, struct fid_ucma, ops); + FI_ASSERT_OP(ucma->ops, struct fi_ops_ucma, migrate_id); + return ucma->ops->migrate_id(fid, cmd, cmd_size, resp, resp_size); +} + +#ifdef __cplusplus +} +#endif + +#endif /* _FI_UCMA_H_ */ diff --git a/include/rdma/fi_umad.h b/include/rdma/fi_umad.h new file mode 100644 index 00000000000..d0d18ecf58c --- /dev/null +++ b/include/rdma/fi_umad.h @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Voltaire, Inc. All rights reserved. + * Copyright (c) 2013 Intel Corp., Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef _FI_UMAD_H_ +#define _FI_UMAD_H_ + +#include <linux/types.h> +#include <linux/ioctl.h> + + +#ifdef __cplusplus +extern "C" { +#endif + + +/* + * This file must be kept in sync with the kernel's version of ib_user_mad.h + */ + +#define UMAD_MIN_ABI_VERSION 5 +#define UMAD_MAX_ABI_VERSION 5 + + +struct umad_hdr { + __u32 id; + __u32 status; + __u32 timeout_ms; + __u32 retries; + __u32 length; + __be32 qpn; + __be32 qkey; + __be16 lid; + __u8 sl; + __u8 path_bits; + __u8 grh_present; + __u8 gid_index; + __u8 hop_limit; + __u8 traffic_class; + __u8 gid[16]; + __be32 flow_label; + __u16 pkey_index; + __u8 reserved[6]; +}; + +struct umad_data { + struct umad_hdr hdr; + __u64 data[0]; +}; + +typedef unsigned long __attribute__((aligned(4))) packed_ulong; +#define UMAD_LONGS_PER_METHOD_MASK (128 / (8 * sizeof (long))) + +struct umad_reg_req { + __u32 id; + packed_ulong method_mask[UMAD_LONGS_PER_METHOD_MASK]; + __u8 qpn; + __u8 mgmt_class; + __u8 mgmt_class_version; + __u8 oui[3]; + __u8 rmpp_version; +}; + +#define UMAD_IOCTL_MAGIC 0x1b +#define UMAD_REGISTER_AGENT _IOWR(UMAD_IOCTL_MAGIC, 1, struct umad_reg_req) +#define UMAD_UNREGISTER_AGENT _IOW(UMAD_IOCTL_MAGIC, 2, __u32) +#define UMAD_ENABLE_PKEY _IO(UMAD_IOCTL_MAGIC, 3) + + +#define FI_UVERBS_CLASS_NAME "umad" +#define FI_UMAD_OPS (4ULL << FI_OPS_LIB_SHIFT) + +struct fi_umad_ops { + size_t size; + int (*get_abi)(void); +}; + +#ifdef __cplusplus +} +#endif + +#endif /* _FI_UMAD_H_ */ diff --git a/include/rdma/fi_uverbs.h b/include/rdma/fi_uverbs.h new file mode 100644 index 00000000000..e3d2ba19487 --- /dev/null +++ b/include/rdma/fi_uverbs.h @@ -0,0 +1,1289 @@ +/* + * Copyright (c) 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved. + * Copyright (c) 2005 PathScale, Inc. All rights reserved. + * Copyright (c) 2013 Intel Corporation, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef _FI_UVERBS_H_ +#define _FI_UVERBS_H_ + + +#include <linux/types.h> +#include <rdma/fabric.h> + + +#ifdef __cplusplus +extern "C" { +#endif + + +/* + * This file must be kept in sync with the kernel's version of ib_user_verbs.h + */ + +#define UVERBS_MIN_ABI_VERSION 6 +#define UVERBS_MAX_ABI_VERSION 6 + +enum { + UVERBS_CMD_GET_CONTEXT, + UVERBS_CMD_QUERY_DEVICE, + UVERBS_CMD_QUERY_PORT, + UVERBS_CMD_ALLOC_PD, + UVERBS_CMD_DEALLOC_PD, + UVERBS_CMD_CREATE_AH, + UVERBS_CMD_MODIFY_AH, /* unused */ + UVERBS_CMD_QUERY_AH, /* unused */ + UVERBS_CMD_DESTROY_AH, + UVERBS_CMD_REG_MR, + UVERBS_CMD_REG_SMR, /* unused */ + UVERBS_CMD_REREG_MR, /* unused */ + UVERBS_CMD_QUERY_MR, /* unused */ + UVERBS_CMD_DEREG_MR, + UVERBS_CMD_ALLOC_MW, /* unused */ + UVERBS_CMD_BIND_MW, /* unused */ + UVERBS_CMD_DEALLOC_MW, /* unused */ + UVERBS_CMD_CREATE_COMP_CHANNEL, + UVERBS_CMD_CREATE_CQ, + UVERBS_CMD_RESIZE_CQ, + UVERBS_CMD_DESTROY_CQ, + UVERBS_CMD_POLL_CQ, + UVERBS_CMD_PEEK_CQ, + UVERBS_CMD_REQ_NOTIFY_CQ, + UVERBS_CMD_CREATE_QP, + UVERBS_CMD_QUERY_QP, + UVERBS_CMD_MODIFY_QP, + UVERBS_CMD_DESTROY_QP, + UVERBS_CMD_POST_SEND, + UVERBS_CMD_POST_RECV, + UVERBS_CMD_ATTACH_MCAST, + UVERBS_CMD_DETACH_MCAST, + UVERBS_CMD_CREATE_SRQ, + UVERBS_CMD_MODIFY_SRQ, + UVERBS_CMD_QUERY_SRQ, + UVERBS_CMD_DESTROY_SRQ, + UVERBS_CMD_POST_SRQ_RECV, + UVERBS_CMD_OPEN_XRCD, /* TODO */ + UVERBS_CMD_CLOSE_XRCD, /* TODO */ + UVERBS_CMD_CREATE_XSRQ, /* TODO */ + UVERBS_CMD_OPEN_QP, /* TODO */ +}; + +/* + * Make sure that all structs defined in this file remain laid out so + * that they pack the same way on 32-bit and 64-bit architectures (to + * avoid incompatibility between 32-bit userspace and 64-bit kernels). + * Specifically: + * - Do not use pointer types -- pass pointers in __u64 instead. + * - Make sure that any structure larger than 4 bytes is padded to a + * multiple of 8 bytes. Otherwise the structure size will be + * different between 32-bit and 64-bit architectures. + */ + +struct ibv_kern_async_event { + __u64 element; + __u32 event_type; + __u32 reserved; +}; + +struct ibv_comp_event { + __u64 cq_handle; +}; + +/* + * All commands from userspace should start with a __u32 command field + * followed by __u16 in_words and out_words fields (which give the + * length of the command block and response buffer if any in 32-bit + * words). The kernel driver will read these fields first and read + * the rest of the command struct based on these value. 
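+ *
+ * Illustration (this mirrors the convention libibverbs uses when
+ * building such requests; it is not mandated by this header): a caller
+ * issuing a query-device command with a struct ibv_query_device cmd
+ * and a struct ibv_query_device_resp resp would fill the header as
+ *
+ *	cmd.command   = UVERBS_CMD_QUERY_DEVICE;
+ *	cmd.in_words  = sizeof(cmd) / 4;
+ *	cmd.out_words = sizeof(resp) / 4;
+ *	cmd.response  = (uintptr_t) &resp;
+ *
+ * before passing &cmd, sizeof(cmd), &resp, and sizeof(resp) to the
+ * matching fi_ops_uverbs entry point.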
+ */ + +struct ibv_query_params { + __u32 command; + __u16 in_words; + __u16 out_words; + __u64 response; +}; + +struct ibv_query_params_resp { + __u32 num_cq_events; +}; + +struct ibv_get_context { + __u32 command; + __u16 in_words; + __u16 out_words; + __u64 response; + __u64 driver_data[0]; +}; + +struct ibv_get_context_resp { + __u32 async_fd; + __u32 num_comp_vectors; +}; + +struct ibv_query_device { + __u32 command; + __u16 in_words; + __u16 out_words; + __u64 response; + __u64 driver_data[0]; +}; + +struct ibv_query_device_resp { + __u64 fw_ver; + __u64 node_guid; + __u64 sys_image_guid; + __u64 max_mr_size; + __u64 page_size_cap; + __u32 vendor_id; + __u32 vendor_part_id; + __u32 hw_ver; + __u32 max_qp; + __u32 max_qp_wr; + __u32 device_cap_flags; + __u32 max_sge; + __u32 max_sge_rd; + __u32 max_cq; + __u32 max_cqe; + __u32 max_mr; + __u32 max_pd; + __u32 max_qp_rd_atom; + __u32 max_ee_rd_atom; + __u32 max_res_rd_atom; + __u32 max_qp_init_rd_atom; + __u32 max_ee_init_rd_atom; + __u32 atomic_cap; + __u32 max_ee; + __u32 max_rdd; + __u32 max_mw; + __u32 max_raw_ipv6_qp; + __u32 max_raw_ethy_qp; + __u32 max_mcast_grp; + __u32 max_mcast_qp_attach; + __u32 max_total_mcast_qp_attach; + __u32 max_ah; + __u32 max_fmr; + __u32 max_map_per_fmr; + __u32 max_srq; + __u32 max_srq_wr; + __u32 max_srq_sge; + __u16 max_pkeys; + __u8 local_ca_ack_delay; + __u8 phys_port_cnt; + __u8 reserved[4]; +}; + +struct ibv_query_port { + __u32 command; + __u16 in_words; + __u16 out_words; + __u64 response; + __u8 port_num; + __u8 reserved[7]; + __u64 driver_data[0]; +}; + +struct ibv_query_port_resp { + __u32 port_cap_flags; + __u32 max_msg_sz; + __u32 bad_pkey_cntr; + __u32 qkey_viol_cntr; + __u32 gid_tbl_len; + __u16 pkey_tbl_len; + __u16 lid; + __u16 sm_lid; + __u8 state; + __u8 max_mtu; + __u8 active_mtu; + __u8 lmc; + __u8 max_vl_num; + __u8 sm_sl; + __u8 subnet_timeout; + __u8 init_type_reply; + __u8 active_width; + __u8 active_speed; + __u8 phys_state; + __u8 link_layer; + __u8 reserved[2]; +}; + +struct ibv_alloc_pd { + __u32 command; + __u16 in_words; + __u16 out_words; + __u64 response; + __u64 driver_data[0]; +}; + +struct ibv_alloc_pd_resp { + __u32 pd_handle; +}; + +struct ibv_dealloc_pd { + __u32 command; + __u16 in_words; + __u16 out_words; + __u32 pd_handle; +}; + +struct ibv_open_xrcd { + __u32 command; + __u16 in_words; + __u16 out_words; + __u64 response; + __u32 fd; + __u32 oflags; + __u64 driver_data[0]; +}; + +struct ibv_open_xrcd_resp { + __u32 xrcd_handle; +}; + +struct ibv_close_xrcd { + __u32 command; + __u16 in_words; + __u16 out_words; + __u32 xrcd_handle; +}; + +struct ibv_reg_mr { + __u32 command; + __u16 in_words; + __u16 out_words; + __u64 response; + __u64 start; + __u64 length; + __u64 hca_va; + __u32 pd_handle; + __u32 access_flags; + __u64 driver_data[0]; +}; + +struct ibv_reg_mr_resp { + __u32 mr_handle; + __u32 lkey; + __u32 rkey; +}; + +struct ibv_dereg_mr { + __u32 command; + __u16 in_words; + __u16 out_words; + __u32 mr_handle; +}; + +struct ibv_create_comp_channel { + __u32 command; + __u16 in_words; + __u16 out_words; + __u64 response; +}; + +struct ibv_create_comp_channel_resp { + __u32 fd; +}; + +struct ibv_create_cq { + __u32 command; + __u16 in_words; + __u16 out_words; + __u64 response; + __u64 user_handle; + __u32 cqe; + __u32 comp_vector; + __s32 comp_channel; + __u32 reserved; + __u64 driver_data[0]; +}; + +struct ibv_create_cq_resp { + __u32 cq_handle; + __u32 cqe; +}; + +struct ibv_kern_wc { + __u64 wr_id; + __u32 status; + __u32 opcode; + __u32 vendor_err; 
+ __u32 byte_len; + __u32 imm_data; + __u32 qp_num; + __u32 src_qp; + __u32 wc_flags; + __u16 pkey_index; + __u16 slid; + __u8 sl; + __u8 dlid_path_bits; + __u8 port_num; + __u8 reserved; +}; + +struct ibv_poll_cq { + __u32 command; + __u16 in_words; + __u16 out_words; + __u64 response; + __u32 cq_handle; + __u32 ne; +}; + +struct ibv_poll_cq_resp { + __u32 count; + __u32 reserved; + struct ibv_kern_wc wc[0]; +}; + +struct ibv_req_notify_cq { + __u32 command; + __u16 in_words; + __u16 out_words; + __u32 cq_handle; + __u32 solicited; +}; + +struct ibv_resize_cq { + __u32 command; + __u16 in_words; + __u16 out_words; + __u64 response; + __u32 cq_handle; + __u32 cqe; + __u64 driver_data[0]; +}; + +struct ibv_resize_cq_resp { + __u32 cqe; + __u32 reserved; + __u64 driver_data[0]; +}; + +struct ibv_destroy_cq { + __u32 command; + __u16 in_words; + __u16 out_words; + __u64 response; + __u32 cq_handle; + __u32 reserved; +}; + +struct ibv_destroy_cq_resp { + __u32 comp_events_reported; + __u32 async_events_reported; +}; + +struct ibv_kern_global_route { + __u8 dgid[16]; + __u32 flow_label; + __u8 sgid_index; + __u8 hop_limit; + __u8 traffic_class; + __u8 reserved; +}; + +struct ibv_kern_ah_attr { + struct ibv_kern_global_route grh; + __u16 dlid; + __u8 sl; + __u8 src_path_bits; + __u8 static_rate; + __u8 is_global; + __u8 port_num; + __u8 reserved; +}; + +struct ibv_kern_qp_attr { + __u32 qp_attr_mask; + __u32 qp_state; + __u32 cur_qp_state; + __u32 path_mtu; + __u32 path_mig_state; + __u32 qkey; + __u32 rq_psn; + __u32 sq_psn; + __u32 dest_qp_num; + __u32 qp_access_flags; + + struct ibv_kern_ah_attr ah_attr; + struct ibv_kern_ah_attr alt_ah_attr; + + /* ib_qp_cap */ + __u32 max_send_wr; + __u32 max_recv_wr; + __u32 max_send_sge; + __u32 max_recv_sge; + __u32 max_inline_data; + + __u16 pkey_index; + __u16 alt_pkey_index; + __u8 en_sqd_async_notify; + __u8 sq_draining; + __u8 max_rd_atomic; + __u8 max_dest_rd_atomic; + __u8 min_rnr_timer; + __u8 port_num; + __u8 timeout; + __u8 retry_cnt; + __u8 rnr_retry; + __u8 alt_port_num; + __u8 alt_timeout; + __u8 reserved[5]; +}; + +struct ibv_create_qp { + __u32 command; + __u16 in_words; + __u16 out_words; + __u64 response; + __u64 user_handle; + __u32 pd_handle; + __u32 send_cq_handle; + __u32 recv_cq_handle; + __u32 srq_handle; + __u32 max_send_wr; + __u32 max_recv_wr; + __u32 max_send_sge; + __u32 max_recv_sge; + __u32 max_inline_data; + __u8 sq_sig_all; + __u8 qp_type; + __u8 is_srq; + __u8 reserved; + __u64 driver_data[0]; +}; + +struct ibv_open_qp { + __u32 command; + __u16 in_words; + __u16 out_words; + __u64 response; + __u64 user_handle; + __u32 pd_handle; + __u32 qpn; + __u8 qp_type; + __u8 reserved[7]; + __u64 driver_data[0]; +}; + +/* also used for open response */ +struct ibv_create_qp_resp { + __u32 qp_handle; + __u32 qpn; + __u32 max_send_wr; + __u32 max_recv_wr; + __u32 max_send_sge; + __u32 max_recv_sge; + __u32 max_inline_data; + __u32 reserved; +}; + +struct ibv_qp_dest { + __u8 dgid[16]; + __u32 flow_label; + __u16 dlid; + __u16 reserved; + __u8 sgid_index; + __u8 hop_limit; + __u8 traffic_class; + __u8 sl; + __u8 src_path_bits; + __u8 static_rate; + __u8 is_global; + __u8 port_num; +}; + +struct ibv_query_qp { + __u32 command; + __u16 in_words; + __u16 out_words; + __u64 response; + __u32 qp_handle; + __u32 attr_mask; + __u64 driver_data[0]; +}; + +struct ibv_query_qp_resp { + struct ibv_qp_dest dest; + struct ibv_qp_dest alt_dest; + __u32 max_send_wr; + __u32 max_recv_wr; + __u32 max_send_sge; + __u32 max_recv_sge; + __u32 
max_inline_data; + __u32 qkey; + __u32 rq_psn; + __u32 sq_psn; + __u32 dest_qp_num; + __u32 qp_access_flags; + __u16 pkey_index; + __u16 alt_pkey_index; + __u8 qp_state; + __u8 cur_qp_state; + __u8 path_mtu; + __u8 path_mig_state; + __u8 sq_draining; + __u8 max_rd_atomic; + __u8 max_dest_rd_atomic; + __u8 min_rnr_timer; + __u8 port_num; + __u8 timeout; + __u8 retry_cnt; + __u8 rnr_retry; + __u8 alt_port_num; + __u8 alt_timeout; + __u8 sq_sig_all; + __u8 reserved[5]; + __u64 driver_data[0]; +}; + +struct ibv_modify_qp { + __u32 command; + __u16 in_words; + __u16 out_words; + struct ibv_qp_dest dest; + struct ibv_qp_dest alt_dest; + __u32 qp_handle; + __u32 attr_mask; + __u32 qkey; + __u32 rq_psn; + __u32 sq_psn; + __u32 dest_qp_num; + __u32 qp_access_flags; + __u16 pkey_index; + __u16 alt_pkey_index; + __u8 qp_state; + __u8 cur_qp_state; + __u8 path_mtu; + __u8 path_mig_state; + __u8 en_sqd_async_notify; + __u8 max_rd_atomic; + __u8 max_dest_rd_atomic; + __u8 min_rnr_timer; + __u8 port_num; + __u8 timeout; + __u8 retry_cnt; + __u8 rnr_retry; + __u8 alt_port_num; + __u8 alt_timeout; + __u8 reserved[2]; + __u64 driver_data[0]; +}; + +struct ibv_destroy_qp { + __u32 command; + __u16 in_words; + __u16 out_words; + __u64 response; + __u32 qp_handle; + __u32 reserved; +}; + +struct ibv_destroy_qp_resp { + __u32 events_reported; +}; + +struct ibv_kern_send_wr { + __u64 wr_id; + __u32 num_sge; + __u32 opcode; + __u32 send_flags; + __u32 imm_data; + union { + struct { + __u64 remote_addr; + __u32 rkey; + __u32 reserved; + } rdma; + struct { + __u64 remote_addr; + __u64 compare_add; + __u64 swap; + __u32 rkey; + __u32 reserved; + } atomic; + struct { + __u32 ah; + __u32 remote_qpn; + __u32 remote_qkey; + __u32 reserved; + } ud; + struct { + __u64 reserved[3]; + __u32 reserved2; + __u32 remote_srqn; + } xrc; + } wr; +}; + +struct ibv_post_send { + __u32 command; + __u16 in_words; + __u16 out_words; + __u64 response; + __u32 qp_handle; + __u32 wr_count; + __u32 sge_count; + __u32 wqe_size; + struct ibv_kern_send_wr send_wr[0]; +}; + +struct ibv_post_send_resp { + __u32 bad_wr; +}; + +struct ibv_kern_recv_wr { + __u64 wr_id; + __u32 num_sge; + __u32 reserved; +}; + +struct ibv_post_recv { + __u32 command; + __u16 in_words; + __u16 out_words; + __u64 response; + __u32 qp_handle; + __u32 wr_count; + __u32 sge_count; + __u32 wqe_size; + struct ibv_kern_recv_wr recv_wr[0]; +}; + +struct ibv_post_recv_resp { + __u32 bad_wr; +}; + +struct ibv_post_srq_recv { + __u32 command; + __u16 in_words; + __u16 out_words; + __u64 response; + __u32 srq_handle; + __u32 wr_count; + __u32 sge_count; + __u32 wqe_size; + struct ibv_kern_recv_wr recv_wr[0]; +}; + +struct ibv_post_srq_recv_resp { + __u32 bad_wr; +}; + +struct ibv_create_ah { + __u32 command; + __u16 in_words; + __u16 out_words; + __u64 response; + __u64 user_handle; + __u32 pd_handle; + __u32 reserved; + struct ibv_kern_ah_attr attr; +}; + +struct ibv_create_ah_resp { + __u32 handle; +}; + +struct ibv_destroy_ah { + __u32 command; + __u16 in_words; + __u16 out_words; + __u32 ah_handle; +}; + +struct ibv_attach_mcast { + __u32 command; + __u16 in_words; + __u16 out_words; + __u8 gid[16]; + __u32 qp_handle; + __u16 mlid; + __u16 reserved; + __u64 driver_data[0]; +}; + +struct ibv_detach_mcast { + __u32 command; + __u16 in_words; + __u16 out_words; + __u8 gid[16]; + __u32 qp_handle; + __u16 mlid; + __u16 reserved; + __u64 driver_data[0]; +}; + +struct ibv_create_srq { + __u32 command; + __u16 in_words; + __u16 out_words; + __u64 response; + __u64 user_handle; + 
__u32 pd_handle; + __u32 max_wr; + __u32 max_sge; + __u32 srq_limit; + __u64 driver_data[0]; +}; + +struct ibv_create_xsrq { + __u32 command; + __u16 in_words; + __u16 out_words; + __u64 response; + __u64 user_handle; + __u32 srq_type; + __u32 pd_handle; + __u32 max_wr; + __u32 max_sge; + __u32 srq_limit; + __u32 reserved; + __u32 xrcd_handle; + __u32 cq_handle; + __u64 driver_data[0]; +}; + +struct ibv_create_srq_resp { + __u32 srq_handle; + __u32 max_wr; + __u32 max_sge; + __u32 srqn; +}; + +struct ibv_modify_srq { + __u32 command; + __u16 in_words; + __u16 out_words; + __u32 srq_handle; + __u32 attr_mask; + __u32 max_wr; + __u32 srq_limit; + __u64 driver_data[0]; +}; + +struct ibv_query_srq { + __u32 command; + __u16 in_words; + __u16 out_words; + __u64 response; + __u32 srq_handle; + __u32 reserved; + __u64 driver_data[0]; +}; + +struct ibv_query_srq_resp { + __u32 max_wr; + __u32 max_sge; + __u32 srq_limit; + __u32 reserved; +}; + +struct ibv_destroy_srq { + __u32 command; + __u16 in_words; + __u16 out_words; + __u64 response; + __u32 srq_handle; + __u32 reserved; +}; + +struct ibv_destroy_srq_resp { + __u32 events_reported; +}; + + +struct fi_ops_uverbs { + size_t size; + int (*get_context)(fid_t fid, + struct ibv_get_context *cmd, size_t cmd_size, + struct ibv_get_context_resp *resp, size_t resp_size); + int (*query_device)(fid_t fid, + struct ibv_query_device *cmd, size_t cmd_size, + struct ibv_query_device_resp *resp, size_t resp_size); + int (*query_port)(fid_t fid, + struct ibv_query_port *cmd, size_t cmd_size, + struct ibv_query_port_resp *resp, size_t resp_size); + int (*alloc_pd)(fid_t fid, + struct ibv_alloc_pd *cmd, size_t cmd_size, + struct ibv_alloc_pd_resp *resp, size_t resp_size); + int (*dealloc_pd)(fid_t fid, + struct ibv_dealloc_pd *cmd, size_t cmd_size); + int (*create_ah)(fid_t fid, + struct ibv_create_ah *cmd, size_t cmd_size, + struct ibv_create_ah_resp *resp, size_t resp_size); + int (*destroy_ah)(fid_t fid, + struct ibv_destroy_ah *cmd, size_t cmd_size); + int (*open_xrcd)(fid_t fid, + struct ibv_open_xrcd *cmd, size_t cmd_size, + struct ibv_open_xrcd_resp *resp, size_t resp_size); + int (*close_xrcd)(fid_t fid, + struct ibv_close_xrcd *cmd, size_t cmd_size); + int (*reg_mr)(fid_t fid, + struct ibv_reg_mr *cmd, size_t cmd_size, + struct ibv_reg_mr_resp *resp, size_t resp_size); + int (*dereg_mr)(fid_t fid, + struct ibv_dereg_mr *cd, size_t cmd_size); + int (*create_comp_channel)(fid_t fid, + struct ibv_create_comp_channel *cmd, size_t cmd_size, + struct ibv_create_comp_channel_resp *resp, size_t resp_size); + int (*create_cq)(fid_t fid, + struct ibv_create_cq *cmd, size_t cmd_size, + struct ibv_create_cq_resp *resp, size_t resp_size); + int (*poll_cq)(fid_t fid, + struct ibv_poll_cq *cmd, size_t cmd_size, + struct ibv_poll_cq_resp *resp, size_t resp_size); + int (*req_notify_cq)(fid_t fid, + struct ibv_req_notify_cq *cmd, size_t cmd_size); + int (*resize_cq)(fid_t fid, + struct ibv_resize_cq *cmd, size_t cmd_size, + struct ibv_resize_cq_resp *resp, size_t resp_size); + int (*destroy_cq)(fid_t fid, + struct ibv_destroy_cq *cmd, size_t cmd_size, + struct ibv_destroy_cq_resp *resp, size_t resp_size); + int (*create_srq)(fid_t fid, + struct ibv_create_srq *cmd, size_t cmd_size, + struct ibv_create_srq_resp *resp, size_t resp_size); + int (*modify_srq)(fid_t fid, + struct ibv_modify_srq *cmd, size_t cmd_size); + int (*query_srq)(fid_t fid, + struct ibv_query_srq *cmd, size_t cmd_size, + struct ibv_query_srq_resp *resp, size_t resp_size); + int (*destroy_srq)(fid_t 
fid, + struct ibv_destroy_srq *cmd, size_t cmd_size, + struct ibv_destroy_srq_resp *resp, size_t resp_size); + int (*create_qp)(fid_t fid, + struct ibv_create_qp *cmd, size_t cmd_size, + struct ibv_create_qp_resp *resp, size_t resp_size); + int (*open_qp)(fid_t fid, + struct ibv_open_qp *cmd, size_t cmd_size, + struct ibv_create_qp_resp *resp, size_t resp_size); + int (*query_qp)(fid_t fid, + struct ibv_query_qp *cmd, size_t cmd_size, + struct ibv_query_qp_resp *resp, size_t resp_size); + int (*modify_qp)(fid_t fid, + struct ibv_modify_qp *cmd, size_t cmd_size); + int (*destroy_qp)(fid_t fid, + struct ibv_destroy_qp *cmd, size_t cmd_size, + struct ibv_destroy_qp_resp *resp, size_t resp_size); + int (*post_send)(fid_t fid, + struct ibv_post_send *cmd, size_t cmd_size, + struct ibv_post_send_resp *resp, size_t resp_size); + int (*post_recv)(fid_t fid, + struct ibv_post_recv *cmd, size_t cmd_size, + struct ibv_post_recv_resp *resp, size_t resp_size); + int (*post_srq_recv)(fid_t fid, + struct ibv_post_srq_recv *cmd, size_t cmd_size, + struct ibv_post_srq_recv_resp *resp, size_t resp_size); + int (*attach_mcast)(fid_t fid, + struct ibv_attach_mcast *cmd, size_t cmd_size); + int (*detach_mcast)(fid_t fid, + struct ibv_detach_mcast *cmd, size_t cmd_size); +}; + +struct fid_uverbs { + struct fid fid; + int fd; + struct fi_ops_uverbs *ops; +}; + +#define FI_UVERBS_INTERFACE "uverbs" + +static inline int +uv_get_context(fid_t fid, + struct ibv_get_context *cmd, size_t cmd_size, + struct ibv_get_context_resp *resp, size_t resp_size) +{ + struct fid_uverbs *uv = container_of(fid, struct fid_uverbs, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + FI_ASSERT_OPS(fid, struct fid_uverbs, ops); + FI_ASSERT_OP(uv->ops, struct fi_ops_uverbs, get_context); + return uv->ops->get_context(fid, cmd, cmd_size, resp, resp_size); +} + +static inline int +uv_query_device(fid_t fid, + struct ibv_query_device *cmd, size_t cmd_size, + struct ibv_query_device_resp *resp, size_t resp_size) +{ + struct fid_uverbs *uv = container_of(fid, struct fid_uverbs, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + FI_ASSERT_OPS(fid, struct fid_uverbs, ops); + FI_ASSERT_OP(uv->ops, struct fi_ops_uverbs, query_device); + return uv->ops->query_device(fid, cmd, cmd_size, resp, resp_size); +} + +static inline int +uv_query_port(fid_t fid, + struct ibv_query_port *cmd, size_t cmd_size, + struct ibv_query_port_resp *resp, size_t resp_size) +{ + struct fid_uverbs *uv = container_of(fid, struct fid_uverbs, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + FI_ASSERT_OPS(fid, struct fid_uverbs, ops); + FI_ASSERT_OP(uv->ops, struct fi_ops_uverbs, query_port); + return uv->ops->query_port(fid, cmd, cmd_size, resp, resp_size); +} + +static inline int +uv_alloc_pd(fid_t fid, + struct ibv_alloc_pd *cmd, size_t cmd_size, + struct ibv_alloc_pd_resp *resp, size_t resp_size) +{ + struct fid_uverbs *uv = container_of(fid, struct fid_uverbs, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + FI_ASSERT_OPS(fid, struct fid_uverbs, ops); + FI_ASSERT_OP(uv->ops, struct fi_ops_uverbs, alloc_pd); + return uv->ops->alloc_pd(fid, cmd, cmd_size, resp, resp_size); +} + +static inline int +uv_dealloc_pd(fid_t fid, + struct ibv_dealloc_pd *cmd, size_t cmd_size) +{ + struct fid_uverbs *uv = container_of(fid, struct fid_uverbs, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + FI_ASSERT_OPS(fid, struct fid_uverbs, ops); + FI_ASSERT_OP(uv->ops, struct fi_ops_uverbs, dealloc_pd); + return uv->ops->dealloc_pd(fid, cmd, cmd_size); +} + +static inline int 
+uv_create_ah(fid_t fid, + struct ibv_create_ah *cmd, size_t cmd_size, + struct ibv_create_ah_resp *resp, size_t resp_size) +{ + struct fid_uverbs *uv = container_of(fid, struct fid_uverbs, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + FI_ASSERT_OPS(fid, struct fid_uverbs, ops); + FI_ASSERT_OP(uv->ops, struct fi_ops_uverbs, create_ah); + return uv->ops->create_ah(fid, cmd, cmd_size, resp, resp_size); +} + +static inline int +uv_destroy_ah(fid_t fid, + struct ibv_destroy_ah *cmd, size_t cmd_size) +{ + struct fid_uverbs *uv = container_of(fid, struct fid_uverbs, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + FI_ASSERT_OPS(fid, struct fid_uverbs, ops); + FI_ASSERT_OP(uv->ops, struct fi_ops_uverbs, destroy_ah); + return uv->ops->destroy_ah(fid, cmd, cmd_size); +} + +static inline int +uv_open_xrcd(fid_t fid, + struct ibv_open_xrcd *cmd, size_t cmd_size, + struct ibv_open_xrcd_resp *resp, size_t resp_size) +{ + struct fid_uverbs *uv = container_of(fid, struct fid_uverbs, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + FI_ASSERT_OPS(fid, struct fid_uverbs, ops); + FI_ASSERT_OP(uv->ops, struct fi_ops_uverbs, open_xrcd); + return uv->ops->open_xrcd(fid, cmd, cmd_size, resp, resp_size); +} + +static inline int +uv_close_xrcd(fid_t fid, + struct ibv_close_xrcd *cmd, size_t cmd_size) +{ + struct fid_uverbs *uv = container_of(fid, struct fid_uverbs, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + FI_ASSERT_OPS(fid, struct fid_uverbs, ops); + FI_ASSERT_OP(uv->ops, struct fi_ops_uverbs, close_xrcd); + return uv->ops->close_xrcd(fid, cmd, cmd_size); +} + +static inline int +uv_reg_mr(fid_t fid, + struct ibv_reg_mr *cmd, size_t cmd_size, + struct ibv_reg_mr_resp *resp, size_t resp_size) +{ + struct fid_uverbs *uv = container_of(fid, struct fid_uverbs, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + FI_ASSERT_OPS(fid, struct fid_uverbs, ops); + FI_ASSERT_OP(uv->ops, struct fi_ops_uverbs, reg_mr); + return uv->ops->reg_mr(fid, cmd, cmd_size, resp, resp_size); +} + +static inline int +uv_dereg_mr(fid_t fid, + struct ibv_dereg_mr *cmd, size_t cmd_size) +{ + struct fid_uverbs *uv = container_of(fid, struct fid_uverbs, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + FI_ASSERT_OPS(fid, struct fid_uverbs, ops); + FI_ASSERT_OP(uv->ops, struct fi_ops_uverbs, dereg_mr); + return uv->ops->dereg_mr(fid, cmd, cmd_size); +} + +static inline int +uv_create_comp_channel(fid_t fid, + struct ibv_create_comp_channel *cmd, size_t cmd_size, + struct ibv_create_comp_channel_resp *resp, size_t resp_size) +{ + struct fid_uverbs *uv = container_of(fid, struct fid_uverbs, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + FI_ASSERT_OPS(fid, struct fid_uverbs, ops); + FI_ASSERT_OP(uv->ops, struct fi_ops_uverbs, create_comp_channel); + return uv->ops->create_comp_channel(fid, cmd, cmd_size, resp, resp_size); +} + +static inline int +uv_create_cq(fid_t fid, + struct ibv_create_cq *cmd, size_t cmd_size, + struct ibv_create_cq_resp *resp, size_t resp_size) +{ + struct fid_uverbs *uv = container_of(fid, struct fid_uverbs, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + FI_ASSERT_OPS(fid, struct fid_uverbs, ops); + FI_ASSERT_OP(uv->ops, struct fi_ops_uverbs, create_cq); + return uv->ops->create_cq(fid, cmd, cmd_size, resp, resp_size); +} + +static inline int +uv_poll_cq(fid_t fid, + struct ibv_poll_cq *cmd, size_t cmd_size, + struct ibv_poll_cq_resp *resp, size_t resp_size) +{ + struct fid_uverbs *uv = container_of(fid, struct fid_uverbs, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + 
FI_ASSERT_OPS(fid, struct fid_uverbs, ops); + FI_ASSERT_OP(uv->ops, struct fi_ops_uverbs, poll_cq); + return uv->ops->poll_cq(fid, cmd, cmd_size, resp, resp_size); +} + +static inline int +uv_req_notify_cq(fid_t fid, + struct ibv_req_notify_cq *cmd, size_t cmd_size) +{ + struct fid_uverbs *uv = container_of(fid, struct fid_uverbs, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + FI_ASSERT_OPS(fid, struct fid_uverbs, ops); + FI_ASSERT_OP(uv->ops, struct fi_ops_uverbs, req_notify_cq); + return uv->ops->req_notify_cq(fid, cmd, cmd_size); +} + +static inline int +uv_resize_cq(fid_t fid, + struct ibv_resize_cq *cmd, size_t cmd_size, + struct ibv_resize_cq_resp *resp, size_t resp_size) +{ + struct fid_uverbs *uv = container_of(fid, struct fid_uverbs, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + FI_ASSERT_OPS(fid, struct fid_uverbs, ops); + FI_ASSERT_OP(uv->ops, struct fi_ops_uverbs, resize_cq); + return uv->ops->resize_cq(fid, cmd, cmd_size, resp, resp_size); +} + +static inline int +uv_destroy_cq(fid_t fid, + struct ibv_destroy_cq *cmd, size_t cmd_size, + struct ibv_destroy_cq_resp *resp, size_t resp_size) +{ + struct fid_uverbs *uv = container_of(fid, struct fid_uverbs, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + FI_ASSERT_OPS(fid, struct fid_uverbs, ops); + FI_ASSERT_OP(uv->ops, struct fi_ops_uverbs, destroy_cq); + return uv->ops->destroy_cq(fid, cmd, cmd_size, resp, resp_size); +} + +static inline int +uv_create_srq(fid_t fid, + struct ibv_create_srq *cmd, size_t cmd_size, + struct ibv_create_srq_resp *resp, size_t resp_size) +{ + struct fid_uverbs *uv = container_of(fid, struct fid_uverbs, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + FI_ASSERT_OPS(fid, struct fid_uverbs, ops); + FI_ASSERT_OP(uv->ops, struct fi_ops_uverbs, create_srq); + return uv->ops->create_srq(fid, cmd, cmd_size, resp, resp_size); +} + +static inline int +uv_modify_srq(fid_t fid, + struct ibv_modify_srq *cmd, size_t cmd_size) +{ + struct fid_uverbs *uv = container_of(fid, struct fid_uverbs, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + FI_ASSERT_OPS(fid, struct fid_uverbs, ops); + FI_ASSERT_OP(uv->ops, struct fi_ops_uverbs, modify_srq); + return uv->ops->modify_srq(fid, cmd, cmd_size); +} + +static inline int +uv_query_srq(fid_t fid, + struct ibv_query_srq *cmd, size_t cmd_size, + struct ibv_query_srq_resp *resp, size_t resp_size) +{ + struct fid_uverbs *uv = container_of(fid, struct fid_uverbs, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + FI_ASSERT_OPS(fid, struct fid_uverbs, ops); + FI_ASSERT_OP(uv->ops, struct fi_ops_uverbs, query_srq); + return uv->ops->query_srq(fid, cmd, cmd_size, resp, resp_size); +} + +static inline int +uv_destroy_srq(fid_t fid, + struct ibv_destroy_srq *cmd, size_t cmd_size, + struct ibv_destroy_srq_resp *resp, size_t resp_size) +{ + struct fid_uverbs *uv = container_of(fid, struct fid_uverbs, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + FI_ASSERT_OPS(fid, struct fid_uverbs, ops); + FI_ASSERT_OP(uv->ops, struct fi_ops_uverbs, destroy_srq); + return uv->ops->destroy_srq(fid, cmd, cmd_size, resp, resp_size); +} + +static inline int +uv_create_qp(fid_t fid, + struct ibv_create_qp *cmd, size_t cmd_size, + struct ibv_create_qp_resp *resp, size_t resp_size) +{ + struct fid_uverbs *uv = container_of(fid, struct fid_uverbs, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + FI_ASSERT_OPS(fid, struct fid_uverbs, ops); + FI_ASSERT_OP(uv->ops, struct fi_ops_uverbs, create_qp); + return uv->ops->create_qp(fid, cmd, cmd_size, resp, resp_size); +} + 
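The uv_*() wrappers above all follow the same pattern: recover the enclosing struct fid_uverbs from the fid_t, sanity-check the fid class and ops table, and forward the marshalled kernel command through the provider's fi_ops_uverbs entry. The sketch below (illustrative only, not part of this patch) shows how a provider might wire up such an ops table; the handler names are hypothetical, and the command/response marshalling details of the kernel uverbs ABI are omitted.

    #include <errno.h>
    #include <unistd.h>
    #include <rdma/fi_uverbs.h>

    /* Hypothetical handler: forward a get_context command to the kernel
     * uverbs device file referenced by uv->fd.  Returning results through
     * the command's response buffer is omitted for brevity. */
    static int x_get_context(fid_t fid, struct ibv_get_context *cmd,
                             size_t cmd_size,
                             struct ibv_get_context_resp *resp,
                             size_t resp_size)
    {
            /* Same container_of pattern used by the uv_*() wrappers above. */
            struct fid_uverbs *uv = container_of(fid, struct fid_uverbs, fid);

            (void) resp; (void) resp_size;
            if (write(uv->fd, cmd, cmd_size) != (ssize_t) cmd_size)
                    return -errno;
            return 0;
    }

    static struct fi_ops_uverbs x_uverbs_ops = {
            .get_context = x_get_context,
            /* ... remaining command handlers follow the same shape ... */
    };

A provider would then place a struct fid_uverbs around its open command fd, point its ops member at a table like x_uverbs_ops, and hand the embedded fid back to callers of the "uverbs" (FI_UVERBS_INTERFACE) interface.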
+static inline int +uv_open_qp(fid_t fid, + struct ibv_open_qp *cmd, size_t cmd_size, + struct ibv_create_qp_resp *resp, size_t resp_size) +{ + struct fid_uverbs *uv = container_of(fid, struct fid_uverbs, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + FI_ASSERT_OPS(fid, struct fid_uverbs, ops); + FI_ASSERT_OP(uv->ops, struct fi_ops_uverbs, open_qp); + return uv->ops->open_qp(fid, cmd, cmd_size, resp, resp_size); +} + +static inline int +uv_query_qp(fid_t fid, + struct ibv_query_qp *cmd, size_t cmd_size, + struct ibv_query_qp_resp *resp, size_t resp_size) +{ + struct fid_uverbs *uv = container_of(fid, struct fid_uverbs, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + FI_ASSERT_OPS(fid, struct fid_uverbs, ops); + FI_ASSERT_OP(uv->ops, struct fi_ops_uverbs, query_qp); + return uv->ops->query_qp(fid, cmd, cmd_size, resp, resp_size); +} + +static inline int +uv_modify_qp(fid_t fid, + struct ibv_modify_qp *cmd, size_t cmd_size) +{ + struct fid_uverbs *uv = container_of(fid, struct fid_uverbs, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + FI_ASSERT_OPS(fid, struct fid_uverbs, ops); + FI_ASSERT_OP(uv->ops, struct fi_ops_uverbs, modify_qp); + return uv->ops->modify_qp(fid, cmd, cmd_size); +} + +static inline int +uv_destroy_qp(fid_t fid, + struct ibv_destroy_qp *cmd, size_t cmd_size, + struct ibv_destroy_qp_resp *resp, size_t resp_size) +{ + struct fid_uverbs *uv = container_of(fid, struct fid_uverbs, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + FI_ASSERT_OPS(fid, struct fid_uverbs, ops); + FI_ASSERT_OP(uv->ops, struct fi_ops_uverbs, destroy_qp); + return uv->ops->destroy_qp(fid, cmd, cmd_size, resp, resp_size); +} + +static inline int +uv_post_send(fid_t fid, + struct ibv_post_send *cmd, size_t cmd_size, + struct ibv_post_send_resp *resp, size_t resp_size) +{ + struct fid_uverbs *uv = container_of(fid, struct fid_uverbs, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + FI_ASSERT_OPS(fid, struct fid_uverbs, ops); + FI_ASSERT_OP(uv->ops, struct fi_ops_uverbs, post_send); + return uv->ops->post_send(fid, cmd, cmd_size, resp, resp_size); +} + +static inline int +uv_post_recv(fid_t fid, + struct ibv_post_recv *cmd, size_t cmd_size, + struct ibv_post_recv_resp *resp, size_t resp_size) +{ + struct fid_uverbs *uv = container_of(fid, struct fid_uverbs, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + FI_ASSERT_OPS(fid, struct fid_uverbs, ops); + FI_ASSERT_OP(uv->ops, struct fi_ops_uverbs, post_recv); + return uv->ops->post_recv(fid, cmd, cmd_size, resp, resp_size); +} + +static inline int +uv_post_srq_recv(fid_t fid, + struct ibv_post_srq_recv *cmd, size_t cmd_size, + struct ibv_post_srq_recv_resp *resp, size_t resp_size) +{ + struct fid_uverbs *uv = container_of(fid, struct fid_uverbs, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + FI_ASSERT_OPS(fid, struct fid_uverbs, ops); + FI_ASSERT_OP(uv->ops, struct fi_ops_uverbs, post_srq_recv); + return uv->ops->post_srq_recv(fid, cmd, cmd_size, resp, resp_size); +} + +static inline int +uv_attach_mcast(fid_t fid, + struct ibv_attach_mcast *cmd, size_t cmd_size) +{ + struct fid_uverbs *uv = container_of(fid, struct fid_uverbs, fid); + FI_ASSERT_CLASS(fid, FID_CLASS_INTERFACE); + FI_ASSERT_OPS(fid, struct fid_uverbs, ops); + FI_ASSERT_OP(uv->ops, struct fi_ops_uverbs, attach_mcast); + return uv->ops->attach_mcast(fid, cmd, cmd_size); +} + +static inline int +uv_detach_mcast(fid_t fid, + struct ibv_detach_mcast *cmd, size_t cmd_size) +{ + struct fid_uverbs *uv = container_of(fid, struct fid_uverbs, fid); + FI_ASSERT_CLASS(fid, 
FID_CLASS_INTERFACE); + FI_ASSERT_OPS(fid, struct fid_uverbs, ops); + FI_ASSERT_OP(uv->ops, struct fi_ops_uverbs, detach_mcast); + return uv->ops->detach_mcast(fid, cmd, cmd_size); +} + + +#ifdef __cplusplus +} +#endif + +#endif /* _FI_UVERBS_H_ */ diff --git a/libfabric.spec.in b/libfabric.spec.in new file mode 100644 index 00000000000..c59388c94c8 --- /dev/null +++ b/libfabric.spec.in @@ -0,0 +1,71 @@ +%define ver @VERSION@ + +Name: libfabric +Version: 0.0.1 +Release: 1%{?dist} +Summary: Userspace RDMA Fabric Interfaces + +Group: System Environment/Libraries +License: GPLv2 or BSD +Url: http://www.openfabrics.org/ +Source: http://www.openfabrics.org/downloads/fabrics/%{name}-%{version}.tar.gz +BuildRoot: %{_tmppath}/%{name}-%{version}-%{release}-root-%(%{__id_u} -n) + +%description +libfabric provides a userspace API to access high-performance fabric +services, such as RDMA. + +%package devel +Summary: Development files for the libfabric library +Group: System Environment/Libraries + +%description devel +Development files for the libfabric library. + +%package utils +Summary: Examples for the libfabric library +Group: System Environment/Libraries +Requires: %{name} = %{version}-%{release} + +%description utils +Example test programs for the libfabric library. + +%prep +%setup -q -n %{name}-%{ver} + +%build +%configure +make %{?_smp_mflags} + +%install +rm -rf $RPM_BUILD_ROOT +%makeinstall +# remove unpackaged files from the buildroot +rm -f $RPM_BUILD_ROOT%{_libdir}/*.la + +%clean +rm -rf $RPM_BUILD_ROOT + +%post -p /sbin/ldconfig +%postun -p /sbin/ldconfig + +%files +%defattr(-,root,root,-) +%{_libdir}/lib*.so.* +%doc AUTHORS COPYING README + +%files devel +%defattr(-,root,root) +%{_libdir}/libfabric*.so +%{_libdir}/*.a +%{_includedir}/* +%{_mandir}/man3/* +%{_mandir}/man7/* + +%files utils +%defattr(-,root,root,-) +%{_bindir}/* +%{_mandir}/man1/* + +%changelog + diff --git a/man/fi_getinfo.3 b/man/fi_getinfo.3 new file mode 100755 index 00000000000..4b4987f7915 --- /dev/null +++ b/man/fi_getinfo.3 @@ -0,0 +1,53 @@ +.TH "FI_GETINFO" 3 "2013-11-01" "libfabric" "Libfabric Programmer's Manual" libfabric +.SH NAME +fi_getinfo / fi_freeinfo \- Obtain/free fabric interface information +.SH SYNOPSIS +.B "#include <rdma/fabric.h>" +.P +.B "int" fi_getinfo +.BI "(char *" node "," +.BI "char *" service "," +.BI "struct fi_info *" hints "," +.BI "struct fi_info **" info ");" +.P +.B "int" fi_freeinfo +.BI "(struct fi_info *" info ");" +.SH ARGUMENTS +.IP "node" 12 +Optional, name or fabric address to resolve. +.IP "service" 12 +Optional, service name or port number of address. +.IP "hints" 12 +Reference to an fi_info structure containing hints about the type +of service the caller supports. +.IP "info" 12 +A pointer to a linked list of fi_info structures containing response +information. +.SH "DESCRIPTION" +Resolves the destination node and service address and returns +information needed to establish communication. Provides the +fabric interface functional equivalent to getaddrinfo. +.SH "RETURN VALUE" +Returns 0 on success, or errno on error. +.SH "NOTES" +Either node, service, or hints must be provided. If hints are provided, the +operation will be controlled by hints.ai_flags. If FI_PASSIVE is +specified, the call will resolve address information for use on the +passive side of a connection. +If node is provided, fi_getinfo will attempt to resolve the fabric address +to the given node. The hints parameter, if provided, +may be used to control the resulting output as indicated below. 
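A minimal usage sketch (illustrative, not part of the manual page) based only on the prototypes and fi_info fields documented here; the node and service strings are placeholders:

    #include <stdio.h>
    #include <rdma/fabric.h>

    int print_fabric_info(void)
    {
            struct fi_info *info, *cur;
            int ret;

            ret = fi_getinfo("node0", "7471", NULL, &info);
            if (ret)
                    return ret;

            /* Walk the linked list of results; next is NULL on the last entry. */
            for (cur = info; cur; cur = cur->next)
                    printf("fi_info entry of size %zu\n", (size_t) cur->size);

            fi_freeinfo(info);
            return 0;
    }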
+If node is not given, fi_getinfo will attempt to resolve the fabric addressing +information based on the provided hints. +The caller must call fi_freeinfo to release fi_info structures returned +by this call. +.SH "fi_info" +.IP "next" 12 +Pointer to the next fi_info structure in the list. Will be NULL +if no more structures exist. +.IP "size" 12 +Size of the fi_info structure, used for compatibility. +.IP "write more!!!" 12 +Details are left as an exercise for the reader. +.SH "SEE ALSO" +fi_open(3) diff --git a/man/fi_open.3 b/man/fi_open.3 new file mode 100644 index 00000000000..e2c9896bf30 --- /dev/null +++ b/man/fi_open.3 @@ -0,0 +1,27 @@ +.TH "FI_OPEN" 3 "2013-11-01" "libfabric" "Libfabric Programmer's Manual" libfabric +.SH NAME +fi_open / fi_close \- Open/close a fabric interface +.SH SYNOPSIS +.B "#include <rdma/fabric.h>" +.P +.B "int" fi_open +.BI "(char *" name "," +.BI WRITE ME!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +.BI "fid_t *" fid ");" +.P +.B "int" fi_close +.BI "(fid_t " fid ");" +.SH ARGUMENTS +.IP "name" 12 +The name of the interface to open. +.IP "fid" 12 +On success, points to the opened interface. +.SH "DESCRIPTION" +Opens a fabric interface. +.SH "RETURN VALUE" +Returns 0 on success, or errno on error. +.SH "NOTES" +Opens a fabric interface. Users should call fi_close to release +the interface. +.SH "SEE ALSO" +fi_getinfo(3) diff --git a/man/fi_socket.3 b/man/fi_socket.3 new file mode 100644 index 00000000000..83ce8cf2e48 --- /dev/null +++ b/man/fi_socket.3 @@ -0,0 +1,30 @@ +.TH "FI_SOCKET" 3 "2013-11-01" "libfabric" "Libfabric Programmer's Manual" libfabric +.SH NAME +fi_socket / fi_close \- Allocate/close a fabric interface socket +.SH SYNOPSIS +.B "#include <rdma/fabric.h>" +.P +.B "int" fi_socket +.BI "(struct fi_info *" info "," +.BI "fid_t *" fid "," +.BI "void *" context ");" +.P +.B "int" fi_close +.BI "(fid_t " fid ");" +.SH ARGUMENTS +.IP "info" 12 +Details about the fabric interface socket to be opened. +.IP "fid" 12 +On success, points to the opened fabric socket. +.IP "context" 12 +User specified context associated with the socket. +.SH "DESCRIPTION" +Opens a fabric interface socket. +.SH "RETURN VALUE" +Returns 0 on success, or errno on error. +.SH "NOTES" +Opens a fabric socket corresponding to the requested fabric interface +information. Users should call fi_close to release all resources allocated +fo the fabric socket. +.SH "SEE ALSO" +fi_getinfo(3), fi_open(3) diff --git a/prov/ibverbs/AUTHORS b/prov/ibverbs/AUTHORS new file mode 100644 index 00000000000..fcea3504a51 --- /dev/null +++ b/prov/ibverbs/AUTHORS @@ -0,0 +1,4 @@ +Roland Dreier <roland@topspin.com> +Dotan Barak <dotanba@gmail.com> +Sean Hefty <sean.hefty@intel.com> +Michael S. Tsirkin <mst@mellanox.co.il> diff --git a/prov/ibverbs/COPYING b/prov/ibverbs/COPYING new file mode 100644 index 00000000000..ee1a79ffabf --- /dev/null +++ b/prov/ibverbs/COPYING @@ -0,0 +1,378 @@ +This software is available to you under a choice of one of two +licenses. You may choose to be licensed under the terms of the the +OpenIB.org BSD license or the GNU General Public License (GPL) Version +2, both included below. + +Copyright (c) 2004 Topspin Communications. All rights reserved. 
+ +================================================================== + + OpenIB.org BSD license + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + +================================================================== + + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc. + 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Library General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. 
+ + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. 
+ + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. 
+ +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. 
If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. 
+ + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) year name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + <signature of Ty Coon>, 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Library General +Public License instead of this License. diff --git a/prov/ibverbs/include/infiniband/driver.h b/prov/ibverbs/include/infiniband/driver.h new file mode 100644 index 00000000000..49353b632c6 --- /dev/null +++ b/prov/ibverbs/include/infiniband/driver.h @@ -0,0 +1,145 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005, 2006 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2005 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef INFINIBAND_DRIVER_H +#define INFINIBAND_DRIVER_H + +#include <infiniband/verbs.h> +#include <rdma/fi_uverbs.h> + + +#ifdef __cplusplus +# define BEGIN_C_DECLS extern "C" { +# define END_C_DECLS } +#else /* !__cplusplus */ +# define BEGIN_C_DECLS +# define END_C_DECLS +#endif /* __cplusplus */ + +/* + * Extension that low-level drivers should add to their .so filename + * (probably via libtool "-release" option). For example a low-level + * driver named "libfoo" should build a plug-in named "libfoo-rdmav2.so". + */ +#define IBV_DEVICE_LIBRARY_EXTENSION rdmav2 + +typedef struct ibv_device *(*ibv_driver_init_func)(const char *uverbs_sys_path, + int abi_version); + +void ibv_register_driver(const char *name, ibv_driver_init_func init_func); +int ibv_cmd_get_context(struct ibv_context *context, struct ibv_get_context *cmd, + size_t cmd_size, struct ibv_get_context_resp *resp, + size_t resp_size); +int ibv_cmd_query_device(struct ibv_context *context, + struct ibv_device_attr *device_attr, + uint64_t *raw_fw_ver, + struct ibv_query_device *cmd, size_t cmd_size); +int ibv_cmd_query_port(struct ibv_context *context, uint8_t port_num, + struct ibv_port_attr *port_attr, + struct ibv_query_port *cmd, size_t cmd_size); +int ibv_cmd_query_gid(struct ibv_context *context, uint8_t port_num, + int index, union ibv_gid *gid); +int ibv_cmd_query_pkey(struct ibv_context *context, uint8_t port_num, + int index, uint16_t *pkey); +int ibv_cmd_alloc_pd(struct ibv_context *context, struct ibv_pd *pd, + struct ibv_alloc_pd *cmd, size_t cmd_size, + struct ibv_alloc_pd_resp *resp, size_t resp_size); +int ibv_cmd_dealloc_pd(struct ibv_pd *pd); +#define IBV_CMD_REG_MR_HAS_RESP_PARAMS +int ibv_cmd_reg_mr(struct ibv_pd *pd, void *addr, size_t length, + uint64_t hca_va, int access, + struct ibv_mr *mr, struct ibv_reg_mr *cmd, + size_t cmd_size, + struct ibv_reg_mr_resp *resp, size_t resp_size); +int ibv_cmd_dereg_mr(struct ibv_mr *mr); +int ibv_cmd_create_cq(struct ibv_context *context, int cqe, + struct ibv_comp_channel *channel, + int comp_vector, struct ibv_cq *cq, + struct ibv_create_cq *cmd, size_t cmd_size, + struct ibv_create_cq_resp *resp, size_t resp_size); +int ibv_cmd_poll_cq(struct ibv_cq *cq, int ne, struct ibv_wc *wc); +int ibv_cmd_req_notify_cq(struct ibv_cq *cq, int solicited_only); +#define IBV_CMD_RESIZE_CQ_HAS_RESP_PARAMS +int ibv_cmd_resize_cq(struct ibv_cq *cq, int cqe, + struct ibv_resize_cq *cmd, size_t cmd_size, + struct ibv_resize_cq_resp *resp, size_t resp_size); +int ibv_cmd_destroy_cq(struct ibv_cq *cq); + +int ibv_cmd_create_srq(struct ibv_pd *pd, + struct ibv_srq *srq, struct ibv_srq_init_attr *attr, + struct ibv_create_srq *cmd, size_t cmd_size, + struct ibv_create_srq_resp *resp, size_t resp_size); +int ibv_cmd_modify_srq(struct ibv_srq *srq, + struct ibv_srq_attr 
*srq_attr, + int srq_attr_mask, + struct ibv_modify_srq *cmd, size_t cmd_size); +int ibv_cmd_query_srq(struct ibv_srq *srq, + struct ibv_srq_attr *srq_attr, + struct ibv_query_srq *cmd, size_t cmd_size); +int ibv_cmd_destroy_srq(struct ibv_srq *srq); + +int ibv_cmd_create_qp(struct ibv_pd *pd, + struct ibv_qp *qp, struct ibv_qp_init_attr *attr, + struct ibv_create_qp *cmd, size_t cmd_size, + struct ibv_create_qp_resp *resp, size_t resp_size); +int ibv_cmd_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *qp_attr, + int attr_mask, + struct ibv_qp_init_attr *qp_init_attr, + struct ibv_query_qp *cmd, size_t cmd_size); +int ibv_cmd_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, + int attr_mask, + struct ibv_modify_qp *cmd, size_t cmd_size); +int ibv_cmd_destroy_qp(struct ibv_qp *qp); +int ibv_cmd_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr, + struct ibv_send_wr **bad_wr); +int ibv_cmd_post_recv(struct ibv_qp *ibqp, struct ibv_recv_wr *wr, + struct ibv_recv_wr **bad_wr); +int ibv_cmd_post_srq_recv(struct ibv_srq *srq, struct ibv_recv_wr *wr, + struct ibv_recv_wr **bad_wr); +int ibv_cmd_create_ah(struct ibv_pd *pd, struct ibv_ah *ah, + struct ibv_ah_attr *attr); +int ibv_cmd_destroy_ah(struct ibv_ah *ah); +int ibv_cmd_attach_mcast(struct ibv_qp *qp, const union ibv_gid *gid, uint16_t lid); +int ibv_cmd_detach_mcast(struct ibv_qp *qp, const union ibv_gid *gid, uint16_t lid); + +int ibv_dontfork_range(void *base, size_t size); +int ibv_dofork_range(void *base, size_t size); + +/* + * sysfs helper functions + */ +#define ibv_get_sysfs_path fi_sysfs_path +#define ibv_read_sysfs_file fi_read_file + +#endif /* INFINIBAND_DRIVER_H */ diff --git a/prov/ibverbs/include/infiniband/marshall.h b/prov/ibverbs/include/infiniband/marshall.h new file mode 100644 index 00000000000..48493fa121d --- /dev/null +++ b/prov/ibverbs/include/infiniband/marshall.h @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2005 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef INFINIBAND_MARSHALL_H +#define INFINIBAND_MARSHALL_H + +#include <infiniband/verbs.h> +#include <rdma/fi_uverbs.h> +#include <rdma/fi_ucma.h> + +#ifdef __cplusplus +# define BEGIN_C_DECLS extern "C" { +# define END_C_DECLS } +#else /* !__cplusplus */ +# define BEGIN_C_DECLS +# define END_C_DECLS +#endif /* __cplusplus */ + +BEGIN_C_DECLS + +void ibv_copy_qp_attr_from_kern(struct ibv_qp_attr *dst, + struct ibv_kern_qp_attr *src); + +void ibv_copy_ah_attr_from_kern(struct ibv_ah_attr *dst, + struct ibv_kern_ah_attr *src); + +void ibv_copy_path_rec_from_kern(struct ibv_sa_path_rec *dst, + struct ibv_kern_path_rec *src); + +void ibv_copy_path_rec_to_kern(struct ibv_kern_path_rec *dst, + struct ibv_sa_path_rec *src); + +END_C_DECLS + +#endif /* INFINIBAND_MARSHALL_H */ diff --git a/prov/ibverbs/include/infiniband/opcode.h b/prov/ibverbs/include/infiniband/opcode.h new file mode 100644 index 00000000000..fd4bc96a2c9 --- /dev/null +++ b/prov/ibverbs/include/infiniband/opcode.h @@ -0,0 +1,147 @@ +/* + * Copyright (c) 2005 Topspin Communications. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef INFINIBAND_OPCODE_H +#define INFINIBAND_OPCODE_H + +/* + * This macro cleans up the definitions of constants for BTH opcodes. + * It is used to define constants such as IBV_OPCODE_UD_SEND_ONLY, + * which becomes IBV_OPCODE_UD + IBV_OPCODE_SEND_ONLY, and this gives + * the correct value. + * + * In short, user code should use the constants defined using the + * macro rather than worrying about adding together other constants. 
+*/ +#define IBV_OPCODE(transport, op) \ + IBV_OPCODE_ ## transport ## _ ## op = \ + IBV_OPCODE_ ## transport + IBV_OPCODE_ ## op + +enum { + /* transport types -- just used to define real constants */ + IBV_OPCODE_RC = 0x00, + IBV_OPCODE_UC = 0x20, + IBV_OPCODE_RD = 0x40, + IBV_OPCODE_UD = 0x60, + + /* operations -- just used to define real constants */ + IBV_OPCODE_SEND_FIRST = 0x00, + IBV_OPCODE_SEND_MIDDLE = 0x01, + IBV_OPCODE_SEND_LAST = 0x02, + IBV_OPCODE_SEND_LAST_WITH_IMMEDIATE = 0x03, + IBV_OPCODE_SEND_ONLY = 0x04, + IBV_OPCODE_SEND_ONLY_WITH_IMMEDIATE = 0x05, + IBV_OPCODE_RDMA_WRITE_FIRST = 0x06, + IBV_OPCODE_RDMA_WRITE_MIDDLE = 0x07, + IBV_OPCODE_RDMA_WRITE_LAST = 0x08, + IBV_OPCODE_RDMA_WRITE_LAST_WITH_IMMEDIATE = 0x09, + IBV_OPCODE_RDMA_WRITE_ONLY = 0x0a, + IBV_OPCODE_RDMA_WRITE_ONLY_WITH_IMMEDIATE = 0x0b, + IBV_OPCODE_RDMA_READ_REQUEST = 0x0c, + IBV_OPCODE_RDMA_READ_RESPONSE_FIRST = 0x0d, + IBV_OPCODE_RDMA_READ_RESPONSE_MIDDLE = 0x0e, + IBV_OPCODE_RDMA_READ_RESPONSE_LAST = 0x0f, + IBV_OPCODE_RDMA_READ_RESPONSE_ONLY = 0x10, + IBV_OPCODE_ACKNOWLEDGE = 0x11, + IBV_OPCODE_ATOMIC_ACKNOWLEDGE = 0x12, + IBV_OPCODE_COMPARE_SWAP = 0x13, + IBV_OPCODE_FETCH_ADD = 0x14, + + /* real constants follow -- see comment about above IBV_OPCODE() + macro for more details */ + + /* RC */ + IBV_OPCODE(RC, SEND_FIRST), + IBV_OPCODE(RC, SEND_MIDDLE), + IBV_OPCODE(RC, SEND_LAST), + IBV_OPCODE(RC, SEND_LAST_WITH_IMMEDIATE), + IBV_OPCODE(RC, SEND_ONLY), + IBV_OPCODE(RC, SEND_ONLY_WITH_IMMEDIATE), + IBV_OPCODE(RC, RDMA_WRITE_FIRST), + IBV_OPCODE(RC, RDMA_WRITE_MIDDLE), + IBV_OPCODE(RC, RDMA_WRITE_LAST), + IBV_OPCODE(RC, RDMA_WRITE_LAST_WITH_IMMEDIATE), + IBV_OPCODE(RC, RDMA_WRITE_ONLY), + IBV_OPCODE(RC, RDMA_WRITE_ONLY_WITH_IMMEDIATE), + IBV_OPCODE(RC, RDMA_READ_REQUEST), + IBV_OPCODE(RC, RDMA_READ_RESPONSE_FIRST), + IBV_OPCODE(RC, RDMA_READ_RESPONSE_MIDDLE), + IBV_OPCODE(RC, RDMA_READ_RESPONSE_LAST), + IBV_OPCODE(RC, RDMA_READ_RESPONSE_ONLY), + IBV_OPCODE(RC, ACKNOWLEDGE), + IBV_OPCODE(RC, ATOMIC_ACKNOWLEDGE), + IBV_OPCODE(RC, COMPARE_SWAP), + IBV_OPCODE(RC, FETCH_ADD), + + /* UC */ + IBV_OPCODE(UC, SEND_FIRST), + IBV_OPCODE(UC, SEND_MIDDLE), + IBV_OPCODE(UC, SEND_LAST), + IBV_OPCODE(UC, SEND_LAST_WITH_IMMEDIATE), + IBV_OPCODE(UC, SEND_ONLY), + IBV_OPCODE(UC, SEND_ONLY_WITH_IMMEDIATE), + IBV_OPCODE(UC, RDMA_WRITE_FIRST), + IBV_OPCODE(UC, RDMA_WRITE_MIDDLE), + IBV_OPCODE(UC, RDMA_WRITE_LAST), + IBV_OPCODE(UC, RDMA_WRITE_LAST_WITH_IMMEDIATE), + IBV_OPCODE(UC, RDMA_WRITE_ONLY), + IBV_OPCODE(UC, RDMA_WRITE_ONLY_WITH_IMMEDIATE), + + /* RD */ + IBV_OPCODE(RD, SEND_FIRST), + IBV_OPCODE(RD, SEND_MIDDLE), + IBV_OPCODE(RD, SEND_LAST), + IBV_OPCODE(RD, SEND_LAST_WITH_IMMEDIATE), + IBV_OPCODE(RD, SEND_ONLY), + IBV_OPCODE(RD, SEND_ONLY_WITH_IMMEDIATE), + IBV_OPCODE(RD, RDMA_WRITE_FIRST), + IBV_OPCODE(RD, RDMA_WRITE_MIDDLE), + IBV_OPCODE(RD, RDMA_WRITE_LAST), + IBV_OPCODE(RD, RDMA_WRITE_LAST_WITH_IMMEDIATE), + IBV_OPCODE(RD, RDMA_WRITE_ONLY), + IBV_OPCODE(RD, RDMA_WRITE_ONLY_WITH_IMMEDIATE), + IBV_OPCODE(RD, RDMA_READ_REQUEST), + IBV_OPCODE(RD, RDMA_READ_RESPONSE_FIRST), + IBV_OPCODE(RD, RDMA_READ_RESPONSE_MIDDLE), + IBV_OPCODE(RD, RDMA_READ_RESPONSE_LAST), + IBV_OPCODE(RD, RDMA_READ_RESPONSE_ONLY), + IBV_OPCODE(RD, ACKNOWLEDGE), + IBV_OPCODE(RD, ATOMIC_ACKNOWLEDGE), + IBV_OPCODE(RD, COMPARE_SWAP), + IBV_OPCODE(RD, FETCH_ADD), + + /* UD */ + IBV_OPCODE(UD, SEND_ONLY), + IBV_OPCODE(UD, SEND_ONLY_WITH_IMMEDIATE) +}; + +#endif /* INFINIBAND_OPCODE_H */ diff --git a/prov/ibverbs/include/infiniband/verbs.h 
b/prov/ibverbs/include/infiniband/verbs.h new file mode 100644 index 00000000000..a0158ff1df8 --- /dev/null +++ b/prov/ibverbs/include/infiniband/verbs.h @@ -0,0 +1,1158 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2004 Intel Corporation. All rights reserved. + * Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2005 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef INFINIBAND_VERBS_H +#define INFINIBAND_VERBS_H + +#include <stdint.h> +#include <pthread.h> +#include <rdma/fabric.h> + +#ifdef __cplusplus +# define BEGIN_C_DECLS extern "C" { +# define END_C_DECLS } +#else /* !__cplusplus */ +# define BEGIN_C_DECLS +# define END_C_DECLS +#endif /* __cplusplus */ + +#if __GNUC__ >= 3 +# define __attribute_const __attribute__((const)) +#else +# define __attribute_const +#endif + +BEGIN_C_DECLS + +union ibv_gid { + uint8_t raw[16]; + struct { + uint64_t subnet_prefix; + uint64_t interface_id; + } global; +}; + +struct ibv_sa_path_rec { + union ibv_gid dgid; + union ibv_gid sgid; + uint16_t dlid; + uint16_t slid; + int raw_traffic; + /* reserved */ + uint32_t flow_label; + uint8_t hop_limit; + uint8_t traffic_class; + int reversible; + uint8_t numb_path; + uint16_t pkey; + /* reserved */ + uint8_t sl; + uint8_t mtu_selector; + uint8_t mtu; + uint8_t rate_selector; + uint8_t rate; + uint8_t packet_life_time_selector; + uint8_t packet_life_time; + uint8_t preference; +}; + +enum ibv_node_type { + IBV_NODE_UNKNOWN = -1, + IBV_NODE_CA = 1, + IBV_NODE_SWITCH, + IBV_NODE_ROUTER, + IBV_NODE_RNIC +}; + +enum ibv_transport_type { + IBV_TRANSPORT_UNKNOWN = -1, + IBV_TRANSPORT_IB = 0, + IBV_TRANSPORT_IWARP +}; + +enum ibv_device_cap_flags { + IBV_DEVICE_RESIZE_MAX_WR = 1, + IBV_DEVICE_BAD_PKEY_CNTR = 1 << 1, + IBV_DEVICE_BAD_QKEY_CNTR = 1 << 2, + IBV_DEVICE_RAW_MULTI = 1 << 3, + IBV_DEVICE_AUTO_PATH_MIG = 1 << 4, + IBV_DEVICE_CHANGE_PHY_PORT = 1 << 5, + IBV_DEVICE_UD_AV_PORT_ENFORCE = 1 << 6, + IBV_DEVICE_CURR_QP_STATE_MOD = 1 << 7, + IBV_DEVICE_SHUTDOWN_PORT = 1 << 8, + IBV_DEVICE_INIT_TYPE = 1 << 9, + IBV_DEVICE_PORT_ACTIVE_EVENT = 1 << 10, + IBV_DEVICE_SYS_IMAGE_GUID = 1 << 11, + IBV_DEVICE_RC_RNR_NAK_GEN = 1 << 12, + IBV_DEVICE_SRQ_RESIZE = 1 << 13, + IBV_DEVICE_N_NOTIFY_CQ = 1 << 14 +}; + +enum ibv_atomic_cap { + IBV_ATOMIC_NONE, + IBV_ATOMIC_HCA, + IBV_ATOMIC_GLOB +}; + +struct ibv_device_attr { + char fw_ver[64]; + uint64_t node_guid; + uint64_t sys_image_guid; + uint64_t max_mr_size; + uint64_t page_size_cap; + uint32_t vendor_id; + uint32_t vendor_part_id; + uint32_t hw_ver; + int max_qp; + int max_qp_wr; + int device_cap_flags; + int max_sge; + int max_sge_rd; + int max_cq; + int max_cqe; + int max_mr; + int max_pd; + int max_qp_rd_atom; + int max_ee_rd_atom; + int max_res_rd_atom; + int max_qp_init_rd_atom; + int max_ee_init_rd_atom; + enum ibv_atomic_cap atomic_cap; + int max_ee; + int max_rdd; + int max_mw; + int max_raw_ipv6_qp; + int max_raw_ethy_qp; + int max_mcast_grp; + int max_mcast_qp_attach; + int max_total_mcast_qp_attach; + int max_ah; + int max_fmr; + int max_map_per_fmr; + int max_srq; + int max_srq_wr; + int max_srq_sge; + uint16_t max_pkeys; + uint8_t local_ca_ack_delay; + uint8_t phys_port_cnt; +}; + +enum ibv_mtu { + IBV_MTU_256 = 1, + IBV_MTU_512 = 2, + IBV_MTU_1024 = 3, + IBV_MTU_2048 = 4, + IBV_MTU_4096 = 5 +}; + +enum ibv_port_state { + IBV_PORT_NOP = 0, + IBV_PORT_DOWN = 1, + IBV_PORT_INIT = 2, + IBV_PORT_ARMED = 3, + IBV_PORT_ACTIVE = 4, + IBV_PORT_ACTIVE_DEFER = 5 +}; + +enum { + IBV_LINK_LAYER_UNSPECIFIED, + IBV_LINK_LAYER_INFINIBAND, + IBV_LINK_LAYER_ETHERNET, +}; + +struct ibv_port_attr { + enum ibv_port_state state; + enum ibv_mtu max_mtu; + enum ibv_mtu active_mtu; + int gid_tbl_len; + uint32_t port_cap_flags; + uint32_t max_msg_sz; + uint32_t bad_pkey_cntr; + uint32_t qkey_viol_cntr; + uint16_t pkey_tbl_len; + uint16_t lid; + uint16_t sm_lid; + uint8_t lmc; + uint8_t max_vl_num; + uint8_t sm_sl; + uint8_t 
subnet_timeout; + uint8_t init_type_reply; + uint8_t active_width; + uint8_t active_speed; + uint8_t phys_state; + uint8_t link_layer; + uint8_t reserved; +}; + +enum ibv_event_type { + IBV_EVENT_CQ_ERR, + IBV_EVENT_QP_FATAL, + IBV_EVENT_QP_REQ_ERR, + IBV_EVENT_QP_ACCESS_ERR, + IBV_EVENT_COMM_EST, + IBV_EVENT_SQ_DRAINED, + IBV_EVENT_PATH_MIG, + IBV_EVENT_PATH_MIG_ERR, + IBV_EVENT_DEVICE_FATAL, + IBV_EVENT_PORT_ACTIVE, + IBV_EVENT_PORT_ERR, + IBV_EVENT_LID_CHANGE, + IBV_EVENT_PKEY_CHANGE, + IBV_EVENT_SM_CHANGE, + IBV_EVENT_SRQ_ERR, + IBV_EVENT_SRQ_LIMIT_REACHED, + IBV_EVENT_QP_LAST_WQE_REACHED, + IBV_EVENT_CLIENT_REREGISTER, + IBV_EVENT_GID_CHANGE, +}; + +struct ibv_async_event { + union { + struct ibv_cq *cq; + struct ibv_qp *qp; + struct ibv_srq *srq; + int port_num; + } element; + enum ibv_event_type event_type; +}; + +enum ibv_wc_status { + IBV_WC_SUCCESS, + IBV_WC_LOC_LEN_ERR, + IBV_WC_LOC_QP_OP_ERR, + IBV_WC_LOC_EEC_OP_ERR, + IBV_WC_LOC_PROT_ERR, + IBV_WC_WR_FLUSH_ERR, + IBV_WC_MW_BIND_ERR, + IBV_WC_BAD_RESP_ERR, + IBV_WC_LOC_ACCESS_ERR, + IBV_WC_REM_INV_REQ_ERR, + IBV_WC_REM_ACCESS_ERR, + IBV_WC_REM_OP_ERR, + IBV_WC_RETRY_EXC_ERR, + IBV_WC_RNR_RETRY_EXC_ERR, + IBV_WC_LOC_RDD_VIOL_ERR, + IBV_WC_REM_INV_RD_REQ_ERR, + IBV_WC_REM_ABORT_ERR, + IBV_WC_INV_EECN_ERR, + IBV_WC_INV_EEC_STATE_ERR, + IBV_WC_FATAL_ERR, + IBV_WC_RESP_TIMEOUT_ERR, + IBV_WC_GENERAL_ERR +}; +const char *ibv_wc_status_str(enum ibv_wc_status status); + +enum ibv_wc_opcode { + IBV_WC_SEND, + IBV_WC_RDMA_WRITE, + IBV_WC_RDMA_READ, + IBV_WC_COMP_SWAP, + IBV_WC_FETCH_ADD, + IBV_WC_BIND_MW, +/* + * Set value of IBV_WC_RECV so consumers can test if a completion is a + * receive by testing (opcode & IBV_WC_RECV). + */ + IBV_WC_RECV = 1 << 7, + IBV_WC_RECV_RDMA_WITH_IMM +}; + +enum ibv_wc_flags { + IBV_WC_GRH = 1 << 0, + IBV_WC_WITH_IMM = 1 << 1 +}; + +struct ibv_wc { + uint64_t wr_id; + enum ibv_wc_status status; + enum ibv_wc_opcode opcode; + uint32_t vendor_err; + uint32_t byte_len; + uint32_t imm_data; /* in network byte order */ + uint32_t qp_num; + uint32_t src_qp; + int wc_flags; + uint16_t pkey_index; + uint16_t slid; + uint8_t sl; + uint8_t dlid_path_bits; +}; + +enum ibv_access_flags { + IBV_ACCESS_LOCAL_WRITE = 1, + IBV_ACCESS_REMOTE_WRITE = (1<<1), + IBV_ACCESS_REMOTE_READ = (1<<2), + IBV_ACCESS_REMOTE_ATOMIC = (1<<3), + IBV_ACCESS_MW_BIND = (1<<4) +}; + +struct ibv_pd { + struct ibv_context *context; + uint32_t handle; +}; + +enum ibv_rereg_mr_flags { + IBV_REREG_MR_CHANGE_TRANSLATION = (1 << 0), + IBV_REREG_MR_CHANGE_PD = (1 << 1), + IBV_REREG_MR_CHANGE_ACCESS = (1 << 2), + IBV_REREG_MR_KEEP_VALID = (1 << 3) +}; + +struct ibv_mr { + struct ibv_context *context; + struct ibv_pd *pd; + void *addr; + size_t length; + uint32_t handle; + uint32_t lkey; + uint32_t rkey; +}; + +enum ibv_mw_type { + IBV_MW_TYPE_1 = 1, + IBV_MW_TYPE_2 = 2 +}; + +struct ibv_mw { + struct ibv_context *context; + struct ibv_pd *pd; + uint32_t rkey; +}; + +struct ibv_global_route { + union ibv_gid dgid; + uint32_t flow_label; + uint8_t sgid_index; + uint8_t hop_limit; + uint8_t traffic_class; +}; + +struct ibv_grh { + uint32_t version_tclass_flow; + uint16_t paylen; + uint8_t next_hdr; + uint8_t hop_limit; + union ibv_gid sgid; + union ibv_gid dgid; +}; + +enum ibv_rate { + IBV_RATE_MAX = 0, + IBV_RATE_2_5_GBPS = 2, + IBV_RATE_5_GBPS = 5, + IBV_RATE_10_GBPS = 3, + IBV_RATE_20_GBPS = 6, + IBV_RATE_30_GBPS = 4, + IBV_RATE_40_GBPS = 7, + IBV_RATE_60_GBPS = 8, + IBV_RATE_80_GBPS = 9, + IBV_RATE_120_GBPS = 10, + IBV_RATE_14_GBPS = 11, + 
IBV_RATE_56_GBPS = 12, + IBV_RATE_112_GBPS = 13, + IBV_RATE_168_GBPS = 14, + IBV_RATE_25_GBPS = 15, + IBV_RATE_100_GBPS = 16, + IBV_RATE_200_GBPS = 17, + IBV_RATE_300_GBPS = 18 +}; + +/** + * ibv_rate_to_mult - Convert the IB rate enum to a multiple of the + * base rate of 2.5 Gbit/sec. For example, IBV_RATE_5_GBPS will be + * converted to 2, since 5 Gbit/sec is 2 * 2.5 Gbit/sec. + * @rate: rate to convert. + */ +int ibv_rate_to_mult(enum ibv_rate rate) __attribute_const; + +/** + * mult_to_ibv_rate - Convert a multiple of 2.5 Gbit/sec to an IB rate enum. + * @mult: multiple to convert. + */ +enum ibv_rate mult_to_ibv_rate(int mult) __attribute_const; + +/** + * ibv_rate_to_mbps - Convert the IB rate enum to Mbit/sec. + * For example, IBV_RATE_5_GBPS will return the value 5000. + * @rate: rate to convert. + */ +int ibv_rate_to_mbps(enum ibv_rate rate) __attribute_const; + +/** + * mbps_to_ibv_rate - Convert a Mbit/sec value to an IB rate enum. + * @mbps: value to convert. + */ +enum ibv_rate mbps_to_ibv_rate(int mbps) __attribute_const; + +struct ibv_ah_attr { + struct ibv_global_route grh; + uint16_t dlid; + uint8_t sl; + uint8_t src_path_bits; + uint8_t static_rate; + uint8_t is_global; + uint8_t port_num; +}; + +enum ibv_srq_attr_mask { + IBV_SRQ_MAX_WR = 1 << 0, + IBV_SRQ_LIMIT = 1 << 1 +}; + +struct ibv_srq_attr { + uint32_t max_wr; + uint32_t max_sge; + uint32_t srq_limit; +}; + +struct ibv_srq_init_attr { + void *srq_context; + struct ibv_srq_attr attr; +}; + +enum ibv_qp_type { + IBV_QPT_RC = 2, + IBV_QPT_UC, + IBV_QPT_UD, + IBV_QPT_RAW_PACKET = 8 +}; + +struct ibv_qp_cap { + uint32_t max_send_wr; + uint32_t max_recv_wr; + uint32_t max_send_sge; + uint32_t max_recv_sge; + uint32_t max_inline_data; +}; + +struct ibv_qp_init_attr { + void *qp_context; + struct ibv_cq *send_cq; + struct ibv_cq *recv_cq; + struct ibv_srq *srq; + struct ibv_qp_cap cap; + enum ibv_qp_type qp_type; + int sq_sig_all; +}; + +enum ibv_qp_attr_mask { + IBV_QP_STATE = 1 << 0, + IBV_QP_CUR_STATE = 1 << 1, + IBV_QP_EN_SQD_ASYNC_NOTIFY = 1 << 2, + IBV_QP_ACCESS_FLAGS = 1 << 3, + IBV_QP_PKEY_INDEX = 1 << 4, + IBV_QP_PORT = 1 << 5, + IBV_QP_QKEY = 1 << 6, + IBV_QP_AV = 1 << 7, + IBV_QP_PATH_MTU = 1 << 8, + IBV_QP_TIMEOUT = 1 << 9, + IBV_QP_RETRY_CNT = 1 << 10, + IBV_QP_RNR_RETRY = 1 << 11, + IBV_QP_RQ_PSN = 1 << 12, + IBV_QP_MAX_QP_RD_ATOMIC = 1 << 13, + IBV_QP_ALT_PATH = 1 << 14, + IBV_QP_MIN_RNR_TIMER = 1 << 15, + IBV_QP_SQ_PSN = 1 << 16, + IBV_QP_MAX_DEST_RD_ATOMIC = 1 << 17, + IBV_QP_PATH_MIG_STATE = 1 << 18, + IBV_QP_CAP = 1 << 19, + IBV_QP_DEST_QPN = 1 << 20 +}; + +enum ibv_qp_state { + IBV_QPS_RESET, + IBV_QPS_INIT, + IBV_QPS_RTR, + IBV_QPS_RTS, + IBV_QPS_SQD, + IBV_QPS_SQE, + IBV_QPS_ERR +}; + +enum ibv_mig_state { + IBV_MIG_MIGRATED, + IBV_MIG_REARM, + IBV_MIG_ARMED +}; + +struct ibv_qp_attr { + enum ibv_qp_state qp_state; + enum ibv_qp_state cur_qp_state; + enum ibv_mtu path_mtu; + enum ibv_mig_state path_mig_state; + uint32_t qkey; + uint32_t rq_psn; + uint32_t sq_psn; + uint32_t dest_qp_num; + int qp_access_flags; + struct ibv_qp_cap cap; + struct ibv_ah_attr ah_attr; + struct ibv_ah_attr alt_ah_attr; + uint16_t pkey_index; + uint16_t alt_pkey_index; + uint8_t en_sqd_async_notify; + uint8_t sq_draining; + uint8_t max_rd_atomic; + uint8_t max_dest_rd_atomic; + uint8_t min_rnr_timer; + uint8_t port_num; + uint8_t timeout; + uint8_t retry_cnt; + uint8_t rnr_retry; + uint8_t alt_port_num; + uint8_t alt_timeout; +}; + +enum ibv_wr_opcode { + IBV_WR_RDMA_WRITE, + IBV_WR_RDMA_WRITE_WITH_IMM, + IBV_WR_SEND, + 
IBV_WR_SEND_WITH_IMM, + IBV_WR_RDMA_READ, + IBV_WR_ATOMIC_CMP_AND_SWP, + IBV_WR_ATOMIC_FETCH_AND_ADD +}; + +enum ibv_send_flags { + IBV_SEND_FENCE = 1 << 0, + IBV_SEND_SIGNALED = 1 << 1, + IBV_SEND_SOLICITED = 1 << 2, + IBV_SEND_INLINE = 1 << 3 +}; + +struct ibv_sge { + uint64_t addr; + uint32_t length; + uint32_t lkey; +}; + +struct ibv_send_wr { + uint64_t wr_id; + struct ibv_send_wr *next; + struct ibv_sge *sg_list; + int num_sge; + enum ibv_wr_opcode opcode; + int send_flags; + uint32_t imm_data; /* in network byte order */ + union { + struct { + uint64_t remote_addr; + uint32_t rkey; + } rdma; + struct { + uint64_t remote_addr; + uint64_t compare_add; + uint64_t swap; + uint32_t rkey; + } atomic; + struct { + struct ibv_ah *ah; + uint32_t remote_qpn; + uint32_t remote_qkey; + } ud; + } wr; +}; + +struct ibv_recv_wr { + uint64_t wr_id; + struct ibv_recv_wr *next; + struct ibv_sge *sg_list; + int num_sge; +}; + +struct ibv_mw_bind { + uint64_t wr_id; + struct ibv_mr *mr; + void *addr; + size_t length; + int send_flags; + int mw_access_flags; +}; + +struct ibv_srq { + struct ibv_context *context; + void *srq_context; + struct ibv_pd *pd; + uint32_t handle; + + pthread_mutex_t mutex; + pthread_cond_t cond; + uint32_t events_completed; +}; + +struct ibv_qp { + struct ibv_context *context; + void *qp_context; + struct ibv_pd *pd; + struct ibv_cq *send_cq; + struct ibv_cq *recv_cq; + struct ibv_srq *srq; + uint32_t handle; + uint32_t qp_num; + enum ibv_qp_state state; + enum ibv_qp_type qp_type; + + pthread_mutex_t mutex; + pthread_cond_t cond; + uint32_t events_completed; +}; + +struct ibv_comp_channel { + struct ibv_context *context; + int fd; + int refcnt; +}; + +struct ibv_cq { + struct ibv_context *context; + struct ibv_comp_channel *channel; + void *cq_context; + uint32_t handle; + int cqe; + + pthread_mutex_t mutex; + pthread_cond_t cond; + uint32_t comp_events_completed; + uint32_t async_events_completed; +}; + +struct ibv_ah { + struct ibv_context *context; + struct ibv_pd *pd; + uint32_t handle; +}; + +struct ibv_device; +struct ibv_context; + +struct ibv_device_ops { + struct ibv_context * (*alloc_context)(struct ibv_device *device, fid_t fid); + void (*free_context)(struct ibv_context *context); +}; + +enum { + IBV_SYSFS_NAME_MAX = 64, + IBV_SYSFS_PATH_MAX = 256 +}; + +struct ibv_device { + struct ibv_device_ops ops; + enum ibv_node_type node_type; + enum ibv_transport_type transport_type; + /* Name of underlying kernel IB device, eg "mthca0" */ + char name[IBV_SYSFS_NAME_MAX]; + /* Name of uverbs device, eg "uverbs0" */ + char dev_name[IBV_SYSFS_NAME_MAX]; + /* Path to infiniband_verbs class device in sysfs */ + char dev_path[IBV_SYSFS_PATH_MAX]; + /* Path to infiniband class device in sysfs */ + char ibdev_path[IBV_SYSFS_PATH_MAX]; +}; + +struct ibv_context_ops { + int (*query_device)(struct ibv_context *context, + struct ibv_device_attr *device_attr); + int (*query_port)(struct ibv_context *context, uint8_t port_num, + struct ibv_port_attr *port_attr); + struct ibv_pd * (*alloc_pd)(struct ibv_context *context); + int (*dealloc_pd)(struct ibv_pd *pd); + struct ibv_mr * (*reg_mr)(struct ibv_pd *pd, void *addr, size_t length, + int access); + struct ibv_mr * (*rereg_mr)(struct ibv_mr *mr, + int flags, + struct ibv_pd *pd, void *addr, + size_t length, + int access); + int (*dereg_mr)(struct ibv_mr *mr); + struct ibv_mw * (*alloc_mw)(struct ibv_pd *pd, enum ibv_mw_type type); + int (*bind_mw)(struct ibv_qp *qp, struct ibv_mw *mw, + struct ibv_mw_bind *mw_bind); + int 
(*dealloc_mw)(struct ibv_mw *mw); + struct ibv_cq * (*create_cq)(struct ibv_context *context, int cqe, + struct ibv_comp_channel *channel, + int comp_vector); + int (*poll_cq)(struct ibv_cq *cq, int num_entries, struct ibv_wc *wc); + int (*req_notify_cq)(struct ibv_cq *cq, int solicited_only); + void (*cq_event)(struct ibv_cq *cq); + int (*resize_cq)(struct ibv_cq *cq, int cqe); + int (*destroy_cq)(struct ibv_cq *cq); + struct ibv_srq * (*create_srq)(struct ibv_pd *pd, + struct ibv_srq_init_attr *srq_init_attr); + int (*modify_srq)(struct ibv_srq *srq, + struct ibv_srq_attr *srq_attr, + int srq_attr_mask); + int (*query_srq)(struct ibv_srq *srq, + struct ibv_srq_attr *srq_attr); + int (*destroy_srq)(struct ibv_srq *srq); + int (*post_srq_recv)(struct ibv_srq *srq, + struct ibv_recv_wr *recv_wr, + struct ibv_recv_wr **bad_recv_wr); + struct ibv_qp * (*create_qp)(struct ibv_pd *pd, struct ibv_qp_init_attr *attr); + int (*query_qp)(struct ibv_qp *qp, struct ibv_qp_attr *attr, + int attr_mask, + struct ibv_qp_init_attr *init_attr); + int (*modify_qp)(struct ibv_qp *qp, struct ibv_qp_attr *attr, + int attr_mask); + int (*destroy_qp)(struct ibv_qp *qp); + int (*post_send)(struct ibv_qp *qp, struct ibv_send_wr *wr, + struct ibv_send_wr **bad_wr); + int (*post_recv)(struct ibv_qp *qp, struct ibv_recv_wr *wr, + struct ibv_recv_wr **bad_wr); + struct ibv_ah * (*create_ah)(struct ibv_pd *pd, struct ibv_ah_attr *attr); + int (*destroy_ah)(struct ibv_ah *ah); + int (*attach_mcast)(struct ibv_qp *qp, const union ibv_gid *gid, + uint16_t lid); + int (*detach_mcast)(struct ibv_qp *qp, const union ibv_gid *gid, + uint16_t lid); + void (*async_event)(struct ibv_async_event *event); +}; + +struct ibv_context { + struct ibv_device *device; + struct ibv_context_ops ops; + int cmd_fd; + int async_fd; + int num_comp_vectors; + pthread_mutex_t mutex; + void *abi_compat; + fid_t uv_fid; +}; + +/** + * ibv_get_device_list - Get list of IB devices currently available + * @num_devices: optional. if non-NULL, set to the number of devices + * returned in the array. + * + * Return a NULL-terminated array of IB devices. The array can be + * released with ibv_free_device_list(). + */ +struct ibv_device **ibv_get_device_list(int *num_devices); + +/** + * ibv_free_device_list - Free list from ibv_get_device_list() + * + * Free an array of devices returned from ibv_get_device_list(). Once + * the array is freed, pointers to devices that were not opened with + * ibv_open_device() are no longer valid. Client code must open all + * devices it intends to use before calling ibv_free_device_list(). + */ +void ibv_free_device_list(struct ibv_device **list); + +/** + * ibv_get_device_name - Return kernel device name + */ +const char *ibv_get_device_name(struct ibv_device *device); + +/** + * ibv_get_device_guid - Return device's node GUID + */ +uint64_t ibv_get_device_guid(struct ibv_device *device); + +/** + * ibv_open_device - Initialize device for use + */ +struct ibv_context *ibv_open_device(struct ibv_device *device); + +/** + * ibv_close_device - Release device + */ +int ibv_close_device(struct ibv_context *context); + +/** + * ibv_get_async_event - Get next async event + * @event: Pointer to use to return async event + * + * All async events returned by ibv_get_async_event() must eventually + * be acknowledged with ibv_ack_async_event(). + */ +int ibv_get_async_event(struct ibv_context *context, + struct ibv_async_event *event); + +/** + * ibv_ack_async_event - Acknowledge an async event + * @event: Event to be acknowledged. 
+ * + * All async events which are returned by ibv_get_async_event() must + * be acknowledged. To avoid races, destroying an object (CQ, SRQ or + * QP) will wait for all affiliated events to be acknowledged, so + * there should be a one-to-one correspondence between acks and + * successful gets. + */ +void ibv_ack_async_event(struct ibv_async_event *event); + +/** + * ibv_query_device - Get device properties + */ +int ibv_query_device(struct ibv_context *context, + struct ibv_device_attr *device_attr); + +/** + * ibv_query_port - Get port properties + */ +int ibv_query_port(struct ibv_context *context, uint8_t port_num, + struct ibv_port_attr *port_attr); + +/** + * ibv_query_gid - Get a GID table entry + */ +int ibv_query_gid(struct ibv_context *context, uint8_t port_num, + int index, union ibv_gid *gid); + +/** + * ibv_query_pkey - Get a P_Key table entry + */ +int ibv_query_pkey(struct ibv_context *context, uint8_t port_num, + int index, uint16_t *pkey); + +/** + * ibv_alloc_pd - Allocate a protection domain + */ +struct ibv_pd *ibv_alloc_pd(struct ibv_context *context); + +/** + * ibv_dealloc_pd - Free a protection domain + */ +int ibv_dealloc_pd(struct ibv_pd *pd); + +/** + * ibv_reg_mr - Register a memory region + */ +struct ibv_mr *ibv_reg_mr(struct ibv_pd *pd, void *addr, + size_t length, int access); + +/** + * ibv_dereg_mr - Deregister a memory region + */ +int ibv_dereg_mr(struct ibv_mr *mr); + +/** + * ibv_create_comp_channel - Create a completion event channel + */ +struct ibv_comp_channel *ibv_create_comp_channel(struct ibv_context *context); + +/** + * ibv_destroy_comp_channel - Destroy a completion event channel + */ +int ibv_destroy_comp_channel(struct ibv_comp_channel *channel); + +/** + * ibv_create_cq - Create a completion queue + * @context - Context CQ will be attached to + * @cqe - Minimum number of entries required for CQ + * @cq_context - Consumer-supplied context returned for completion events + * @channel - Completion channel where completion events will be queued. + * May be NULL if completion events will not be used. + * @comp_vector - Completion vector used to signal completion events. + * Must be >= 0 and < context->num_comp_vectors. + */ +struct ibv_cq *ibv_create_cq(struct ibv_context *context, int cqe, + void *cq_context, + struct ibv_comp_channel *channel, + int comp_vector); + +/** + * ibv_resize_cq - Modifies the capacity of the CQ. + * @cq: The CQ to resize. + * @cqe: The minimum size of the CQ. + * + * Users can examine the cq structure to determine the actual CQ size. + */ +int ibv_resize_cq(struct ibv_cq *cq, int cqe); + +/** + * ibv_destroy_cq - Destroy a completion queue + */ +int ibv_destroy_cq(struct ibv_cq *cq); + +/** + * ibv_get_cq_event - Read next CQ event + * @channel: Channel to get next event from. + * @cq: Used to return pointer to CQ. + * @cq_context: Used to return consumer-supplied CQ context. + * + * All completion events returned by ibv_get_cq_event() must + * eventually be acknowledged with ibv_ack_cq_events(). + */ +int ibv_get_cq_event(struct ibv_comp_channel *channel, + struct ibv_cq **cq, void **cq_context); + +/** + * ibv_ack_cq_events - Acknowledge CQ completion events + * @cq: CQ to acknowledge events for + * @nevents: Number of events to acknowledge. + * + * All completion events which are returned by ibv_get_cq_event() must + * be acknowledged. To avoid races, ibv_destroy_cq() will wait for + * all completion events to be acknowledged, so there should be a + * one-to-one correspondence between acks and successful gets. 
An + * application may accumulate multiple completion events and + * acknowledge them in a single call to ibv_ack_cq_events() by passing + * the number of events to ack in @nevents. + */ +void ibv_ack_cq_events(struct ibv_cq *cq, unsigned int nevents); + +/** + * ibv_poll_cq - Poll a CQ for work completions + * @cq: the CQ being polled + * @num_entries: maximum number of completions to return + * @wc: array of at least @num_entries of &struct ibv_wc where completions + * will be returned + * + * Poll a CQ for (possibly multiple) completions. If the return value + * is < 0, an error occurred. If the return value is >= 0, it is the + * number of completions returned. If the return value is + * non-negative and strictly less than num_entries, then the CQ was + * emptied. + */ +static inline int ibv_poll_cq(struct ibv_cq *cq, int num_entries, struct ibv_wc *wc) +{ + return cq->context->ops.poll_cq(cq, num_entries, wc); +} + +/** + * ibv_req_notify_cq - Request completion notification on a CQ. An + * event will be added to the completion channel associated with the + * CQ when an entry is added to the CQ. + * @cq: The completion queue to request notification for. + * @solicited_only: If non-zero, an event will be generated only for + * the next solicited CQ entry. If zero, any CQ entry, solicited or + * not, will generate an event. + */ +static inline int ibv_req_notify_cq(struct ibv_cq *cq, int solicited_only) +{ + return cq->context->ops.req_notify_cq(cq, solicited_only); +} + +/** + * ibv_create_srq - Creates an SRQ associated with the specified protection + * domain. + * @pd: The protection domain associated with the SRQ. + * @srq_init_attr: A list of initial attributes required to create the SRQ. + * + * srq_attr->max_wr and srq_attr->max_sge are read to determine the + * requested size of the SRQ, and set to the actual values allocated + * on return. If ibv_create_srq() succeeds, then max_wr and max_sge + * will always be at least as large as the requested values. + */ +struct ibv_srq *ibv_create_srq(struct ibv_pd *pd, + struct ibv_srq_init_attr *srq_init_attr); + +/** + * ibv_modify_srq - Modifies the attributes for the specified SRQ. + * @srq: The SRQ to modify. + * @srq_attr: On input, specifies the SRQ attributes to modify. On output, + * the current values of selected SRQ attributes are returned. + * @srq_attr_mask: A bit-mask used to specify which attributes of the SRQ + * are being modified. + * + * The mask may contain IBV_SRQ_MAX_WR to resize the SRQ and/or + * IBV_SRQ_LIMIT to set the SRQ's limit and request notification when + * the number of receives queued drops below the limit. + */ +int ibv_modify_srq(struct ibv_srq *srq, + struct ibv_srq_attr *srq_attr, + int srq_attr_mask); + +/** + * ibv_query_srq - Returns the attribute list and current values for the + * specified SRQ. + * @srq: The SRQ to query. + * @srq_attr: The attributes of the specified SRQ. + */ +int ibv_query_srq(struct ibv_srq *srq, struct ibv_srq_attr *srq_attr); + +/** + * ibv_destroy_srq - Destroys the specified SRQ. + * @srq: The SRQ to destroy. + */ +int ibv_destroy_srq(struct ibv_srq *srq); + +/** + * ibv_post_srq_recv - Posts a list of work requests to the specified SRQ. + * @srq: The SRQ to post the work request on. + * @recv_wr: A list of work requests to post on the receive queue. + * @bad_recv_wr: On an immediate failure, this parameter will reference + * the work request that failed to be posted on the SRQ.
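+ * + * A minimal usage sketch; buf, len, mr, and srq below are illustrative + * placeholders assumed to have been created elsewhere: + * + *	struct ibv_sge sge = { + *		.addr   = (uintptr_t) buf, + *		.length = len, + *		.lkey   = mr->lkey + *	}; + *	struct ibv_recv_wr wr = { .wr_id = 1, .sg_list = &sge, .num_sge = 1 }; + *	struct ibv_recv_wr *bad_wr; + * + *	if (ibv_post_srq_recv(srq, &wr, &bad_wr)) + *		fprintf(stderr, "post_srq_recv failed at wr_id %llu\n", + *			(unsigned long long) bad_wr->wr_id);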
+ */ +static inline int ibv_post_srq_recv(struct ibv_srq *srq, + struct ibv_recv_wr *recv_wr, + struct ibv_recv_wr **bad_recv_wr) +{ + return srq->context->ops.post_srq_recv(srq, recv_wr, bad_recv_wr); +} + +/** + * ibv_create_qp - Create a queue pair. + */ +struct ibv_qp *ibv_create_qp(struct ibv_pd *pd, + struct ibv_qp_init_attr *qp_init_attr); + +/** + * ibv_modify_qp - Modify a queue pair. + */ +int ibv_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, + int attr_mask); + +/** + * ibv_query_qp - Returns the attribute list and current values for the + * specified QP. + * @qp: The QP to query. + * @attr: The attributes of the specified QP. + * @attr_mask: A bit-mask used to select specific attributes to query. + * @init_attr: Additional attributes of the selected QP. + * + * The qp_attr_mask may be used to limit the query to gathering only the + * selected attributes. + */ +int ibv_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, + int attr_mask, + struct ibv_qp_init_attr *init_attr); + +/** + * ibv_destroy_qp - Destroy a queue pair. + */ +int ibv_destroy_qp(struct ibv_qp *qp); + +/** + * ibv_post_send - Post a list of work requests to a send queue. + * + * If IBV_SEND_INLINE flag is set, the data buffers can be reused + * immediately after the call returns. + */ +static inline int ibv_post_send(struct ibv_qp *qp, struct ibv_send_wr *wr, + struct ibv_send_wr **bad_wr) +{ + return qp->context->ops.post_send(qp, wr, bad_wr); +} + +/** + * ibv_post_recv - Post a list of work requests to a receive queue. + */ +static inline int ibv_post_recv(struct ibv_qp *qp, struct ibv_recv_wr *wr, + struct ibv_recv_wr **bad_wr) +{ + return qp->context->ops.post_recv(qp, wr, bad_wr); +} + +/** + * ibv_create_ah - Create an address handle. + */ +struct ibv_ah *ibv_create_ah(struct ibv_pd *pd, struct ibv_ah_attr *attr); + +/** + * ibv_init_ah_from_wc - Initializes address handle attributes from a + * work completion. + * @context: Device context on which the received message arrived. + * @port_num: Port on which the received message arrived. + * @wc: Work completion associated with the received message. + * @grh: References the received global route header. This parameter is + * ignored unless the work completion indicates that the GRH is valid. + * @ah_attr: Returned attributes that can be used when creating an address + * handle for replying to the message. + */ +int ibv_init_ah_from_wc(struct ibv_context *context, uint8_t port_num, + struct ibv_wc *wc, struct ibv_grh *grh, + struct ibv_ah_attr *ah_attr); + +/** + * ibv_create_ah_from_wc - Creates an address handle associated with the + * sender of the specified work completion. + * @pd: The protection domain associated with the address handle. + * @wc: Work completion information associated with a received message. + * @grh: References the received global route header. This parameter is + * ignored unless the work completion indicates that the GRH is valid. + * @port_num: The outbound port number to associate with the address. + * + * The address handle is used to reference a local or global destination + * in all UD QP post sends. + */ +struct ibv_ah *ibv_create_ah_from_wc(struct ibv_pd *pd, struct ibv_wc *wc, + struct ibv_grh *grh, uint8_t port_num); + +/** + * ibv_destroy_ah - Destroy an address handle. + */ +int ibv_destroy_ah(struct ibv_ah *ah); + +/** + * ibv_attach_mcast - Attaches the specified QP to a multicast group. + * @qp: QP to attach to the multicast group. The QP must be a UD QP. + * @gid: Multicast group GID. 
+ * @lid: Multicast group LID in host byte order. + * + * In order to route multicast packets correctly, subnet + * administration must have created the multicast group and configured + * the fabric appropriately. The port associated with the specified + * QP must also be a member of the multicast group. + */ +int ibv_attach_mcast(struct ibv_qp *qp, const union ibv_gid *gid, uint16_t lid); + +/** + * ibv_detach_mcast - Detaches the specified QP from a multicast group. + * @qp: QP to detach from the multicast group. + * @gid: Multicast group GID. + * @lid: Multicast group LID in host byte order. + */ +int ibv_detach_mcast(struct ibv_qp *qp, const union ibv_gid *gid, uint16_t lid); + +/** + * ibv_fork_init - Prepare data structures so that fork() may be used + * safely. If this function is not called or returns a non-zero + * status, then libibverbs data structures are not fork()-safe and the + * effect of an application calling fork() is undefined. + */ +int ibv_fork_init(void); + +/** + * ibv_node_type_str - Return string describing node_type enum value + */ +const char *ibv_node_type_str(enum ibv_node_type node_type); + +/** + * ibv_port_state_str - Return string describing port_state enum value + */ +const char *ibv_port_state_str(enum ibv_port_state port_state); + +/** + * ibv_event_type_str - Return string describing event_type enum value + */ +const char *ibv_event_type_str(enum ibv_event_type event); + +END_C_DECLS + +# undef __attribute_const + + +#endif /* INFINIBAND_VERBS_H */ diff --git a/prov/ibverbs/src/cmd.c b/prov/ibverbs/src/cmd.c new file mode 100644 index 00000000000..0a240d47237 --- /dev/null +++ b/prov/ibverbs/src/cmd.c @@ -0,0 +1,879 @@ +/* + * Copyright (c) 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 PathScale, Inc. All rights reserved. + * Copyright (c) 2006 Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#if HAVE_CONFIG_H +# include <config.h> +#endif /* HAVE_CONFIG_H */ + +#include <stdio.h> +#include <unistd.h> +#include <stdlib.h> +#include <errno.h> +#include <alloca.h> +#include <string.h> + +#include <rdma/fi_uverbs.h> +#include "ibverbs.h" + +int ibv_cmd_get_context(struct ibv_context *context, struct ibv_get_context *cmd, + size_t cmd_size, struct ibv_get_context_resp *resp, + size_t resp_size) +{ + int ret; + + ret = uv_get_context(context->uv_fid, cmd, cmd_size, resp, resp_size); + if (ret) + return ret; + + context->async_fd = resp->async_fd; + context->num_comp_vectors = resp->num_comp_vectors; + + return 0; +} + +int ibv_cmd_query_device(struct ibv_context *context, + struct ibv_device_attr *device_attr, + uint64_t *raw_fw_ver, + struct ibv_query_device *cmd, size_t cmd_size) +{ + struct ibv_query_device_resp resp; + int ret; + + ret = uv_query_device(context->uv_fid, cmd, cmd_size, &resp, sizeof resp); + if (ret) + return ret; + + memset(device_attr->fw_ver, 0, sizeof device_attr->fw_ver); + *raw_fw_ver = resp.fw_ver; + device_attr->node_guid = resp.node_guid; + device_attr->sys_image_guid = resp.sys_image_guid; + device_attr->max_mr_size = resp.max_mr_size; + device_attr->page_size_cap = resp.page_size_cap; + device_attr->vendor_id = resp.vendor_id; + device_attr->vendor_part_id = resp.vendor_part_id; + device_attr->hw_ver = resp.hw_ver; + device_attr->max_qp = resp.max_qp; + device_attr->max_qp_wr = resp.max_qp_wr; + device_attr->device_cap_flags = resp.device_cap_flags; + device_attr->max_sge = resp.max_sge; + device_attr->max_sge_rd = resp.max_sge_rd; + device_attr->max_cq = resp.max_cq; + device_attr->max_cqe = resp.max_cqe; + device_attr->max_mr = resp.max_mr; + device_attr->max_pd = resp.max_pd; + device_attr->max_qp_rd_atom = resp.max_qp_rd_atom; + device_attr->max_ee_rd_atom = resp.max_ee_rd_atom; + device_attr->max_res_rd_atom = resp.max_res_rd_atom; + device_attr->max_qp_init_rd_atom = resp.max_qp_init_rd_atom; + device_attr->max_ee_init_rd_atom = resp.max_ee_init_rd_atom; + device_attr->atomic_cap = resp.atomic_cap; + device_attr->max_ee = resp.max_ee; + device_attr->max_rdd = resp.max_rdd; + device_attr->max_mw = resp.max_mw; + device_attr->max_raw_ipv6_qp = resp.max_raw_ipv6_qp; + device_attr->max_raw_ethy_qp = resp.max_raw_ethy_qp; + device_attr->max_mcast_grp = resp.max_mcast_grp; + device_attr->max_mcast_qp_attach = resp.max_mcast_qp_attach; + device_attr->max_total_mcast_qp_attach = resp.max_total_mcast_qp_attach; + device_attr->max_ah = resp.max_ah; + device_attr->max_fmr = resp.max_fmr; + device_attr->max_map_per_fmr = resp.max_map_per_fmr; + device_attr->max_srq = resp.max_srq; + device_attr->max_srq_wr = resp.max_srq_wr; + device_attr->max_srq_sge = resp.max_srq_sge; + device_attr->max_pkeys = resp.max_pkeys; + device_attr->local_ca_ack_delay = resp.local_ca_ack_delay; + device_attr->phys_port_cnt = resp.phys_port_cnt; + + return 0; +} + +int ibv_cmd_query_port(struct ibv_context *context, uint8_t port_num, + struct ibv_port_attr *port_attr, + struct ibv_query_port *cmd, size_t cmd_size) +{ + struct ibv_query_port_resp resp; + int ret; + + cmd->port_num = port_num; + memset(cmd->reserved, 0, sizeof cmd->reserved); + + ret = uv_query_port(context->uv_fid, cmd, cmd_size, &resp, sizeof resp); + if (ret) + return ret; + + port_attr->state = resp.state; + port_attr->max_mtu = resp.max_mtu; + port_attr->active_mtu = resp.active_mtu; + port_attr->gid_tbl_len = resp.gid_tbl_len; + port_attr->port_cap_flags = resp.port_cap_flags; + port_attr->max_msg_sz = 
resp.max_msg_sz; + port_attr->bad_pkey_cntr = resp.bad_pkey_cntr; + port_attr->qkey_viol_cntr = resp.qkey_viol_cntr; + port_attr->pkey_tbl_len = resp.pkey_tbl_len; + port_attr->lid = resp.lid; + port_attr->sm_lid = resp.sm_lid; + port_attr->lmc = resp.lmc; + port_attr->max_vl_num = resp.max_vl_num; + port_attr->sm_sl = resp.sm_sl; + port_attr->subnet_timeout = resp.subnet_timeout; + port_attr->init_type_reply = resp.init_type_reply; + port_attr->active_width = resp.active_width; + port_attr->active_speed = resp.active_speed; + port_attr->phys_state = resp.phys_state; + port_attr->link_layer = resp.link_layer; + + return 0; +} + +int ibv_cmd_alloc_pd(struct ibv_context *context, struct ibv_pd *pd, + struct ibv_alloc_pd *cmd, size_t cmd_size, + struct ibv_alloc_pd_resp *resp, size_t resp_size) +{ + int ret; + + ret = uv_alloc_pd(context->uv_fid, cmd, cmd_size, resp, resp_size); + if (ret) + return ret; + + pd->handle = resp->pd_handle; + pd->context = context; + + return 0; +} + +int ibv_cmd_dealloc_pd(struct ibv_pd *pd) +{ + struct ibv_dealloc_pd cmd; + + cmd.pd_handle = pd->handle; + return uv_dealloc_pd(pd->context->uv_fid, &cmd, sizeof cmd); +} + +int ibv_cmd_reg_mr(struct ibv_pd *pd, void *addr, size_t length, + uint64_t hca_va, int access, + struct ibv_mr *mr, struct ibv_reg_mr *cmd, + size_t cmd_size, + struct ibv_reg_mr_resp *resp, size_t resp_size) +{ + int ret; + + cmd->start = (uintptr_t) addr; + cmd->length = length; + cmd->hca_va = hca_va; + cmd->pd_handle = pd->handle; + cmd->access_flags = access; + + ret = uv_reg_mr(pd->context->uv_fid, cmd, cmd_size, resp, resp_size); + if (ret) + return ret; + + mr->handle = resp->mr_handle; + mr->lkey = resp->lkey; + mr->rkey = resp->rkey; + mr->context = pd->context; + + return 0; +} + +int ibv_cmd_dereg_mr(struct ibv_mr *mr) +{ + struct ibv_dereg_mr cmd; + + cmd.mr_handle = mr->handle; + return uv_dereg_mr(mr->context->uv_fid, &cmd, sizeof cmd); +} + +int ibv_cmd_create_cq(struct ibv_context *context, int cqe, + struct ibv_comp_channel *channel, + int comp_vector, struct ibv_cq *cq, + struct ibv_create_cq *cmd, size_t cmd_size, + struct ibv_create_cq_resp *resp, size_t resp_size) +{ + int ret; + + cmd->user_handle = (uintptr_t) cq; + cmd->cqe = cqe; + cmd->comp_vector = comp_vector; + cmd->comp_channel = channel ? 
channel->fd : -1; + cmd->reserved = 0; + + ret = uv_create_cq(context->uv_fid, cmd, cmd_size, resp, resp_size); + if (ret) + return ret; + + cq->handle = resp->cq_handle; + cq->cqe = resp->cqe; + cq->context = context; + + return 0; +} + +int ibv_cmd_poll_cq(struct ibv_cq *ibcq, int ne, struct ibv_wc *wc) +{ + struct ibv_poll_cq cmd; + struct ibv_poll_cq_resp *resp; + int i; + int rsize; + int ret; + + rsize = sizeof *resp + ne * sizeof(struct ibv_kern_wc); + resp = malloc(rsize); + if (!resp) + return -1; + + cmd.cq_handle = ibcq->handle; + cmd.ne = ne; + + ret = uv_poll_cq(ibcq->context->uv_fid, &cmd, sizeof cmd, resp, rsize); + if (ret) { + errno = ret; + ret = -1; + goto out; + } + + for (i = 0; i < resp->count; i++) { + wc[i].wr_id = resp->wc[i].wr_id; + wc[i].status = resp->wc[i].status; + wc[i].opcode = resp->wc[i].opcode; + wc[i].vendor_err = resp->wc[i].vendor_err; + wc[i].byte_len = resp->wc[i].byte_len; + wc[i].imm_data = resp->wc[i].imm_data; + wc[i].qp_num = resp->wc[i].qp_num; + wc[i].src_qp = resp->wc[i].src_qp; + wc[i].wc_flags = resp->wc[i].wc_flags; + wc[i].pkey_index = resp->wc[i].pkey_index; + wc[i].slid = resp->wc[i].slid; + wc[i].sl = resp->wc[i].sl; + wc[i].dlid_path_bits = resp->wc[i].dlid_path_bits; + } + + ret = resp->count; + +out: + free(resp); + return ret; +} + +int ibv_cmd_req_notify_cq(struct ibv_cq *ibcq, int solicited_only) +{ + struct ibv_req_notify_cq cmd; + + cmd.cq_handle = ibcq->handle; + cmd.solicited = !!solicited_only; + + return uv_req_notify_cq(ibcq->context->uv_fid, &cmd, sizeof cmd); +} + +int ibv_cmd_resize_cq(struct ibv_cq *cq, int cqe, + struct ibv_resize_cq *cmd, size_t cmd_size, + struct ibv_resize_cq_resp *resp, size_t resp_size) +{ + int ret; + + cmd->cq_handle = cq->handle; + cmd->cqe = cqe; + + ret = uv_resize_cq(cq->context->uv_fid, cmd, cmd_size, resp, resp_size); + if (ret) + return ret; + + cq->cqe = resp->cqe; + + return 0; +} + +int ibv_cmd_destroy_cq(struct ibv_cq *cq) +{ + struct ibv_destroy_cq cmd; + struct ibv_destroy_cq_resp resp; + int ret; + + cmd.cq_handle = cq->handle; + cmd.reserved = 0; + + ret = uv_destroy_cq(cq->context->uv_fid, &cmd, sizeof cmd, &resp, sizeof resp); + if (ret) + return ret; + + pthread_mutex_lock(&cq->mutex); + while (cq->comp_events_completed != resp.comp_events_reported || + cq->async_events_completed != resp.async_events_reported) + pthread_cond_wait(&cq->cond, &cq->mutex); + pthread_mutex_unlock(&cq->mutex); + + return 0; +} + +int ibv_cmd_create_srq(struct ibv_pd *pd, + struct ibv_srq *srq, struct ibv_srq_init_attr *attr, + struct ibv_create_srq *cmd, size_t cmd_size, + struct ibv_create_srq_resp *resp, size_t resp_size) +{ + int ret; + + cmd->user_handle = (uintptr_t) srq; + cmd->pd_handle = pd->handle; + cmd->max_wr = attr->attr.max_wr; + cmd->max_sge = attr->attr.max_sge; + cmd->srq_limit = attr->attr.srq_limit; + + ret = uv_create_srq(pd->context->uv_fid, cmd, cmd_size, resp, resp_size); + if (ret) + return ret; + + srq->handle = resp->srq_handle; + srq->context = pd->context; + attr->attr.max_wr = resp->max_wr; + attr->attr.max_sge = resp->max_sge; + + return 0; +} + +int ibv_cmd_modify_srq(struct ibv_srq *srq, + struct ibv_srq_attr *srq_attr, + int srq_attr_mask, + struct ibv_modify_srq *cmd, size_t cmd_size) +{ + cmd->srq_handle = srq->handle; + cmd->attr_mask = srq_attr_mask; + cmd->max_wr = srq_attr->max_wr; + cmd->srq_limit = srq_attr->srq_limit; + + return uv_modify_srq(srq->context->uv_fid, cmd, cmd_size); +} + +int ibv_cmd_query_srq(struct ibv_srq *srq, struct ibv_srq_attr 
*srq_attr, + struct ibv_query_srq *cmd, size_t cmd_size) +{ + struct ibv_query_srq_resp resp; + int ret; + + cmd->srq_handle = srq->handle; + cmd->reserved = 0; + + ret = uv_query_srq(srq->context->uv_fid, cmd, cmd_size, &resp, sizeof resp); + if (ret) + return ret; + + srq_attr->max_wr = resp.max_wr; + srq_attr->max_sge = resp.max_sge; + srq_attr->srq_limit = resp.srq_limit; + + return 0; +} + +int ibv_cmd_destroy_srq(struct ibv_srq *srq) +{ + struct ibv_destroy_srq cmd; + struct ibv_destroy_srq_resp resp; + int ret; + + cmd.srq_handle = srq->handle; + cmd.reserved = 0; + + ret = uv_destroy_srq(srq->context->uv_fid, &cmd, sizeof cmd, &resp, sizeof resp); + if (ret) + return ret; + + pthread_mutex_lock(&srq->mutex); + while (srq->events_completed != resp.events_reported) + pthread_cond_wait(&srq->cond, &srq->mutex); + pthread_mutex_unlock(&srq->mutex); + + return 0; +} + +int ibv_cmd_create_qp(struct ibv_pd *pd, + struct ibv_qp *qp, struct ibv_qp_init_attr *attr, + struct ibv_create_qp *cmd, size_t cmd_size, + struct ibv_create_qp_resp *resp, size_t resp_size) +{ + int ret; + + cmd->user_handle = (uintptr_t) qp; + cmd->pd_handle = pd->handle; + cmd->send_cq_handle = attr->send_cq->handle; + cmd->recv_cq_handle = attr->recv_cq->handle; + cmd->srq_handle = attr->srq ? attr->srq->handle : 0; + cmd->max_send_wr = attr->cap.max_send_wr; + cmd->max_recv_wr = attr->cap.max_recv_wr; + cmd->max_send_sge = attr->cap.max_send_sge; + cmd->max_recv_sge = attr->cap.max_recv_sge; + cmd->max_inline_data = attr->cap.max_inline_data; + cmd->sq_sig_all = attr->sq_sig_all; + cmd->qp_type = attr->qp_type; + cmd->is_srq = !!attr->srq; + cmd->reserved = 0; + + ret = uv_create_qp(pd->context->uv_fid, cmd, cmd_size, resp, resp_size); + if (ret) + return ret; + + qp->handle = resp->qp_handle; + qp->qp_num = resp->qpn; + qp->context = pd->context; + + attr->cap.max_recv_sge = resp->max_recv_sge; + attr->cap.max_send_sge = resp->max_send_sge; + attr->cap.max_recv_wr = resp->max_recv_wr; + attr->cap.max_send_wr = resp->max_send_wr; + attr->cap.max_inline_data = resp->max_inline_data; + + return 0; +} + +int ibv_cmd_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, + int attr_mask, + struct ibv_qp_init_attr *init_attr, + struct ibv_query_qp *cmd, size_t cmd_size) +{ + struct ibv_query_qp_resp resp; + int ret; + + cmd->qp_handle = qp->handle; + cmd->attr_mask = attr_mask; + + ret = uv_query_qp(qp->context->uv_fid, cmd, cmd_size, &resp, sizeof resp); + if (ret) + return ret; + + attr->qkey = resp.qkey; + attr->rq_psn = resp.rq_psn; + attr->sq_psn = resp.sq_psn; + attr->dest_qp_num = resp.dest_qp_num; + attr->qp_access_flags = resp.qp_access_flags; + attr->pkey_index = resp.pkey_index; + attr->alt_pkey_index = resp.alt_pkey_index; + attr->qp_state = resp.qp_state; + attr->cur_qp_state = resp.cur_qp_state; + attr->path_mtu = resp.path_mtu; + attr->path_mig_state = resp.path_mig_state; + attr->sq_draining = resp.sq_draining; + attr->max_rd_atomic = resp.max_rd_atomic; + attr->max_dest_rd_atomic = resp.max_dest_rd_atomic; + attr->min_rnr_timer = resp.min_rnr_timer; + attr->port_num = resp.port_num; + attr->timeout = resp.timeout; + attr->retry_cnt = resp.retry_cnt; + attr->rnr_retry = resp.rnr_retry; + attr->alt_port_num = resp.alt_port_num; + attr->alt_timeout = resp.alt_timeout; + attr->cap.max_send_wr = resp.max_send_wr; + attr->cap.max_recv_wr = resp.max_recv_wr; + attr->cap.max_send_sge = resp.max_send_sge; + attr->cap.max_recv_sge = resp.max_recv_sge; + attr->cap.max_inline_data = resp.max_inline_data; + + 
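+ /* Copy the primary and alternate path address-vector attributes returned by the kernel into the caller's ah_attr and alt_ah_attr. */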
memcpy(attr->ah_attr.grh.dgid.raw, resp.dest.dgid, 16); + attr->ah_attr.grh.flow_label = resp.dest.flow_label; + attr->ah_attr.dlid = resp.dest.dlid; + attr->ah_attr.grh.sgid_index = resp.dest.sgid_index; + attr->ah_attr.grh.hop_limit = resp.dest.hop_limit; + attr->ah_attr.grh.traffic_class = resp.dest.traffic_class; + attr->ah_attr.sl = resp.dest.sl; + attr->ah_attr.src_path_bits = resp.dest.src_path_bits; + attr->ah_attr.static_rate = resp.dest.static_rate; + attr->ah_attr.is_global = resp.dest.is_global; + attr->ah_attr.port_num = resp.dest.port_num; + + memcpy(attr->alt_ah_attr.grh.dgid.raw, resp.alt_dest.dgid, 16); + attr->alt_ah_attr.grh.flow_label = resp.alt_dest.flow_label; + attr->alt_ah_attr.dlid = resp.alt_dest.dlid; + attr->alt_ah_attr.grh.sgid_index = resp.alt_dest.sgid_index; + attr->alt_ah_attr.grh.hop_limit = resp.alt_dest.hop_limit; + attr->alt_ah_attr.grh.traffic_class = resp.alt_dest.traffic_class; + attr->alt_ah_attr.sl = resp.alt_dest.sl; + attr->alt_ah_attr.src_path_bits = resp.alt_dest.src_path_bits; + attr->alt_ah_attr.static_rate = resp.alt_dest.static_rate; + attr->alt_ah_attr.is_global = resp.alt_dest.is_global; + attr->alt_ah_attr.port_num = resp.alt_dest.port_num; + + init_attr->qp_context = qp->qp_context; + init_attr->send_cq = qp->send_cq; + init_attr->recv_cq = qp->recv_cq; + init_attr->srq = qp->srq; + init_attr->qp_type = qp->qp_type; + init_attr->cap.max_send_wr = resp.max_send_wr; + init_attr->cap.max_recv_wr = resp.max_recv_wr; + init_attr->cap.max_send_sge = resp.max_send_sge; + init_attr->cap.max_recv_sge = resp.max_recv_sge; + init_attr->cap.max_inline_data = resp.max_inline_data; + init_attr->sq_sig_all = resp.sq_sig_all; + + return 0; +} + +int ibv_cmd_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, + int attr_mask, + struct ibv_modify_qp *cmd, size_t cmd_size) +{ + cmd->qp_handle = qp->handle; + cmd->attr_mask = attr_mask; + cmd->qkey = attr->qkey; + cmd->rq_psn = attr->rq_psn; + cmd->sq_psn = attr->sq_psn; + cmd->dest_qp_num = attr->dest_qp_num; + cmd->qp_access_flags = attr->qp_access_flags; + cmd->pkey_index = attr->pkey_index; + cmd->alt_pkey_index = attr->alt_pkey_index; + cmd->qp_state = attr->qp_state; + cmd->cur_qp_state = attr->cur_qp_state; + cmd->path_mtu = attr->path_mtu; + cmd->path_mig_state = attr->path_mig_state; + cmd->en_sqd_async_notify = attr->en_sqd_async_notify; + cmd->max_rd_atomic = attr->max_rd_atomic; + cmd->max_dest_rd_atomic = attr->max_dest_rd_atomic; + cmd->min_rnr_timer = attr->min_rnr_timer; + cmd->port_num = attr->port_num; + cmd->timeout = attr->timeout; + cmd->retry_cnt = attr->retry_cnt; + cmd->rnr_retry = attr->rnr_retry; + cmd->alt_port_num = attr->alt_port_num; + cmd->alt_timeout = attr->alt_timeout; + + memcpy(cmd->dest.dgid, attr->ah_attr.grh.dgid.raw, 16); + cmd->dest.flow_label = attr->ah_attr.grh.flow_label; + cmd->dest.dlid = attr->ah_attr.dlid; + cmd->dest.reserved = 0; + cmd->dest.sgid_index = attr->ah_attr.grh.sgid_index; + cmd->dest.hop_limit = attr->ah_attr.grh.hop_limit; + cmd->dest.traffic_class = attr->ah_attr.grh.traffic_class; + cmd->dest.sl = attr->ah_attr.sl; + cmd->dest.src_path_bits = attr->ah_attr.src_path_bits; + cmd->dest.static_rate = attr->ah_attr.static_rate; + cmd->dest.is_global = attr->ah_attr.is_global; + cmd->dest.port_num = attr->ah_attr.port_num; + + memcpy(cmd->alt_dest.dgid, attr->alt_ah_attr.grh.dgid.raw, 16); + cmd->alt_dest.flow_label = attr->alt_ah_attr.grh.flow_label; + cmd->alt_dest.dlid = attr->alt_ah_attr.dlid; + cmd->alt_dest.reserved = 0; + 
cmd->alt_dest.sgid_index = attr->alt_ah_attr.grh.sgid_index; + cmd->alt_dest.hop_limit = attr->alt_ah_attr.grh.hop_limit; + cmd->alt_dest.traffic_class = attr->alt_ah_attr.grh.traffic_class; + cmd->alt_dest.sl = attr->alt_ah_attr.sl; + cmd->alt_dest.src_path_bits = attr->alt_ah_attr.src_path_bits; + cmd->alt_dest.static_rate = attr->alt_ah_attr.static_rate; + cmd->alt_dest.is_global = attr->alt_ah_attr.is_global; + cmd->alt_dest.port_num = attr->alt_ah_attr.port_num; + + cmd->reserved[0] = cmd->reserved[1] = 0; + + return uv_modify_qp(qp->context->uv_fid, cmd, cmd_size); +} + +int ibv_cmd_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr, + struct ibv_send_wr **bad_wr) +{ + struct ibv_post_send *cmd; + struct ibv_post_send_resp resp; + struct ibv_send_wr *i; + struct ibv_kern_send_wr *n, *tmp; + struct ibv_sge *s; + unsigned wr_count = 0; + unsigned sge_count = 0; + int cmd_size; + int ret; + + for (i = wr; i; i = i->next) { + wr_count++; + sge_count += i->num_sge; + } + + cmd_size = sizeof *cmd + wr_count * sizeof *n + sge_count * sizeof *s; + cmd = alloca(cmd_size); + + cmd->qp_handle = ibqp->handle; + cmd->wr_count = wr_count; + cmd->sge_count = sge_count; + cmd->wqe_size = sizeof *n; + + n = (struct ibv_kern_send_wr *) ((void *) cmd + sizeof *cmd); + s = (struct ibv_sge *) (n + wr_count); + + tmp = n; + for (i = wr; i; i = i->next) { + tmp->wr_id = i->wr_id; + tmp->num_sge = i->num_sge; + tmp->opcode = i->opcode; + tmp->send_flags = i->send_flags; + tmp->imm_data = i->imm_data; + if (ibqp->qp_type == IBV_QPT_UD) { + tmp->wr.ud.ah = i->wr.ud.ah->handle; + tmp->wr.ud.remote_qpn = i->wr.ud.remote_qpn; + tmp->wr.ud.remote_qkey = i->wr.ud.remote_qkey; + } else { + switch (i->opcode) { + case IBV_WR_RDMA_WRITE: + case IBV_WR_RDMA_WRITE_WITH_IMM: + case IBV_WR_RDMA_READ: + tmp->wr.rdma.remote_addr = + i->wr.rdma.remote_addr; + tmp->wr.rdma.rkey = i->wr.rdma.rkey; + break; + case IBV_WR_ATOMIC_CMP_AND_SWP: + case IBV_WR_ATOMIC_FETCH_AND_ADD: + tmp->wr.atomic.remote_addr = + i->wr.atomic.remote_addr; + tmp->wr.atomic.compare_add = + i->wr.atomic.compare_add; + tmp->wr.atomic.swap = i->wr.atomic.swap; + tmp->wr.atomic.rkey = i->wr.atomic.rkey; + break; + default: + break; + } + } + + if (tmp->num_sge) { + memcpy(s, i->sg_list, tmp->num_sge * sizeof *s); + s += tmp->num_sge; + } + + tmp++; + } + + resp.bad_wr = 0; + ret = uv_post_send(ibqp->context->uv_fid, cmd, cmd_size, &resp, sizeof resp); + + wr_count = resp.bad_wr; + if (wr_count) { + i = wr; + while (--wr_count) + i = i->next; + *bad_wr = i; + } else if (ret) + *bad_wr = wr; + + return ret; +} + +int ibv_cmd_post_recv(struct ibv_qp *ibqp, struct ibv_recv_wr *wr, + struct ibv_recv_wr **bad_wr) +{ + struct ibv_post_recv *cmd; + struct ibv_post_recv_resp resp; + struct ibv_recv_wr *i; + struct ibv_kern_recv_wr *n, *tmp; + struct ibv_sge *s; + unsigned wr_count = 0; + unsigned sge_count = 0; + int cmd_size; + int ret; + + for (i = wr; i; i = i->next) { + wr_count++; + sge_count += i->num_sge; + } + + cmd_size = sizeof *cmd + wr_count * sizeof *n + sge_count * sizeof *s; + cmd = alloca(cmd_size); + + cmd->qp_handle = ibqp->handle; + cmd->wr_count = wr_count; + cmd->sge_count = sge_count; + cmd->wqe_size = sizeof *n; + + n = (struct ibv_kern_recv_wr *) ((void *) cmd + sizeof *cmd); + s = (struct ibv_sge *) (n + wr_count); + + tmp = n; + for (i = wr; i; i = i->next) { + tmp->wr_id = i->wr_id; + tmp->num_sge = i->num_sge; + + if (tmp->num_sge) { + memcpy(s, i->sg_list, tmp->num_sge * sizeof *s); + s += tmp->num_sge; + } + + tmp++; + } + + 
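+ /* All work requests and scatter-gather entries are now marshalled; issue the command. On failure, resp.bad_wr identifies the request that could not be posted, and the loop below maps it back to a pointer for the caller. */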
resp.bad_wr = 0; + ret = uv_post_recv(ibqp->context->uv_fid, cmd, cmd_size, &resp, sizeof resp); + + wr_count = resp.bad_wr; + if (wr_count) { + i = wr; + while (--wr_count) + i = i->next; + *bad_wr = i; + } else if (ret) + *bad_wr = wr; + + return ret; +} + +int ibv_cmd_post_srq_recv(struct ibv_srq *srq, struct ibv_recv_wr *wr, + struct ibv_recv_wr **bad_wr) +{ + struct ibv_post_srq_recv *cmd; + struct ibv_post_srq_recv_resp resp; + struct ibv_recv_wr *i; + struct ibv_kern_recv_wr *n, *tmp; + struct ibv_sge *s; + unsigned wr_count = 0; + unsigned sge_count = 0; + int cmd_size; + int ret; + + for (i = wr; i; i = i->next) { + wr_count++; + sge_count += i->num_sge; + } + + cmd_size = sizeof *cmd + wr_count * sizeof *n + sge_count * sizeof *s; + cmd = alloca(cmd_size); + + cmd->srq_handle = srq->handle; + cmd->wr_count = wr_count; + cmd->sge_count = sge_count; + cmd->wqe_size = sizeof *n; + + n = (struct ibv_kern_recv_wr *) ((void *) cmd + sizeof *cmd); + s = (struct ibv_sge *) (n + wr_count); + + tmp = n; + for (i = wr; i; i = i->next) { + tmp->wr_id = i->wr_id; + tmp->num_sge = i->num_sge; + + if (tmp->num_sge) { + memcpy(s, i->sg_list, tmp->num_sge * sizeof *s); + s += tmp->num_sge; + } + + tmp++; + } + + resp.bad_wr = 0; + ret = uv_post_srq_recv(srq->context->uv_fid, cmd, cmd_size, &resp, sizeof resp); + + wr_count = resp.bad_wr; + if (wr_count) { + i = wr; + while (--wr_count) + i = i->next; + *bad_wr = i; + } else if (ret) + *bad_wr = wr; + + return ret; +} + +int ibv_cmd_create_ah(struct ibv_pd *pd, struct ibv_ah *ah, + struct ibv_ah_attr *attr) +{ + struct ibv_create_ah cmd; + struct ibv_create_ah_resp resp; + int ret; + + cmd.user_handle = (uintptr_t) ah; + cmd.pd_handle = pd->handle; + cmd.attr.dlid = attr->dlid; + cmd.attr.sl = attr->sl; + cmd.attr.src_path_bits = attr->src_path_bits; + cmd.attr.static_rate = attr->static_rate; + cmd.attr.is_global = attr->is_global; + cmd.attr.port_num = attr->port_num; + cmd.attr.grh.flow_label = attr->grh.flow_label; + cmd.attr.grh.sgid_index = attr->grh.sgid_index; + cmd.attr.grh.hop_limit = attr->grh.hop_limit; + cmd.attr.grh.traffic_class = attr->grh.traffic_class; + memcpy(cmd.attr.grh.dgid, attr->grh.dgid.raw, 16); + + ret = uv_create_ah(pd->context->uv_fid, &cmd, sizeof cmd, &resp, sizeof resp); + if (ret) + return ret; + + ah->handle = resp.handle; + ah->context = pd->context; + + return 0; +} + +int ibv_cmd_destroy_ah(struct ibv_ah *ah) +{ + struct ibv_destroy_ah cmd; + + cmd.ah_handle = ah->handle; + return uv_destroy_ah(ah->context->uv_fid, &cmd, sizeof cmd); +} + +int ibv_cmd_destroy_qp(struct ibv_qp *qp) +{ + struct ibv_destroy_qp cmd; + struct ibv_destroy_qp_resp resp; + int ret; + + cmd.qp_handle = qp->handle; + cmd.reserved = 0; + + ret = uv_destroy_qp(qp->context->uv_fid, &cmd, sizeof cmd, &resp, sizeof resp); + if (ret) + return ret; + + pthread_mutex_lock(&qp->mutex); + while (qp->events_completed != resp.events_reported) + pthread_cond_wait(&qp->cond, &qp->mutex); + pthread_mutex_unlock(&qp->mutex); + + return 0; +} + +int ibv_cmd_attach_mcast(struct ibv_qp *qp, const union ibv_gid *gid, uint16_t lid) +{ + struct ibv_attach_mcast cmd; + + memcpy(cmd.gid, gid->raw, sizeof cmd.gid); + cmd.qp_handle = qp->handle; + cmd.mlid = lid; + cmd.reserved = 0; + + return uv_attach_mcast(qp->context->uv_fid, &cmd, sizeof cmd); +} + +int ibv_cmd_detach_mcast(struct ibv_qp *qp, const union ibv_gid *gid, uint16_t lid) +{ + struct ibv_detach_mcast cmd; + + memcpy(cmd.gid, gid->raw, sizeof cmd.gid); + cmd.qp_handle = qp->handle; + cmd.mlid = 
lid; + cmd.reserved = 0; + + return uv_detach_mcast(qp->context->uv_fid, &cmd, sizeof cmd); +} diff --git a/prov/ibverbs/src/device.c b/prov/ibverbs/src/device.c new file mode 100644 index 00000000000..429a25e08ff --- /dev/null +++ b/prov/ibverbs/src/device.c @@ -0,0 +1,255 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#if HAVE_CONFIG_H +# include <config.h> +#endif /* HAVE_CONFIG_H */ + +#include <stdio.h> +#include <netinet/in.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <unistd.h> +#include <stdlib.h> +#include <alloca.h> +#include <errno.h> + +#include <rdma/fi_uverbs.h> +#include <fi.h> + +#include "ibverbs.h" + +static pthread_once_t device_list_once = PTHREAD_ONCE_INIT; +static int num_devices; +static struct ibv_device **device_list; + +static void count_devices(void) +{ + num_devices = ibverbs_init(&device_list); +} + +struct ibv_device **ibv_get_device_list(int *num) +{ + struct ibv_device **l; + int i; + + if (num) + *num = 0; + + pthread_once(&device_list_once, count_devices); + + if (num_devices < 0) { + errno = -num_devices; + return NULL; + } + + l = calloc(num_devices + 1, sizeof (struct ibv_device *)); + if (!l) { + errno = ENOMEM; + return NULL; + } + + for (i = 0; i < num_devices; ++i) + l[i] = device_list[i]; + if (num) + *num = num_devices; + + return l; +} + +void ibv_free_device_list(struct ibv_device **list) +{ + free(list); +} + +const char *ibv_get_device_name(struct ibv_device *device) +{ + return device->name; +} + +uint64_t ibv_get_device_guid(struct ibv_device *device) +{ + char attr[24]; + uint64_t guid = 0; + uint16_t parts[4]; + int i; + + if (fi_read_file(device->ibdev_path, "node_guid", attr, sizeof attr) < 0) + return 0; + + if (sscanf(attr, "%hx:%hx:%hx:%hx", + parts, parts + 1, parts + 2, parts + 3) != 4) + return 0; + + for (i = 0; i < 4; ++i) + guid = (guid << 16) | parts[i]; + + return htonll(guid); +} + +struct ibv_context *ibv_open_device(struct ibv_device *device) +{ + struct ibv_context *context; + struct fid_uverbs *uv; + fid_t uv_fid; + char *uv_name; + int ret; + + if (asprintf(&uv_name, 
"uverbs/%s", device->dev_name) < 0) + return NULL; + + ret = fi_open(uv_name, NULL, 0, &uv_fid, NULL); + free(uv_name); + if (ret) + return NULL; + + uv = (struct fid_uverbs *) uv_fid; + context = device->ops.alloc_context(device, uv_fid); + if (!context) { + fi_close(uv_fid); + return NULL; + } + + context->device = device; + context->cmd_fd = uv->fd; + uv_fid->context = context; + pthread_mutex_init(&context->mutex, NULL); + return context; +} + +int ibv_close_device(struct ibv_context *context) +{ + int async_fd = context->async_fd; + fid_t uv_fid; + + context->device->ops.free_context(context); + uv_fid = context->uv_fid; + close(async_fd); + fi_close(uv_fid); + return 0; +} + +int ibv_get_async_event(struct ibv_context *context, + struct ibv_async_event *event) +{ + struct ibv_kern_async_event ev; + + if (read(context->async_fd, &ev, sizeof ev) != sizeof ev) + return -1; + + event->event_type = ev.event_type; + + switch (event->event_type) { + case IBV_EVENT_CQ_ERR: + event->element.cq = (void *) (uintptr_t) ev.element; + break; + + case IBV_EVENT_QP_FATAL: + case IBV_EVENT_QP_REQ_ERR: + case IBV_EVENT_QP_ACCESS_ERR: + case IBV_EVENT_COMM_EST: + case IBV_EVENT_SQ_DRAINED: + case IBV_EVENT_PATH_MIG: + case IBV_EVENT_PATH_MIG_ERR: + case IBV_EVENT_QP_LAST_WQE_REACHED: + event->element.qp = (void *) (uintptr_t) ev.element; + break; + + case IBV_EVENT_SRQ_ERR: + case IBV_EVENT_SRQ_LIMIT_REACHED: + event->element.srq = (void *) (uintptr_t) ev.element; + break; + + default: + event->element.port_num = ev.element; + break; + } + + if (context->ops.async_event) + context->ops.async_event(event); + + return 0; +} + +void ibv_ack_async_event(struct ibv_async_event *event) +{ + switch (event->event_type) { + case IBV_EVENT_CQ_ERR: + { + struct ibv_cq *cq = event->element.cq; + + pthread_mutex_lock(&cq->mutex); + ++cq->async_events_completed; + pthread_cond_signal(&cq->cond); + pthread_mutex_unlock(&cq->mutex); + + return; + } + + case IBV_EVENT_QP_FATAL: + case IBV_EVENT_QP_REQ_ERR: + case IBV_EVENT_QP_ACCESS_ERR: + case IBV_EVENT_COMM_EST: + case IBV_EVENT_SQ_DRAINED: + case IBV_EVENT_PATH_MIG: + case IBV_EVENT_PATH_MIG_ERR: + case IBV_EVENT_QP_LAST_WQE_REACHED: + { + struct ibv_qp *qp = event->element.qp; + + pthread_mutex_lock(&qp->mutex); + ++qp->events_completed; + pthread_cond_signal(&qp->cond); + pthread_mutex_unlock(&qp->mutex); + + return; + } + + case IBV_EVENT_SRQ_ERR: + case IBV_EVENT_SRQ_LIMIT_REACHED: + { + struct ibv_srq *srq = event->element.srq; + + pthread_mutex_lock(&srq->mutex); + ++srq->events_completed; + pthread_cond_signal(&srq->cond); + pthread_mutex_unlock(&srq->mutex); + + return; + } + + default: + return; + } +} diff --git a/prov/ibverbs/src/enum_strs.c b/prov/ibverbs/src/enum_strs.c new file mode 100644 index 00000000000..54d71a6e209 --- /dev/null +++ b/prov/ibverbs/src/enum_strs.c @@ -0,0 +1,128 @@ +/* + * Copyright (c) 2008 Lawrence Livermore National Laboratory + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <infiniband/verbs.h> + +const char *ibv_node_type_str(enum ibv_node_type node_type) +{ + static const char *const node_type_str[] = { + [IBV_NODE_CA] = "InfiniBand channel adapter", + [IBV_NODE_SWITCH] = "InfiniBand switch", + [IBV_NODE_ROUTER] = "InfiniBand router", + [IBV_NODE_RNIC] = "iWARP NIC" + }; + + if (node_type < IBV_NODE_CA || node_type > IBV_NODE_RNIC) + return "unknown"; + + return node_type_str[node_type]; +} + +const char *ibv_port_state_str(enum ibv_port_state port_state) +{ + static const char *const port_state_str[] = { + [IBV_PORT_NOP] = "no state change (NOP)", + [IBV_PORT_DOWN] = "down", + [IBV_PORT_INIT] = "init", + [IBV_PORT_ARMED] = "armed", + [IBV_PORT_ACTIVE] = "active", + [IBV_PORT_ACTIVE_DEFER] = "active defer" + }; + + if (port_state < IBV_PORT_NOP || port_state > IBV_PORT_ACTIVE_DEFER) + return "unknown"; + + return port_state_str[port_state]; +} + +const char *ibv_event_type_str(enum ibv_event_type event) +{ + static const char *const event_type_str[] = { + [IBV_EVENT_CQ_ERR] = "CQ error", + [IBV_EVENT_QP_FATAL] = "local work queue catastrophic error", + [IBV_EVENT_QP_REQ_ERR] = "invalid request local work queue error", + [IBV_EVENT_QP_ACCESS_ERR] = "local access violation work queue error", + [IBV_EVENT_COMM_EST] = "communication established", + [IBV_EVENT_SQ_DRAINED] = "send queue drained", + [IBV_EVENT_PATH_MIG] = "path migrated", + [IBV_EVENT_PATH_MIG_ERR] = "path migration request error", + [IBV_EVENT_DEVICE_FATAL] = "local catastrophic error", + [IBV_EVENT_PORT_ACTIVE] = "port active", + [IBV_EVENT_PORT_ERR] = "port error", + [IBV_EVENT_LID_CHANGE] = "LID change", + [IBV_EVENT_PKEY_CHANGE] = "P_Key change", + [IBV_EVENT_SM_CHANGE] = "SM change", + [IBV_EVENT_SRQ_ERR] = "SRQ catastrophic error", + [IBV_EVENT_SRQ_LIMIT_REACHED] = "SRQ limit reached", + [IBV_EVENT_QP_LAST_WQE_REACHED] = "last WQE reached", + [IBV_EVENT_CLIENT_REREGISTER] = "client reregistration", + [IBV_EVENT_GID_CHANGE] = "GID table change" + }; + + if (event < IBV_EVENT_CQ_ERR || event > IBV_EVENT_GID_CHANGE) + return "unknown"; + + return event_type_str[event]; +} + +const char *ibv_wc_status_str(enum ibv_wc_status status) +{ + static const char *const wc_status_str[] = { + [IBV_WC_SUCCESS] = "success", + [IBV_WC_LOC_LEN_ERR] = "local length error", + [IBV_WC_LOC_QP_OP_ERR] = "local QP operation error", + [IBV_WC_LOC_EEC_OP_ERR] = "local EE context operation error", + [IBV_WC_LOC_PROT_ERR] = "local protection error", + [IBV_WC_WR_FLUSH_ERR] = "Work Request Flushed Error", + [IBV_WC_MW_BIND_ERR] = "memory management operation error", + [IBV_WC_BAD_RESP_ERR] = "bad response error", + [IBV_WC_LOC_ACCESS_ERR] = "local access error", + [IBV_WC_REM_INV_REQ_ERR] = "remote invalid request error", + [IBV_WC_REM_ACCESS_ERR] = "remote access error", + [IBV_WC_REM_OP_ERR] 
= "remote operation error", + [IBV_WC_RETRY_EXC_ERR] = "transport retry counter exceeded", + [IBV_WC_RNR_RETRY_EXC_ERR] = "RNR retry counter exceeded", + [IBV_WC_LOC_RDD_VIOL_ERR] = "local RDD violation error", + [IBV_WC_REM_INV_RD_REQ_ERR] = "remote invalid RD request", + [IBV_WC_REM_ABORT_ERR] = "aborted error", + [IBV_WC_INV_EECN_ERR] = "invalid EE context number", + [IBV_WC_INV_EEC_STATE_ERR] = "invalid EE context state", + [IBV_WC_FATAL_ERR] = "fatal error", + [IBV_WC_RESP_TIMEOUT_ERR] = "response timeout error", + [IBV_WC_GENERAL_ERR] = "general error" + }; + + if (status < IBV_WC_SUCCESS || status > IBV_WC_GENERAL_ERR) + return "unknown"; + + return wc_status_str[status]; +} diff --git a/prov/ibverbs/src/fi_verbs.c b/prov/ibverbs/src/fi_verbs.c new file mode 100644 index 00000000000..f9286f71ab2 --- /dev/null +++ b/prov/ibverbs/src/fi_verbs.c @@ -0,0 +1,1277 @@ +/* + * Copyright (c) 2013 Intel Corporation, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#if HAVE_CONFIG_H +# include <config.h> +#endif /* HAVE_CONFIG_H */ + +#include <errno.h> +#include <fcntl.h> +#include <poll.h> +#include <stdlib.h> +#include <string.h> +#include <stdio.h> + +#include <infiniband/verbs.h> +#include <rdma/rdma_cma.h> + +#include <rdma/fabric.h> +#include <rdma/fi_cm.h> +#include <rdma/fi_domain.h> +#include <rdma/fi_prov.h> +#include <rdma/fi_socket.h> +#include <rdma/fi_rdma.h> + +#include "ibverbs.h" + + +struct ibv_domain { + struct fid_domain domain_fid; + struct ibv_context *verbs; + struct ibv_pd *pd; +}; + +struct ibv_ec { + struct fid_ec fid; + enum fi_ec_domain ec_domain; + struct ibv_domain *domain; +}; + +struct ibv_ec_comp { + struct ibv_ec ec; + struct ibv_comp_channel *channel; + struct ibv_cq *cq; + uint64_t flags; + struct ibv_wc wc; +}; + +struct ibv_ec_cm { + struct ibv_ec ec; + struct rdma_event_channel *channel; + uint64_t flags; + struct fi_ec_err_entry err; +}; + +struct ibv_mem_desc { + struct fid_mr mr_fid; + struct ibv_mr *mr; + struct ibv_domain *domain; +}; + +struct ibv_msg_socket { + struct fid_socket socket_fid; + struct rdma_cm_id *id; + struct ibv_ec_cm *cm_ec; + struct ibv_ec_comp *rec; + struct ibv_ec_comp *sec; + uint32_t inline_size; +}; + +static char def_send_wr[16] = "384"; +static char def_recv_wr[16] = "384"; +static char def_send_sge[16] = "4"; +static char def_recv_sge[16] = "4"; +static char def_inline_data[16] = "64"; + +static int ibv_check_domain(const char *name) +{ + return (!name || !strncmp(name, IBV_PREFIX "/", sizeof(IBV_PREFIX))) ? + 0 : -ENODATA; +} + +/* + * TODO: this is not the full set of checks which are needed + */ +static int ibv_fi_to_rai(struct fi_info *fi, struct rdma_addrinfo *rai) +{ + memset(rai, 0, sizeof *rai); + if (fi->flags & FI_PASSIVE) + rai->ai_flags = RAI_PASSIVE; + if (fi->flags & FI_NUMERICHOST) + rai->ai_flags |= RAI_NUMERICHOST; +// if (fi->flags & FI_FAMILY) +// rai->ai_flags |= RAI_FAMILY; + +// rai->ai_family = fi->sa_family; + if (fi->type == FID_MSG || fi->protocol & FI_PROTO_RDMA || + ((fi->protocol & FI_PROTO_MASK) == FI_PROTO_IB_RC) || + ((fi->protocol & FI_PROTO_MASK) == FI_PROTO_IWARP)) { + rai->ai_qp_type = IBV_QPT_RC; + rai->ai_port_space = RDMA_PS_TCP; + } else if (fi->type == FID_DGRAM || + ((fi->protocol & FI_PROTO_MASK) == FI_PROTO_IB_UD)) { + rai->ai_qp_type = IBV_QPT_UD; + rai->ai_port_space = RDMA_PS_UDP; + } + + if (fi->src_addrlen) { + if (!(rai->ai_src_addr = malloc(fi->src_addrlen))) + return ENOMEM; + memcpy(rai->ai_src_addr, fi->src_addr, fi->src_addrlen); + rai->ai_src_len = fi->src_addrlen; + } + if (fi->dst_addrlen) { + if (!(rai->ai_dst_addr = malloc(fi->dst_addrlen))) + return ENOMEM; + memcpy(rai->ai_dst_addr, fi->dst_addr, fi->dst_addrlen); + rai->ai_dst_len = fi->dst_addrlen; + } +// if (fi->src_canonname) +// rai->ai_src_canonname = strdup(fi->src_canonname); +// if (fi->dst_canonname) +// rai->ai_dst_canonname = strdup(fi->dst_canonname); + + return 0; +} + + static int ibv_rai_to_fi(struct rdma_addrinfo *rai, struct fi_info *fi) + { + memset(fi, 0, sizeof *fi); + if (rai->ai_flags & RAI_PASSIVE) + fi->flags = RAI_PASSIVE; + + // fi->sa_family = rai->ai_family; + if (rai->ai_qp_type == IBV_QPT_RC || rai->ai_port_space == RDMA_PS_TCP) { + fi->protocol = FI_PROTO_MSG | FI_PROTO_RDMA; + fi->type = FID_MSG; + } else if (rai->ai_qp_type == IBV_QPT_UD || + rai->ai_port_space == RDMA_PS_UDP) { + fi->protocol = FI_PROTO_IB_UD | FI_PROTO_MSG; + fi->type = FID_DGRAM; + } + + if (rai->ai_src_len) { + if (!(fi->src_addr = 
malloc(rai->ai_src_len))) + return ENOMEM; + memcpy(fi->src_addr, rai->ai_src_addr, rai->ai_src_len); + fi->src_addrlen = rai->ai_src_len; + } + if (rai->ai_dst_len) { + if (!(fi->dst_addr = malloc(rai->ai_dst_len))) + return ENOMEM; + memcpy(fi->dst_addr, rai->ai_dst_addr, rai->ai_dst_len); + fi->dst_addrlen = rai->ai_dst_len; + } + // if (rai->ai_src_canonname) + // fi->src_canonname = strdup(rai->ai_src_canonname); + // if (rai->ai_dst_canonname) + // fi->dst_canonname = strdup(rai->ai_dst_canonname); + + return 0; + } + +static int ibv_getinfo(char *node, char *service, struct fi_info *hints, + struct fi_info **info) +{ + struct rdma_addrinfo rai_hints, *rai; + struct fi_info *fi; + struct rdma_cm_id *id; + int ret; + + if (hints) { + ret = ibv_check_domain(hints->domain_name); + if (ret) + return ret; + + ret = ibv_fi_to_rai(hints, &rai_hints); + if (ret) + return ret; + + ret = rdma_getaddrinfo(node, service, &rai_hints, &rai); + } else { + ret = rdma_getaddrinfo(node, service, NULL, &rai); + } + if (ret) + return -errno; + + if (!(fi = malloc(sizeof *fi))) { + ret = ENOMEM; + goto err1; + } + + ret = ibv_rai_to_fi(rai, fi); + if (ret) + goto err2; + + ret = rdma_create_ep(&id, rai, NULL, NULL); + if (ret) { + ret = -errno; + goto err2; + } + rdma_freeaddrinfo(rai); + + if (!fi->src_addr) { + fi->src_addrlen = rdma_addrlen(rdma_get_local_addr(id)); + if (!(fi->src_addr = malloc(fi->src_addrlen))) { + ret = -ENOMEM; + goto err3; + } + memcpy(fi->src_addr, rdma_get_local_addr(id), fi->src_addrlen); + } + + if (id->verbs) { + if (!(fi->domain_name = malloc(FI_NAME_MAX))) { + ret = -ENOMEM; + goto err3; + } + strcpy(fi->domain_name, IBV_PREFIX "/"); + strcpy(&fi->domain_name[sizeof(IBV_PREFIX)], id->verbs->device->name); + } else { + fi->domain_name = strdup(IBV_PREFIX "/" FI_UNBOUND_NAME); + } + + fi->data = id; + fi->datalen = sizeof id; + *info = fi; + return 0; + +err3: + rdma_destroy_ep(id); +err2: + __fi_freeinfo(fi); +err1: + rdma_freeaddrinfo(rai); + return ret; +} + +static int ibv_freeinfo(struct fi_info *info) +{ + int ret; + + ret = ibv_check_domain(info->domain_name); + if (ret) + return ret; + + if (info->data) { + rdma_destroy_ep(info->data); + info->data = NULL; + } + __fi_freeinfo(info); + return 0; +} + +static int ibv_msg_socket_create_qp(struct ibv_msg_socket *sock) +{ + struct ibv_qp_init_attr attr; + + /* TODO: serialize access to string buffers */ + fi_read_file(FI_CONF_DIR, "def_send_wr", + def_send_wr, sizeof def_send_wr); + fi_read_file(FI_CONF_DIR, "def_recv_wr", + def_recv_wr, sizeof def_recv_wr); + fi_read_file(FI_CONF_DIR, "def_send_sge", + def_send_sge, sizeof def_send_sge); + fi_read_file(FI_CONF_DIR, "def_recv_sge", + def_recv_sge, sizeof def_recv_sge); + fi_read_file(FI_CONF_DIR, "def_inline_data", + def_inline_data, sizeof def_inline_data); + + attr.cap.max_send_wr = atoi(def_send_wr); + attr.cap.max_recv_wr = atoi(def_recv_wr); + attr.cap.max_send_sge = atoi(def_send_sge); + attr.cap.max_recv_sge = atoi(def_recv_sge); + attr.cap.max_inline_data = atoi(def_inline_data); + sock->inline_size = attr.cap.max_inline_data; + attr.qp_context = sock; + attr.send_cq = sock->sec->cq; + attr.recv_cq = sock->rec->cq; + attr.srq = NULL; + attr.qp_type = IBV_QPT_RC; + attr.sq_sig_all = 1; + + return rdma_create_qp(sock->id, sock->rec->ec.domain->pd, &attr) ? 
-errno : 0; +} + +static int ibv_msg_socket_bind(fid_t fid, struct fi_resource *fids, int nfids) +{ + struct ibv_msg_socket *sock; + struct ibv_ec *ec; + int i, ret; + + sock = container_of(fid, struct ibv_msg_socket, socket_fid.fid); + for (i = 0; i < nfids; i++) { + if (fids[i].fid->fclass != FID_CLASS_EC) + return -EINVAL; + + ec = container_of(fids[i].fid, struct ibv_ec, fid.fid); + if (fids[i].flags & FI_RECV) { + if (sock->rec) + return -EINVAL; + sock->rec = container_of(ec, struct ibv_ec_comp, ec); + } + if (fids[i].flags & FI_SEND) { + if (sock->sec) + return -EINVAL; + sock->sec = container_of(ec, struct ibv_ec_comp, ec); + } + if (ec->ec_domain == FI_EC_DOMAIN_CM) { + sock->cm_ec = container_of(ec, struct ibv_ec_cm, ec); + ret = rdma_migrate_id(sock->id, sock->cm_ec->channel); + if (ret) + return -errno; + } + } + + if (sock->sec && sock->rec && !sock->id->qp) { + ret = ibv_msg_socket_create_qp(sock); + if (ret) + return ret; + } + + return 0; +} + +static ssize_t ibv_msg_socket_recvmem(fid_t fid, void *buf, size_t len, + uint64_t mem_desc, void *context) +{ + struct ibv_msg_socket *sock; + struct ibv_recv_wr wr, *bad; + struct ibv_sge sge; + + sge.addr = (uintptr_t) buf; + sge.length = (uint32_t) len; + sge.lkey = (uint32_t) mem_desc; + + wr.wr_id = (uintptr_t) context; + wr.next = NULL; + wr.sg_list = &sge; + wr.num_sge = 1; + + sock = container_of(fid, struct ibv_msg_socket, socket_fid.fid); + return -ibv_post_recv(sock->id->qp, &wr, &bad); +} + +static ssize_t ibv_msg_socket_sendmem(fid_t fid, const void *buf, size_t len, + uint64_t mem_desc, void *context) +{ + struct ibv_msg_socket *sock; + struct ibv_send_wr wr, *bad; + struct ibv_sge sge; + + sock = container_of(fid, struct ibv_msg_socket, socket_fid.fid); + sge.addr = (uintptr_t) buf; + sge.length = (uint32_t) len; + sge.lkey = (uint32_t) mem_desc; + + wr.wr_id = (uintptr_t) context; + wr.next = NULL; + wr.sg_list = &sge; + wr.num_sge = 1; + wr.opcode = IBV_WR_SEND; + wr.send_flags = (len <= sock->inline_size) ? IBV_SEND_INLINE : 0; + + return -ibv_post_send(sock->id->qp, &wr, &bad); +} + +static struct fi_ops_msg ibv_msg_socket_msg_ops = { + .size = sizeof(struct fi_ops_msg), + .recvmem = ibv_msg_socket_recvmem, + .sendmem = ibv_msg_socket_sendmem, +}; + +static int ibv_msg_socket_rdma_writemem(fid_t fid, const void *buf, size_t len, + uint64_t mem_desc, uint64_t addr, be64_t tag, void *context) +{ + struct ibv_msg_socket *sock; + struct ibv_send_wr wr, *bad; + struct ibv_sge sge; + + sock = container_of(fid, struct ibv_msg_socket, socket_fid.fid); + sge.addr = (uintptr_t) buf; + sge.length = (uint32_t) len; + sge.lkey = (uint32_t) mem_desc; + + wr.wr_id = (uintptr_t) context; + wr.next = NULL; + wr.sg_list = &sge; + wr.num_sge = 1; + wr.opcode = IBV_WR_RDMA_WRITE; + wr.send_flags = (len <= sock->inline_size) ? 
IBV_SEND_INLINE : 0; + wr.wr.rdma.remote_addr = addr; + wr.wr.rdma.rkey = (uint32_t) tag; + + return -ibv_post_send(sock->id->qp, &wr, &bad); +} + +static int ibv_msg_socket_rdma_readmem(fid_t fid, void *buf, size_t len, + uint64_t mem_desc, uint64_t addr, be64_t tag, void *context) +{ + struct ibv_msg_socket *sock; + struct ibv_send_wr wr, *bad; + struct ibv_sge sge; + + sge.addr = (uintptr_t) buf; + sge.length = (uint32_t) len; + sge.lkey = (uint32_t) mem_desc; + + wr.wr_id = (uintptr_t) context; + wr.next = NULL; + wr.sg_list = &sge; + wr.num_sge = 1; + wr.opcode = IBV_WR_RDMA_READ; + wr.send_flags = 0; + wr.wr.rdma.remote_addr = addr; + wr.wr.rdma.rkey = (uint32_t) tag; + + sock = container_of(fid, struct ibv_msg_socket, socket_fid.fid); + return -ibv_post_send(sock->id->qp, &wr, &bad); +} + +static struct fi_ops_rdma ibv_msg_socket_rdma_ops = { + .size = sizeof(struct fi_ops_rdma), + .writemem = ibv_msg_socket_rdma_writemem, + .readmem = ibv_msg_socket_rdma_readmem +}; + +static int ibv_msg_socket_connect(fid_t fid, const void *param, size_t paramlen) +{ + struct ibv_msg_socket *sock; + struct rdma_conn_param conn_param; + + sock = container_of(fid, struct ibv_msg_socket, socket_fid.fid); + memset(&conn_param, 0, sizeof conn_param); + conn_param.private_data = param; + conn_param.private_data_len = paramlen; + conn_param.responder_resources = RDMA_MAX_RESP_RES; + conn_param.initiator_depth = RDMA_MAX_INIT_DEPTH; + conn_param.flow_control = 1; + conn_param.retry_count = 15; + conn_param.rnr_retry_count = 7; + + return rdma_connect(sock->id, &conn_param) ? -errno : 0; +} + +static int ibv_msg_socket_listen(fid_t fid) +{ + struct ibv_msg_socket *sock; + + sock = container_of(fid, struct ibv_msg_socket, socket_fid.fid); + return rdma_listen(sock->id, 0) ? -errno : 0; +} + +static int ibv_msg_socket_accept(fid_t fid, const void *param, size_t paramlen) +{ + struct ibv_msg_socket *sock; + struct rdma_conn_param conn_param; + + sock = container_of(fid, struct ibv_msg_socket, socket_fid.fid); + memset(&conn_param, 0, sizeof conn_param); + conn_param.private_data = param; + conn_param.private_data_len = paramlen; + conn_param.responder_resources = RDMA_MAX_RESP_RES; + conn_param.initiator_depth = RDMA_MAX_INIT_DEPTH; + conn_param.flow_control = 1; + conn_param.rnr_retry_count = 7; + + return rdma_accept(sock->id, &conn_param) ? -errno : 0; +} + +static int ibv_msg_socket_reject(fid_t fid, struct fi_info *info, + const void *param, size_t paramlen) +{ + return rdma_reject(info->data, param, (uint8_t) paramlen) ? -errno : 0; +} + +static int ibv_msg_socket_shutdown(fid_t fid, uint64_t flags) +{ + struct ibv_msg_socket *sock; + sock = container_of(fid, struct ibv_msg_socket, socket_fid.fid); + return rdma_disconnect(sock->id) ? 
-errno : 0; +} + +struct fi_ops_cm ibv_msg_socket_cm_ops = { + .size = sizeof(struct fi_ops_cm), + .connect = ibv_msg_socket_connect, + .listen = ibv_msg_socket_listen, + .accept = ibv_msg_socket_accept, + .reject = ibv_msg_socket_reject, + .shutdown = ibv_msg_socket_shutdown, +}; + +static int ibv_msg_socket_close(fid_t fid) +{ + struct ibv_msg_socket *sock; + + sock = container_of(fid, struct ibv_msg_socket, socket_fid.fid); + if (sock->id) + rdma_destroy_ep(sock->id); + + free(sock); + return 0; +} + +struct fi_ops ibv_msg_socket_ops = { + .size = sizeof(struct fi_ops), + .close = ibv_msg_socket_close, + .bind = ibv_msg_socket_bind +}; + +static int ibv_socket(struct fi_info *info, fid_t *fid, void *context) +{ + struct ibv_msg_socket *sock; + int ret; + + ret = ibv_check_domain(info->domain_name); + if (ret) + return ret; + + if (!info->data || info->datalen != sizeof(sock->id)) + return -ENOSYS; + + sock = calloc(1, sizeof *sock); + if (!sock) + return -ENOMEM; + + sock->id = info->data; + sock->id->context = &sock->socket_fid.fid; + info->data = NULL; + info->datalen = 0; + + sock->socket_fid.fid.fclass = FID_CLASS_SOCKET; + sock->socket_fid.fid.size = sizeof(struct fid_socket); + sock->socket_fid.fid.context = context; + sock->socket_fid.fid.ops = &ibv_msg_socket_ops; + sock->socket_fid.ops = NULL; + sock->socket_fid.msg = &ibv_msg_socket_msg_ops; + sock->socket_fid.cm = &ibv_msg_socket_cm_ops; + sock->socket_fid.rdma = &ibv_msg_socket_rdma_ops; + + *fid = &sock->socket_fid.fid; + return 0; +} + +static int ibv_poll_fd(int fd) +{ + struct pollfd fds; + + fds.fd = fd; + fds.events = POLLIN; + return poll(&fds, 1, -1) < 0 ? -errno : 0; +} + +static ssize_t ibv_ec_cm_readerr(fid_t fid, void *buf, size_t len, uint64_t flags) +{ + struct ibv_ec_cm *ec; + struct fi_ec_err_entry *entry; + + ec = container_of(fid, struct ibv_ec_cm, ec.fid.fid); + if (!ec->err.err) + return 0; + + if (len < sizeof(*entry)) + return -EINVAL; + + entry = (struct fi_ec_err_entry *) buf; + *entry = ec->err; + ec->err.err = 0; + ec->err.prov_errno = 0; + return sizeof(*entry); +} + +static struct fi_info * ibv_ec_cm_getinfo(struct rdma_cm_event *event) +{ + struct fi_info *fi; + + fi = calloc(1, sizeof *fi); + if (!fi) + return NULL; + + fi->size = sizeof *fi; + fi->type = FID_MSG; + if (event->id->verbs->device->transport_type == IBV_TRANSPORT_IWARP) + fi->protocol = FI_PROTO_IWARP | FI_PROTO_RDMA; + else + fi->protocol = FI_PROTO_IB_RC | FI_PROTO_RDMA; +// fi->sa_family = rdma_get_local_addr(event->id)->sa_family; + + fi->src_addrlen = rdma_addrlen(rdma_get_local_addr(event->id)); + if (!(fi->src_addr = malloc(fi->src_addrlen))) + goto err; + memcpy(fi->src_addr, rdma_get_local_addr(event->id), fi->src_addrlen); + + fi->dst_addrlen = rdma_addrlen(rdma_get_peer_addr(event->id)); + if (!(fi->dst_addr = malloc(fi->dst_addrlen))) + goto err; + memcpy(fi->dst_addr, rdma_get_peer_addr(event->id), fi->dst_addrlen); + + if (!(fi->domain_name = malloc(FI_NAME_MAX))) + goto err; + strcpy(fi->domain_name, IBV_PREFIX "/"); + strcpy(&fi->domain_name[sizeof(IBV_PREFIX)], event->id->verbs->device->name); + + fi->datalen = sizeof event->id; + fi->data = event->id; + return fi; +err: + fi_freeinfo(fi); + return NULL; +} + +static ssize_t ibv_ec_cm_process_event(struct ibv_ec_cm *ec, + struct rdma_cm_event *event, struct fi_ec_cm_entry *entry, size_t len) +{ + fid_t fid; + size_t datalen; + + fid = event->id->context; + switch (event->event) { +// case RDMA_CM_EVENT_ADDR_RESOLVED: +// return 0; +// case 
RDMA_CM_EVENT_ROUTE_RESOLVED: +// return 0; + case RDMA_CM_EVENT_CONNECT_REQUEST: + rdma_migrate_id(event->id, NULL); + entry->event = FI_CONNREQ; + entry->info = ibv_ec_cm_getinfo(event); + if (!entry->info) { + rdma_destroy_id(event->id); + return 0; + } + break; + case RDMA_CM_EVENT_ESTABLISHED: + entry->event = FI_CONNECTED; + entry->info = NULL; + break; + case RDMA_CM_EVENT_DISCONNECTED: + entry->event = FI_SHUTDOWN; + entry->info = NULL; + break; + case RDMA_CM_EVENT_ADDR_ERROR: + case RDMA_CM_EVENT_ROUTE_ERROR: + case RDMA_CM_EVENT_CONNECT_ERROR: + case RDMA_CM_EVENT_UNREACHABLE: + ec->err.fid_context = fid->context; + ec->err.err = event->status; + return -EIO; + case RDMA_CM_EVENT_REJECTED: + ec->err.fid_context = fid->context; + ec->err.err = ECONNREFUSED; + ec->err.prov_errno = event->status; + return -EIO; + case RDMA_CM_EVENT_DEVICE_REMOVAL: + ec->err.fid_context = fid->context; + ec->err.err = ENODEV; + return -EIO; + case RDMA_CM_EVENT_ADDR_CHANGE: + ec->err.fid_context = fid->context; + ec->err.err = EADDRNOTAVAIL; + return -EIO; + default: + return 0; + } + + entry->fid_context = fid->context; + entry->flags = 0; + datalen = min(len - sizeof(*entry), event->param.conn.private_data_len); + if (datalen) + memcpy(entry->data, event->param.conn.private_data, datalen); + return sizeof(*entry) + datalen; +} + +static ssize_t ibv_ec_cm_read_data(fid_t fid, void *buf, size_t len) +{ + struct ibv_ec_cm *ec; + struct fi_ec_cm_entry *entry; + struct rdma_cm_event *event; + size_t left; + ssize_t ret = -EINVAL; + + ec = container_of(fid, struct ibv_ec_cm, ec.fid.fid); + entry = (struct fi_ec_cm_entry *) buf; + if (ec->err.err) + return -EIO; + + for (left = len; left >= sizeof(*entry); ) { + ret = rdma_get_cm_event(ec->channel, &event); + if (!ret) { + ret = ibv_ec_cm_process_event(ec, event, entry, left); + rdma_ack_cm_event(event); + if (ret < 0) + break; + else if (!ret) + continue; + + left -= ret; + entry = ((void *) entry) + ret; + } else if (errno == EAGAIN) { + if (left < len) + return len - left; + + if (ec->flags & FI_NONBLOCK) + return 0; + + ibv_poll_fd(ec->channel->fd); + } else { + ret = -errno; + break; + } + } + + return (left < len) ? 
len - left : ret; +} + +static const char * ibv_ec_cm_strerror(fid_t fid, int prov_errno, void *prov_data, + void *buf, size_t len) +{ + if (buf && len) + strncpy(buf, strerror(prov_errno), len); + return strerror(prov_errno); +} + +struct fi_ops_ec ibv_ec_cm_data_ops = { + .size = sizeof(struct fi_ops_ec), + .read = ibv_ec_cm_read_data, + .readfrom = NULL, + .readerr = ibv_ec_cm_readerr, + .write = NULL, + .reset = NULL, + .strerror = ibv_ec_cm_strerror +}; + +static int ibv_ec_cm_close(fid_t fid) +{ + struct ibv_ec_cm *ec; + + ec = container_of(fid, struct ibv_ec_cm, ec.fid.fid); + if (ec->channel) + rdma_destroy_event_channel(ec->channel); + + free(ec); + return 0; +} + +struct fi_ops ibv_ec_cm_ops = { + .size = sizeof(struct fi_ops), + .close = ibv_ec_cm_close, +}; + +static int ibv_ec_cm_open(fid_t fid, struct fi_ec_attr *attr, fid_t *ec, void *context) +{ + struct ibv_ec_cm *vec; + long flags = 0; + int ret; + + if (attr->type != FI_EC_QUEUE || attr->format != FI_EC_FORMAT_CM) + return -ENOSYS; + + vec = calloc(1, sizeof *vec); + if (!vec) + return -ENOMEM; + + vec->ec.domain = container_of(fid, struct ibv_domain, domain_fid.fid); + + switch (attr->wait_obj) { + case FI_EC_WAIT_FD: + vec->channel = rdma_create_event_channel(); + if (!vec->channel) { + ret = -errno; + goto err1; + } + fcntl(vec->channel->fd, F_GETFL, &flags); + ret = fcntl(vec->channel->fd, F_SETFL, flags | O_NONBLOCK); + if (ret) { + ret = -errno; + goto err2; + } + break; + case FI_EC_WAIT_NONE: + vec->flags = O_NONBLOCK; + break; + default: + return -ENOSYS; + } + + vec->flags = attr->flags; + vec->ec.fid.fid.fclass = FID_CLASS_EC; + vec->ec.fid.fid.size = sizeof(struct fid_ec); + vec->ec.fid.fid.context = context; + vec->ec.fid.fid.ops = &ibv_ec_cm_ops; + vec->ec.fid.ops = &ibv_ec_cm_data_ops; + + *ec = &vec->ec.fid.fid; + return 0; +err2: + if (vec->channel) + rdma_destroy_event_channel(vec->channel); +err1: + free(vec); + return ret; +} + +static int ibv_ec_comp_reset(fid_t fid, void *cond) +{ + struct ibv_ec_comp *ec; + struct ibv_cq *cq; + void *context; + int ret; + + ec = container_of(fid, struct ibv_ec_comp, ec.fid.fid); + ret = ibv_get_cq_event(ec->channel, &cq , &context); + if (!ret) + ibv_ack_cq_events(cq, 1); + + return -ibv_req_notify_cq(ec->cq, (ec->flags & FI_SIGNAL) ? 
1 : 0); +} + +static ssize_t ibv_ec_comp_readerr(fid_t fid, void *buf, size_t len, uint64_t flags) +{ + struct ibv_ec_comp *ec; + struct fi_ec_err_entry *entry; + + ec = container_of(fid, struct ibv_ec_comp, ec.fid.fid); + if (!ec->wc.status) + return 0; + + if (len < sizeof(*entry)) + return -EINVAL; + + entry = (struct fi_ec_err_entry *) buf; + entry->fid_context = NULL; /* TODO: return qp context from wc */ + entry->op_context = (void *) (uintptr_t) ec->wc.wr_id; + entry->flags = 0; + entry->err = EIO; + entry->prov_errno = ec->wc.status; + entry->data = ec->wc.vendor_err; + entry->prov_data = NULL; + + ec->wc.status = 0; + return sizeof(*entry); +} + +static ssize_t ibv_ec_comp_read(fid_t fid, void *buf, size_t len) +{ + struct ibv_ec_comp *ec; + struct fi_ec_entry *entry; + size_t left; + int reset = 1, ret = -EINVAL; + + ec = container_of(fid, struct ibv_ec_comp, ec.fid.fid); + entry = (struct fi_ec_entry *) buf; + if (ec->wc.status) + return -EIO; + + for (left = len; left >= sizeof(*entry); ) { + ret = ibv_poll_cq(ec->cq, 1, &ec->wc); + if (ret > 0) { + if (ec->wc.status) { + ret = -EIO; + break; + } + + entry->op_context = (void *) (uintptr_t) ec->wc.wr_id; + left -= sizeof(*entry); + entry = entry + 1; + } else if (ret == 0) { + if (left < len) + return len - left; + + if (reset && (ec->flags & FI_AUTO_RESET)) { + ibv_ec_comp_reset(fid, NULL); + reset = 0; + continue; + } + + if (ec->flags & FI_NONBLOCK) + return 0; + + ibv_poll_fd(ec->channel->fd); + } else { + break; + } + } + + return (left < len) ? len - left : ret; +} + +static ssize_t ibv_ec_comp_read_data(fid_t fid, void *buf, size_t len) +{ + struct ibv_ec_comp *ec; + struct fi_ec_data_entry *entry; + size_t left; + int reset = 1, ret = -EINVAL; + + ec = container_of(fid, struct ibv_ec_comp, ec.fid.fid); + entry = (struct fi_ec_data_entry *) buf; + if (ec->wc.status) + return -EIO; + + for (left = len; left >= sizeof(*entry); ) { + ret = ibv_poll_cq(ec->cq, 1, &ec->wc); + if (ret > 0) { + if (ec->wc.status) { + ret = -EIO; + break; + } + + entry->op_context = (void *) (uintptr_t) ec->wc.wr_id; + if (ec->wc.wc_flags & IBV_WC_WITH_IMM) { + entry->flags = FI_IMM; + entry->data = ec->wc.imm_data; + } + if (ec->wc.opcode & IBV_WC_RECV) + entry->len = ec->wc.byte_len; + left -= sizeof(*entry); + entry = entry + 1; + } else if (ret == 0) { + if (left < len) + return len - left; + + if (reset && (ec->flags & FI_AUTO_RESET)) { + ibv_ec_comp_reset(fid, NULL); + reset = 0; + continue; + } + + if (ec->flags & FI_NONBLOCK) + return 0; + + ibv_poll_fd(ec->channel->fd); + } else { + break; + } + } + + return (left < len) ? 
len - left : ret; +} + +static const char * ibv_ec_comp_strerror(fid_t fid, int prov_errno, void *prov_data, + void *buf, size_t len) +{ + if (buf && len) + strncpy(buf, ibv_wc_status_str(prov_errno), len); + return ibv_wc_status_str(prov_errno); +} + +struct fi_ops_ec ibv_ec_comp_context_ops = { + .size = sizeof(struct fi_ops_ec), + .read = ibv_ec_comp_read, + .readerr = ibv_ec_comp_readerr, + .reset = ibv_ec_comp_reset, + .strerror = ibv_ec_comp_strerror +}; + +struct fi_ops_ec ibv_ec_comp_data_ops = { + .size = sizeof(struct fi_ops_ec), + .read = ibv_ec_comp_read_data, + .readerr = ibv_ec_comp_readerr, + .reset = ibv_ec_comp_reset, + .strerror = ibv_ec_comp_strerror +}; + +static int ibv_ec_comp_close(fid_t fid) +{ + struct ibv_ec_comp *ec; + int ret; + + ec = container_of(fid, struct ibv_ec_comp, ec.fid.fid); + if (ec->cq) { + ret = ibv_destroy_cq(ec->cq); + if (ret) + return -ret; + ec->cq = NULL; + } + if (ec->channel) + ibv_destroy_comp_channel(ec->channel); + + free(ec); + return 0; +} + +struct fi_ops ibv_ec_comp_ops = { + .size = sizeof(struct fi_ops), + .close = ibv_ec_comp_close, +}; + +static int ibv_ec_comp_open(fid_t fid, struct fi_ec_attr *attr, fid_t *ec, void *context) +{ + struct ibv_ec_comp *vec; + long flags = 0; + int ret; + + if (attr->type != FI_EC_QUEUE || attr->wait_cond != FI_EC_COND_NONE) + return -ENOSYS; + + vec = calloc(1, sizeof *vec); + if (!vec) + return -ENOMEM; + + vec->ec.domain = container_of(fid, struct ibv_domain, domain_fid.fid); + + switch (attr->wait_obj) { + case FI_EC_WAIT_FD: + vec->channel = ibv_create_comp_channel(vec->ec.domain->verbs); + if (!vec->channel) { + ret = -errno; + goto err1; + } + fcntl(vec->channel->fd, F_GETFL, &flags); + ret = fcntl(vec->channel->fd, F_SETFL, flags | O_NONBLOCK); + if (ret) { + ret = -errno; + goto err1; + } + break; + case FI_EC_WAIT_NONE: + vec->flags = FI_NONBLOCK; + break; + default: + return -ENOSYS; + } + + vec->cq = ibv_create_cq(vec->ec.domain->verbs, attr->size, vec, + vec->channel, attr->signaling_vector); + if (!vec->cq) { + ret = -errno; + goto err2; + } + + vec->flags |= attr->flags; + vec->ec.fid.fid.fclass = FID_CLASS_EC; + vec->ec.fid.fid.size = sizeof(struct fid_ec); + vec->ec.fid.fid.context = context; + vec->ec.fid.fid.ops = &ibv_ec_comp_ops; + + switch (attr->format) { + case FI_EC_FORMAT_CONTEXT: + vec->ec.fid.ops = &ibv_ec_comp_context_ops; + break; + case FI_EC_FORMAT_DATA: + vec->ec.fid.ops = &ibv_ec_comp_data_ops; + break; + default: + ret = -ENOSYS; + goto err3; + } + + *ec = &vec->ec.fid.fid; + return 0; + +err3: + ibv_destroy_cq(vec->cq); +err2: + if (vec->channel) + ibv_destroy_comp_channel(vec->channel); +err1: + free(vec); + return ret; +} + +static int ibv_ec_open(fid_t fid, struct fi_ec_attr *attr, fid_t *ec, void *context) +{ + struct ibv_ec *vec; + int ret; + + switch (attr->domain) { + case FI_EC_DOMAIN_GENERAL: + return -ENOSYS; + case FI_EC_DOMAIN_COMP: + ret = ibv_ec_comp_open(fid, attr, ec, context); + break; + case FI_EC_DOMAIN_CM: + ret = ibv_ec_cm_open(fid, attr, ec, context); + break; + case FI_EC_DOMAIN_AV: + return -ENOSYS; + default: + return -ENOSYS; + } + if (ret) + return ret; + + vec = container_of(*ec, struct ibv_ec, fid); + vec->ec_domain = attr->domain; + + if (attr->flags & FI_AUTO_RESET && vec->fid.ops->reset) + fi_ec_reset(*ec, attr->cond); + + return 0; +} + +static int ibv_mr_close(fid_t fid) +{ + struct ibv_mem_desc *mr; + int ret; + + mr = container_of(fid, struct ibv_mem_desc, mr_fid.fid); + ret = -ibv_dereg_mr(mr->mr); + if (!ret) + free(mr); + 
return ret; +} + +struct fi_ops ibv_mr_ops = { + .size = sizeof(struct fi_ops), + .close = ibv_mr_close +}; + +static int ibv_mr_reg(fid_t fid, const void *buf, size_t len, fid_t *mr, + uint64_t flags, void *context) +{ + struct ibv_mem_desc *md; + int access; + + md = calloc(1, sizeof *md); + if (!md) + return -ENOMEM; + + md->domain = container_of(fid, struct ibv_domain, domain_fid.fid); + md->mr_fid.fid.fclass = FID_CLASS_MR; + md->mr_fid.fid.size = sizeof(struct fid_mr); + md->mr_fid.fid.context = context; + md->mr_fid.fid.ops = &ibv_mr_ops; + + access = IBV_ACCESS_LOCAL_WRITE; + if (flags & FI_READ) + access |= IBV_ACCESS_REMOTE_READ; + if (flags & FI_WRITE) + access |= IBV_ACCESS_REMOTE_WRITE; + md->mr = ibv_reg_mr(md->domain->pd, (void *) buf, len, access); + if (!md->mr) + goto err; + + md->mr_fid.mem_desc = md->mr->lkey; + md->mr_fid.key = md->mr->rkey; + *mr = &md->mr_fid.fid; + return 0; + +err: + free(md); + return -errno; +} + +static int ibv_close(fid_t fid) +{ + struct ibv_domain *domain; + int ret; + + domain = container_of(fid, struct ibv_domain, domain_fid.fid); + if (domain->pd) { + ret = ibv_dealloc_pd(domain->pd); + if (ret) + return -ret; + domain->pd = NULL; + } + + free(domain); + return 0; +} + +static int ibv_open_device_by_name(struct ibv_domain *domain, const char *name) +{ + struct ibv_context **dev_list; + int i, ret = -ENODEV; + + name = name + sizeof(IBV_PREFIX); + dev_list = rdma_get_devices(NULL); + if (!dev_list) + return -errno; + + for (i = 0; dev_list[i]; i++) { + if (!strcmp(name, ibv_get_device_name(dev_list[i]->device))) { + domain->verbs = dev_list[i]; + ret = 0; + break; + } + } + rdma_free_devices(dev_list); + return ret; +} + +struct fi_ops ibv_fid_ops = { + .size = sizeof(struct fi_ops), + .close = ibv_close, +}; + +struct fi_ops_domain ibv_domain_ops = { + .size = sizeof(struct fi_ops_domain), + .mr_reg = ibv_mr_reg, + .ec_open = ibv_ec_open +}; + +static int ibv_open(const char *name, struct fi_info *info, uint64_t flags, + fid_t *fid, void *context) +{ + struct ibv_domain *domain; + const char *domain_name; + int ret; + + domain_name = name ? name : info->domain_name; + ret = ibv_check_domain(domain_name); + if (ret) + return ret; + + domain = calloc(1, sizeof *domain); + if (!domain) + return -ENOMEM; + + if (strcmp(domain_name + sizeof(IBV_PREFIX), "local")) { + ret = ibv_open_device_by_name(domain, domain_name); + if (ret) + goto err; + + domain->pd = ibv_alloc_pd(domain->verbs); + if (!domain->pd) { + ret = -errno; + goto err; + } + } + + domain->domain_fid.fid.fclass = FID_CLASS_RESOURCE_DOMAIN; + domain->domain_fid.fid.size = sizeof(struct fid_domain); + domain->domain_fid.fid.context = context; + domain->domain_fid.fid.ops = &ibv_fid_ops; + domain->domain_fid.ops = &ibv_domain_ops; + + *fid = &domain->domain_fid.fid; + return 0; +err: + free(domain); + return ret; +} + +struct fi_ops_prov ibv_ops = { + .size = sizeof(struct fi_ops_prov), + .getinfo = ibv_getinfo, + .freeinfo = ibv_freeinfo, + .socket = ibv_socket, + .open = ibv_open +}; + + +void ibv_ini(void) +{ + fi_register(&ibv_ops); +} + +void ibv_fini(void) +{ +} diff --git a/prov/ibverbs/src/ibverbs.h b/prov/ibverbs/src/ibverbs.h new file mode 100644 index 00000000000..e3db32d04fe --- /dev/null +++ b/prov/ibverbs/src/ibverbs.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef IB_VERBS_H +#define IB_VERBS_H + +#include <pthread.h> + +#include <infiniband/driver.h> +#include <rdma/fi_uverbs.h> +#include "fi.h" + +#define HIDDEN __attribute__((visibility ("hidden"))) + +#define DEFAULT_ABI "IBVERBS_1.1" + +#ifdef HAVE_SYMVER_SUPPORT +# define symver(name, api, ver) \ + asm(".symver " #name "," #api "@" #ver) +# define default_symver(name, api) \ + asm(".symver " #name "," #api "@@" DEFAULT_ABI) +#else +# define symver(name, api, ver) +# define default_symver(name, api) \ + extern __typeof(name) api __attribute__((alias(#name))) +#endif /* HAVE_SYMVER_SUPPORT */ + +extern HIDDEN int abi_ver; + +HIDDEN int ibverbs_init(struct ibv_device ***list); + +#endif /* IB_VERBS_H */ diff --git a/prov/ibverbs/src/init.c b/prov/ibverbs/src/init.c new file mode 100644 index 00000000000..4eefb9a796a --- /dev/null +++ b/prov/ibverbs/src/init.c @@ -0,0 +1,473 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2006 Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#if HAVE_CONFIG_H +# include <config.h> +#endif /* HAVE_CONFIG_H */ + +#include <stdlib.h> +#include <string.h> +#include <glob.h> +#include <stdio.h> +#include <dlfcn.h> +#include <unistd.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <sys/time.h> +#include <sys/resource.h> +#include <dirent.h> +#include <errno.h> + +#include "ibverbs.h" +#include "fi.h" + +#define IBV_CONFIG_DIR SYSCONFDIR "/libibverbs.d" + +HIDDEN int abi_ver; + +struct ibv_sysfs_dev { + char sysfs_name[IBV_SYSFS_NAME_MAX]; + char ibdev_name[IBV_SYSFS_NAME_MAX]; + char sysfs_path[IBV_SYSFS_PATH_MAX]; + char ibdev_path[IBV_SYSFS_PATH_MAX]; + struct ibv_sysfs_dev *next; + int abi_ver; + int have_driver; +}; + +struct ibv_driver_name { + char *name; + struct ibv_driver_name *next; +}; + +struct ibv_driver { + const char *name; + ibv_driver_init_func init_func; + struct ibv_driver *next; +}; + +static struct ibv_sysfs_dev *sysfs_dev_list; +static struct ibv_driver_name *driver_name_list; +static struct ibv_driver *head_driver, *tail_driver; + +static int find_sysfs_devs(void) +{ + struct uv_dev *udev; + struct ibv_sysfs_dev *sysfs_dev; + char value[8]; + int ret= 0; + + for (udev = udev_head; udev; udev = udev->next) { + sysfs_dev = calloc(1, sizeof *sysfs_dev); + if (!sysfs_dev) { + ret = ENOMEM; + break; + } + + strcpy(sysfs_dev->sysfs_name, udev->sysfs_name); + strcpy(sysfs_dev->sysfs_path, udev->sysfs_path); + strcpy(sysfs_dev->ibdev_name, udev->dev_name); + strcpy(sysfs_dev->ibdev_path, udev->dev_path); + if (fi_read_file(sysfs_dev->sysfs_path, "abi_version", + value, sizeof value) > 0) + sysfs_dev->abi_ver = strtol(value, NULL, 10); + + sysfs_dev->next = sysfs_dev_list; + sysfs_dev_list = sysfs_dev; + } + + return ret; +} + +void ibv_register_driver(const char *name, ibv_driver_init_func init_func) +{ + struct ibv_driver *driver; + + driver = malloc(sizeof *driver); + if (!driver) { + fprintf(stderr, "ibverbs: warning: couldn't allocate driver for %s\n", name); + return; + } + + driver->name = name; + driver->init_func = init_func; + driver->next = NULL; + + if (tail_driver) + tail_driver->next = driver; + else + head_driver = driver; + tail_driver = driver; +} + +static void load_driver(const char *name) +{ + char *so_name; + void *dlhandle; + +#define __IBV_QUOTE(x) #x +#define IBV_QUOTE(x) __IBV_QUOTE(x) + + if (asprintf(&so_name, + name[0] == '/' ? + "%s-" IBV_QUOTE(IBV_DEVICE_LIBRARY_EXTENSION) ".so" : + "lib%s-" IBV_QUOTE(IBV_DEVICE_LIBRARY_EXTENSION) ".so", + name) < 0) { + fprintf(stderr, "ibverbs: warning: couldn't load driver '%s'.\n", + name); + return; + } + + dlhandle = dlopen(so_name, RTLD_NOW); + if (!dlhandle) { + fprintf(stderr, "ibverbs: warning: couldn't load driver '%s': %s\n", + name, dlerror()); + goto out; + } + +out: + free(so_name); +} + +static void load_drivers(void) +{ + struct ibv_driver_name *name, *next_name; + const char *env; + char *list, *env_name; + + /* + * Only use drivers passed in through the calling user's + * environment if we're not running setuid. 
+ */ + if (getuid() == geteuid()) { + if ((env = getenv("RDMAV_DRIVERS"))) { + list = strdupa(env); + while ((env_name = strsep(&list, ":;"))) + load_driver(env_name); + } else if ((env = getenv("IBV_DRIVERS"))) { + list = strdupa(env); + while ((env_name = strsep(&list, ":;"))) + load_driver(env_name); + } + } + + for (name = driver_name_list, next_name = name ? name->next : NULL; + name; + name = next_name, next_name = name ? name->next : NULL) { + load_driver(name->name); + free(name->name); + free(name); + } +} + +static void read_config_file(const char *path) +{ + FILE *conf; + char *line = NULL; + char *config; + char *field; + size_t buflen = 0; + ssize_t len; + + conf = fopen(path, "r"); + if (!conf) { + fprintf(stderr, "ibverbs: warning: couldn't read config file %s.\n", + path); + return; + } + + while ((len = getline(&line, &buflen, conf)) != -1) { + config = line + strspn(line, "\t "); + if (config[0] == '\n' || config[0] == '#') + continue; + + field = strsep(&config, "\n\t "); + + if (strcmp(field, "driver") == 0 && config != NULL) { + struct ibv_driver_name *driver_name; + + config += strspn(config, "\t "); + field = strsep(&config, "\n\t "); + + driver_name = malloc(sizeof *driver_name); + if (!driver_name) { + fprintf(stderr, "ibverbs: warning: couldn't allocate " + "driver name '%s'.\n", field); + continue; + } + + driver_name->name = strdup(field); + if (!driver_name->name) { + fprintf(stderr, "ibverbs: warning: couldn't allocate " + "driver name '%s'.\n", field); + free(driver_name); + continue; + } + + driver_name->next = driver_name_list; + driver_name_list = driver_name; + } else + fprintf(stderr, "ibverbs: warning: ignoring bad config directive " + "'%s' in file '%s'.\n", field, path); + } + + if (line) + free(line); + fclose(conf); +} + +static void read_config(void) +{ + DIR *conf_dir; + struct dirent *dent; + char *path; + + conf_dir = opendir(IBV_CONFIG_DIR); + if (!conf_dir) { + fprintf(stderr, "ibverbs: warning: couldn't open config directory '%s'.\n", + IBV_CONFIG_DIR); + return; + } + + while ((dent = readdir(conf_dir))) { + struct stat buf; + + if (asprintf(&path, "%s/%s", IBV_CONFIG_DIR, dent->d_name) < 0) { + fprintf(stderr, "ibverbs: warning: couldn't read config file %s/%s.\n", + IBV_CONFIG_DIR, dent->d_name); + goto out; + } + + if (stat(path, &buf)) { + fprintf(stderr, "ibverbs: warning: couldn't stat config file '%s'.\n", + path); + goto next; + } + + if (!S_ISREG(buf.st_mode)) + goto next; + + read_config_file(path); +next: + free(path); + } + +out: + closedir(conf_dir); +} + +static struct ibv_device *try_driver(struct ibv_driver *driver, + struct ibv_sysfs_dev *sysfs_dev) +{ + struct ibv_device *dev; + char value[8]; + + dev = driver->init_func(sysfs_dev->sysfs_path, sysfs_dev->abi_ver); + if (!dev) + return NULL; + + if (fi_read_file(sysfs_dev->ibdev_path, "node_type", value, sizeof value) < 0) { + fprintf(stderr, "ibverbs: warning: no node_type attr under %s.\n", + sysfs_dev->ibdev_path); + dev->node_type = IBV_NODE_UNKNOWN; + } else { + dev->node_type = strtol(value, NULL, 10); + if (dev->node_type < IBV_NODE_CA || dev->node_type > IBV_NODE_RNIC) + dev->node_type = IBV_NODE_UNKNOWN; + } + + switch (dev->node_type) { + case IBV_NODE_CA: + case IBV_NODE_SWITCH: + case IBV_NODE_ROUTER: + dev->transport_type = IBV_TRANSPORT_IB; + break; + case IBV_NODE_RNIC: + dev->transport_type = IBV_TRANSPORT_IWARP; + break; + default: + dev->transport_type = IBV_TRANSPORT_UNKNOWN; + break; + } + + strcpy(dev->dev_name, sysfs_dev->sysfs_name); + 
strcpy(dev->dev_path, sysfs_dev->sysfs_path); + strcpy(dev->name, sysfs_dev->ibdev_name); + strcpy(dev->ibdev_path, sysfs_dev->ibdev_path); + + return dev; +} + +static struct ibv_device *try_drivers(struct ibv_sysfs_dev *sysfs_dev) +{ + struct ibv_driver *driver; + struct ibv_device *dev; + + for (driver = head_driver; driver; driver = driver->next) { + dev = try_driver(driver, sysfs_dev); + if (dev) + return dev; + } + + return NULL; +} + +static void check_memlock_limit(void) +{ + struct rlimit rlim; + + if (!geteuid()) + return; + + if (getrlimit(RLIMIT_MEMLOCK, &rlim)) { + fprintf(stderr, "ibverbs: warning: getrlimit(RLIMIT_MEMLOCK) failed."); + return; + } + + if (rlim.rlim_cur <= 32768) + fprintf(stderr, "ibverbs: warning: RLIMIT_MEMLOCK is %lu bytes.\n" + " This will severely limit memory registrations.\n", + rlim.rlim_cur); +} + +static void add_device(struct ibv_device *dev, + struct ibv_device ***dev_list, + int *num_devices, + int *list_size) +{ + struct ibv_device **new_list; + + if (*list_size <= *num_devices) { + *list_size = *list_size ? *list_size * 2 : 1; + new_list = realloc(*dev_list, *list_size * sizeof (struct ibv_device *)); + if (!new_list) + return; + *dev_list = new_list; + } + + (*dev_list)[(*num_devices)++] = dev; +} + +HIDDEN int ibverbs_init(struct ibv_device ***list) +{ + const char *sysfs_path; + struct ibv_sysfs_dev *sysfs_dev, *next_dev; + struct ibv_device *device; + int num_devices = 0; + int list_size = 0; + int statically_linked = 0; + int no_driver = 0; + int ret; + + fi_init(); /* temporary until we have a real provider */ + *list = NULL; + + if (getenv("RDMAV_FORK_SAFE") || getenv("IBV_FORK_SAFE")) + if (ibv_fork_init()) + fprintf(stderr, "ibverbs: warning: fork()-safety requested " + "but init failed\n"); + + sysfs_path = fi_sysfs_path(); + if (!sysfs_path) + return -ENOSYS; + + check_memlock_limit(); + + read_config(); + + ret = find_sysfs_devs(); + if (ret) + return -ret; + + for (sysfs_dev = sysfs_dev_list; sysfs_dev; sysfs_dev = sysfs_dev->next) { + device = try_drivers(sysfs_dev); + if (device) { + add_device(device, list, &num_devices, &list_size); + sysfs_dev->have_driver = 1; + } else + no_driver = 1; + } + + if (!no_driver) + goto out; + + /* + * Check if we can dlopen() ourselves. If this fails, + * libibverbs is probably statically linked into the + * executable, and we should just give up, since trying to + * dlopen() a driver module will fail spectacularly (loading a + * driver .so will bring in dynamic copies of libibverbs and + * libdl to go along with the static copies the executable + * has, which quickly leads to a crash. + */ + { + void *hand = dlopen(NULL, RTLD_NOW); + if (!hand) { + fprintf(stderr, "ibverbs: warning: dlopen(NULL) failed, " + "assuming static linking.\n"); + statically_linked = 1; + goto out; + } + dlclose(hand); + } + + load_drivers(); + + for (sysfs_dev = sysfs_dev_list; sysfs_dev; sysfs_dev = sysfs_dev->next) { + if (sysfs_dev->have_driver) + continue; + + device = try_drivers(sysfs_dev); + if (device) { + add_device(device, list, &num_devices, &list_size); + sysfs_dev->have_driver = 1; + } + } + +out: + for (sysfs_dev = sysfs_dev_list, + next_dev = sysfs_dev ? sysfs_dev->next : NULL; + sysfs_dev; + sysfs_dev = next_dev, next_dev = sysfs_dev ? 
sysfs_dev->next : NULL) { + if (!sysfs_dev->have_driver) { + fprintf(stderr, "ibverbs: warning: no userspace device-specific " + "driver found for %s\n", sysfs_dev->sysfs_path); + if (statically_linked) + fprintf(stderr, " When linking libibverbs statically, " + "driver must be statically linked too.\n"); + } + free(sysfs_dev); + } + + return num_devices; +} diff --git a/prov/ibverbs/src/marshall.c b/prov/ibverbs/src/marshall.c new file mode 100644 index 00000000000..dc576059404 --- /dev/null +++ b/prov/ibverbs/src/marshall.c @@ -0,0 +1,144 @@ +/* + * Copyright (c) 2005 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#if HAVE_CONFIG_H +# include <config.h> +#endif /* HAVE_CONFIG_H */ + +#include <string.h> + +#include <infiniband/marshall.h> +#include <rdma/fi_ucma.h> + + +void ibv_copy_ah_attr_from_kern(struct ibv_ah_attr *dst, + struct ibv_kern_ah_attr *src) +{ + memcpy(&dst->grh.dgid, src->grh.dgid, sizeof dst->grh.dgid); + dst->grh.flow_label = src->grh.flow_label; + dst->grh.sgid_index = src->grh.sgid_index; + dst->grh.hop_limit = src->grh.hop_limit; + dst->grh.traffic_class = src->grh.traffic_class; + + dst->dlid = src->dlid; + dst->sl = src->sl; + dst->src_path_bits = src->src_path_bits; + dst->static_rate = src->static_rate; + dst->is_global = src->is_global; + dst->port_num = src->port_num; +} + +void ibv_copy_qp_attr_from_kern(struct ibv_qp_attr *dst, + struct ibv_kern_qp_attr *src) +{ + dst->cur_qp_state = src->cur_qp_state; + dst->path_mtu = src->path_mtu; + dst->path_mig_state = src->path_mig_state; + dst->qkey = src->qkey; + dst->rq_psn = src->rq_psn; + dst->sq_psn = src->sq_psn; + dst->dest_qp_num = src->dest_qp_num; + dst->qp_access_flags = src->qp_access_flags; + + dst->cap.max_send_wr = src->max_send_wr; + dst->cap.max_recv_wr = src->max_recv_wr; + dst->cap.max_send_sge = src->max_send_sge; + dst->cap.max_recv_sge = src->max_recv_sge; + dst->cap.max_inline_data = src->max_inline_data; + + ibv_copy_ah_attr_from_kern(&dst->ah_attr, &src->ah_attr); + ibv_copy_ah_attr_from_kern(&dst->alt_ah_attr, &src->alt_ah_attr); + + dst->pkey_index = src->pkey_index; + dst->alt_pkey_index = src->alt_pkey_index; + dst->en_sqd_async_notify = src->en_sqd_async_notify; + dst->sq_draining = src->sq_draining; + dst->max_rd_atomic = src->max_rd_atomic; + dst->max_dest_rd_atomic = src->max_dest_rd_atomic; + dst->min_rnr_timer = src->min_rnr_timer; + dst->port_num = src->port_num; + dst->timeout = src->timeout; + dst->retry_cnt = src->retry_cnt; + dst->rnr_retry = src->rnr_retry; + dst->alt_port_num = src->alt_port_num; + dst->alt_timeout = src->alt_timeout; +} + +void ibv_copy_path_rec_from_kern(struct ibv_sa_path_rec *dst, + struct ibv_kern_path_rec *src) +{ + memcpy(&dst->dgid, src->dgid, sizeof dst->dgid); + memcpy(&dst->sgid, src->sgid, sizeof dst->sgid); + + dst->dlid = src->dlid; + dst->slid = src->slid; + dst->raw_traffic = src->raw_traffic; + dst->flow_label = src->flow_label; + dst->hop_limit = src->hop_limit; + dst->traffic_class = src->traffic_class; + dst->reversible = src->reversible; + dst->numb_path = src->numb_path; + dst->pkey = src->pkey; + dst->sl = src->sl; + dst->mtu_selector = src->mtu_selector; + dst->mtu = src->mtu; + dst->rate_selector = src->rate_selector; + dst->rate = src->rate; + dst->packet_life_time = src->packet_life_time; + dst->preference = src->preference; + dst->packet_life_time_selector = src->packet_life_time_selector; +} + +void ibv_copy_path_rec_to_kern(struct ibv_kern_path_rec *dst, + struct ibv_sa_path_rec *src) +{ + memcpy(dst->dgid, &src->dgid, sizeof src->dgid); + memcpy(dst->sgid, &src->sgid, sizeof src->sgid); + + dst->dlid = src->dlid; + dst->slid = src->slid; + dst->raw_traffic = src->raw_traffic; + dst->flow_label = src->flow_label; + dst->hop_limit = src->hop_limit; + dst->traffic_class = src->traffic_class; + dst->reversible = src->reversible; + dst->numb_path = src->numb_path; + dst->pkey = src->pkey; + dst->sl = src->sl; + dst->mtu_selector = src->mtu_selector; + dst->mtu = src->mtu; + dst->rate_selector = src->rate_selector; + dst->rate = src->rate; + dst->packet_life_time = src->packet_life_time; + dst->preference = src->preference; + 
dst->packet_life_time_selector = src->packet_life_time_selector; +} diff --git a/prov/ibverbs/src/memory.c b/prov/ibverbs/src/memory.c new file mode 100644 index 00000000000..7d97e5541b2 --- /dev/null +++ b/prov/ibverbs/src/memory.c @@ -0,0 +1,719 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2006 Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#if HAVE_CONFIG_H +# include <config.h> +#endif /* HAVE_CONFIG_H */ + +#include <errno.h> +#include <sys/mman.h> +#include <unistd.h> +#include <stdlib.h> +#include <stdint.h> +#include <stdio.h> +#include <string.h> +#include <dirent.h> +#include <limits.h> +#include <inttypes.h> + +#include "ibverbs.h" + +/* + * Most distro's headers don't have these yet. 
+ */ +#ifndef MADV_DONTFORK +#define MADV_DONTFORK 10 +#endif + +#ifndef MADV_DOFORK +#define MADV_DOFORK 11 +#endif + +struct ibv_mem_node { + enum { + IBV_RED, + IBV_BLACK + } color; + struct ibv_mem_node *parent; + struct ibv_mem_node *left, *right; + uintptr_t start, end; + int refcnt; +}; + +static struct ibv_mem_node *mm_root; +static pthread_mutex_t mm_mutex = PTHREAD_MUTEX_INITIALIZER; +static int page_size; +static int huge_page_enabled; +static int too_late; + +static unsigned long smaps_page_size(FILE *file) +{ + int n; + unsigned long size = page_size; + char buf[1024]; + + while (fgets(buf, sizeof(buf), file) != NULL) { + if (!strstr(buf, "KernelPageSize:")) + continue; + + n = sscanf(buf, "%*s %lu", &size); + if (n < 1) + continue; + + /* page size is printed in Kb */ + size = size * 1024; + + break; + } + + return size; +} + +static unsigned long get_page_size(void *base) +{ + unsigned long ret = page_size; + pid_t pid; + FILE *file; + char buf[1024]; + + pid = getpid(); + snprintf(buf, sizeof(buf), "/proc/%d/smaps", pid); + + file = fopen(buf, "r"); + if (!file) + goto out; + + while (fgets(buf, sizeof(buf), file) != NULL) { + int n; + uintptr_t range_start, range_end; + + n = sscanf(buf, "%" SCNxPTR "-%" SCNxPTR, &range_start, &range_end); + + if (n < 2) + continue; + + if ((uintptr_t) base >= range_start && (uintptr_t) base < range_end) { + ret = smaps_page_size(file); + break; + } + } + + fclose(file); + +out: + return ret; +} + +int ibv_fork_init(void) +{ + void *tmp, *tmp_aligned; + int ret; + unsigned long size; + + if (mm_root) + return 0; + + if (too_late) + return EINVAL; + + page_size = sysconf(_SC_PAGESIZE); + if (page_size < 0) + return errno; + + if (posix_memalign(&tmp, page_size, page_size)) + return ENOMEM; + + if (getenv("RDMAV_HUGEPAGES_SAFE")) + huge_page_enabled = 1; + else + huge_page_enabled = 0; + + if (huge_page_enabled) { + size = get_page_size(tmp); + tmp_aligned = (void *) ((uintptr_t) tmp & ~(size - 1)); + } else { + size = page_size; + tmp_aligned = tmp; + } + + ret = madvise(tmp_aligned, size, MADV_DONTFORK) || + madvise(tmp_aligned, size, MADV_DOFORK); + + free(tmp); + + if (ret) + return ENOSYS; + + mm_root = malloc(sizeof *mm_root); + if (!mm_root) + return ENOMEM; + + mm_root->parent = NULL; + mm_root->left = NULL; + mm_root->right = NULL; + mm_root->color = IBV_BLACK; + mm_root->start = 0; + mm_root->end = UINTPTR_MAX; + mm_root->refcnt = 0; + + return 0; +} + +static struct ibv_mem_node *__mm_prev(struct ibv_mem_node *node) +{ + if (node->left) { + node = node->left; + while (node->right) + node = node->right; + } else { + while (node->parent && node == node->parent->left) + node = node->parent; + + node = node->parent; + } + + return node; +} + +static struct ibv_mem_node *__mm_next(struct ibv_mem_node *node) +{ + if (node->right) { + node = node->right; + while (node->left) + node = node->left; + } else { + while (node->parent && node == node->parent->right) + node = node->parent; + + node = node->parent; + } + + return node; +} + +static void __mm_rotate_right(struct ibv_mem_node *node) +{ + struct ibv_mem_node *tmp; + + tmp = node->left; + + node->left = tmp->right; + if (node->left) + node->left->parent = node; + + if (node->parent) { + if (node->parent->right == node) + node->parent->right = tmp; + else + node->parent->left = tmp; + } else + mm_root = tmp; + + tmp->parent = node->parent; + + tmp->right = node; + node->parent = tmp; +} + +static void __mm_rotate_left(struct ibv_mem_node *node) +{ + struct ibv_mem_node *tmp; + + tmp = 
node->right; + + node->right = tmp->left; + if (node->right) + node->right->parent = node; + + if (node->parent) { + if (node->parent->right == node) + node->parent->right = tmp; + else + node->parent->left = tmp; + } else + mm_root = tmp; + + tmp->parent = node->parent; + + tmp->left = node; + node->parent = tmp; +} + +#if 0 +static int verify(struct ibv_mem_node *node) +{ + int hl, hr; + + if (!node) + return 1; + + hl = verify(node->left); + hr = verify(node->left); + + if (!hl || !hr) + return 0; + if (hl != hr) + return 0; + + if (node->color == IBV_RED) { + if (node->left && node->left->color != IBV_BLACK) + return 0; + if (node->right && node->right->color != IBV_BLACK) + return 0; + return hl; + } + + return hl + 1; +} +#endif + +static void __mm_add_rebalance(struct ibv_mem_node *node) +{ + struct ibv_mem_node *parent, *gp, *uncle; + + while (node->parent && node->parent->color == IBV_RED) { + parent = node->parent; + gp = node->parent->parent; + + if (parent == gp->left) { + uncle = gp->right; + + if (uncle && uncle->color == IBV_RED) { + parent->color = IBV_BLACK; + uncle->color = IBV_BLACK; + gp->color = IBV_RED; + + node = gp; + } else { + if (node == parent->right) { + __mm_rotate_left(parent); + node = parent; + parent = node->parent; + } + + parent->color = IBV_BLACK; + gp->color = IBV_RED; + + __mm_rotate_right(gp); + } + } else { + uncle = gp->left; + + if (uncle && uncle->color == IBV_RED) { + parent->color = IBV_BLACK; + uncle->color = IBV_BLACK; + gp->color = IBV_RED; + + node = gp; + } else { + if (node == parent->left) { + __mm_rotate_right(parent); + node = parent; + parent = node->parent; + } + + parent->color = IBV_BLACK; + gp->color = IBV_RED; + + __mm_rotate_left(gp); + } + } + } + + mm_root->color = IBV_BLACK; +} + +static void __mm_add(struct ibv_mem_node *new) +{ + struct ibv_mem_node *node, *parent = NULL; + + node = mm_root; + while (node) { + parent = node; + if (node->start < new->start) + node = node->right; + else + node = node->left; + } + + if (parent->start < new->start) + parent->right = new; + else + parent->left = new; + + new->parent = parent; + new->left = NULL; + new->right = NULL; + + new->color = IBV_RED; + __mm_add_rebalance(new); +} + +static void __mm_remove(struct ibv_mem_node *node) +{ + struct ibv_mem_node *child, *parent, *sib, *tmp; + int nodecol; + + if (node->left && node->right) { + tmp = node->left; + while (tmp->right) + tmp = tmp->right; + + nodecol = tmp->color; + child = tmp->left; + tmp->color = node->color; + + if (tmp->parent != node) { + parent = tmp->parent; + parent->right = tmp->left; + if (tmp->left) + tmp->left->parent = parent; + + tmp->left = node->left; + node->left->parent = tmp; + } else + parent = tmp; + + tmp->right = node->right; + node->right->parent = tmp; + + tmp->parent = node->parent; + if (node->parent) { + if (node->parent->left == node) + node->parent->left = tmp; + else + node->parent->right = tmp; + } else + mm_root = tmp; + } else { + nodecol = node->color; + + child = node->left ? 
node->left : node->right; + parent = node->parent; + + if (child) + child->parent = parent; + if (parent) { + if (parent->left == node) + parent->left = child; + else + parent->right = child; + } else + mm_root = child; + } + + free(node); + + if (nodecol == IBV_RED) + return; + + while ((!child || child->color == IBV_BLACK) && child != mm_root) { + if (parent->left == child) { + sib = parent->right; + + if (sib->color == IBV_RED) { + parent->color = IBV_RED; + sib->color = IBV_BLACK; + __mm_rotate_left(parent); + sib = parent->right; + } + + if ((!sib->left || sib->left->color == IBV_BLACK) && + (!sib->right || sib->right->color == IBV_BLACK)) { + sib->color = IBV_RED; + child = parent; + parent = child->parent; + } else { + if (!sib->right || sib->right->color == IBV_BLACK) { + if (sib->left) + sib->left->color = IBV_BLACK; + sib->color = IBV_RED; + __mm_rotate_right(sib); + sib = parent->right; + } + + sib->color = parent->color; + parent->color = IBV_BLACK; + if (sib->right) + sib->right->color = IBV_BLACK; + __mm_rotate_left(parent); + child = mm_root; + break; + } + } else { + sib = parent->left; + + if (sib->color == IBV_RED) { + parent->color = IBV_RED; + sib->color = IBV_BLACK; + __mm_rotate_right(parent); + sib = parent->left; + } + + if ((!sib->left || sib->left->color == IBV_BLACK) && + (!sib->right || sib->right->color == IBV_BLACK)) { + sib->color = IBV_RED; + child = parent; + parent = child->parent; + } else { + if (!sib->left || sib->left->color == IBV_BLACK) { + if (sib->right) + sib->right->color = IBV_BLACK; + sib->color = IBV_RED; + __mm_rotate_left(sib); + sib = parent->left; + } + + sib->color = parent->color; + parent->color = IBV_BLACK; + if (sib->left) + sib->left->color = IBV_BLACK; + __mm_rotate_right(parent); + child = mm_root; + break; + } + } + } + + if (child) + child->color = IBV_BLACK; +} + +static struct ibv_mem_node *__mm_find_start(uintptr_t start, uintptr_t end) +{ + struct ibv_mem_node *node = mm_root; + + while (node) { + if (node->start <= start && node->end >= start) + break; + + if (node->start < start) + node = node->right; + else + node = node->left; + } + + return node; +} + +static struct ibv_mem_node *merge_ranges(struct ibv_mem_node *node, + struct ibv_mem_node *prev) +{ + prev->end = node->end; + prev->refcnt = node->refcnt; + __mm_remove(node); + + return prev; +} + +static struct ibv_mem_node *split_range(struct ibv_mem_node *node, + uintptr_t cut_line) +{ + struct ibv_mem_node *new_node = NULL; + + new_node = malloc(sizeof *new_node); + if (!new_node) + return NULL; + new_node->start = cut_line; + new_node->end = node->end; + new_node->refcnt = node->refcnt; + node->end = cut_line - 1; + __mm_add(new_node); + + return new_node; +} + +static struct ibv_mem_node *get_start_node(uintptr_t start, uintptr_t end, + int inc) +{ + struct ibv_mem_node *node, *tmp = NULL; + + node = __mm_find_start(start, end); + if (node->start < start) + node = split_range(node, start); + else { + tmp = __mm_prev(node); + if (tmp && tmp->refcnt == node->refcnt + inc) + node = merge_ranges(node, tmp); + } + return node; +} + +/* + * This function is called if madvise() fails to undo merging/splitting + * operations performed on the node. + */ +static struct ibv_mem_node *undo_node(struct ibv_mem_node *node, + uintptr_t start, int inc) +{ + struct ibv_mem_node *tmp = NULL; + + /* + * This condition can be true only if we merged this + * node with the previous one, so we need to split them. 
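+ * Splitting at 'start' restores the original boundary: the lower + * piece (the old previous node) gets 'inc' added back to recover its + * pre-merge reference count, and the undo then continues on the upper + * piece.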
+ */ + if (start > node->start) { + tmp = split_range(node, start); + if (tmp) { + node->refcnt += inc; + node = tmp; + } else + return NULL; + } + + tmp = __mm_prev(node); + if (tmp && tmp->refcnt == node->refcnt) + node = merge_ranges(node, tmp); + + tmp = __mm_next(node); + if (tmp && tmp->refcnt == node->refcnt) + node = merge_ranges(tmp, node); + + return node; +} + +static int ibv_madvise_range(void *base, size_t size, int advice) +{ + uintptr_t start, end; + struct ibv_mem_node *node, *tmp; + int inc; + int rolling_back = 0; + int ret = 0; + unsigned long range_page_size; + + if (!size) + return 0; + + if (huge_page_enabled) + range_page_size = get_page_size(base); + else + range_page_size = page_size; + + start = (uintptr_t) base & ~(range_page_size - 1); + end = ((uintptr_t) (base + size + range_page_size - 1) & + ~(range_page_size - 1)) - 1; + + pthread_mutex_lock(&mm_mutex); +again: + inc = advice == MADV_DONTFORK ? 1 : -1; + + node = get_start_node(start, end, inc); + if (!node) { + ret = -1; + goto out; + } + + while (node && node->start <= end) { + if (node->end > end) { + if (!split_range(node, end + 1)) { + ret = -1; + goto out; + } + } + + if ((inc == -1 && node->refcnt == 1) || + (inc == 1 && node->refcnt == 0)) { + /* + * If this is the first time through the loop, + * and we merged this node with the previous + * one, then we only want to do the madvise() + * on start ... node->end (rather than + * starting at node->start). + * + * Otherwise we end up doing madvise() on + * bigger region than we're being asked to, + * and that may lead to a spurious failure. + */ + if (start > node->start) + ret = madvise((void *) start, node->end - start + 1, + advice); + else + ret = madvise((void *) node->start, + node->end - node->start + 1, + advice); + if (ret) { + node = undo_node(node, start, inc); + + if (rolling_back || !node) + goto out; + + /* madvise failed, roll back previous changes */ + rolling_back = 1; + advice = advice == MADV_DONTFORK ? + MADV_DOFORK : MADV_DONTFORK; + tmp = __mm_prev(node); + if (!tmp || start > tmp->end) + goto out; + end = tmp->end; + goto again; + } + } + + node->refcnt += inc; + node = __mm_next(node); + } + + if (node) { + tmp = __mm_prev(node); + if (tmp && node->refcnt == tmp->refcnt) + node = merge_ranges(node, tmp); + } + +out: + if (rolling_back) + ret = -1; + + pthread_mutex_unlock(&mm_mutex); + + return ret; +} + +int ibv_dontfork_range(void *base, size_t size) +{ + if (mm_root) + return ibv_madvise_range(base, size, MADV_DONTFORK); + else { + too_late = 1; + return 0; + } +} + +int ibv_dofork_range(void *base, size_t size) +{ + if (mm_root) + return ibv_madvise_range(base, size, MADV_DOFORK); + else { + too_late = 1; + return 0; + } +} diff --git a/prov/ibverbs/src/verbs.c b/prov/ibverbs/src/verbs.c new file mode 100644 index 00000000000..c58108087b3 --- /dev/null +++ b/prov/ibverbs/src/verbs.c @@ -0,0 +1,534 @@ +/* + * Copyright (c) 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#if HAVE_CONFIG_H +# include <config.h> +#endif /* HAVE_CONFIG_H */ + +#include <stdio.h> +#include <netinet/in.h> +#include <unistd.h> +#include <stdlib.h> +#include <errno.h> +#include <string.h> + +#include "ibverbs.h" + +int ibv_rate_to_mult(enum ibv_rate rate) +{ + switch (rate) { + case IBV_RATE_2_5_GBPS: return 1; + case IBV_RATE_5_GBPS: return 2; + case IBV_RATE_10_GBPS: return 4; + case IBV_RATE_20_GBPS: return 8; + case IBV_RATE_30_GBPS: return 12; + case IBV_RATE_40_GBPS: return 16; + case IBV_RATE_60_GBPS: return 24; + case IBV_RATE_80_GBPS: return 32; + case IBV_RATE_120_GBPS: return 48; + default: return -1; + } +} + +enum ibv_rate mult_to_ibv_rate(int mult) +{ + switch (mult) { + case 1: return IBV_RATE_2_5_GBPS; + case 2: return IBV_RATE_5_GBPS; + case 4: return IBV_RATE_10_GBPS; + case 8: return IBV_RATE_20_GBPS; + case 12: return IBV_RATE_30_GBPS; + case 16: return IBV_RATE_40_GBPS; + case 24: return IBV_RATE_60_GBPS; + case 32: return IBV_RATE_80_GBPS; + case 48: return IBV_RATE_120_GBPS; + default: return IBV_RATE_MAX; + } +} + +int ibv_rate_to_mbps(enum ibv_rate rate) +{ + switch (rate) { + case IBV_RATE_2_5_GBPS: return 2500; + case IBV_RATE_5_GBPS: return 5000; + case IBV_RATE_10_GBPS: return 10000; + case IBV_RATE_20_GBPS: return 20000; + case IBV_RATE_30_GBPS: return 30000; + case IBV_RATE_40_GBPS: return 40000; + case IBV_RATE_60_GBPS: return 60000; + case IBV_RATE_80_GBPS: return 80000; + case IBV_RATE_120_GBPS: return 120000; + case IBV_RATE_14_GBPS: return 14062; + case IBV_RATE_56_GBPS: return 56250; + case IBV_RATE_112_GBPS: return 112500; + case IBV_RATE_168_GBPS: return 168750; + case IBV_RATE_25_GBPS: return 25781; + case IBV_RATE_100_GBPS: return 103125; + case IBV_RATE_200_GBPS: return 206250; + case IBV_RATE_300_GBPS: return 309375; + default: return -1; + } +} + +enum ibv_rate mbps_to_ibv_rate(int mbps) +{ + switch (mbps) { + case 2500: return IBV_RATE_2_5_GBPS; + case 5000: return IBV_RATE_5_GBPS; + case 10000: return IBV_RATE_10_GBPS; + case 20000: return IBV_RATE_20_GBPS; + case 30000: return IBV_RATE_30_GBPS; + case 40000: return IBV_RATE_40_GBPS; + case 60000: return IBV_RATE_60_GBPS; + case 80000: return IBV_RATE_80_GBPS; + case 120000: return IBV_RATE_120_GBPS; + case 14062: return IBV_RATE_14_GBPS; + case 56250: 
return IBV_RATE_56_GBPS; + case 112500: return IBV_RATE_112_GBPS; + case 168750: return IBV_RATE_168_GBPS; + case 25781: return IBV_RATE_25_GBPS; + case 103125: return IBV_RATE_100_GBPS; + case 206250: return IBV_RATE_200_GBPS; + case 309375: return IBV_RATE_300_GBPS; + default: return IBV_RATE_MAX; + } +} + +int ibv_query_device(struct ibv_context *context, + struct ibv_device_attr *device_attr) +{ + return context->ops.query_device(context, device_attr); +} + +int ibv_query_port(struct ibv_context *context, uint8_t port_num, + struct ibv_port_attr *port_attr) +{ + return context->ops.query_port(context, port_num, port_attr); +} + +int ibv_query_gid(struct ibv_context *context, uint8_t port_num, + int index, union ibv_gid *gid) +{ + char name[24]; + char attr[41]; + uint16_t val; + int i; + + snprintf(name, sizeof name, "ports/%d/gids/%d", port_num, index); + + if (fi_read_file(context->device->ibdev_path, name, + attr, sizeof attr) < 0) + return -1; + + for (i = 0; i < 8; ++i) { + if (sscanf(attr + i * 5, "%hx", &val) != 1) + return -1; + gid->raw[i * 2 ] = val >> 8; + gid->raw[i * 2 + 1] = val & 0xff; + } + + return 0; +} + +int ibv_query_pkey(struct ibv_context *context, uint8_t port_num, + int index, uint16_t *pkey) +{ + char name[24]; + char attr[8]; + uint16_t val; + + snprintf(name, sizeof name, "ports/%d/pkeys/%d", port_num, index); + + if (fi_read_file(context->device->ibdev_path, name, + attr, sizeof attr) < 0) + return -1; + + if (sscanf(attr, "%hx", &val) != 1) + return -1; + + *pkey = htons(val); + return 0; +} + +struct ibv_pd *ibv_alloc_pd(struct ibv_context *context) +{ + struct ibv_pd *pd; + + pd = context->ops.alloc_pd(context); + if (pd) + pd->context = context; + + return pd; +} + +int ibv_dealloc_pd(struct ibv_pd *pd) +{ + return pd->context->ops.dealloc_pd(pd); +} + +struct ibv_mr *ibv_reg_mr(struct ibv_pd *pd, void *addr, + size_t length, int access) +{ + struct ibv_mr *mr; + + if (ibv_dontfork_range(addr, length)) + return NULL; + + mr = pd->context->ops.reg_mr(pd, addr, length, access); + if (mr) { + mr->context = pd->context; + mr->pd = pd; + mr->addr = addr; + mr->length = length; + } else + ibv_dofork_range(addr, length); + + return mr; +} + +int ibv_dereg_mr(struct ibv_mr *mr) +{ + int ret; + void *addr = mr->addr; + size_t length = mr->length; + + ret = mr->context->ops.dereg_mr(mr); + if (!ret) + ibv_dofork_range(addr, length); + + return ret; +} + +struct ibv_comp_channel *ibv_create_comp_channel(struct ibv_context *context) +{ + struct ibv_comp_channel *channel; + struct ibv_create_comp_channel cmd; + struct ibv_create_comp_channel_resp resp; + int ret; + + channel = malloc(sizeof *channel); + if (!channel) + return NULL; + + ret = uv_create_comp_channel(context->uv_fid, &cmd, sizeof cmd, &resp, sizeof resp); + if (ret) { + free(channel); + return NULL; + } + + channel->context = context; + channel->fd = resp.fd; + channel->refcnt = 0; + + return channel; +} + +int ibv_destroy_comp_channel(struct ibv_comp_channel *channel) +{ + struct ibv_context *context; + int ret; + + context = channel->context; + pthread_mutex_lock(&context->mutex); + + if (channel->refcnt) { + ret = EBUSY; + goto out; + } + + close(channel->fd); + free(channel); + ret = 0; + +out: + pthread_mutex_unlock(&context->mutex); + + return ret; +} + +struct ibv_cq *ibv_create_cq(struct ibv_context *context, int cqe, void *cq_context, + struct ibv_comp_channel *channel, int comp_vector) +{ + struct ibv_cq *cq; + + pthread_mutex_lock(&context->mutex); + + cq = context->ops.create_cq(context, 
cqe, channel, comp_vector); + + if (cq) { + cq->context = context; + cq->channel = channel; + if (channel) + ++channel->refcnt; + cq->cq_context = cq_context; + cq->comp_events_completed = 0; + cq->async_events_completed = 0; + pthread_mutex_init(&cq->mutex, NULL); + pthread_cond_init(&cq->cond, NULL); + } + + pthread_mutex_unlock(&context->mutex); + + return cq; +} + +int ibv_resize_cq(struct ibv_cq *cq, int cqe) +{ + if (!cq->context->ops.resize_cq) + return ENOSYS; + + return cq->context->ops.resize_cq(cq, cqe); +} + +int ibv_destroy_cq(struct ibv_cq *cq) +{ + struct ibv_comp_channel *channel = cq->channel; + int ret; + + if (channel) + pthread_mutex_lock(&channel->context->mutex); + + ret = cq->context->ops.destroy_cq(cq); + + if (channel) { + if (!ret) + --channel->refcnt; + pthread_mutex_unlock(&channel->context->mutex); + } + + return ret; +} + +int ibv_get_cq_event(struct ibv_comp_channel *channel, + struct ibv_cq **cq, void **cq_context) +{ + struct ibv_comp_event ev; + + if (read(channel->fd, &ev, sizeof ev) != sizeof ev) + return -1; + + *cq = (struct ibv_cq *) (uintptr_t) ev.cq_handle; + *cq_context = (*cq)->cq_context; + + if ((*cq)->context->ops.cq_event) + (*cq)->context->ops.cq_event(*cq); + + return 0; +} + +void ibv_ack_cq_events(struct ibv_cq *cq, unsigned int nevents) +{ + pthread_mutex_lock(&cq->mutex); + cq->comp_events_completed += nevents; + pthread_cond_signal(&cq->cond); + pthread_mutex_unlock(&cq->mutex); +} + +struct ibv_srq *ibv_create_srq(struct ibv_pd *pd, + struct ibv_srq_init_attr *srq_init_attr) +{ + struct ibv_srq *srq; + + if (!pd->context->ops.create_srq) + return NULL; + + srq = pd->context->ops.create_srq(pd, srq_init_attr); + if (srq) { + srq->context = pd->context; + srq->srq_context = srq_init_attr->srq_context; + srq->pd = pd; + srq->events_completed = 0; + pthread_mutex_init(&srq->mutex, NULL); + pthread_cond_init(&srq->cond, NULL); + } + + return srq; +} + +int ibv_modify_srq(struct ibv_srq *srq, + struct ibv_srq_attr *srq_attr, + int srq_attr_mask) +{ + return srq->context->ops.modify_srq(srq, srq_attr, srq_attr_mask); +} + +int ibv_query_srq(struct ibv_srq *srq, struct ibv_srq_attr *srq_attr) +{ + return srq->context->ops.query_srq(srq, srq_attr); +} + +int ibv_destroy_srq(struct ibv_srq *srq) +{ + return srq->context->ops.destroy_srq(srq); +} + +struct ibv_qp *ibv_create_qp(struct ibv_pd *pd, + struct ibv_qp_init_attr *qp_init_attr) +{ + struct ibv_qp *qp = pd->context->ops.create_qp(pd, qp_init_attr); + + if (qp) { + qp->context = pd->context; + qp->qp_context = qp_init_attr->qp_context; + qp->pd = pd; + qp->send_cq = qp_init_attr->send_cq; + qp->recv_cq = qp_init_attr->recv_cq; + qp->srq = qp_init_attr->srq; + qp->qp_type = qp_init_attr->qp_type; + qp->state = IBV_QPS_RESET; + qp->events_completed = 0; + pthread_mutex_init(&qp->mutex, NULL); + pthread_cond_init(&qp->cond, NULL); + } + + return qp; +} + +int ibv_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, + int attr_mask, + struct ibv_qp_init_attr *init_attr) +{ + int ret; + + ret = qp->context->ops.query_qp(qp, attr, attr_mask, init_attr); + if (ret) + return ret; + + if (attr_mask & IBV_QP_STATE) + qp->state = attr->qp_state; + + return 0; +} + +int ibv_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, + int attr_mask) +{ + int ret; + + ret = qp->context->ops.modify_qp(qp, attr, attr_mask); + if (ret) + return ret; + + if (attr_mask & IBV_QP_STATE) + qp->state = attr->qp_state; + + return 0; +} + +int ibv_destroy_qp(struct ibv_qp *qp) +{ + return 
qp->context->ops.destroy_qp(qp); +} + +struct ibv_ah *ibv_create_ah(struct ibv_pd *pd, struct ibv_ah_attr *attr) +{ + struct ibv_ah *ah = pd->context->ops.create_ah(pd, attr); + + if (ah) { + ah->context = pd->context; + ah->pd = pd; + } + + return ah; +} + +static int ibv_find_gid_index(struct ibv_context *context, uint8_t port_num, + union ibv_gid *gid) +{ + union ibv_gid sgid; + int i = 0, ret; + + do { + ret = ibv_query_gid(context, port_num, i++, &sgid); + } while (!ret && memcmp(&sgid, gid, sizeof *gid)); + + return ret ? ret : i - 1; +} + +int ibv_init_ah_from_wc(struct ibv_context *context, uint8_t port_num, + struct ibv_wc *wc, struct ibv_grh *grh, + struct ibv_ah_attr *ah_attr) +{ + uint32_t flow_class; + int ret; + + memset(ah_attr, 0, sizeof *ah_attr); + ah_attr->dlid = wc->slid; + ah_attr->sl = wc->sl; + ah_attr->src_path_bits = wc->dlid_path_bits; + ah_attr->port_num = port_num; + + if (wc->wc_flags & IBV_WC_GRH) { + ah_attr->is_global = 1; + ah_attr->grh.dgid = grh->sgid; + + ret = ibv_find_gid_index(context, port_num, &grh->dgid); + if (ret < 0) + return ret; + + ah_attr->grh.sgid_index = (uint8_t) ret; + flow_class = ntohl(grh->version_tclass_flow); + ah_attr->grh.flow_label = flow_class & 0xFFFFF; + ah_attr->grh.hop_limit = grh->hop_limit; + ah_attr->grh.traffic_class = (flow_class >> 20) & 0xFF; + } + return 0; +} + +struct ibv_ah *ibv_create_ah_from_wc(struct ibv_pd *pd, struct ibv_wc *wc, + struct ibv_grh *grh, uint8_t port_num) +{ + struct ibv_ah_attr ah_attr; + int ret; + + ret = ibv_init_ah_from_wc(pd->context, port_num, wc, grh, &ah_attr); + if (ret) + return NULL; + + return ibv_create_ah(pd, &ah_attr); +} + +int ibv_destroy_ah(struct ibv_ah *ah) +{ + return ah->context->ops.destroy_ah(ah); +} + +int ibv_attach_mcast(struct ibv_qp *qp, const union ibv_gid *gid, uint16_t lid) +{ + return qp->context->ops.attach_mcast(qp, gid, lid); +} + +int ibv_detach_mcast(struct ibv_qp *qp, const union ibv_gid *gid, uint16_t lid) +{ + return qp->context->ops.detach_mcast(qp, gid, lid); +} diff --git a/prov/mlx4/AUTHORS b/prov/mlx4/AUTHORS new file mode 100644 index 00000000000..ffe1800452f --- /dev/null +++ b/prov/mlx4/AUTHORS @@ -0,0 +1 @@ +Roland Dreier <rolandd@cisco.com> diff --git a/prov/mlx4/COPYING b/prov/mlx4/COPYING new file mode 100644 index 00000000000..add3d1990bc --- /dev/null +++ b/prov/mlx4/COPYING @@ -0,0 +1,378 @@ +This software is available to you under a choice of one of two +licenses. You may choose to be licensed under the terms of the the +OpenIB.org BSD license or the GNU General Public License (GPL) Version +2, both included below. + +Copyright (c) 2007 Cisco, Inc. All rights reserved. + +================================================================== + + OpenIB.org BSD license + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE +COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + +================================================================== + + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc. + 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Library General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. 
This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. 
+ +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. 
Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. 
If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) year name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 
+ This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + <signature of Ty Coon>, 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Library General +Public License instead of this License. diff --git a/prov/mlx4/src/buf.c b/prov/mlx4/src/buf.c new file mode 100644 index 00000000000..3e8ec9a17e5 --- /dev/null +++ b/prov/mlx4/src/buf.c @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2006, 2007 Cisco, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#if HAVE_CONFIG_H +# include <config.h> +#endif /* HAVE_CONFIG_H */ + +#include <stdlib.h> +#include <errno.h> +#include <sys/mman.h> + +#include "mlx4.h" + + +int mlx4_alloc_buf(struct mlx4_buf *buf, size_t size, int page_size) +{ + int ret; + + buf->length = align(size, page_size); + buf->buf = mmap(NULL, buf->length, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (buf->buf == MAP_FAILED) + return errno; + + ret = ibv_dontfork_range(buf->buf, size); + if (ret) + munmap(buf->buf, buf->length); + + return ret; +} + +void mlx4_free_buf(struct mlx4_buf *buf) +{ + ibv_dofork_range(buf->buf, buf->length); + munmap(buf->buf, buf->length); +} diff --git a/prov/mlx4/src/cq.c b/prov/mlx4/src/cq.c new file mode 100644 index 00000000000..18447c48fbc --- /dev/null +++ b/prov/mlx4/src/cq.c @@ -0,0 +1,480 @@ +/* + * Copyright (c) 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2006, 2007 Cisco Systems. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#if HAVE_CONFIG_H +# include <config.h> +#endif /* HAVE_CONFIG_H */ + +#include <stdio.h> +#include <stdlib.h> +#include <pthread.h> +#include <netinet/in.h> +#include <string.h> + +#include <infiniband/opcode.h> + +#include "mlx4.h" +#include "doorbell.h" + +enum { + MLX4_CQ_DOORBELL = 0x20 +}; + +enum { + CQ_OK = 0, + CQ_EMPTY = -1, + CQ_POLL_ERR = -2 +}; + +#define MLX4_CQ_DB_REQ_NOT_SOL (1 << 24) +#define MLX4_CQ_DB_REQ_NOT (2 << 24) + +enum { + MLX4_CQE_VLAN_PRESENT_MASK = 1 << 29, + MLX4_CQE_QPN_MASK = 0xffffff, +}; + +enum { + MLX4_CQE_OWNER_MASK = 0x80, + MLX4_CQE_IS_SEND_MASK = 0x40, + MLX4_CQE_OPCODE_MASK = 0x1f +}; + +enum { + MLX4_CQE_SYNDROME_LOCAL_LENGTH_ERR = 0x01, + MLX4_CQE_SYNDROME_LOCAL_QP_OP_ERR = 0x02, + MLX4_CQE_SYNDROME_LOCAL_PROT_ERR = 0x04, + MLX4_CQE_SYNDROME_WR_FLUSH_ERR = 0x05, + MLX4_CQE_SYNDROME_MW_BIND_ERR = 0x06, + MLX4_CQE_SYNDROME_BAD_RESP_ERR = 0x10, + MLX4_CQE_SYNDROME_LOCAL_ACCESS_ERR = 0x11, + MLX4_CQE_SYNDROME_REMOTE_INVAL_REQ_ERR = 0x12, + MLX4_CQE_SYNDROME_REMOTE_ACCESS_ERR = 0x13, + MLX4_CQE_SYNDROME_REMOTE_OP_ERR = 0x14, + MLX4_CQE_SYNDROME_TRANSPORT_RETRY_EXC_ERR = 0x15, + MLX4_CQE_SYNDROME_RNR_RETRY_EXC_ERR = 0x16, + MLX4_CQE_SYNDROME_REMOTE_ABORTED_ERR = 0x22, +}; + +struct mlx4_err_cqe { + uint32_t vlan_my_qpn; + uint32_t reserved1[5]; + uint16_t wqe_index; + uint8_t vendor_err; + uint8_t syndrome; + uint8_t reserved2[3]; + uint8_t owner_sr_opcode; +}; + +static struct mlx4_cqe *get_cqe(struct mlx4_cq *cq, int entry) +{ + return cq->buf.buf + entry * cq->cqe_size; +} + +static void *get_sw_cqe(struct mlx4_cq *cq, int n) +{ + struct mlx4_cqe *cqe = get_cqe(cq, n & cq->ibv_cq.cqe); + struct mlx4_cqe *tcqe = cq->cqe_size == 64 ? cqe + 1 : cqe; + + return (!!(tcqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK) ^ + !!(n & (cq->ibv_cq.cqe + 1))) ? 
NULL : cqe; +} + +static struct mlx4_cqe *next_cqe_sw(struct mlx4_cq *cq) +{ + return get_sw_cqe(cq, cq->cons_index); +} + +static void update_cons_index(struct mlx4_cq *cq) +{ + *cq->set_ci_db = htonl(cq->cons_index & 0xffffff); +} + +static void mlx4_handle_error_cqe(struct mlx4_err_cqe *cqe, struct ibv_wc *wc) +{ + if (cqe->syndrome == MLX4_CQE_SYNDROME_LOCAL_QP_OP_ERR) + printf(PFX "local QP operation err " + "(QPN %06x, WQE index %x, vendor syndrome %02x, " + "opcode = %02x)\n", + htonl(cqe->vlan_my_qpn), htonl(cqe->wqe_index), + cqe->vendor_err, + cqe->owner_sr_opcode & ~MLX4_CQE_OWNER_MASK); + + switch (cqe->syndrome) { + case MLX4_CQE_SYNDROME_LOCAL_LENGTH_ERR: + wc->status = IBV_WC_LOC_LEN_ERR; + break; + case MLX4_CQE_SYNDROME_LOCAL_QP_OP_ERR: + wc->status = IBV_WC_LOC_QP_OP_ERR; + break; + case MLX4_CQE_SYNDROME_LOCAL_PROT_ERR: + wc->status = IBV_WC_LOC_PROT_ERR; + break; + case MLX4_CQE_SYNDROME_WR_FLUSH_ERR: + wc->status = IBV_WC_WR_FLUSH_ERR; + break; + case MLX4_CQE_SYNDROME_MW_BIND_ERR: + wc->status = IBV_WC_MW_BIND_ERR; + break; + case MLX4_CQE_SYNDROME_BAD_RESP_ERR: + wc->status = IBV_WC_BAD_RESP_ERR; + break; + case MLX4_CQE_SYNDROME_LOCAL_ACCESS_ERR: + wc->status = IBV_WC_LOC_ACCESS_ERR; + break; + case MLX4_CQE_SYNDROME_REMOTE_INVAL_REQ_ERR: + wc->status = IBV_WC_REM_INV_REQ_ERR; + break; + case MLX4_CQE_SYNDROME_REMOTE_ACCESS_ERR: + wc->status = IBV_WC_REM_ACCESS_ERR; + break; + case MLX4_CQE_SYNDROME_REMOTE_OP_ERR: + wc->status = IBV_WC_REM_OP_ERR; + break; + case MLX4_CQE_SYNDROME_TRANSPORT_RETRY_EXC_ERR: + wc->status = IBV_WC_RETRY_EXC_ERR; + break; + case MLX4_CQE_SYNDROME_RNR_RETRY_EXC_ERR: + wc->status = IBV_WC_RNR_RETRY_EXC_ERR; + break; + case MLX4_CQE_SYNDROME_REMOTE_ABORTED_ERR: + wc->status = IBV_WC_REM_ABORT_ERR; + break; + default: + wc->status = IBV_WC_GENERAL_ERR; + break; + } + + wc->vendor_err = cqe->vendor_err; +} + +static int mlx4_poll_one(struct mlx4_cq *cq, + struct mlx4_qp **cur_qp, + struct ibv_wc *wc) +{ + struct mlx4_wq *wq; + struct mlx4_cqe *cqe; + struct mlx4_srq *srq; + uint32_t qpn; + uint32_t g_mlpath_rqpn; + uint16_t wqe_index; + int is_error; + int is_send; + + cqe = next_cqe_sw(cq); + if (!cqe) + return CQ_EMPTY; + + if (cq->cqe_size == 64) + ++cqe; + + ++cq->cons_index; + + VALGRIND_MAKE_MEM_DEFINED(cqe, sizeof *cqe); + + /* + * Make sure we read CQ entry contents after we've checked the + * ownership bit. + */ + rmb(); + + qpn = ntohl(cqe->vlan_my_qpn) & MLX4_CQE_QPN_MASK; + + is_send = cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK; + is_error = (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) == + MLX4_CQE_OPCODE_ERROR; + + if (!*cur_qp || + (qpn != (*cur_qp)->ibv_qp.qp_num)) { + /* + * We do not have to take the QP table lock here, + * because CQs will be locked while QPs are removed + * from the table. 
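+ * (The caller, mlx4_poll_cq(), already holds cq->lock at this point.)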
+ */ + *cur_qp = mlx4_find_qp(to_mctx(cq->ibv_cq.context), qpn); + if (!*cur_qp) + return CQ_POLL_ERR; + } + + wc->qp_num = (*cur_qp)->ibv_qp.qp_num; + + if (is_send) { + wq = &(*cur_qp)->sq; + wqe_index = ntohs(cqe->wqe_index); + wq->tail += (uint16_t) (wqe_index - (uint16_t) wq->tail); + wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)]; + ++wq->tail; + } else if ((*cur_qp)->ibv_qp.srq) { + srq = to_msrq((*cur_qp)->ibv_qp.srq); + wqe_index = htons(cqe->wqe_index); + wc->wr_id = srq->wrid[wqe_index]; + mlx4_free_srq_wqe(srq, wqe_index); + } else { + wq = &(*cur_qp)->rq; + wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)]; + ++wq->tail; + } + + if (is_error) { + mlx4_handle_error_cqe((struct mlx4_err_cqe *) cqe, wc); + return CQ_OK; + } + + wc->status = IBV_WC_SUCCESS; + + if (is_send) { + wc->wc_flags = 0; + switch (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) { + case MLX4_OPCODE_RDMA_WRITE_IMM: + wc->wc_flags |= IBV_WC_WITH_IMM; + case MLX4_OPCODE_RDMA_WRITE: + wc->opcode = IBV_WC_RDMA_WRITE; + break; + case MLX4_OPCODE_SEND_IMM: + wc->wc_flags |= IBV_WC_WITH_IMM; + case MLX4_OPCODE_SEND: + wc->opcode = IBV_WC_SEND; + break; + case MLX4_OPCODE_RDMA_READ: + wc->opcode = IBV_WC_RDMA_READ; + wc->byte_len = ntohl(cqe->byte_cnt); + break; + case MLX4_OPCODE_ATOMIC_CS: + wc->opcode = IBV_WC_COMP_SWAP; + wc->byte_len = 8; + break; + case MLX4_OPCODE_ATOMIC_FA: + wc->opcode = IBV_WC_FETCH_ADD; + wc->byte_len = 8; + break; + case MLX4_OPCODE_BIND_MW: + wc->opcode = IBV_WC_BIND_MW; + break; + default: + /* assume it's a send completion */ + wc->opcode = IBV_WC_SEND; + break; + } + } else { + wc->byte_len = ntohl(cqe->byte_cnt); + + switch (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) { + case MLX4_RECV_OPCODE_RDMA_WRITE_IMM: + wc->opcode = IBV_WC_RECV_RDMA_WITH_IMM; + wc->wc_flags = IBV_WC_WITH_IMM; + wc->imm_data = cqe->immed_rss_invalid; + break; + case MLX4_RECV_OPCODE_SEND: + wc->opcode = IBV_WC_RECV; + wc->wc_flags = 0; + break; + case MLX4_RECV_OPCODE_SEND_IMM: + wc->opcode = IBV_WC_RECV; + wc->wc_flags = IBV_WC_WITH_IMM; + wc->imm_data = cqe->immed_rss_invalid; + break; + } + + wc->slid = ntohs(cqe->rlid); + g_mlpath_rqpn = ntohl(cqe->g_mlpath_rqpn); + wc->src_qp = g_mlpath_rqpn & 0xffffff; + wc->dlid_path_bits = (g_mlpath_rqpn >> 24) & 0x7f; + wc->wc_flags |= g_mlpath_rqpn & 0x80000000 ? IBV_WC_GRH : 0; + wc->pkey_index = ntohl(cqe->immed_rss_invalid) & 0x7f; + if ((*cur_qp)->link_layer == IBV_LINK_LAYER_ETHERNET) + wc->sl = ntohs(cqe->sl_vid) >> 13; + else + wc->sl = ntohs(cqe->sl_vid) >> 12; + } + + return CQ_OK; +} + +int mlx4_poll_cq(struct ibv_cq *ibcq, int ne, struct ibv_wc *wc) +{ + struct mlx4_cq *cq = to_mcq(ibcq); + struct mlx4_qp *qp = NULL; + int npolled; + int err = CQ_OK; + + pthread_spin_lock(&cq->lock); + + for (npolled = 0; npolled < ne; ++npolled) { + err = mlx4_poll_one(cq, &qp, wc + npolled); + if (err != CQ_OK) + break; + } + + if (npolled || err == CQ_POLL_ERR) + update_cons_index(cq); + + pthread_spin_unlock(&cq->lock); + + return err == CQ_POLL_ERR ? err : npolled; +} + +int mlx4_arm_cq(struct ibv_cq *ibvcq, int solicited) +{ + struct mlx4_cq *cq = to_mcq(ibvcq); + uint32_t doorbell[2]; + uint32_t sn; + uint32_t ci; + uint32_t cmd; + + sn = cq->arm_sn & 3; + ci = cq->cons_index & 0xffffff; + cmd = solicited ? MLX4_CQ_DB_REQ_NOT_SOL : MLX4_CQ_DB_REQ_NOT; + + *cq->arm_db = htonl(sn << 28 | cmd | ci); + + /* + * Make sure that the doorbell record in host memory is + * written before ringing the doorbell via PCI MMIO. 
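+ * The wmb() below is the write barrier that enforces this ordering + * before mlx4_write64() performs the MMIO write.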
+ */ + wmb(); + + doorbell[0] = htonl(sn << 28 | cmd | cq->cqn); + doorbell[1] = htonl(ci); + + mlx4_write64(doorbell, to_mctx(ibvcq->context), MLX4_CQ_DOORBELL); + + return 0; +} + +void mlx4_cq_event(struct ibv_cq *cq) +{ + to_mcq(cq)->arm_sn++; +} + +void __mlx4_cq_clean(struct mlx4_cq *cq, uint32_t qpn, struct mlx4_srq *srq) +{ + struct mlx4_cqe *cqe, *dest; + uint32_t prod_index; + uint8_t owner_bit; + int nfreed = 0; + int cqe_inc = cq->cqe_size == 64 ? 1 : 0; + + /* + * First we need to find the current producer index, so we + * know where to start cleaning from. It doesn't matter if HW + * adds new entries after this loop -- the QP we're worried + * about is already in RESET, so the new entries won't come + * from our QP and therefore don't need to be checked. + */ + for (prod_index = cq->cons_index; get_sw_cqe(cq, prod_index); ++prod_index) + if (prod_index == cq->cons_index + cq->ibv_cq.cqe) + break; + + /* + * Now sweep backwards through the CQ, removing CQ entries + * that match our QP by copying older entries on top of them. + */ + while ((int) --prod_index - (int) cq->cons_index >= 0) { + cqe = get_cqe(cq, prod_index & cq->ibv_cq.cqe); + cqe += cqe_inc; + if ((ntohl(cqe->vlan_my_qpn) & MLX4_CQE_QPN_MASK) == qpn) { + if (srq && !(cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK)) + mlx4_free_srq_wqe(srq, ntohs(cqe->wqe_index)); + ++nfreed; + } else if (nfreed) { + dest = get_cqe(cq, (prod_index + nfreed) & cq->ibv_cq.cqe); + dest += cqe_inc; + owner_bit = dest->owner_sr_opcode & MLX4_CQE_OWNER_MASK; + memcpy(dest, cqe, sizeof *cqe); + dest->owner_sr_opcode = owner_bit | + (dest->owner_sr_opcode & ~MLX4_CQE_OWNER_MASK); + } + } + + if (nfreed) { + cq->cons_index += nfreed; + /* + * Make sure update of buffer contents is done before + * updating consumer index. + */ + wmb(); + update_cons_index(cq); + } +} + +void mlx4_cq_clean(struct mlx4_cq *cq, uint32_t qpn, struct mlx4_srq *srq) +{ + pthread_spin_lock(&cq->lock); + __mlx4_cq_clean(cq, qpn, srq); + pthread_spin_unlock(&cq->lock); +} + +int mlx4_get_outstanding_cqes(struct mlx4_cq *cq) +{ + uint32_t i; + + for (i = cq->cons_index; get_sw_cqe(cq, (i & cq->ibv_cq.cqe)); ++i) + ; + + return i - cq->cons_index; +} + +void mlx4_cq_resize_copy_cqes(struct mlx4_cq *cq, void *buf, int old_cqe) +{ + struct mlx4_cqe *cqe; + int i; + int cqe_inc = cq->cqe_size == 64 ? 1 : 0; + + i = cq->cons_index; + cqe = get_cqe(cq, (i & old_cqe)); + cqe += cqe_inc; + + while ((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) != MLX4_CQE_OPCODE_RESIZE) { + cqe->owner_sr_opcode = (cqe->owner_sr_opcode & ~MLX4_CQE_OWNER_MASK) | + (((i + 1) & (cq->ibv_cq.cqe + 1)) ? MLX4_CQE_OWNER_MASK : 0); + memcpy(buf + ((i + 1) & cq->ibv_cq.cqe) * cq->cqe_size, + cqe - cqe_inc, cq->cqe_size); + ++i; + cqe = get_cqe(cq, (i & old_cqe)); + cqe += cqe_inc; + } + + ++cq->cons_index; +} + +int mlx4_alloc_cq_buf(struct mlx4_device *dev, struct mlx4_buf *buf, int nent, + int entry_size) +{ + if (mlx4_alloc_buf(buf, align(nent * entry_size, dev->page_size), + dev->page_size)) + return -1; + memset(buf->buf, 0, nent * entry_size); + + return 0; +} diff --git a/prov/mlx4/src/dbrec.c b/prov/mlx4/src/dbrec.c new file mode 100644 index 00000000000..02ef237b392 --- /dev/null +++ b/prov/mlx4/src/dbrec.c @@ -0,0 +1,154 @@ +/* + * Copyright (c) 2005 Topspin Communications. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#if HAVE_CONFIG_H +# include <config.h> +#endif /* HAVE_CONFIG_H */ + +#include <stdlib.h> +#include <netinet/in.h> +#include <pthread.h> +#include <string.h> + +#include "mlx4.h" + +struct mlx4_db_page { + struct mlx4_db_page *prev, *next; + struct mlx4_buf buf; + int num_db; + int use_cnt; + unsigned long free[0]; +}; + +static const int db_size[] = { + [MLX4_DB_TYPE_CQ] = 8, + [MLX4_DB_TYPE_RQ] = 4, +}; + +static struct mlx4_db_page *__add_page(struct mlx4_context *context, + enum mlx4_db_type type) +{ + struct mlx4_db_page *page; + int ps = to_mdev(context->ibv_ctx.device)->page_size; + int pp; + int i; + + pp = ps / db_size[type]; + + page = malloc(sizeof *page + pp / 8); + if (!page) + return NULL; + + if (mlx4_alloc_buf(&page->buf, ps, ps)) { + free(page); + return NULL; + } + + page->num_db = pp; + page->use_cnt = 0; + for (i = 0; i < pp / (sizeof (long) * 8); ++i) + page->free[i] = ~0; + + page->prev = NULL; + page->next = context->db_list[type]; + context->db_list[type] = page; + if (page->next) + page->next->prev = page; + + return page; +} + +uint32_t *mlx4_alloc_db(struct mlx4_context *context, enum mlx4_db_type type) +{ + struct mlx4_db_page *page; + uint32_t *db = NULL; + int i, j; + + pthread_mutex_lock(&context->db_list_mutex); + + for (page = context->db_list[type]; page; page = page->next) + if (page->use_cnt < page->num_db) + goto found; + + page = __add_page(context, type); + if (!page) + goto out; + +found: + ++page->use_cnt; + + for (i = 0; !page->free[i]; ++i) + /* nothing */; + + j = ffsl(page->free[i]); + page->free[i] &= ~(1UL << (j - 1)); + db = page->buf.buf + (i * 8 * sizeof (long) + (j - 1)) * db_size[type]; + +out: + pthread_mutex_unlock(&context->db_list_mutex); + + return db; +} + +void mlx4_free_db(struct mlx4_context *context, enum mlx4_db_type type, uint32_t *db) +{ + struct mlx4_db_page *page; + uintptr_t ps = to_mdev(context->ibv_ctx.device)->page_size; + int i; + + pthread_mutex_lock(&context->db_list_mutex); + + for (page = context->db_list[type]; page; page = page->next) + if (((uintptr_t) db & ~(ps - 1)) == (uintptr_t) page->buf.buf) + break; + + if (!page) + goto out; + + i = ((void *) db - page->buf.buf) / db_size[type]; + page->free[i / (8 * sizeof (long))] |= 1UL << (i % (8 * sizeof (long))); + + if 
(!--page->use_cnt) { + if (page->prev) + page->prev->next = page->next; + else + context->db_list[type] = page->next; + if (page->next) + page->next->prev = page->prev; + + mlx4_free_buf(&page->buf); + free(page); + } + +out: + pthread_mutex_unlock(&context->db_list_mutex); +} diff --git a/prov/mlx4/src/doorbell.h b/prov/mlx4/src/doorbell.h new file mode 100644 index 00000000000..3171e76976a --- /dev/null +++ b/prov/mlx4/src/doorbell.h @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2007 Cisco, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef DOORBELL_H +#define DOORBELL_H + +#if SIZEOF_LONG == 8 + +#if __BYTE_ORDER == __LITTLE_ENDIAN +# define MLX4_PAIR_TO_64(val) ((uint64_t) val[1] << 32 | val[0]) +#elif __BYTE_ORDER == __BIG_ENDIAN +# define MLX4_PAIR_TO_64(val) ((uint64_t) val[0] << 32 | val[1]) +#else +# error __BYTE_ORDER not defined +#endif + +static inline void mlx4_write64(uint32_t val[2], struct mlx4_context *ctx, int offset) +{ + *(volatile uint64_t *) (ctx->uar + offset) = MLX4_PAIR_TO_64(val); +} + +#else + +static inline void mlx4_write64(uint32_t val[2], struct mlx4_context *ctx, int offset) +{ + pthread_spin_lock(&ctx->uar_lock); + *(volatile uint32_t *) (ctx->uar + offset) = val[0]; + *(volatile uint32_t *) (ctx->uar + offset + 4) = val[1]; + pthread_spin_unlock(&ctx->uar_lock); +} + +#endif + +#endif /* DOORBELL_H */ diff --git a/prov/mlx4/src/mlx4-abi.h b/prov/mlx4/src/mlx4-abi.h new file mode 100644 index 00000000000..3bb3e6f2e65 --- /dev/null +++ b/prov/mlx4/src/mlx4-abi.h @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2007 Cisco, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MLX4_ABI_H +#define MLX4_ABI_H + +#include <rdma/fi_uverbs.h> + +#define MLX4_UVERBS_MIN_ABI_VERSION 2 +#define MLX4_UVERBS_MAX_ABI_VERSION 4 + +#define MLX4_UVERBS_NO_DEV_CAPS_ABI_VERSION 3 + +enum { + MLX4_USER_DEV_CAP_64B_CQE = 1L << 0 +}; + +struct mlx4_alloc_ucontext_resp_v3 { + struct ibv_get_context_resp ibv_resp; + __u32 qp_tab_size; + __u16 bf_reg_size; + __u16 bf_regs_per_page; +}; + +struct mlx4_alloc_ucontext_resp { + struct ibv_get_context_resp ibv_resp; + __u32 dev_caps; + __u32 qp_tab_size; + __u16 bf_reg_size; + __u16 bf_regs_per_page; + __u32 cqe_size; +}; + +struct mlx4_alloc_pd_resp { + struct ibv_alloc_pd_resp ibv_resp; + __u32 pdn; + __u32 reserved; +}; + +struct mlx4_create_cq { + struct ibv_create_cq ibv_cmd; + __u64 buf_addr; + __u64 db_addr; +}; + +struct mlx4_create_cq_resp { + struct ibv_create_cq_resp ibv_resp; + __u32 cqn; + __u32 reserved; +}; + +struct mlx4_resize_cq { + struct ibv_resize_cq ibv_cmd; + __u64 buf_addr; +}; + +struct mlx4_create_srq { + struct ibv_create_srq ibv_cmd; + __u64 buf_addr; + __u64 db_addr; +}; + +struct mlx4_create_srq_resp { + struct ibv_create_srq_resp ibv_resp; + __u32 srqn; + __u32 reserved; +}; + +struct mlx4_create_qp { + struct ibv_create_qp ibv_cmd; + __u64 buf_addr; + __u64 db_addr; + __u8 log_sq_bb_count; + __u8 log_sq_stride; + __u8 sq_no_prefetch; /* was reserved in ABI 2 */ + __u8 reserved[5]; +}; + +#endif /* MLX4_ABI_H */ diff --git a/prov/mlx4/src/mlx4.c b/prov/mlx4/src/mlx4.c new file mode 100644 index 00000000000..5e68070f1f5 --- /dev/null +++ b/prov/mlx4/src/mlx4.c @@ -0,0 +1,276 @@ +/* + * Copyright (c) 2007 Cisco, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#if HAVE_CONFIG_H +# include <config.h> +#endif /* HAVE_CONFIG_H */ + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <errno.h> +#include <sys/mman.h> +#include <pthread.h> +#include <string.h> + +#include "mlx4.h" +#include "mlx4-abi.h" + +#ifndef PCI_VENDOR_ID_MELLANOX +#define PCI_VENDOR_ID_MELLANOX 0x15b3 +#endif + +#define HCA(v, d) \ + { .vendor = PCI_VENDOR_ID_##v, \ + .device = d } + +struct { + unsigned vendor; + unsigned device; +} hca_table[] = { + HCA(MELLANOX, 0x6340), /* MT25408 "Hermon" SDR */ + HCA(MELLANOX, 0x634a), /* MT25408 "Hermon" DDR */ + HCA(MELLANOX, 0x6354), /* MT25408 "Hermon" QDR */ + HCA(MELLANOX, 0x6732), /* MT25408 "Hermon" DDR PCIe gen2 */ + HCA(MELLANOX, 0x673c), /* MT25408 "Hermon" QDR PCIe gen2 */ + HCA(MELLANOX, 0x6368), /* MT25408 "Hermon" EN 10GigE */ + HCA(MELLANOX, 0x6750), /* MT25408 "Hermon" EN 10GigE PCIe gen2 */ + HCA(MELLANOX, 0x6372), /* MT25458 ConnectX EN 10GBASE-T 10GigE */ + HCA(MELLANOX, 0x675a), /* MT25458 ConnectX EN 10GBASE-T+Gen2 10GigE */ + HCA(MELLANOX, 0x6764), /* MT26468 ConnectX EN 10GigE PCIe gen2*/ + HCA(MELLANOX, 0x6746), /* MT26438 ConnectX EN 40GigE PCIe gen2 5GT/s */ + HCA(MELLANOX, 0x676e), /* MT26478 ConnectX2 40GigE PCIe gen2 */ + HCA(MELLANOX, 0x1002), /* MT25400 Family [ConnectX-2 Virtual Function] */ + HCA(MELLANOX, 0x1003), /* MT27500 Family [ConnectX-3] */ + HCA(MELLANOX, 0x1004), /* MT27500 Family [ConnectX-3 Virtual Function] */ + HCA(MELLANOX, 0x1005), /* MT27510 Family */ + HCA(MELLANOX, 0x1006), /* MT27511 Family */ + HCA(MELLANOX, 0x1007), /* MT27520 Family */ + HCA(MELLANOX, 0x1008), /* MT27521 Family */ + HCA(MELLANOX, 0x1009), /* MT27530 Family */ + HCA(MELLANOX, 0x100a), /* MT27531 Family */ + HCA(MELLANOX, 0x100b), /* MT27540 Family */ + HCA(MELLANOX, 0x100c), /* MT27541 Family */ + HCA(MELLANOX, 0x100d), /* MT27550 Family */ + HCA(MELLANOX, 0x100e), /* MT27551 Family */ + HCA(MELLANOX, 0x100f), /* MT27560 Family */ + HCA(MELLANOX, 0x1010), /* MT27561 Family */ +}; + +static struct ibv_context_ops mlx4_ctx_ops = { + .query_device = mlx4_query_device, + .query_port = mlx4_query_port, + .alloc_pd = mlx4_alloc_pd, + .dealloc_pd = mlx4_free_pd, + .reg_mr = mlx4_reg_mr, + .dereg_mr = mlx4_dereg_mr, + .create_cq = mlx4_create_cq, + .poll_cq = mlx4_poll_cq, + .req_notify_cq = mlx4_arm_cq, + .cq_event = mlx4_cq_event, + .resize_cq = mlx4_resize_cq, + .destroy_cq = mlx4_destroy_cq, + .create_srq = mlx4_create_srq, + .modify_srq = mlx4_modify_srq, + .query_srq = mlx4_query_srq, + .destroy_srq = mlx4_destroy_srq, + .post_srq_recv = mlx4_post_srq_recv, + .create_qp = mlx4_create_qp, + .query_qp = mlx4_query_qp, + .modify_qp = mlx4_modify_qp, + .destroy_qp = mlx4_destroy_qp, + .post_send = mlx4_post_send, + .post_recv = mlx4_post_recv, + .create_ah = mlx4_create_ah, + .destroy_ah = mlx4_destroy_ah, + .attach_mcast = ibv_cmd_attach_mcast, + .detach_mcast = ibv_cmd_detach_mcast +}; + +static struct ibv_context *mlx4_alloc_context(struct ibv_device *ibdev, fid_t fid) +{ + struct mlx4_context *context; + struct ibv_get_context cmd; + struct mlx4_alloc_ucontext_resp resp; + int i; + struct mlx4_alloc_ucontext_resp_v3 resp_v3; + __u16 bf_reg_size; + struct mlx4_device *dev = to_mdev(ibdev); + struct fid_uverbs *uv; + + 
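/* Pick the v3 or current ABI response layout based on the device ABI version, then map the device UAR page and, when bf_reg_size is non-zero, the BlueFlame page. */ +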
context = calloc(1, sizeof *context); + if (!context) + return NULL; + + context->ibv_ctx.uv_fid = fid; + uv = container_of(fid, struct fid_uverbs, fid); + + if (dev->abi_version <= MLX4_UVERBS_NO_DEV_CAPS_ABI_VERSION) { + if (ibv_cmd_get_context(&context->ibv_ctx, &cmd, sizeof cmd, + &resp_v3.ibv_resp, sizeof resp_v3)) + goto err_free; + + context->num_qps = resp_v3.qp_tab_size; + bf_reg_size = resp_v3.bf_reg_size; + context->cqe_size = sizeof (struct mlx4_cqe); + } else { + if (ibv_cmd_get_context(&context->ibv_ctx, &cmd, sizeof cmd, + &resp.ibv_resp, sizeof resp)) + goto err_free; + + context->num_qps = resp.qp_tab_size; + bf_reg_size = resp.bf_reg_size; + if (resp.dev_caps & MLX4_USER_DEV_CAP_64B_CQE) + context->cqe_size = resp.cqe_size; + else + context->cqe_size = sizeof (struct mlx4_cqe); + } + + context->qp_table_shift = ffs(context->num_qps) - 1 - MLX4_QP_TABLE_BITS; + context->qp_table_mask = (1 << context->qp_table_shift) - 1; + + pthread_mutex_init(&context->qp_table_mutex, NULL); + for (i = 0; i < MLX4_QP_TABLE_SIZE; ++i) + context->qp_table[i].refcnt = 0; + + for (i = 0; i < MLX4_NUM_DB_TYPE; ++i) + context->db_list[i] = NULL; + + pthread_mutex_init(&context->db_list_mutex, NULL); + + context->uar = mmap(NULL, to_mdev(ibdev)->page_size, PROT_WRITE, + MAP_SHARED, uv->fd, 0); + if (context->uar == MAP_FAILED) + goto err_free; + + if (bf_reg_size) { + context->bf_page = mmap(NULL, to_mdev(ibdev)->page_size, + PROT_WRITE, MAP_SHARED, uv->fd, + to_mdev(ibdev)->page_size); + if (context->bf_page == MAP_FAILED) { + fprintf(stderr, PFX "Warning: BlueFlame available, " + "but failed to mmap() BlueFlame page.\n"); + context->bf_page = NULL; + context->bf_buf_size = 0; + } else { + context->bf_buf_size = bf_reg_size / 2; + context->bf_offset = 0; + pthread_spin_init(&context->bf_lock, PTHREAD_PROCESS_PRIVATE); + } + } else { + context->bf_page = NULL; + context->bf_buf_size = 0; + } + + pthread_spin_init(&context->uar_lock, PTHREAD_PROCESS_PRIVATE); + + context->ibv_ctx.ops = mlx4_ctx_ops; + + return &context->ibv_ctx; + +err_free: + free(context); + return NULL; +} + +static void mlx4_free_context(struct ibv_context *ibctx) +{ + struct mlx4_context *context = to_mctx(ibctx); + + munmap(context->uar, to_mdev(ibctx->device)->page_size); + if (context->bf_page) + munmap(context->bf_page, to_mdev(ibctx->device)->page_size); + free(context); +} + +static struct ibv_device_ops mlx4_dev_ops = { + .alloc_context = mlx4_alloc_context, + .free_context = mlx4_free_context +}; + +static struct ibv_device *mlx4_driver_init(const char *uverbs_sys_path, int abi_version) +{ + char value[8]; + struct mlx4_device *dev; + unsigned vendor, device; + int i; + + if (ibv_read_sysfs_file(uverbs_sys_path, "device/vendor", + value, sizeof value) < 0) + return NULL; + vendor = strtol(value, NULL, 16); + + if (ibv_read_sysfs_file(uverbs_sys_path, "device/device", + value, sizeof value) < 0) + return NULL; + device = strtol(value, NULL, 16); + + for (i = 0; i < sizeof hca_table / sizeof hca_table[0]; ++i) + if (vendor == hca_table[i].vendor && + device == hca_table[i].device) + goto found; + + return NULL; + +found: + if (abi_version < MLX4_UVERBS_MIN_ABI_VERSION || + abi_version > MLX4_UVERBS_MAX_ABI_VERSION) { + fprintf(stderr, PFX "Fatal: ABI version %d of %s is not supported " + "(min supported %d, max supported %d)\n", + abi_version, uverbs_sys_path, + MLX4_UVERBS_MIN_ABI_VERSION, + MLX4_UVERBS_MAX_ABI_VERSION); + return NULL; + } + + dev = malloc(sizeof *dev); + if (!dev) { + fprintf(stderr, PFX "Fatal: 
couldn't allocate device for %s\n", + uverbs_sys_path); + return NULL; + } + + dev->ibv_dev.ops = mlx4_dev_ops; + dev->page_size = sysconf(_SC_PAGESIZE); + dev->abi_version = abi_version; + + return &dev->ibv_dev; +} + +void mlx4_ini(void) +{ + ibv_register_driver("mlx4", mlx4_driver_init); +} + +void mlx4_fini(void) +{ +} diff --git a/prov/mlx4/src/mlx4.h b/prov/mlx4/src/mlx4.h new file mode 100644 index 00000000000..61ba7a11e47 --- /dev/null +++ b/prov/mlx4/src/mlx4.h @@ -0,0 +1,350 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005, 2006, 2007 Cisco Systems. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef MLX4_H +#define MLX4_H + +#include <stddef.h> + +#include <fi.h> +#include <infiniband/driver.h> +#include <rdma/fabric.h> +#include <rdma/fi_arch.h> + + +#define HIDDEN __attribute__((visibility ("hidden"))) + + +#ifndef HAVE_IBV_QPT_RAW_PACKET +#define IBV_QPT_RAW_PACKET 8 +#endif + +enum { + MLX4_STAT_RATE_OFFSET = 5 +}; + +enum { + MLX4_QP_TABLE_BITS = 8, + MLX4_QP_TABLE_SIZE = 1 << MLX4_QP_TABLE_BITS, + MLX4_QP_TABLE_MASK = MLX4_QP_TABLE_SIZE - 1 +}; + +enum mlx4_db_type { + MLX4_DB_TYPE_CQ, + MLX4_DB_TYPE_RQ, + MLX4_NUM_DB_TYPE +}; + +enum { + MLX4_OPCODE_NOP = 0x00, + MLX4_OPCODE_SEND_INVAL = 0x01, + MLX4_OPCODE_RDMA_WRITE = 0x08, + MLX4_OPCODE_RDMA_WRITE_IMM = 0x09, + MLX4_OPCODE_SEND = 0x0a, + MLX4_OPCODE_SEND_IMM = 0x0b, + MLX4_OPCODE_LSO = 0x0e, + MLX4_OPCODE_RDMA_READ = 0x10, + MLX4_OPCODE_ATOMIC_CS = 0x11, + MLX4_OPCODE_ATOMIC_FA = 0x12, + MLX4_OPCODE_MASKED_ATOMIC_CS = 0x14, + MLX4_OPCODE_MASKED_ATOMIC_FA = 0x15, + MLX4_OPCODE_BIND_MW = 0x18, + MLX4_OPCODE_FMR = 0x19, + MLX4_OPCODE_LOCAL_INVAL = 0x1b, + MLX4_OPCODE_CONFIG_CMD = 0x1f, + + MLX4_RECV_OPCODE_RDMA_WRITE_IMM = 0x00, + MLX4_RECV_OPCODE_SEND = 0x01, + MLX4_RECV_OPCODE_SEND_IMM = 0x02, + MLX4_RECV_OPCODE_SEND_INVAL = 0x03, + + MLX4_CQE_OPCODE_ERROR = 0x1e, + MLX4_CQE_OPCODE_RESIZE = 0x16, +}; + +struct mlx4_device { + struct ibv_device ibv_dev; + int page_size; + int abi_version; +}; + +struct mlx4_db_page; + +struct mlx4_context { + struct ibv_context ibv_ctx; + + void *uar; + pthread_spinlock_t uar_lock; + + void *bf_page; + int bf_buf_size; + int bf_offset; + pthread_spinlock_t bf_lock; + + struct { + struct mlx4_qp **table; + int refcnt; + } qp_table[MLX4_QP_TABLE_SIZE]; + pthread_mutex_t qp_table_mutex; + int num_qps; + int qp_table_shift; + int qp_table_mask; + + struct mlx4_db_page *db_list[MLX4_NUM_DB_TYPE]; + pthread_mutex_t db_list_mutex; + int cqe_size; +}; + +struct mlx4_buf { + void *buf; + size_t length; +}; + +struct mlx4_pd { + struct ibv_pd ibv_pd; + uint32_t pdn; +}; + +struct mlx4_cq { + struct ibv_cq ibv_cq; + struct mlx4_buf buf; + struct mlx4_buf resize_buf; + pthread_spinlock_t lock; + uint32_t cqn; + uint32_t cons_index; + uint32_t *set_ci_db; + uint32_t *arm_db; + int arm_sn; + int cqe_size; +}; + +struct mlx4_srq { + struct ibv_srq ibv_srq; + struct mlx4_buf buf; + pthread_spinlock_t lock; + uint64_t *wrid; + uint32_t srqn; + int max; + int max_gs; + int wqe_shift; + int head; + int tail; + uint32_t *db; + uint16_t counter; +}; + +struct mlx4_wq { + uint64_t *wrid; + pthread_spinlock_t lock; + int wqe_cnt; + int max_post; + unsigned head; + unsigned tail; + int max_gs; + int wqe_shift; + int offset; +}; + +struct mlx4_qp { + struct ibv_qp ibv_qp; + struct mlx4_buf buf; + int max_inline_data; + int buf_size; + + uint32_t doorbell_qpn; + uint32_t sq_signal_bits; + int sq_spare_wqes; + struct mlx4_wq sq; + + uint32_t *db; + struct mlx4_wq rq; + + uint8_t link_layer; +}; + +struct mlx4_av { + uint32_t port_pd; + uint8_t reserved1; + uint8_t g_slid; + uint16_t dlid; + uint8_t reserved2; + uint8_t gid_index; + uint8_t stat_rate; + uint8_t hop_limit; + uint32_t sl_tclass_flowlabel; + uint8_t dgid[16]; +}; + +struct mlx4_ah { + struct ibv_ah ibv_ah; + struct mlx4_av av; + uint16_t vlan; + uint8_t mac[6]; +}; + +struct mlx4_cqe { + uint32_t vlan_my_qpn; + uint32_t immed_rss_invalid; + uint32_t g_mlpath_rqpn; + uint8_t sl_vid; + uint8_t reserved1; + uint16_t rlid; + uint32_t reserved2; + uint32_t byte_cnt; + uint16_t wqe_index; + uint16_t checksum; + uint8_t reserved3[3]; + uint8_t 
owner_sr_opcode; +}; + +static inline unsigned long align(unsigned long val, unsigned long align) +{ + return (val + align - 1) & ~(align - 1); +} + +#define to_mxxx(xxx, type) \ + ((struct mlx4_##type *) \ + ((void *) ib##xxx - offsetof(struct mlx4_##type, ibv_##xxx))) + +static inline struct mlx4_device *to_mdev(struct ibv_device *ibdev) +{ + return to_mxxx(dev, device); +} + +static inline struct mlx4_context *to_mctx(struct ibv_context *ibctx) +{ + return to_mxxx(ctx, context); +} + +static inline struct mlx4_pd *to_mpd(struct ibv_pd *ibpd) +{ + return to_mxxx(pd, pd); +} + +static inline struct mlx4_cq *to_mcq(struct ibv_cq *ibcq) +{ + return to_mxxx(cq, cq); +} + +static inline struct mlx4_srq *to_msrq(struct ibv_srq *ibsrq) +{ + return to_mxxx(srq, srq); +} + +static inline struct mlx4_qp *to_mqp(struct ibv_qp *ibqp) +{ + return to_mxxx(qp, qp); +} + +static inline struct mlx4_ah *to_mah(struct ibv_ah *ibah) +{ + return to_mxxx(ah, ah); +} + +int mlx4_alloc_buf(struct mlx4_buf *buf, size_t size, int page_size); +void mlx4_free_buf(struct mlx4_buf *buf); + +uint32_t *mlx4_alloc_db(struct mlx4_context *context, enum mlx4_db_type type); +void mlx4_free_db(struct mlx4_context *context, enum mlx4_db_type type, uint32_t *db); + +int mlx4_query_device(struct ibv_context *context, + struct ibv_device_attr *attr); +int mlx4_query_port(struct ibv_context *context, uint8_t port, + struct ibv_port_attr *attr); + +struct ibv_pd *mlx4_alloc_pd(struct ibv_context *context); +int mlx4_free_pd(struct ibv_pd *pd); + +struct ibv_mr *mlx4_reg_mr(struct ibv_pd *pd, void *addr, + size_t length, int access); +int mlx4_dereg_mr(struct ibv_mr *mr); + +struct ibv_cq *mlx4_create_cq(struct ibv_context *context, int cqe, + struct ibv_comp_channel *channel, + int comp_vector); +int mlx4_alloc_cq_buf(struct mlx4_device *dev, struct mlx4_buf *buf, int nent, + int entry_size); +int mlx4_resize_cq(struct ibv_cq *cq, int cqe); +int mlx4_destroy_cq(struct ibv_cq *cq); +int mlx4_poll_cq(struct ibv_cq *cq, int ne, struct ibv_wc *wc); +int mlx4_arm_cq(struct ibv_cq *cq, int solicited); +void mlx4_cq_event(struct ibv_cq *cq); +void __mlx4_cq_clean(struct mlx4_cq *cq, uint32_t qpn, struct mlx4_srq *srq); +void mlx4_cq_clean(struct mlx4_cq *cq, uint32_t qpn, struct mlx4_srq *srq); +int mlx4_get_outstanding_cqes(struct mlx4_cq *cq); +void mlx4_cq_resize_copy_cqes(struct mlx4_cq *cq, void *buf, int new_cqe); + +struct ibv_srq *mlx4_create_srq(struct ibv_pd *pd, + struct ibv_srq_init_attr *attr); +int mlx4_modify_srq(struct ibv_srq *srq, + struct ibv_srq_attr *attr, + int mask); +int mlx4_query_srq(struct ibv_srq *srq, + struct ibv_srq_attr *attr); +int mlx4_destroy_srq(struct ibv_srq *srq); +int mlx4_alloc_srq_buf(struct ibv_pd *pd, struct ibv_srq_attr *attr, + struct mlx4_srq *srq); +void mlx4_free_srq_wqe(struct mlx4_srq *srq, int ind); +int mlx4_post_srq_recv(struct ibv_srq *ibsrq, + struct ibv_recv_wr *wr, + struct ibv_recv_wr **bad_wr); + +struct ibv_qp *mlx4_create_qp(struct ibv_pd *pd, struct ibv_qp_init_attr *attr); +int mlx4_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, + int attr_mask, + struct ibv_qp_init_attr *init_attr); +int mlx4_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, + int attr_mask); +int mlx4_destroy_qp(struct ibv_qp *qp); +void mlx4_init_qp_indices(struct mlx4_qp *qp); +void mlx4_qp_init_sq_ownership(struct mlx4_qp *qp); +int mlx4_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr, + struct ibv_send_wr **bad_wr); +int mlx4_post_recv(struct ibv_qp *ibqp, struct ibv_recv_wr *wr, + 
struct ibv_recv_wr **bad_wr); +void mlx4_calc_sq_wqe_size(struct ibv_qp_cap *cap, enum ibv_qp_type type, + struct mlx4_qp *qp); +int mlx4_alloc_qp_buf(struct ibv_pd *pd, struct ibv_qp_cap *cap, + enum ibv_qp_type type, struct mlx4_qp *qp); +void mlx4_set_sq_sizes(struct mlx4_qp *qp, struct ibv_qp_cap *cap, + enum ibv_qp_type type); +struct mlx4_qp *mlx4_find_qp(struct mlx4_context *ctx, uint32_t qpn); +int mlx4_store_qp(struct mlx4_context *ctx, uint32_t qpn, struct mlx4_qp *qp); +void mlx4_clear_qp(struct mlx4_context *ctx, uint32_t qpn); +struct ibv_ah *mlx4_create_ah(struct ibv_pd *pd, struct ibv_ah_attr *attr); +int mlx4_destroy_ah(struct ibv_ah *ah); +int mlx4_alloc_av(struct mlx4_pd *pd, struct ibv_ah_attr *attr, + struct mlx4_ah *ah); +void mlx4_free_av(struct mlx4_ah *ah); + +#endif /* MLX4_H */ diff --git a/prov/mlx4/src/mlx4_verbs.c b/prov/mlx4/src/mlx4_verbs.c new file mode 100644 index 00000000000..7c5ee531498 --- /dev/null +++ b/prov/mlx4/src/mlx4_verbs.c @@ -0,0 +1,741 @@ +/* + * Copyright (c) 2007 Cisco, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#if HAVE_CONFIG_H +# include <config.h> +#endif /* HAVE_CONFIG_H */ + +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <pthread.h> +#include <errno.h> +#include <netinet/in.h> + +#include "mlx4.h" +#include "mlx4-abi.h" +#include "wqe.h" + +int mlx4_query_device(struct ibv_context *context, struct ibv_device_attr *attr) +{ + struct ibv_query_device cmd; + uint64_t raw_fw_ver; + unsigned major, minor, sub_minor; + int ret; + + ret = ibv_cmd_query_device(context, attr, &raw_fw_ver, &cmd, sizeof cmd); + if (ret) + return ret; + + major = (raw_fw_ver >> 32) & 0xffff; + minor = (raw_fw_ver >> 16) & 0xffff; + sub_minor = raw_fw_ver & 0xffff; + + snprintf(attr->fw_ver, sizeof attr->fw_ver, + "%d.%d.%03d", major, minor, sub_minor); + + return 0; +} + +int mlx4_query_port(struct ibv_context *context, uint8_t port, + struct ibv_port_attr *attr) +{ + struct ibv_query_port cmd; + + return ibv_cmd_query_port(context, port, attr, &cmd, sizeof cmd); +} + +struct ibv_pd *mlx4_alloc_pd(struct ibv_context *context) +{ + struct ibv_alloc_pd cmd; + struct mlx4_alloc_pd_resp resp; + struct mlx4_pd *pd; + + pd = malloc(sizeof *pd); + if (!pd) + return NULL; + + if (ibv_cmd_alloc_pd(context, &pd->ibv_pd, &cmd, sizeof cmd, + &resp.ibv_resp, sizeof resp)) { + free(pd); + return NULL; + } + + pd->pdn = resp.pdn; + + return &pd->ibv_pd; +} + +int mlx4_free_pd(struct ibv_pd *pd) +{ + int ret; + + ret = ibv_cmd_dealloc_pd(pd); + if (ret) + return ret; + + free(to_mpd(pd)); + return 0; +} + +struct ibv_mr *mlx4_reg_mr(struct ibv_pd *pd, void *addr, size_t length, + int access) +{ + struct ibv_mr *mr; + struct ibv_reg_mr cmd; + int ret; + + mr = malloc(sizeof *mr); + if (!mr) + return NULL; + +#ifdef IBV_CMD_REG_MR_HAS_RESP_PARAMS + { + struct ibv_reg_mr_resp resp; + + ret = ibv_cmd_reg_mr(pd, addr, length, (uintptr_t) addr, + access, mr, &cmd, sizeof cmd, + &resp, sizeof resp); + } +#else + ret = ibv_cmd_reg_mr(pd, addr, length, (uintptr_t) addr, access, mr, + &cmd, sizeof cmd); +#endif + if (ret) { + free(mr); + return NULL; + } + + return mr; +} + +int mlx4_dereg_mr(struct ibv_mr *mr) +{ + int ret; + + ret = ibv_cmd_dereg_mr(mr); + if (ret) + return ret; + + free(mr); + return 0; +} + +static int align_queue_size(int req) +{ + int nent; + + for (nent = 1; nent < req; nent <<= 1) + ; /* nothing */ + + return nent; +} + +struct ibv_cq *mlx4_create_cq(struct ibv_context *context, int cqe, + struct ibv_comp_channel *channel, + int comp_vector) +{ + struct mlx4_create_cq cmd; + struct mlx4_create_cq_resp resp; + struct mlx4_cq *cq; + int ret; + struct mlx4_context *mctx = to_mctx(context); + + /* Sanity check CQ size before proceeding */ + if (cqe > 0x3fffff) + return NULL; + + cq = malloc(sizeof *cq); + if (!cq) + return NULL; + + cq->cons_index = 0; + + if (pthread_spin_init(&cq->lock, PTHREAD_PROCESS_PRIVATE)) + goto err; + + cqe = align_queue_size(cqe + 1); + + if (mlx4_alloc_cq_buf(to_mdev(context->device), &cq->buf, cqe, mctx->cqe_size)) + goto err; + + cq->cqe_size = mctx->cqe_size; + cq->set_ci_db = mlx4_alloc_db(to_mctx(context), MLX4_DB_TYPE_CQ); + if (!cq->set_ci_db) + goto err_buf; + + cq->arm_db = cq->set_ci_db + 1; + *cq->arm_db = 0; + cq->arm_sn = 1; + *cq->set_ci_db = 0; + + cmd.buf_addr = (uintptr_t) cq->buf.buf; + cmd.db_addr = (uintptr_t) cq->set_ci_db; + + ret = ibv_cmd_create_cq(context, cqe - 1, channel, comp_vector, + &cq->ibv_cq, &cmd.ibv_cmd, sizeof cmd, + &resp.ibv_resp, sizeof resp); + if (ret) + goto err_db; + + cq->cqn = resp.cqn; + + return &cq->ibv_cq; + 
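+ /* Error unwind: release the doorbell record, the CQ buffer, and the CQ structure in reverse order of allocation. */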
+err_db: + mlx4_free_db(to_mctx(context), MLX4_DB_TYPE_CQ, cq->set_ci_db); + +err_buf: + mlx4_free_buf(&cq->buf); + +err: + free(cq); + + return NULL; +} + +int mlx4_resize_cq(struct ibv_cq *ibcq, int cqe) +{ + struct mlx4_cq *cq = to_mcq(ibcq); + struct mlx4_resize_cq cmd; + struct mlx4_buf buf; + int old_cqe, outst_cqe, ret; + + /* Sanity check CQ size before proceeding */ + if (cqe > 0x3fffff) + return EINVAL; + + pthread_spin_lock(&cq->lock); + + cqe = align_queue_size(cqe + 1); + if (cqe == ibcq->cqe + 1) { + ret = 0; + goto out; + } + + /* Can't be smaller then the number of outstanding CQEs */ + outst_cqe = mlx4_get_outstanding_cqes(cq); + if (cqe < outst_cqe + 1) { + ret = 0; + goto out; + } + + ret = mlx4_alloc_cq_buf(to_mdev(ibcq->context->device), &buf, cqe, cq->cqe_size); + if (ret) + goto out; + + old_cqe = ibcq->cqe; + cmd.buf_addr = (uintptr_t) buf.buf; + +#ifdef IBV_CMD_RESIZE_CQ_HAS_RESP_PARAMS + { + struct ibv_resize_cq_resp resp; + ret = ibv_cmd_resize_cq(ibcq, cqe - 1, &cmd.ibv_cmd, sizeof cmd, + &resp, sizeof resp); + } +#else + ret = ibv_cmd_resize_cq(ibcq, cqe - 1, &cmd.ibv_cmd, sizeof cmd); +#endif + if (ret) { + mlx4_free_buf(&buf); + goto out; + } + + mlx4_cq_resize_copy_cqes(cq, buf.buf, old_cqe); + + mlx4_free_buf(&cq->buf); + cq->buf = buf; + +out: + pthread_spin_unlock(&cq->lock); + return ret; +} + +int mlx4_destroy_cq(struct ibv_cq *cq) +{ + int ret; + + ret = ibv_cmd_destroy_cq(cq); + if (ret) + return ret; + + mlx4_free_db(to_mctx(cq->context), MLX4_DB_TYPE_CQ, to_mcq(cq)->set_ci_db); + mlx4_free_buf(&to_mcq(cq)->buf); + free(to_mcq(cq)); + + return 0; +} + +struct ibv_srq *mlx4_create_srq(struct ibv_pd *pd, + struct ibv_srq_init_attr *attr) +{ + struct mlx4_create_srq cmd; + struct mlx4_create_srq_resp resp; + struct mlx4_srq *srq; + int ret; + + /* Sanity check SRQ size before proceeding */ + if (attr->attr.max_wr > 1 << 16 || attr->attr.max_sge > 64) + return NULL; + + srq = malloc(sizeof *srq); + if (!srq) + return NULL; + + if (pthread_spin_init(&srq->lock, PTHREAD_PROCESS_PRIVATE)) + goto err; + + srq->max = align_queue_size(attr->attr.max_wr + 1); + srq->max_gs = attr->attr.max_sge; + srq->counter = 0; + + if (mlx4_alloc_srq_buf(pd, &attr->attr, srq)) + goto err; + + srq->db = mlx4_alloc_db(to_mctx(pd->context), MLX4_DB_TYPE_RQ); + if (!srq->db) + goto err_free; + + *srq->db = 0; + + cmd.buf_addr = (uintptr_t) srq->buf.buf; + cmd.db_addr = (uintptr_t) srq->db; + + ret = ibv_cmd_create_srq(pd, &srq->ibv_srq, attr, + &cmd.ibv_cmd, sizeof cmd, + &resp.ibv_resp, sizeof resp); + if (ret) + goto err_db; + + srq->srqn = resp.srqn; + + return &srq->ibv_srq; + +err_db: + mlx4_free_db(to_mctx(pd->context), MLX4_DB_TYPE_RQ, srq->db); + +err_free: + free(srq->wrid); + mlx4_free_buf(&srq->buf); + +err: + free(srq); + + return NULL; +} + +int mlx4_modify_srq(struct ibv_srq *srq, + struct ibv_srq_attr *attr, + int attr_mask) +{ + struct ibv_modify_srq cmd; + + return ibv_cmd_modify_srq(srq, attr, attr_mask, &cmd, sizeof cmd); +} + +int mlx4_query_srq(struct ibv_srq *srq, + struct ibv_srq_attr *attr) +{ + struct ibv_query_srq cmd; + + return ibv_cmd_query_srq(srq, attr, &cmd, sizeof cmd); +} + +int mlx4_destroy_srq(struct ibv_srq *srq) +{ + int ret; + + ret = ibv_cmd_destroy_srq(srq); + if (ret) + return ret; + + mlx4_free_db(to_mctx(srq->context), MLX4_DB_TYPE_RQ, to_msrq(srq)->db); + mlx4_free_buf(&to_msrq(srq)->buf); + free(to_msrq(srq)->wrid); + free(to_msrq(srq)); + + return 0; +} + +struct ibv_qp *mlx4_create_qp(struct ibv_pd *pd, struct ibv_qp_init_attr 
*attr) +{ + struct mlx4_create_qp cmd; + struct ibv_create_qp_resp resp; + struct mlx4_qp *qp; + int ret; + + /* Sanity check QP size before proceeding */ + if (attr->cap.max_send_wr > 65536 || + attr->cap.max_recv_wr > 65536 || + attr->cap.max_send_sge > 64 || + attr->cap.max_recv_sge > 64 || + attr->cap.max_inline_data > 1024) + return NULL; + + qp = malloc(sizeof *qp); + if (!qp) + return NULL; + + mlx4_calc_sq_wqe_size(&attr->cap, attr->qp_type, qp); + + /* + * We need to leave 2 KB + 1 WQE of headroom in the SQ to + * allow HW to prefetch. + */ + qp->sq_spare_wqes = (2048 >> qp->sq.wqe_shift) + 1; + qp->sq.wqe_cnt = align_queue_size(attr->cap.max_send_wr + qp->sq_spare_wqes); + qp->rq.wqe_cnt = align_queue_size(attr->cap.max_recv_wr); + + if (attr->srq) + attr->cap.max_recv_wr = qp->rq.wqe_cnt = 0; + else { + if (attr->cap.max_recv_sge < 1) + attr->cap.max_recv_sge = 1; + if (attr->cap.max_recv_wr < 1) + attr->cap.max_recv_wr = 1; + } + + if (mlx4_alloc_qp_buf(pd, &attr->cap, attr->qp_type, qp)) + goto err; + + mlx4_init_qp_indices(qp); + + if (pthread_spin_init(&qp->sq.lock, PTHREAD_PROCESS_PRIVATE) || + pthread_spin_init(&qp->rq.lock, PTHREAD_PROCESS_PRIVATE)) + goto err_free; + + if (!attr->srq) { + qp->db = mlx4_alloc_db(to_mctx(pd->context), MLX4_DB_TYPE_RQ); + if (!qp->db) + goto err_free; + + *qp->db = 0; + } + + cmd.buf_addr = (uintptr_t) qp->buf.buf; + if (attr->srq) + cmd.db_addr = 0; + else + cmd.db_addr = (uintptr_t) qp->db; + cmd.log_sq_stride = qp->sq.wqe_shift; + for (cmd.log_sq_bb_count = 0; + qp->sq.wqe_cnt > 1 << cmd.log_sq_bb_count; + ++cmd.log_sq_bb_count) + ; /* nothing */ + cmd.sq_no_prefetch = 0; /* OK for ABI 2: just a reserved field */ + memset(cmd.reserved, 0, sizeof cmd.reserved); + + pthread_mutex_lock(&to_mctx(pd->context)->qp_table_mutex); + + ret = ibv_cmd_create_qp(pd, &qp->ibv_qp, attr, &cmd.ibv_cmd, sizeof cmd, + &resp, sizeof resp); + if (ret) + goto err_rq_db; + + ret = mlx4_store_qp(to_mctx(pd->context), qp->ibv_qp.qp_num, qp); + if (ret) + goto err_destroy; + pthread_mutex_unlock(&to_mctx(pd->context)->qp_table_mutex); + + qp->rq.wqe_cnt = qp->rq.max_post = attr->cap.max_recv_wr; + qp->rq.max_gs = attr->cap.max_recv_sge; + mlx4_set_sq_sizes(qp, &attr->cap, attr->qp_type); + + qp->doorbell_qpn = htonl(qp->ibv_qp.qp_num << 8); + if (attr->sq_sig_all) + qp->sq_signal_bits = htonl(MLX4_WQE_CTRL_CQ_UPDATE); + else + qp->sq_signal_bits = 0; + + return &qp->ibv_qp; + +err_destroy: + ibv_cmd_destroy_qp(&qp->ibv_qp); + +err_rq_db: + pthread_mutex_unlock(&to_mctx(pd->context)->qp_table_mutex); + if (!attr->srq) + mlx4_free_db(to_mctx(pd->context), MLX4_DB_TYPE_RQ, qp->db); + +err_free: + free(qp->sq.wrid); + if (qp->rq.wqe_cnt) + free(qp->rq.wrid); + mlx4_free_buf(&qp->buf); + +err: + free(qp); + + return NULL; +} + +int mlx4_query_qp(struct ibv_qp *ibqp, struct ibv_qp_attr *attr, + int attr_mask, + struct ibv_qp_init_attr *init_attr) +{ + struct ibv_query_qp cmd; + struct mlx4_qp *qp = to_mqp(ibqp); + int ret; + + ret = ibv_cmd_query_qp(ibqp, attr, attr_mask, init_attr, &cmd, sizeof cmd); + if (ret) + return ret; + + init_attr->cap.max_send_wr = qp->sq.max_post; + init_attr->cap.max_send_sge = qp->sq.max_gs; + init_attr->cap.max_inline_data = qp->max_inline_data; + + attr->cap = init_attr->cap; + + return 0; +} + +int mlx4_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, + int attr_mask) +{ + struct ibv_modify_qp cmd; + struct ibv_port_attr port_attr; + struct mlx4_qp *mqp = to_mqp(qp); + int ret; + + if (attr_mask & IBV_QP_PORT) { + ret = 
ibv_query_port(qp->pd->context, attr->port_num, + &port_attr); + if (ret) + return ret; + mqp->link_layer = port_attr.link_layer; + } + + if (qp->state == IBV_QPS_RESET && + attr_mask & IBV_QP_STATE && + attr->qp_state == IBV_QPS_INIT) { + mlx4_qp_init_sq_ownership(to_mqp(qp)); + } + + ret = ibv_cmd_modify_qp(qp, attr, attr_mask, &cmd, sizeof cmd); + + if (!ret && + (attr_mask & IBV_QP_STATE) && + attr->qp_state == IBV_QPS_RESET) { + mlx4_cq_clean(to_mcq(qp->recv_cq), qp->qp_num, + qp->srq ? to_msrq(qp->srq) : NULL); + if (qp->send_cq != qp->recv_cq) + mlx4_cq_clean(to_mcq(qp->send_cq), qp->qp_num, NULL); + + mlx4_init_qp_indices(to_mqp(qp)); + if (!qp->srq) + *to_mqp(qp)->db = 0; + } + + return ret; +} + +static void mlx4_lock_cqs(struct ibv_qp *qp) +{ + struct mlx4_cq *send_cq = to_mcq(qp->send_cq); + struct mlx4_cq *recv_cq = to_mcq(qp->recv_cq); + + if (send_cq == recv_cq) + pthread_spin_lock(&send_cq->lock); + else if (send_cq->cqn < recv_cq->cqn) { + pthread_spin_lock(&send_cq->lock); + pthread_spin_lock(&recv_cq->lock); + } else { + pthread_spin_lock(&recv_cq->lock); + pthread_spin_lock(&send_cq->lock); + } +} + +static void mlx4_unlock_cqs(struct ibv_qp *qp) +{ + struct mlx4_cq *send_cq = to_mcq(qp->send_cq); + struct mlx4_cq *recv_cq = to_mcq(qp->recv_cq); + + if (send_cq == recv_cq) + pthread_spin_unlock(&send_cq->lock); + else if (send_cq->cqn < recv_cq->cqn) { + pthread_spin_unlock(&recv_cq->lock); + pthread_spin_unlock(&send_cq->lock); + } else { + pthread_spin_unlock(&send_cq->lock); + pthread_spin_unlock(&recv_cq->lock); + } +} + +int mlx4_destroy_qp(struct ibv_qp *ibqp) +{ + struct mlx4_qp *qp = to_mqp(ibqp); + int ret; + + pthread_mutex_lock(&to_mctx(ibqp->context)->qp_table_mutex); + ret = ibv_cmd_destroy_qp(ibqp); + if (ret) { + pthread_mutex_unlock(&to_mctx(ibqp->context)->qp_table_mutex); + return ret; + } + + mlx4_lock_cqs(ibqp); + + __mlx4_cq_clean(to_mcq(ibqp->recv_cq), ibqp->qp_num, + ibqp->srq ? to_msrq(ibqp->srq) : NULL); + if (ibqp->send_cq != ibqp->recv_cq) + __mlx4_cq_clean(to_mcq(ibqp->send_cq), ibqp->qp_num, NULL); + + mlx4_clear_qp(to_mctx(ibqp->context), ibqp->qp_num); + + mlx4_unlock_cqs(ibqp); + pthread_mutex_unlock(&to_mctx(ibqp->context)->qp_table_mutex); + + if (!ibqp->srq) + mlx4_free_db(to_mctx(ibqp->context), MLX4_DB_TYPE_RQ, qp->db); + free(qp->sq.wrid); + if (qp->rq.wqe_cnt) + free(qp->rq.wrid); + mlx4_free_buf(&qp->buf); + free(qp); + + return 0; +} + +static int link_local_gid(const union ibv_gid *gid) +{ + uint32_t hi = *(uint32_t *)(gid->raw); + uint32_t lo = *(uint32_t *)(gid->raw + 4); + if (hi == htonl(0xfe800000) && lo == 0) + return 1; + + return 0; +} + +static int is_multicast_gid(const union ibv_gid *gid) +{ + return gid->raw[0] == 0xff; +} + +static uint16_t get_vlan_id(union ibv_gid *gid) +{ + uint16_t vid; + vid = gid->raw[11] << 8 | gid->raw[12]; + return vid < 0x1000 ? 
vid : 0xffff; +} + +static int mlx4_resolve_grh_to_l2(struct ibv_pd *pd, struct mlx4_ah *ah, + struct ibv_ah_attr *attr) +{ + int err, i; + uint16_t vid; + union ibv_gid sgid; + + if (link_local_gid(&attr->grh.dgid)) { + memcpy(ah->mac, &attr->grh.dgid.raw[8], 3); + memcpy(ah->mac + 3, &attr->grh.dgid.raw[13], 3); + ah->mac[0] ^= 2; + + vid = get_vlan_id(&attr->grh.dgid); + } else if (is_multicast_gid(&attr->grh.dgid)) { + ah->mac[0] = 0x33; + ah->mac[1] = 0x33; + for (i = 2; i < 6; ++i) + ah->mac[i] = attr->grh.dgid.raw[i + 10]; + + err = ibv_query_gid(pd->context, attr->port_num, + attr->grh.sgid_index, &sgid); + if (err) + return err; + + ah->av.dlid = htons(0xc000); + ah->av.port_pd |= htonl(1 << 31); + + vid = get_vlan_id(&sgid); + } else + return 1; + + if (vid != 0xffff) { + ah->av.port_pd |= htonl(1 << 29); + ah->vlan = vid | ((attr->sl & 7) << 13); + } + + return 0; +} + +struct ibv_ah *mlx4_create_ah(struct ibv_pd *pd, struct ibv_ah_attr *attr) +{ + struct mlx4_ah *ah; + struct ibv_port_attr port_attr; + + if (ibv_query_port(pd->context, attr->port_num, &port_attr)) + return NULL; + + ah = malloc(sizeof *ah); + if (!ah) + return NULL; + + memset(&ah->av, 0, sizeof ah->av); + + ah->av.port_pd = htonl(to_mpd(pd)->pdn | (attr->port_num << 24)); + + if (port_attr.link_layer != IBV_LINK_LAYER_ETHERNET) { + ah->av.g_slid = attr->src_path_bits; + ah->av.dlid = htons(attr->dlid); + ah->av.sl_tclass_flowlabel = htonl(attr->sl << 28); + } else + ah->av.sl_tclass_flowlabel = htonl(attr->sl << 29); + + if (attr->static_rate) { + ah->av.stat_rate = attr->static_rate + MLX4_STAT_RATE_OFFSET; + /* XXX check rate cap? */ + } + if (attr->is_global) { + ah->av.g_slid |= 0x80; + ah->av.gid_index = attr->grh.sgid_index; + ah->av.hop_limit = attr->grh.hop_limit; + ah->av.sl_tclass_flowlabel |= + htonl((attr->grh.traffic_class << 20) | + attr->grh.flow_label); + memcpy(ah->av.dgid, attr->grh.dgid.raw, 16); + } + + if (port_attr.link_layer == IBV_LINK_LAYER_ETHERNET) + if (mlx4_resolve_grh_to_l2(pd, ah, attr)) { + free(ah); + return NULL; + } + + return &ah->ibv_ah; +} + +int mlx4_destroy_ah(struct ibv_ah *ah) +{ + free(to_mah(ah)); + + return 0; +} diff --git a/prov/mlx4/src/qp.c b/prov/mlx4/src/qp.c new file mode 100644 index 00000000000..11c750b4c9a --- /dev/null +++ b/prov/mlx4/src/qp.c @@ -0,0 +1,702 @@ +/* + * Copyright (c) 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2007 Cisco, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#if HAVE_CONFIG_H +# include <config.h> +#endif /* HAVE_CONFIG_H */ + +#include <stdlib.h> +#include <netinet/in.h> +#include <pthread.h> +#include <string.h> +#include <errno.h> + +#include "mlx4.h" +#include "doorbell.h" +#include "wqe.h" + +static const uint32_t mlx4_ib_opcode[] = { + [IBV_WR_SEND] = MLX4_OPCODE_SEND, + [IBV_WR_SEND_WITH_IMM] = MLX4_OPCODE_SEND_IMM, + [IBV_WR_RDMA_WRITE] = MLX4_OPCODE_RDMA_WRITE, + [IBV_WR_RDMA_WRITE_WITH_IMM] = MLX4_OPCODE_RDMA_WRITE_IMM, + [IBV_WR_RDMA_READ] = MLX4_OPCODE_RDMA_READ, + [IBV_WR_ATOMIC_CMP_AND_SWP] = MLX4_OPCODE_ATOMIC_CS, + [IBV_WR_ATOMIC_FETCH_AND_ADD] = MLX4_OPCODE_ATOMIC_FA, +}; + +static void *get_recv_wqe(struct mlx4_qp *qp, int n) +{ + return qp->buf.buf + qp->rq.offset + (n << qp->rq.wqe_shift); +} + +static void *get_send_wqe(struct mlx4_qp *qp, int n) +{ + return qp->buf.buf + qp->sq.offset + (n << qp->sq.wqe_shift); +} + +/* + * Stamp a SQ WQE so that it is invalid if prefetched by marking the + * first four bytes of every 64 byte chunk with 0xffffffff, except for + * the very first chunk of the WQE. + */ +static void stamp_send_wqe(struct mlx4_qp *qp, int n) +{ + uint32_t *wqe = get_send_wqe(qp, n); + int i; + int ds = (((struct mlx4_wqe_ctrl_seg *)wqe)->fence_size & 0x3f) << 2; + + for (i = 16; i < ds; i += 16) + wqe[i] = 0xffffffff; +} + +void mlx4_init_qp_indices(struct mlx4_qp *qp) +{ + qp->sq.head = 0; + qp->sq.tail = 0; + qp->rq.head = 0; + qp->rq.tail = 0; +} + +void mlx4_qp_init_sq_ownership(struct mlx4_qp *qp) +{ + struct mlx4_wqe_ctrl_seg *ctrl; + int i; + + for (i = 0; i < qp->sq.wqe_cnt; ++i) { + ctrl = get_send_wqe(qp, i); + ctrl->owner_opcode = htonl(1 << 31); + ctrl->fence_size = 1 << (qp->sq.wqe_shift - 4); + + stamp_send_wqe(qp, i); + } +} + +static int wq_overflow(struct mlx4_wq *wq, int nreq, struct mlx4_cq *cq) +{ + unsigned cur; + + cur = wq->head - wq->tail; + if (cur + nreq < wq->max_post) + return 0; + + pthread_spin_lock(&cq->lock); + cur = wq->head - wq->tail; + pthread_spin_unlock(&cq->lock); + + return cur + nreq >= wq->max_post; +} + +static inline void set_raddr_seg(struct mlx4_wqe_raddr_seg *rseg, + uint64_t remote_addr, uint32_t rkey) +{ + rseg->raddr = htonll(remote_addr); + rseg->rkey = htonl(rkey); + rseg->reserved = 0; +} + +static void set_atomic_seg(struct mlx4_wqe_atomic_seg *aseg, struct ibv_send_wr *wr) +{ + if (wr->opcode == IBV_WR_ATOMIC_CMP_AND_SWP) { + aseg->swap_add = htonll(wr->wr.atomic.swap); + aseg->compare = htonll(wr->wr.atomic.compare_add); + } else { + aseg->swap_add = htonll(wr->wr.atomic.compare_add); + aseg->compare = 0; + } + +} + +static void set_datagram_seg(struct mlx4_wqe_datagram_seg *dseg, + struct ibv_send_wr *wr) +{ + memcpy(dseg->av, &to_mah(wr->wr.ud.ah)->av, sizeof (struct mlx4_av)); + dseg->dqpn = htonl(wr->wr.ud.remote_qpn); + dseg->qkey = htonl(wr->wr.ud.remote_qkey); + dseg->vlan = htons(to_mah(wr->wr.ud.ah)->vlan); + memcpy(dseg->mac, to_mah(wr->wr.ud.ah)->mac, 6); +} + +static void __set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ibv_sge *sg) +{ + 
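/* Receive-path variant used by mlx4_post_recv(); the send path uses set_data_seg() below, which issues a write barrier before byte_count so the HCA prefetcher never sees a valid count with stale data. */ +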
dseg->byte_count = htonl(sg->length); + dseg->lkey = htonl(sg->lkey); + dseg->addr = htonll(sg->addr); +} + +static void set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ibv_sge *sg) +{ + dseg->lkey = htonl(sg->lkey); + dseg->addr = htonll(sg->addr); + + /* + * Need a barrier here before writing the byte_count field to + * make sure that all the data is visible before the + * byte_count field is set. Otherwise, if the segment begins + * a new cacheline, the HCA prefetcher could grab the 64-byte + * chunk and get a valid (!= * 0xffffffff) byte count but + * stale data, and end up sending the wrong data. + */ + wmb(); + + dseg->byte_count = htonl(sg->length); +} + +/* + * Avoid using memcpy() to copy to BlueFlame page, since memcpy() + * implementations may use move-string-buffer assembler instructions, + * which do not guarantee order of copying. + */ +static void mlx4_bf_copy(unsigned long *dst, unsigned long *src, unsigned bytecnt) +{ + while (bytecnt > 0) { + *dst++ = *src++; + *dst++ = *src++; + bytecnt -= 2 * sizeof (long); + } +} + +int mlx4_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr, + struct ibv_send_wr **bad_wr) +{ + struct mlx4_context *ctx; + struct mlx4_qp *qp = to_mqp(ibqp); + void *wqe; + struct mlx4_wqe_ctrl_seg *ctrl; + int ind; + int nreq; + int inl = 0; + int ret = 0; + int size; + int i; + + pthread_spin_lock(&qp->sq.lock); + + /* XXX check that state is OK to post send */ + + ind = qp->sq.head; + + for (nreq = 0; wr; ++nreq, wr = wr->next) { + if (wq_overflow(&qp->sq, nreq, to_mcq(qp->ibv_qp.send_cq))) { + ret = ENOMEM; + *bad_wr = wr; + goto out; + } + + if (wr->num_sge > qp->sq.max_gs) { + ret = ENOMEM; + *bad_wr = wr; + goto out; + } + + if (wr->opcode >= sizeof mlx4_ib_opcode / sizeof mlx4_ib_opcode[0]) { + ret = EINVAL; + *bad_wr = wr; + goto out; + } + + ctrl = wqe = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1)); + qp->sq.wrid[ind & (qp->sq.wqe_cnt - 1)] = wr->wr_id; + + ctrl->srcrb_flags = + (wr->send_flags & IBV_SEND_SIGNALED ? + htonl(MLX4_WQE_CTRL_CQ_UPDATE) : 0) | + (wr->send_flags & IBV_SEND_SOLICITED ? 
+ htonl(MLX4_WQE_CTRL_SOLICIT) : 0) | + qp->sq_signal_bits; + + if (wr->opcode == IBV_WR_SEND_WITH_IMM || + wr->opcode == IBV_WR_RDMA_WRITE_WITH_IMM) + ctrl->imm = wr->imm_data; + else + ctrl->imm = 0; + + wqe += sizeof *ctrl; + size = sizeof *ctrl / 16; + + switch (ibqp->qp_type) { + case IBV_QPT_RC: + case IBV_QPT_UC: + switch (wr->opcode) { + case IBV_WR_ATOMIC_CMP_AND_SWP: + case IBV_WR_ATOMIC_FETCH_AND_ADD: + set_raddr_seg(wqe, wr->wr.atomic.remote_addr, + wr->wr.atomic.rkey); + wqe += sizeof (struct mlx4_wqe_raddr_seg); + + set_atomic_seg(wqe, wr); + wqe += sizeof (struct mlx4_wqe_atomic_seg); + size += (sizeof (struct mlx4_wqe_raddr_seg) + + sizeof (struct mlx4_wqe_atomic_seg)) / 16; + + break; + + case IBV_WR_RDMA_READ: + inl = 1; + /* fall through */ + case IBV_WR_RDMA_WRITE: + case IBV_WR_RDMA_WRITE_WITH_IMM: + if (!wr->num_sge) + inl = 1; + set_raddr_seg(wqe, wr->wr.rdma.remote_addr, + wr->wr.rdma.rkey); + wqe += sizeof (struct mlx4_wqe_raddr_seg); + size += sizeof (struct mlx4_wqe_raddr_seg) / 16; + + break; + + default: + /* No extra segments required for sends */ + break; + } + break; + + case IBV_QPT_UD: + set_datagram_seg(wqe, wr); + wqe += sizeof (struct mlx4_wqe_datagram_seg); + size += sizeof (struct mlx4_wqe_datagram_seg) / 16; + break; + + case IBV_QPT_RAW_PACKET: + /* For raw eth, the MLX4_WQE_CTRL_SOLICIT flag is used + * to indicate that no icrc should be calculated */ + ctrl->srcrb_flags |= htonl(MLX4_WQE_CTRL_SOLICIT); + break; + + default: + break; + } + + if (wr->send_flags & IBV_SEND_INLINE && wr->num_sge) { + struct mlx4_wqe_inline_seg *seg; + void *addr; + int len, seg_len; + int num_seg; + int off, to_copy; + + inl = 0; + + seg = wqe; + wqe += sizeof *seg; + off = ((uintptr_t) wqe) & (MLX4_INLINE_ALIGN - 1); + num_seg = 0; + seg_len = 0; + + for (i = 0; i < wr->num_sge; ++i) { + addr = (void *) (uintptr_t) wr->sg_list[i].addr; + len = wr->sg_list[i].length; + inl += len; + + if (inl > qp->max_inline_data) { + inl = 0; + ret = ENOMEM; + *bad_wr = wr; + goto out; + } + + while (len >= MLX4_INLINE_ALIGN - off) { + to_copy = MLX4_INLINE_ALIGN - off; + memcpy(wqe, addr, to_copy); + len -= to_copy; + wqe += to_copy; + addr += to_copy; + seg_len += to_copy; + wmb(); /* see comment below */ + seg->byte_count = htonl(MLX4_INLINE_SEG | seg_len); + seg_len = 0; + seg = wqe; + wqe += sizeof *seg; + off = sizeof *seg; + ++num_seg; + } + + memcpy(wqe, addr, len); + wqe += len; + seg_len += len; + off += len; + } + + if (seg_len) { + ++num_seg; + /* + * Need a barrier here to make sure + * all the data is visible before the + * byte_count field is set. Otherwise + * the HCA prefetcher could grab the + * 64-byte chunk with this inline + * segment and get a valid (!= + * 0xffffffff) byte count but stale + * data, and end up sending the wrong + * data. + */ + wmb(); + seg->byte_count = htonl(MLX4_INLINE_SEG | seg_len); + } + + size += (inl + num_seg * sizeof * seg + 15) / 16; + } else { + struct mlx4_wqe_data_seg *seg = wqe; + + for (i = wr->num_sge - 1; i >= 0 ; --i) + set_data_seg(seg + i, wr->sg_list + i); + + size += wr->num_sge * (sizeof *seg / 16); + } + + ctrl->fence_size = (wr->send_flags & IBV_SEND_FENCE ? + MLX4_WQE_CTRL_FENCE : 0) | size; + + /* + * Make sure descriptor is fully written before + * setting ownership bit (because HW can start + * executing as soon as we do). + */ + wmb(); + + ctrl->owner_opcode = htonl(mlx4_ib_opcode[wr->opcode]) | + (ind & qp->sq.wqe_cnt ? 
htonl(1 << 31) : 0); + + /* + * We can improve latency by not stamping the last + * send queue WQE until after ringing the doorbell, so + * only stamp here if there are still more WQEs to post. + */ + if (wr->next) + stamp_send_wqe(qp, (ind + qp->sq_spare_wqes) & + (qp->sq.wqe_cnt - 1)); + + ++ind; + } + +out: + ctx = to_mctx(ibqp->context); + + if (nreq == 1 && inl && size > 1 && size <= ctx->bf_buf_size / 16) { + ctrl->owner_opcode |= htonl((qp->sq.head & 0xffff) << 8); + *(uint32_t *) ctrl->reserved |= qp->doorbell_qpn; + /* + * Make sure that descriptor is written to memory + * before writing to BlueFlame page. + */ + wmb(); + + ++qp->sq.head; + + pthread_spin_lock(&ctx->bf_lock); + + mlx4_bf_copy(ctx->bf_page + ctx->bf_offset, (unsigned long *) ctrl, + align(size * 16, 64)); + wc_wmb(); + + ctx->bf_offset ^= ctx->bf_buf_size; + + pthread_spin_unlock(&ctx->bf_lock); + } else if (nreq) { + qp->sq.head += nreq; + + /* + * Make sure that descriptors are written before + * doorbell record. + */ + wmb(); + + *(uint32_t *) (ctx->uar + MLX4_SEND_DOORBELL) = qp->doorbell_qpn; + } + + if (nreq) + stamp_send_wqe(qp, (ind + qp->sq_spare_wqes - 1) & + (qp->sq.wqe_cnt - 1)); + + pthread_spin_unlock(&qp->sq.lock); + + return ret; +} + +int mlx4_post_recv(struct ibv_qp *ibqp, struct ibv_recv_wr *wr, + struct ibv_recv_wr **bad_wr) +{ + struct mlx4_qp *qp = to_mqp(ibqp); + struct mlx4_wqe_data_seg *scat; + int ret = 0; + int nreq; + int ind; + int i; + + pthread_spin_lock(&qp->rq.lock); + + /* XXX check that state is OK to post receive */ + + ind = qp->rq.head & (qp->rq.wqe_cnt - 1); + + for (nreq = 0; wr; ++nreq, wr = wr->next) { + if (wq_overflow(&qp->rq, nreq, to_mcq(qp->ibv_qp.recv_cq))) { + ret = ENOMEM; + *bad_wr = wr; + goto out; + } + + if (wr->num_sge > qp->rq.max_gs) { + ret = ENOMEM; + *bad_wr = wr; + goto out; + } + + scat = get_recv_wqe(qp, ind); + + for (i = 0; i < wr->num_sge; ++i) + __set_data_seg(scat + i, wr->sg_list + i); + + if (i < qp->rq.max_gs) { + scat[i].byte_count = 0; + scat[i].lkey = htonl(MLX4_INVALID_LKEY); + scat[i].addr = 0; + } + + qp->rq.wrid[ind] = wr->wr_id; + + ind = (ind + 1) & (qp->rq.wqe_cnt - 1); + } + +out: + if (nreq) { + qp->rq.head += nreq; + + /* + * Make sure that descriptors are written before + * doorbell record. + */ + wmb(); + + *qp->db = htonl(qp->rq.head & 0xffff); + } + + pthread_spin_unlock(&qp->rq.lock); + + return ret; +} + +static int num_inline_segs(int data, enum ibv_qp_type type) +{ + /* + * Inline data segments are not allowed to cross 64 byte + * boundaries. For UD QPs, the data segments always start + * aligned to 64 bytes (16 byte control segment + 48 byte + * datagram segment); for other QPs, there will be a 16 byte + * control segment and possibly a 16 byte remote address + * segment, so in the worst case there will be only 32 bytes + * available for the first data segment. 
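+ * For a rough worked example, assuming the 4-byte inline segment header from wqe.h: each 64-byte chunk then holds 60 bytes of payload, so 120 bytes of inline data on a UD QP requires two inline segments.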
+ */ + if (type == IBV_QPT_UD) + data += (sizeof (struct mlx4_wqe_ctrl_seg) + + sizeof (struct mlx4_wqe_datagram_seg)) % + MLX4_INLINE_ALIGN; + else + data += (sizeof (struct mlx4_wqe_ctrl_seg) + + sizeof (struct mlx4_wqe_raddr_seg)) % + MLX4_INLINE_ALIGN; + + return (data + MLX4_INLINE_ALIGN - sizeof (struct mlx4_wqe_inline_seg) - 1) / + (MLX4_INLINE_ALIGN - sizeof (struct mlx4_wqe_inline_seg)); +} + +void mlx4_calc_sq_wqe_size(struct ibv_qp_cap *cap, enum ibv_qp_type type, + struct mlx4_qp *qp) +{ + int size; + int max_sq_sge; + + max_sq_sge = align(cap->max_inline_data + + num_inline_segs(cap->max_inline_data, type) * + sizeof (struct mlx4_wqe_inline_seg), + sizeof (struct mlx4_wqe_data_seg)) / + sizeof (struct mlx4_wqe_data_seg); + if (max_sq_sge < cap->max_send_sge) + max_sq_sge = cap->max_send_sge; + + size = max_sq_sge * sizeof (struct mlx4_wqe_data_seg); + switch (type) { + case IBV_QPT_UD: + size += sizeof (struct mlx4_wqe_datagram_seg); + break; + + case IBV_QPT_UC: + size += sizeof (struct mlx4_wqe_raddr_seg); + break; + + case IBV_QPT_RC: + size += sizeof (struct mlx4_wqe_raddr_seg); + /* + * An atomic op will require an atomic segment, a + * remote address segment and one scatter entry. + */ + if (size < (sizeof (struct mlx4_wqe_atomic_seg) + + sizeof (struct mlx4_wqe_raddr_seg) + + sizeof (struct mlx4_wqe_data_seg))) + size = (sizeof (struct mlx4_wqe_atomic_seg) + + sizeof (struct mlx4_wqe_raddr_seg) + + sizeof (struct mlx4_wqe_data_seg)); + break; + + default: + break; + } + + /* Make sure that we have enough space for a bind request */ + if (size < sizeof (struct mlx4_wqe_bind_seg)) + size = sizeof (struct mlx4_wqe_bind_seg); + + size += sizeof (struct mlx4_wqe_ctrl_seg); + + for (qp->sq.wqe_shift = 6; 1 << qp->sq.wqe_shift < size; + qp->sq.wqe_shift++) + ; /* nothing */ +} + +int mlx4_alloc_qp_buf(struct ibv_pd *pd, struct ibv_qp_cap *cap, + enum ibv_qp_type type, struct mlx4_qp *qp) +{ + qp->rq.max_gs = cap->max_recv_sge; + + qp->sq.wrid = malloc(qp->sq.wqe_cnt * sizeof (uint64_t)); + if (!qp->sq.wrid) + return -1; + + if (qp->rq.wqe_cnt) { + qp->rq.wrid = malloc(qp->rq.wqe_cnt * sizeof (uint64_t)); + if (!qp->rq.wrid) { + free(qp->sq.wrid); + return -1; + } + } + + for (qp->rq.wqe_shift = 4; + 1 << qp->rq.wqe_shift < qp->rq.max_gs * sizeof (struct mlx4_wqe_data_seg); + qp->rq.wqe_shift++) + ; /* nothing */ + + qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) + + (qp->sq.wqe_cnt << qp->sq.wqe_shift); + if (qp->rq.wqe_shift > qp->sq.wqe_shift) { + qp->rq.offset = 0; + qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift; + } else { + qp->rq.offset = qp->sq.wqe_cnt << qp->sq.wqe_shift; + qp->sq.offset = 0; + } + + if (mlx4_alloc_buf(&qp->buf, + align(qp->buf_size, to_mdev(pd->context->device)->page_size), + to_mdev(pd->context->device)->page_size)) { + free(qp->sq.wrid); + free(qp->rq.wrid); + return -1; + } + + memset(qp->buf.buf, 0, qp->buf_size); + + return 0; +} + +void mlx4_set_sq_sizes(struct mlx4_qp *qp, struct ibv_qp_cap *cap, + enum ibv_qp_type type) +{ + int wqe_size; + + wqe_size = (1 << qp->sq.wqe_shift) - sizeof (struct mlx4_wqe_ctrl_seg); + switch (type) { + case IBV_QPT_UD: + wqe_size -= sizeof (struct mlx4_wqe_datagram_seg); + break; + + case IBV_QPT_UC: + case IBV_QPT_RC: + wqe_size -= sizeof (struct mlx4_wqe_raddr_seg); + break; + + default: + break; + } + + qp->sq.max_gs = wqe_size / sizeof (struct mlx4_wqe_data_seg); + cap->max_send_sge = qp->sq.max_gs; + qp->sq.max_post = qp->sq.wqe_cnt - qp->sq_spare_wqes; + cap->max_send_wr = qp->sq.max_post; + + /* 
+ * Inline data segments can't cross a 64 byte boundary. So + * subtract off one segment header for each 64-byte chunk, + * taking into account the fact that wqe_size will be 32 mod + * 64 for non-UD QPs. + */ + qp->max_inline_data = wqe_size - + sizeof (struct mlx4_wqe_inline_seg) * + (align(wqe_size, MLX4_INLINE_ALIGN) / MLX4_INLINE_ALIGN); + cap->max_inline_data = qp->max_inline_data; +} + +struct mlx4_qp *mlx4_find_qp(struct mlx4_context *ctx, uint32_t qpn) +{ + int tind = (qpn & (ctx->num_qps - 1)) >> ctx->qp_table_shift; + + if (ctx->qp_table[tind].refcnt) + return ctx->qp_table[tind].table[qpn & ctx->qp_table_mask]; + else + return NULL; +} + +int mlx4_store_qp(struct mlx4_context *ctx, uint32_t qpn, struct mlx4_qp *qp) +{ + int tind = (qpn & (ctx->num_qps - 1)) >> ctx->qp_table_shift; + + if (!ctx->qp_table[tind].refcnt) { + ctx->qp_table[tind].table = calloc(ctx->qp_table_mask + 1, + sizeof (struct mlx4_qp *)); + if (!ctx->qp_table[tind].table) + return -1; + } + + ++ctx->qp_table[tind].refcnt; + ctx->qp_table[tind].table[qpn & ctx->qp_table_mask] = qp; + return 0; +} + +void mlx4_clear_qp(struct mlx4_context *ctx, uint32_t qpn) +{ + int tind = (qpn & (ctx->num_qps - 1)) >> ctx->qp_table_shift; + + if (!--ctx->qp_table[tind].refcnt) + free(ctx->qp_table[tind].table); + else + ctx->qp_table[tind].table[qpn & ctx->qp_table_mask] = NULL; +} diff --git a/prov/mlx4/src/srq.c b/prov/mlx4/src/srq.c new file mode 100644 index 00000000000..f1d12402701 --- /dev/null +++ b/prov/mlx4/src/srq.c @@ -0,0 +1,175 @@ +/* + * Copyright (c) 2007 Cisco, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#if HAVE_CONFIG_H +# include <config.h> +#endif /* HAVE_CONFIG_H */ + +#include <stdlib.h> +#include <netinet/in.h> +#include <pthread.h> +#include <string.h> + +#include "mlx4.h" +#include "doorbell.h" +#include "wqe.h" + +static void *get_wqe(struct mlx4_srq *srq, int n) +{ + return srq->buf.buf + (n << srq->wqe_shift); +} + +void mlx4_free_srq_wqe(struct mlx4_srq *srq, int ind) +{ + struct mlx4_wqe_srq_next_seg *next; + + pthread_spin_lock(&srq->lock); + + next = get_wqe(srq, srq->tail); + next->next_wqe_index = htons(ind); + srq->tail = ind; + + pthread_spin_unlock(&srq->lock); +} + +int mlx4_post_srq_recv(struct ibv_srq *ibsrq, + struct ibv_recv_wr *wr, + struct ibv_recv_wr **bad_wr) +{ + struct mlx4_srq *srq = to_msrq(ibsrq); + struct mlx4_wqe_srq_next_seg *next; + struct mlx4_wqe_data_seg *scat; + int err = 0; + int nreq; + int i; + + pthread_spin_lock(&srq->lock); + + for (nreq = 0; wr; ++nreq, wr = wr->next) { + if (wr->num_sge > srq->max_gs) { + err = -1; + *bad_wr = wr; + break; + } + + if (srq->head == srq->tail) { + /* SRQ is full*/ + err = -1; + *bad_wr = wr; + break; + } + + srq->wrid[srq->head] = wr->wr_id; + + next = get_wqe(srq, srq->head); + srq->head = ntohs(next->next_wqe_index); + scat = (struct mlx4_wqe_data_seg *) (next + 1); + + for (i = 0; i < wr->num_sge; ++i) { + scat[i].byte_count = htonl(wr->sg_list[i].length); + scat[i].lkey = htonl(wr->sg_list[i].lkey); + scat[i].addr = htonll(wr->sg_list[i].addr); + } + + if (i < srq->max_gs) { + scat[i].byte_count = 0; + scat[i].lkey = htonl(MLX4_INVALID_LKEY); + scat[i].addr = 0; + } + } + + if (nreq) { + srq->counter += nreq; + + /* + * Make sure that descriptors are written before + * we write doorbell record. + */ + wmb(); + + *srq->db = htonl(srq->counter); + } + + pthread_spin_unlock(&srq->lock); + + return err; +} + +int mlx4_alloc_srq_buf(struct ibv_pd *pd, struct ibv_srq_attr *attr, + struct mlx4_srq *srq) +{ + struct mlx4_wqe_srq_next_seg *next; + struct mlx4_wqe_data_seg *scatter; + int size; + int buf_size; + int i; + + srq->wrid = malloc(srq->max * sizeof (uint64_t)); + if (!srq->wrid) + return -1; + + size = sizeof (struct mlx4_wqe_srq_next_seg) + + srq->max_gs * sizeof (struct mlx4_wqe_data_seg); + + for (srq->wqe_shift = 5; 1 << srq->wqe_shift < size; ++srq->wqe_shift) + ; /* nothing */ + + buf_size = srq->max << srq->wqe_shift; + + if (mlx4_alloc_buf(&srq->buf, buf_size, + to_mdev(pd->context->device)->page_size)) { + free(srq->wrid); + return -1; + } + + memset(srq->buf.buf, 0, buf_size); + + /* + * Now initialize the SRQ buffer so that all of the WQEs are + * linked into the list of free WQEs. + */ + + for (i = 0; i < srq->max; ++i) { + next = get_wqe(srq, i); + next->next_wqe_index = htons((i + 1) & (srq->max - 1)); + + for (scatter = (void *) (next + 1); + (void *) scatter < (void *) next + (1 << srq->wqe_shift); + ++scatter) + scatter->lkey = htonl(MLX4_INVALID_LKEY); + } + + srq->head = 0; + srq->tail = srq->max - 1; + + return 0; +} diff --git a/prov/mlx4/src/wqe.h b/prov/mlx4/src/wqe.h new file mode 100644 index 00000000000..bbd22bad225 --- /dev/null +++ b/prov/mlx4/src/wqe.h @@ -0,0 +1,121 @@ +/* + * Copyright (c) 2007 Cisco, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef WQE_H +#define WQE_H + +enum { + MLX4_SEND_DOORBELL = 0x14, +}; + +enum { + MLX4_WQE_CTRL_FENCE = 1 << 6, + MLX4_WQE_CTRL_CQ_UPDATE = 3 << 2, + MLX4_WQE_CTRL_SOLICIT = 1 << 1, +}; + +enum { + MLX4_INLINE_SEG = 1 << 31, + MLX4_INLINE_ALIGN = 64, +}; + +enum { + MLX4_INVALID_LKEY = 0x100, +}; + +struct mlx4_wqe_ctrl_seg { + uint32_t owner_opcode; + uint8_t reserved[3]; + uint8_t fence_size; + /* + * High 24 bits are SRC remote buffer; low 8 bits are flags: + * [7] SO (strong ordering) + * [5] TCP/UDP checksum + * [4] IP checksum + * [3:2] C (generate completion queue entry) + * [1] SE (solicited event) + * [0] FL (force loopback) + */ + uint32_t srcrb_flags; + /* + * imm is immediate data for send/RDMA write w/ immediate; + * also invalidation key for send with invalidate; input + * modifier for WQEs on CCQs. + */ + uint32_t imm; +}; + +struct mlx4_wqe_datagram_seg { + uint32_t av[8]; + uint32_t dqpn; + uint32_t qkey; + uint16_t vlan; + uint8_t mac[6]; +}; + +struct mlx4_wqe_data_seg { + uint32_t byte_count; + uint32_t lkey; + uint64_t addr; +}; + +struct mlx4_wqe_inline_seg { + uint32_t byte_count; +}; + +struct mlx4_wqe_srq_next_seg { + uint16_t reserved1; + uint16_t next_wqe_index; + uint32_t reserved2[3]; +}; + +struct mlx4_wqe_raddr_seg { + uint64_t raddr; + uint32_t rkey; + uint32_t reserved; +}; + +struct mlx4_wqe_atomic_seg { + uint64_t swap_add; + uint64_t compare; +}; + +struct mlx4_wqe_bind_seg { + uint32_t flags1; + uint32_t flags2; + uint32_t new_rkey; + uint32_t lkey; + uint64_t addr; + uint64_t length; +}; + +#endif /* WQE_H */ diff --git a/prov/psm/AUTHORS b/prov/psm/AUTHORS new file mode 100644 index 00000000000..e104c6e5b95 --- /dev/null +++ b/prov/psm/AUTHORS @@ -0,0 +1 @@ +Jianxin Xiong <jianxin.xiong@intel.com> diff --git a/prov/psm/COPYING b/prov/psm/COPYING new file mode 100644 index 00000000000..ee1a79ffabf --- /dev/null +++ b/prov/psm/COPYING @@ -0,0 +1,378 @@ +This software is available to you under a choice of one of two +licenses. You may choose to be licensed under the terms of the the +OpenIB.org BSD license or the GNU General Public License (GPL) Version +2, both included below. + +Copyright (c) 2004 Topspin Communications. All rights reserved. 
+ +================================================================== + + OpenIB.org BSD license + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + +================================================================== + + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc. + 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Library General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. 
+ + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. 
+ + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. 
+ +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. 
If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. 
+ + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) year name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + <signature of Ty Coon>, 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Library General +Public License instead of this License. 
diff --git a/prov/psm/src/psmx.h b/prov/psm/src/psmx.h new file mode 100644 index 00000000000..4fadc90b558 --- /dev/null +++ b/prov/psm/src/psmx.h @@ -0,0 +1,91 @@ +#ifndef _FI_PSM_H +#define _FI_PSM_H + +#ifdef __cplusplus +extern "C" { +#endif + +#if HAVE_CONFIG_H +#include <config.h> +#endif + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <errno.h> +#include <unistd.h> +#include <fcntl.h> +#include <pthread.h> +#include <sys/types.h> +#include <sys/socket.h> +#include <netdb.h> +#include <rdma/fabric.h> +#include <rdma/fi_prov.h> +#include <rdma/fi_domain.h> +#include <rdma/fi_socket.h> +#include <rdma/fi_tagged.h> +#include <rdma/fi_cm.h> +#include <rdma/fi_errno.h> +#include <psm.h> +#include <psm_mq.h> + +#define PFX "libfabric:psm" + +#define PSMX_TIME_OUT 120 + +struct psmx_fid_domain { + struct fid_domain domain; + psm_ep_t psm_ep; + psm_epid_t psm_epid; + psm_mq_t psm_mq; + pthread_t ns_thread; + int ns_port; +}; + +struct psmx_fid_ec { + struct fid_ec ec; + struct psmx_fid_domain *domain; + int type; + int format; +}; + +struct psmx_fid_av { + struct fid_av av; + struct psmx_fid_domain *domain; + int type; + int format; + size_t addrlen; +}; + +struct psmx_fid_socket { + struct fid_socket socket; + struct psmx_fid_domain *domain; + struct psmx_fid_ec *ec; + struct psmx_fid_av *av; + uint64_t flags; +}; + +extern struct fi_ops_cm psmx_cm_ops; +extern struct fi_ops_tagged psmx_tagged_ops; + +void psmx_ini(void); +void psmx_fini(void); + +int psmx_domain_open(const char *name, struct fi_info *info, uint64_t flags, + fid_t *fid, void *context); +int psmx_sock_open(struct fi_info *info, fid_t *fid, void *context); +int psmx_ec_open(fid_t fid, struct fi_ec_attr *attr, fid_t *ec, void *context); +int psmx_av_open(fid_t fid, struct fi_av_attr *attr, fid_t *av, void *context); + +void *psmx_name_server(void *args); +void *psmx_resolve_name(char *servername, psm_uuid_t uuid); +void psmx_string_to_uuid(char *s, psm_uuid_t uuid); +int psmx_uuid_to_port(psm_uuid_t uuid); +int psmx_errno(int err); + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/prov/psm/src/psmx_av.c b/prov/psm/src/psmx_av.c new file mode 100644 index 00000000000..d14f22fe038 --- /dev/null +++ b/prov/psm/src/psmx_av.c @@ -0,0 +1,145 @@ +/* + * Copyright (c) 2013 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenFabrics.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "psmx.h" + +static int psmx_av_insert(fid_t fid, const void *addr, size_t count, + void **fi_addr, uint64_t flags) +{ + struct psmx_fid_av *fid_av; + psm_error_t *errors; + int err; + + fid_av = container_of(fid, struct psmx_fid_av, av.fid); + + errors = (psm_error_t *) calloc(count, sizeof *errors); + if (!errors) + return -ENOMEM; + + err = psm_ep_connect(fid_av->domain->psm_ep, count, + (psm_epid_t *) addr, NULL, errors, + (psm_epaddr_t *) fi_addr, 30*1e9); + + free(errors); + + return psmx_errno(err); +} + +static int psmx_av_remove(fid_t fid, void *fi_addr, size_t count, + uint64_t flags) +{ + struct psmx_fid_av *fid_av; + int err = PSM_OK; + fid_av = container_of(fid, struct psmx_fid_av, av.fid); + + return psmx_errno(err); +} + +static int psmx_av_close(fid_t fid) +{ + struct psmx_fid_av *fid_av; + fid_av = container_of(fid, struct psmx_fid_av, av.fid); + free(fid_av); + return 0; +} + +static int psmx_av_bind(fid_t fid, struct fi_resource *fids, int nfids) +{ + /* no need to bind an EQ since insert/remove is synchronous */ + return 0; +} + +static int psmx_av_sync(fid_t fid, uint64_t flags, void *context) +{ + /* no-op since insert/remove is synchronous */ + return 0; +} + +static int psmx_av_control(fid_t fid, int command, void *arg) +{ + return -ENOSYS; +} + +static struct fi_ops psmx_fi_ops = { + .size = sizeof(struct fi_ops), + .close = psmx_av_close, + .bind = psmx_av_bind, + .sync = psmx_av_sync, + .control = psmx_av_control, +}; + +static struct fi_ops_av psmx_av_ops = { + .size = sizeof(struct fi_ops_av), + .insert = psmx_av_insert, + .remove = psmx_av_remove, +}; + +int psmx_av_open(fid_t fid, struct fi_av_attr *attr, fid_t *av, void *context) +{ + struct psmx_fid_domain *fid_domain; + struct psmx_fid_av *fid_av; + + fid_domain = container_of(fid, struct psmx_fid_domain, domain.fid); + + if (attr) { + if ((attr->av_mask & FI_AV_ATTR_TYPE) && + attr->type != FI_AV_MAP) + return -ENOSYS; + + if ((attr->av_mask & FI_AV_ATTR_ADDR_FORMAT) && + attr->addr_format != FI_ADDR) + return -ENOSYS; + + if ((attr->av_mask & FI_AV_ATTR_ADDRLEN) && + attr->addrlen != sizeof(psm_epaddr_t)) + return -ENOSYS; + } + + fid_av = (struct psmx_fid_av *) calloc(1, sizeof *fid_av); + if (!fid_av) + return -ENOMEM; + + fid_av->domain = fid_domain; + fid_av->type = FI_AV_MAP; + fid_av->format = FI_ADDR; + fid_av->addrlen = sizeof(psm_epaddr_t); + + fid_av->av.fid.size = sizeof(struct fid_av); + fid_av->av.fid.fclass = FID_CLASS_AV; + fid_av->av.fid.context = context; + fid_av->av.fid.ops = &psmx_fi_ops; + fid_av->av.ops = &psmx_av_ops; + + *av = &fid_av->av.fid; + return 0; +} + diff --git a/prov/psm/src/psmx_cm.c b/prov/psm/src/psmx_cm.c new file mode 100644 index 00000000000..c43ddb7faff --- /dev/null +++ b/prov/psm/src/psmx_cm.c @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2013 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenFabrics.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "psmx.h" + +static int psmx_cm_getname(fid_t fid, void *addr, size_t *addrlen) +{ + struct psmx_fid_socket *fid_socket; + + fid_socket = container_of(fid, struct psmx_fid_socket, socket.fid); + if (!fid_socket->domain) + return -EBADF; + + if (*addrlen < sizeof(psm_epid_t)) + return -FI_ETOOSMALL; + + *(psm_epid_t *)addr = fid_socket->domain->psm_epid; + *addrlen = sizeof(psm_epid_t); + + return 0; +} + +static int psmx_cm_getpeer(fid_t fid, void *addr, size_t *addrlen) +{ + return -ENOSYS; +} + +static int psmx_cm_connect(fid_t fid, const void *param, size_t paramlen) +{ + return -ENOSYS; +} + +static int psmx_cm_listen(fid_t fid) +{ + return -ENOSYS; +} + +static int psmx_cm_accept(fid_t fid, const void *param, size_t paramlen) +{ + return -ENOSYS; +} + +static int psmx_cm_reject(fid_t fid, struct fi_info *info, const void *param, + size_t paramlen) +{ + return -ENOSYS; +} + +static int psmx_cm_shutdown(fid_t fid, uint64_t flags) +{ + return -ENOSYS; +} + +static int psmx_cm_join(fid_t fid, void *addr, void **fi_addr, uint64_t flags) +{ + return -ENOSYS; +} + +static int psmx_cm_leave(fid_t fid, void *addr, void *fi_addr, uint64_t flags) +{ + return -ENOSYS; +} + +struct fi_ops_cm psmx_cm_ops = { + .size = sizeof(struct fi_ops_cm), + .getname = psmx_cm_getname, + .getpeer = psmx_cm_getpeer, + .connect = psmx_cm_connect, + .listen = psmx_cm_listen, + .accept = psmx_cm_accept, + .reject = psmx_cm_reject, + .shutdown = psmx_cm_shutdown, + .join = psmx_cm_join, + .leave = psmx_cm_leave, +}; + diff --git a/prov/psm/src/psmx_domain.c b/prov/psm/src/psmx_domain.c new file mode 100644 index 00000000000..7d5c95c3e85 --- /dev/null +++ b/prov/psm/src/psmx_domain.c @@ -0,0 +1,177 @@ +/* + * Copyright (c) 2013 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenFabrics.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "psmx.h" + +static int psmx_domain_close(fid_t fid) +{ + struct psmx_fid_domain *fid_domain; + int err; + + fid_domain = container_of(fid, struct psmx_fid_domain, domain.fid); + + if (fid_domain->ns_thread) { + pthread_cancel(fid_domain->ns_thread); + pthread_join(fid_domain->ns_thread, NULL); + } + + psm_mq_finalize(fid_domain->psm_mq); + + err = psm_ep_close(fid_domain->psm_ep, PSM_EP_CLOSE_GRACEFUL, + (int64_t) PSMX_TIME_OUT * 1000000000LL); + if (err != PSM_OK) + psm_ep_close(fid_domain->psm_ep, PSM_EP_CLOSE_FORCE, 0); + + free(fid_domain); + + return 0; +} + +static int psmx_domain_bind(fid_t fid, struct fi_resource *fids, int nfids) +{ + return -ENOSYS; +} + +static int psmx_domain_sync(fid_t fid, uint64_t flags, void *context) +{ + return -ENOSYS; +} + +static int psmx_domain_control(fid_t fid, int command, void *arg) +{ + return -ENOSYS; +} + +static int psmx_domain_query(fid_t fid, struct fi_domain_attr *attr, size_t *attrlen) +{ + return -ENOSYS; +} + +static int psmx_progress(fid_t fid) +{ + return -ENOSYS; +} + +static int psmx_mr_reg(fid_t fid, const void *buf, size_t len, fid_t *mr, + uint64_t flags, void *context) +{ + return -ENOSYS; +} + +static int psmx_mr_regv(fid_t fid, const struct iovec *iov, size_t count, + fid_t *mr, uint64_t flags, void *context) +{ + return -ENOSYS; +} + +static struct fi_ops psmx_fi_ops = { + .size = sizeof(struct fi_ops), + .close = psmx_domain_close, + .bind = psmx_domain_bind, + .sync = psmx_domain_sync, + .control = psmx_domain_control, +}; + +static struct fi_ops_domain psmx_domain_ops = { + .size = sizeof(struct fi_ops_domain), + .progress = psmx_progress, + .query = psmx_domain_query, + .av_open = psmx_av_open, + .ec_open = psmx_ec_open, + .mr_reg = psmx_mr_reg, + .mr_regv = psmx_mr_regv, +}; + +int psmx_domain_open(const char *name, struct fi_info *info, uint64_t flags, + fid_t *fid, void *context) +{ + struct psmx_fid_domain *fid_domain; + int err = -ENOMEM; + char *s; + + if (name && strncmp(name, "psm", 3)) + return -EINVAL; + + fid_domain = (struct psmx_fid_domain *) calloc(1, sizeof *fid_domain); + if (!fid_domain) + goto err_out; + + fid_domain->domain.fid.size = sizeof(struct fid_domain); + fid_domain->domain.fid.fclass = FID_CLASS_RESOURCE_DOMAIN; + fid_domain->domain.fid.context = 
context; + fid_domain->domain.fid.ops = &psmx_fi_ops; + fid_domain->domain.ops = &psmx_domain_ops; + + err = psm_ep_open(info->auth_key, NULL, + &fid_domain->psm_ep, &fid_domain->psm_epid); + if (err != PSM_OK) { + fprintf(stderr, "%s: psm_ep_open returns %d, errno=%d\n", + __func__, err, errno); + err = psmx_errno(err); + goto err_out_free_domain; + } + + err = psm_mq_init(fid_domain->psm_ep, PSM_MQ_ORDERMASK_ALL, + NULL, 0, &fid_domain->psm_mq); + if (err != PSM_OK) { + fprintf(stderr, "%s: psm_mq_init returns %d, errno=%d\n", + __func__, err, errno); + err = psmx_errno(err); + goto err_out_close_ep; + } + + fid_domain->ns_port = psmx_uuid_to_port(info->auth_key); + + s = getenv("SFI_PSM_NAME_SERVER"); + if (s && (!strcasecmp(s, "yes") || !strcasecmp(s, "on") || !strcmp(s, "1"))) + err = pthread_create(&fid_domain->ns_thread, NULL, psmx_name_server, (void *)fid_domain); + else + err = -1; + + if (err) + fid_domain->ns_thread = 0; + + *fid = &fid_domain->domain.fid; + return 0; + +err_out_close_ep: + if (psm_ep_close(fid_domain->psm_ep, PSM_EP_CLOSE_GRACEFUL, + (int64_t) PSMX_TIME_OUT * 1000000000LL) != PSM_OK) + psm_ep_close(fid_domain->psm_ep, PSM_EP_CLOSE_FORCE, 0); + +err_out_free_domain: + free(fid_domain); + +err_out: + return err; +} + diff --git a/prov/psm/src/psmx_ec.c b/prov/psm/src/psmx_ec.c new file mode 100644 index 00000000000..dfd53fb1692 --- /dev/null +++ b/prov/psm/src/psmx_ec.c @@ -0,0 +1,205 @@ +/* + * Copyright (c) 2013 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenFabrics.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "psmx.h" + +static struct fi_ec_err_entry error_ece; +static int error_state = 0; + +static ssize_t psmx_ec_readfrom(fid_t fid, void *buf, size_t len, + void *src_addr, size_t *addrlen) +{ + struct psmx_fid_ec *fid_ec; + psm_mq_req_t psm_req; + psm_mq_status_t psm_status; + struct fi_ec_tagged_entry *ece; + int err; + + fid_ec = container_of(fid, struct psmx_fid_ec, ec.fid); + assert(fid_ec->domain); + assert(fid_ec->format == FI_EC_FORMAT_TAGGED); + + if (len < sizeof *ece) + return -FI_ETOOSMALL; + + err = psm_mq_ipeek(fid_ec->domain->psm_mq, &psm_req, NULL); + if (err == PSM_OK) { + err = psm_mq_test(&psm_req, &psm_status); + + if (psm_status.error_code) { + error_ece.fid_context = fid_ec->ec.fid.context; + error_ece.op_context = psm_status.context; + error_ece.flags = 0; + error_ece.err = psmx_errno(psm_status.error_code); + error_ece.prov_errno = psm_status.error_code; + error_ece.data = 0; + error_ece.prov_data = NULL; + error_state = 1; + return error_ece.err; + } + + ece = (struct fi_ec_tagged_entry *) buf; + ece->op_context = psm_status.context; + ece->flags = 0; + ece->len = psm_status.nbytes; + ece->data = 0; + ece->tag = psm_status.msg_tag; + ece->olen = psm_status.msg_length; + + return 1; + } else if (err == PSM_MQ_NO_COMPLETIONS) { + return 0; + } else { + return -1; + } +} + +static ssize_t psmx_ec_read(fid_t fid, void *buf, size_t len) +{ + return psmx_ec_readfrom(fid, buf, len, NULL, NULL); +} + +static ssize_t psmx_ec_readerr(fid_t fid, void *buf, size_t len, uint64_t flags) +{ + if (len < sizeof(error_ece)) + return -FI_ETOOSMALL; + + *(struct fi_ec_err_entry *)buf = error_ece; + error_state = 0; + + return 0; +} + +static ssize_t psmx_ec_write(fid_t fid, void *buf, size_t len) +{ + return -ENOSYS; +} + +static int psmx_ec_reset(fid_t fid, void *cond) +{ + return -ENOSYS; +} + +static ssize_t psmx_ec_condread(fid_t fid, void *buf, size_t len, void *cond) +{ + return -ENOSYS; +} + +static ssize_t psmx_ec_condreadfrom(fid_t fid, void *buf, size_t len, + void *src_addr, size_t *addrlen, void *cond) +{ + return -ENOSYS; +} + +static const char *psmx_ec_strerror(fid_t fid, int prov_errno, void *prov_data, + void *buf, size_t len) +{ + return psm_error_get_string(prov_errno); +} + +static int psmx_ec_close(fid_t fid) +{ + struct psmx_fid_ec *fid_ec; + + fid_ec = container_of(fid, struct psmx_fid_ec, ec.fid); + free(fid_ec); + + return 0; +} + +static int psmx_ec_bind(fid_t fid, struct fi_resource *fids, int nfids) +{ + return -ENOSYS; +} + +static int psmx_ec_sync(fid_t fid, uint64_t flags, void *context) +{ + return -ENOSYS; +} + +static int psmx_ec_control(fid_t fid, int command, void *arg) +{ + return -ENOSYS; +} + +static struct fi_ops psmx_fi_ops = { + .size = sizeof(struct fi_ops), + .close = psmx_ec_close, + .bind = psmx_ec_bind, + .sync = psmx_ec_sync, + .control = psmx_ec_control, +}; + +static struct fi_ops_ec psmx_ec_ops = { + .size = sizeof(struct fi_ops_ec), + .read = psmx_ec_read, + .readfrom = psmx_ec_readfrom, + .readerr = psmx_ec_readerr, + .write = psmx_ec_write, + .reset = psmx_ec_reset, + .condread = psmx_ec_condread, + .condreadfrom = psmx_ec_condreadfrom, + .strerror = psmx_ec_strerror, +}; + +int psmx_ec_open(fid_t fid, struct fi_ec_attr *attr, fid_t *ec, void *context) +{ + struct psmx_fid_domain *fid_domain; + struct psmx_fid_ec *fid_ec; + + if (attr->domain != FI_EC_DOMAIN_GENERAL && attr->domain != FI_EC_DOMAIN_COMP) + return -ENOSYS; + + if (attr->type != FI_EC_QUEUE) + return -ENOSYS; + + if (attr->format != FI_EC_FORMAT_TAGGED && 
attr->format != FI_EC_FORMAT_UNSPEC) + return -ENOSYS; + + fid_domain = container_of(fid, struct psmx_fid_domain, domain.fid); + fid_ec = (struct psmx_fid_ec *) calloc(1, sizeof *fid_ec); + if (!fid_ec) + return -ENOMEM; + + fid_ec->domain = fid_domain; + fid_ec->type = FI_EC_QUEUE; + fid_ec->format = FI_EC_FORMAT_TAGGED; + fid_ec->ec.fid.size = sizeof(struct fid_ec); + fid_ec->ec.fid.fclass = FID_CLASS_EC; + fid_ec->ec.fid.context = context; + fid_ec->ec.fid.ops = &psmx_fi_ops; + fid_ec->ec.ops = &psmx_ec_ops; + + *ec = &fid_ec->ec.fid; + return 0; +} + diff --git a/prov/psm/src/psmx_init.c b/prov/psm/src/psmx_init.c new file mode 100644 index 00000000000..ad817a8b069 --- /dev/null +++ b/prov/psm/src/psmx_init.c @@ -0,0 +1,163 @@ +/* + * Copyright (c) 2013 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenFabrics.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "psmx.h" + +static int psmx_getinfo(char *node, char *service, struct fi_info *hints, + struct fi_info **info) +{ + struct fi_info *psmx_info; + uint64_t supported_flags = FI_NONBLOCK|FI_ACK|FI_EXCL|FI_BUFFERED_RECV|FI_CANCEL; + uint64_t default_flags = FI_NONBLOCK; + uint64_t flags = 0; + void *dst_addr = NULL; + void *uuid; + char *s; + + uuid = calloc(1, sizeof(psm_uuid_t)); + if (!uuid) + return -ENOMEM; + + s = getenv("SFI_PSM_UUID"); + if (s) + psmx_string_to_uuid(s, uuid); + + if (node) + dst_addr = psmx_resolve_name(node, uuid); + + if (service) { + /* FIXME: check service */ + } + + if (hints) { + switch (hints->type & FID_TYPE_MASK) { + case FID_UNSPEC: + case FID_RDM: + break; + default: + *info = NULL; + return -ENODATA; + } + + switch (hints->protocol & FI_PROTO_MASK) { + case FI_PROTO_UNSPEC: + if (hints->protocol & FI_PROTO_TAGGED) + break; + /* fall through */ + default: + *info = NULL; + return -ENODATA; + } + + flags = hints->flags; + if ((flags & supported_flags) != flags) { + *info = NULL; + return -ENODATA; + } + + if (hints->domain_name && strncmp(hints->domain_name, "psm", 3)) { + *info = NULL; + return -ENODATA; + } + + /* FIXME: check other fields of hints */ + } + + psmx_info = calloc(1, sizeof *psmx_info); + if (!psmx_info) { + free(uuid); + return -ENOMEM; + } + + psmx_info->next = NULL; + psmx_info->size = sizeof(*psmx_info); + psmx_info->flags = flags | default_flags; + psmx_info->type = FID_RDM; + psmx_info->protocol = FI_PROTO_TAGGED; + psmx_info->iov_format = FI_IOTAGGED; /* FIXME: or FI_IOTAGGEDV? */ + psmx_info->addr_format = FI_ADDR; + psmx_info->info_addr_format = FI_ADDR; + psmx_info->src_addrlen = 0; + psmx_info->dst_addrlen = sizeof(psm_epid_t); + psmx_info->src_addr = NULL; + psmx_info->dst_addr = dst_addr; + psmx_info->auth_keylen = sizeof(psm_uuid_t); + psmx_info->auth_key = uuid; + psmx_info->shared_fd = -1; + psmx_info->domain_name = strdup("psm"); + psmx_info->datalen = 0; + psmx_info->data = NULL; + + *info = psmx_info; + + return 0; +} + +static struct fi_ops_prov psmx_ops = { + .size = sizeof(struct fi_ops_prov), + .getinfo = psmx_getinfo, + .freeinfo = NULL, + .socket = psmx_sock_open, + .open = psmx_domain_open +}; + +void psmx_ini(void) +{ + int major, minor; + int err; + + psm_error_register_handler(NULL, PSM_ERRHANDLER_NO_HANDLER); + + major = PSM_VERNO_MAJOR; + minor = PSM_VERNO_MINOR; + + err = psm_init(&major, &minor); + if (err != PSM_OK) { + fprintf(stderr, "%s: psm_init failed: %s\n", __func__, + psm_error_get_string(err)); + return; + } + + if (major > PSM_VERNO_MAJOR) { + fprintf(stderr, "%s: PSM loaded an unexpected/unsupported version %d.%d\n", + __func__, major, minor); + return; + } + + fi_register(&psmx_ops); +} + +void psmx_fini(void) +{ + psm_finalize(); +} + diff --git a/prov/psm/src/psmx_sock.c b/prov/psm/src/psmx_sock.c new file mode 100644 index 00000000000..5202b1d7ddd --- /dev/null +++ b/prov/psm/src/psmx_sock.c @@ -0,0 +1,177 @@ +/* + * Copyright (c) 2013 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenFabrics.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "psmx.h" + +static ssize_t psmx_sock_cancel(fid_t fid, struct fi_context *context) +{ + struct psmx_fid_socket *fid_socket; + int err; + + fid_socket = container_of(fid, struct psmx_fid_socket, socket.fid); + if (!fid_socket->domain) + return -EBADF; + + if (!context) + return -EINVAL; + + if (context->internal[0] == NULL) + return 0; + + err = psm_mq_cancel((psm_mq_req_t *)&context->internal[0]); + return psmx_errno(err); +} + +static int psmx_sock_getopt(fid_t fid, int level, int optname, + void *optval, size_t *optlen) +{ + return -ENOSYS; +} + +static int psmx_sock_setopt(fid_t fid, int level, int optname, + const void *optval, size_t optlen) +{ + return -ENOSYS; +} + +static int psmx_sock_close(fid_t fid) +{ + struct psmx_fid_socket *fid_socket; + + fid_socket = container_of(fid, struct psmx_fid_socket, socket.fid); + free(fid_socket); + + return 0; +} + +static int psmx_sock_bind(fid_t fid, struct fi_resource *ress, int nress) +{ + int i; + struct psmx_fid_socket *fid_socket; + struct psmx_fid_domain *domain; + struct psmx_fid_av *av; + struct psmx_fid_ec *ec; + + fid_socket = container_of(fid, struct psmx_fid_socket, socket.fid); + + for (i=0; i<nress; i++) { + if (!ress[i].fid) + return -EINVAL; + switch (ress[i].fid->fclass) { + case FID_CLASS_RESOURCE_DOMAIN: + domain = container_of(ress[i].fid, + struct psmx_fid_domain, domain.fid); + if (fid_socket->domain && fid_socket->domain != domain) + return -EEXIST; + fid_socket->domain = domain; + break; + + case FID_CLASS_EC: + /* TODO: check ress flags for send/recv EQs */ + ec = container_of(ress[i].fid, + struct psmx_fid_ec, ec.fid); + if (fid_socket->ec && fid_socket->ec != ec) + return -EEXIST; + if (fid_socket->domain && fid_socket->domain != ec->domain) + return -EINVAL; + fid_socket->ec = ec; + fid_socket->domain = ec->domain; + break; + + case FID_CLASS_AV: + av = container_of(ress[i].fid, + struct psmx_fid_av, av.fid); + if (fid_socket->av && fid_socket->av != av) + return -EEXIST; + if (fid_socket->domain && fid_socket->domain != av->domain) + return -EINVAL; + fid_socket->av = av; + fid_socket->domain = av->domain; + break; + + default: + return -ENOSYS; + } + } + + return 0; +} + +static int psmx_sock_sync(fid_t fid, uint64_t flags, void *context) +{ + return -ENOSYS; +} + +static 
int psmx_sock_control(fid_t fid, int command, void *arg) +{ + return -ENOSYS; +} + +static struct fi_ops psmx_fi_ops = { + .size = sizeof(struct fi_ops), + .close = psmx_sock_close, + .bind = psmx_sock_bind, + .sync = psmx_sock_sync, + .control = psmx_sock_control, +}; + +static struct fi_ops_sock psmx_sock_ops = { + .size = sizeof(struct fi_ops_sock), + .cancel = psmx_sock_cancel, + .getopt = psmx_sock_getopt, + .setopt = psmx_sock_setopt, +}; + +int psmx_sock_open(struct fi_info *info, fid_t *fid, void *context) +{ + struct psmx_fid_socket *fid_socket; + + fid_socket = (struct psmx_fid_socket *) calloc(1, sizeof *fid_socket); + if (!fid_socket) + return -ENOMEM; + + fid_socket->socket.fid.size = sizeof(struct fid_socket); + fid_socket->socket.fid.fclass = FID_CLASS_SOCKET; + fid_socket->socket.fid.context = context; + fid_socket->socket.fid.ops = &psmx_fi_ops; + fid_socket->socket.ops = &psmx_sock_ops; + fid_socket->socket.cm = &psmx_cm_ops; + fid_socket->socket.tagged = &psmx_tagged_ops; + + if (info) + fid_socket->flags = info->flags; + + *fid = &fid_socket->socket.fid; + + return 0; +} + diff --git a/prov/psm/src/psmx_tagged.c b/prov/psm/src/psmx_tagged.c new file mode 100644 index 00000000000..a94b672673b --- /dev/null +++ b/prov/psm/src/psmx_tagged.c @@ -0,0 +1,173 @@ +/* + * Copyright (c) 2013 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenFabrics.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "psmx.h" + +static ssize_t psmx_tagged_recv(fid_t fid, void *buf, size_t len, + be64_t tag, be64_t mask, void *context) +{ + return -ENOSYS; +} + +static ssize_t psmx_tagged_recvv(fid_t fid, const void *iov, size_t len, + be64_t tag, be64_t mask, void *context) +{ + return -ENOSYS; +} + +static ssize_t psmx_tagged_recvfrom(fid_t fid, void *buf, size_t len, + const void *src_addr, + be64_t tag, be64_t mask, void *context) +{ + struct psmx_fid_socket *fid_socket; + psm_mq_req_t psm_req; + int err; + + fid_socket = container_of(fid, struct psmx_fid_socket, socket.fid); + assert(fid_socket->domain); + + err = psm_mq_irecv(fid_socket->domain->psm_mq, tag, ~mask, 0, /* flags */ + buf, len, context, &psm_req); + if (err != PSM_OK) + return psmx_errno(err); + + if (fid_socket->flags & (FI_BUFFERED_RECV | FI_CANCEL)) + ((struct fi_context *)context)->internal[0] = psm_req; + + return 0; +} + +static ssize_t psmx_tagged_recvmsg(fid_t fid, const struct fi_msg_tagged *msg, + uint64_t flags) +{ + return -ENOSYS; +} + +static ssize_t psmx_tagged_send(fid_t fid, const void *buf, size_t len, + be64_t tag, void *context) +{ + return -ENOSYS; +} + +static ssize_t psmx_tagged_sendv(fid_t fid, const void *iov, size_t len, + be64_t tag, void *context) +{ + return -ENOSYS; +} + +static ssize_t psmx_tagged_sendto(fid_t fid, const void *buf, size_t len, + const void *dest_addr, + be64_t tag, void *context) +{ + struct psmx_fid_socket *fid_socket; + int nonblocking; + int send_flag; + psm_epaddr_t psm_epaddr; + psm_mq_req_t psm_req; + int err; + int flags; + + fid_socket = container_of(fid, struct psmx_fid_socket, socket.fid); + assert(fid_socket->domain); + + psm_epaddr = (psm_epaddr_t) dest_addr; + + flags = fid_socket->flags; + + nonblocking = !!(flags & FI_NONBLOCK); + send_flag = (flags & FI_ACK) ? 
PSM_MQ_FLAG_SENDSYNC : 0; + + if (nonblocking) { + err = psm_mq_isend(fid_socket->domain->psm_mq, psm_epaddr, + send_flag, tag, buf, len, context, &psm_req); + + if (flags & (FI_BUFFERED_RECV | FI_CANCEL)) + ((struct fi_context *)context)->internal[0] = NULL; + /* send cannot be canceled */ + return 0; + } else { + err = psm_mq_send(fid_socket->domain->psm_mq, psm_epaddr, + send_flag, tag, buf, len); + if (err == PSM_OK) + return len; + else + return psmx_errno(err); + } +} + +static ssize_t psmx_tagged_sendmsg(fid_t fid, const struct fi_msg_tagged *msg, + uint64_t flags) +{ + return -ENOSYS; +} + +static ssize_t psmx_tagged_search(fid_t fid, be64_t *tag, be64_t mask, + uint64_t flags, void *src_addr, + size_t *src_addrlen, size_t *len, + void *context) +{ + struct psmx_fid_socket *fid_socket; + psm_mq_status_t psm_status; + int err; + + fid_socket = container_of(fid, struct psmx_fid_socket, socket.fid); + assert(fid_socket->domain); + + err = psm_mq_iprobe(fid_socket->domain->psm_mq, *tag, ~mask, &psm_status); + switch (err) { + case PSM_OK: + *tag = psm_status.msg_tag; + *len = psm_status.msg_length; + /* FIXME: fill in src_addr and src_addrlen */ + return 1; + + case PSM_MQ_NO_COMPLETIONS: + return -FI_ENOMSG; + + default: + return psmx_errno(err); + } +} + +struct fi_ops_tagged psmx_tagged_ops = { + .size = sizeof(struct fi_ops_tagged), + .recv = psmx_tagged_recv, + .recvv = psmx_tagged_recvv, + .recvfrom = psmx_tagged_recvfrom, + .recvmsg = psmx_tagged_recvmsg, + .send = psmx_tagged_send, + .sendv = psmx_tagged_sendv, + .sendto = psmx_tagged_sendto, + .sendmsg = psmx_tagged_sendmsg, + .search = psmx_tagged_search, +}; + diff --git a/prov/psm/src/psmx_util.c b/prov/psm/src/psmx_util.c new file mode 100644 index 00000000000..41d71f26885 --- /dev/null +++ b/prov/psm/src/psmx_util.c @@ -0,0 +1,270 @@ +/* + * Copyright (c) 2013 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenFabrics.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "psmx.h" + +void psmx_string_to_uuid(char *s, psm_uuid_t uuid) +{ + int n; + + n = sscanf(s, + "%2hhx%2hhx%2hhx%2hhx-" + "%2hhx%2hhx-%2hhx%2hhx-%2hhx%2hhx-" + "%2hhx%2hhx%2hhx%2hhx%2hhx%2hhx", + &uuid[0], &uuid[1], &uuid[2], &uuid[3], + &uuid[4], &uuid[5], &uuid[6], &uuid[7], &uuid[8], &uuid[9], + &uuid[10], &uuid[11], &uuid[12], &uuid[13], &uuid[14], &uuid[15]); + + if (n != 16) { + fprintf(stderr, "%s: wrong uuid format: %s\n", __func__, s); + fprintf(stderr, "%s: correct uuid format is: " + "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx\n", + __func__); + } +} + +int psmx_uuid_to_port(psm_uuid_t uuid) +{ + uint16_t port; + uint16_t *u = (uint16_t *)uuid; + + port = u[0] + u[1] + u[2] + u[3] + u[4] + u[5] + u[6] + u[7]; + if (port < 4096) + port += 4096; + + return (int)port; +} + +static void psmx_name_server_cleanup(void *args) +{ + close((int)(uintptr_t)args); +} + +/************************************************************* + * A simple name resolution mechanism for client-server style + * applications. The server side has to run first. The client + * side then passes the server name as the first parameter + * of fi_getinfo call and the resulting provider info should + * have the transport address of the server in the dst_addr + * field. Both side has to use the same UUID. + *************************************************************/ +void *psmx_name_server(void *args) +{ + struct psmx_fid_domain *fid_domain; + struct addrinfo hints = { + .ai_flags = AI_PASSIVE, + .ai_family = AF_UNSPEC, + .ai_socktype = SOCK_STREAM + }; + struct addrinfo *res, *p; + char *service; + int listenfd = -1, connfd; + int port; + int n; + + fid_domain = args; + port = fid_domain->ns_port; + + pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, NULL); + + if (asprintf(&service, "%d", port) < 0) + return NULL; + + n = getaddrinfo(NULL, service, &hints, &res); + if (n < 0) { + fprintf(stderr, "%s: port %d: %s\n", __func__, port, gai_strerror(n)); + free(service); + return NULL; + } + + for (p=res; p; p=p->ai_next) { + listenfd = socket(p->ai_family, p->ai_socktype, p->ai_protocol); + if (listenfd >= 0) { + n = 1; + setsockopt(listenfd, SOL_SOCKET, SO_REUSEADDR, &n, sizeof(n)); + if (!bind(listenfd, p->ai_addr, p->ai_addrlen)) + break; + close(listenfd); + listenfd = -1; + } + } + + freeaddrinfo(res); + free(service); + + if (listenfd < 0) { + fprintf(stderr, "%s: couldn't listen to port %d\n", __func__, port); + return NULL; + } + + listen(listenfd, 256); + + pthread_cleanup_push(psmx_name_server_cleanup, (void *)(uintptr_t)listenfd); + { + pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL); + pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, NULL); + + while (1) { + connfd = accept(listenfd, NULL, 0); + if (connfd >= 0) { + pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, NULL); + write(connfd, &fid_domain->psm_epid, sizeof(psm_epid_t)); + close(connfd); + pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL); + } + } + } + pthread_cleanup_pop(1); + + return NULL; +} + +void *psmx_resolve_name(char *servername, psm_uuid_t uuid) +{ + struct addrinfo hints = { + .ai_family = AF_UNSPEC, + .ai_socktype = SOCK_STREAM + }; + struct addrinfo *res, *p; + char *service; + void *dst_addr; + int sockfd = -1; + int port; + int n; + + port = psmx_uuid_to_port(uuid); + + if (asprintf(&service, "%d", port) < 0) + return NULL; + + n = getaddrinfo(servername, service, &hints, &res); + if (n < 0) { + fprintf(stderr, "%s:(%s:%d):%s\n", __func__, servername, port, gai_strerror(n)); + free(service); + return NULL; + } + + 
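+	/*
+	 * Client side of the name service described above: connect to the
+	 * server's listener (the port is derived from the shared UUID) and
+	 * read back its psm_epid_t. The buffer returned here is what ends
+	 * up in the dst_addr field of the fi_info returned by fi_getinfo.
+	 */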
for (p = res; p; p = p->ai_next) { + sockfd = socket(p->ai_family, p->ai_socktype, p->ai_protocol); + if (sockfd >= 0) { + if (!connect(sockfd, p->ai_addr, p->ai_addrlen)) + break; + close(sockfd); + sockfd = -1; + } + } + + freeaddrinfo(res); + free(service); + + if (sockfd < 0) { + fprintf(stderr, "%s: couldn't connect to %s:%d\n", __func__, servername, port); + return NULL; + } + + dst_addr = calloc(1,sizeof(*dst_addr)); + if (!dst_addr) { + close(sockfd); + return NULL; + } + + if (read(sockfd, dst_addr, sizeof(psm_epid_t)) != sizeof(psm_epid_t)) { + perror(__func__); + free(dst_addr); + close(sockfd); + return NULL; + } + + close(sockfd); + + return dst_addr; +} + +static int psmx_errno_table[PSM_ERROR_LAST] = { + 0, /* PSM_OK = 0 */ + 0, /* PSM_OK_NO_PROGRESS = 1 */ + -FI_EOTHER, + -FI_EINVAL, /* PSM_PARAM_ERR = 3 */ + -FI_ENOMEM, /* PSM_NO_MEMORY = 4 */ + -FI_EBADF, /* PSM_INIT_NOT_INIT = 5 */ + -FI_EINVAL, /* PSM_INIT_BAD_API_VERSION = 6 */ + -FI_ENOSYS, /* PSM_NO_AFFINITY = 7 */ + -FI_EIO, /* PSM_INTERNAL_ERR = 8 */ + -FI_EINVAL, /* PSM_SHMEM_SEGMENT_ERR = 9 */ + -FI_EACCES, /* PSM_OPT_READONLY = 10 */ + -FI_ETIMEDOUT, /* PSM_TIMEOUT = 11 */ + -FI_EMFILE, /* PSM_TOO_MANY_ENDPOINTS = 12 */ + -FI_ESHUTDOWN, /* PSM_IS_FINALIZED = 13 */ + -FI_EOTHER, -FI_EOTHER, -FI_EOTHER, -FI_EOTHER, -FI_EOTHER, -FI_EOTHER, + -FI_ESHUTDOWN, /* PSM_EP_WAS_CLOSED = 20 */ + -FI_ENODEV, /* PSM_EP_NO_DEVICE = 21 */ + -FI_ENOENT, /* PSM_EP_UNIT_NOT_FOUND = 22 */ + -FI_EIO, /* PSM_EP_DEVICE_FAILURE = 23 */ + -FI_ETIMEDOUT, /* PSM_EP_CLOSE_TIMEOUT = 24 */ + -FI_ENOENT, /* PSM_EP_NO_PORTS_AVAIL = 25 */ + -FI_ENETDOWN, /* PSM_EP_NO_NETWORK = 26 */ + -FI_EINVAL, /* PSM_EP_INVALID_UUID_KEY = 27 */ + -FI_ENOSPC, /* PSM_EP_NO_RESOURCES = 28 */ + -FI_EOTHER, -FI_EOTHER, -FI_EOTHER, -FI_EOTHER, -FI_EOTHER, -FI_EOTHER, + -FI_EOTHER, -FI_EOTHER, -FI_EOTHER, -FI_EOTHER, -FI_EOTHER, + -FI_EBADF, /* PSM_EPID_UNKNOWN = 40 */ + -FI_ENETUNREACH,/* PSM_EPID_UNREACHABLE = 41 */ + -FI_EOTHER, + -FI_EINVAL, /* PSM_EPID_INVALID_NODE = 43 */ + -FI_EINVAL, /* PSM_EPID_INVALID_MTU = 44 */ + -FI_EINVAL, /* PSM_EPID_INVALID_UUID_KEY = 45 */ + -FI_EINVAL, /* PSM_EPID_INVALID_VERSION = 46 */ + -FI_EINVAL, /* PSM_EPID_INVALID_CONNECT = 47 */ + -FI_EISCONN, /* PSM_EPID_ALREADY_CONNECTED = 48 */ + -FI_EIO, /* PSM_EPID_NETWORK_ERROR = 49 */ + -FI_EINVAL, /* PSM_EPID_INVALID_PKEY = 50 */ + -FI_ENETUNREACH,/* PSM_EPID_PATH_RESOLUTION = 51 */ + -FI_EOTHER, -FI_EOTHER, -FI_EOTHER, -FI_EOTHER, -FI_EOTHER, -FI_EOTHER, + -FI_EOTHER, -FI_EOTHER, + -FI_EAGAIN, /* PSM_MQ_NO_COMPLETIONS = 60 */ + -FI_EMSGSIZE, /* PSM_MQ_TRUNCATION = 61 */ + -FI_EOTHER, -FI_EOTHER, -FI_EOTHER, -FI_EOTHER, -FI_EOTHER, -FI_EOTHER, + -FI_EOTHER, -FI_EOTHER, + -FI_EINVAL, /* PSM_AM_INVALID_REPLY = 70 */ + -FI_EOTHER, -FI_EOTHER, -FI_EOTHER, -FI_EOTHER, -FI_EOTHER, -FI_EOTHER, + -FI_EOTHER, -FI_EOTHER, -FI_EOTHER + /* PSM_ERROR_LAST = 80 */ +}; + +int psmx_errno(int err) +{ + if (err >= 0 && err < PSM_ERROR_LAST) + return psmx_errno_table[err]; + else + return -FI_EOTHER; +} + diff --git a/prov/rdmacm/AUTHORS b/prov/rdmacm/AUTHORS new file mode 100644 index 00000000000..f76b870b4db --- /dev/null +++ b/prov/rdmacm/AUTHORS @@ -0,0 +1 @@ +Sean Hefty <sean.hefty@intel.com> diff --git a/prov/rdmacm/COPYING b/prov/rdmacm/COPYING new file mode 100644 index 00000000000..39f3831585f --- /dev/null +++ b/prov/rdmacm/COPYING @@ -0,0 +1,378 @@ +This software is available to you under a choice of one of two +licenses. 
You may choose to be licensed under the terms of the the +OpenIB.org BSD license or the GNU General Public License (GPL) Version +2, both included below. + +Copyright (c) 2005 Intel Corporation. All rights reserved. + +================================================================== + + OpenIB.org BSD license + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + +================================================================== + + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc. + 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Library General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. 
+ + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. 
+ + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. 
+ +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. 
If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. 
+ + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) year name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + <signature of Ty Coon>, 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Library General +Public License instead of this License. diff --git a/prov/rdmacm/examples/common.c b/prov/rdmacm/examples/common.c new file mode 100644 index 00000000000..2d10ea1262d --- /dev/null +++ b/prov/rdmacm/examples/common.c @@ -0,0 +1,168 @@ +/* + * Copyright (c) 2005-2006,2012 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * $Id$ + */ + +#include <stdlib.h> +#include <string.h> +#include <stdio.h> +#include <errno.h> +#include <sys/types.h> +#include <netinet/in.h> +#include <sys/socket.h> +#include <netdb.h> +#include <byteswap.h> + +#include <rdma/rdma_cma.h> +#include "common.h" + +int use_rs = 1; + +/* + * rdma_getaddrinfo is not exported by libfabric at this time + */ +//int get_rdma_addr(char *src, char *dst, char *port, +// struct rdma_addrinfo *hints, struct rdma_addrinfo **rai) +//{ +// struct rdma_addrinfo rai_hints, *res; +// int ret; +// +// if (hints->ai_flags & RAI_PASSIVE) +// return rdma_getaddrinfo(src, port, hints, rai); +// +// rai_hints = *hints; +// if (src) { +// rai_hints.ai_flags |= RAI_PASSIVE; +// ret = rdma_getaddrinfo(src, NULL, &rai_hints, &res); +// if (ret) +// return ret; +// +// rai_hints.ai_src_addr = res->ai_src_addr; +// rai_hints.ai_src_len = res->ai_src_len; +// rai_hints.ai_flags &= ~RAI_PASSIVE; +// } +// +// ret = rdma_getaddrinfo(dst, port, &rai_hints, rai); +// if (src) +// rdma_freeaddrinfo(res); +// +// return ret; +//} + +void size_str(char *str, size_t ssize, long long size) +{ + long long base, fraction = 0; + char mag; + + if (size >= (1 << 30)) { + base = 1 << 30; + mag = 'g'; + } else if (size >= (1 << 20)) { + base = 1 << 20; + mag = 'm'; + } else if (size >= (1 << 10)) { + base = 1 << 10; + mag = 'k'; + } else { + base = 1; + mag = '\0'; + } + + if (size / base < 10) + fraction = (size % base) * 10 / base; + if (fraction) { + snprintf(str, ssize, "%lld.%lld%c", size / base, fraction, mag); + } else { + snprintf(str, ssize, "%lld%c", size / base, mag); + } +} + +void cnt_str(char *str, size_t ssize, long long cnt) +{ + if (cnt >= 1000000000) + snprintf(str, ssize, "%lldb", cnt / 1000000000); + else if (cnt >= 1000000) + snprintf(str, ssize, "%lldm", cnt / 1000000); + else if (cnt >= 1000) + snprintf(str, ssize, "%lldk", cnt / 1000); + else + snprintf(str, ssize, "%lld", cnt); +} + +int size_to_count(int size) +{ + if (size >= (1 << 20)) + return 100; + else if (size >= (1 << 16)) + return 1000; + else if (size >= (1 << 10)) + return 10000; + else + return 100000; +} + +void format_buf(void *buf, int size) +{ + uint8_t *array = buf; + static uint8_t data; + int i; + + for (i = 0; i < size; i++) + array[i] = data++; +} + +int verify_buf(void *buf, int size) +{ + static long long total_bytes; + uint8_t *array = buf; + static uint8_t data; + int i; + + for (i = 0; i < size; i++, total_bytes++) { + if (array[i] != data++) { + printf("data verification failed byte %lld\n", total_bytes); + return -1; + } + } + return 0; +} + +int do_poll(struct pollfd *fds, int timeout) +{ + int ret; + + do { + ret = rs_poll(fds, 1, timeout); + } while (!ret); + + return ret == 1 ? 0 : ret; +} diff --git a/prov/rdmacm/examples/common.h b/prov/rdmacm/examples/common.h new file mode 100644 index 00000000000..f7511f03969 --- /dev/null +++ b/prov/rdmacm/examples/common.h @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2005-2012 Intel Corporation. All rights reserved. 
+ * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * $Id$ + */ + +#include <stdlib.h> +#include <sys/types.h> +#include <byteswap.h> +#include <poll.h> + +#include <rdma/rdma_cma.h> +#include <rdma/rsocket.h> +#include <infiniband/ib.h> + +#if __BYTE_ORDER == __BIG_ENDIAN +static inline uint64_t cpu_to_be64(uint64_t x) { return x; } +static inline uint32_t cpu_to_be32(uint32_t x) { return x; } +#else +static inline uint64_t cpu_to_be64(uint64_t x) { return bswap_64(x); } +static inline uint32_t cpu_to_be32(uint32_t x) { return bswap_32(x); } +#endif + +extern int use_rs; + +#define rs_socket(f,t,p) use_rs ? rsocket(f,t,p) : socket(f,t,p) +#define rs_bind(s,a,l) use_rs ? rbind(s,a,l) : bind(s,a,l) +#define rs_listen(s,b) use_rs ? rlisten(s,b) : listen(s,b) +#define rs_connect(s,a,l) use_rs ? rconnect(s,a,l) : connect(s,a,l) +#define rs_accept(s,a,l) use_rs ? raccept(s,a,l) : accept(s,a,l) +#define rs_shutdown(s,h) use_rs ? rshutdown(s,h) : shutdown(s,h) +#define rs_close(s) use_rs ? rclose(s) : close(s) +#define rs_recv(s,b,l,f) use_rs ? rrecv(s,b,l,f) : recv(s,b,l,f) +#define rs_send(s,b,l,f) use_rs ? rsend(s,b,l,f) : send(s,b,l,f) +#define rs_recvfrom(s,b,l,f,a,al) \ + use_rs ? rrecvfrom(s,b,l,f,a,al) : recvfrom(s,b,l,f,a,al) +#define rs_sendto(s,b,l,f,a,al) \ + use_rs ? rsendto(s,b,l,f,a,al) : sendto(s,b,l,f,a,al) +#define rs_poll(f,n,t) use_rs ? rpoll(f,n,t) : poll(f,n,t) +#define rs_fcntl(s,c,p) use_rs ? rfcntl(s,c,p) : fcntl(s,c,p) +#define rs_setsockopt(s,l,n,v,ol) \ + use_rs ? rsetsockopt(s,l,n,v,ol) : setsockopt(s,l,n,v,ol) +#define rs_getsockopt(s,l,n,v,ol) \ + use_rs ? 
rgetsockopt(s,l,n,v,ol) : getsockopt(s,l,n,v,ol) + +union socket_addr { + struct sockaddr sa; + struct sockaddr_in sin; + struct sockaddr_in6 sin6; +}; + +enum rs_optimization { + opt_mixed, + opt_latency, + opt_bandwidth +}; + +int get_rdma_addr(char *src, char *dst, char *port, + struct rdma_addrinfo *hints, struct rdma_addrinfo **rai); + +void size_str(char *str, size_t ssize, long long size); +void cnt_str(char *str, size_t ssize, long long cnt); +int size_to_count(int size); +void format_buf(void *buf, int size); +int verify_buf(void *buf, int size); +int do_poll(struct pollfd *fds, int timeout); diff --git a/prov/rdmacm/examples/rcopy.c b/prov/rdmacm/examples/rcopy.c new file mode 100644 index 00000000000..152acef2359 --- /dev/null +++ b/prov/rdmacm/examples/rcopy.c @@ -0,0 +1,628 @@ +/* + * Copyright (c) 2011 Intel Corporation. All rights reserved. + * + * This software is available to you under the OpenIB.org BSD license + * below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AWV + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <strings.h> +#include <errno.h> +#include <getopt.h> +#include <arpa/inet.h> +#include <sys/mman.h> +#include <sys/types.h> +#include <sys/socket.h> +#include <sys/time.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <netdb.h> +#include <unistd.h> + +#include <rdma/rsocket.h> + +union rsocket_address { + struct sockaddr sa; + struct sockaddr_in sin; + struct sockaddr_in6 sin6; + struct sockaddr_storage storage; +}; + +static char *port = "7427"; +static char *dst_addr; +static char *dst_file; +static char *src_file; +static struct timeval start, end; +//static void buf[1024 * 1024]; +static uint64_t bytes; +int fd; +void *file_addr; + +enum { + CMD_NOOP, + CMD_OPEN, + CMD_CLOSE, + CMD_WRITE, + CMD_RESP = 0x80, +}; + +/* TODO: handle byte swapping */ +struct msg_hdr { + uint8_t version; + uint8_t command; + uint16_t len; + uint32_t data; + uint64_t id; +}; + +struct msg_open { + struct msg_hdr hdr; + char path[0]; +}; + +struct msg_write { + struct msg_hdr hdr; + uint64_t size; +}; + +static void show_perf(void) +{ + float usec; + + usec = (end.tv_sec - start.tv_sec) * 1000000 + (end.tv_usec - start.tv_usec); + + printf("%lld bytes in %.2f seconds = %.2f Gb/sec\n", + (long long) bytes, usec / 1000000., (bytes * 8) / (1000. 
* usec)); +} + +static char *_ntop(union rsocket_address *rsa) +{ + static char addr[32]; + + switch (rsa->sa.sa_family) { + case AF_INET: + inet_ntop(AF_INET, &rsa->sin.sin_addr, addr, sizeof addr); + break; + case AF_INET6: + inet_ntop(AF_INET6, &rsa->sin6.sin6_addr, addr, sizeof addr); + break; + default: + addr[0] = '\0'; + break; + } + + return addr; +} + +static size_t _recv(int rs, char *msg, size_t len) +{ + size_t ret, offset; + + for (offset = 0; offset < len; offset += ret) { + ret = rrecv(rs, msg + offset, len - offset, 0); + if (ret <= 0) + return ret; + } + + return len; +} + +static int msg_recv_hdr(int rs, struct msg_hdr *hdr) +{ + int ret; + + ret = _recv(rs, (char *) hdr, sizeof *hdr); + if (ret != sizeof *hdr) + return -1; + + if (hdr->version || hdr->len < sizeof *hdr) { + printf("invalid version %d or length %d\n", + hdr->version, hdr->len); + return -1; + } + + return sizeof *hdr; +} + +static int msg_get_resp(int rs, struct msg_hdr *msg, uint8_t cmd) +{ + int ret; + + ret = msg_recv_hdr(rs, msg); + if (ret != sizeof *msg) + return ret; + + if ((msg->len != sizeof *msg) || (msg->command != (cmd | CMD_RESP))) { + printf("invalid length %d or bad command response %x:%x\n", + msg->len, msg->command, cmd | CMD_RESP); + return -1; + } + + return msg->data; +} + +static void msg_send_resp(int rs, struct msg_hdr *msg, uint32_t status) +{ + struct msg_hdr resp; + + resp.version = 0; + resp.command = msg->command | CMD_RESP; + resp.len = sizeof resp; + resp.data = status; + resp.id = msg->id; + rsend(rs, (char *) &resp, sizeof resp, 0); +} + +static int server_listen(void) +{ + struct addrinfo hints, *res; + int ret, rs; + + memset(&hints, 0, sizeof hints); + hints.ai_flags = RAI_PASSIVE; + ret = getaddrinfo(NULL, port, &hints, &res); + if (ret) { + perror("getaddrinfo failed\n"); + return ret; + } + + rs = rsocket(res->ai_family, res->ai_socktype, res->ai_protocol); + if (rs < 0) { + perror("rsocket failed\n"); + ret = rs; + goto free; + } + + ret = 1; + ret = rsetsockopt(rs, SOL_SOCKET, SO_REUSEADDR, &ret, sizeof ret); + if (ret) { + perror("rsetsockopt failed"); + goto close; + } + + ret = rbind(rs, res->ai_addr, res->ai_addrlen); + if (ret) { + perror("rbind failed"); + goto close; + } + + ret = rlisten(rs, 1); + if (ret) { + perror("rlisten failed"); + goto close; + } + + ret = rs; + goto free; + +close: + rclose(rs); +free: + freeaddrinfo(res); + return ret; +} + +static int server_open(int rs, struct msg_hdr *msg) +{ + char *path = NULL; + int ret, len; + + printf("opening: "); + fflush(NULL); + if (file_addr || fd > 0) { + printf("cannot open another file\n"); + ret = EBUSY; + goto out; + } + + len = msg->len - sizeof *msg; + path = malloc(len); + if (!path) { + printf("cannot allocate path name\n"); + ret = ENOMEM; + goto out; + } + + ret = _recv(rs, path, len); + if (ret != len) { + printf("error receiving path\n"); + goto out; + } + + printf("%s, ", path); + fflush(NULL); + fd = open(path, O_RDWR | O_CREAT | O_TRUNC, msg->data); + if (fd < 0) { + printf("unable to open destination file\n"); + ret = errno; + } + + ret = 0; +out: + if (path) + free(path); + + msg_send_resp(rs, msg, ret); + return ret; +} + +static void server_close(int rs, struct msg_hdr *msg) +{ + printf("closing..."); + fflush(NULL); + msg_send_resp(rs, msg, 0); + + if (file_addr) { + munmap(file_addr, bytes); + file_addr = 0; + } + + if (fd > 0) { + close(fd); + fd = 0; + } + printf("done\n"); +} + +static int server_write(int rs, struct msg_hdr *msg) +{ + size_t len; + int ret; + + 
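+	/*
+	 * CMD_WRITE handler: the message header has already been consumed by
+	 * server_process(), so read the 64-bit transfer size that follows,
+	 * grow the destination file to that size, map it writable, and
+	 * receive the file contents directly into the mapping before
+	 * acknowledging the command with msg_send_resp().
+	 */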
printf("transferring"); + fflush(NULL); + if (fd <= 0) { + printf("...file not opened\n"); + ret = EINVAL; + goto out; + } + + if (msg->len != sizeof(struct msg_write)) { + printf("...invalid message length %d\n", msg->len); + ret = EINVAL; + goto out; + } + + ret = _recv(rs, (char *) &bytes, sizeof bytes); + if (ret != sizeof bytes) + goto out; + + ret = ftruncate(fd, bytes); + if (ret) + goto out; + + file_addr = mmap(NULL, bytes, PROT_WRITE, MAP_SHARED, fd, 0); + if (file_addr == (void *) -1) { + printf("...error mapping file\n"); + ret = errno; + goto out; + } + + printf("...%lld bytes...", (long long) bytes); + fflush(NULL); + len = _recv(rs, file_addr, bytes); + if (len != bytes) { + printf("...error receiving data\n"); + ret = (int) len; + } +out: + msg_send_resp(rs, msg, ret); + return ret; +} + +static void server_process(int rs) +{ + struct msg_hdr msg; + int ret; + + do { + ret = msg_recv_hdr(rs, &msg); + if (ret != sizeof msg) + break; + + switch (msg.command) { + case CMD_OPEN: + ret = server_open(rs, &msg); + break; + case CMD_CLOSE: + server_close(rs, &msg); + ret = 0; + break; + case CMD_WRITE: + ret = server_write(rs, &msg); + break; + default: + msg_send_resp(rs, &msg, EINVAL); + ret = -1; + break; + } + + } while (!ret); +} + +static int server_run(void) +{ + int lrs, rs; + union rsocket_address rsa; + socklen_t len; + + lrs = server_listen(); + if (lrs < 0) + return lrs; + + while (1) { + len = sizeof rsa; + printf("waiting for connection..."); + fflush(NULL); + rs = raccept(lrs, &rsa.sa, &len); + + printf("client: %s\n", _ntop(&rsa)); + server_process(rs); + + rshutdown(rs, SHUT_RDWR); + rclose(rs); + } + return 0; +} + +static int client_connect(void) +{ + struct addrinfo *res; + int ret, rs; + + ret = getaddrinfo(dst_addr, port, NULL, &res); + if (ret) { + perror("getaddrinfo failed\n"); + return ret; + } + + rs = rsocket(res->ai_family, res->ai_socktype, res->ai_protocol); + if (rs < 0) { + perror("rsocket failed\n"); + goto free; + } + + ret = rconnect(rs, res->ai_addr, res->ai_addrlen); + if (ret) { + perror("rconnect failed\n"); + rclose(rs); + rs = ret; + } + +free: + freeaddrinfo(res); + return rs; +} + +static int client_open(int rs) +{ + struct msg_open *msg; + struct stat stats; + uint32_t len; + int ret; + + printf("opening..."); + fflush(NULL); + fd = open(src_file, O_RDONLY); + if (fd < 0) + return fd; + + ret = fstat(fd, &stats); + if (ret < 0) + goto err1; + + bytes = (uint64_t) stats.st_size; + file_addr = mmap(NULL, bytes, PROT_READ, MAP_SHARED, fd, 0); + if (file_addr == (void *) -1) { + ret = errno; + goto err1; + } + + len = (((uint32_t) strlen(dst_file)) + 8) & 0xFFFFFFF8; + msg = calloc(1, sizeof(*msg) + len); + if (!msg) { + ret = -1; + goto err2; + } + + msg->hdr.command = CMD_OPEN; + msg->hdr.len = sizeof(*msg) + len; + msg->hdr.data = (uint32_t) stats.st_mode; + strcpy(msg->path, dst_file); + ret = rsend(rs, msg, msg->hdr.len, 0); + if (ret != msg->hdr.len) + goto err3; + + ret = msg_get_resp(rs, &msg->hdr, CMD_OPEN); + if (ret) + goto err3; + + return 0; + +err3: + free(msg); +err2: + munmap(file_addr, bytes); +err1: + close(fd); + return ret; +} + +static int client_start_write(int rs) +{ + struct msg_write msg; + int ret; + + printf("transferring"); + fflush(NULL); + memset(&msg, 0, sizeof msg); + msg.hdr.command = CMD_WRITE; + msg.hdr.len = sizeof(msg); + msg.size = bytes; + + ret = rsend(rs, &msg, sizeof msg, 0); + if (ret != msg.hdr.len) + return ret; + + return 0; +} + +static int client_close(int rs) +{ + struct msg_hdr msg; + int 
ret; + + printf("closing..."); + fflush(NULL); + memset(&msg, 0, sizeof msg); + msg.command = CMD_CLOSE; + msg.len = sizeof msg; + ret = rsend(rs, (char *) &msg, msg.len, 0); + if (ret != msg.len) + goto out; + + ret = msg_get_resp(rs, &msg, CMD_CLOSE); + if (ret) + goto out; + + printf("done\n"); +out: + munmap(file_addr, bytes); + close(fd); + return ret; +} + +static int client_run(void) +{ + struct msg_hdr ack; + int ret, rs; + size_t len; + + rs = client_connect(); + if (rs < 0) + return rs; + + ret = client_open(rs); + if (ret) + goto shutdown; + + ret = client_start_write(rs); + if (ret) + goto close; + + printf("..."); + fflush(NULL); + gettimeofday(&start, NULL); + len = rsend(rs, file_addr, bytes, 0); + if (len == bytes) + ret = msg_get_resp(rs, &ack, CMD_WRITE); + else + ret = (int) len; + + gettimeofday(&end, NULL); + +close: + client_close(rs); +shutdown: + rshutdown(rs, SHUT_RDWR); + rclose(rs); + if (!ret) + show_perf(); + return ret; +} + +static void show_usage(char *program) +{ + printf("usage 1: %s [options]\n", program); + printf("\t starts the server application\n"); + printf("\t[-p port_number]\n"); + printf("usage 2: %s source server[:destination] [options]\n", program); + printf("\t source - file name and path\n"); + printf("\t server - name or address\n"); + printf("\t destination - file name and path\n"); + printf("\t[-p port_number]\n"); + exit(1); +} + +static void server_opts(int argc, char **argv) +{ + int op; + + while ((op = getopt(argc, argv, "p:")) != -1) { + switch (op) { + case 'p': + port = optarg; + break; + default: + show_usage(argv[0]); + } + } +} + +static void client_opts(int argc, char **argv) +{ + int op; + + if (argc < 3) + show_usage(argv[0]); + + src_file = argv[1]; + dst_addr = argv[2]; + dst_file = strchr(dst_addr, ':'); + if (dst_file) { + *dst_file = '\0'; + dst_file++; + } + if (!dst_file) + dst_file = src_file; + + while ((op = getopt(argc, argv, "p:")) != -1) { + switch (op) { + case 'p': + port = optarg; + break; + default: + show_usage(argv[0]); + } + } + +} + +int main(int argc, char **argv) +{ + int ret; + + if (argc == 1 || argv[1][0] == '-') { + server_opts(argc, argv); + ret = server_run(); + } else { + client_opts(argc, argv); + ret = client_run(); + } + + return ret; +} diff --git a/prov/rdmacm/examples/riostream.c b/prov/rdmacm/examples/riostream.c new file mode 100644 index 00000000000..a1d36718aed --- /dev/null +++ b/prov/rdmacm/examples/riostream.c @@ -0,0 +1,639 @@ +/* + * Copyright (c) 2011-2012 Intel Corporation. All rights reserved. + * + * This software is available to you under the OpenIB.org BSD license + * below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AWV + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <strings.h> +#include <errno.h> +#include <getopt.h> +#include <sys/types.h> +#include <sys/socket.h> +#include <sys/time.h> +#include <sys/wait.h> +#include <netdb.h> +#include <fcntl.h> +#include <unistd.h> +#include <netinet/in.h> +#include <netinet/tcp.h> + +#include <rdma/rdma_cma.h> +#include <rdma/rsocket.h> +#include "common.h" + +struct test_size_param { + int size; + int option; +}; + +static struct test_size_param test_size[] = { + { 1 << 6, 0 }, + { 1 << 7, 1 }, { (1 << 7) + (1 << 6), 1}, + { 1 << 8, 1 }, { (1 << 8) + (1 << 7), 1}, + { 1 << 9, 1 }, { (1 << 9) + (1 << 8), 1}, + { 1 << 10, 1 }, { (1 << 10) + (1 << 9), 1}, + { 1 << 11, 1 }, { (1 << 11) + (1 << 10), 1}, + { 1 << 12, 0 }, { (1 << 12) + (1 << 11), 1}, + { 1 << 13, 1 }, { (1 << 13) + (1 << 12), 1}, + { 1 << 14, 1 }, { (1 << 14) + (1 << 13), 1}, + { 1 << 15, 1 }, { (1 << 15) + (1 << 14), 1}, + { 1 << 16, 0 }, { (1 << 16) + (1 << 15), 1}, + { 1 << 17, 1 }, { (1 << 17) + (1 << 16), 1}, + { 1 << 18, 1 }, { (1 << 18) + (1 << 17), 1}, + { 1 << 19, 1 }, { (1 << 19) + (1 << 18), 1}, + { 1 << 20, 0 }, { (1 << 20) + (1 << 19), 1}, + { 1 << 21, 1 }, { (1 << 21) + (1 << 20), 1}, + { 1 << 22, 1 }, { (1 << 22) + (1 << 21), 1}, +}; +#define TEST_CNT (sizeof test_size / sizeof test_size[0]) + +static int rs, lrs; +static int use_async; +static int verify; +static int flags = MSG_DONTWAIT; +static int poll_timeout = 0; +static int custom; +static enum rs_optimization optimization; +static int size_option; +static int iterations = 1; +static int transfer_size = 1000; +static int transfer_count = 1000; +static int buffer_size; +static char test_name[10] = "custom"; +static char *port = "7471"; +static char *dst_addr; +static char *src_addr; +static struct timeval start, end; +static void *buf; +static volatile uint8_t *poll_byte; + +static void show_perf(void) +{ + char str[32]; + float usec; + long long bytes; + + usec = (end.tv_sec - start.tv_sec) * 1000000 + (end.tv_usec - start.tv_usec); + bytes = (long long) iterations * transfer_count * transfer_size * 2; + + /* name size transfers iterations bytes seconds Gb/sec usec/xfer */ + printf("%-10s", test_name); + size_str(str, sizeof str, transfer_size); + printf("%-8s", str); + cnt_str(str, sizeof str, transfer_count); + printf("%-8s", str); + cnt_str(str, sizeof str, iterations); + printf("%-8s", str); + size_str(str, sizeof str, bytes); + printf("%-8s", str); + printf("%8.2fs%10.2f%11.2f\n", + usec / 1000000., (bytes * 8) / (1000. 
* usec), + (usec / iterations) / (transfer_count * 2)); +} + +static void init_latency_test(int size) +{ + char sstr[5]; + + size_str(sstr, sizeof sstr, size); + snprintf(test_name, sizeof test_name, "%s_lat", sstr); + transfer_count = 1; + transfer_size = size; + iterations = size_to_count(transfer_size); +} + +static void init_bandwidth_test(int size) +{ + char sstr[5]; + + size_str(sstr, sizeof sstr, size); + snprintf(test_name, sizeof test_name, "%s_bw", sstr); + iterations = 1; + transfer_size = size; + transfer_count = size_to_count(transfer_size); +} + +static int send_msg(int size) +{ + struct pollfd fds; + int offset, ret; + + if (verify) + format_buf(buf, size); + + if (use_async) { + fds.fd = rs; + fds.events = POLLOUT; + } + + for (offset = 0; offset < size; ) { + if (use_async) { + ret = do_poll(&fds, poll_timeout); + if (ret) + return ret; + } + + ret = rsend(rs, buf + offset, size - offset, flags); + if (ret > 0) { + offset += ret; + } else if (errno != EWOULDBLOCK && errno != EAGAIN) { + perror("rsend"); + return ret; + } + } + + return 0; +} + +static int send_xfer(int size) +{ + struct pollfd fds; + int offset, ret; + + if (verify) + format_buf(buf, size - 1); + + if (use_async) { + fds.fd = rs; + fds.events = POLLOUT; + } + + for (offset = 0; offset < size; ) { + if (use_async) { + ret = do_poll(&fds, poll_timeout); + if (ret) + return ret; + } + + ret = riowrite(rs, buf + offset, size - offset, offset, flags); + if (ret > 0) { + offset += ret; + } else if (errno != EWOULDBLOCK && errno != EAGAIN) { + perror("riowrite"); + return ret; + } + } + + return 0; +} + +static int recv_msg(int size) +{ + struct pollfd fds; + int offset, ret; + + if (use_async) { + fds.fd = rs; + fds.events = POLLIN; + } + + for (offset = 0; offset < size; ) { + if (use_async) { + ret = do_poll(&fds, poll_timeout); + if (ret) + return ret; + } + + ret = rrecv(rs, buf + offset, size - offset, flags); + if (ret > 0) { + offset += ret; + } else if (errno != EWOULDBLOCK && errno != EAGAIN) { + perror("rrecv"); + return ret; + } + } + + if (verify) { + ret = verify_buf(buf, size); + if (ret) + return ret; + } + + return 0; +} + +static int recv_xfer(int size, uint8_t marker) +{ + int ret; + + while (*poll_byte != marker) + ; + + if (verify) { + ret = verify_buf(buf, size - 1); + if (ret) + return ret; + } + + return 0; +} + +static int sync_test(void) +{ + int ret; + + ret = dst_addr ? send_msg(16) : recv_msg(16); + if (ret) + return ret; + + return dst_addr ? 
recv_msg(16) : send_msg(16); +} + +static int run_test(void) +{ + int ret, i, t; + off_t offset; + uint8_t marker = 0; + + poll_byte = buf + transfer_size - 1; + *poll_byte = -1; + offset = riomap(rs, buf, transfer_size, PROT_WRITE, 0, 0); + if (offset == -1) { + perror("riomap"); + ret = -1; + goto out; + } + ret = sync_test(); + if (ret) + goto out; + + gettimeofday(&start, NULL); + for (i = 0; i < iterations; i++) { + if (dst_addr) { + for (t = 0; t < transfer_count - 1; t++) { + ret = send_xfer(transfer_size); + if (ret) + goto out; + } + *poll_byte = (uint8_t) marker++; + ret = send_xfer(transfer_size); + if (ret) + goto out; + + ret = recv_xfer(transfer_size, marker++); + } else { + ret = recv_xfer(transfer_size, marker++); + if (ret) + goto out; + + for (t = 0; t < transfer_count - 1; t++) { + ret = send_xfer(transfer_size); + if (ret) + goto out; + } + *poll_byte = (uint8_t) marker++; + ret = send_xfer(transfer_size); + } + if (ret) + goto out; + } + gettimeofday(&end, NULL); + show_perf(); + ret = riounmap(rs, buf, transfer_size); + +out: + return ret; +} + +static void set_options(int rs) +{ + int val; + + if (buffer_size) { + rsetsockopt(rs, SOL_SOCKET, SO_SNDBUF, (void *) &buffer_size, + sizeof buffer_size); + rsetsockopt(rs, SOL_SOCKET, SO_RCVBUF, (void *) &buffer_size, + sizeof buffer_size); + } else { + val = 1 << 19; + rsetsockopt(rs, SOL_SOCKET, SO_SNDBUF, (void *) &val, sizeof val); + rsetsockopt(rs, SOL_SOCKET, SO_RCVBUF, (void *) &val, sizeof val); + } + + val = 1; + rsetsockopt(rs, IPPROTO_TCP, TCP_NODELAY, (void *) &val, sizeof(val)); + rsetsockopt(rs, SOL_RDMA, RDMA_IOMAPSIZE, (void *) &val, sizeof val); + + if (flags & MSG_DONTWAIT) + rfcntl(rs, F_SETFL, O_NONBLOCK); + + /* Inline size based on experimental data */ + if (optimization == opt_latency) { + val = 384; + rsetsockopt(rs, SOL_RDMA, RDMA_INLINE, &val, sizeof val); + } else if (optimization == opt_bandwidth) { + val = 0; + rsetsockopt(rs, SOL_RDMA, RDMA_INLINE, &val, sizeof val); + } +} + +static int server_listen(void) +{ + struct addrinfo hints, *res; + int val, ret; + + memset(&hints, 0, sizeof hints); + hints.ai_flags = AI_PASSIVE; + ret = getaddrinfo(src_addr, port, &hints, &res); + if (ret) { + perror("getaddrinfo"); + return ret; + } + + lrs = rsocket(res->ai_family, res->ai_socktype, res->ai_protocol); + if (lrs < 0) { + perror("rsocket"); + ret = lrs; + goto free; + } + + val = 1; + ret = rsetsockopt(lrs, SOL_SOCKET, SO_REUSEADDR, &val, sizeof val); + if (ret) { + perror("rsetsockopt SO_REUSEADDR"); + goto close; + } + + ret = rbind(lrs, res->ai_addr, res->ai_addrlen); + if (ret) { + perror("rbind"); + goto close; + } + + ret = rlisten(lrs, 1); + if (ret) + perror("rlisten"); + +close: + if (ret) + rclose(lrs); +free: + freeaddrinfo(res); + return ret; +} + +static int server_connect(void) +{ + struct pollfd fds; + int ret = 0; + + set_options(lrs); + do { + if (use_async) { + fds.fd = lrs; + fds.events = POLLIN; + + ret = do_poll(&fds, poll_timeout); + if (ret) { + perror("rpoll"); + return ret; + } + } + + rs = raccept(lrs, NULL, 0); + } while (rs < 0 && (errno == EAGAIN || errno == EWOULDBLOCK)); + if (rs < 0) { + perror("raccept"); + return rs; + } + + set_options(rs); + return ret; +} + +static int client_connect(void) +{ + struct addrinfo *res; + struct pollfd fds; + int ret, err; + socklen_t len; + + ret = getaddrinfo(dst_addr, port, NULL, &res); + if (ret) { + perror("getaddrinfo"); + return ret; + } + + rs = rsocket(res->ai_family, res->ai_socktype, res->ai_protocol); + if (rs < 0) { + 
perror("rsocket"); + ret = rs; + goto free; + } + + set_options(rs); + /* TODO: bind client to src_addr */ + + ret = rconnect(rs, res->ai_addr, res->ai_addrlen); + if (ret && (errno != EINPROGRESS)) { + perror("rconnect"); + goto close; + } + + if (ret && (errno == EINPROGRESS)) { + fds.fd = rs; + fds.events = POLLOUT; + ret = do_poll(&fds, poll_timeout); + if (ret) + goto close; + + len = sizeof err; + ret = rgetsockopt(rs, SOL_SOCKET, SO_ERROR, &err, &len); + if (ret) + goto close; + if (err) { + ret = -1; + errno = err; + perror("async rconnect"); + } + } + +close: + if (ret) + rclose(rs); +free: + freeaddrinfo(res); + return ret; +} + +static int run(void) +{ + int i, ret = 0; + + buf = malloc(!custom ? test_size[TEST_CNT - 1].size : transfer_size); + if (!buf) { + perror("malloc"); + return -1; + } + + if (!dst_addr) { + ret = server_listen(); + if (ret) + goto free; + } + + printf("%-10s%-8s%-8s%-8s%-8s%8s %10s%13s\n", + "name", "bytes", "xfers", "iters", "total", "time", "Gb/sec", "usec/xfer"); + if (!custom) { + optimization = opt_latency; + ret = dst_addr ? client_connect() : server_connect(); + if (ret) + goto free; + + for (i = 0; i < TEST_CNT; i++) { + if (test_size[i].option > size_option) + continue; + init_latency_test(test_size[i].size); + run_test(); + } + rshutdown(rs, SHUT_RDWR); + rclose(rs); + + optimization = opt_bandwidth; + ret = dst_addr ? client_connect() : server_connect(); + if (ret) + goto free; + for (i = 0; i < TEST_CNT; i++) { + if (test_size[i].option > size_option) + continue; + init_bandwidth_test(test_size[i].size); + run_test(); + } + } else { + ret = dst_addr ? client_connect() : server_connect(); + if (ret) + goto free; + + ret = run_test(); + } + + rshutdown(rs, SHUT_RDWR); + rclose(rs); +free: + free(buf); + return ret; +} + +static int set_test_opt(char *optarg) +{ + if (strlen(optarg) == 1) { + switch (optarg[0]) { + case 'a': + use_async = 1; + break; + case 'b': + flags = (flags & ~MSG_DONTWAIT) | MSG_WAITALL; + break; + case 'n': + flags |= MSG_DONTWAIT; + break; + case 'v': + verify = 1; + break; + default: + return -1; + } + } else { + if (!strncasecmp("async", optarg, 5)) { + use_async = 1; + } else if (!strncasecmp("block", optarg, 5)) { + flags = (flags & ~MSG_DONTWAIT) | MSG_WAITALL; + } else if (!strncasecmp("nonblock", optarg, 8)) { + flags |= MSG_DONTWAIT; + } else if (!strncasecmp("verify", optarg, 6)) { + verify = 1; + } else { + return -1; + } + } + return 0; +} + +int main(int argc, char **argv) +{ + int op, ret; + + while ((op = getopt(argc, argv, "s:b:B:I:C:S:p:T:")) != -1) { + switch (op) { + case 's': + dst_addr = optarg; + break; + case 'b': + src_addr = optarg; + break; + case 'B': + buffer_size = atoi(optarg); + break; + case 'I': + custom = 1; + iterations = atoi(optarg); + break; + case 'C': + custom = 1; + transfer_count = atoi(optarg); + break; + case 'S': + if (!strncasecmp("all", optarg, 3)) { + size_option = 1; + } else { + custom = 1; + transfer_size = atoi(optarg); + } + break; + case 'p': + port = optarg; + break; + case 'T': + if (!set_test_opt(optarg)) + break; + /* invalid option - fall through */ + default: + printf("usage: %s\n", argv[0]); + printf("\t[-s server_address]\n"); + printf("\t[-b bind_address]\n"); + printf("\t[-B buffer_size]\n"); + printf("\t[-I iterations]\n"); + printf("\t[-C transfer_count]\n"); + printf("\t[-S transfer_size or all]\n"); + printf("\t[-p port_number]\n"); + printf("\t[-T test_option]\n"); + printf("\t a|async - asynchronous operation (use poll)\n"); + printf("\t b|blocking - 
use blocking calls\n"); + printf("\t n|nonblocking - use nonblocking calls\n"); + printf("\t v|verify - verify data\n"); + exit(1); + } + } + + if (!(flags & MSG_DONTWAIT)) + poll_timeout = -1; + + ret = run(); + return ret; +} diff --git a/prov/rdmacm/examples/rstream.c b/prov/rdmacm/examples/rstream.c new file mode 100644 index 00000000000..e94e8807048 --- /dev/null +++ b/prov/rdmacm/examples/rstream.c @@ -0,0 +1,609 @@ +/* + * Copyright (c) 2011-2012 Intel Corporation. All rights reserved. + * + * This software is available to you under the OpenIB.org BSD license + * below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AWV + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <strings.h> +#include <errno.h> +#include <getopt.h> +#include <sys/types.h> +#include <sys/socket.h> +#include <sys/time.h> +#include <sys/wait.h> +#include <netdb.h> +#include <fcntl.h> +#include <unistd.h> +#include <netinet/in.h> +#include <netinet/tcp.h> + +#include <rdma/rsocket.h> +#include "common.h" + +struct test_size_param { + int size; + int option; +}; + +static struct test_size_param test_size[] = { + { 1 << 6, 0 }, + { 1 << 7, 1 }, { (1 << 7) + (1 << 6), 1}, + { 1 << 8, 1 }, { (1 << 8) + (1 << 7), 1}, + { 1 << 9, 1 }, { (1 << 9) + (1 << 8), 1}, + { 1 << 10, 1 }, { (1 << 10) + (1 << 9), 1}, + { 1 << 11, 1 }, { (1 << 11) + (1 << 10), 1}, + { 1 << 12, 0 }, { (1 << 12) + (1 << 11), 1}, + { 1 << 13, 1 }, { (1 << 13) + (1 << 12), 1}, + { 1 << 14, 1 }, { (1 << 14) + (1 << 13), 1}, + { 1 << 15, 1 }, { (1 << 15) + (1 << 14), 1}, + { 1 << 16, 0 }, { (1 << 16) + (1 << 15), 1}, + { 1 << 17, 1 }, { (1 << 17) + (1 << 16), 1}, + { 1 << 18, 1 }, { (1 << 18) + (1 << 17), 1}, + { 1 << 19, 1 }, { (1 << 19) + (1 << 18), 1}, + { 1 << 20, 0 }, { (1 << 20) + (1 << 19), 1}, + { 1 << 21, 1 }, { (1 << 21) + (1 << 20), 1}, + { 1 << 22, 1 }, { (1 << 22) + (1 << 21), 1}, +}; +#define TEST_CNT (sizeof test_size / sizeof test_size[0]) + +static int rs, lrs; +static int use_async; +static int verify; +static int flags = MSG_DONTWAIT; +static int poll_timeout = 0; +static int custom; +static int use_fork; +static pid_t fork_pid; +static enum rs_optimization optimization; +static int size_option; +static int iterations = 1; +static int transfer_size = 1000; +static int transfer_count = 1000; +static int buffer_size; +static char test_name[10] = "custom"; +static char *port = "7471"; +static char *dst_addr; +static char *src_addr; +static struct timeval start, end; +static void *buf; +static struct addrinfo ai_hints; + +static void show_perf(void) +{ + 
char str[32]; + float usec; + long long bytes; + + usec = (end.tv_sec - start.tv_sec) * 1000000 + (end.tv_usec - start.tv_usec); + bytes = (long long) iterations * transfer_count * transfer_size * 2; + + /* name size transfers iterations bytes seconds Gb/sec usec/xfer */ + printf("%-10s", test_name); + size_str(str, sizeof str, transfer_size); + printf("%-8s", str); + cnt_str(str, sizeof str, transfer_count); + printf("%-8s", str); + cnt_str(str, sizeof str, iterations); + printf("%-8s", str); + size_str(str, sizeof str, bytes); + printf("%-8s", str); + printf("%8.2fs%10.2f%11.2f\n", + usec / 1000000., (bytes * 8) / (1000. * usec), + (usec / iterations) / (transfer_count * 2)); +} + +static void init_latency_test(int size) +{ + char sstr[5]; + + size_str(sstr, sizeof sstr, size); + snprintf(test_name, sizeof test_name, "%s_lat", sstr); + transfer_count = 1; + transfer_size = size; + iterations = size_to_count(transfer_size); +} + +static void init_bandwidth_test(int size) +{ + char sstr[5]; + + size_str(sstr, sizeof sstr, size); + snprintf(test_name, sizeof test_name, "%s_bw", sstr); + iterations = 1; + transfer_size = size; + transfer_count = size_to_count(transfer_size); +} + +static int send_xfer(int size) +{ + struct pollfd fds; + int offset, ret; + + if (verify) + format_buf(buf, size); + + if (use_async) { + fds.fd = rs; + fds.events = POLLOUT; + } + + for (offset = 0; offset < size; ) { + if (use_async) { + ret = do_poll(&fds, poll_timeout); + if (ret) + return ret; + } + + ret = rs_send(rs, buf + offset, size - offset, flags); + if (ret > 0) { + offset += ret; + } else if (errno != EWOULDBLOCK && errno != EAGAIN) { + perror("rsend"); + return ret; + } + } + + return 0; +} + +static int recv_xfer(int size) +{ + struct pollfd fds; + int offset, ret; + + if (use_async) { + fds.fd = rs; + fds.events = POLLIN; + } + + for (offset = 0; offset < size; ) { + if (use_async) { + ret = do_poll(&fds, poll_timeout); + if (ret) + return ret; + } + + ret = rs_recv(rs, buf + offset, size - offset, flags); + if (ret > 0) { + offset += ret; + } else if (errno != EWOULDBLOCK && errno != EAGAIN) { + perror("rrecv"); + return ret; + } + } + + if (verify) { + ret = verify_buf(buf, size); + if (ret) + return ret; + } + + return 0; +} + +static int sync_test(void) +{ + int ret; + + ret = dst_addr ? send_xfer(16) : recv_xfer(16); + if (ret) + return ret; + + return dst_addr ? recv_xfer(16) : send_xfer(16); +} + +static int run_test(void) +{ + int ret, i, t; + + ret = sync_test(); + if (ret) + goto out; + + gettimeofday(&start, NULL); + for (i = 0; i < iterations; i++) { + for (t = 0; t < transfer_count; t++) { + ret = dst_addr ? send_xfer(transfer_size) : + recv_xfer(transfer_size); + if (ret) + goto out; + } + + for (t = 0; t < transfer_count; t++) { + ret = dst_addr ? 
recv_xfer(transfer_size) : + send_xfer(transfer_size); + if (ret) + goto out; + } + } + gettimeofday(&end, NULL); + show_perf(); + ret = 0; + +out: + return ret; +} + +static void set_options(int rs) +{ + int val; + + if (buffer_size) { + rs_setsockopt(rs, SOL_SOCKET, SO_SNDBUF, (void *) &buffer_size, + sizeof buffer_size); + rs_setsockopt(rs, SOL_SOCKET, SO_RCVBUF, (void *) &buffer_size, + sizeof buffer_size); + } else { + val = 1 << 19; + rs_setsockopt(rs, SOL_SOCKET, SO_SNDBUF, (void *) &val, sizeof val); + rs_setsockopt(rs, SOL_SOCKET, SO_RCVBUF, (void *) &val, sizeof val); + } + + val = 1; + rs_setsockopt(rs, IPPROTO_TCP, TCP_NODELAY, (void *) &val, sizeof(val)); + + if (flags & MSG_DONTWAIT) + rs_fcntl(rs, F_SETFL, O_NONBLOCK); + + if (use_rs) { + /* Inline size based on experimental data */ + if (optimization == opt_latency) { + val = 384; + rs_setsockopt(rs, SOL_RDMA, RDMA_INLINE, &val, sizeof val); + } else if (optimization == opt_bandwidth) { + val = 0; + rs_setsockopt(rs, SOL_RDMA, RDMA_INLINE, &val, sizeof val); + } + } +} + +static int server_listen(void) +{ + struct addrinfo *ai; + int val, ret; + + ai_hints.ai_flags |= AI_PASSIVE; + ret = getaddrinfo(src_addr, port, &ai_hints, &ai); + if (ret) { + perror("getaddrinfo"); + return ret; + } + + lrs = rs_socket(ai->ai_family, SOCK_STREAM, 0); + if (lrs < 0) { + perror("rsocket"); + ret = lrs; + goto free; + } + + val = 1; + ret = rs_setsockopt(lrs, SOL_SOCKET, SO_REUSEADDR, &val, sizeof val); + if (ret) { + perror("rsetsockopt SO_REUSEADDR"); + goto close; + } + + ret = rs_bind(lrs, ai->ai_addr, ai->ai_addrlen); + if (ret) { + perror("rbind"); + goto close; + } + + ret = rs_listen(lrs, 1); + if (ret) + perror("rlisten"); + +close: + if (ret) + rs_close(lrs); +free: + freeaddrinfo(ai); + return ret; +} + +static int server_connect(void) +{ + struct pollfd fds; + int ret = 0; + + set_options(lrs); + do { + if (use_async) { + fds.fd = lrs; + fds.events = POLLIN; + + ret = do_poll(&fds, poll_timeout); + if (ret) { + perror("rpoll"); + return ret; + } + } + + rs = rs_accept(lrs, NULL, 0); + } while (rs < 0 && (errno == EAGAIN || errno == EWOULDBLOCK)); + if (rs < 0) { + perror("raccept"); + return rs; + } + + if (use_fork) + fork_pid = fork(); + if (!fork_pid) + set_options(rs); + return ret; +} + +static int client_connect(void) +{ + struct addrinfo *ai; + struct pollfd fds; + int ret, err; + socklen_t len; + + ret = getaddrinfo(dst_addr, port, &ai_hints, &ai); + if (ret) { + perror("getaddrinfo"); + return ret; + } + + rs = rs_socket(ai->ai_family, SOCK_STREAM, 0); + if (rs < 0) { + perror("rsocket"); + ret = rs; + goto free; + } + + set_options(rs); + /* TODO: bind client to src_addr */ + + ret = rs_connect(rs, ai->ai_addr, ai->ai_addrlen); + if (ret && (errno != EINPROGRESS)) { + perror("rconnect"); + goto close; + } + + if (ret && (errno == EINPROGRESS)) { + fds.fd = rs; + fds.events = POLLOUT; + ret = do_poll(&fds, poll_timeout); + if (ret) + goto close; + + len = sizeof err; + ret = rs_getsockopt(rs, SOL_SOCKET, SO_ERROR, &err, &len); + if (ret) + goto close; + if (err) { + ret = -1; + errno = err; + perror("async rconnect"); + } + } + +close: + if (ret) + rs_close(rs); +free: + freeaddrinfo(ai); + return ret; +} + +static int run(void) +{ + int i, ret = 0; + + buf = malloc(!custom ? 
test_size[TEST_CNT - 1].size : transfer_size); + if (!buf) { + perror("malloc"); + return -1; + } + + if (!dst_addr) { + ret = server_listen(); + if (ret) + goto free; + } + + printf("%-10s%-8s%-8s%-8s%-8s%8s %10s%13s\n", + "name", "bytes", "xfers", "iters", "total", "time", "Gb/sec", "usec/xfer"); + if (!custom) { + optimization = opt_latency; + ret = dst_addr ? client_connect() : server_connect(); + if (ret) + goto free; + + for (i = 0; i < TEST_CNT && !fork_pid; i++) { + if (test_size[i].option > size_option) + continue; + init_latency_test(test_size[i].size); + run_test(); + } + if (fork_pid) + wait(NULL); + else + rs_shutdown(rs, SHUT_RDWR); + rs_close(rs); + + if (!dst_addr && use_fork && !fork_pid) + goto free; + + optimization = opt_bandwidth; + ret = dst_addr ? client_connect() : server_connect(); + if (ret) + goto free; + for (i = 0; i < TEST_CNT && !fork_pid; i++) { + if (test_size[i].option > size_option) + continue; + init_bandwidth_test(test_size[i].size); + run_test(); + } + } else { + ret = dst_addr ? client_connect() : server_connect(); + if (ret) + goto free; + + if (!fork_pid) + ret = run_test(); + } + + if (fork_pid) + wait(NULL); + else + rs_shutdown(rs, SHUT_RDWR); + rs_close(rs); +free: + free(buf); + return ret; +} + +static int set_test_opt(char *optarg) +{ + if (strlen(optarg) == 1) { + switch (optarg[0]) { + case 's': + use_rs = 0; + break; + case 'a': + use_async = 1; + break; + case 'b': + flags = (flags & ~MSG_DONTWAIT) | MSG_WAITALL; + break; + case 'f': + use_fork = 1; + use_rs = 0; + break; + case 'n': + flags |= MSG_DONTWAIT; + break; +// case 'r': +// use_rgai = 1; +// break; + case 'v': + verify = 1; + break; + default: + return -1; + } + } else { + if (!strncasecmp("socket", optarg, 6)) { + use_rs = 0; + } else if (!strncasecmp("async", optarg, 5)) { + use_async = 1; + } else if (!strncasecmp("block", optarg, 5)) { + flags = (flags & ~MSG_DONTWAIT) | MSG_WAITALL; + } else if (!strncasecmp("nonblock", optarg, 8)) { + flags |= MSG_DONTWAIT; +// } else if (strncasecmp("resolve", optarg, 7)) { +// use_rgai = 1; + } else if (!strncasecmp("verify", optarg, 6)) { + verify = 1; + } else if (!strncasecmp("fork", optarg, 4)) { + use_fork = 1; + use_rs = 0; + } else { + return -1; + } + } + return 0; +} + +int main(int argc, char **argv) +{ + int op, ret; + + ai_hints.ai_socktype = SOCK_STREAM; + while ((op = getopt(argc, argv, "s:b:f:B:I:C:S:p:T:")) != -1) { + switch (op) { + case 's': + dst_addr = optarg; + break; + case 'b': + src_addr = optarg; + break; + case 'f': + if (!strncasecmp("ip", optarg, 2)) { + ai_hints.ai_flags = AI_NUMERICHOST; + } + break; + case 'B': + buffer_size = atoi(optarg); + break; + case 'I': + custom = 1; + iterations = atoi(optarg); + break; + case 'C': + custom = 1; + transfer_count = atoi(optarg); + break; + case 'S': + if (!strncasecmp("all", optarg, 3)) { + size_option = 1; + } else { + custom = 1; + transfer_size = atoi(optarg); + } + break; + case 'p': + port = optarg; + break; + case 'T': + if (!set_test_opt(optarg)) + break; + /* invalid option - fall through */ + default: + printf("usage: %s\n", argv[0]); + printf("\t[-s server_address]\n"); + printf("\t[-b bind_address]\n"); + printf("\t[-f address_format]\n"); + printf("\t name, ip, ipv6, or gid\n"); + printf("\t[-B buffer_size]\n"); + printf("\t[-I iterations]\n"); + printf("\t[-C transfer_count]\n"); + printf("\t[-S transfer_size or all]\n"); + printf("\t[-p port_number]\n"); + printf("\t[-T test_option]\n"); + printf("\t s|sockets - use standard tcp/ip sockets\n"); + 
printf("\t a|async - asynchronous operation (use poll)\n"); + printf("\t b|blocking - use blocking calls\n"); + printf("\t f|fork - fork server processing\n"); + printf("\t n|nonblocking - use nonblocking calls\n"); + printf("\t r|resolve - use rdma cm to resolve address\n"); + printf("\t v|verify - verify data\n"); + exit(1); + } + } + + if (!(flags & MSG_DONTWAIT)) + poll_timeout = -1; + + ret = run(); + return ret; +} diff --git a/prov/rdmacm/examples/udpong.c b/prov/rdmacm/examples/udpong.c new file mode 100644 index 00000000000..af8deb9ee8b --- /dev/null +++ b/prov/rdmacm/examples/udpong.c @@ -0,0 +1,568 @@ +/* + * Copyright (c) 2012 Intel Corporation. All rights reserved. + * + * This software is available to you under the OpenIB.org BSD license + * below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AWV + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <strings.h> +#include <errno.h> +#include <getopt.h> +#include <sys/types.h> +#include <sys/socket.h> +#include <sys/time.h> +#include <sys/wait.h> +#include <netdb.h> +#include <fcntl.h> +#include <unistd.h> +#include <netinet/in.h> +#include <netinet/tcp.h> +#include <arpa/inet.h> + +#include <rdma/rdma_cma.h> +#include <rdma/rsocket.h> +#include "common.h" + +static int test_size[] = { + (1 << 6), + (1 << 7), ((1 << 7) + (1 << 6)), + (1 << 8), ((1 << 8) + (1 << 7)), + (1 << 9), ((1 << 9) + (1 << 8)), + (1 << 10), ((1 << 10) + (1 << 9)), +}; +#define TEST_CNT (sizeof test_size / sizeof test_size[0]) + +enum { + msg_op_login, + msg_op_start, + msg_op_data, + msg_op_echo, + msg_op_end +}; + +struct message { + uint8_t op; + uint8_t id; + uint8_t seqno; + uint8_t reserved; + uint32_t data; + uint8_t buf[2048]; +}; + +#define CTRL_MSG_SIZE 16 + +struct client { + uint64_t recvcnt; +}; + +static struct client clients[256]; +static uint8_t id; + +static int rs; +static int use_async; +static int flags = MSG_DONTWAIT; +static int poll_timeout; +static int custom; +static int echo; +static int transfer_size = 1000; +static int transfer_count = 1000; +static int buffer_size; +static char test_name[10] = "custom"; +static char *port = "7174"; +static char *dst_addr; +static char *src_addr; +static union socket_addr addr; +static socklen_t addrlen; +static struct timeval start, end; +static struct message msg; + +static void show_perf(void) +{ + char str[32]; + float usec; + long long bytes; + int transfers; + + usec = (end.tv_sec - start.tv_sec) * 1000000 + (end.tv_usec - start.tv_usec); + transfers = echo ? 
transfer_count * 2 : ntohl(msg.data); + bytes = (long long) transfers * transfer_size; + + /* name size transfers bytes seconds Gb/sec usec/xfer */ + printf("%-10s", test_name); + size_str(str, sizeof str, transfer_size); + printf("%-8s", str); + cnt_str(str, sizeof str, transfers); + printf("%-8s", str); + size_str(str, sizeof str, bytes); + printf("%-8s", str); + printf("%8.2fs%10.2f%11.2f\n", + usec / 1000000., (bytes * 8) / (1000. * usec), + (usec / transfers)); +} + +static void init_latency_test(int size) +{ + char sstr[5]; + + size_str(sstr, sizeof sstr, size); + snprintf(test_name, sizeof test_name, "%s_lat", sstr); + transfer_size = size; + transfer_count = size_to_count(transfer_size) / 10; + echo = 1; +} + +static void init_bandwidth_test(int size) +{ + char sstr[5]; + + size_str(sstr, sizeof sstr, size); + snprintf(test_name, sizeof test_name, "%s_bw", sstr); + transfer_size = size; + transfer_count = size_to_count(transfer_size); + echo = 0; +} + +static void set_options(int rs) +{ + int val; + + if (buffer_size) { + rs_setsockopt(rs, SOL_SOCKET, SO_SNDBUF, (void *) &buffer_size, + sizeof buffer_size); + rs_setsockopt(rs, SOL_SOCKET, SO_RCVBUF, (void *) &buffer_size, + sizeof buffer_size); + } else { + val = 1 << 19; + rs_setsockopt(rs, SOL_SOCKET, SO_SNDBUF, (void *) &val, sizeof val); + rs_setsockopt(rs, SOL_SOCKET, SO_RCVBUF, (void *) &val, sizeof val); + } + + if (flags & MSG_DONTWAIT) + rs_fcntl(rs, F_SETFL, O_NONBLOCK); +} + +static ssize_t svr_send(struct message *msg, size_t size, + union socket_addr *addr, socklen_t addrlen) +{ + struct pollfd fds; + ssize_t ret; + + if (use_async) { + fds.fd = rs; + fds.events = POLLOUT; + } + + do { + if (use_async) { + ret = do_poll(&fds, poll_timeout); + if (ret) + return ret; + } + + ret = rs_sendto(rs, msg, size, flags, &addr->sa, addrlen); + } while (ret < 0 && (errno == EWOULDBLOCK || errno == EAGAIN)); + + if (ret < 0) + perror("rsend"); + + return ret; +} + +static ssize_t svr_recv(struct message *msg, size_t size, + union socket_addr *addr, socklen_t *addrlen) +{ + struct pollfd fds; + ssize_t ret; + + if (use_async) { + fds.fd = rs; + fds.events = POLLIN; + } + + do { + if (use_async) { + ret = do_poll(&fds, poll_timeout); + if (ret) + return ret; + } + + ret = rs_recvfrom(rs, msg, size, flags, &addr->sa, addrlen); + } while (ret < 0 && (errno == EWOULDBLOCK || errno == EAGAIN)); + + if (ret < 0) + perror("rrecv"); + + return ret; +} + +static int svr_process(struct message *msg, size_t size, + union socket_addr *addr, socklen_t addrlen) +{ + char str[64]; + ssize_t ret; + + switch (msg->op) { + case msg_op_login: + if (addr->sa.sa_family == AF_INET) { + printf("client login from %s\n", + inet_ntop(AF_INET, &addr->sin.sin_addr.s_addr, + str, sizeof str)); + } else { + printf("client login from %s\n", + inet_ntop(AF_INET6, &addr->sin6.sin6_addr.s6_addr, + str, sizeof str)); + } + msg->id = id++; + /* fall through */ + case msg_op_start: + memset(&clients[msg->id], 0, sizeof clients[msg->id]); + break; + case msg_op_echo: + clients[msg->id].recvcnt++; + break; + case msg_op_end: + msg->data = htonl(clients[msg->id].recvcnt); + break; + default: + clients[msg->id].recvcnt++; + return 0; + } + + ret = svr_send(msg, size, addr, addrlen); + return (ret == size) ? 
0 : (int) ret; +} + +static int svr_bind(void) +{ + struct addrinfo hints, *res; + int ret; + + memset(&hints, 0, sizeof hints); + hints.ai_socktype = SOCK_DGRAM; + ret = getaddrinfo(src_addr, port, &hints, &res); + if (ret) { + perror("getaddrinfo"); + return ret; + } + + rs = rs_socket(res->ai_family, res->ai_socktype, res->ai_protocol); + if (rs < 0) { + perror("rsocket"); + ret = rs; + goto out; + } + + set_options(rs); + ret = rs_bind(rs, res->ai_addr, res->ai_addrlen); + if (ret) { + perror("rbind"); + rs_close(rs); + } + +out: + free(res); + return ret; +} + +static int svr_run(void) +{ + size_t len; + int ret; + + ret = svr_bind(); + while (!ret) { + addrlen = sizeof addr; + len = svr_recv(&msg, sizeof msg, &addr, &addrlen); + if (len < 0) + return len; + + ret = svr_process(&msg, len, &addr, addrlen); + } + return ret; +} + +static ssize_t client_send(struct message *msg, size_t size) +{ + struct pollfd fds; + int ret; + + if (use_async) { + fds.fd = rs; + fds.events = POLLOUT; + } + + do { + if (use_async) { + ret = do_poll(&fds, poll_timeout); + if (ret) + return ret; + } + + ret = rs_send(rs, msg, size, flags); + } while (ret < 0 && (errno == EWOULDBLOCK || errno == EAGAIN)); + + if (ret < 0) + perror("rsend"); + + return ret; +} + +static ssize_t client_recv(struct message *msg, size_t size, int timeout) +{ + struct pollfd fds; + int ret; + + if (timeout) { + fds.fd = rs; + fds.events = POLLIN; + + ret = rs_poll(&fds, 1, timeout); + if (ret <= 0) + return ret; + } + + ret = rs_recv(rs, msg, size, flags | MSG_DONTWAIT); + if (ret < 0 && (errno == EWOULDBLOCK || errno == EAGAIN)) + perror("rrecv"); + + return ret; +} + +static int client_send_recv(struct message *msg, size_t size, int timeout) +{ + static uint8_t seqno; + int ret; + + msg->seqno = seqno; + do { + ret = client_send(msg, size); + if (ret != size) + return ret; + + ret = client_recv(msg, size, timeout); + } while (ret <= 0 || msg->seqno != seqno); + + seqno++; + return ret; +} + +static int run_test(void) +{ + int ret, i; + + msg.op = msg_op_start; + ret = client_send_recv(&msg, CTRL_MSG_SIZE, 1000); + if (ret != CTRL_MSG_SIZE) + goto out; + + msg.op = echo ? msg_op_echo : msg_op_data; + gettimeofday(&start, NULL); + for (i = 0; i < transfer_count; i++) { + ret = echo ? 
client_send_recv(&msg, transfer_size, 1) : + client_send(&msg, transfer_size); + if (ret != transfer_size) + goto out; + } + + msg.op = msg_op_end; + ret = client_send_recv(&msg, CTRL_MSG_SIZE, 1); + if (ret != CTRL_MSG_SIZE) + goto out; + + gettimeofday(&end, NULL); + show_perf(); + ret = 0; + +out: + return ret; +} + +static int client_connect(void) +{ + struct addrinfo hints, *res; + int ret; + + memset(&hints, 0, sizeof hints); + hints.ai_socktype = SOCK_DGRAM; + ret = getaddrinfo(dst_addr, port, &hints, &res); + if (ret) { + perror("getaddrinfo"); + return ret; + } + + rs = rs_socket(res->ai_family, res->ai_socktype, res->ai_protocol); + if (rs < 0) { + perror("rsocket"); + ret = rs; + goto out; + } + + set_options(rs); + ret = rs_connect(rs, res->ai_addr, res->ai_addrlen); + if (ret) { + perror("rconnect"); + rs_close(rs); + } + + msg.op = msg_op_login; + ret = client_send_recv(&msg, CTRL_MSG_SIZE, 1000); + if (ret == CTRL_MSG_SIZE) + ret = 0; + +out: + freeaddrinfo(res); + return ret; +} + +static int client_run(void) +{ + int i, ret; + + printf("%-10s%-8s%-8s%-8s%8s %10s%13s\n", + "name", "bytes", "xfers", "total", "time", "Gb/sec", "usec/xfer"); + + ret = client_connect(); + if (ret) + return ret; + + if (!custom) { + for (i = 0; i < TEST_CNT; i++) { + init_latency_test(test_size[i]); + run_test(); + } + for (i = 0; i < TEST_CNT; i++) { + init_bandwidth_test(test_size[i]); + run_test(); + } + } else { + run_test(); + } + rs_close(rs); + + return ret; +} + +static int set_test_opt(char *optarg) +{ + if (strlen(optarg) == 1) { + switch (optarg[0]) { + case 's': + use_rs = 0; + break; + case 'a': + use_async = 1; + break; + case 'b': + flags = 0; + break; + case 'n': + flags = MSG_DONTWAIT; + break; + case 'e': + echo = 1; + break; + default: + return -1; + } + } else { + if (!strncasecmp("socket", optarg, 6)) { + use_rs = 0; + } else if (!strncasecmp("async", optarg, 5)) { + use_async = 1; + } else if (!strncasecmp("block", optarg, 5)) { + flags = 0; + } else if (!strncasecmp("nonblock", optarg, 8)) { + flags = MSG_DONTWAIT; + } else if (!strncasecmp("echo", optarg, 4)) { + echo = 1; + } else { + return -1; + } + } + return 0; +} + +int main(int argc, char **argv) +{ + int op, ret; + + while ((op = getopt(argc, argv, "s:b:B:C:S:p:T:")) != -1) { + switch (op) { + case 's': + dst_addr = optarg; + break; + case 'b': + src_addr = optarg; + break; + case 'B': + buffer_size = atoi(optarg); + break; + case 'C': + custom = 1; + transfer_count = atoi(optarg); + break; + case 'S': + custom = 1; + transfer_size = atoi(optarg); + if (transfer_size < CTRL_MSG_SIZE) { + printf("size must be at least %d bytes\n", + CTRL_MSG_SIZE); + exit(1); + } + break; + case 'p': + port = optarg; + break; + case 'T': + if (!set_test_opt(optarg)) + break; + /* invalid option - fall through */ + default: + printf("usage: %s\n", argv[0]); + printf("\t[-s server_address]\n"); + printf("\t[-b bind_address]\n"); + printf("\t[-B buffer_size]\n"); + printf("\t[-C transfer_count]\n"); + printf("\t[-S transfer_size]\n"); + printf("\t[-p port_number]\n"); + printf("\t[-T test_option]\n"); + printf("\t s|sockets - use standard tcp/ip sockets\n"); + printf("\t a|async - asynchronous operation (use poll)\n"); + printf("\t b|blocking - use blocking calls\n"); + printf("\t n|nonblocking - use nonblocking calls\n"); + printf("\t e|echo - server echoes all messages\n"); + exit(1); + } + } + + if (flags) + poll_timeout = -1; + + ret = dst_addr ? 
client_run() : svr_run(); + return ret; +} diff --git a/prov/rdmacm/include/rdma/rdma_cma.h b/prov/rdmacm/include/rdma/rdma_cma.h new file mode 100644 index 00000000000..a5a9150c618 --- /dev/null +++ b/prov/rdmacm/include/rdma/rdma_cma.h @@ -0,0 +1,684 @@ +/* + * Copyright (c) 2005 Voltaire Inc. All rights reserved. + * Copyright (c) 2005-2012 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#if !defined(RDMA_CMA_H) +#define RDMA_CMA_H + +#include <netinet/in.h> +#include <sys/socket.h> +#include <infiniband/verbs.h> +#include <rdma/fi_ucma.h> + + +#ifdef __cplusplus +extern "C" { +#endif + + +/* + * Upon receiving a device removal event, users must destroy the associated + * RDMA identifier and release all resources allocated with the device. + */ +enum rdma_cm_event_type { + RDMA_CM_EVENT_ADDR_RESOLVED, + RDMA_CM_EVENT_ADDR_ERROR, + RDMA_CM_EVENT_ROUTE_RESOLVED, + RDMA_CM_EVENT_ROUTE_ERROR, + RDMA_CM_EVENT_CONNECT_REQUEST, + RDMA_CM_EVENT_CONNECT_RESPONSE, + RDMA_CM_EVENT_CONNECT_ERROR, + RDMA_CM_EVENT_UNREACHABLE, + RDMA_CM_EVENT_REJECTED, + RDMA_CM_EVENT_ESTABLISHED, + RDMA_CM_EVENT_DISCONNECTED, + RDMA_CM_EVENT_DEVICE_REMOVAL, + RDMA_CM_EVENT_MULTICAST_JOIN, + RDMA_CM_EVENT_MULTICAST_ERROR, + RDMA_CM_EVENT_ADDR_CHANGE, + RDMA_CM_EVENT_TIMEWAIT_EXIT +}; + +enum rdma_port_space { + RDMA_PS_IPOIB = 0x0002, + RDMA_PS_TCP = 0x0106, + RDMA_PS_UDP = 0x0111, + RDMA_PS_IB = 0x013F, +}; + +#define RDMA_IB_IP_PS_MASK 0xFFFFFFFFFFFF0000ULL +#define RDMA_IB_IP_PORT_MASK 0x000000000000FFFFULL +#define RDMA_IB_IP_PS_TCP 0x0000000001060000ULL +#define RDMA_IB_IP_PS_UDP 0x0000000001110000ULL +#define RDMA_IB_PS_IB 0x00000000013F0000ULL + +/* + * Global qkey value for UDP QPs and multicast groups created via the + * RDMA CM. 
+ */ +#define RDMA_UDP_QKEY 0x01234567 + +struct rdma_ib_addr { + union ibv_gid sgid; + union ibv_gid dgid; + uint16_t pkey; +}; + +struct rdma_addr { + union { + struct sockaddr src_addr; + struct sockaddr_in src_sin; + struct sockaddr_in6 src_sin6; + struct sockaddr_storage src_storage; + }; + union { + struct sockaddr dst_addr; + struct sockaddr_in dst_sin; + struct sockaddr_in6 dst_sin6; + struct sockaddr_storage dst_storage; + }; + union { + struct rdma_ib_addr ibaddr; + } addr; +}; + +struct rdma_route { + struct rdma_addr addr; + struct ibv_sa_path_rec *path_rec; + int num_paths; +}; + +struct rdma_event_channel { + int fd; + fid_t fid; +}; + +struct rdma_cm_id { + struct ibv_context *verbs; + struct rdma_event_channel *channel; + void *context; + struct ibv_qp *qp; + struct rdma_route route; + enum rdma_port_space ps; + uint8_t port_num; + struct rdma_cm_event *event; + struct ibv_comp_channel *send_cq_channel; + struct ibv_cq *send_cq; + struct ibv_comp_channel *recv_cq_channel; + struct ibv_cq *recv_cq; + struct ibv_srq *srq; + struct ibv_pd *pd; + enum ibv_qp_type qp_type; +}; + +enum { + RDMA_MAX_RESP_RES = 0xFF, + RDMA_MAX_INIT_DEPTH = 0xFF +}; + +struct rdma_conn_param { + const void *private_data; + uint8_t private_data_len; + uint8_t responder_resources; + uint8_t initiator_depth; + uint8_t flow_control; + uint8_t retry_count; /* ignored when accepting */ + uint8_t rnr_retry_count; + /* Fields below ignored if a QP is created on the rdma_cm_id. */ + uint8_t srq; + uint32_t qp_num; +}; + +struct rdma_ud_param { + const void *private_data; + uint8_t private_data_len; + struct ibv_ah_attr ah_attr; + uint32_t qp_num; + uint32_t qkey; +}; + +struct rdma_cm_event { + struct rdma_cm_id *id; + struct rdma_cm_id *listen_id; + enum rdma_cm_event_type event; + int status; + union { + struct rdma_conn_param conn; + struct rdma_ud_param ud; + } param; +}; + +#define RAI_PASSIVE 0x00000001 +#define RAI_NUMERICHOST 0x00000002 +#define RAI_NOROUTE 0x00000004 +#define RAI_FAMILY 0x00000008 + +struct rdma_addrinfo { + int ai_flags; + int ai_family; + int ai_qp_type; + int ai_port_space; + socklen_t ai_src_len; + socklen_t ai_dst_len; + struct sockaddr *ai_src_addr; + struct sockaddr *ai_dst_addr; + char *ai_src_canonname; + char *ai_dst_canonname; + size_t ai_route_len; + void *ai_route; + size_t ai_connect_len; + void *ai_connect; + struct rdma_addrinfo *ai_next; +}; + +/** + * rdma_create_event_channel - Open a channel used to report communication events. + * Description: + * Asynchronous events are reported to users through event channels. Each + * event channel maps to a file descriptor. + * Notes: + * All created event channels must be destroyed by calling + * rdma_destroy_event_channel. Users should call rdma_get_cm_event to + * retrieve events on an event channel. + * See also: + * rdma_get_cm_event, rdma_destroy_event_channel + */ +struct rdma_event_channel *rdma_create_event_channel(void); + +/** + * rdma_destroy_event_channel - Close an event communication channel. + * @channel: The communication channel to destroy. + * Description: + * Release all resources associated with an event channel and closes the + * associated file descriptor. + * Notes: + * All rdma_cm_id's associated with the event channel must be destroyed, + * and all returned events must be acked before calling this function. 
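 + * A minimal sketch of the expected ordering (illustrative only; the calls
 + * shown are the ones declared in this header):
 + *   channel = rdma_create_event_channel();
 + *   rdma_create_id(channel, &id, NULL, RDMA_PS_TCP);
 + *   ...connect, transfer, and ack any events from rdma_get_cm_event()...
 + *   rdma_destroy_id(id);
 + *   rdma_destroy_event_channel(channel);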
+ * See also: + * rdma_create_event_channel, rdma_get_cm_event, rdma_ack_cm_event + */ +void rdma_destroy_event_channel(struct rdma_event_channel *channel); + +/** + * rdma_create_id - Allocate a communication identifier. + * @channel: The communication channel that events associated with the + * allocated rdma_cm_id will be reported on. + * @id: A reference where the allocated communication identifier will be + * returned. + * @context: User specified context associated with the rdma_cm_id. + * @ps: RDMA port space. + * Description: + * Creates an identifier that is used to track communication information. + * Notes: + * Rdma_cm_id's are conceptually equivalent to a socket for RDMA + * communication. The difference is that RDMA communication requires + * explicitly binding to a specified RDMA device before communication + * can occur, and most operations are asynchronous in nature. Communication + * events on an rdma_cm_id are reported through the associated event + * channel. Users must release the rdma_cm_id by calling rdma_destroy_id. + * See also: + * rdma_create_event_channel, rdma_destroy_id, rdma_get_devices, + * rdma_bind_addr, rdma_resolve_addr, rdma_connect, rdma_listen, + */ +int rdma_create_id(struct rdma_event_channel *channel, + struct rdma_cm_id **id, void *context, + enum rdma_port_space ps); + +/** + * rdma_create_ep - Allocate a communication identifier and qp. + * @id: A reference where the allocated communication identifier will be + * returned. + * @res: Result from rdma_getaddrinfo, which specifies the source and + * destination addresses, plus optional routing and connection information. + * @pd: Optional protection domain. This parameter is ignored if qp_init_attr + * is NULL. + * @qp_init_attr: Optional attributes for a QP created on the rdma_cm_id. + * Description: + * Create an identifier and option QP used for communication. + * Notes: + * If qp_init_attr is provided, then a queue pair will be allocated and + * associated with the rdma_cm_id. If a pd is provided, the QP will be + * created on that PD. Otherwise, the QP will be allocated on a default + * PD. + * The rdma_cm_id will be set to use synchronous operations (connect, + * listen, and get_request). To convert to asynchronous operation, the + * rdma_cm_id should be migrated to a user allocated event channel. + * See also: + * rdma_create_id, rdma_create_qp, rdma_migrate_id, rdma_connect, + * rdma_listen + */ +int rdma_create_ep(struct rdma_cm_id **id, struct rdma_addrinfo *res, + struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr); + +/** + * rdma_destroy_ep - Deallocates a communication identifier and qp. + * @id: The communication identifer to destroy. + * Description: + * Destroys the specified rdma_cm_id and any associated QP created + * on that id. + * See also: + * rdma_create_ep + */ +void rdma_destroy_ep(struct rdma_cm_id *id); + +/** + * rdma_destroy_id - Release a communication identifier. + * @id: The communication identifier to destroy. + * Description: + * Destroys the specified rdma_cm_id and cancels any outstanding + * asynchronous operation. + * Notes: + * Users must free any associated QP with the rdma_cm_id before + * calling this routine and ack an related events. + * See also: + * rdma_create_id, rdma_destroy_qp, rdma_ack_cm_event + */ +int rdma_destroy_id(struct rdma_cm_id *id); + +/** + * rdma_bind_addr - Bind an RDMA identifier to a source address. + * @id: RDMA identifier. + * @addr: Local address information. Wildcard values are permitted. 
+ * Description: + * Associates a source address with an rdma_cm_id. The address may be + * wildcarded. If binding to a specific local address, the rdma_cm_id + * will also be bound to a local RDMA device. + * Notes: + * Typically, this routine is called before calling rdma_listen to bind + * to a specific port number, but it may also be called on the active side + * of a connection before calling rdma_resolve_addr to bind to a specific + * address. + * See also: + * rdma_create_id, rdma_listen, rdma_resolve_addr, rdma_create_qp + */ +int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr); + +/** + * rdma_resolve_addr - Resolve destination and optional source addresses. + * @id: RDMA identifier. + * @src_addr: Source address information. This parameter may be NULL. + * @dst_addr: Destination address information. + * @timeout_ms: Time to wait for resolution to complete. + * Description: + * Resolve destination and optional source addresses from IP addresses + * to an RDMA address. If successful, the specified rdma_cm_id will + * be bound to a local device. + * Notes: + * This call is used to map a given destination IP address to a usable RDMA + * address. If a source address is given, the rdma_cm_id is bound to that + * address, the same as if rdma_bind_addr were called. If no source + * address is given, and the rdma_cm_id has not yet been bound to a device, + * then the rdma_cm_id will be bound to a source address based on the + * local routing tables. After this call, the rdma_cm_id will be bound to + * an RDMA device. This call is typically made from the active side of a + * connection before calling rdma_resolve_route and rdma_connect. + * See also: + * rdma_create_id, rdma_resolve_route, rdma_connect, rdma_create_qp, + * rdma_get_cm_event, rdma_bind_addr + */ +int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, + struct sockaddr *dst_addr, int timeout_ms); + +/** + * rdma_resolve_route - Resolve the route information needed to establish a connection. + * @id: RDMA identifier. + * @timeout_ms: Time to wait for resolution to complete. + * Description: + * Resolves an RDMA route to the destination address in order to establish + * a connection. The destination address must have already been resolved + * by calling rdma_resolve_addr. + * Notes: + * This is called on the client side of a connection after calling + * rdma_resolve_addr, but before calling rdma_connect. + * See also: + * rdma_resolve_addr, rdma_connect, rdma_get_cm_event + */ +int rdma_resolve_route(struct rdma_cm_id *id, int timeout_ms); + +/** + * rdma_create_qp - Allocate a QP. + * @id: RDMA identifier. + * @pd: Optional protection domain for the QP. + * @qp_init_attr: initial QP attributes. + * Description: + * Allocate a QP associated with the specified rdma_cm_id and transition it + * for sending and receiving. + * Notes: + * The rdma_cm_id must be bound to a local RDMA device before calling this + * function, and the protection domain must be for that same device. + * QPs allocated to an rdma_cm_id are automatically transitioned by the + * librdmacm through their states. After being allocated, the QP will be + * ready to handle posting of receives. If the QP is unconnected, it will + * be ready to post sends. + * If pd is NULL, then the QP will be allocated using a default protection + * domain associated with the underlying RDMA device. 
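 + * A minimal sketch (the attribute values are illustrative, not recommended
 + * defaults):
 + *   struct ibv_qp_init_attr attr;
 + *   memset(&attr, 0, sizeof attr);
 + *   attr.cap.max_send_wr = attr.cap.max_recv_wr = 16;
 + *   attr.cap.max_send_sge = attr.cap.max_recv_sge = 1;
 + *   attr.qp_type = IBV_QPT_RC;
 + *   if (rdma_create_qp(id, NULL, &attr))
 + *           perror("rdma_create_qp");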
+ * See also: + * rdma_bind_addr, rdma_resolve_addr, rdma_destroy_qp, ibv_create_qp, + * ibv_modify_qp + */ +int rdma_create_qp(struct rdma_cm_id *id, struct ibv_pd *pd, + struct ibv_qp_init_attr *qp_init_attr); + +/** + * rdma_destroy_qp - Deallocate a QP. + * @id: RDMA identifier. + * Description: + * Destroy a QP allocated on the rdma_cm_id. + * Notes: + * Users must destroy any QP associated with an rdma_cm_id before + * destroying the ID. + * See also: + * rdma_create_qp, rdma_destroy_id, ibv_destroy_qp + */ +void rdma_destroy_qp(struct rdma_cm_id *id); + +/** + * rdma_connect - Initiate an active connection request. + * @id: RDMA identifier. + * @conn_param: optional connection parameters. + * Description: + * For a connected rdma_cm_id, this call initiates a connection request + * to a remote destination. For an unconnected rdma_cm_id, it initiates + * a lookup of the remote QP providing the datagram service. + * Notes: + * Users must have resolved a route to the destination address + * by having called rdma_resolve_route before calling this routine. + * A user may override the default connection parameters and exchange + * private data as part of the connection by using the conn_param parameter. + * See also: + * rdma_resolve_route, rdma_disconnect, rdma_listen, rdma_get_cm_event + */ +int rdma_connect(struct rdma_cm_id *id, struct rdma_conn_param *conn_param); + +/** + * rdma_listen - Listen for incoming connection requests. + * @id: RDMA identifier. + * @backlog: backlog of incoming connection requests. + * Description: + * Initiates a listen for incoming connection requests or datagram service + * lookup. The listen will be restricted to the locally bound source + * address. + * Notes: + * Users must have bound the rdma_cm_id to a local address by calling + * rdma_bind_addr before calling this routine. If the rdma_cm_id is + * bound to a specific IP address, the listen will be restricted to that + * address and the associated RDMA device. If the rdma_cm_id is bound + * to an RDMA port number only, the listen will occur across all RDMA + * devices. + * See also: + * rdma_bind_addr, rdma_connect, rdma_accept, rdma_reject, rdma_get_cm_event + */ +int rdma_listen(struct rdma_cm_id *id, int backlog); + +/** + * rdma_get_request + */ +int rdma_get_request(struct rdma_cm_id *listen, struct rdma_cm_id **id); + +/** + * rdma_accept - Called to accept a connection request. + * @id: Connection identifier associated with the request. + * @conn_param: Optional information needed to establish the connection. + * Description: + * Called from the listening side to accept a connection or datagram + * service lookup request. + * Notes: + * Unlike the socket accept routine, rdma_accept is not called on a + * listening rdma_cm_id. Instead, after calling rdma_listen, the user + * waits for a connection request event to occur. Connection request + * events give the user a newly created rdma_cm_id, similar to a new + * socket, but the rdma_cm_id is bound to a specific RDMA device. + * rdma_accept is called on the new rdma_cm_id. + * A user may override the default connection parameters and exchange + * private data as part of the connection by using the conn_param parameter. + * See also: + * rdma_listen, rdma_reject, rdma_get_cm_event + */ +int rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param); + +/** + * rdma_reject - Called to reject a connection request. + * @id: Connection identifier associated with the request. 
+ * @private_data: Optional private data to send with the reject message. + * @private_data_len: Size of the private_data to send, in bytes. + * Description: + * Called from the listening side to reject a connection or datagram + * service lookup request. + * Notes: + * After receiving a connection request event, a user may call rdma_reject + * to reject the request. If the underlying RDMA transport supports + * private data in the reject message, the specified data will be passed to + * the remote side. + * See also: + * rdma_listen, rdma_accept, rdma_get_cm_event + */ +int rdma_reject(struct rdma_cm_id *id, const void *private_data, + uint8_t private_data_len); + +/** + * rdma_notify - Notifies the librdmacm of an asynchronous event. + * @id: RDMA identifier. + * @event: Asynchronous event. + * Description: + * Used to notify the librdmacm of asynchronous events that have occurred + * on a QP associated with the rdma_cm_id. + * Notes: + * Asynchronous events that occur on a QP are reported through the user's + * device event handler. This routine is used to notify the librdmacm of + * communication events. In most cases, use of this routine is not + * necessary, however if connection establishment is done out of band + * (such as done through Infiniband), it's possible to receive data on a + * QP that is not yet considered connected. This routine forces the + * connection into an established state in this case in order to handle + * the rare situation where the connection never forms on its own. + * Events that should be reported to the CM are: IB_EVENT_COMM_EST. + * See also: + * rdma_connect, rdma_accept, rdma_listen + */ +int rdma_notify(struct rdma_cm_id *id, enum ibv_event_type event); + +/** + * rdma_disconnect - This function disconnects a connection. + * @id: RDMA identifier. + * Description: + * Disconnects a connection and transitions any associated QP to the + * error state. + * See also: + * rdma_connect, rdma_listen, rdma_accept + */ +int rdma_disconnect(struct rdma_cm_id *id); + +/** + * rdma_join_multicast - Joins a multicast group. + * @id: Communication identifier associated with the request. + * @addr: Multicast address identifying the group to join. + * @context: User-defined context associated with the join request. + * Description: + * Joins a multicast group and attaches an associated QP to the group. + * Notes: + * Before joining a multicast group, the rdma_cm_id must be bound to + * an RDMA device by calling rdma_bind_addr or rdma_resolve_addr. Use of + * rdma_resolve_addr requires the local routing tables to resolve the + * multicast address to an RDMA device. The user must call + * rdma_leave_multicast to leave the multicast group and release any + * multicast resources. The context is returned to the user through + * the private_data field in the rdma_cm_event. + * See also: + * rdma_leave_multicast, rdma_bind_addr, rdma_resolve_addr, rdma_create_qp + */ +int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr, + void *context); + +/** + * rdma_leave_multicast - Leaves a multicast group. + * @id: Communication identifier associated with the request. + * @addr: Multicast address identifying the group to leave. + * Description: + * Leaves a multicast group and detaches an associated QP from the group. + * Notes: + * Calling this function before a group has been fully joined results in + * canceling the join operation. 
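 + * A typical join/leave sequence might look like this (sketch; mcast_addr is
 + * an illustrative sockaddr resolved by the caller):
 + *   rdma_join_multicast(id, mcast_addr, NULL);
 + *   ...wait for RDMA_CM_EVENT_MULTICAST_JOIN, exchange datagrams...
 + *   rdma_leave_multicast(id, mcast_addr);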
Users should be aware that messages + * received from the multicast group may still be queued for + * completion processing immediately after leaving a multicast group. + * Destroying an rdma_cm_id will automatically leave all multicast groups. + * See also: + * rdma_join_multicast, rdma_destroy_qp + */ +int rdma_leave_multicast(struct rdma_cm_id *id, struct sockaddr *addr); + +/** + * rdma_get_cm_event - Retrieves the next pending communication event. + * @channel: Event channel to check for events. + * @event: Allocated information about the next communication event. + * Description: + * Retrieves a communication event. If no events are pending, by default, + * the call will block until an event is received. + * Notes: + * The default synchronous behavior of this routine can be changed by + * modifying the file descriptor associated with the given channel. All + * events that are reported must be acknowledged by calling rdma_ack_cm_event. + * Destruction of an rdma_cm_id will block until related events have been + * acknowledged. + * See also: + * rdma_ack_cm_event, rdma_create_event_channel, rdma_event_str + */ +int rdma_get_cm_event(struct rdma_event_channel *channel, + struct rdma_cm_event **event); + +/** + * rdma_ack_cm_event - Free a communication event. + * @event: Event to be released. + * Description: + * All events which are allocated by rdma_get_cm_event must be released; + * there should be a one-to-one correspondence between successful gets + * and acks. + * See also: + * rdma_get_cm_event, rdma_destroy_id + */ +int rdma_ack_cm_event(struct rdma_cm_event *event); + +uint16_t rdma_get_src_port(struct rdma_cm_id *id); +uint16_t rdma_get_dst_port(struct rdma_cm_id *id); + +static inline struct sockaddr *rdma_get_local_addr(struct rdma_cm_id *id) +{ + return &id->route.addr.src_addr; +} + +static inline struct sockaddr *rdma_get_peer_addr(struct rdma_cm_id *id) +{ + return &id->route.addr.dst_addr; +} + +/** + * rdma_get_devices - Get list of RDMA devices currently available. + * @num_devices: If non-NULL, set to the number of devices returned. + * Description: + * Return a NULL-terminated array of opened RDMA devices. Callers can use + * this routine to allocate resources on specific RDMA devices that will be + * shared across multiple rdma_cm_id's. + * Notes: + * The returned array must be released by calling rdma_free_devices. Devices + * remain open while the librdmacm is loaded. + * See also: + * rdma_free_devices + */ +struct ibv_context **rdma_get_devices(int *num_devices); + +/** + * rdma_free_devices - Frees the list of devices returned by rdma_get_devices. + * @list: List of devices returned from rdma_get_devices. + * Description: + * Frees the device array returned by rdma_get_devices. + * See also: + * rdma_get_devices + */ +void rdma_free_devices(struct ibv_context **list); + +/** + * rdma_event_str - Returns a string representation of an rdma cm event. + * @event: Asynchronous event. + * Description: + * Returns a string representation of an asynchronous event. + * See also: + * rdma_get_cm_event + */ +const char *rdma_event_str(enum rdma_cm_event_type event); + +/* Option levels */ +enum { + RDMA_OPTION_ID = 0, + RDMA_OPTION_IB = 1 +}; + +/* Option details */ +enum { + RDMA_OPTION_ID_TOS = 0, /* uint8_t: RFC 2474 */ + RDMA_OPTION_ID_REUSEADDR = 1, /* int: ~SO_REUSEADDR */ + RDMA_OPTION_ID_AFONLY = 2, /* int: ~IPV6_V6ONLY */ + RDMA_OPTION_IB_PATH = 1 /* struct ibv_path_data[] */ +}; + +/** + * rdma_set_option - Set options for an rdma_cm_id.
+ * @id: Communication identifier to set option for. + * @level: Protocol level of the option to set. + * @optname: Name of the option to set. + * @optval: Reference to the option data. + * @optlen: The size of the %optval buffer. + */ +int rdma_set_option(struct rdma_cm_id *id, int level, int optname, + void *optval, size_t optlen); + +/** + * rdma_migrate_id - Move an rdma_cm_id to a new event channel. + * @id: Communication identifier to migrate. + * @channel: New event channel for rdma_cm_id events. + */ +int rdma_migrate_id(struct rdma_cm_id *id, struct rdma_event_channel *channel); + +/** + * rdma_getaddrinfo - RDMA address and route resolution service. + */ +int rdma_getaddrinfo(char *node, char *service, + struct rdma_addrinfo *hints, + struct rdma_addrinfo **res); + +void rdma_freeaddrinfo(struct rdma_addrinfo *res); + +int rdma_addrlen(struct sockaddr *addr); + +#ifdef __cplusplus +} +#endif + +#endif /* RDMA_CMA_H */ diff --git a/prov/rdmacm/include/rdma/rdma_verbs.h b/prov/rdmacm/include/rdma/rdma_verbs.h new file mode 100644 index 00000000000..198c6a595b1 --- /dev/null +++ b/prov/rdmacm/include/rdma/rdma_verbs.h @@ -0,0 +1,316 @@ +/* + * Copyright (c) 2010-2011 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#if !defined(RDMA_VERBS_H) +#define RDMA_VERBS_H + +#include <assert.h> +#include <infiniband/verbs.h> +#include <rdma/rdma_cma.h> +#include <errno.h> + +#ifdef __cplusplus +extern "C" { +#endif + +static inline int rdma_seterrno(int ret) +{ + if (ret) { + errno = ret; + ret = -1; + } + return ret; +} + +/* + * Shared receive queues. + */ +int rdma_create_srq(struct rdma_cm_id *id, struct ibv_pd *pd, + struct ibv_srq_init_attr *attr); + +void rdma_destroy_srq(struct rdma_cm_id *id); + + +/* + * Memory registration helpers. 
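+ *
+ * These inline wrappers call ibv_reg_mr() on the protection domain bound to
+ * the rdma_cm_id (id->pd): rdma_reg_msgs() requests IBV_ACCESS_LOCAL_WRITE
+ * only (send/receive buffers), while rdma_reg_read() and rdma_reg_write()
+ * additionally grant the remote peer read or write access.  rdma_dereg_mr()
+ * releases the registration, mapping any error to -1 with errno set.
+ *
+ * Illustrative sketch, not part of the API contract; the buffer name below
+ * is hypothetical:
+ *
+ *	struct ibv_mr *mr = rdma_reg_msgs(id, buf, sizeof(buf));
+ *	if (!mr)
+ *		return -1;			errno set by ibv_reg_mr()
+ *	rdma_post_recv(id, NULL, buf, sizeof(buf), mr);
+ *	...
+ *	rdma_dereg_mr(mr);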
+ */ +static inline struct ibv_mr * +rdma_reg_msgs(struct rdma_cm_id *id, void *addr, size_t length) +{ + return ibv_reg_mr(id->pd, addr, length, IBV_ACCESS_LOCAL_WRITE); +} + +static inline struct ibv_mr * +rdma_reg_read(struct rdma_cm_id *id, void *addr, size_t length) +{ + return ibv_reg_mr(id->pd, addr, length, IBV_ACCESS_LOCAL_WRITE | + IBV_ACCESS_REMOTE_READ); +} + +static inline struct ibv_mr * +rdma_reg_write(struct rdma_cm_id *id, void *addr, size_t length) +{ + return ibv_reg_mr(id->pd, addr, length, IBV_ACCESS_LOCAL_WRITE | + IBV_ACCESS_REMOTE_WRITE); +} + +static inline int +rdma_dereg_mr(struct ibv_mr *mr) +{ + return rdma_seterrno(ibv_dereg_mr(mr)); +} + + +/* + * Vectored send, receive, and RDMA operations. + * Support multiple scatter-gather entries. + */ +static inline int +rdma_post_recvv(struct rdma_cm_id *id, void *context, struct ibv_sge *sgl, + int nsge) +{ + struct ibv_recv_wr wr, *bad; + + wr.wr_id = (uintptr_t) context; + wr.next = NULL; + wr.sg_list = sgl; + wr.num_sge = nsge; + + if (id->srq) + return rdma_seterrno(ibv_post_srq_recv(id->srq, &wr, &bad)); + else + return rdma_seterrno(ibv_post_recv(id->qp, &wr, &bad)); +} + +static inline int +rdma_post_sendv(struct rdma_cm_id *id, void *context, struct ibv_sge *sgl, + int nsge, int flags) +{ + struct ibv_send_wr wr, *bad; + + wr.wr_id = (uintptr_t) context; + wr.next = NULL; + wr.sg_list = sgl; + wr.num_sge = nsge; + wr.opcode = IBV_WR_SEND; + wr.send_flags = flags; + + return rdma_seterrno(ibv_post_send(id->qp, &wr, &bad)); +} + +static inline int +rdma_post_readv(struct rdma_cm_id *id, void *context, struct ibv_sge *sgl, + int nsge, int flags, uint64_t remote_addr, uint32_t rkey) +{ + struct ibv_send_wr wr, *bad; + + wr.wr_id = (uintptr_t) context; + wr.next = NULL; + wr.sg_list = sgl; + wr.num_sge = nsge; + wr.opcode = IBV_WR_RDMA_READ; + wr.send_flags = flags; + wr.wr.rdma.remote_addr = remote_addr; + wr.wr.rdma.rkey = rkey; + + return rdma_seterrno(ibv_post_send(id->qp, &wr, &bad)); +} + +static inline int +rdma_post_writev(struct rdma_cm_id *id, void *context, struct ibv_sge *sgl, + int nsge, int flags, uint64_t remote_addr, uint32_t rkey) +{ + struct ibv_send_wr wr, *bad; + + wr.wr_id = (uintptr_t) context; + wr.next = NULL; + wr.sg_list = sgl; + wr.num_sge = nsge; + wr.opcode = IBV_WR_RDMA_WRITE; + wr.send_flags = flags; + wr.wr.rdma.remote_addr = remote_addr; + wr.wr.rdma.rkey = rkey; + + return rdma_seterrno(ibv_post_send(id->qp, &wr, &bad)); +} + +/* + * Simple send, receive, and RDMA calls. + */ +static inline int +rdma_post_recv(struct rdma_cm_id *id, void *context, void *addr, + size_t length, struct ibv_mr *mr) +{ + struct ibv_sge sge; + + assert((addr >= mr->addr) && + (((uint8_t *) addr + length) <= ((uint8_t *) mr->addr + mr->length))); + sge.addr = (uint64_t) (uintptr_t) addr; + sge.length = (uint32_t) length; + sge.lkey = mr->lkey; + + return rdma_post_recvv(id, context, &sge, 1); +} + +static inline int +rdma_post_send(struct rdma_cm_id *id, void *context, void *addr, + size_t length, struct ibv_mr *mr, int flags) +{ + struct ibv_sge sge; + + sge.addr = (uint64_t) (uintptr_t) addr; + sge.length = (uint32_t) length; + sge.lkey = mr ? 
mr->lkey : 0; + + return rdma_post_sendv(id, context, &sge, 1, flags); +} + +static inline int +rdma_post_read(struct rdma_cm_id *id, void *context, void *addr, + size_t length, struct ibv_mr *mr, int flags, + uint64_t remote_addr, uint32_t rkey) +{ + struct ibv_sge sge; + + sge.addr = (uint64_t) (uintptr_t) addr; + sge.length = (uint32_t) length; + sge.lkey = mr->lkey; + + return rdma_post_readv(id, context, &sge, 1, flags, remote_addr, rkey); +} + +static inline int +rdma_post_write(struct rdma_cm_id *id, void *context, void *addr, + size_t length, struct ibv_mr *mr, int flags, + uint64_t remote_addr, uint32_t rkey) +{ + struct ibv_sge sge; + + sge.addr = (uint64_t) (uintptr_t) addr; + sge.length = (uint32_t) length; + sge.lkey = mr ? mr->lkey : 0; + + return rdma_post_writev(id, context, &sge, 1, flags, remote_addr, rkey); +} + +static inline int +rdma_post_ud_send(struct rdma_cm_id *id, void *context, void *addr, + size_t length, struct ibv_mr *mr, int flags, + struct ibv_ah *ah, uint32_t remote_qpn) +{ + struct ibv_send_wr wr, *bad; + struct ibv_sge sge; + + sge.addr = (uint64_t) (uintptr_t) addr; + sge.length = (uint32_t) length; + sge.lkey = mr ? mr->lkey : 0; + + wr.wr_id = (uintptr_t) context; + wr.next = NULL; + wr.sg_list = &sge; + wr.num_sge = 1; + wr.opcode = IBV_WR_SEND; + wr.send_flags = flags; + wr.wr.ud.ah = ah; + wr.wr.ud.remote_qpn = remote_qpn; + wr.wr.ud.remote_qkey = RDMA_UDP_QKEY; + + return rdma_seterrno(ibv_post_send(id->qp, &wr, &bad)); +} + +static inline int +rdma_get_send_comp(struct rdma_cm_id *id, struct ibv_wc *wc) +{ + struct ibv_cq *cq; + void *context; + int ret; + + do { + ret = ibv_poll_cq(id->send_cq, 1, wc); + if (ret) + break; + + ret = ibv_req_notify_cq(id->send_cq, 0); + if (ret) + return rdma_seterrno(ret); + + ret = ibv_poll_cq(id->send_cq, 1, wc); + if (ret) + break; + + ret = ibv_get_cq_event(id->send_cq_channel, &cq, &context); + if (ret) + return ret; + + assert(cq == id->send_cq && context == id); + ibv_ack_cq_events(id->send_cq, 1); + } while (1); + + return (ret < 0) ? rdma_seterrno(ret) : ret; +} + +static inline int +rdma_get_recv_comp(struct rdma_cm_id *id, struct ibv_wc *wc) +{ + struct ibv_cq *cq; + void *context; + int ret; + + do { + ret = ibv_poll_cq(id->recv_cq, 1, wc); + if (ret) + break; + + ret = ibv_req_notify_cq(id->recv_cq, 0); + if (ret) + return rdma_seterrno(ret); + + ret = ibv_poll_cq(id->recv_cq, 1, wc); + if (ret) + break; + + ret = ibv_get_cq_event(id->recv_cq_channel, &cq, &context); + if (ret) + return ret; + + assert(cq == id->recv_cq && context == id); + ibv_ack_cq_events(id->recv_cq, 1); + } while (1); + + return (ret < 0) ? rdma_seterrno(ret) : ret; +} + +#ifdef __cplusplus +} +#endif + +#endif /* RDMA_CMA_H */ diff --git a/prov/rdmacm/include/rdma/rsocket.h b/prov/rdmacm/include/rdma/rsocket.h new file mode 100644 index 00000000000..efd0db58bf9 --- /dev/null +++ b/prov/rdmacm/include/rdma/rsocket.h @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2011-2012 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#if !defined(RSOCKET_H) +#define RSOCKET_H + +#include <infiniband/verbs.h> +#include <rdma/rdma_cma.h> +#include <sys/socket.h> +#include <errno.h> +#include <poll.h> +#include <sys/select.h> +#include <sys/mman.h> + +#ifdef __cplusplus +extern "C" { +#endif + +int rsocket(int domain, int type, int protocol); +int rbind(int socket, const struct sockaddr *addr, socklen_t addrlen); +int rlisten(int socket, int backlog); +int raccept(int socket, struct sockaddr *addr, socklen_t *addrlen); +int rconnect(int socket, const struct sockaddr *addr, socklen_t addrlen); +int rshutdown(int socket, int how); +int rclose(int socket); + +ssize_t rrecv(int socket, void *buf, size_t len, int flags); +ssize_t rrecvfrom(int socket, void *buf, size_t len, int flags, + struct sockaddr *src_addr, socklen_t *addrlen); +ssize_t rrecvmsg(int socket, struct msghdr *msg, int flags); +ssize_t rsend(int socket, const void *buf, size_t len, int flags); +ssize_t rsendto(int socket, const void *buf, size_t len, int flags, + const struct sockaddr *dest_addr, socklen_t addrlen); +ssize_t rsendmsg(int socket, const struct msghdr *msg, int flags); +ssize_t rread(int socket, void *buf, size_t count); +ssize_t rreadv(int socket, const struct iovec *iov, int iovcnt); +ssize_t rwrite(int socket, const void *buf, size_t count); +ssize_t rwritev(int socket, const struct iovec *iov, int iovcnt); + +int rpoll(struct pollfd *fds, nfds_t nfds, int timeout); +int rselect(int nfds, fd_set *readfds, fd_set *writefds, + fd_set *exceptfds, struct timeval *timeout); + +int rgetpeername(int socket, struct sockaddr *addr, socklen_t *addrlen); +int rgetsockname(int socket, struct sockaddr *addr, socklen_t *addrlen); + +#define SOL_RDMA 0x10000 +enum { + RDMA_SQSIZE, + RDMA_RQSIZE, + RDMA_INLINE, + RDMA_IOMAPSIZE, + RDMA_ROUTE +}; + +int rsetsockopt(int socket, int level, int optname, + const void *optval, socklen_t optlen); +int rgetsockopt(int socket, int level, int optname, + void *optval, socklen_t *optlen); +int rfcntl(int socket, int cmd, ... 
/* arg */ ); + +off_t riomap(int socket, void *buf, size_t len, int prot, int flags, off_t offset); +int riounmap(int socket, void *buf, size_t len); +size_t riowrite(int socket, const void *buf, size_t count, off_t offset, int flags); + +#ifdef __cplusplus +} +#endif + +#endif /* RSOCKET_H */ diff --git a/prov/rdmacm/src/acm.c b/prov/rdmacm/src/acm.c new file mode 100644 index 00000000000..3dc26bfa984 --- /dev/null +++ b/prov/rdmacm/src/acm.c @@ -0,0 +1,439 @@ +/* + * Copyright (c) 2010-2012 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#if HAVE_CONFIG_H +# include <config.h> +#endif /* HAVE_CONFIG_H */ + +#include <stdio.h> +#include <inttypes.h> +#include <sys/types.h> +#include <sys/socket.h> +#include <netdb.h> +#include <unistd.h> + +#include "cma.h" +#include <rdma/rdma_cma.h> +#include <infiniband/ib.h> +#include <rdma/fi_ucma.h> + +#define ACM_VERSION 1 + +#define ACM_OP_RESOLVE 0x01 +#define ACM_OP_ACK 0x80 + +#define ACM_STATUS_SUCCESS 0 +#define ACM_STATUS_ENOMEM 1 +#define ACM_STATUS_EINVAL 2 +#define ACM_STATUS_ENODATA 3 +#define ACM_STATUS_ENOTCONN 5 +#define ACM_STATUS_ETIMEDOUT 6 +#define ACM_STATUS_ESRCADDR 7 +#define ACM_STATUS_ESRCTYPE 8 +#define ACM_STATUS_EDESTADDR 9 +#define ACM_STATUS_EDESTTYPE 10 + +#define ACM_FLAGS_NODELAY (1<<30) + +#define ACM_MSG_HDR_LENGTH 16 +#define ACM_MAX_ADDRESS 64 +#define ACM_MSG_EP_LENGTH 72 +#define ACM_MSG_DATA_LENGTH (ACM_MSG_EP_LENGTH * 8) + +struct acm_hdr { + uint8_t version; + uint8_t opcode; + uint8_t status; + uint8_t data[3]; + uint16_t length; + uint64_t tid; +}; + +#define ACM_EP_INFO_NAME 0x0001 +#define ACM_EP_INFO_ADDRESS_IP 0x0002 +#define ACM_EP_INFO_ADDRESS_IP6 0x0003 +#define ACM_EP_INFO_PATH 0x0010 + +union acm_ep_info { + uint8_t addr[ACM_MAX_ADDRESS]; + uint8_t name[ACM_MAX_ADDRESS]; + struct ibv_path_record path; +}; + +#define ACM_EP_FLAG_SOURCE (1<<0) +#define ACM_EP_FLAG_DEST (1<<1) + +struct acm_ep_addr_data { + uint32_t flags; + uint16_t type; + uint16_t reserved; + union acm_ep_info info; +}; + +struct acm_resolve_msg { + struct acm_hdr hdr; + struct acm_ep_addr_data data[0]; +}; + +struct acm_msg { + struct acm_hdr hdr; + union{ + uint8_t data[ACM_MSG_DATA_LENGTH]; + struct acm_ep_addr_data resolve_data[0]; + }; +}; + +static pthread_mutex_t acm_lock = PTHREAD_MUTEX_INITIALIZER; +static int sock = -1; +static uint16_t server_port; + +static int ucma_set_server_port(void) +{ + FILE *f; + + if ((f = fopen("/var/run/ibacm.port", "r"))) { + fscanf(f, "%" SCNu16, &server_port); + fclose(f); + } + return server_port; +} + +void ucma_ib_init(void) +{ + struct sockaddr_in addr; + static int init; + int ret; + + if (init) + return; + + pthread_mutex_lock(&acm_lock); + if (!ucma_set_server_port()) + goto out; + + sock = socket(AF_INET, SOCK_STREAM | SOCK_CLOEXEC, IPPROTO_TCP); + if (sock < 0) + goto out; + + memset(&addr, 0, sizeof addr); + addr.sin_family = AF_INET; + addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK); + addr.sin_port = htons(server_port); + ret = connect(sock, (struct sockaddr *) &addr, sizeof(addr)); + if (ret) { + close(sock); + sock = -1; + } +out: + init = 1; + pthread_mutex_unlock(&acm_lock); +} + +void ucma_ib_cleanup(void) +{ + if (sock >= 0) { + shutdown(sock, SHUT_RDWR); + close(sock); + } +} + +static int ucma_ib_set_addr(struct rdma_addrinfo *ib_rai, + struct rdma_addrinfo *rai) +{ + struct sockaddr_ib *src, *dst; + struct ibv_path_record *path; + + src = calloc(1, sizeof *src); + if (!src) + return ERR(ENOMEM); + + dst = calloc(1, sizeof *dst); + if (!dst) { + free(src); + return ERR(ENOMEM); + } + + path = &((struct ibv_path_data *) ib_rai->ai_route)->path; + + src->sib_family = AF_IB; + src->sib_pkey = path->pkey; + src->sib_flowinfo = htonl(ntohl(path->flowlabel_hoplimit) >> 8); + memcpy(&src->sib_addr, &path->sgid, 16); + ucma_set_sid(ib_rai->ai_port_space, rai->ai_src_addr, src); + + dst->sib_family = AF_IB; + dst->sib_pkey = path->pkey; + dst->sib_flowinfo = htonl(ntohl(path->flowlabel_hoplimit) >> 8); + memcpy(&dst->sib_addr, &path->dgid, 16); + ucma_set_sid(ib_rai->ai_port_space, rai->ai_dst_addr, dst); + + 
ib_rai->ai_src_addr = (struct sockaddr *) src; + ib_rai->ai_src_len = sizeof(*src); + + ib_rai->ai_dst_addr = (struct sockaddr *) dst; + ib_rai->ai_dst_len = sizeof(*dst); + + return 0; +} + +static int ucma_ib_set_connect(struct rdma_addrinfo *ib_rai, + struct rdma_addrinfo *rai) +{ + struct ib_connect_hdr *hdr; + + if (rai->ai_family == AF_IB) + return 0; + + hdr = calloc(1, sizeof *hdr); + if (!hdr) + return ERR(ENOMEM); + + if (rai->ai_family == AF_INET) { + hdr->ip_version = 4 << 4; + memcpy(&hdr->cma_src_ip4, + &((struct sockaddr_in *) rai->ai_src_addr)->sin_addr, 4); + memcpy(&hdr->cma_dst_ip4, + &((struct sockaddr_in *) rai->ai_dst_addr)->sin_addr, 4); + } else { + hdr->ip_version = 6 << 4; + memcpy(&hdr->cma_src_ip6, + &((struct sockaddr_in6 *) rai->ai_src_addr)->sin6_addr, 16); + memcpy(&hdr->cma_dst_ip6, + &((struct sockaddr_in6 *) rai->ai_dst_addr)->sin6_addr, 16); + } + + ib_rai->ai_connect = hdr; + ib_rai->ai_connect_len = sizeof(*hdr); + return 0; +} + +static void ucma_resolve_af_ib(struct rdma_addrinfo **rai) +{ + struct rdma_addrinfo *ib_rai; + + ib_rai = calloc(1, sizeof(*ib_rai)); + if (!ib_rai) + return; + + ib_rai->ai_flags = (*rai)->ai_flags; + ib_rai->ai_family = AF_IB; + ib_rai->ai_qp_type = (*rai)->ai_qp_type; + ib_rai->ai_port_space = (*rai)->ai_port_space; + + ib_rai->ai_route = calloc(1, (*rai)->ai_route_len); + if (!ib_rai->ai_route) + goto err; + + memcpy(ib_rai->ai_route, (*rai)->ai_route, (*rai)->ai_route_len); + ib_rai->ai_route_len = (*rai)->ai_route_len; + + if ((*rai)->ai_src_canonname) { + ib_rai->ai_src_canonname = strdup((*rai)->ai_src_canonname); + if (!ib_rai->ai_src_canonname) + goto err; + } + + if ((*rai)->ai_dst_canonname) { + ib_rai->ai_dst_canonname = strdup((*rai)->ai_dst_canonname); + if (!ib_rai->ai_dst_canonname) + goto err; + } + + if (ucma_ib_set_connect(ib_rai, *rai)) + goto err; + + if (ucma_ib_set_addr(ib_rai, *rai)) + goto err; + + ib_rai->ai_next = *rai; + *rai = ib_rai; + return; + +err: + rdma_freeaddrinfo(ib_rai); +} + +static void ucma_ib_save_resp(struct rdma_addrinfo *rai, struct acm_msg *msg) +{ + struct acm_ep_addr_data *ep_data; + struct ibv_path_data *path_data = NULL; + struct sockaddr_in *sin; + struct sockaddr_in6 *sin6; + int i, cnt, path_cnt = 0; + + cnt = (msg->hdr.length - ACM_MSG_HDR_LENGTH) / ACM_MSG_EP_LENGTH; + for (i = 0; i < cnt; i++) { + ep_data = &msg->resolve_data[i]; + switch (ep_data->type) { + case ACM_EP_INFO_PATH: + ep_data->type = 0; + if (!path_data) + path_data = (struct ibv_path_data *) ep_data; + path_cnt++; + break; + case ACM_EP_INFO_ADDRESS_IP: + if (!(ep_data->flags & ACM_EP_FLAG_SOURCE) || rai->ai_src_len) + break; + + sin = calloc(1, sizeof(*sin)); + if (!sin) + break; + + sin->sin_family = AF_INET; + memcpy(&sin->sin_addr, &ep_data->info.addr, 4); + rai->ai_src_len = sizeof(*sin); + rai->ai_src_addr = (struct sockaddr *) sin; + break; + case ACM_EP_INFO_ADDRESS_IP6: + if (!(ep_data->flags & ACM_EP_FLAG_SOURCE) || rai->ai_src_len) + break; + + sin6 = calloc(1, sizeof(*sin6)); + if (!sin6) + break; + + sin6->sin6_family = AF_INET6; + memcpy(&sin6->sin6_addr, &ep_data->info.addr, 16); + rai->ai_src_len = sizeof(*sin6); + rai->ai_src_addr = (struct sockaddr *) sin6; + break; + default: + break; + } + } + + rai->ai_route = calloc(path_cnt, sizeof(*path_data)); + if (rai->ai_route) { + memcpy(rai->ai_route, path_data, path_cnt * sizeof(*path_data)); + rai->ai_route_len = path_cnt * sizeof(*path_data); + } +} + +static void ucma_set_ep_addr(struct acm_ep_addr_data *data, struct sockaddr *addr) +{ 
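+	/*
+	 * Copy the caller's IPv4 or IPv6 address into the ACM endpoint
+	 * record and tag it with the matching ACM_EP_INFO_* type; the
+	 * source/destination flags are filled in by the caller
+	 * (ucma_ib_resolve) before this helper is invoked.
+	 */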
+ if (addr->sa_family == AF_INET) { + data->type = ACM_EP_INFO_ADDRESS_IP; + memcpy(data->info.addr, &((struct sockaddr_in *) addr)->sin_addr, 4); + } else { + data->type = ACM_EP_INFO_ADDRESS_IP6; + memcpy(data->info.addr, &((struct sockaddr_in6 *) addr)->sin6_addr, 16); + } +} + +static int ucma_inet_addr(struct sockaddr *addr, socklen_t len) +{ + return len && addr && (addr->sa_family == AF_INET || + addr->sa_family == AF_INET6); +} + +static int ucma_ib_addr(struct sockaddr *addr, socklen_t len) +{ + return len && addr && (addr->sa_family == AF_IB); +} + +void ucma_ib_resolve(struct rdma_addrinfo **rai, struct rdma_addrinfo *hints) +{ + struct acm_msg msg; + struct acm_ep_addr_data *data; + int ret; + + ucma_ib_init(); + if (sock < 0) + return; + + memset(&msg, 0, sizeof msg); + msg.hdr.version = ACM_VERSION; + msg.hdr.opcode = ACM_OP_RESOLVE; + msg.hdr.length = ACM_MSG_HDR_LENGTH; + + data = &msg.resolve_data[0]; + if (ucma_inet_addr((*rai)->ai_src_addr, (*rai)->ai_src_len)) { + data->flags = ACM_EP_FLAG_SOURCE; + ucma_set_ep_addr(data, (*rai)->ai_src_addr); + data++; + msg.hdr.length += ACM_MSG_EP_LENGTH; + } + + if (ucma_inet_addr((*rai)->ai_dst_addr, (*rai)->ai_dst_len)) { + data->flags = ACM_EP_FLAG_DEST; + if (hints->ai_flags & (RAI_NUMERICHOST | RAI_NOROUTE)) + data->flags |= ACM_FLAGS_NODELAY; + ucma_set_ep_addr(data, (*rai)->ai_dst_addr); + data++; + msg.hdr.length += ACM_MSG_EP_LENGTH; + } + + if (hints->ai_route_len || + ucma_ib_addr((*rai)->ai_src_addr, (*rai)->ai_src_len) || + ucma_ib_addr((*rai)->ai_dst_addr, (*rai)->ai_dst_len)) { + struct ibv_path_record *path; + + if (hints->ai_route_len == sizeof(struct ibv_path_record)) + path = (struct ibv_path_record *) hints->ai_route; + else if (hints->ai_route_len == sizeof(struct ibv_path_data)) + path = &((struct ibv_path_data *) hints->ai_route)->path; + else + path = NULL; + + if (path) + memcpy(&data->info.path, path, sizeof(*path)); + + if (ucma_ib_addr((*rai)->ai_src_addr, (*rai)->ai_src_len)) { + memcpy(&data->info.path.sgid, + &((struct sockaddr_ib *) (*rai)->ai_src_addr)->sib_addr, 16); + } + if (ucma_ib_addr((*rai)->ai_dst_addr, (*rai)->ai_dst_len)) { + memcpy(&data->info.path.dgid, + &((struct sockaddr_ib *) (*rai)->ai_dst_addr)->sib_addr, 16); + } + data->type = ACM_EP_INFO_PATH; + data++; + msg.hdr.length += ACM_MSG_EP_LENGTH; + } + + pthread_mutex_lock(&acm_lock); + ret = send(sock, (char *) &msg, msg.hdr.length, 0); + if (ret != msg.hdr.length) { + pthread_mutex_unlock(&acm_lock); + return; + } + + ret = recv(sock, (char *) &msg, sizeof msg, 0); + pthread_mutex_unlock(&acm_lock); + if (ret < ACM_MSG_HDR_LENGTH || ret != msg.hdr.length || msg.hdr.status) + return; + + ucma_ib_save_resp(*rai, &msg); + + if (af_ib_support && !(hints->ai_flags & RAI_ROUTEONLY) && (*rai)->ai_route_len) + ucma_resolve_af_ib(rai); +} diff --git a/prov/rdmacm/src/addrinfo.c b/prov/rdmacm/src/addrinfo.c new file mode 100644 index 00000000000..68eaddd3497 --- /dev/null +++ b/prov/rdmacm/src/addrinfo.c @@ -0,0 +1,327 @@ +/* + * Copyright (c) 2010 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * $Id: cm.c 3453 2005-09-15 21:43:21Z sean.hefty $ + */ + +#if HAVE_CONFIG_H +# include <config.h> +#endif /* HAVE_CONFIG_H */ + +#include <sys/types.h> +#include <sys/socket.h> +#include <netdb.h> +#include <unistd.h> + +#include "cma.h" +#include <rdma/rdma_cma.h> +#include <infiniband/ib.h> + +#ifdef IBV_XRC_OPS +#define RDMA_QPT_XRC_SEND IBV_QPT_XRC_SEND +#define RDMA_QPT_XRC_RECV IBV_QPT_XRC_RECV +#else +#define RDMA_QPT_XRC_SEND 9 +#define RDMA_QPT_XRC_RECV 10 +#endif + +struct rdma_addrinfo nohints; + +static void ucma_convert_to_ai(struct addrinfo *ai, struct rdma_addrinfo *rai) +{ + memset(ai, 0, sizeof *ai); + if (rai->ai_flags & RAI_PASSIVE) + ai->ai_flags = AI_PASSIVE; + if (rai->ai_flags & RAI_NUMERICHOST) + ai->ai_flags |= AI_NUMERICHOST; + if (rai->ai_family != AF_IB) + ai->ai_family = rai->ai_family; + + switch (rai->ai_qp_type) { + case IBV_QPT_RC: + case IBV_QPT_UC: + case RDMA_QPT_XRC_SEND: + case RDMA_QPT_XRC_RECV: + ai->ai_socktype = SOCK_STREAM; + break; + case IBV_QPT_UD: + ai->ai_socktype = SOCK_DGRAM; + break; + } + + switch (rai->ai_port_space) { + case RDMA_PS_TCP: + ai->ai_protocol = IPPROTO_TCP; + break; + case RDMA_PS_IPOIB: + case RDMA_PS_UDP: + ai->ai_protocol = IPPROTO_UDP; + break; + case RDMA_PS_IB: + if (ai->ai_socktype == SOCK_STREAM) + ai->ai_protocol = IPPROTO_TCP; + else if (ai->ai_socktype == SOCK_DGRAM) + ai->ai_protocol = IPPROTO_UDP; + break; + } + + if (rai->ai_flags & RAI_PASSIVE) { + ai->ai_addrlen = rai->ai_src_len; + ai->ai_addr = rai->ai_src_addr; + } else { + ai->ai_addrlen = rai->ai_dst_len; + ai->ai_addr = rai->ai_dst_addr; + } + ai->ai_canonname = rai->ai_dst_canonname; + ai->ai_next = NULL; +} + +static int ucma_copy_addr(struct sockaddr **dst, socklen_t *dst_len, + struct sockaddr *src, socklen_t src_len) +{ + *dst = malloc(src_len); + if (!(*dst)) + return ERR(ENOMEM); + + memcpy(*dst, src, src_len); + *dst_len = src_len; + return 0; +} + +void ucma_set_sid(enum rdma_port_space ps, struct sockaddr *addr, + struct sockaddr_ib *sib) +{ + uint16_t port; + + port = addr ? 
ucma_get_port(addr) : 0; + sib->sib_sid = htonll(((uint64_t) ps << 16) + ntohs(port)); + + if (ps) + sib->sib_sid_mask = htonll(RDMA_IB_IP_PS_MASK); + if (port) + sib->sib_sid_mask |= htonll(RDMA_IB_IP_PORT_MASK); +} + +static int ucma_convert_in6(int ps, struct sockaddr_ib **dst, socklen_t *dst_len, + struct sockaddr_in6 *src, socklen_t src_len) +{ + *dst = calloc(1, sizeof(struct sockaddr_ib)); + if (!(*dst)) + return ERR(ENOMEM); + + (*dst)->sib_family = AF_IB; + (*dst)->sib_pkey = 0xFFFF; + (*dst)->sib_flowinfo = src->sin6_flowinfo; + ib_addr_set(&(*dst)->sib_addr, src->sin6_addr.s6_addr32[0], + src->sin6_addr.s6_addr32[1], src->sin6_addr.s6_addr32[2], + src->sin6_addr.s6_addr32[3]); + ucma_set_sid(ps, (struct sockaddr *) src, *dst); + (*dst)->sib_scope_id = src->sin6_scope_id; + + *dst_len = sizeof(struct sockaddr_ib); + return 0; +} + +static int ucma_convert_to_rai(struct rdma_addrinfo *rai, + struct rdma_addrinfo *hints, struct addrinfo *ai) +{ + int ret; + + if (hints->ai_qp_type) { + rai->ai_qp_type = hints->ai_qp_type; + } else { + switch (ai->ai_socktype) { + case SOCK_STREAM: + rai->ai_qp_type = IBV_QPT_RC; + break; + case SOCK_DGRAM: + rai->ai_qp_type = IBV_QPT_UD; + break; + } + } + + if (hints->ai_port_space) { + rai->ai_port_space = hints->ai_port_space; + } else { + switch (ai->ai_protocol) { + case IPPROTO_TCP: + rai->ai_port_space = RDMA_PS_TCP; + break; + case IPPROTO_UDP: + rai->ai_port_space = RDMA_PS_UDP; + break; + } + } + + if (ai->ai_flags & AI_PASSIVE) { + rai->ai_flags = RAI_PASSIVE; + if (ai->ai_canonname) + rai->ai_src_canonname = strdup(ai->ai_canonname); + + if ((hints->ai_flags & RAI_FAMILY) && (hints->ai_family == AF_IB) && + (hints->ai_flags & RAI_NUMERICHOST)) { + rai->ai_family = AF_IB; + ret = ucma_convert_in6(rai->ai_port_space, + (struct sockaddr_ib **) &rai->ai_src_addr, + &rai->ai_src_len, + (struct sockaddr_in6 *) ai->ai_addr, + ai->ai_addrlen); + } else { + rai->ai_family = ai->ai_family; + ret = ucma_copy_addr(&rai->ai_src_addr, &rai->ai_src_len, + ai->ai_addr, ai->ai_addrlen); + } + } else { + if (ai->ai_canonname) + rai->ai_dst_canonname = strdup(ai->ai_canonname); + + if ((hints->ai_flags & RAI_FAMILY) && (hints->ai_family == AF_IB) && + (hints->ai_flags & RAI_NUMERICHOST)) { + rai->ai_family = AF_IB; + ret = ucma_convert_in6(rai->ai_port_space, + (struct sockaddr_ib **) &rai->ai_dst_addr, + &rai->ai_dst_len, + (struct sockaddr_in6 *) ai->ai_addr, + ai->ai_addrlen); + } else { + rai->ai_family = ai->ai_family; + ret = ucma_copy_addr(&rai->ai_dst_addr, &rai->ai_dst_len, + ai->ai_addr, ai->ai_addrlen); + } + } + return ret; +} + +static int ucma_getaddrinfo(char *node, char *service, + struct rdma_addrinfo *hints, + struct rdma_addrinfo *rai) +{ + struct addrinfo ai_hints; + struct addrinfo *ai; + int ret; + + if (hints != &nohints) { + ucma_convert_to_ai(&ai_hints, hints); + ret = getaddrinfo(node, service, &ai_hints, &ai); + } else { + ret = getaddrinfo(node, service, NULL, &ai); + } + if (ret) + return ret; + + ret = ucma_convert_to_rai(rai, hints, ai); + freeaddrinfo(ai); + return ret; +} + +int rdma_getaddrinfo(char *node, char *service, + struct rdma_addrinfo *hints, + struct rdma_addrinfo **res) +{ + struct rdma_addrinfo *rai; + int ret; + + if (!service && !node && !hints) + return ERR(EINVAL); + + ret = ucma_init(); + if (ret) + return ret; + + rai = calloc(1, sizeof(*rai)); + if (!rai) + return ERR(ENOMEM); + + if (!hints) + hints = &nohints; + + if (node || service) { + ret = ucma_getaddrinfo(node, service, hints, rai); + } else { + 
rai->ai_flags = hints->ai_flags; + rai->ai_family = hints->ai_family; + rai->ai_qp_type = hints->ai_qp_type; + rai->ai_port_space = hints->ai_port_space; + if (hints->ai_dst_len) { + ret = ucma_copy_addr(&rai->ai_dst_addr, &rai->ai_dst_len, + hints->ai_dst_addr, hints->ai_dst_len); + } + } + if (ret) + goto err; + + if (!rai->ai_src_len && hints->ai_src_len) { + ret = ucma_copy_addr(&rai->ai_src_addr, &rai->ai_src_len, + hints->ai_src_addr, hints->ai_src_len); + if (ret) + goto err; + } + + if (!(rai->ai_flags & RAI_PASSIVE)) + ucma_ib_resolve(&rai, hints); + + *res = rai; + return 0; + +err: + rdma_freeaddrinfo(rai); + return ret; +} + +void rdma_freeaddrinfo(struct rdma_addrinfo *res) +{ + struct rdma_addrinfo *rai; + + while (res) { + rai = res; + res = res->ai_next; + + if (rai->ai_connect) + free(rai->ai_connect); + + if (rai->ai_route) + free(rai->ai_route); + + if (rai->ai_src_canonname) + free(rai->ai_src_canonname); + + if (rai->ai_dst_canonname) + free(rai->ai_dst_canonname); + + if (rai->ai_src_addr) + free(rai->ai_src_addr); + + if (rai->ai_dst_addr) + free(rai->ai_dst_addr); + + free(rai); + } +} diff --git a/prov/rdmacm/src/cma.c b/prov/rdmacm/src/cma.c new file mode 100644 index 00000000000..b79f73c3397 --- /dev/null +++ b/prov/rdmacm/src/cma.c @@ -0,0 +1,2210 @@ +/* + * Copyright (c) 2005-2012 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#if HAVE_CONFIG_H +# include <config.h> +#endif /* HAVE_CONFIG_H */ + +#include <stdlib.h> +#include <string.h> +#include <glob.h> +#include <stdio.h> +#include <fcntl.h> +#include <errno.h> +#include <stdint.h> +#include <poll.h> +#include <unistd.h> +#include <pthread.h> +#include <endian.h> +#include <byteswap.h> +#include <stddef.h> +#include <netdb.h> +#include <syslog.h> + +#include "cma.h" +#include "indexer.h" +#include <infiniband/driver.h> +#include <infiniband/marshall.h> +#include <rdma/rdma_cma.h> +#include <rdma/rdma_verbs.h> +#include <infiniband/ib.h> +#include <fi.h> +#include <rdma/fi_ucma.h> + + +#define CMA_INIT_CMD(req, req_size, op) \ + memset(req, 0, req_size) +#define CMA_INIT_CMD_RESP(req, req_size, op, resp, resp_size) \ + memset(req, 0, req_size) + +struct cma_device { + struct ibv_context *verbs; + struct ibv_pd *pd; + uint64_t guid; + int port_cnt; + int refcnt; + int max_qpsize; + uint8_t max_initiator_depth; + uint8_t max_responder_resources; +}; + +struct cma_id_private { + struct rdma_cm_id id; + struct cma_device *cma_dev; + void *connect; + size_t connect_len; + int events_completed; + int connect_error; + int sync; + pthread_cond_t cond; + pthread_mutex_t mut; + uint32_t handle; + struct cma_multicast *mc_list; + struct ibv_qp_init_attr *qp_init_attr; + uint8_t initiator_depth; + uint8_t responder_resources; +}; + +struct cma_multicast { + struct cma_multicast *next; + struct cma_id_private *id_priv; + void *context; + int events_completed; + pthread_cond_t cond; + uint32_t handle; + union ibv_gid mgid; + uint16_t mlid; + struct sockaddr_storage addr; +}; + +struct cma_event { + struct rdma_cm_event event; + uint8_t private_data[RDMA_MAX_PRIVATE_DATA]; + struct cma_id_private *id_priv; + struct cma_multicast *mc; +}; + +static struct cma_device *cma_dev_array; +static int cma_dev_cnt; +static pthread_mutex_t mut = PTHREAD_MUTEX_INITIALIZER; +int af_ib_support; +static struct index_map ucma_idm; +static fastlock_t idm_lock; + +static void ucma_cleanup(void) +{ + ucma_ib_cleanup(); + + if (cma_dev_cnt) { + while (cma_dev_cnt--) { + if (cma_dev_array[cma_dev_cnt].refcnt) + ibv_dealloc_pd(cma_dev_array[cma_dev_cnt].pd); + ibv_close_device(cma_dev_array[cma_dev_cnt].verbs); + } + + fastlock_destroy(&idm_lock); + free(cma_dev_array); + cma_dev_cnt = 0; + } +} + +/* + * This function is called holding the mutex lock + * cma_dev_cnt must be set before calling this function to + * ensure that the lock is not acquired recursively. 
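+ * (ucma_set_af_ib_support() calls rdma_create_id(), which re-enters
+ * _ucma_init(); the early cma_dev_cnt check there returns before the
+ * mutex would be taken a second time.)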
+ */ +static void ucma_set_af_ib_support(void) +{ + struct rdma_cm_id *id; + struct sockaddr_ib sib; + int ret; + + ret = rdma_create_id(NULL, &id, NULL, RDMA_PS_IB); + if (ret) + return; + + memset(&sib, 0, sizeof sib); + sib.sib_family = AF_IB; + sib.sib_sid = htonll(RDMA_IB_IP_PS_TCP); + sib.sib_sid_mask = htonll(RDMA_IB_IP_PS_MASK); + af_ib_support = 1; + ret = rdma_bind_addr(id, (struct sockaddr *) &sib); + af_ib_support = !ret; + + rdma_destroy_id(id); +} + +int _ucma_init(void) +{ + struct ibv_device **dev_list = NULL; + struct cma_device *cma_dev; + struct ibv_device_attr attr; + int i, ret, dev_cnt; + + /* Quick check without lock to see if we're already initialized */ + if (cma_dev_cnt) + return 0; + + pthread_mutex_lock(&mut); + if (cma_dev_cnt) { + pthread_mutex_unlock(&mut); + return 0; + } + + fastlock_init(&idm_lock); + dev_list = ibv_get_device_list(&dev_cnt); + if (!dev_list) { + fprintf(stderr, "rdmacm: fatal: unable to get RDMA device list\n"); + ret = ERR(ENODEV); + goto err1; + } + + if (!dev_cnt) { + fprintf(stderr, "rdmacm: fatal: no RDMA devices found\n"); + ret = ERR(ENODEV); + goto err2; + } + + cma_dev_array = calloc(dev_cnt, sizeof *cma_dev); + if (!cma_dev_array) { + ret = ERR(ENOMEM); + goto err2; + } + + for (i = 0; dev_list[i];) { + cma_dev = &cma_dev_array[i]; + + cma_dev->guid = ibv_get_device_guid(dev_list[i]); + cma_dev->verbs = ibv_open_device(dev_list[i]); + if (!cma_dev->verbs) { + fprintf(stderr, "rdmacm: fatal: unable to open RDMA device\n"); + ret = ERR(ENODEV); + goto err3; + } + + i++; + ret = ibv_query_device(cma_dev->verbs, &attr); + if (ret) { + fprintf(stderr, "rdmacm: fatal: unable to query RDMA device\n"); + ret = ERR(ret); + goto err3; + } + + cma_dev->port_cnt = attr.phys_port_cnt; + cma_dev->max_qpsize = attr.max_qp_wr; + cma_dev->max_initiator_depth = (uint8_t) attr.max_qp_init_rd_atom; + cma_dev->max_responder_resources = (uint8_t) attr.max_qp_rd_atom; + } + + cma_dev_cnt = dev_cnt; + ucma_set_af_ib_support(); + pthread_mutex_unlock(&mut); + ibv_free_device_list(dev_list); + return 0; + +err3: + while (i--) + ibv_close_device(cma_dev_array[i].verbs); + free(cma_dev_array); +err2: + ibv_free_device_list(dev_list); +err1: + fastlock_destroy(&idm_lock); + pthread_mutex_unlock(&mut); + return ret; +} + +struct ibv_context **rdma_get_devices(int *num_devices) +{ + struct ibv_context **devs = NULL; + int i; + + if (_ucma_init()) + goto out; + + devs = malloc(sizeof *devs * (cma_dev_cnt + 1)); + if (!devs) + goto out; + + for (i = 0; i < cma_dev_cnt; i++) + devs[i] = cma_dev_array[i].verbs; + devs[i] = NULL; +out: + if (num_devices) + *num_devices = devs ? 
cma_dev_cnt : 0; + return devs; +} + +void rdma_free_devices(struct ibv_context **list) +{ + free(list); +} + +void rdma_cm_ini(void) +{ +} + +void rdma_cm_fini(void) +{ + ucma_cleanup(); +} + +struct rdma_event_channel *rdma_create_event_channel(void) +{ + struct rdma_event_channel *channel; + struct fid_ucma *ucma; + int ret; + + if (_ucma_init()) + return NULL; + + channel = malloc(sizeof *channel); + if (!channel) + return NULL; + + ret = fi_open("ucma", NULL, 0, &channel->fid, channel); + if (ret) { + fprintf(stderr, "rdmacm: fatal: unable to open /dev/infiniband/rdma_cm\n"); + goto err; + } + + ucma = container_of(channel->fid, struct fid_ucma, fid); + channel->fd = ucma->fd; + return channel; +err: + free(channel); + return NULL; +} + +void rdma_destroy_event_channel(struct rdma_event_channel *channel) +{ + fi_close(channel->fid); + free(channel); +} + +static int ucma_get_device(struct cma_id_private *id_priv, uint64_t guid) +{ + struct cma_device *cma_dev; + int i, ret = 0; + + for (i = 0; i < cma_dev_cnt; i++) { + cma_dev = &cma_dev_array[i]; + if (cma_dev->guid == guid) + goto match; + } + + return ERR(ENODEV); +match: + pthread_mutex_lock(&mut); + if (!cma_dev->refcnt++) { + cma_dev->pd = ibv_alloc_pd(cma_dev_array[i].verbs); + if (!cma_dev->pd) { + cma_dev->refcnt--; + ret = ERR(ENOMEM); + goto out; + } + } + id_priv->cma_dev = cma_dev; + id_priv->id.verbs = cma_dev->verbs; + id_priv->id.pd = cma_dev->pd; +out: + pthread_mutex_unlock(&mut); + return ret; +} + +static void ucma_put_device(struct cma_device *cma_dev) +{ + pthread_mutex_lock(&mut); + if (!--cma_dev->refcnt) + ibv_dealloc_pd(cma_dev->pd); + pthread_mutex_unlock(&mut); +} + +static void ucma_insert_id(struct cma_id_private *id_priv) +{ + fastlock_acquire(&idm_lock); + idm_set(&ucma_idm, id_priv->handle, id_priv); + fastlock_release(&idm_lock); +} + +static void ucma_remove_id(struct cma_id_private *id_priv) +{ + if (id_priv->handle <= IDX_MAX_INDEX) + idm_clear(&ucma_idm, id_priv->handle); +} + +static struct cma_id_private *ucma_lookup_id(int handle) +{ + return idm_lookup(&ucma_idm, handle); +} + +static void ucma_free_id(struct cma_id_private *id_priv) +{ + ucma_remove_id(id_priv); + if (id_priv->cma_dev) + ucma_put_device(id_priv->cma_dev); + pthread_cond_destroy(&id_priv->cond); + pthread_mutex_destroy(&id_priv->mut); + if (id_priv->id.route.path_rec) + free(id_priv->id.route.path_rec); + + if (id_priv->sync) + rdma_destroy_event_channel(id_priv->id.channel); + if (id_priv->connect_len) + free(id_priv->connect); + free(id_priv); +} + +static struct cma_id_private *ucma_alloc_id(struct rdma_event_channel *channel, + void *context, + enum rdma_port_space ps, + enum ibv_qp_type qp_type) +{ + struct cma_id_private *id_priv; + + id_priv = calloc(1, sizeof *id_priv); + if (!id_priv) + return NULL; + + id_priv->id.context = context; + id_priv->id.ps = ps; + id_priv->id.qp_type = qp_type; + id_priv->handle = 0xFFFFFFFF; + + if (!channel) { + id_priv->id.channel = rdma_create_event_channel(); + if (!id_priv->id.channel) + goto err; + id_priv->sync = 1; + } else { + id_priv->id.channel = channel; + } + + pthread_mutex_init(&id_priv->mut, NULL); + if (pthread_cond_init(&id_priv->cond, NULL)) + goto err; + + return id_priv; + +err: ucma_free_id(id_priv); + return NULL; +} + +static int rdma_create_id2(struct rdma_event_channel *channel, + struct rdma_cm_id **id, void *context, + enum rdma_port_space ps, enum ibv_qp_type qp_type) +{ + struct ucma_abi_create_id_resp resp; + struct ucma_abi_create_id cmd; + struct 
cma_id_private *id_priv; + int ret; + + ret = _ucma_init(); + if (ret) + return ret; + + id_priv = ucma_alloc_id(channel, context, ps, qp_type); + if (!id_priv) + return ERR(ENOMEM); + + CMA_INIT_CMD_RESP(&cmd, sizeof cmd, CREATE_ID, &resp, sizeof resp); + cmd.uid = (uintptr_t) id_priv; + cmd.ps = ps; + cmd.qp_type = qp_type; + + ret = ucma_create_id(id_priv->id.channel->fid, + &cmd, sizeof cmd, &resp, sizeof resp); + if (ret) { + ret = ERR(-ret); + goto err; + } + + id_priv->handle = resp.id; + ucma_insert_id(id_priv); + *id = &id_priv->id; + return 0; + +err: + ucma_free_id(id_priv); + return ret; +} + +int rdma_create_id(struct rdma_event_channel *channel, + struct rdma_cm_id **id, void *context, + enum rdma_port_space ps) +{ + enum ibv_qp_type qp_type; + + qp_type = (ps == RDMA_PS_IPOIB || ps == RDMA_PS_UDP) ? + IBV_QPT_UD : IBV_QPT_RC; + return rdma_create_id2(channel, id, context, ps, qp_type); +} + +static int ucma_destroy_kern_id(fid_t fid, uint32_t handle) +{ + struct ucma_abi_destroy_id_resp resp; + struct ucma_abi_destroy_id cmd; + int ret; + + CMA_INIT_CMD_RESP(&cmd, sizeof cmd, DESTROY_ID, &resp, sizeof resp); + cmd.id = handle; + + ret = ucma_destroy_id(fid, &cmd, sizeof cmd, &resp, sizeof resp); + if (ret) + return ERR(-ret); + + return resp.events_reported; +} + +int rdma_destroy_id(struct rdma_cm_id *id) +{ + struct cma_id_private *id_priv; + int ret; + + id_priv = container_of(id, struct cma_id_private, id); + ret = ucma_destroy_kern_id(id->channel->fid, id_priv->handle); + if (ret < 0) + return ret; + + if (id_priv->id.event) + rdma_ack_cm_event(id_priv->id.event); + + pthread_mutex_lock(&id_priv->mut); + while (id_priv->events_completed < ret) + pthread_cond_wait(&id_priv->cond, &id_priv->mut); + pthread_mutex_unlock(&id_priv->mut); + + ucma_free_id(id_priv); + return 0; +} + +int rdma_addrlen(struct sockaddr *addr) +{ + if (!addr) + return 0; + + switch (addr->sa_family) { + case PF_INET: + return sizeof(struct sockaddr_in); + case PF_INET6: + return sizeof(struct sockaddr_in6); + case PF_IB: + return af_ib_support ? 
sizeof(struct sockaddr_ib) : 0; + default: + return 0; + } +} + +static int ucma_query_addr(struct rdma_cm_id *id) +{ + struct ucma_abi_query_addr_resp resp; + struct ucma_abi_query cmd; + struct cma_id_private *id_priv; + int ret; + + CMA_INIT_CMD_RESP(&cmd, sizeof cmd, QUERY, &resp, sizeof resp); + id_priv = container_of(id, struct cma_id_private, id); + cmd.id = id_priv->handle; + cmd.option = UCMA_QUERY_ADDR; + + ret = ucma_query(id->channel->fid, &cmd, sizeof cmd, + &resp, sizeof resp); + if (ret) + return ERR(-ret); + + memcpy(&id->route.addr.src_addr, &resp.src_addr, resp.src_size); + memcpy(&id->route.addr.dst_addr, &resp.dst_addr, resp.dst_size); + + if (!id_priv->cma_dev && resp.node_guid) { + ret = ucma_get_device(id_priv, resp.node_guid); + if (ret) + return ret; + id->port_num = resp.port_num; + id->route.addr.addr.ibaddr.pkey = resp.pkey; + } + + return 0; +} + +static int ucma_query_gid(struct rdma_cm_id *id) +{ + struct ucma_abi_query_addr_resp resp; + struct ucma_abi_query cmd; + struct cma_id_private *id_priv; + struct sockaddr_ib *sib; + int ret; + + CMA_INIT_CMD_RESP(&cmd, sizeof cmd, QUERY, &resp, sizeof resp); + id_priv = container_of(id, struct cma_id_private, id); + cmd.id = id_priv->handle; + cmd.option = UCMA_QUERY_GID; + + ret = ucma_query(id->channel->fid, &cmd, sizeof cmd, + &resp, sizeof resp); + if (ret) + return ERR(-ret); + + sib = (struct sockaddr_ib *) &resp.src_addr; + memcpy(id->route.addr.addr.ibaddr.sgid.raw, sib->sib_addr.sib_raw, + sizeof id->route.addr.addr.ibaddr.sgid); + + sib = (struct sockaddr_ib *) &resp.dst_addr; + memcpy(id->route.addr.addr.ibaddr.dgid.raw, sib->sib_addr.sib_raw, + sizeof id->route.addr.addr.ibaddr.dgid); + + return 0; +} + +static void ucma_convert_path(struct ibv_path_data *path_data, + struct ibv_sa_path_rec *sa_path) +{ + uint32_t fl_hop; + + memcpy(&sa_path->dgid, path_data->path.dgid, 16); + memcpy(&sa_path->sgid, path_data->path.sgid, 16); + sa_path->dlid = path_data->path.dlid; + sa_path->slid = path_data->path.slid; + sa_path->raw_traffic = 0; + + fl_hop = ntohl(path_data->path.flowlabel_hoplimit); + sa_path->flow_label = htonl(fl_hop >> 8); + sa_path->hop_limit = (uint8_t) fl_hop; + + sa_path->traffic_class = path_data->path.tclass; + sa_path->reversible = path_data->path.reversible_numpath >> 7; + sa_path->numb_path = 1; + sa_path->pkey = path_data->path.pkey; + sa_path->sl = ntohs(path_data->path.qosclass_sl) & 0xF; + sa_path->mtu_selector = 1; + sa_path->mtu = path_data->path.mtu & 0x1F; + sa_path->rate_selector = 1; + sa_path->rate = path_data->path.rate & 0x1F; + sa_path->packet_life_time_selector = 1; + sa_path->packet_life_time = path_data->path.packetlifetime & 0x1F; + + sa_path->preference = (uint8_t) path_data->flags; +} + +static int ucma_query_path(struct rdma_cm_id *id) +{ + struct ucma_abi_query_path_resp *resp; + struct ucma_abi_query cmd; + struct cma_id_private *id_priv; + int ret, i, size; + + size = sizeof(*resp) + sizeof(struct ibv_path_data) * 6; + resp = alloca(size); + if (!resp) + return ERR(ENOMEM); + + CMA_INIT_CMD_RESP(&cmd, sizeof cmd, QUERY, resp, size); + id_priv = container_of(id, struct cma_id_private, id); + cmd.id = id_priv->handle; + cmd.option = UCMA_QUERY_PATH; + + ret = ucma_query(id->channel->fid, &cmd, sizeof cmd, resp, size); + if (ret) + return ERR(-ret); + + if (resp->num_paths) { + id->route.path_rec = malloc(sizeof(*id->route.path_rec) * + resp->num_paths); + if (!id->route.path_rec) + return ERR(ENOMEM); + + id->route.num_paths = resp->num_paths; + for (i = 0; i < 
resp->num_paths; i++) + ucma_convert_path(&resp->path_data[i], &id->route.path_rec[i]); + } + + return 0; +} + +static int _ucma_query_route(struct rdma_cm_id *id) +{ + struct ucma_abi_query_route_resp resp; + struct ucma_abi_query cmd; + struct cma_id_private *id_priv; + int ret, i; + + CMA_INIT_CMD_RESP(&cmd, sizeof cmd, QUERY_ROUTE, &resp, sizeof resp); + id_priv = container_of(id, struct cma_id_private, id); + cmd.id = id_priv->handle; + + ret = ucma_query_route(id->channel->fid, &cmd, sizeof cmd, + &resp, sizeof resp); + if (ret) + return ERR(-ret); + + if (resp.num_paths) { + id->route.path_rec = malloc(sizeof *id->route.path_rec * + resp.num_paths); + if (!id->route.path_rec) + return ERR(ENOMEM); + + id->route.num_paths = resp.num_paths; + for (i = 0; i < resp.num_paths; i++) + ibv_copy_path_rec_from_kern(&id->route.path_rec[i], + &resp.ib_route[i]); + } + + memcpy(id->route.addr.addr.ibaddr.sgid.raw, resp.ib_route[0].sgid, + sizeof id->route.addr.addr.ibaddr.sgid); + memcpy(id->route.addr.addr.ibaddr.dgid.raw, resp.ib_route[0].dgid, + sizeof id->route.addr.addr.ibaddr.dgid); + id->route.addr.addr.ibaddr.pkey = resp.ib_route[0].pkey; + memcpy(&id->route.addr.src_addr, &resp.src_addr, + sizeof resp.src_addr); + memcpy(&id->route.addr.dst_addr, &resp.dst_addr, + sizeof resp.dst_addr); + + if (!id_priv->cma_dev && resp.node_guid) { + ret = ucma_get_device(id_priv, resp.node_guid); + if (ret) + return ret; + id_priv->id.port_num = resp.port_num; + } + + return 0; +} + +static int rdma_bind_addr2(struct rdma_cm_id *id, struct sockaddr *addr, + socklen_t addrlen) +{ + struct ucma_abi_bind cmd; + struct cma_id_private *id_priv; + int ret; + + CMA_INIT_CMD(&cmd, sizeof cmd, BIND); + id_priv = container_of(id, struct cma_id_private, id); + cmd.id = id_priv->handle; + cmd.addr_size = addrlen; + memcpy(&cmd.addr, addr, addrlen); + + ret = ucma_bind(id->channel->fid, &cmd, sizeof cmd); + if (ret) + return ERR(-ret); + + return ucma_query_addr(id); +} + +int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr) +{ + struct ucma_abi_bind_ip cmd; + struct cma_id_private *id_priv; + int ret, addrlen; + + addrlen = rdma_addrlen(addr); + if (!addrlen) + return ERR(EINVAL); + + if (af_ib_support) + return rdma_bind_addr2(id, addr, addrlen); + + CMA_INIT_CMD(&cmd, sizeof cmd, BIND_IP); + id_priv = container_of(id, struct cma_id_private, id); + cmd.id = id_priv->handle; + memcpy(&cmd.addr, addr, addrlen); + + ret = ucma_bind_ip(id->channel->fid, &cmd, sizeof cmd); + if (ret) + return ERR(-ret); + + return _ucma_query_route(id); +} + +int ucma_complete(struct rdma_cm_id *id) +{ + struct cma_id_private *id_priv; + int ret; + + id_priv = container_of(id, struct cma_id_private, id); + if (!id_priv->sync) + return 0; + + if (id_priv->id.event) { + rdma_ack_cm_event(id_priv->id.event); + id_priv->id.event = NULL; + } + + ret = rdma_get_cm_event(id_priv->id.channel, &id_priv->id.event); + if (ret) + return ret; + + if (id_priv->id.event->status) { + if (id_priv->id.event->event == RDMA_CM_EVENT_REJECTED) + ret = ERR(ECONNREFUSED); + else if (id_priv->id.event->status < 0) + ret = ERR(-id_priv->id.event->status); + else + ret = ERR(-id_priv->id.event->status); + } + return ret; +} + +static int rdma_resolve_addr2(struct rdma_cm_id *id, struct sockaddr *src_addr, + socklen_t src_len, struct sockaddr *dst_addr, + socklen_t dst_len, int timeout_ms) +{ + struct ucma_abi_resolve_addr cmd; + struct cma_id_private *id_priv; + int ret; + + CMA_INIT_CMD(&cmd, sizeof cmd, RESOLVE_ADDR); + id_priv = 
container_of(id, struct cma_id_private, id); + cmd.id = id_priv->handle; + if ((cmd.src_size = src_len)) + memcpy(&cmd.src_addr, src_addr, src_len); + memcpy(&cmd.dst_addr, dst_addr, dst_len); + cmd.dst_size = dst_len; + cmd.timeout_ms = timeout_ms; + + ret = ucma_resolve_addr(id->channel->fid, &cmd, sizeof cmd); + if (ret) + return ERR(-ret); + + memcpy(&id->route.addr.dst_addr, dst_addr, dst_len); + return ucma_complete(id); +} + +int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, + struct sockaddr *dst_addr, int timeout_ms) +{ + struct ucma_abi_resolve_ip cmd; + struct cma_id_private *id_priv; + int ret, dst_len, src_len; + + dst_len = rdma_addrlen(dst_addr); + if (!dst_len) + return ERR(EINVAL); + + src_len = rdma_addrlen(src_addr); + if (src_addr && !src_len) + return ERR(EINVAL); + + if (af_ib_support) + return rdma_resolve_addr2(id, src_addr, src_len, dst_addr, + dst_len, timeout_ms); + + CMA_INIT_CMD(&cmd, sizeof cmd, RESOLVE_IP); + id_priv = container_of(id, struct cma_id_private, id); + cmd.id = id_priv->handle; + if (src_addr) + memcpy(&cmd.src_addr, src_addr, src_len); + memcpy(&cmd.dst_addr, dst_addr, dst_len); + cmd.timeout_ms = timeout_ms; + + ret = ucma_resolve_ip(id->channel->fid, &cmd, sizeof cmd); + if (ret) + return ERR(-ret); + + memcpy(&id->route.addr.dst_addr, dst_addr, dst_len); + return ucma_complete(id); +} + +static int ucma_set_ib_route(struct rdma_cm_id *id) +{ + struct rdma_addrinfo hint, *rai; + int ret; + + memset(&hint, 0, sizeof hint); + hint.ai_flags = RAI_ROUTEONLY; + hint.ai_family = id->route.addr.src_addr.sa_family; + hint.ai_src_len = rdma_addrlen((struct sockaddr *) &id->route.addr.src_addr); + hint.ai_src_addr = &id->route.addr.src_addr; + hint.ai_dst_len = rdma_addrlen((struct sockaddr *) &id->route.addr.dst_addr); + hint.ai_dst_addr = &id->route.addr.dst_addr; + + ret = rdma_getaddrinfo(NULL, NULL, &hint, &rai); + if (ret) + return ret; + + if (rai->ai_route_len) + ret = rdma_set_option(id, RDMA_OPTION_IB, RDMA_OPTION_IB_PATH, + rai->ai_route, rai->ai_route_len); + else + ret = -1; + + rdma_freeaddrinfo(rai); + return ret; +} + +int rdma_resolve_route(struct rdma_cm_id *id, int timeout_ms) +{ + struct ucma_abi_resolve_route cmd; + struct cma_id_private *id_priv; + int ret; + + id_priv = container_of(id, struct cma_id_private, id); + if (id->verbs->device->transport_type == IBV_TRANSPORT_IB) { + ret = ucma_set_ib_route(id); + if (!ret) + goto out; + } + + CMA_INIT_CMD(&cmd, sizeof cmd, RESOLVE_ROUTE); + cmd.id = id_priv->handle; + cmd.timeout_ms = timeout_ms; + + ret = ucma_resolve_route(id->channel->fid, &cmd, sizeof cmd); + if (ret) + return ERR(-ret); + +out: + return ucma_complete(id); +} + +static int ucma_is_ud_qp(enum ibv_qp_type qp_type) +{ + return (qp_type == IBV_QPT_UD); +} + +static int rdma_init_qp_attr(struct rdma_cm_id *id, struct ibv_qp_attr *qp_attr, + int *qp_attr_mask) +{ + struct ucma_abi_init_qp_attr cmd; + struct ibv_kern_qp_attr resp; + struct cma_id_private *id_priv; + int ret; + + CMA_INIT_CMD_RESP(&cmd, sizeof cmd, INIT_QP_ATTR, &resp, sizeof resp); + id_priv = container_of(id, struct cma_id_private, id); + cmd.id = id_priv->handle; + cmd.qp_state = qp_attr->qp_state; + + ret = ucma_init_qp_attr(id->channel->fid, &cmd, sizeof cmd, + &resp, sizeof resp); + if (ret) + return ERR(-ret); + + ibv_copy_qp_attr_from_kern(qp_attr, &resp); + *qp_attr_mask = resp.qp_attr_mask; + return 0; +} + +static int ucma_modify_qp_rtr(struct rdma_cm_id *id, uint8_t resp_res) +{ + struct ibv_qp_attr qp_attr; + int 
qp_attr_mask, ret; + + if (!id->qp) + return ERR(EINVAL); + + /* Need to update QP attributes from default values. */ + qp_attr.qp_state = IBV_QPS_INIT; + ret = rdma_init_qp_attr(id, &qp_attr, &qp_attr_mask); + if (ret) + return ret; + + ret = ibv_modify_qp(id->qp, &qp_attr, qp_attr_mask); + if (ret) + return ERR(ret); + + qp_attr.qp_state = IBV_QPS_RTR; + ret = rdma_init_qp_attr(id, &qp_attr, &qp_attr_mask); + if (ret) + return ret; + + if (resp_res != RDMA_MAX_RESP_RES) + qp_attr.max_dest_rd_atomic = resp_res; + return rdma_seterrno(ibv_modify_qp(id->qp, &qp_attr, qp_attr_mask)); +} + +static int ucma_modify_qp_rts(struct rdma_cm_id *id, uint8_t init_depth) +{ + struct ibv_qp_attr qp_attr; + int qp_attr_mask, ret; + + qp_attr.qp_state = IBV_QPS_RTS; + ret = rdma_init_qp_attr(id, &qp_attr, &qp_attr_mask); + if (ret) + return ret; + + if (init_depth != RDMA_MAX_INIT_DEPTH) + qp_attr.max_rd_atomic = init_depth; + return rdma_seterrno(ibv_modify_qp(id->qp, &qp_attr, qp_attr_mask)); +} + +static int ucma_modify_qp_sqd(struct rdma_cm_id *id) +{ + struct ibv_qp_attr qp_attr; + + if (!id->qp) + return 0; + + qp_attr.qp_state = IBV_QPS_SQD; + return rdma_seterrno(ibv_modify_qp(id->qp, &qp_attr, IBV_QP_STATE)); +} + +static int ucma_modify_qp_err(struct rdma_cm_id *id) +{ + struct ibv_qp_attr qp_attr; + + if (!id->qp) + return 0; + + qp_attr.qp_state = IBV_QPS_ERR; + return rdma_seterrno(ibv_modify_qp(id->qp, &qp_attr, IBV_QP_STATE)); +} + +static int ucma_init_conn_qp(struct cma_id_private *id_priv, struct ibv_qp *qp) +{ + struct ibv_qp_attr qp_attr; + int qp_attr_mask, ret; + + qp_attr.qp_state = IBV_QPS_INIT; + ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask); + if (ret) + return ret; + + return rdma_seterrno(ibv_modify_qp(qp, &qp_attr, qp_attr_mask)); +} + +static int ucma_init_ud_qp(struct cma_id_private *id_priv, struct ibv_qp *qp) +{ + struct ibv_qp_attr qp_attr; + int qp_attr_mask, ret; + + qp_attr.qp_state = IBV_QPS_INIT; + ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask); + if (ret) + return ret; + + ret = ibv_modify_qp(qp, &qp_attr, qp_attr_mask); + if (ret) + return ERR(ret); + + qp_attr.qp_state = IBV_QPS_RTR; + ret = ibv_modify_qp(qp, &qp_attr, IBV_QP_STATE); + if (ret) + return ERR(ret); + + qp_attr.qp_state = IBV_QPS_RTS; + qp_attr.sq_psn = 0; + ret = ibv_modify_qp(qp, &qp_attr, IBV_QP_STATE | IBV_QP_SQ_PSN); + return rdma_seterrno(ret); +} + +static void ucma_destroy_cqs(struct rdma_cm_id *id) +{ + if (id->recv_cq) + ibv_destroy_cq(id->recv_cq); + + if (id->recv_cq_channel) + ibv_destroy_comp_channel(id->recv_cq_channel); + + if (id->send_cq && (id->send_cq != id->recv_cq)) + ibv_destroy_cq(id->send_cq); + + if (id->send_cq_channel && (id->send_cq_channel != id->recv_cq_channel)) + ibv_destroy_comp_channel(id->send_cq_channel); +} + +static int ucma_create_cqs(struct rdma_cm_id *id, uint32_t send_size, uint32_t recv_size) +{ + if (recv_size) { + id->recv_cq_channel = ibv_create_comp_channel(id->verbs); + if (!id->recv_cq_channel) + goto err; + + id->recv_cq = ibv_create_cq(id->verbs, recv_size, + id, id->recv_cq_channel, 0); + if (!id->recv_cq) + goto err; + } + + if (send_size) { + id->send_cq_channel = ibv_create_comp_channel(id->verbs); + if (!id->send_cq_channel) + goto err; + + id->send_cq = ibv_create_cq(id->verbs, send_size, + id, id->send_cq_channel, 0); + if (!id->send_cq) + goto err; + } + + return 0; +err: + ucma_destroy_cqs(id); + return ERR(ENOMEM); +} + +int rdma_create_srq(struct rdma_cm_id *id, struct ibv_pd *pd, + struct 
ibv_srq_init_attr *attr) +{ + struct ibv_srq *srq; + int ret; + + if (!pd) + pd = id->pd; + +#ifdef IBV_XRC_OPS + if (attr->srq_type == IBV_SRQT_XRC) { + if (!attr->ext.xrc.cq) { + ret = ucma_create_cqs(id, 0, attr->attr.max_wr); + if (ret) + return ret; + + attr->ext.xrc.cq = id->recv_cq; + } + } + + srq = ibv_create_xsrq(pd, attr); +#else + srq = ibv_create_srq(pd, attr); +#endif + if (!srq) { + ret = -1; + goto err; + } + + id->pd = pd; + id->srq = srq; + return 0; +err: + ucma_destroy_cqs(id); + return ret; +} + +void rdma_destroy_srq(struct rdma_cm_id *id) +{ + ibv_destroy_srq(id->srq); + if (!id->qp) + ucma_destroy_cqs(id); + id->srq = NULL; +} + +int rdma_create_qp(struct rdma_cm_id *id, struct ibv_pd *pd, + struct ibv_qp_init_attr *qp_init_attr) +{ + struct cma_id_private *id_priv; + struct ibv_qp *qp; + int ret; + + if (id->qp) + return ERR(EINVAL); + + id_priv = container_of(id, struct cma_id_private, id); + if (!pd) + pd = id->pd; + else if (id->verbs != pd->context) + return ERR(EINVAL); + + ret = ucma_create_cqs(id, qp_init_attr->send_cq ? 0 : qp_init_attr->cap.max_send_wr, + qp_init_attr->recv_cq ? 0 : qp_init_attr->cap.max_recv_wr); + if (ret) + return ret; + + if (!qp_init_attr->send_cq) + qp_init_attr->send_cq = id->send_cq; + if (!qp_init_attr->recv_cq) + qp_init_attr->recv_cq = id->recv_cq; + qp = ibv_create_qp(pd, qp_init_attr); + if (!qp) { + ret = ERR(ENOMEM); + goto err1; + } + + if (ucma_is_ud_qp(id->qp_type)) + ret = ucma_init_ud_qp(id_priv, qp); + else + ret = ucma_init_conn_qp(id_priv, qp); + if (ret) + goto err2; + + id->pd = pd; + id->qp = qp; + return 0; +err2: + ibv_destroy_qp(qp); +err1: + ucma_destroy_cqs(id); + return ret; +} + +void rdma_destroy_qp(struct rdma_cm_id *id) +{ + ibv_destroy_qp(id->qp); + ucma_destroy_cqs(id); + id->qp = NULL; +} + +static int ucma_valid_param(struct cma_id_private *id_priv, + struct rdma_conn_param *param) +{ + if (id_priv->id.ps != RDMA_PS_TCP) + return 0; + + if (!id_priv->id.qp && !param) + goto err; + + if (!param) + return 0; + + if ((param->responder_resources != RDMA_MAX_RESP_RES) && + (param->responder_resources > id_priv->cma_dev->max_responder_resources)) + goto err; + + if ((param->initiator_depth != RDMA_MAX_INIT_DEPTH) && + (param->initiator_depth > id_priv->cma_dev->max_initiator_depth)) + goto err; + + return 0; +err: + return ERR(EINVAL); +} + +static void ucma_copy_conn_param_to_kern(struct cma_id_private *id_priv, + struct ucma_abi_conn_param *dst, + struct rdma_conn_param *src, + uint32_t qp_num, uint8_t srq) +{ + dst->qp_num = qp_num; + dst->srq = srq; + dst->responder_resources = id_priv->responder_resources; + dst->initiator_depth = id_priv->initiator_depth; + dst->valid = 1; + + if (id_priv->connect_len) { + memcpy(dst->private_data, id_priv->connect, id_priv->connect_len); + dst->private_data_len = id_priv->connect_len; + } + + if (src) { + dst->flow_control = src->flow_control; + dst->retry_count = src->retry_count; + dst->rnr_retry_count = src->rnr_retry_count; + + if (src->private_data && src->private_data_len) { + memcpy(dst->private_data + dst->private_data_len, + src->private_data, src->private_data_len); + dst->private_data_len += src->private_data_len; + } + } else { + dst->retry_count = 7; + dst->rnr_retry_count = 7; + } +} + +int rdma_connect(struct rdma_cm_id *id, struct rdma_conn_param *conn_param) +{ + struct ucma_abi_connect cmd; + struct cma_id_private *id_priv; + int ret; + + id_priv = container_of(id, struct cma_id_private, id); + ret = ucma_valid_param(id_priv, conn_param); + if 
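/*
 * Illustrative sketch, not part of the patch: creating a connected QP on a
 * resolved id. Leaving send_cq/recv_cq NULL lets rdma_create_qp() allocate
 * per-id CQs sized from the caps below (see ucma_create_cqs() above), and a
 * NULL pd selects the id's default protection domain. The capacity values
 * are arbitrary examples.
 */
#include <string.h>
#include <rdma/rdma_cma.h>

static int setup_qp(struct rdma_cm_id *id)
{
	struct ibv_qp_init_attr attr;

	memset(&attr, 0, sizeof attr);
	attr.cap.max_send_wr = 16;
	attr.cap.max_recv_wr = 16;
	attr.cap.max_send_sge = 1;
	attr.cap.max_recv_sge = 1;
	attr.qp_type = IBV_QPT_RC;
	attr.sq_sig_all = 1;

	return rdma_create_qp(id, NULL, &attr);
}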
(ret) + return ret; + + if (conn_param && conn_param->initiator_depth != RDMA_MAX_INIT_DEPTH) + id_priv->initiator_depth = conn_param->initiator_depth; + else + id_priv->initiator_depth = id_priv->cma_dev->max_initiator_depth; + if (conn_param && conn_param->responder_resources != RDMA_MAX_RESP_RES) + id_priv->responder_resources = conn_param->responder_resources; + else + id_priv->responder_resources = id_priv->cma_dev->max_responder_resources; + + CMA_INIT_CMD(&cmd, sizeof cmd, CONNECT); + cmd.id = id_priv->handle; + if (id->qp) { + ucma_copy_conn_param_to_kern(id_priv, &cmd.conn_param, + conn_param, id->qp->qp_num, + (id->qp->srq != NULL)); + } else if (conn_param) { + ucma_copy_conn_param_to_kern(id_priv, &cmd.conn_param, + conn_param, conn_param->qp_num, + conn_param->srq); + } else { + ucma_copy_conn_param_to_kern(id_priv, &cmd.conn_param, + conn_param, 0, 0); + } + + ret = ucma_connect(id->channel->fid, &cmd, sizeof cmd); + if (ret) + return ERR(-ret); + + if (id_priv->connect_len) { + free(id_priv->connect); + id_priv->connect_len = 0; + } + + return ucma_complete(id); +} + +int rdma_listen(struct rdma_cm_id *id, int backlog) +{ + struct ucma_abi_listen cmd; + struct cma_id_private *id_priv; + int ret; + + CMA_INIT_CMD(&cmd, sizeof cmd, LISTEN); + id_priv = container_of(id, struct cma_id_private, id); + cmd.id = id_priv->handle; + cmd.backlog = backlog; + + ret = ucma_listen(id->channel->fid, &cmd, sizeof cmd); + if (ret) + return ERR(-ret); + + if (af_ib_support) + return ucma_query_addr(id); + else + return _ucma_query_route(id); +} + +int rdma_get_request(struct rdma_cm_id *listen, struct rdma_cm_id **id) +{ + struct cma_id_private *id_priv; + struct rdma_cm_event *event; + int ret; + + id_priv = container_of(listen, struct cma_id_private, id); + if (!id_priv->sync) + return ERR(EINVAL); + + if (listen->event) { + rdma_ack_cm_event(listen->event); + listen->event = NULL; + } + + ret = rdma_get_cm_event(listen->channel, &event); + if (ret) + return ret; + + if (event->status) { + ret = ERR(event->status); + goto err; + } + + if (event->event != RDMA_CM_EVENT_CONNECT_REQUEST) { + ret = ERR(EINVAL); + goto err; + } + + if (id_priv->qp_init_attr) { + struct ibv_qp_init_attr attr; + + attr = *id_priv->qp_init_attr; + ret = rdma_create_qp(event->id, listen->pd, &attr); + if (ret) + goto err; + } + + *id = event->id; + (*id)->event = event; + return 0; + +err: + listen->event = event; + return ret; +} + +int rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param) +{ + struct ucma_abi_accept cmd; + struct cma_id_private *id_priv; + int ret; + + id_priv = container_of(id, struct cma_id_private, id); + ret = ucma_valid_param(id_priv, conn_param); + if (ret) + return ret; + + if (!conn_param || conn_param->initiator_depth == RDMA_MAX_INIT_DEPTH) { + id_priv->initiator_depth = min(id_priv->initiator_depth, + id_priv->cma_dev->max_initiator_depth); + } else { + id_priv->initiator_depth = conn_param->initiator_depth; + } + if (!conn_param || conn_param->responder_resources == RDMA_MAX_RESP_RES) { + id_priv->responder_resources = min(id_priv->responder_resources, + id_priv->cma_dev->max_responder_resources); + } else { + id_priv->responder_resources = conn_param->responder_resources; + } + + if (!ucma_is_ud_qp(id->qp_type)) { + ret = ucma_modify_qp_rtr(id, id_priv->responder_resources); + if (ret) + return ret; + + ret = ucma_modify_qp_rts(id, id_priv->initiator_depth); + if (ret) + return ret; + } + + CMA_INIT_CMD(&cmd, sizeof cmd, ACCEPT); + cmd.id = id_priv->handle; + cmd.uid 
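/*
 * Illustrative sketch, not part of the patch: the synchronous passive side
 * matching the listen/get_request/accept path above. rdma_get_request()
 * requires the listening id to be in synchronous (NULL event channel) mode,
 * as checked above, and for RDMA_PS_TCP a QP must exist on the new id before
 * rdma_accept() with a NULL conn_param is legal. Backlog and QP sizes are
 * arbitrary examples.
 */
#include <string.h>
#include <rdma/rdma_cma.h>

static int serve_one(struct rdma_cm_id *listen_id)
{
	struct ibv_qp_init_attr attr;
	struct rdma_cm_id *conn;
	int ret;

	ret = rdma_listen(listen_id, 4);
	if (ret)
		return ret;

	ret = rdma_get_request(listen_id, &conn);
	if (ret)
		return ret;

	memset(&attr, 0, sizeof attr);
	attr.cap.max_send_wr = attr.cap.max_recv_wr = 8;
	attr.cap.max_send_sge = attr.cap.max_recv_sge = 1;
	attr.qp_type = IBV_QPT_RC;

	ret = rdma_create_qp(conn, NULL, &attr);
	if (!ret)
		ret = rdma_accept(conn, NULL);
	if (ret)
		rdma_destroy_id(conn);
	return ret;
}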
= (uintptr_t) id_priv; + if (id->qp) + ucma_copy_conn_param_to_kern(id_priv, &cmd.conn_param, + conn_param, id->qp->qp_num, + (id->qp->srq != NULL)); + else + ucma_copy_conn_param_to_kern(id_priv, &cmd.conn_param, + conn_param, conn_param->qp_num, + conn_param->srq); + + ret = ucma_accept(id->channel->fid, &cmd, sizeof cmd); + if (ret) { + ucma_modify_qp_err(id); + return ERR(-ret); + } + + if (ucma_is_ud_qp(id->qp_type)) + return 0; + + return ucma_complete(id); +} + +int rdma_reject(struct rdma_cm_id *id, const void *private_data, + uint8_t private_data_len) +{ + struct ucma_abi_reject cmd; + struct cma_id_private *id_priv; + int ret; + + CMA_INIT_CMD(&cmd, sizeof cmd, REJECT); + + id_priv = container_of(id, struct cma_id_private, id); + cmd.id = id_priv->handle; + if (private_data && private_data_len) { + memcpy(cmd.private_data, private_data, private_data_len); + cmd.private_data_len = private_data_len; + } + + ret = ucma_reject(id->channel->fid, &cmd, sizeof cmd); + if (ret) + return ERR(-ret); + + return 0; +} + +int rdma_notify(struct rdma_cm_id *id, enum ibv_event_type event) +{ + struct ucma_abi_notify cmd; + struct cma_id_private *id_priv; + int ret; + + CMA_INIT_CMD(&cmd, sizeof cmd, NOTIFY); + + id_priv = container_of(id, struct cma_id_private, id); + cmd.id = id_priv->handle; + cmd.event = event; + ret = ucma_notify(id->channel->fid, &cmd, sizeof cmd); + if (ret) + return ERR(-ret); + + return 0; +} + +int rdma_disconnect(struct rdma_cm_id *id) +{ + struct ucma_abi_disconnect cmd; + struct cma_id_private *id_priv; + int ret; + + switch (id->verbs->device->transport_type) { + case IBV_TRANSPORT_IB: + ret = ucma_modify_qp_err(id); + break; + case IBV_TRANSPORT_IWARP: + ret = ucma_modify_qp_sqd(id); + break; + default: + ret = ERR(EINVAL); + } + if (ret) + return ret; + + CMA_INIT_CMD(&cmd, sizeof cmd, DISCONNECT); + id_priv = container_of(id, struct cma_id_private, id); + cmd.id = id_priv->handle; + + ret = ucma_disconnect(id->channel->fid, &cmd, sizeof cmd); + if (ret) + return ERR(-ret); + + return ucma_complete(id); +} + +static int rdma_join_multicast2(struct rdma_cm_id *id, struct sockaddr *addr, + socklen_t addrlen, void *context) +{ + struct ucma_abi_create_id_resp resp; + struct cma_id_private *id_priv; + struct cma_multicast *mc, **pos; + int ret; + + id_priv = container_of(id, struct cma_id_private, id); + mc = calloc(1, sizeof *mc); + if (!mc) + return ERR(ENOMEM); + + mc->context = context; + mc->id_priv = id_priv; + memcpy(&mc->addr, addr, addrlen); + if (pthread_cond_init(&mc->cond, NULL)) { + ret = -1; + goto err1; + } + + pthread_mutex_lock(&id_priv->mut); + mc->next = id_priv->mc_list; + id_priv->mc_list = mc; + pthread_mutex_unlock(&id_priv->mut); + + if (af_ib_support) { + struct ucma_abi_join_mcast cmd; + + CMA_INIT_CMD_RESP(&cmd, sizeof cmd, JOIN_MCAST, &resp, sizeof resp); + cmd.id = id_priv->handle; + memcpy(&cmd.addr, addr, addrlen); + cmd.addr_size = addrlen; + cmd.uid = (uintptr_t) mc; + cmd.reserved = 0; + + ret = ucma_join_mcast(id->channel->fid, &cmd, sizeof cmd, + &resp, sizeof resp); + if (ret) { + return ERR(-ret); + goto err2; + } + } else { + struct ucma_abi_join_ip_mcast cmd; + + CMA_INIT_CMD_RESP(&cmd, sizeof cmd, JOIN_IP_MCAST, &resp, sizeof resp); + cmd.id = id_priv->handle; + memcpy(&cmd.addr, addr, addrlen); + cmd.uid = (uintptr_t) mc; + + ret = ucma_join_ip_mcast(id->channel->fid, &cmd, sizeof cmd, + &resp, sizeof resp); + if (ret) { + return ERR(-ret); + goto err2; + } + } + + VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp); + + mc->handle 
= resp.id; + return ucma_complete(id); + +err2: + pthread_mutex_lock(&id_priv->mut); + for (pos = &id_priv->mc_list; *pos != mc; pos = &(*pos)->next) + ; + *pos = mc->next; + pthread_mutex_unlock(&id_priv->mut); +err1: + free(mc); + return ret; +} + +int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr, + void *context) +{ + int addrlen; + + addrlen = rdma_addrlen(addr); + if (!addrlen) + return ERR(EINVAL); + + return rdma_join_multicast2(id, addr, addrlen, context); +} + +int rdma_leave_multicast(struct rdma_cm_id *id, struct sockaddr *addr) +{ + struct ucma_abi_destroy_id cmd; + struct ucma_abi_destroy_id_resp resp; + struct cma_id_private *id_priv; + struct cma_multicast *mc, **pos; + int ret, addrlen; + + addrlen = rdma_addrlen(addr); + if (!addrlen) + return ERR(EINVAL); + + id_priv = container_of(id, struct cma_id_private, id); + pthread_mutex_lock(&id_priv->mut); + for (pos = &id_priv->mc_list; *pos; pos = &(*pos)->next) + if (!memcmp(&(*pos)->addr, addr, addrlen)) + break; + + mc = *pos; + if (*pos) + *pos = mc->next; + pthread_mutex_unlock(&id_priv->mut); + if (!mc) + return ERR(EADDRNOTAVAIL); + + if (id->qp) + ibv_detach_mcast(id->qp, &mc->mgid, mc->mlid); + + CMA_INIT_CMD_RESP(&cmd, sizeof cmd, LEAVE_MCAST, &resp, sizeof resp); + cmd.id = mc->handle; + + ret = ucma_leave_mcast(id->channel->fid, &cmd, sizeof cmd, &resp, sizeof resp); + if (ret) { + return ERR(-ret); + goto free; + } + + VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp); + + pthread_mutex_lock(&id_priv->mut); + while (mc->events_completed < resp.events_reported) + pthread_cond_wait(&mc->cond, &id_priv->mut); + pthread_mutex_unlock(&id_priv->mut); + + ret = 0; +free: + free(mc); + return ret; +} + +static void ucma_complete_event(struct cma_id_private *id_priv) +{ + pthread_mutex_lock(&id_priv->mut); + id_priv->events_completed++; + pthread_cond_signal(&id_priv->cond); + pthread_mutex_unlock(&id_priv->mut); +} + +static void ucma_complete_mc_event(struct cma_multicast *mc) +{ + pthread_mutex_lock(&mc->id_priv->mut); + mc->events_completed++; + pthread_cond_signal(&mc->cond); + mc->id_priv->events_completed++; + pthread_cond_signal(&mc->id_priv->cond); + pthread_mutex_unlock(&mc->id_priv->mut); +} + +int rdma_ack_cm_event(struct rdma_cm_event *event) +{ + struct cma_event *evt; + + if (!event) + return ERR(EINVAL); + + evt = container_of(event, struct cma_event, event); + + if (evt->mc) + ucma_complete_mc_event(evt->mc); + else + ucma_complete_event(evt->id_priv); + free(evt); + return 0; +} + +static void ucma_process_addr_resolved(struct cma_event *evt) +{ + if (af_ib_support) { + evt->event.status = ucma_query_addr(&evt->id_priv->id); + if (!evt->event.status && + evt->id_priv->id.verbs->device->transport_type == IBV_TRANSPORT_IB) + evt->event.status = ucma_query_gid(&evt->id_priv->id); + } else { + evt->event.status = _ucma_query_route(&evt->id_priv->id); + } + + if (evt->event.status) + evt->event.event = RDMA_CM_EVENT_ADDR_ERROR; +} + +static void ucma_process_route_resolved(struct cma_event *evt) +{ + if (evt->id_priv->id.verbs->device->transport_type != IBV_TRANSPORT_IB) + return; + + if (af_ib_support) + evt->event.status = ucma_query_path(&evt->id_priv->id); + else + evt->event.status = _ucma_query_route(&evt->id_priv->id); + + if (evt->event.status) + evt->event.event = RDMA_CM_EVENT_ROUTE_ERROR; +} + +static int ucma_query_req_info(struct rdma_cm_id *id) +{ + int ret; + + if (!af_ib_support) + return _ucma_query_route(id); + + ret = ucma_query_addr(id); + if (ret) + return ret; + + ret = 
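/*
 * Illustrative sketch, not part of the patch: joining and leaving a
 * multicast group through the paths above. The id is assumed to be a UD id
 * whose group address was already resolved with rdma_resolve_addr(); in
 * synchronous mode rdma_join_multicast() returns only after the
 * RDMA_CM_EVENT_MULTICAST_JOIN event has been processed.
 */
#include <rdma/rdma_cma.h>

static int join_group(struct rdma_cm_id *id, struct sockaddr *mc_addr)
{
	int ret;

	ret = rdma_join_multicast(id, mc_addr, NULL);
	if (ret)
		return ret;

	/* ... post receives and send to the group here ... */

	return rdma_leave_multicast(id, mc_addr);
}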
ucma_query_gid(id); + if (ret) + return ret; + + ret = ucma_query_path(id); + if (ret) + return ret; + + return 0; +} + +static int ucma_process_conn_req(struct cma_event *evt, + uint32_t handle) +{ + struct cma_id_private *id_priv; + int ret; + + id_priv = ucma_alloc_id(evt->id_priv->id.channel, + evt->id_priv->id.context, evt->id_priv->id.ps, + evt->id_priv->id.qp_type); + if (!id_priv) { + ucma_destroy_kern_id(evt->id_priv->id.channel->fid, handle); + ret = ERR(ENOMEM); + goto err1; + } + + evt->event.listen_id = &evt->id_priv->id; + evt->event.id = &id_priv->id; + id_priv->handle = handle; + ucma_insert_id(id_priv); + id_priv->initiator_depth = evt->event.param.conn.initiator_depth; + id_priv->responder_resources = evt->event.param.conn.responder_resources; + + if (evt->id_priv->sync) { + ret = rdma_migrate_id(&id_priv->id, NULL); + if (ret) + goto err2; + } + + ret = ucma_query_req_info(&id_priv->id); + if (ret) + goto err2; + + return 0; + +err2: + rdma_destroy_id(&id_priv->id); +err1: + ucma_complete_event(evt->id_priv); + return ret; +} + +static int ucma_process_conn_resp(struct cma_id_private *id_priv) +{ + struct ucma_abi_accept cmd; + int ret; + + ret = ucma_modify_qp_rtr(&id_priv->id, RDMA_MAX_RESP_RES); + if (ret) + goto err; + + ret = ucma_modify_qp_rts(&id_priv->id, RDMA_MAX_INIT_DEPTH); + if (ret) + goto err; + + CMA_INIT_CMD(&cmd, sizeof cmd, ACCEPT); + cmd.id = id_priv->handle; + + ret = ucma_accept(id_priv->id.channel->fid, &cmd, sizeof cmd); + if (ret) { + return ERR(-ret); + goto err; + } + + return 0; +err: + ucma_modify_qp_err(&id_priv->id); + return ret; +} + +static int ucma_process_join(struct cma_event *evt) +{ + evt->mc->mgid = evt->event.param.ud.ah_attr.grh.dgid; + evt->mc->mlid = evt->event.param.ud.ah_attr.dlid; + + if (!evt->id_priv->id.qp) + return 0; + + return rdma_seterrno(ibv_attach_mcast(evt->id_priv->id.qp, + &evt->mc->mgid, evt->mc->mlid)); +} + +static void ucma_copy_conn_event(struct cma_event *event, + struct ucma_abi_conn_param *src) +{ + struct rdma_conn_param *dst = &event->event.param.conn; + + dst->private_data_len = src->private_data_len; + if (src->private_data_len) { + dst->private_data = &event->private_data; + memcpy(&event->private_data, src->private_data, + src->private_data_len); + } + + dst->responder_resources = src->responder_resources; + dst->initiator_depth = src->initiator_depth; + dst->flow_control = src->flow_control; + dst->retry_count = src->retry_count; + dst->rnr_retry_count = src->rnr_retry_count; + dst->srq = src->srq; + dst->qp_num = src->qp_num; +} + +static void ucma_copy_ud_event(struct cma_event *event, + struct ucma_abi_ud_param *src) +{ + struct rdma_ud_param *dst = &event->event.param.ud; + + dst->private_data_len = src->private_data_len; + if (src->private_data_len) { + dst->private_data = &event->private_data; + memcpy(&event->private_data, src->private_data, + src->private_data_len); + } + + ibv_copy_ah_attr_from_kern(&dst->ah_attr, &src->ah_attr); + dst->qp_num = src->qp_num; + dst->qkey = src->qkey; +} + +int rdma_get_cm_event(struct rdma_event_channel *channel, + struct rdma_cm_event **event) +{ + struct ucma_abi_event_resp resp; + struct ucma_abi_get_event cmd; + struct cma_event *evt; + int ret; + + ret = _ucma_init(); + if (ret) + return ret; + + if (!event) + return ERR(EINVAL); + + evt = malloc(sizeof *evt); + if (!evt) + return ERR(ENOMEM); + +retry: + memset(evt, 0, sizeof *evt); + CMA_INIT_CMD_RESP(&cmd, sizeof cmd, GET_EVENT, &resp, sizeof resp); + ret = ucma_get_event(channel->fid, &cmd, sizeof 
cmd, &resp, sizeof resp); + if (ret) { + free(evt); + return ERR(-ret); + } + + VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp); + + evt->event.event = resp.event; + /* + * We should have a non-zero uid, except for connection requests. + * But a bug in older kernels can report a uid 0. Work-around this + * issue by looking up the cma_id based on the kernel's id when the + * uid is 0 and we're processing a connection established event. + * In all other cases, if the uid is 0, we discard the event, like + * the kernel should have done. + */ + if (resp.uid) { + evt->id_priv = (void *) (uintptr_t) resp.uid; + } else { + evt->id_priv = ucma_lookup_id(resp.id); + if (!evt->id_priv) { + syslog(LOG_WARNING, "rdmacm: warning: discarding unmatched " + "event - rdma_destroy_id may hang.\n"); + goto retry; + } + if (resp.event != RDMA_CM_EVENT_ESTABLISHED) { + ucma_complete_event(evt->id_priv); + goto retry; + } + } + evt->event.id = &evt->id_priv->id; + evt->event.status = resp.status; + + switch (resp.event) { + case RDMA_CM_EVENT_ADDR_RESOLVED: + ucma_process_addr_resolved(evt); + break; + case RDMA_CM_EVENT_ROUTE_RESOLVED: + ucma_process_route_resolved(evt); + break; + case RDMA_CM_EVENT_CONNECT_REQUEST: + evt->id_priv = (void *) (uintptr_t) resp.uid; + if (ucma_is_ud_qp(evt->id_priv->id.qp_type)) + ucma_copy_ud_event(evt, &resp.param.ud); + else + ucma_copy_conn_event(evt, &resp.param.conn); + + ret = ucma_process_conn_req(evt, resp.id); + if (ret) + goto retry; + break; + case RDMA_CM_EVENT_CONNECT_RESPONSE: + ucma_copy_conn_event(evt, &resp.param.conn); + evt->event.status = ucma_process_conn_resp(evt->id_priv); + if (!evt->event.status) + evt->event.event = RDMA_CM_EVENT_ESTABLISHED; + else { + evt->event.event = RDMA_CM_EVENT_CONNECT_ERROR; + evt->id_priv->connect_error = 1; + } + break; + case RDMA_CM_EVENT_ESTABLISHED: + if (ucma_is_ud_qp(evt->id_priv->id.qp_type)) { + ucma_copy_ud_event(evt, &resp.param.ud); + break; + } + + ucma_copy_conn_event(evt, &resp.param.conn); + break; + case RDMA_CM_EVENT_REJECTED: + if (evt->id_priv->connect_error) { + ucma_complete_event(evt->id_priv); + goto retry; + } + ucma_copy_conn_event(evt, &resp.param.conn); + ucma_modify_qp_err(evt->event.id); + break; + case RDMA_CM_EVENT_DISCONNECTED: + if (evt->id_priv->connect_error) { + ucma_complete_event(evt->id_priv); + goto retry; + } + ucma_copy_conn_event(evt, &resp.param.conn); + break; + case RDMA_CM_EVENT_MULTICAST_JOIN: + evt->mc = (void *) (uintptr_t) resp.uid; + evt->id_priv = evt->mc->id_priv; + evt->event.id = &evt->id_priv->id; + ucma_copy_ud_event(evt, &resp.param.ud); + evt->event.param.ud.private_data = evt->mc->context; + evt->event.status = ucma_process_join(evt); + if (evt->event.status) + evt->event.event = RDMA_CM_EVENT_MULTICAST_ERROR; + break; + case RDMA_CM_EVENT_MULTICAST_ERROR: + evt->mc = (void *) (uintptr_t) resp.uid; + evt->id_priv = evt->mc->id_priv; + evt->event.id = &evt->id_priv->id; + evt->event.param.ud.private_data = evt->mc->context; + break; + default: + evt->id_priv = (void *) (uintptr_t) resp.uid; + evt->event.id = &evt->id_priv->id; + evt->event.status = resp.status; + if (ucma_is_ud_qp(evt->id_priv->id.qp_type)) + ucma_copy_ud_event(evt, &resp.param.ud); + else + ucma_copy_conn_event(evt, &resp.param.conn); + break; + } + + *event = &evt->event; + return 0; +} + +const char *rdma_event_str(enum rdma_cm_event_type event) +{ + switch (event) { + case RDMA_CM_EVENT_ADDR_RESOLVED: + return "RDMA_CM_EVENT_ADDR_RESOLVED"; + case RDMA_CM_EVENT_ADDR_ERROR: + return 
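/*
 * Illustrative sketch, not part of the patch: a minimal asynchronous loop
 * over the event dispatcher above. Every event returned by
 * rdma_get_cm_event() must be released with rdma_ack_cm_event(); the
 * events_completed counters only advance on ack, and calls such as
 * rdma_leave_multicast() or rdma_migrate_id() wait for them to catch up.
 */
#include <stdio.h>
#include <rdma/rdma_cma.h>

static int drain_events(struct rdma_event_channel *channel)
{
	struct rdma_cm_event *event;
	int done = 0;

	while (!done && !rdma_get_cm_event(channel, &event)) {
		switch (event->event) {
		case RDMA_CM_EVENT_ESTABLISHED:
			printf("connected\n");
			break;
		case RDMA_CM_EVENT_DISCONNECTED:
			done = 1;
			break;
		default:
			printf("%s (status %d)\n",
			       rdma_event_str(event->event), event->status);
			break;
		}
		rdma_ack_cm_event(event);
	}
	return done ? 0 : -1;
}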
"RDMA_CM_EVENT_ADDR_ERROR"; + case RDMA_CM_EVENT_ROUTE_RESOLVED: + return "RDMA_CM_EVENT_ROUTE_RESOLVED"; + case RDMA_CM_EVENT_ROUTE_ERROR: + return "RDMA_CM_EVENT_ROUTE_ERROR"; + case RDMA_CM_EVENT_CONNECT_REQUEST: + return "RDMA_CM_EVENT_CONNECT_REQUEST"; + case RDMA_CM_EVENT_CONNECT_RESPONSE: + return "RDMA_CM_EVENT_CONNECT_RESPONSE"; + case RDMA_CM_EVENT_CONNECT_ERROR: + return "RDMA_CM_EVENT_CONNECT_ERROR"; + case RDMA_CM_EVENT_UNREACHABLE: + return "RDMA_CM_EVENT_UNREACHABLE"; + case RDMA_CM_EVENT_REJECTED: + return "RDMA_CM_EVENT_REJECTED"; + case RDMA_CM_EVENT_ESTABLISHED: + return "RDMA_CM_EVENT_ESTABLISHED"; + case RDMA_CM_EVENT_DISCONNECTED: + return "RDMA_CM_EVENT_DISCONNECTED"; + case RDMA_CM_EVENT_DEVICE_REMOVAL: + return "RDMA_CM_EVENT_DEVICE_REMOVAL"; + case RDMA_CM_EVENT_MULTICAST_JOIN: + return "RDMA_CM_EVENT_MULTICAST_JOIN"; + case RDMA_CM_EVENT_MULTICAST_ERROR: + return "RDMA_CM_EVENT_MULTICAST_ERROR"; + case RDMA_CM_EVENT_ADDR_CHANGE: + return "RDMA_CM_EVENT_ADDR_CHANGE"; + case RDMA_CM_EVENT_TIMEWAIT_EXIT: + return "RDMA_CM_EVENT_TIMEWAIT_EXIT"; + default: + return "UNKNOWN EVENT"; + } +} + +int rdma_set_option(struct rdma_cm_id *id, int level, int optname, + void *optval, size_t optlen) +{ + struct ucma_abi_set_option cmd; + struct cma_id_private *id_priv; + int ret; + + CMA_INIT_CMD(&cmd, sizeof cmd, SET_OPTION); + id_priv = container_of(id, struct cma_id_private, id); + cmd.id = id_priv->handle; + cmd.optval = (uintptr_t) optval; + cmd.level = level; + cmd.optname = optname; + cmd.optlen = optlen; + + ret = ucma_set_option(id->channel->fid, &cmd, sizeof cmd); + if (ret) + return ERR(-ret); + + return 0; +} + +int rdma_migrate_id(struct rdma_cm_id *id, struct rdma_event_channel *channel) +{ + struct ucma_abi_migrate_resp resp; + struct ucma_abi_migrate_id cmd; + struct cma_id_private *id_priv; + int ret, sync; + + id_priv = container_of(id, struct cma_id_private, id); + if (id_priv->sync && !channel) + return ERR(EINVAL); + + if ((sync = (channel == NULL))) { + channel = rdma_create_event_channel(); + if (!channel) + return -1; + } + + CMA_INIT_CMD_RESP(&cmd, sizeof cmd, MIGRATE_ID, &resp, sizeof resp); + cmd.id = id_priv->handle; + cmd.fd = id->channel->fd; + + ret = ucma_migrate_id(channel->fid, &cmd, sizeof cmd, &resp, sizeof resp); + if (ret) { + if (sync) + rdma_destroy_event_channel(channel); + return ERR(-ret); + } + + VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp); + + if (id_priv->sync) { + if (id->event) { + rdma_ack_cm_event(id->event); + id->event = NULL; + } + rdma_destroy_event_channel(id->channel); + } + + /* + * Eventually if we want to support migrating channels while events are + * being processed on the current channel, we need to block here while + * there are any outstanding events on the current channel for this id + * to prevent the user from processing events for this id on the old + * channel after this call returns. 
+ */ + pthread_mutex_lock(&id_priv->mut); + id_priv->sync = sync; + id->channel = channel; + while (id_priv->events_completed < resp.events_reported) + pthread_cond_wait(&id_priv->cond, &id_priv->mut); + pthread_mutex_unlock(&id_priv->mut); + + return 0; +} + +static int ucma_passive_ep(struct rdma_cm_id *id, struct rdma_addrinfo *res, + struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr) +{ + struct cma_id_private *id_priv; + int ret; + + if (af_ib_support) + ret = rdma_bind_addr2(id, res->ai_src_addr, res->ai_src_len); + else + ret = rdma_bind_addr(id, res->ai_src_addr); + if (ret) + return ret; + + id_priv = container_of(id, struct cma_id_private, id); + if (pd) + id->pd = pd; + + if (qp_init_attr) { + id_priv->qp_init_attr = malloc(sizeof *qp_init_attr); + if (!id_priv->qp_init_attr) + return ERR(ENOMEM); + + *id_priv->qp_init_attr = *qp_init_attr; + id_priv->qp_init_attr->qp_type = res->ai_qp_type; + } + + return 0; +} + +int rdma_create_ep(struct rdma_cm_id **id, struct rdma_addrinfo *res, + struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr) +{ + struct rdma_cm_id *cm_id; + struct cma_id_private *id_priv; + int ret; + + ret = rdma_create_id2(NULL, &cm_id, NULL, res->ai_port_space, res->ai_qp_type); + if (ret) + return ret; + + if (res->ai_flags & RAI_PASSIVE) { + ret = ucma_passive_ep(cm_id, res, pd, qp_init_attr); + if (ret) + goto err; + goto out; + } + + if (af_ib_support) + ret = rdma_resolve_addr2(cm_id, res->ai_src_addr, res->ai_src_len, + res->ai_dst_addr, res->ai_dst_len, 2000); + else + ret = rdma_resolve_addr(cm_id, res->ai_src_addr, res->ai_dst_addr, 2000); + if (ret) + goto err; + + if (res->ai_route_len) { + ret = rdma_set_option(cm_id, RDMA_OPTION_IB, RDMA_OPTION_IB_PATH, + res->ai_route, res->ai_route_len); + if (!ret) + ret = ucma_complete(cm_id); + } else { + ret = rdma_resolve_route(cm_id, 2000); + } + if (ret) + goto err; + + if (qp_init_attr) { + qp_init_attr->qp_type = res->ai_qp_type; + ret = rdma_create_qp(cm_id, pd, qp_init_attr); + if (ret) + goto err; + } + + if (res->ai_connect_len) { + id_priv = container_of(cm_id, struct cma_id_private, id); + id_priv->connect = malloc(res->ai_connect_len); + if (!id_priv->connect) { + ret = ERR(ENOMEM); + goto err; + } + memcpy(id_priv->connect, res->ai_connect, res->ai_connect_len); + id_priv->connect_len = res->ai_connect_len; + } + +out: + *id = cm_id; + return 0; + +err: + rdma_destroy_ep(cm_id); + return ret; +} + +void rdma_destroy_ep(struct rdma_cm_id *id) +{ + struct cma_id_private *id_priv; + + if (id->qp) + rdma_destroy_qp(id); + + if (id->srq) + rdma_destroy_srq(id); + + id_priv = container_of(id, struct cma_id_private, id); + if (id_priv->qp_init_attr) + free(id_priv->qp_init_attr); + + rdma_destroy_id(id); +} + +int ucma_max_qpsize(struct rdma_cm_id *id) +{ + struct cma_id_private *id_priv; + int i, max_size = 0; + + id_priv = container_of(id, struct cma_id_private, id); + if (id && id_priv->cma_dev) { + max_size = id_priv->cma_dev->max_qpsize; + } else { + _ucma_init(); + for (i = 0; i < cma_dev_cnt; i++) { + if (!max_size || max_size > cma_dev_array[i].max_qpsize) + max_size = cma_dev_array[i].max_qpsize; + } + } + return max_size; +} + +uint16_t ucma_get_port(struct sockaddr *addr) +{ + switch (addr->sa_family) { + case AF_INET: + return ((struct sockaddr_in *) addr)->sin_port; + case AF_INET6: + return ((struct sockaddr_in6 *) addr)->sin6_port; + case AF_IB: + return htons((uint16_t) ntohll(((struct sockaddr_ib *) addr)->sib_sid)); + default: + return 0; + } +} + +uint16_t 
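/*
 * Illustrative sketch, not part of the patch: the higher-level endpoint path
 * implemented by rdma_create_ep() above, which combines id creation,
 * address/route resolution (or bind when RAI_PASSIVE is set) and optional QP
 * creation. The node name, service string and QP sizes are placeholders.
 */
#include <string.h>
#include <rdma/rdma_cma.h>

static int open_active_ep(struct rdma_cm_id **id)
{
	struct rdma_addrinfo hints, *res;
	struct ibv_qp_init_attr attr;
	int ret;

	memset(&hints, 0, sizeof hints);
	hints.ai_port_space = RDMA_PS_TCP;

	ret = rdma_getaddrinfo("server.example.com", "7471", &hints, &res);
	if (ret)
		return ret;

	memset(&attr, 0, sizeof attr);
	attr.cap.max_send_wr = attr.cap.max_recv_wr = 8;
	attr.cap.max_send_sge = attr.cap.max_recv_sge = 1;

	ret = rdma_create_ep(id, res, NULL, &attr);
	rdma_freeaddrinfo(res);
	return ret;
}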
rdma_get_src_port(struct rdma_cm_id *id) +{ + return ucma_get_port(&id->route.addr.src_addr); +} + +uint16_t rdma_get_dst_port(struct rdma_cm_id *id) +{ + return ucma_get_port(&id->route.addr.dst_addr); +} + diff --git a/prov/rdmacm/src/cma.h b/prov/rdmacm/src/cma.h new file mode 100644 index 00000000000..97c7cfdf998 --- /dev/null +++ b/prov/rdmacm/src/cma.h @@ -0,0 +1,155 @@ +/* + * Copyright (c) 2005-2012 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#if !defined(CMA_H) +#define CMA_H + +#if HAVE_CONFIG_H +# include <config.h> +#endif /* HAVE_CONFIG_H */ + +#include <stdlib.h> +#include <errno.h> +#include <endian.h> +#include <byteswap.h> +#include <semaphore.h> + +#include <rdma/rdma_cma.h> +#include <infiniband/ib.h> +#include <rdma/fabric.h> +#include <fi.h> + + +/* + * Fast synchronization for low contention locking. 
+ */ +#if DEFINE_ATOMICS +#define fastlock_t pthread_mutex_t +#define fastlock_init(lock) pthread_mutex_init(lock, NULL) +#define fastlock_destroy(lock) pthread_mutex_destroy(lock) +#define fastlock_acquire(lock) pthread_mutex_lock(lock) +#define fastlock_release(lock) pthread_mutex_unlock(lock) + +typedef struct { pthread_mutex_t mut; int val; } atomic_t; +static inline int atomic_inc(atomic_t *atomic) +{ + int v; + + pthread_mutex_lock(&atomic->mut); + v = ++(atomic->val); + pthread_mutex_unlock(&atomic->mut); + return v; +} +static inline int atomic_dec(atomic_t *atomic) +{ + int v; + + pthread_mutex_lock(&atomic->mut); + v = --(atomic->val); + pthread_mutex_unlock(&atomic->mut); + return v; +} +static inline void atomic_init(atomic_t *atomic) +{ + pthread_mutex_init(&atomic->mut, NULL); + atomic->val = 0; +} +#else +typedef struct { + sem_t sem; + volatile int cnt; +} fastlock_t; +static inline void fastlock_init(fastlock_t *lock) +{ + sem_init(&lock->sem, 0, 0); + lock->cnt = 0; +} +static inline void fastlock_destroy(fastlock_t *lock) +{ + sem_destroy(&lock->sem); +} +static inline void fastlock_acquire(fastlock_t *lock) +{ + if (__sync_add_and_fetch(&lock->cnt, 1) > 1) + sem_wait(&lock->sem); +} +static inline void fastlock_release(fastlock_t *lock) +{ + if (__sync_sub_and_fetch(&lock->cnt, 1) > 0) + sem_post(&lock->sem); +} + +typedef struct { volatile int val; } atomic_t; +#define atomic_inc(v) (__sync_add_and_fetch(&(v)->val, 1)) +#define atomic_dec(v) (__sync_sub_and_fetch(&(v)->val, 1)) +#define atomic_init(v) ((v)->val = 0) +#endif /* DEFINE_ATOMICS */ +#define atomic_get(v) ((v)->val) +#define atomic_set(v, s) ((v)->val = s) + +uint16_t ucma_get_port(struct sockaddr *addr); +void ucma_set_sid(enum rdma_port_space ps, struct sockaddr *addr, + struct sockaddr_ib *sib); +int ucma_max_qpsize(struct rdma_cm_id *id); +int ucma_complete(struct rdma_cm_id *id); + +static inline int ERR(int err) +{ + errno = err; + return -1; +} + +int ucma_init(); +extern int af_ib_support; + +#define RAI_ROUTEONLY 0x01000000 + +void ucma_ib_init(); +void ucma_ib_cleanup(); +void ucma_ib_resolve(struct rdma_addrinfo **rai, struct rdma_addrinfo *hints); + +struct ib_connect_hdr { + uint8_t cma_version; + uint8_t ip_version; /* IP version: 7:4 */ + uint16_t port; + uint32_t src_addr[4]; + uint32_t dst_addr[4]; +#define cma_src_ip4 src_addr[3] +#define cma_src_ip6 src_addr[0] +#define cma_dst_ip4 dst_addr[3] +#define cma_dst_ip6 dst_addr[0] +}; + +#define RS_CONF_DIR RDMA_CONF_DIR "/rsocket" + +#endif /* CMA_H */ diff --git a/prov/rdmacm/src/indexer.c b/prov/rdmacm/src/indexer.c new file mode 100644 index 00000000000..c8e8bce53ce --- /dev/null +++ b/prov/rdmacm/src/indexer.c @@ -0,0 +1,166 @@ +/* + * Copyright (c) 2011 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
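/*
 * Illustrative sketch, not part of the patch: using the fastlock above. The
 * uncontended acquire/release path is a single atomic increment/decrement;
 * only a contended acquire falls back to sem_wait(), which is why the
 * semaphore is initialized to 0. The counter and its lock are example names,
 * and cma.h above is assumed to be included.
 */
#include "cma.h"

static fastlock_t counter_lock;
static int counter;

static void counter_setup(void)
{
	fastlock_init(&counter_lock);
}

static int counter_bump(void)
{
	int v;

	fastlock_acquire(&counter_lock);
	v = ++counter;
	fastlock_release(&counter_lock);
	return v;
}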
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#if HAVE_CONFIG_H +# include <config.h> +#endif /* HAVE_CONFIG_H */ + +#include <sys/types.h> +#include <stdlib.h> + +#include "indexer.h" +#include "cma.h" + +/* + * Indexer - to find a structure given an index + * + * We store pointers using a double lookup and return an index to the + * user which is then used to retrieve the pointer. The upper bits of + * the index are itself an index into an array of memory allocations. + * The lower bits specify the offset into the allocated memory where + * the pointer is stored. + * + * This allows us to adjust the number of pointers stored by the index + * list without taking a lock during data lookups. + */ + +static int idx_grow(struct indexer *idx) +{ + union idx_entry *entry; + int i, start_index; + + if (idx->size >= IDX_ARRAY_SIZE) + goto nomem; + + idx->array[idx->size] = calloc(IDX_ENTRY_SIZE, sizeof(union idx_entry)); + if (!idx->array[idx->size]) + goto nomem; + + entry = idx->array[idx->size]; + start_index = idx->size << IDX_ENTRY_BITS; + entry[IDX_ENTRY_SIZE - 1].next = idx->free_list; + + for (i = IDX_ENTRY_SIZE - 2; i >= 0; i--) + entry[i].next = start_index + i + 1; + + /* Index 0 is reserved */ + if (start_index == 0) + start_index++; + idx->free_list = start_index; + idx->size++; + return start_index; + +nomem: + errno = ENOMEM; + return -1; +} + +int idx_insert(struct indexer *idx, void *item) +{ + union idx_entry *entry; + int index; + + if ((index = idx->free_list) == 0) { + if ((index = idx_grow(idx)) <= 0) + return index; + } + + entry = idx->array[idx_array_index(index)]; + idx->free_list = entry[idx_entry_index(index)].next; + entry[idx_entry_index(index)].item = item; + return index; +} + +void *idx_remove(struct indexer *idx, int index) +{ + union idx_entry *entry; + void *item; + + entry = idx->array[idx_array_index(index)]; + item = entry[idx_entry_index(index)].item; + entry[idx_entry_index(index)].next = idx->free_list; + idx->free_list = index; + return item; +} + +void idx_replace(struct indexer *idx, int index, void *item) +{ + union idx_entry *entry; + + entry = idx->array[idx_array_index(index)]; + entry[idx_entry_index(index)].item = item; +} + + +static int idm_grow(struct index_map *idm, int index) +{ + idm->array[idx_array_index(index)] = calloc(IDX_ENTRY_SIZE, sizeof(void *)); + if (!idm->array[idx_array_index(index)]) + goto nomem; + + return index; + +nomem: + errno = ENOMEM; + return -1; +} + +int idm_set(struct index_map *idm, int index, void *item) +{ + void **entry; + + if (index > IDX_MAX_INDEX) { + errno = ENOMEM; + return -1; + } + + if (!idm->array[idx_array_index(index)]) { + if (idm_grow(idm, index) < 0) + return -1; + } + + entry = idm->array[idx_array_index(index)]; + entry[idx_entry_index(index)] = item; + return index; +} + +void *idm_clear(struct index_map *idm, int 
index) +{ + void **entry; + void *item; + + entry = idm->array[idx_array_index(index)]; + item = entry[idx_entry_index(index)]; + entry[idx_entry_index(index)] = NULL; + return item; +} diff --git a/prov/rdmacm/src/indexer.h b/prov/rdmacm/src/indexer.h new file mode 100644 index 00000000000..0c5f3882673 --- /dev/null +++ b/prov/rdmacm/src/indexer.h @@ -0,0 +1,144 @@ +/* + * Copyright (c) 2011 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#if !defined(INDEXER_H) +#define INDEXER_H + +#if HAVE_CONFIG_H +# include <config.h> +#endif /* HAVE_CONFIG_H */ + +#include <sys/types.h> + +/* + * Indexer - to find a structure given an index. Synchronization + * must be provided by the caller. Caller must initialize the + * indexer by setting free_list and size to 0. + */ + +union idx_entry { + void *item; + int next; +}; + +#define IDX_INDEX_BITS 16 +#define IDX_ENTRY_BITS 10 +#define IDX_ENTRY_SIZE (1 << IDX_ENTRY_BITS) +#define IDX_ARRAY_SIZE (1 << (IDX_INDEX_BITS - IDX_ENTRY_BITS)) +#define IDX_MAX_INDEX ((1 << IDX_INDEX_BITS) - 1) + +struct indexer +{ + union idx_entry *array[IDX_ARRAY_SIZE]; + int free_list; + int size; +}; + +#define idx_array_index(index) (index >> IDX_ENTRY_BITS) +#define idx_entry_index(index) (index & (IDX_ENTRY_SIZE - 1)) + +int idx_insert(struct indexer *idx, void *item); +void *idx_remove(struct indexer *idx, int index); +void idx_replace(struct indexer *idx, int index, void *item); + +static inline void *idx_at(struct indexer *idx, int index) +{ + return (idx->array[idx_array_index(index)] + idx_entry_index(index))->item; +} + +/* + * Index map - associates a structure with an index. Synchronization + * must be provided by the caller. Caller must initialize the + * index map by setting it to 0. 
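/*
 * Illustrative sketch, not part of the patch: using the indexer above. A
 * zero-initialized struct indexer is ready to use; idx_insert() returns a
 * small integer whose upper bits select one calloc'd block in idx->array and
 * whose low IDX_ENTRY_BITS select the slot inside it, so idx_at() lookups
 * need no lock. The item and names here are examples only.
 */
#include <stdio.h>
#include "indexer.h"

static struct indexer idx;	/* free_list and size start at 0 */

static void indexer_demo(void)
{
	char *item = "example";
	int index;

	index = idx_insert(&idx, item);
	if (index < 0)
		return;

	printf("block %d, slot %d -> %s\n", idx_array_index(index),
	       idx_entry_index(index), (char *) idx_at(&idx, index));

	idx_remove(&idx, index);
}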
+ */ + +struct index_map +{ + void **array[IDX_ARRAY_SIZE]; +}; + +int idm_set(struct index_map *idm, int index, void *item); +void *idm_clear(struct index_map *idm, int index); + +static inline void *idm_at(struct index_map *idm, int index) +{ + void **entry; + entry = idm->array[idx_array_index(index)]; + return entry[idx_entry_index(index)]; +} + +static inline void *idm_lookup(struct index_map *idm, int index) +{ + return ((index <= IDX_MAX_INDEX) && idm->array[idx_array_index(index)]) ? + idm_at(idm, index) : NULL; +} + +typedef struct _dlist_entry { + struct _dlist_entry *next; + struct _dlist_entry *prev; +} dlist_entry; + +static inline void dlist_init(dlist_entry *head) +{ + head->next = head; + head->prev = head; +} + +static inline int dlist_empty(dlist_entry *head) +{ + return head->next == head; +} + +static inline void dlist_insert_after(dlist_entry *item, dlist_entry *head) +{ + item->next = head->next; + item->prev = head; + head->next->prev = item; + head->next = item; +} + +static inline void dlist_insert_before(dlist_entry *item, dlist_entry *head) +{ + dlist_insert_after(item, head->prev); +} + +#define dlist_insert_head dlist_insert_after +#define dlist_insert_tail dlist_insert_before + +static inline void dlist_remove(dlist_entry *item) +{ + item->prev->next = item->next; + item->next->prev = item->prev; +} + +#endif /* INDEXER_H */ diff --git a/prov/rdmacm/src/preload.c b/prov/rdmacm/src/preload.c new file mode 100644 index 00000000000..fb2149bf467 --- /dev/null +++ b/prov/rdmacm/src/preload.c @@ -0,0 +1,1057 @@ +/* + * Copyright (c) 2011-2012 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +#if HAVE_CONFIG_H +# include <config.h> +#endif /* HAVE_CONFIG_H */ + +#include <sys/types.h> +#include <sys/socket.h> +#include <sys/stat.h> +#include <sys/mman.h> +#include <sys/sendfile.h> +#include <stdarg.h> +#include <dlfcn.h> +#include <netdb.h> +#include <unistd.h> +#include <fcntl.h> +#include <string.h> +#include <netinet/in.h> +#include <netinet/tcp.h> +#include <unistd.h> +#include <semaphore.h> + +#include <rdma/rdma_cma.h> +#include <rdma/rdma_verbs.h> +#include <rdma/rsocket.h> +#include "cma.h" +#include "indexer.h" + +struct socket_calls { + int (*socket)(int domain, int type, int protocol); + int (*bind)(int socket, const struct sockaddr *addr, socklen_t addrlen); + int (*listen)(int socket, int backlog); + int (*accept)(int socket, struct sockaddr *addr, socklen_t *addrlen); + int (*connect)(int socket, const struct sockaddr *addr, socklen_t addrlen); + ssize_t (*recv)(int socket, void *buf, size_t len, int flags); + ssize_t (*recvfrom)(int socket, void *buf, size_t len, int flags, + struct sockaddr *src_addr, socklen_t *addrlen); + ssize_t (*recvmsg)(int socket, struct msghdr *msg, int flags); + ssize_t (*read)(int socket, void *buf, size_t count); + ssize_t (*readv)(int socket, const struct iovec *iov, int iovcnt); + ssize_t (*send)(int socket, const void *buf, size_t len, int flags); + ssize_t (*sendto)(int socket, const void *buf, size_t len, int flags, + const struct sockaddr *dest_addr, socklen_t addrlen); + ssize_t (*sendmsg)(int socket, const struct msghdr *msg, int flags); + ssize_t (*write)(int socket, const void *buf, size_t count); + ssize_t (*writev)(int socket, const struct iovec *iov, int iovcnt); + int (*poll)(struct pollfd *fds, nfds_t nfds, int timeout); + int (*shutdown)(int socket, int how); + int (*close)(int socket); + int (*getpeername)(int socket, struct sockaddr *addr, socklen_t *addrlen); + int (*getsockname)(int socket, struct sockaddr *addr, socklen_t *addrlen); + int (*setsockopt)(int socket, int level, int optname, + const void *optval, socklen_t optlen); + int (*getsockopt)(int socket, int level, int optname, + void *optval, socklen_t *optlen); + int (*fcntl)(int socket, int cmd, ... 
/* arg */); + int (*dup2)(int oldfd, int newfd); + ssize_t (*sendfile)(int out_fd, int in_fd, off_t *offset, size_t count); + int (*fxstat)(int ver, int fd, struct stat *buf); +}; + +static struct socket_calls real; +static struct socket_calls rs; + +static struct index_map idm; +static pthread_mutex_t mut = PTHREAD_MUTEX_INITIALIZER; + +static int sq_size; +static int rq_size; +static int sq_inline; +static int fork_support; + +enum fd_type { + fd_normal, + fd_rsocket +}; + +enum fd_fork_state { + fd_ready, + fd_fork, + fd_fork_listen, + fd_fork_active, + fd_fork_passive +}; + +struct fd_info { + enum fd_type type; + enum fd_fork_state state; + int fd; + int dupfd; + atomic_t refcnt; +}; + +static int fd_open(void) +{ + struct fd_info *fdi; + int ret, index; + + fdi = calloc(1, sizeof *fdi); + if (!fdi) + return ERR(ENOMEM); + + index = open("/dev/null", O_RDONLY); + if (index < 0) { + ret = index; + goto err1; + } + + fdi->dupfd = -1; + atomic_init(&fdi->refcnt); + atomic_set(&fdi->refcnt, 1); + pthread_mutex_lock(&mut); + ret = idm_set(&idm, index, fdi); + pthread_mutex_unlock(&mut); + if (ret < 0) + goto err2; + + return index; + +err2: + real.close(index); +err1: + free(fdi); + return ret; +} + +static void fd_store(int index, int fd, enum fd_type type, enum fd_fork_state state) +{ + struct fd_info *fdi; + + fdi = idm_at(&idm, index); + fdi->fd = fd; + fdi->type = type; + fdi->state = state; +} + +static inline enum fd_type fd_get(int index, int *fd) +{ + struct fd_info *fdi; + + fdi = idm_lookup(&idm, index); + if (fdi) { + *fd = fdi->fd; + return fdi->type; + + } else { + *fd = index; + return fd_normal; + } +} + +static inline int fd_getd(int index) +{ + struct fd_info *fdi; + + fdi = idm_lookup(&idm, index); + return fdi ? fdi->fd : index; +} + +static inline enum fd_fork_state fd_gets(int index) +{ + struct fd_info *fdi; + + fdi = idm_lookup(&idm, index); + return fdi ? fdi->state : fd_ready; +} + +static inline enum fd_type fd_gett(int index) +{ + struct fd_info *fdi; + + fdi = idm_lookup(&idm, index); + return fdi ? 
fdi->type : fd_normal; +} + +static enum fd_type fd_close(int index, int *fd) +{ + struct fd_info *fdi; + enum fd_type type; + + fdi = idm_lookup(&idm, index); + if (fdi) { + idm_clear(&idm, index); + *fd = fdi->fd; + type = fdi->type; + real.close(index); + free(fdi); + } else { + *fd = index; + type = fd_normal; + } + return type; +} + +void getenv_options(void) +{ + char *var; + + var = getenv("RS_SQ_SIZE"); + if (var) + sq_size = atoi(var); + + var = getenv("RS_RQ_SIZE"); + if (var) + rq_size = atoi(var); + + var = getenv("RS_INLINE"); + if (var) + sq_inline = atoi(var); + + var = getenv("RDMAV_FORK_SAFE"); + if (var) + fork_support = atoi(var); +} + +static void init_preload(void) +{ + static int init; + + /* Quick check without lock */ + if (init) + return; + + pthread_mutex_lock(&mut); + if (init) + goto out; + + real.socket = dlsym(RTLD_NEXT, "socket"); + real.bind = dlsym(RTLD_NEXT, "bind"); + real.listen = dlsym(RTLD_NEXT, "listen"); + real.accept = dlsym(RTLD_NEXT, "accept"); + real.connect = dlsym(RTLD_NEXT, "connect"); + real.recv = dlsym(RTLD_NEXT, "recv"); + real.recvfrom = dlsym(RTLD_NEXT, "recvfrom"); + real.recvmsg = dlsym(RTLD_NEXT, "recvmsg"); + real.read = dlsym(RTLD_NEXT, "read"); + real.readv = dlsym(RTLD_NEXT, "readv"); + real.send = dlsym(RTLD_NEXT, "send"); + real.sendto = dlsym(RTLD_NEXT, "sendto"); + real.sendmsg = dlsym(RTLD_NEXT, "sendmsg"); + real.write = dlsym(RTLD_NEXT, "write"); + real.writev = dlsym(RTLD_NEXT, "writev"); + real.poll = dlsym(RTLD_NEXT, "poll"); + real.shutdown = dlsym(RTLD_NEXT, "shutdown"); + real.close = dlsym(RTLD_NEXT, "close"); + real.getpeername = dlsym(RTLD_NEXT, "getpeername"); + real.getsockname = dlsym(RTLD_NEXT, "getsockname"); + real.setsockopt = dlsym(RTLD_NEXT, "setsockopt"); + real.getsockopt = dlsym(RTLD_NEXT, "getsockopt"); + real.fcntl = dlsym(RTLD_NEXT, "fcntl"); + real.dup2 = dlsym(RTLD_NEXT, "dup2"); + real.sendfile = dlsym(RTLD_NEXT, "sendfile"); + real.fxstat = dlsym(RTLD_NEXT, "__fxstat"); + + rs.socket = dlsym(RTLD_DEFAULT, "rsocket"); + rs.bind = dlsym(RTLD_DEFAULT, "rbind"); + rs.listen = dlsym(RTLD_DEFAULT, "rlisten"); + rs.accept = dlsym(RTLD_DEFAULT, "raccept"); + rs.connect = dlsym(RTLD_DEFAULT, "rconnect"); + rs.recv = dlsym(RTLD_DEFAULT, "rrecv"); + rs.recvfrom = dlsym(RTLD_DEFAULT, "rrecvfrom"); + rs.recvmsg = dlsym(RTLD_DEFAULT, "rrecvmsg"); + rs.read = dlsym(RTLD_DEFAULT, "rread"); + rs.readv = dlsym(RTLD_DEFAULT, "rreadv"); + rs.send = dlsym(RTLD_DEFAULT, "rsend"); + rs.sendto = dlsym(RTLD_DEFAULT, "rsendto"); + rs.sendmsg = dlsym(RTLD_DEFAULT, "rsendmsg"); + rs.write = dlsym(RTLD_DEFAULT, "rwrite"); + rs.writev = dlsym(RTLD_DEFAULT, "rwritev"); + rs.poll = dlsym(RTLD_DEFAULT, "rpoll"); + rs.shutdown = dlsym(RTLD_DEFAULT, "rshutdown"); + rs.close = dlsym(RTLD_DEFAULT, "rclose"); + rs.getpeername = dlsym(RTLD_DEFAULT, "rgetpeername"); + rs.getsockname = dlsym(RTLD_DEFAULT, "rgetsockname"); + rs.setsockopt = dlsym(RTLD_DEFAULT, "rsetsockopt"); + rs.getsockopt = dlsym(RTLD_DEFAULT, "rgetsockopt"); + rs.fcntl = dlsym(RTLD_DEFAULT, "rfcntl"); + + getenv_options(); + init = 1; +out: + pthread_mutex_unlock(&mut); +} + +/* + * We currently only handle copying a few common values. 
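/*
 * Illustrative sketch, not part of the patch: the interception pattern that
 * init_preload() above relies on. The preload library, loaded through
 * LD_PRELOAD, defines the libc symbol itself and forwards to the next
 * definition in link order via dlsym(RTLD_NEXT, ...). This stand-alone
 * example wraps close() only; the real code routes rsockets to rclose().
 */
#define _GNU_SOURCE
#include <dlfcn.h>
#include <unistd.h>

static int (*forward_close)(int fd);

int close(int fd)
{
	if (!forward_close)
		forward_close = (int (*)(int)) dlsym(RTLD_NEXT, "close");

	/* a real wrapper would check whether fd maps to an rsocket here */
	return forward_close(fd);
}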
+ */ +static int copysockopts(int dfd, int sfd, struct socket_calls *dapi, + struct socket_calls *sapi) +{ + socklen_t len; + int param, ret; + + ret = sapi->fcntl(sfd, F_GETFL); + if (ret > 0) + ret = dapi->fcntl(dfd, F_SETFL, ret); + if (ret) + return ret; + + len = sizeof param; + ret = sapi->getsockopt(sfd, SOL_SOCKET, SO_REUSEADDR, &param, &len); + if (param && !ret) + ret = dapi->setsockopt(dfd, SOL_SOCKET, SO_REUSEADDR, &param, len); + if (ret) + return ret; + + len = sizeof param; + ret = sapi->getsockopt(sfd, IPPROTO_TCP, TCP_NODELAY, &param, &len); + if (param && !ret) + ret = dapi->setsockopt(dfd, IPPROTO_TCP, TCP_NODELAY, &param, len); + if (ret) + return ret; + + return 0; +} + +/* + * Convert between an rsocket and a normal socket. + */ +static int transpose_socket(int socket, enum fd_type new_type) +{ + socklen_t len = 0; + int sfd, dfd, param, ret; + struct socket_calls *sapi, *dapi; + + sfd = fd_getd(socket); + if (new_type == fd_rsocket) { + dapi = &rs; + sapi = &real; + } else { + dapi = &real; + sapi = &rs; + } + + ret = sapi->getsockname(sfd, NULL, &len); + if (ret) + return ret; + + param = (len == sizeof(struct sockaddr_in6)) ? PF_INET6 : PF_INET; + dfd = dapi->socket(param, SOCK_STREAM, 0); + if (dfd < 0) + return dfd; + + ret = copysockopts(dfd, sfd, dapi, sapi); + if (ret) + goto err; + + fd_store(socket, dfd, new_type, fd_ready); + return dfd; + +err: + dapi->close(dfd); + return ret; +} + +/* + * Use defaults on failure. + */ +void set_rsocket_options(int rsocket) +{ + if (sq_size) + rsetsockopt(rsocket, SOL_RDMA, RDMA_SQSIZE, &sq_size, sizeof sq_size); + + if (rq_size) + rsetsockopt(rsocket, SOL_RDMA, RDMA_RQSIZE, &rq_size, sizeof rq_size); + + if (sq_inline) + rsetsockopt(rsocket, SOL_RDMA, RDMA_INLINE, &sq_inline, sizeof sq_inline); +} + +int socket(int domain, int type, int protocol) +{ + static __thread int recursive; + int index, ret; + + if (recursive) + goto real; + + init_preload(); + index = fd_open(); + if (index < 0) + return index; + + if (fork_support && (domain == PF_INET || domain == PF_INET6) && + (type == SOCK_STREAM) && (!protocol || protocol == IPPROTO_TCP)) { + ret = real.socket(domain, type, protocol); + if (ret < 0) + return ret; + fd_store(index, ret, fd_normal, fd_fork); + return index; + } + + recursive = 1; + ret = rsocket(domain, type, protocol); + recursive = 0; + if (ret >= 0) { + fd_store(index, ret, fd_rsocket, fd_ready); + set_rsocket_options(ret); + return index; + } + fd_close(index, &ret); +real: + return real.socket(domain, type, protocol); +} + +int bind(int socket, const struct sockaddr *addr, socklen_t addrlen) +{ + int fd; + return (fd_get(socket, &fd) == fd_rsocket) ?
+ rbind(fd, addr, addrlen) : real.bind(fd, addr, addrlen); +} + +int listen(int socket, int backlog) +{ + int fd, ret; + if (fd_get(socket, &fd) == fd_rsocket) { + ret = rlisten(fd, backlog); + } else { + ret = real.listen(fd, backlog); + if (!ret && fd_gets(socket) == fd_fork) + fd_store(socket, fd, fd_normal, fd_fork_listen); + } + return ret; +} + +int accept(int socket, struct sockaddr *addr, socklen_t *addrlen) +{ + int fd, index, ret; + + if (fd_get(socket, &fd) == fd_rsocket) { + index = fd_open(); + if (index < 0) + return index; + + ret = raccept(fd, addr, addrlen); + if (ret < 0) { + fd_close(index, &fd); + return ret; + } + + fd_store(index, ret, fd_rsocket, fd_ready); + return index; + } else if (fd_gets(socket) == fd_fork_listen) { + index = fd_open(); + if (index < 0) + return index; + + ret = real.accept(fd, addr, addrlen); + if (ret < 0) { + fd_close(index, &fd); + return ret; + } + + fd_store(index, ret, fd_normal, fd_fork_passive); + return index; + } else { + return real.accept(fd, addr, addrlen); + } +} + +/* + * We can't fork RDMA connections and pass them from the parent to the child + * process. Instead, we need to establish the RDMA connection after calling + * fork. To do this, we delay establishing the RDMA connection until we try + * to send/receive on the server side. + */ +static void fork_active(int socket) +{ + struct sockaddr_storage addr; + int sfd, dfd, ret; + socklen_t len; + uint32_t msg; + long flags; + + sfd = fd_getd(socket); + + flags = real.fcntl(sfd, F_GETFL); + real.fcntl(sfd, F_SETFL, 0); + ret = real.recv(sfd, &msg, sizeof msg, MSG_PEEK); + real.fcntl(sfd, F_SETFL, flags); + if ((ret != sizeof msg) || msg) + goto err1; + + len = sizeof addr; + ret = real.getpeername(sfd, (struct sockaddr *) &addr, &len); + if (ret) + goto err1; + + dfd = rsocket(addr.ss_family, SOCK_STREAM, 0); + if (dfd < 0) + goto err1; + + ret = rconnect(dfd, (struct sockaddr *) &addr, len); + if (ret) + goto err2; + + set_rsocket_options(dfd); + copysockopts(dfd, sfd, &rs, &real); + real.shutdown(sfd, SHUT_RDWR); + real.close(sfd); + fd_store(socket, dfd, fd_rsocket, fd_ready); + return; + +err2: + rclose(dfd); +err1: + fd_store(socket, sfd, fd_normal, fd_ready); +} + +/* + * The server will start listening for the new connection, then send a + * message to the active side when the listen is ready. This does leave + * fork unsupported in the following case: the server is nonblocking and + * calls select/poll waiting to receive data from the client. 
+ */ +static void fork_passive(int socket) +{ + struct sockaddr_in6 sin6; + sem_t *sem; + int lfd, sfd, dfd, ret, param; + socklen_t len; + uint32_t msg; + + sfd = fd_getd(socket); + + len = sizeof sin6; + ret = real.getsockname(sfd, (struct sockaddr *) &sin6, &len); + if (ret) + goto out; + sin6.sin6_flowinfo = sin6.sin6_scope_id = 0; + memset(&sin6.sin6_addr, 0, sizeof sin6.sin6_addr); + + sem = sem_open("/rsocket_fork", O_CREAT | O_RDWR, + S_IRWXU | S_IRWXG, 1); + if (sem == SEM_FAILED) { + ret = -1; + goto out; + } + + lfd = rsocket(sin6.sin6_family, SOCK_STREAM, 0); + if (lfd < 0) { + ret = lfd; + goto sclose; + } + + param = 1; + rsetsockopt(lfd, SOL_SOCKET, SO_REUSEADDR, &param, sizeof param); + + sem_wait(sem); + ret = rbind(lfd, (struct sockaddr *) &sin6, sizeof sin6); + if (ret) + goto lclose; + + ret = rlisten(lfd, 1); + if (ret) + goto lclose; + + msg = 0; + len = real.write(sfd, &msg, sizeof msg); + if (len != sizeof msg) + goto lclose; + + dfd = raccept(lfd, NULL, NULL); + if (dfd < 0) { + ret = dfd; + goto lclose; + } + + set_rsocket_options(dfd); + copysockopts(dfd, sfd, &rs, &real); + real.shutdown(sfd, SHUT_RDWR); + real.close(sfd); + fd_store(socket, dfd, fd_rsocket, fd_ready); + +lclose: + rclose(lfd); + sem_post(sem); +sclose: + sem_close(sem); +out: + if (ret) + fd_store(socket, sfd, fd_normal, fd_ready); +} + +static inline enum fd_type fd_fork_get(int index, int *fd) +{ + struct fd_info *fdi; + + fdi = idm_lookup(&idm, index); + if (fdi) { + if (fdi->state == fd_fork_passive) + fork_passive(index); + else if (fdi->state == fd_fork_active) + fork_active(index); + *fd = fdi->fd; + return fdi->type; + + } else { + *fd = index; + return fd_normal; + } +} + +int connect(int socket, const struct sockaddr *addr, socklen_t addrlen) +{ + int fd, ret; + + if (fd_get(socket, &fd) == fd_rsocket) { + ret = rconnect(fd, addr, addrlen); + if (!ret || errno == EINPROGRESS) + return ret; + + ret = transpose_socket(socket, fd_normal); + if (ret < 0) + return ret; + + rclose(fd); + fd = ret; + } else if (fd_gets(socket) == fd_fork) { + fd_store(socket, fd, fd_normal, fd_fork_active); + } + + return real.connect(fd, addr, addrlen); +} + +ssize_t recv(int socket, void *buf, size_t len, int flags) +{ + int fd; + return (fd_fork_get(socket, &fd) == fd_rsocket) ? + rrecv(fd, buf, len, flags) : real.recv(fd, buf, len, flags); +} + +ssize_t recvfrom(int socket, void *buf, size_t len, int flags, + struct sockaddr *src_addr, socklen_t *addrlen) +{ + int fd; + return (fd_fork_get(socket, &fd) == fd_rsocket) ? + rrecvfrom(fd, buf, len, flags, src_addr, addrlen) : + real.recvfrom(fd, buf, len, flags, src_addr, addrlen); +} + +ssize_t recvmsg(int socket, struct msghdr *msg, int flags) +{ + int fd; + return (fd_fork_get(socket, &fd) == fd_rsocket) ? + rrecvmsg(fd, msg, flags) : real.recvmsg(fd, msg, flags); +} + +ssize_t read(int socket, void *buf, size_t count) +{ + int fd; + init_preload(); + return (fd_fork_get(socket, &fd) == fd_rsocket) ? + rread(fd, buf, count) : real.read(fd, buf, count); +} + +ssize_t readv(int socket, const struct iovec *iov, int iovcnt) +{ + int fd; + init_preload(); + return (fd_fork_get(socket, &fd) == fd_rsocket) ? + rreadv(fd, iov, iovcnt) : real.readv(fd, iov, iovcnt); +} + +ssize_t send(int socket, const void *buf, size_t len, int flags) +{ + int fd; + return (fd_fork_get(socket, &fd) == fd_rsocket) ?
+ rsend(fd, buf, len, flags) : real.send(fd, buf, len, flags); +} + +ssize_t sendto(int socket, const void *buf, size_t len, int flags, + const struct sockaddr *dest_addr, socklen_t addrlen) +{ + int fd; + return (fd_fork_get(socket, &fd) == fd_rsocket) ? + rsendto(fd, buf, len, flags, dest_addr, addrlen) : + real.sendto(fd, buf, len, flags, dest_addr, addrlen); +} + +ssize_t sendmsg(int socket, const struct msghdr *msg, int flags) +{ + int fd; + return (fd_fork_get(socket, &fd) == fd_rsocket) ? + rsendmsg(fd, msg, flags) : real.sendmsg(fd, msg, flags); +} + +ssize_t write(int socket, const void *buf, size_t count) +{ + int fd; + init_preload(); + return (fd_fork_get(socket, &fd) == fd_rsocket) ? + rwrite(fd, buf, count) : real.write(fd, buf, count); +} + +ssize_t writev(int socket, const struct iovec *iov, int iovcnt) +{ + int fd; + init_preload(); + return (fd_fork_get(socket, &fd) == fd_rsocket) ? + rwritev(fd, iov, iovcnt) : real.writev(fd, iov, iovcnt); +} + +static struct pollfd *fds_alloc(nfds_t nfds) +{ + static __thread struct pollfd *rfds; + static __thread nfds_t rnfds; + + if (nfds > rnfds) { + if (rfds) + free(rfds); + + rfds = malloc(sizeof *rfds * nfds); + rnfds = rfds ? nfds : 0; + } + + return rfds; +} + +int poll(struct pollfd *fds, nfds_t nfds, int timeout) +{ + struct pollfd *rfds; + int i, ret; + + init_preload(); + for (i = 0; i < nfds; i++) { + if (fd_gett(fds[i].fd) == fd_rsocket) + goto use_rpoll; + } + + return real.poll(fds, nfds, timeout); + +use_rpoll: + rfds = fds_alloc(nfds); + if (!rfds) + return ERR(ENOMEM); + + for (i = 0; i < nfds; i++) { + rfds[i].fd = fd_getd(fds[i].fd); + rfds[i].events = fds[i].events; + rfds[i].revents = 0; + } + + ret = rpoll(rfds, nfds, timeout); + + for (i = 0; i < nfds; i++) + fds[i].revents = rfds[i].revents; + + return ret; +} + +static void select_to_rpoll(struct pollfd *fds, int *nfds, + fd_set *readfds, fd_set *writefds, fd_set *exceptfds) +{ + int fd, events, i = 0; + + for (fd = 0; fd < *nfds; fd++) { + events = (readfds && FD_ISSET(fd, readfds)) ? POLLIN : 0; + if (writefds && FD_ISSET(fd, writefds)) + events |= POLLOUT; + + if (events || (exceptfds && FD_ISSET(fd, exceptfds))) { + fds[i].fd = fd_getd(fd); + fds[i++].events = events; + } + } + + *nfds = i; +} + +static int rpoll_to_select(struct pollfd *fds, int nfds, + fd_set *readfds, fd_set *writefds, fd_set *exceptfds) +{ + int fd, rfd, i, cnt = 0; + + for (i = 0, fd = 0; i < nfds; fd++) { + rfd = fd_getd(fd); + if (rfd != fds[i].fd) + continue; + + if (readfds && (fds[i].revents & POLLIN)) { + FD_SET(fd, readfds); + cnt++; + } + + if (writefds && (fds[i].revents & POLLOUT)) { + FD_SET(fd, writefds); + cnt++; + } + + if (exceptfds && (fds[i].revents & ~(POLLIN | POLLOUT))) { + FD_SET(fd, exceptfds); + cnt++; + } + i++; + } + + return cnt; +} + +static int rs_convert_timeout(struct timeval *timeout) +{ + return !timeout ? 
-1 : timeout->tv_sec * 1000 + timeout->tv_usec / 1000; +} + +int select(int nfds, fd_set *readfds, fd_set *writefds, + fd_set *exceptfds, struct timeval *timeout) +{ + struct pollfd *fds; + int ret; + + fds = fds_alloc(nfds); + if (!fds) + return ERR(ENOMEM); + + select_to_rpoll(fds, &nfds, readfds, writefds, exceptfds); + ret = rpoll(fds, nfds, rs_convert_timeout(timeout)); + + if (readfds) + FD_ZERO(readfds); + if (writefds) + FD_ZERO(writefds); + if (exceptfds) + FD_ZERO(exceptfds); + + if (ret > 0) + ret = rpoll_to_select(fds, nfds, readfds, writefds, exceptfds); + + return ret; +} + +int shutdown(int socket, int how) +{ + int fd; + return (fd_get(socket, &fd) == fd_rsocket) ? + rshutdown(fd, how) : real.shutdown(fd, how); +} + +int close(int socket) +{ + struct fd_info *fdi; + int ret; + + init_preload(); + fdi = idm_lookup(&idm, socket); + if (!fdi) + return real.close(socket); + + if (fdi->dupfd != -1) { + ret = close(fdi->dupfd); + if (ret) + return ret; + } + + if (atomic_dec(&fdi->refcnt)) + return 0; + + idm_clear(&idm, socket); + real.close(socket); + ret = (fdi->type == fd_rsocket) ? rclose(fdi->fd) : real.close(fdi->fd); + free(fdi); + return ret; +} + +int getpeername(int socket, struct sockaddr *addr, socklen_t *addrlen) +{ + int fd; + return (fd_get(socket, &fd) == fd_rsocket) ? + rgetpeername(fd, addr, addrlen) : + real.getpeername(fd, addr, addrlen); +} + +int getsockname(int socket, struct sockaddr *addr, socklen_t *addrlen) +{ + int fd; + init_preload(); + return (fd_get(socket, &fd) == fd_rsocket) ? + rgetsockname(fd, addr, addrlen) : + real.getsockname(fd, addr, addrlen); +} + +int setsockopt(int socket, int level, int optname, + const void *optval, socklen_t optlen) +{ + int fd; + return (fd_get(socket, &fd) == fd_rsocket) ? + rsetsockopt(fd, level, optname, optval, optlen) : + real.setsockopt(fd, level, optname, optval, optlen); +} + +int getsockopt(int socket, int level, int optname, + void *optval, socklen_t *optlen) +{ + int fd; + return (fd_get(socket, &fd) == fd_rsocket) ? + rgetsockopt(fd, level, optname, optval, optlen) : + real.getsockopt(fd, level, optname, optval, optlen); +} + +int fcntl(int socket, int cmd, ... /* arg */) +{ + va_list args; + long lparam; + void *pparam; + int fd, ret; + + init_preload(); + va_start(args, cmd); + switch (cmd) { + case F_GETFD: + case F_GETFL: + case F_GETOWN: + case F_GETSIG: + case F_GETLEASE: + ret = (fd_get(socket, &fd) == fd_rsocket) ? + rfcntl(fd, cmd) : real.fcntl(fd, cmd); + break; + case F_DUPFD: + /*case F_DUPFD_CLOEXEC:*/ + case F_SETFD: + case F_SETFL: + case F_SETOWN: + case F_SETSIG: + case F_SETLEASE: + case F_NOTIFY: + lparam = va_arg(args, long); + ret = (fd_get(socket, &fd) == fd_rsocket) ? + rfcntl(fd, cmd, lparam) : real.fcntl(fd, cmd, lparam); + break; + default: + pparam = va_arg(args, void *); + ret = (fd_get(socket, &fd) == fd_rsocket) ? 
+ rfcntl(fd, cmd, pparam) : real.fcntl(fd, cmd, pparam); + break; + } + va_end(args); + return ret; +} + +/* + * dup2 is not thread safe + */ +int dup2(int oldfd, int newfd) +{ + struct fd_info *oldfdi, *newfdi; + int ret; + + init_preload(); + oldfdi = idm_lookup(&idm, oldfd); + if (oldfdi) { + if (oldfdi->state == fd_fork_passive) + fork_passive(oldfd); + else if (oldfdi->state == fd_fork_active) + fork_active(oldfd); + } + + newfdi = idm_lookup(&idm, newfd); + if (newfdi) { + /* newfd cannot have been dup'ed directly */ + if (atomic_get(&newfdi->refcnt) > 1) + return ERR(EBUSY); + close(newfd); + } + + ret = real.dup2(oldfd, newfd); + if (!oldfdi || ret != newfd) + return ret; + + newfdi = calloc(1, sizeof *newfdi); + if (!newfdi) { + close(newfd); + return ERR(ENOMEM); + } + + pthread_mutex_lock(&mut); + idm_set(&idm, newfd, newfdi); + pthread_mutex_unlock(&mut); + + newfdi->fd = oldfdi->fd; + newfdi->type = oldfdi->type; + if (oldfdi->dupfd != -1) { + newfdi->dupfd = oldfdi->dupfd; + oldfdi = idm_lookup(&idm, oldfdi->dupfd); + } else { + newfdi->dupfd = oldfd; + } + atomic_init(&newfdi->refcnt); + atomic_set(&newfdi->refcnt, 1); + atomic_inc(&oldfdi->refcnt); + return newfd; +} + +ssize_t sendfile(int out_fd, int in_fd, off_t *offset, size_t count) +{ + void *file_addr; + int fd; + size_t ret; + + if (fd_get(out_fd, &fd) != fd_rsocket) + return real.sendfile(fd, in_fd, offset, count); + + file_addr = mmap(NULL, count, PROT_READ, 0, in_fd, offset ? *offset : 0); + if (file_addr == (void *) -1) + return -1; + + ret = rwrite(fd, file_addr, count); + if ((ret > 0) && offset) + lseek(in_fd, ret, SEEK_CUR); + munmap(file_addr, count); + return ret; +} + +int __fxstat(int ver, int socket, struct stat *buf) +{ + int fd, ret; + + init_preload(); + if (fd_get(socket, &fd) == fd_rsocket) { + ret = real.fxstat(ver, socket, buf); + if (!ret) + buf->st_mode = (buf->st_mode & ~S_IFMT) | __S_IFSOCK; + } else { + ret = real.fxstat(ver, fd, buf); + } + return ret; +} diff --git a/prov/rdmacm/src/rsocket.c b/prov/rdmacm/src/rsocket.c new file mode 100644 index 00000000000..e5595687db0 --- /dev/null +++ b/prov/rdmacm/src/rsocket.c @@ -0,0 +1,3970 @@ +/* + * Copyright (c) 2008-2013 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#if HAVE_CONFIG_H +# include <config.h> +#endif /* HAVE_CONFIG_H */ + +#include <sys/types.h> +#include <sys/socket.h> +#include <sys/time.h> +#include <stdarg.h> +#include <netdb.h> +#include <unistd.h> +#include <fcntl.h> +#include <stdio.h> +#include <stddef.h> +#include <string.h> +#include <netinet/in.h> +#include <netinet/tcp.h> +#include <sys/epoll.h> +#include <search.h> + +#include <rdma/rdma_cma.h> +#include <rdma/rdma_verbs.h> +#include <rdma/rsocket.h> +#include "cma.h" +#include "indexer.h" + +#define RS_OLAP_START_SIZE 2048 +#define RS_MAX_TRANSFER 65536 +#define RS_SNDLOWAT 2048 +#define RS_QP_MAX_SIZE 0xFFFE +#define RS_QP_CTRL_SIZE 4 +#define RS_CONN_RETRIES 6 +#define RS_SGL_SIZE 2 +static struct index_map idm; +static pthread_mutex_t mut = PTHREAD_MUTEX_INITIALIZER; + +struct rsocket; + +enum { + RS_SVC_DGRAM = 1 << 0 +}; + +struct rs_svc_msg { + uint32_t svcs; + uint32_t status; + struct rsocket *rs; +}; + +static pthread_t svc_id; +static int svc_sock[2]; +static int svc_cnt; +static int svc_size; +static struct rsocket **svc_rss; +static struct pollfd *svc_fds; +static uint8_t svc_buf[RS_SNDLOWAT]; +static void *rs_svc_run(void *arg); + +static uint16_t def_iomap_size = 0; +static uint16_t def_inline = 64; +static uint16_t def_sqsize = 384; +static uint16_t def_rqsize = 384; +static uint32_t def_mem = (1 << 17); +static uint32_t def_wmem = (1 << 17); +static uint32_t polling_time = 10; + +/* + * Immediate data format is determined by the upper bits + * bit 31: message type, 0 - data, 1 - control + * bit 30: buffers updated, 0 - target, 1 - direct-receive + * bit 29: more data, 0 - end of transfer, 1 - more data available + * + * for data transfers: + * bits [28:0]: bytes transferred + * for control messages: + * SGL, CTRL + * bits [28-0]: receive credits granted + * IOMAP_SGL + * bits [28-16]: reserved, bits [15-0]: index + */ + +enum { + RS_OP_DATA, + RS_OP_RSVD_DATA_MORE, + RS_OP_WRITE, /* opcode is not transmitted over the network */ + RS_OP_RSVD_DRA_MORE, + RS_OP_SGL, + RS_OP_RSVD, + RS_OP_IOMAP_SGL, + RS_OP_CTRL +}; +#define rs_msg_set(op, data) ((op << 29) | (uint32_t) (data)) +#define rs_msg_op(imm_data) (imm_data >> 29) +#define rs_msg_data(imm_data) (imm_data & 0x1FFFFFFF) +#define RS_MSG_SIZE sizeof(uint32_t) + +#define RS_WR_ID_FLAG_RECV (((uint64_t) 1) << 63) +#define rs_send_wr_id(data) ((uint64_t) data) +#define rs_recv_wr_id(data) (RS_WR_ID_FLAG_RECV | (uint64_t) data) +#define rs_wr_is_recv(wr_id) (wr_id & RS_WR_ID_FLAG_RECV) +#define rs_wr_data(wr_id) ((uint32_t) wr_id) + +enum { + RS_CTRL_DISCONNECT, + RS_CTRL_SHUTDOWN +}; + +struct rs_msg { + uint32_t op; + uint32_t data; +}; + +struct ds_qp; + +struct ds_rmsg { + struct ds_qp *qp; + uint32_t offset; + uint32_t length; +}; + +struct ds_smsg { + struct ds_smsg *next; +}; + +struct rs_sge { + uint64_t addr; + uint32_t key; + uint32_t length; +}; + +struct rs_iomap { + uint64_t offset; + struct rs_sge sge; +}; + +struct rs_iomap_mr { + uint64_t offset; + struct ibv_mr *mr; + dlist_entry entry; + atomic_t refcnt; + int index; /* -1 if mapping is local and not in iomap_list */ +}; + +#define RS_MIN_INLINE (sizeof(struct rs_sge)) +#define rs_host_is_net() (1 == htonl(1)) +#define RS_CONN_FLAG_NET (1 << 0) +#define 
RS_CONN_FLAG_IOMAP (1 << 1) + +struct rs_conn_data { + uint8_t version; + uint8_t flags; + uint16_t credits; + uint8_t reserved[3]; + uint8_t target_iomap_size; + struct rs_sge target_sgl; + struct rs_sge data_buf; +}; + +struct rs_conn_private_data { + union { + struct rs_conn_data conn_data; + struct { + struct ib_connect_hdr ib_hdr; + struct rs_conn_data conn_data; + } af_ib; + }; +}; + +/* + * rsocket states are ordered as passive, connecting, connected, disconnected. + */ +enum rs_state { + rs_init, + rs_bound = 0x0001, + rs_listening = 0x0002, + rs_opening = 0x0004, + rs_resolving_addr = rs_opening | 0x0010, + rs_resolving_route = rs_opening | 0x0020, + rs_connecting = rs_opening | 0x0040, + rs_accepting = rs_opening | 0x0080, + rs_connected = 0x0100, + rs_writable = 0x0200, + rs_readable = 0x0400, + rs_connect_rdwr = rs_connected | rs_readable | rs_writable, + rs_connect_error = 0x0800, + rs_disconnected = 0x1000, + rs_error = 0x2000, +}; + +#define RS_OPT_SWAP_SGL (1 << 0) +/* + * iWarp does not support RDMA write with immediate data. For iWarp, we + * transfer rsocket messages as inline sends. + */ +#define RS_OPT_MSG_SEND (1 << 1) + +union socket_addr { + struct sockaddr sa; + struct sockaddr_in sin; + struct sockaddr_in6 sin6; +}; + +struct ds_header { + uint8_t version; + uint8_t length; + uint16_t port; + union { + uint32_t ipv4; + struct { + uint32_t flowinfo; + uint8_t addr[16]; + } ipv6; + } addr; +}; + +#define DS_IPV4_HDR_LEN 8 +#define DS_IPV6_HDR_LEN 24 + +struct ds_dest { + union socket_addr addr; /* must be first */ + struct ds_qp *qp; + struct ibv_ah *ah; + uint32_t qpn; +}; + +struct ds_qp { + dlist_entry list; + struct rsocket *rs; + struct rdma_cm_id *cm_id; + struct ds_header hdr; + struct ds_dest dest; + + struct ibv_mr *smr; + struct ibv_mr *rmr; + uint8_t *rbuf; + + int cq_armed; +}; + +struct rsocket { + int type; + int index; + fastlock_t slock; + fastlock_t rlock; + fastlock_t cq_lock; + fastlock_t cq_wait_lock; + fastlock_t map_lock; /* acquire slock first if needed */ + + union { + /* data stream */ + struct { + struct rdma_cm_id *cm_id; + uint64_t tcp_opts; + + int ctrl_avail; + uint16_t sseq_no; + uint16_t sseq_comp; + uint16_t rseq_no; + uint16_t rseq_comp; + + int remote_sge; + struct rs_sge remote_sgl; + struct rs_sge remote_iomap; + + struct ibv_mr *target_mr; + int target_sge; + int target_iomap_size; + void *target_buffer_list; + volatile struct rs_sge *target_sgl; + struct rs_iomap *target_iomap; + + int rbuf_msg_index; + int rbuf_bytes_avail; + int rbuf_free_offset; + int rbuf_offset; + struct ibv_mr *rmr; + uint8_t *rbuf; + + int sbuf_bytes_avail; + struct ibv_mr *smr; + struct ibv_sge ssgl[2]; + }; + /* datagram */ + struct { + struct ds_qp *qp_list; + void *dest_map; + struct ds_dest *conn_dest; + + int udp_sock; + int epfd; + int rqe_avail; + struct ds_smsg *smsg_free; + }; + }; + + int svcs; + int opts; + long fd_flags; + uint64_t so_opts; + uint64_t ipv6_opts; + void *optval; + size_t optlen; + int state; + int cq_armed; + int retries; + int err; + + int sqe_avail; + uint32_t sbuf_size; + uint16_t sq_size; + uint16_t sq_inline; + + uint32_t rbuf_size; + uint16_t rq_size; + int rmsg_head; + int rmsg_tail; + union { + struct rs_msg *rmsg; + struct ds_rmsg *dmsg; + }; + + uint8_t *sbuf; + struct rs_iomap_mr *remote_iomappings; + dlist_entry iomap_list; + dlist_entry iomap_queue; + int iomap_pending; +}; + +#define DS_UDP_TAG 0x55555555 + +struct ds_udp_header { + uint32_t tag; + uint8_t version; + uint8_t op; + uint8_t length; + uint8_t 
reserved; + uint32_t qpn; /* lower 8-bits reserved */ + union { + uint32_t ipv4; + uint8_t ipv6[16]; + } addr; +}; + +#define DS_UDP_IPV4_HDR_LEN 16 +#define DS_UDP_IPV6_HDR_LEN 28 + +#define ds_next_qp(qp) container_of((qp)->list.next, struct ds_qp, list) + +static void ds_insert_qp(struct rsocket *rs, struct ds_qp *qp) +{ + if (!rs->qp_list) + dlist_init(&qp->list); + else + dlist_insert_head(&qp->list, &rs->qp_list->list); + rs->qp_list = qp; +} + +static void ds_remove_qp(struct rsocket *rs, struct ds_qp *qp) +{ + if (qp->list.next != &qp->list) { + rs->qp_list = ds_next_qp(qp); + dlist_remove(&qp->list); + } else { + rs->qp_list = NULL; + } +} + +static int rs_modify_svcs(struct rsocket *rs, int svcs) +{ + struct rs_svc_msg msg; + int ret; + + pthread_mutex_lock(&mut); + if (!svc_cnt) { + ret = socketpair(AF_UNIX, SOCK_STREAM, 0, svc_sock); + if (ret) + goto unlock; + + ret = pthread_create(&svc_id, NULL, rs_svc_run, NULL); + if (ret) { + ret = ERR(ret); + goto closepair; + } + } + + msg.svcs = svcs; + msg.status = EINVAL; + msg.rs = rs; + write(svc_sock[0], &msg, sizeof msg); + read(svc_sock[0], &msg, sizeof msg); + ret = rdma_seterrno(msg.status); + if (svc_cnt) + goto unlock; + + pthread_join(svc_id, NULL); +closepair: + close(svc_sock[0]); + close(svc_sock[1]); +unlock: + pthread_mutex_unlock(&mut); + return ret; +} + +static int ds_compare_addr(const void *dst1, const void *dst2) +{ + const struct sockaddr *sa1, *sa2; + size_t len; + + sa1 = (const struct sockaddr *) dst1; + sa2 = (const struct sockaddr *) dst2; + + len = (sa1->sa_family == AF_INET6 && sa2->sa_family == AF_INET6) ? + sizeof(struct sockaddr_in6) : sizeof(struct sockaddr_in); + return memcmp(dst1, dst2, len); +} + +static int rs_value_to_scale(int value, int bits) +{ + return value <= (1 << (bits - 1)) ? + value : (1 << (bits - 1)) | (value >> bits); +} + +static int rs_scale_to_value(int value, int bits) +{ + return value <= (1 << (bits - 1)) ? 
+ value : (value & ~(1 << (bits - 1))) << bits; +} + +void rs_configure(void) +{ + FILE *f; + static int init; + + if (init) + return; + + pthread_mutex_lock(&mut); + if (init) + goto out; + + if (ucma_init()) + goto out; + ucma_ib_init(); + + if ((f = fopen(RS_CONF_DIR "/polling_time", "r"))) { + (void) fscanf(f, "%u", &polling_time); + fclose(f); + } + + if ((f = fopen(RS_CONF_DIR "/inline_default", "r"))) { + (void) fscanf(f, "%hu", &def_inline); + fclose(f); + + if (def_inline < RS_MIN_INLINE) + def_inline = RS_MIN_INLINE; + } + + if ((f = fopen(RS_CONF_DIR "/sqsize_default", "r"))) { + (void) fscanf(f, "%hu", &def_sqsize); + fclose(f); + } + + if ((f = fopen(RS_CONF_DIR "/rqsize_default", "r"))) { + (void) fscanf(f, "%hu", &def_rqsize); + fclose(f); + } + + if ((f = fopen(RS_CONF_DIR "/mem_default", "r"))) { + (void) fscanf(f, "%u", &def_mem); + fclose(f); + + if (def_mem < 1) + def_mem = 1; + } + + if ((f = fopen(RS_CONF_DIR "/wmem_default", "r"))) { + (void) fscanf(f, "%u", &def_wmem); + fclose(f); + if (def_wmem < RS_SNDLOWAT) + def_wmem = RS_SNDLOWAT << 1; + } + + if ((f = fopen(RS_CONF_DIR "/iomap_size", "r"))) { + (void) fscanf(f, "%hu", &def_iomap_size); + fclose(f); + + /* round to supported values */ + def_iomap_size = (uint8_t) rs_value_to_scale( + (uint16_t) rs_scale_to_value(def_iomap_size, 8), 8); + } + init = 1; +out: + pthread_mutex_unlock(&mut); +} + +static int rs_insert(struct rsocket *rs, int index) +{ + pthread_mutex_lock(&mut); + rs->index = idm_set(&idm, index, rs); + pthread_mutex_unlock(&mut); + return rs->index; +} + +static void rs_remove(struct rsocket *rs) +{ + pthread_mutex_lock(&mut); + idm_clear(&idm, rs->index); + pthread_mutex_unlock(&mut); +} + +static struct rsocket *rs_alloc(struct rsocket *inherited_rs, int type) +{ + struct rsocket *rs; + + rs = calloc(1, sizeof *rs); + if (!rs) + return NULL; + + rs->type = type; + rs->index = -1; + if (type == SOCK_DGRAM) { + rs->udp_sock = -1; + rs->epfd = -1; + } + + if (inherited_rs) { + rs->sbuf_size = inherited_rs->sbuf_size; + rs->rbuf_size = inherited_rs->rbuf_size; + rs->sq_inline = inherited_rs->sq_inline; + rs->sq_size = inherited_rs->sq_size; + rs->rq_size = inherited_rs->rq_size; + if (type == SOCK_STREAM) { + rs->ctrl_avail = inherited_rs->ctrl_avail; + rs->target_iomap_size = inherited_rs->target_iomap_size; + } + } else { + rs->sbuf_size = def_wmem; + rs->rbuf_size = def_mem; + rs->sq_inline = def_inline; + rs->sq_size = def_sqsize; + rs->rq_size = def_rqsize; + if (type == SOCK_STREAM) { + rs->ctrl_avail = RS_QP_CTRL_SIZE; + rs->target_iomap_size = def_iomap_size; + } + } + fastlock_init(&rs->slock); + fastlock_init(&rs->rlock); + fastlock_init(&rs->cq_lock); + fastlock_init(&rs->cq_wait_lock); + fastlock_init(&rs->map_lock); + dlist_init(&rs->iomap_list); + dlist_init(&rs->iomap_queue); + return rs; +} + +static int rs_set_nonblocking(struct rsocket *rs, long arg) +{ + struct ds_qp *qp; + int ret = 0; + + if (rs->type == SOCK_STREAM) { + if (rs->cm_id->recv_cq_channel) + ret = fcntl(rs->cm_id->recv_cq_channel->fd, F_SETFL, arg); + + if (!ret && rs->state < rs_connected) + ret = fcntl(rs->cm_id->channel->fd, F_SETFL, arg); + } else { + ret = fcntl(rs->epfd, F_SETFL, arg); + if (!ret && rs->qp_list) { + qp = rs->qp_list; + do { + ret = fcntl(qp->cm_id->recv_cq_channel->fd, + F_SETFL, arg); + qp = ds_next_qp(qp); + } while (qp != rs->qp_list && !ret); + } + } + + return ret; +} + +static void rs_set_qp_size(struct rsocket *rs) +{ + uint16_t max_size; + + max_size = min(ucma_max_qpsize(rs->cm_id), 
RS_QP_MAX_SIZE); + + if (rs->sq_size > max_size) + rs->sq_size = max_size; + else if (rs->sq_size < 4) + rs->sq_size = 4; + if (rs->sq_size <= (RS_QP_CTRL_SIZE << 2)) + rs->ctrl_avail = 2; + + if (rs->rq_size > max_size) + rs->rq_size = max_size; + else if (rs->rq_size < 4) + rs->rq_size = 4; +} + +static void ds_set_qp_size(struct rsocket *rs) +{ + uint16_t max_size; + + max_size = min(ucma_max_qpsize(NULL), RS_QP_MAX_SIZE); + + if (rs->sq_size > max_size) + rs->sq_size = max_size; + if (rs->rq_size > max_size) + rs->rq_size = max_size; + + if (rs->rq_size > (rs->rbuf_size / RS_SNDLOWAT)) + rs->rq_size = rs->rbuf_size / RS_SNDLOWAT; + else + rs->rbuf_size = rs->rq_size * RS_SNDLOWAT; + + if (rs->sq_size > (rs->sbuf_size / RS_SNDLOWAT)) + rs->sq_size = rs->sbuf_size / RS_SNDLOWAT; + else + rs->sbuf_size = rs->sq_size * RS_SNDLOWAT; +} + +static int rs_init_bufs(struct rsocket *rs) +{ + uint32_t rbuf_msg_size; + size_t len; + + rs->rmsg = calloc(rs->rq_size + 1, sizeof(*rs->rmsg)); + if (!rs->rmsg) + return ERR(ENOMEM); + + rs->sbuf = calloc(rs->sbuf_size, sizeof(*rs->sbuf)); + if (!rs->sbuf) + return ERR(ENOMEM); + + rs->smr = rdma_reg_msgs(rs->cm_id, rs->sbuf, rs->sbuf_size); + if (!rs->smr) + return -1; + + len = sizeof(*rs->target_sgl) * RS_SGL_SIZE + + sizeof(*rs->target_iomap) * rs->target_iomap_size; + rs->target_buffer_list = malloc(len); + if (!rs->target_buffer_list) + return ERR(ENOMEM); + + rs->target_mr = rdma_reg_write(rs->cm_id, rs->target_buffer_list, len); + if (!rs->target_mr) + return -1; + + memset(rs->target_buffer_list, 0, len); + rs->target_sgl = rs->target_buffer_list; + if (rs->target_iomap_size) + rs->target_iomap = (struct rs_iomap *) (rs->target_sgl + RS_SGL_SIZE); + + rbuf_msg_size = rs->rbuf_size; + if (rs->opts & RS_OPT_MSG_SEND) + rbuf_msg_size += rs->rq_size * RS_MSG_SIZE; + rs->rbuf = calloc(rbuf_msg_size, 1); + if (!rs->rbuf) + return ERR(ENOMEM); + + rs->rmr = rdma_reg_write(rs->cm_id, rs->rbuf, rbuf_msg_size); + if (!rs->rmr) + return -1; + + rs->ssgl[0].addr = rs->ssgl[1].addr = (uintptr_t) rs->sbuf; + rs->sbuf_bytes_avail = rs->sbuf_size; + rs->ssgl[0].lkey = rs->ssgl[1].lkey = rs->smr->lkey; + + rs->rbuf_free_offset = rs->rbuf_size >> 1; + rs->rbuf_bytes_avail = rs->rbuf_size >> 1; + rs->sqe_avail = rs->sq_size - rs->ctrl_avail; + rs->rseq_comp = rs->rq_size >> 1; + return 0; +} + +static int ds_init_bufs(struct ds_qp *qp) +{ + qp->rbuf = calloc(qp->rs->rbuf_size + sizeof(struct ibv_grh), 1); + if (!qp->rbuf) + return ERR(ENOMEM); + + qp->smr = rdma_reg_msgs(qp->cm_id, qp->rs->sbuf, qp->rs->sbuf_size); + if (!qp->smr) + return -1; + + qp->rmr = rdma_reg_msgs(qp->cm_id, qp->rbuf, qp->rs->rbuf_size + + sizeof(struct ibv_grh)); + if (!qp->rmr) + return -1; + + return 0; +} + +/* + * If a user is waiting on a datagram rsocket through poll or select, then + * we need the first completion to generate an event on the related epoll fd + * in order to signal the user. 
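+ * (A completion generates an event on the completion channel only if the
+ * CQ was armed with ibv_req_notify_cq() beforehand, and it is that channel
+ * fd which the epoll set used by the datagram code watches.)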
We arm the CQ on creation for this purpose + */ +static int rs_create_cq(struct rsocket *rs, struct rdma_cm_id *cm_id) +{ + cm_id->recv_cq_channel = ibv_create_comp_channel(cm_id->verbs); + if (!cm_id->recv_cq_channel) + return -1; + + cm_id->recv_cq = ibv_create_cq(cm_id->verbs, rs->sq_size + rs->rq_size, + cm_id, cm_id->recv_cq_channel, 0); + if (!cm_id->recv_cq) + goto err1; + + if (rs->fd_flags & O_NONBLOCK) { + if (fcntl(cm_id->recv_cq_channel->fd, F_SETFL, O_NONBLOCK)) + goto err2; + } + + ibv_req_notify_cq(cm_id->recv_cq, 0); + cm_id->send_cq_channel = cm_id->recv_cq_channel; + cm_id->send_cq = cm_id->recv_cq; + return 0; + +err2: + ibv_destroy_cq(cm_id->recv_cq); + cm_id->recv_cq = NULL; +err1: + ibv_destroy_comp_channel(cm_id->recv_cq_channel); + cm_id->recv_cq_channel = NULL; + return -1; +} + +static inline int rs_post_recv(struct rsocket *rs) +{ + struct ibv_recv_wr wr, *bad; + struct ibv_sge sge; + + wr.next = NULL; + if (!(rs->opts & RS_OPT_MSG_SEND)) { + wr.wr_id = rs_recv_wr_id(0); + wr.sg_list = NULL; + wr.num_sge = 0; + } else { + wr.wr_id = rs_recv_wr_id(rs->rbuf_msg_index); + sge.addr = (uintptr_t) rs->rbuf + rs->rbuf_size + + (rs->rbuf_msg_index * RS_MSG_SIZE); + sge.length = RS_MSG_SIZE; + sge.lkey = rs->rmr->lkey; + + wr.sg_list = &sge; + wr.num_sge = 1; + if(++rs->rbuf_msg_index == rs->rq_size) + rs->rbuf_msg_index = 0; + } + + return rdma_seterrno(ibv_post_recv(rs->cm_id->qp, &wr, &bad)); +} + +static inline int ds_post_recv(struct rsocket *rs, struct ds_qp *qp, uint32_t offset) +{ + struct ibv_recv_wr wr, *bad; + struct ibv_sge sge[2]; + + sge[0].addr = (uintptr_t) qp->rbuf + rs->rbuf_size; + sge[0].length = sizeof(struct ibv_grh); + sge[0].lkey = qp->rmr->lkey; + sge[1].addr = (uintptr_t) qp->rbuf + offset; + sge[1].length = RS_SNDLOWAT; + sge[1].lkey = qp->rmr->lkey; + + wr.wr_id = rs_recv_wr_id(offset); + wr.next = NULL; + wr.sg_list = sge; + wr.num_sge = 2; + + return rdma_seterrno(ibv_post_recv(qp->cm_id->qp, &wr, &bad)); +} + +static int rs_create_ep(struct rsocket *rs) +{ + struct ibv_qp_init_attr qp_attr; + int i, ret; + + rs_set_qp_size(rs); + if (rs->cm_id->verbs->device->transport_type == IBV_TRANSPORT_IWARP) + rs->opts |= RS_OPT_MSG_SEND; + ret = rs_init_bufs(rs); + if (ret) + return ret; + + ret = rs_create_cq(rs, rs->cm_id); + if (ret) + return ret; + + memset(&qp_attr, 0, sizeof qp_attr); + qp_attr.qp_context = rs; + qp_attr.send_cq = rs->cm_id->send_cq; + qp_attr.recv_cq = rs->cm_id->recv_cq; + qp_attr.qp_type = IBV_QPT_RC; + qp_attr.sq_sig_all = 1; + qp_attr.cap.max_send_wr = rs->sq_size; + qp_attr.cap.max_recv_wr = rs->rq_size; + qp_attr.cap.max_send_sge = 2; + qp_attr.cap.max_recv_sge = 1; + qp_attr.cap.max_inline_data = rs->sq_inline; + + ret = rdma_create_qp(rs->cm_id, NULL, &qp_attr); + if (ret) + return ret; + + for (i = 0; i < rs->rq_size; i++) { + ret = rs_post_recv(rs); + if (ret) + return ret; + } + return 0; +} + +static void rs_release_iomap_mr(struct rs_iomap_mr *iomr) +{ + if (atomic_dec(&iomr->refcnt)) + return; + + dlist_remove(&iomr->entry); + ibv_dereg_mr(iomr->mr); + if (iomr->index >= 0) + iomr->mr = NULL; + else + free(iomr); +} + +static void rs_free_iomappings(struct rsocket *rs) +{ + struct rs_iomap_mr *iomr; + + while (!dlist_empty(&rs->iomap_list)) { + iomr = container_of(rs->iomap_list.next, + struct rs_iomap_mr, entry); + riounmap(rs->index, iomr->mr->addr, iomr->mr->length); + } + while (!dlist_empty(&rs->iomap_queue)) { + iomr = container_of(rs->iomap_queue.next, + struct rs_iomap_mr, entry); + riounmap(rs->index, 
iomr->mr->addr, iomr->mr->length); + } +} + +static void ds_free_qp(struct ds_qp *qp) +{ + if (qp->smr) + rdma_dereg_mr(qp->smr); + + if (qp->rbuf) { + if (qp->rmr) + rdma_dereg_mr(qp->rmr); + free(qp->rbuf); + } + + if (qp->cm_id) { + if (qp->cm_id->qp) { + tdelete(&qp->dest.addr, &qp->rs->dest_map, ds_compare_addr); + epoll_ctl(qp->rs->epfd, EPOLL_CTL_DEL, + qp->cm_id->recv_cq_channel->fd, NULL); + rdma_destroy_qp(qp->cm_id); + } + rdma_destroy_id(qp->cm_id); + } + + free(qp); +} + +static void ds_free(struct rsocket *rs) +{ + struct ds_qp *qp; + + if (rs->udp_sock >= 0) + close(rs->udp_sock); + + if (rs->index >= 0) + rs_remove(rs); + + if (rs->dmsg) + free(rs->dmsg); + + while ((qp = rs->qp_list)) { + ds_remove_qp(rs, qp); + ds_free_qp(qp); + } + + if (rs->epfd >= 0) + close(rs->epfd); + + if (rs->sbuf) + free(rs->sbuf); + + tdestroy(rs->dest_map, free); + fastlock_destroy(&rs->map_lock); + fastlock_destroy(&rs->cq_wait_lock); + fastlock_destroy(&rs->cq_lock); + fastlock_destroy(&rs->rlock); + fastlock_destroy(&rs->slock); + free(rs); +} + +static void rs_free(struct rsocket *rs) +{ + if (rs->type == SOCK_DGRAM) { + ds_free(rs); + return; + } + + if (rs->index >= 0) + rs_remove(rs); + + if (rs->rmsg) + free(rs->rmsg); + + if (rs->sbuf) { + if (rs->smr) + rdma_dereg_mr(rs->smr); + free(rs->sbuf); + } + + if (rs->rbuf) { + if (rs->rmr) + rdma_dereg_mr(rs->rmr); + free(rs->rbuf); + } + + if (rs->target_buffer_list) { + if (rs->target_mr) + rdma_dereg_mr(rs->target_mr); + free(rs->target_buffer_list); + } + + if (rs->cm_id) { + rs_free_iomappings(rs); + if (rs->cm_id->qp) + rdma_destroy_qp(rs->cm_id); + rdma_destroy_id(rs->cm_id); + } + + fastlock_destroy(&rs->map_lock); + fastlock_destroy(&rs->cq_wait_lock); + fastlock_destroy(&rs->cq_lock); + fastlock_destroy(&rs->rlock); + fastlock_destroy(&rs->slock); + free(rs); +} + +static size_t rs_conn_data_offset(struct rsocket *rs) +{ + return (rs->cm_id->route.addr.src_addr.sa_family == AF_IB) ? + sizeof(struct ib_connect_hdr) : 0; +} + +static void rs_format_conn_data(struct rsocket *rs, struct rs_conn_data *conn) +{ + conn->version = 1; + conn->flags = RS_CONN_FLAG_IOMAP | + (rs_host_is_net() ? 
RS_CONN_FLAG_NET : 0); + conn->credits = htons(rs->rq_size); + memset(conn->reserved, 0, sizeof conn->reserved); + conn->target_iomap_size = (uint8_t) rs_value_to_scale(rs->target_iomap_size, 8); + + conn->target_sgl.addr = htonll((uintptr_t) rs->target_sgl); + conn->target_sgl.length = htonl(RS_SGL_SIZE); + conn->target_sgl.key = htonl(rs->target_mr->rkey); + + conn->data_buf.addr = htonll((uintptr_t) rs->rbuf); + conn->data_buf.length = htonl(rs->rbuf_size >> 1); + conn->data_buf.key = htonl(rs->rmr->rkey); +} + +static void rs_save_conn_data(struct rsocket *rs, struct rs_conn_data *conn) +{ + rs->remote_sgl.addr = ntohll(conn->target_sgl.addr); + rs->remote_sgl.length = ntohl(conn->target_sgl.length); + rs->remote_sgl.key = ntohl(conn->target_sgl.key); + rs->remote_sge = 1; + if ((rs_host_is_net() && !(conn->flags & RS_CONN_FLAG_NET)) || + (!rs_host_is_net() && (conn->flags & RS_CONN_FLAG_NET))) + rs->opts = RS_OPT_SWAP_SGL; + + if (conn->flags & RS_CONN_FLAG_IOMAP) { + rs->remote_iomap.addr = rs->remote_sgl.addr + + sizeof(rs->remote_sgl) * rs->remote_sgl.length; + rs->remote_iomap.length = rs_scale_to_value(conn->target_iomap_size, 8); + rs->remote_iomap.key = rs->remote_sgl.key; + } + + rs->target_sgl[0].addr = ntohll(conn->data_buf.addr); + rs->target_sgl[0].length = ntohl(conn->data_buf.length); + rs->target_sgl[0].key = ntohl(conn->data_buf.key); + + rs->sseq_comp = ntohs(conn->credits); +} + +static int ds_init(struct rsocket *rs, int domain) +{ + rs->udp_sock = socket(domain, SOCK_DGRAM, 0); + if (rs->udp_sock < 0) + return rs->udp_sock; + + rs->epfd = epoll_create(2); + if (rs->epfd < 0) + return rs->epfd; + + return 0; +} + +static int ds_init_ep(struct rsocket *rs) +{ + struct ds_smsg *msg; + int i, ret; + + ds_set_qp_size(rs); + + rs->sbuf = calloc(rs->sq_size, RS_SNDLOWAT); + if (!rs->sbuf) + return ERR(ENOMEM); + + rs->dmsg = calloc(rs->rq_size + 1, sizeof(*rs->dmsg)); + if (!rs->dmsg) + return ERR(ENOMEM); + + rs->sqe_avail = rs->sq_size; + rs->rqe_avail = rs->rq_size; + + rs->smsg_free = (struct ds_smsg *) rs->sbuf; + msg = rs->smsg_free; + for (i = 0; i < rs->sq_size - 1; i++) { + msg->next = (void *) msg + RS_SNDLOWAT; + msg = msg->next; + } + msg->next = NULL; + + ret = rs_modify_svcs(rs, RS_SVC_DGRAM); + if (ret) + return ret; + + rs->state = rs_readable | rs_writable; + return 0; +} + +int rsocket(int domain, int type, int protocol) +{ + struct rsocket *rs; + int index, ret; + + if ((domain != AF_INET && domain != AF_INET6 && domain != AF_IB) || + ((type != SOCK_STREAM) && (type != SOCK_DGRAM)) || + (type == SOCK_STREAM && protocol && protocol != IPPROTO_TCP) || + (type == SOCK_DGRAM && protocol && protocol != IPPROTO_UDP)) + return ERR(ENOTSUP); + + rs_configure(); + rs = rs_alloc(NULL, type); + if (!rs) + return ERR(ENOMEM); + + if (type == SOCK_STREAM) { + ret = rdma_create_id(NULL, &rs->cm_id, rs, RDMA_PS_TCP); + if (ret) + goto err; + + rs->cm_id->route.addr.src_addr.sa_family = domain; + index = rs->cm_id->channel->fd; + } else { + ret = ds_init(rs, domain); + if (ret) + goto err; + + index = rs->udp_sock; + } + + ret = rs_insert(rs, index); + if (ret < 0) + goto err; + + return rs->index; + +err: + rs_free(rs); + return ret; +} + +int rbind(int socket, const struct sockaddr *addr, socklen_t addrlen) +{ + struct rsocket *rs; + int ret; + + rs = idm_at(&idm, socket); + if (rs->type == SOCK_STREAM) { + ret = rdma_bind_addr(rs->cm_id, (struct sockaddr *) addr); + if (!ret) + rs->state = rs_bound; + } else { + if (rs->state == rs_init) { + ret = ds_init_ep(rs); + 
if (ret) + return ret; + } + ret = bind(rs->udp_sock, addr, addrlen); + } + return ret; +} + +int rlisten(int socket, int backlog) +{ + struct rsocket *rs; + int ret; + + rs = idm_at(&idm, socket); + ret = rdma_listen(rs->cm_id, backlog); + if (!ret) + rs->state = rs_listening; + return ret; +} + +/* + * Nonblocking is usually not inherited between sockets, but we need to + * inherit it here to establish the connection only. This is needed to + * prevent rdma_accept from blocking until the remote side finishes + * establishing the connection. If we were to allow rdma_accept to block, + * then a single thread cannot establish a connection with itself, or + * two threads which try to connect to each other can deadlock trying to + * form a connection. + * + * Data transfers on the new socket remain blocking unless the user + * specifies otherwise through rfcntl. + */ +int raccept(int socket, struct sockaddr *addr, socklen_t *addrlen) +{ + struct rsocket *rs, *new_rs; + struct rdma_conn_param param; + struct rs_conn_data *creq, cresp; + int ret; + + rs = idm_at(&idm, socket); + new_rs = rs_alloc(rs, rs->type); + if (!new_rs) + return ERR(ENOMEM); + + ret = rdma_get_request(rs->cm_id, &new_rs->cm_id); + if (ret) + goto err; + + ret = rs_insert(new_rs, new_rs->cm_id->channel->fd); + if (ret < 0) + goto err; + + creq = (struct rs_conn_data *) + (new_rs->cm_id->event->param.conn.private_data + rs_conn_data_offset(rs)); + if (creq->version != 1) { + ret = ERR(ENOTSUP); + goto err; + } + + if (rs->fd_flags & O_NONBLOCK) + fcntl(new_rs->cm_id->channel->fd, F_SETFL, O_NONBLOCK); + + ret = rs_create_ep(new_rs); + if (ret) + goto err; + + rs_save_conn_data(new_rs, creq); + param = new_rs->cm_id->event->param.conn; + rs_format_conn_data(new_rs, &cresp); + param.private_data = &cresp; + param.private_data_len = sizeof cresp; + ret = rdma_accept(new_rs->cm_id, &param); + if (!ret) + new_rs->state = rs_connect_rdwr; + else if (errno == EAGAIN || errno == EWOULDBLOCK) + new_rs->state = rs_accepting; + else + goto err; + + if (addr && addrlen) + rgetpeername(new_rs->index, addr, addrlen); + return new_rs->index; + +err: + rs_free(new_rs); + return ret; +} + +static int rs_do_connect(struct rsocket *rs) +{ + struct rdma_conn_param param; + struct rs_conn_private_data cdata; + struct rs_conn_data *creq, *cresp; + int to, ret; + + switch (rs->state) { + case rs_init: + case rs_bound: +resolve_addr: + to = 1000 << rs->retries++; + ret = rdma_resolve_addr(rs->cm_id, NULL, + &rs->cm_id->route.addr.dst_addr, to); + if (!ret) + goto resolve_route; + if (errno == EAGAIN || errno == EWOULDBLOCK) + rs->state = rs_resolving_addr; + break; + case rs_resolving_addr: + ret = ucma_complete(rs->cm_id); + if (ret) { + if (errno == ETIMEDOUT && rs->retries <= RS_CONN_RETRIES) + goto resolve_addr; + break; + } + + rs->retries = 0; +resolve_route: + to = 1000 << rs->retries++; + if (rs->optval) { + ret = rdma_set_option(rs->cm_id, RDMA_OPTION_IB, + RDMA_OPTION_IB_PATH, rs->optval, + rs->optlen); + free(rs->optval); + rs->optval = NULL; + if (!ret) { + rs->state = rs_resolving_route; + goto resolving_route; + } + } else { + ret = rdma_resolve_route(rs->cm_id, to); + if (!ret) + goto do_connect; + } + if (errno == EAGAIN || errno == EWOULDBLOCK) + rs->state = rs_resolving_route; + break; + case rs_resolving_route: +resolving_route: + ret = ucma_complete(rs->cm_id); + if (ret) { + if (errno == ETIMEDOUT && rs->retries <= RS_CONN_RETRIES) + goto resolve_route; + break; + } +do_connect: + ret = rs_create_ep(rs); + if (ret) + break; +
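+ /* Build the connection request: rs_format_conn_data() advertises this
+ * side's receive buffer, target SGL and initial credits; the result is
+ * carried to the passive side as CM private data on rdma_connect(). */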
memset(&param, 0, sizeof param); + creq = (void *) &cdata + rs_conn_data_offset(rs); + rs_format_conn_data(rs, creq); + param.private_data = (void *) creq - rs_conn_data_offset(rs); + param.private_data_len = sizeof(*creq) + rs_conn_data_offset(rs); + param.flow_control = 1; + param.retry_count = 7; + param.rnr_retry_count = 7; + /* work-around: iWarp issues RDMA read during connection */ + if (rs->opts & RS_OPT_MSG_SEND) + param.initiator_depth = 1; + rs->retries = 0; + + ret = rdma_connect(rs->cm_id, &param); + if (!ret) + goto connected; + if (errno == EAGAIN || errno == EWOULDBLOCK) + rs->state = rs_connecting; + break; + case rs_connecting: + ret = ucma_complete(rs->cm_id); + if (ret) + break; +connected: + cresp = (struct rs_conn_data *) rs->cm_id->event->param.conn.private_data; + if (cresp->version != 1) { + ret = ERR(ENOTSUP); + break; + } + + rs_save_conn_data(rs, cresp); + rs->state = rs_connect_rdwr; + break; + case rs_accepting: + if (!(rs->fd_flags & O_NONBLOCK)) + fcntl(rs->cm_id->channel->fd, F_SETFL, 0); + + ret = ucma_complete(rs->cm_id); + if (ret) + break; + + rs->state = rs_connect_rdwr; + break; + default: + ret = ERR(EINVAL); + break; + } + + if (ret) { + if (errno == EAGAIN || errno == EWOULDBLOCK) { + errno = EINPROGRESS; + } else { + rs->state = rs_connect_error; + rs->err = errno; + } + } + return ret; +} + +static int rs_any_addr(const union socket_addr *addr) +{ + if (addr->sa.sa_family == AF_INET) { + return (addr->sin.sin_addr.s_addr == INADDR_ANY || + addr->sin.sin_addr.s_addr == INADDR_LOOPBACK); + } else { + return (!memcmp(&addr->sin6.sin6_addr, &in6addr_any, 16) || + !memcmp(&addr->sin6.sin6_addr, &in6addr_loopback, 16)); + } +} + +static int ds_get_src_addr(struct rsocket *rs, + const struct sockaddr *dest_addr, socklen_t dest_len, + union socket_addr *src_addr, socklen_t *src_len) +{ + int sock, ret; + uint16_t port; + + *src_len = sizeof *src_addr; + ret = getsockname(rs->udp_sock, &src_addr->sa, src_len); + if (ret || !rs_any_addr(src_addr)) + return ret; + + port = src_addr->sin.sin_port; + sock = socket(dest_addr->sa_family, SOCK_DGRAM, 0); + if (sock < 0) + return sock; + + ret = connect(sock, dest_addr, dest_len); + if (ret) + goto out; + + *src_len = sizeof *src_addr; + ret = getsockname(sock, &src_addr->sa, src_len); + src_addr->sin.sin_port = port; +out: + close(sock); + return ret; +} + +static void ds_format_hdr(struct ds_header *hdr, union socket_addr *addr) +{ + if (addr->sa.sa_family == AF_INET) { + hdr->version = 4; + hdr->length = DS_IPV4_HDR_LEN; + hdr->port = addr->sin.sin_port; + hdr->addr.ipv4 = addr->sin.sin_addr.s_addr; + } else { + hdr->version = 6; + hdr->length = DS_IPV6_HDR_LEN; + hdr->port = addr->sin6.sin6_port; + hdr->addr.ipv6.flowinfo= addr->sin6.sin6_flowinfo; + memcpy(&hdr->addr.ipv6.addr, &addr->sin6.sin6_addr, 16); + } +} + +static int ds_add_qp_dest(struct ds_qp *qp, union socket_addr *addr, + socklen_t addrlen) +{ + struct ibv_port_attr port_attr; + struct ibv_ah_attr attr; + int ret; + + memcpy(&qp->dest.addr, addr, addrlen); + qp->dest.qp = qp; + qp->dest.qpn = qp->cm_id->qp->qp_num; + + ret = ibv_query_port(qp->cm_id->verbs, qp->cm_id->port_num, &port_attr); + if (ret) + return ret; + + memset(&attr, 0, sizeof attr); + attr.dlid = port_attr.lid; + attr.port_num = qp->cm_id->port_num; + qp->dest.ah = ibv_create_ah(qp->cm_id->pd, &attr); + if (!qp->dest.ah) + return ERR(ENOMEM); + + tsearch(&qp->dest.addr, &qp->rs->dest_map, ds_compare_addr); + return 0; +} + +static int ds_create_qp(struct rsocket *rs, union socket_addr
*src_addr, + socklen_t addrlen, struct ds_qp **new_qp) +{ + struct ds_qp *qp; + struct ibv_qp_init_attr qp_attr; + struct epoll_event event; + int i, ret; + + qp = calloc(1, sizeof(*qp)); + if (!qp) + return ERR(ENOMEM); + + qp->rs = rs; + ret = rdma_create_id(NULL, &qp->cm_id, qp, RDMA_PS_UDP); + if (ret) + goto err; + + ds_format_hdr(&qp->hdr, src_addr); + ret = rdma_bind_addr(qp->cm_id, &src_addr->sa); + if (ret) + goto err; + + ret = ds_init_bufs(qp); + if (ret) + goto err; + + ret = rs_create_cq(rs, qp->cm_id); + if (ret) + goto err; + + memset(&qp_attr, 0, sizeof qp_attr); + qp_attr.qp_context = qp; + qp_attr.send_cq = qp->cm_id->send_cq; + qp_attr.recv_cq = qp->cm_id->recv_cq; + qp_attr.qp_type = IBV_QPT_UD; + qp_attr.sq_sig_all = 1; + qp_attr.cap.max_send_wr = rs->sq_size; + qp_attr.cap.max_recv_wr = rs->rq_size; + qp_attr.cap.max_send_sge = 1; + qp_attr.cap.max_recv_sge = 2; + qp_attr.cap.max_inline_data = rs->sq_inline; + ret = rdma_create_qp(qp->cm_id, NULL, &qp_attr); + if (ret) + goto err; + + ret = ds_add_qp_dest(qp, src_addr, addrlen); + if (ret) + goto err; + + event.events = EPOLLIN; + event.data.ptr = qp; + ret = epoll_ctl(rs->epfd, EPOLL_CTL_ADD, + qp->cm_id->recv_cq_channel->fd, &event); + if (ret) + goto err; + + for (i = 0; i < rs->rq_size; i++) { + ret = ds_post_recv(rs, qp, i * RS_SNDLOWAT); + if (ret) + goto err; + } + + ds_insert_qp(rs, qp); + *new_qp = qp; + return 0; +err: + ds_free_qp(qp); + return ret; +} + +static int ds_get_qp(struct rsocket *rs, union socket_addr *src_addr, + socklen_t addrlen, struct ds_qp **qp) +{ + if (rs->qp_list) { + *qp = rs->qp_list; + do { + if (!ds_compare_addr(rdma_get_local_addr((*qp)->cm_id), + src_addr)) + return 0; + + *qp = ds_next_qp(*qp); + } while (*qp != rs->qp_list); + } + + return ds_create_qp(rs, src_addr, addrlen, qp); +} + +static int ds_get_dest(struct rsocket *rs, const struct sockaddr *addr, + socklen_t addrlen, struct ds_dest **dest) +{ + union socket_addr src_addr; + socklen_t src_len; + struct ds_qp *qp; + struct ds_dest **tdest, *new_dest; + int ret = 0; + + fastlock_acquire(&rs->map_lock); + tdest = tfind(addr, &rs->dest_map, ds_compare_addr); + if (tdest) + goto found; + + ret = ds_get_src_addr(rs, addr, addrlen, &src_addr, &src_len); + if (ret) + goto out; + + ret = ds_get_qp(rs, &src_addr, src_len, &qp); + if (ret) + goto out; + + tdest = tfind(addr, &rs->dest_map, ds_compare_addr); + if (!tdest) { + new_dest = calloc(1, sizeof(*new_dest)); + if (!new_dest) { + ret = ERR(ENOMEM); + goto out; + } + + memcpy(&new_dest->addr, addr, addrlen); + new_dest->qp = qp; + tdest = tsearch(&new_dest->addr, &rs->dest_map, ds_compare_addr); + } + +found: + *dest = *tdest; +out: + fastlock_release(&rs->map_lock); + return ret; +} + +int rconnect(int socket, const struct sockaddr *addr, socklen_t addrlen) +{ + struct rsocket *rs; + int ret; + + rs = idm_at(&idm, socket); + if (rs->type == SOCK_STREAM) { + memcpy(&rs->cm_id->route.addr.dst_addr, addr, addrlen); + ret = rs_do_connect(rs); + } else { + if (rs->state == rs_init) { + ret = ds_init_ep(rs); + if (ret) + return ret; + } + + fastlock_acquire(&rs->slock); + ret = connect(rs->udp_sock, addr, addrlen); + if (!ret) + ret = ds_get_dest(rs, addr, addrlen, &rs->conn_dest); + fastlock_release(&rs->slock); + } + return ret; +} + +static int rs_post_msg(struct rsocket *rs, uint32_t msg) +{ + struct ibv_send_wr wr, *bad; + struct ibv_sge sge; + + wr.wr_id = rs_send_wr_id(msg); + wr.next = NULL; + if (!(rs->opts & RS_OPT_MSG_SEND)) { + wr.sg_list = NULL; + wr.num_sge = 0; + 
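+ /* Zero-length RDMA write: no SGE is attached, the message itself is
+ * carried in the 32-bit immediate data (rs_msg_set() format above). */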
wr.opcode = IBV_WR_RDMA_WRITE_WITH_IMM; + wr.send_flags = 0; + wr.imm_data = htonl(msg); + } else { + sge.addr = (uintptr_t) &msg; + sge.lkey = 0; + sge.length = sizeof msg; + wr.sg_list = &sge; + wr.num_sge = 1; + wr.opcode = IBV_WR_SEND; + wr.send_flags = IBV_SEND_INLINE; + } + + return rdma_seterrno(ibv_post_send(rs->cm_id->qp, &wr, &bad)); +} + +static int rs_post_write(struct rsocket *rs, + struct ibv_sge *sgl, int nsge, + uint32_t wr_data, int flags, + uint64_t addr, uint32_t rkey) +{ + struct ibv_send_wr wr, *bad; + + wr.wr_id = rs_send_wr_id(wr_data); + wr.next = NULL; + wr.sg_list = sgl; + wr.num_sge = nsge; + wr.opcode = IBV_WR_RDMA_WRITE; + wr.send_flags = flags; + wr.wr.rdma.remote_addr = addr; + wr.wr.rdma.rkey = rkey; + + return rdma_seterrno(ibv_post_send(rs->cm_id->qp, &wr, &bad)); +} + +static int rs_post_write_msg(struct rsocket *rs, + struct ibv_sge *sgl, int nsge, + uint32_t msg, int flags, + uint64_t addr, uint32_t rkey) +{ + struct ibv_send_wr wr, *bad; + int ret; + + if (!(rs->opts & RS_OPT_MSG_SEND)) { + wr.wr_id = rs_send_wr_id(msg); + wr.next = NULL; + wr.sg_list = sgl; + wr.num_sge = nsge; + wr.opcode = IBV_WR_RDMA_WRITE_WITH_IMM; + wr.send_flags = flags; + wr.imm_data = htonl(msg); + wr.wr.rdma.remote_addr = addr; + wr.wr.rdma.rkey = rkey; + + return rdma_seterrno(ibv_post_send(rs->cm_id->qp, &wr, &bad)); + } else { + ret = rs_post_write(rs, sgl, nsge, msg, flags, addr, rkey); + if (!ret) + ret = rs_post_msg(rs, msg); + return ret; + } +} + +static int ds_post_send(struct rsocket *rs, struct ibv_sge *sge, + uint32_t wr_data) +{ + struct ibv_send_wr wr, *bad; + + wr.wr_id = rs_send_wr_id(wr_data); + wr.next = NULL; + wr.sg_list = sge; + wr.num_sge = 1; + wr.opcode = IBV_WR_SEND; + wr.send_flags = (sge->length <= rs->sq_inline) ? IBV_SEND_INLINE : 0; + wr.wr.ud.ah = rs->conn_dest->ah; + wr.wr.ud.remote_qpn = rs->conn_dest->qpn; + wr.wr.ud.remote_qkey = RDMA_UDP_QKEY; + + return rdma_seterrno(ibv_post_send(rs->conn_dest->qp->cm_id->qp, &wr, &bad)); +} + +/* + * Update target SGE before sending data. Otherwise the remote side may + * update the entry before we do. 
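+ * (The peer refills target_sgl entries by RDMA-writing fresh SGEs into
+ * this array, just as rs_send_credits() does toward the peer, so the
+ * local addr/length bookkeeping below must finish before the write is
+ * posted.)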
+ */ +static int rs_write_data(struct rsocket *rs, + struct ibv_sge *sgl, int nsge, + uint32_t length, int flags) +{ + uint64_t addr; + uint32_t rkey; + + rs->sseq_no++; + rs->sqe_avail--; + if (rs->opts & RS_OPT_MSG_SEND) + rs->sqe_avail--; + rs->sbuf_bytes_avail -= length; + + addr = rs->target_sgl[rs->target_sge].addr; + rkey = rs->target_sgl[rs->target_sge].key; + + rs->target_sgl[rs->target_sge].addr += length; + rs->target_sgl[rs->target_sge].length -= length; + + if (!rs->target_sgl[rs->target_sge].length) { + if (++rs->target_sge == RS_SGL_SIZE) + rs->target_sge = 0; + } + + return rs_post_write_msg(rs, sgl, nsge, rs_msg_set(RS_OP_DATA, length), + flags, addr, rkey); +} + +static int rs_write_direct(struct rsocket *rs, struct rs_iomap *iom, uint64_t offset, + struct ibv_sge *sgl, int nsge, uint32_t length, int flags) +{ + uint64_t addr; + + rs->sqe_avail--; + rs->sbuf_bytes_avail -= length; + + addr = iom->sge.addr + offset - iom->offset; + return rs_post_write(rs, sgl, nsge, rs_msg_set(RS_OP_WRITE, length), + flags, addr, iom->sge.key); +} + +static int rs_write_iomap(struct rsocket *rs, struct rs_iomap_mr *iomr, + struct ibv_sge *sgl, int nsge, int flags) +{ + uint64_t addr; + + rs->sseq_no++; + rs->sqe_avail--; + if (rs->opts & RS_OPT_MSG_SEND) + rs->sqe_avail--; + rs->sbuf_bytes_avail -= sizeof(struct rs_iomap); + + addr = rs->remote_iomap.addr + iomr->index * sizeof(struct rs_iomap); + return rs_post_write_msg(rs, sgl, nsge, rs_msg_set(RS_OP_IOMAP_SGL, iomr->index), + flags, addr, rs->remote_iomap.key); +} + +static uint32_t rs_sbuf_left(struct rsocket *rs) +{ + return (uint32_t) (((uint64_t) (uintptr_t) &rs->sbuf[rs->sbuf_size]) - + rs->ssgl[0].addr); +} + +static void rs_send_credits(struct rsocket *rs) +{ + struct ibv_sge ibsge; + struct rs_sge sge; + + rs->ctrl_avail--; + rs->rseq_comp = rs->rseq_no + (rs->rq_size >> 1); + if (rs->rbuf_bytes_avail >= (rs->rbuf_size >> 1)) { + if (rs->opts & RS_OPT_MSG_SEND) + rs->ctrl_avail--; + + if (!(rs->opts & RS_OPT_SWAP_SGL)) { + sge.addr = (uintptr_t) &rs->rbuf[rs->rbuf_free_offset]; + sge.key = rs->rmr->rkey; + sge.length = rs->rbuf_size >> 1; + } else { + sge.addr = bswap_64((uintptr_t) &rs->rbuf[rs->rbuf_free_offset]); + sge.key = bswap_32(rs->rmr->rkey); + sge.length = bswap_32(rs->rbuf_size >> 1); + } + + ibsge.addr = (uintptr_t) &sge; + ibsge.lkey = 0; + ibsge.length = sizeof(sge); + + rs_post_write_msg(rs, &ibsge, 1, + rs_msg_set(RS_OP_SGL, rs->rseq_no + rs->rq_size), + IBV_SEND_INLINE, + rs->remote_sgl.addr + + rs->remote_sge * sizeof(struct rs_sge), + rs->remote_sgl.key); + + rs->rbuf_bytes_avail -= rs->rbuf_size >> 1; + rs->rbuf_free_offset += rs->rbuf_size >> 1; + if (rs->rbuf_free_offset >= rs->rbuf_size) + rs->rbuf_free_offset = 0; + if (++rs->remote_sge == rs->remote_sgl.length) + rs->remote_sge = 0; + } else { + rs_post_msg(rs, rs_msg_set(RS_OP_SGL, rs->rseq_no + rs->rq_size)); + } +} + +static int rs_give_credits(struct rsocket *rs) +{ + if (!(rs->opts & RS_OPT_MSG_SEND)) { + return ((rs->rbuf_bytes_avail >= (rs->rbuf_size >> 1)) || + ((short) ((short) rs->rseq_no - (short) rs->rseq_comp) >= 0)) && + rs->ctrl_avail && (rs->state & rs_connected); + } else { + return ((rs->rbuf_bytes_avail >= (rs->rbuf_size >> 1)) || + ((short) ((short) rs->rseq_no - (short) rs->rseq_comp) >= 0)) && + (rs->ctrl_avail > 1) && (rs->state & rs_connected); + } +} + +static void rs_update_credits(struct rsocket *rs) +{ + if (rs_give_credits(rs)) + rs_send_credits(rs); +} + +static int rs_poll_cq(struct rsocket *rs) +{ + struct ibv_wc wc; + 
uint32_t msg; + int ret, rcnt = 0; + + while ((ret = ibv_poll_cq(rs->cm_id->recv_cq, 1, &wc)) > 0) { + if (rs_wr_is_recv(wc.wr_id)) { + if (wc.status != IBV_WC_SUCCESS) + continue; + rcnt++; + + if (wc.wc_flags & IBV_WC_WITH_IMM) { + msg = ntohl(wc.imm_data); + } else { + msg = ((uint32_t *) (rs->rbuf + rs->rbuf_size)) + [rs_wr_data(wc.wr_id)]; + + } + switch (rs_msg_op(msg)) { + case RS_OP_SGL: + rs->sseq_comp = (uint16_t) rs_msg_data(msg); + break; + case RS_OP_IOMAP_SGL: + /* The iomap was updated, that's nice to know. */ + break; + case RS_OP_CTRL: + if (rs_msg_data(msg) == RS_CTRL_DISCONNECT) { + rs->state = rs_disconnected; + return 0; + } else if (rs_msg_data(msg) == RS_CTRL_SHUTDOWN) { + if (rs->state & rs_writable) { + rs->state &= ~rs_readable; + } else { + rs->state = rs_disconnected; + return 0; + } + } + break; + case RS_OP_WRITE: + /* We really shouldn't be here. */ + break; + default: + rs->rmsg[rs->rmsg_tail].op = rs_msg_op(msg); + rs->rmsg[rs->rmsg_tail].data = rs_msg_data(msg); + if (++rs->rmsg_tail == rs->rq_size + 1) + rs->rmsg_tail = 0; + break; + } + } else { + switch (rs_msg_op(rs_wr_data(wc.wr_id))) { + case RS_OP_SGL: + rs->ctrl_avail++; + break; + case RS_OP_CTRL: + rs->ctrl_avail++; + if (rs_msg_data(rs_wr_data(wc.wr_id)) == RS_CTRL_DISCONNECT) + rs->state = rs_disconnected; + break; + case RS_OP_IOMAP_SGL: + rs->sqe_avail++; + rs->sbuf_bytes_avail += sizeof(struct rs_iomap); + break; + default: + rs->sqe_avail++; + rs->sbuf_bytes_avail += rs_msg_data(rs_wr_data(wc.wr_id)); + break; + } + if (wc.status != IBV_WC_SUCCESS && (rs->state & rs_connected)) { + rs->state = rs_error; + rs->err = EIO; + } + } + } + + if (rs->state & rs_connected) { + while (!ret && rcnt--) + ret = rs_post_recv(rs); + + if (ret) { + rs->state = rs_error; + rs->err = errno; + } + } + return ret; +} + +static int rs_get_cq_event(struct rsocket *rs) +{ + struct ibv_cq *cq; + void *context; + int ret; + + if (!rs->cq_armed) + return 0; + + ret = ibv_get_cq_event(rs->cm_id->recv_cq_channel, &cq, &context); + if (!ret) { + ibv_ack_cq_events(rs->cm_id->recv_cq, 1); + rs->cq_armed = 0; + } else if (errno != EAGAIN) { + rs->state = rs_error; + } + + return ret; +} + +/* + * Although we serialize rsend and rrecv calls with respect to themselves, + * both calls may run simultaneously and need to poll the CQ for completions. + * We need to serialize access to the CQ, but rsend and rrecv need to + * allow each other to make forward progress. + * + * For example, rsend may need to wait for credits from the remote side, + * which could be stalled until the remote process calls rrecv. This should + * not block rrecv from receiving data from the remote side however. + * + * We handle this by using two locks. The cq_lock protects against polling + * the CQ and processing completions. The cq_wait_lock serializes access to + * waiting on the CQ. 
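+ * The resulting sequence in rs_process_cq() below is: take cq_lock and
+ * poll; if blocking is needed, take cq_wait_lock, drop cq_lock so the
+ * other caller may poll, wait in rs_get_cq_event(), then drop
+ * cq_wait_lock and reacquire cq_lock.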
+ */ +static int rs_process_cq(struct rsocket *rs, int nonblock, int (*test)(struct rsocket *rs)) +{ + int ret; + + fastlock_acquire(&rs->cq_lock); + do { + rs_update_credits(rs); + ret = rs_poll_cq(rs); + if (test(rs)) { + ret = 0; + break; + } else if (ret) { + break; + } else if (nonblock) { + ret = ERR(EWOULDBLOCK); + } else if (!rs->cq_armed) { + ibv_req_notify_cq(rs->cm_id->recv_cq, 0); + rs->cq_armed = 1; + } else { + rs_update_credits(rs); + fastlock_acquire(&rs->cq_wait_lock); + fastlock_release(&rs->cq_lock); + + ret = rs_get_cq_event(rs); + fastlock_release(&rs->cq_wait_lock); + fastlock_acquire(&rs->cq_lock); + } + } while (!ret); + + rs_update_credits(rs); + fastlock_release(&rs->cq_lock); + return ret; +} + +static int rs_get_comp(struct rsocket *rs, int nonblock, int (*test)(struct rsocket *rs)) +{ + struct timeval s, e; + uint32_t poll_time = 0; + int ret; + + do { + ret = rs_process_cq(rs, 1, test); + if (!ret || nonblock || errno != EWOULDBLOCK) + return ret; + + if (!poll_time) + gettimeofday(&s, NULL); + + gettimeofday(&e, NULL); + poll_time = (e.tv_sec - s.tv_sec) * 1000000 + + (e.tv_usec - s.tv_usec) + 1; + } while (poll_time <= polling_time); + + ret = rs_process_cq(rs, 0, test); + return ret; +} + +static int ds_valid_recv(struct ds_qp *qp, struct ibv_wc *wc) +{ + struct ds_header *hdr; + + hdr = (struct ds_header *) (qp->rbuf + rs_wr_data(wc->wr_id)); + return ((wc->byte_len >= sizeof(struct ibv_grh) + DS_IPV4_HDR_LEN) && + ((hdr->version == 4 && hdr->length == DS_IPV4_HDR_LEN) || + (hdr->version == 6 && hdr->length == DS_IPV6_HDR_LEN))); +} + +/* + * Poll all CQs associated with a datagram rsocket. We need to drop any + * received messages that we do not have room to store. To limit drops, + * we only poll if we have room to store the receive or we need a send + * buffer. To ensure fairness, we poll the CQs round robin, remembering + * where we left off. 
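+ * (rs->qp_list marks where polling resumes; ds_poll_cqs() stores its
+ * current position back into rs->qp_list when it returns early because a
+ * send buffer freed up while no receive entries remain available.)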
+ */ +static void ds_poll_cqs(struct rsocket *rs) +{ + struct ds_qp *qp; + struct ds_smsg *smsg; + struct ds_rmsg *rmsg; + struct ibv_wc wc; + int ret, cnt; + + if (!(qp = rs->qp_list)) + return; + + do { + cnt = 0; + do { + ret = ibv_poll_cq(qp->cm_id->recv_cq, 1, &wc); + if (ret <= 0) { + qp = ds_next_qp(qp); + continue; + } + + if (rs_wr_is_recv(wc.wr_id)) { + if (rs->rqe_avail && wc.status == IBV_WC_SUCCESS && + ds_valid_recv(qp, &wc)) { + rs->rqe_avail--; + rmsg = &rs->dmsg[rs->rmsg_tail]; + rmsg->qp = qp; + rmsg->offset = rs_wr_data(wc.wr_id); + rmsg->length = wc.byte_len - sizeof(struct ibv_grh); + if (++rs->rmsg_tail == rs->rq_size + 1) + rs->rmsg_tail = 0; + } else { + ds_post_recv(rs, qp, rs_wr_data(wc.wr_id)); + } + } else { + smsg = (struct ds_smsg *) (rs->sbuf + rs_wr_data(wc.wr_id)); + smsg->next = rs->smsg_free; + rs->smsg_free = smsg; + rs->sqe_avail++; + } + + qp = ds_next_qp(qp); + if (!rs->rqe_avail && rs->sqe_avail) { + rs->qp_list = qp; + return; + } + cnt++; + } while (qp != rs->qp_list); + } while (cnt); +} + +static void ds_req_notify_cqs(struct rsocket *rs) +{ + struct ds_qp *qp; + + if (!(qp = rs->qp_list)) + return; + + do { + if (!qp->cq_armed) { + ibv_req_notify_cq(qp->cm_id->recv_cq, 0); + qp->cq_armed = 1; + } + qp = ds_next_qp(qp); + } while (qp != rs->qp_list); +} + +static int ds_get_cq_event(struct rsocket *rs) +{ + struct epoll_event event; + struct ds_qp *qp; + struct ibv_cq *cq; + void *context; + int ret; + + if (!rs->cq_armed) + return 0; + + ret = epoll_wait(rs->epfd, &event, 1, -1); + if (ret <= 0) + return ret; + + qp = event.data.ptr; + ret = ibv_get_cq_event(qp->cm_id->recv_cq_channel, &cq, &context); + if (!ret) { + ibv_ack_cq_events(qp->cm_id->recv_cq, 1); + qp->cq_armed = 0; + rs->cq_armed = 0; + } + + return ret; +} + +static int ds_process_cqs(struct rsocket *rs, int nonblock, int (*test)(struct rsocket *rs)) +{ + int ret = 0; + + fastlock_acquire(&rs->cq_lock); + do { + ds_poll_cqs(rs); + if (test(rs)) { + ret = 0; + break; + } else if (nonblock) { + ret = ERR(EWOULDBLOCK); + } else if (!rs->cq_armed) { + ds_req_notify_cqs(rs); + rs->cq_armed = 1; + } else { + fastlock_acquire(&rs->cq_wait_lock); + fastlock_release(&rs->cq_lock); + + ret = ds_get_cq_event(rs); + fastlock_release(&rs->cq_wait_lock); + fastlock_acquire(&rs->cq_lock); + } + } while (!ret); + + fastlock_release(&rs->cq_lock); + return ret; +} + +static int ds_get_comp(struct rsocket *rs, int nonblock, int (*test)(struct rsocket *rs)) +{ + struct timeval s, e; + uint32_t poll_time = 0; + int ret; + + do { + ret = ds_process_cqs(rs, 1, test); + if (!ret || nonblock || errno != EWOULDBLOCK) + return ret; + + if (!poll_time) + gettimeofday(&s, NULL); + + gettimeofday(&e, NULL); + poll_time = (e.tv_sec - s.tv_sec) * 1000000 + + (e.tv_usec - s.tv_usec) + 1; + } while (poll_time <= polling_time); + + ret = ds_process_cqs(rs, 0, test); + return ret; +} + +static int rs_nonblocking(struct rsocket *rs, int flags) +{ + return (rs->fd_flags & O_NONBLOCK) || (flags & MSG_DONTWAIT); +} + +static int rs_is_cq_armed(struct rsocket *rs) +{ + return rs->cq_armed; +} + +static int rs_poll_all(struct rsocket *rs) +{ + return 1; +} + +/* + * We use hardware flow control to prevent over running the remote + * receive queue. However, data transfers still require space in + * the remote rmsg queue, or we risk losing notification that data + * has been transfered. + * + * Be careful with race conditions in the check below. The target SGL + * may be updated by a remote RDMA write. 
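+ * (This is why target_sgl is declared volatile in struct rsocket: the
+ * peer refills these entries with an RDMA write, the same way
+ * rs_send_credits() updates the peer's copy.)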
+ */ +static int rs_can_send(struct rsocket *rs) +{ + if (!(rs->opts & RS_OPT_MSG_SEND)) { + return rs->sqe_avail && (rs->sbuf_bytes_avail >= RS_SNDLOWAT) && + (rs->sseq_no != rs->sseq_comp) && + (rs->target_sgl[rs->target_sge].length != 0); + } else { + return (rs->sqe_avail >= 2) && (rs->sbuf_bytes_avail >= RS_SNDLOWAT) && + (rs->sseq_no != rs->sseq_comp) && + (rs->target_sgl[rs->target_sge].length != 0); + } +} + +static int ds_can_send(struct rsocket *rs) +{ + return rs->sqe_avail; +} + +static int ds_all_sends_done(struct rsocket *rs) +{ + return rs->sqe_avail == rs->sq_size; +} + +static int rs_conn_can_send(struct rsocket *rs) +{ + return rs_can_send(rs) || !(rs->state & rs_writable); +} + +static int rs_conn_can_send_ctrl(struct rsocket *rs) +{ + return rs->ctrl_avail || !(rs->state & rs_connected); +} + +static int rs_have_rdata(struct rsocket *rs) +{ + return (rs->rmsg_head != rs->rmsg_tail); +} + +static int rs_conn_have_rdata(struct rsocket *rs) +{ + return rs_have_rdata(rs) || !(rs->state & rs_readable); +} + +static int rs_conn_all_sends_done(struct rsocket *rs) +{ + return ((rs->sqe_avail + rs->ctrl_avail) == rs->sq_size) || + !(rs->state & rs_connected); +} + +static void ds_set_src(struct sockaddr *addr, socklen_t *addrlen, + struct ds_header *hdr) +{ + union socket_addr sa; + + memset(&sa, 0, sizeof sa); + if (hdr->version == 4) { + if (*addrlen > sizeof(sa.sin)) + *addrlen = sizeof(sa.sin); + + sa.sin.sin_family = AF_INET; + sa.sin.sin_port = hdr->port; + sa.sin.sin_addr.s_addr = hdr->addr.ipv4; + } else { + if (*addrlen > sizeof(sa.sin6)) + *addrlen = sizeof(sa.sin6); + + sa.sin6.sin6_family = AF_INET6; + sa.sin6.sin6_port = hdr->port; + sa.sin6.sin6_flowinfo = hdr->addr.ipv6.flowinfo; + memcpy(&sa.sin6.sin6_addr, &hdr->addr.ipv6.addr, 16); + } + memcpy(addr, &sa, *addrlen); +} + +static ssize_t ds_recvfrom(struct rsocket *rs, void *buf, size_t len, int flags, + struct sockaddr *src_addr, socklen_t *addrlen) +{ + struct ds_rmsg *rmsg; + struct ds_header *hdr; + int ret; + + if (!(rs->state & rs_readable)) + return ERR(EINVAL); + + if (!rs_have_rdata(rs)) { + ret = ds_get_comp(rs, rs_nonblocking(rs, flags), + rs_have_rdata); + if (ret) + return ret; + } + + rmsg = &rs->dmsg[rs->rmsg_head]; + hdr = (struct ds_header *) (rmsg->qp->rbuf + rmsg->offset); + if (len > rmsg->length - hdr->length) + len = rmsg->length - hdr->length; + + memcpy(buf, (void *) hdr + hdr->length, len); + if (addrlen) + ds_set_src(src_addr, addrlen, hdr); + + if (!(flags & MSG_PEEK)) { + ds_post_recv(rs, rmsg->qp, rmsg->offset); + if (++rs->rmsg_head == rs->rq_size + 1) + rs->rmsg_head = 0; + rs->rqe_avail++; + } + + return len; +} + +static ssize_t rs_peek(struct rsocket *rs, void *buf, size_t len) +{ + size_t left = len; + uint32_t end_size, rsize; + int rmsg_head, rbuf_offset; + + rmsg_head = rs->rmsg_head; + rbuf_offset = rs->rbuf_offset; + + for (; left && (rmsg_head != rs->rmsg_tail); left -= rsize) { + if (left < rs->rmsg[rmsg_head].data) { + rsize = left; + } else { + rsize = rs->rmsg[rmsg_head].data; + if (++rmsg_head == rs->rq_size + 1) + rmsg_head = 0; + } + + end_size = rs->rbuf_size - rbuf_offset; + if (rsize > end_size) { + memcpy(buf, &rs->rbuf[rbuf_offset], end_size); + rbuf_offset = 0; + buf += end_size; + rsize -= end_size; + left -= end_size; + } + memcpy(buf, &rs->rbuf[rbuf_offset], rsize); + rbuf_offset += rsize; + buf += rsize; + } + + return len - left; +} + +/* + * Continue to receive any queued data even if the remote side has disconnected. 
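rs_peek() above and the copy loop in rrecv() below both pull data out of a circular receive buffer, splitting the memcpy when the requested region wraps past the end of the buffer. The same idea, reduced to a standalone helper with hypothetical names, looks like this:

#include <stdint.h>
#include <string.h>

/* Sketch only: copy len bytes out of a circular buffer of ring_size bytes
 * starting at offset, splitting the copy when it wraps around the end of
 * the buffer.  Returns the updated offset. */
static size_t ring_copy_out(void *dst, const uint8_t *ring, size_t ring_size,
                            size_t offset, size_t len)
{
        size_t end = ring_size - offset;

        if (len > end) {
                memcpy(dst, ring + offset, end);
                dst = (uint8_t *) dst + end;
                len -= end;
                offset = 0;
        }
        memcpy(dst, ring + offset, len);
        return offset + len;
}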
+ */ +ssize_t rrecv(int socket, void *buf, size_t len, int flags) +{ + struct rsocket *rs; + size_t left = len; + uint32_t end_size, rsize; + int ret; + + rs = idm_at(&idm, socket); + if (rs->type == SOCK_DGRAM) { + fastlock_acquire(&rs->rlock); + ret = ds_recvfrom(rs, buf, len, flags, NULL, 0); + fastlock_release(&rs->rlock); + return ret; + } + + if (rs->state & rs_opening) { + ret = rs_do_connect(rs); + if (ret) { + if (errno == EINPROGRESS) + errno = EAGAIN; + return ret; + } + } + fastlock_acquire(&rs->rlock); + do { + if (!rs_have_rdata(rs)) { + ret = rs_get_comp(rs, rs_nonblocking(rs, flags), + rs_conn_have_rdata); + if (ret) + break; + } + + ret = 0; + if (flags & MSG_PEEK) { + left = len - rs_peek(rs, buf, left); + break; + } + + for (; left && rs_have_rdata(rs); left -= rsize) { + if (left < rs->rmsg[rs->rmsg_head].data) { + rsize = left; + rs->rmsg[rs->rmsg_head].data -= left; + } else { + rs->rseq_no++; + rsize = rs->rmsg[rs->rmsg_head].data; + if (++rs->rmsg_head == rs->rq_size + 1) + rs->rmsg_head = 0; + } + + end_size = rs->rbuf_size - rs->rbuf_offset; + if (rsize > end_size) { + memcpy(buf, &rs->rbuf[rs->rbuf_offset], end_size); + rs->rbuf_offset = 0; + buf += end_size; + rsize -= end_size; + left -= end_size; + rs->rbuf_bytes_avail += end_size; + } + memcpy(buf, &rs->rbuf[rs->rbuf_offset], rsize); + rs->rbuf_offset += rsize; + buf += rsize; + rs->rbuf_bytes_avail += rsize; + } + + } while (left && (flags & MSG_WAITALL) && (rs->state & rs_readable)); + + fastlock_release(&rs->rlock); + return ret ? ret : len - left; +} + +ssize_t rrecvfrom(int socket, void *buf, size_t len, int flags, + struct sockaddr *src_addr, socklen_t *addrlen) +{ + struct rsocket *rs; + int ret; + + rs = idm_at(&idm, socket); + if (rs->type == SOCK_DGRAM) { + fastlock_acquire(&rs->rlock); + ret = ds_recvfrom(rs, buf, len, flags, src_addr, addrlen); + fastlock_release(&rs->rlock); + return ret; + } + + ret = rrecv(socket, buf, len, flags); + if (ret > 0 && src_addr) + rgetpeername(socket, src_addr, addrlen); + + return ret; +} + +/* + * Simple, straightforward implementation for now that only tries to fill + * in the first vector. 
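A short usage sketch for the receive entry points defined above: rrecv() behaves like recv(2), so a caller that only wants whatever is already buffered can pass MSG_DONTWAIT and treat EWOULDBLOCK/EAGAIN as "nothing queued right now". The caller below is hypothetical and only illustrates the flag and return-value handling.

#include <errno.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <rdma/rsocket.h>

/* Sketch only: read whatever is currently buffered, up to len bytes. */
static ssize_t drain_rsocket(int rsock, void *buf, size_t len)
{
        ssize_t n, total = 0;

        while ((size_t) total < len) {
                n = rrecv(rsock, (char *) buf + total, len - total, MSG_DONTWAIT);
                if (n > 0) {
                        total += n;
                        continue;
                }
                if (n == 0)
                        break;          /* remote side has finished sending */
                if (errno == EWOULDBLOCK || errno == EAGAIN)
                        break;          /* nothing more queued right now */
                return n;               /* real error */
        }
        return total;
}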
+ */ +static ssize_t rrecvv(int socket, const struct iovec *iov, int iovcnt, int flags) +{ + return rrecv(socket, iov[0].iov_base, iov[0].iov_len, flags); +} + +ssize_t rrecvmsg(int socket, struct msghdr *msg, int flags) +{ + if (msg->msg_control && msg->msg_controllen) + return ERR(ENOTSUP); + + return rrecvv(socket, msg->msg_iov, (int) msg->msg_iovlen, msg->msg_flags); +} + +ssize_t rread(int socket, void *buf, size_t count) +{ + return rrecv(socket, buf, count, 0); +} + +ssize_t rreadv(int socket, const struct iovec *iov, int iovcnt) +{ + return rrecvv(socket, iov, iovcnt, 0); +} + +static int rs_send_iomaps(struct rsocket *rs, int flags) +{ + struct rs_iomap_mr *iomr; + struct ibv_sge sge; + struct rs_iomap iom; + int ret; + + fastlock_acquire(&rs->map_lock); + while (!dlist_empty(&rs->iomap_queue)) { + if (!rs_can_send(rs)) { + ret = rs_get_comp(rs, rs_nonblocking(rs, flags), + rs_conn_can_send); + if (ret) + break; + if (!(rs->state & rs_writable)) { + ret = ERR(ECONNRESET); + break; + } + } + + iomr = container_of(rs->iomap_queue.next, struct rs_iomap_mr, entry); + if (!(rs->opts & RS_OPT_SWAP_SGL)) { + iom.offset = iomr->offset; + iom.sge.addr = (uintptr_t) iomr->mr->addr; + iom.sge.length = iomr->mr->length; + iom.sge.key = iomr->mr->rkey; + } else { + iom.offset = bswap_64(iomr->offset); + iom.sge.addr = bswap_64((uintptr_t) iomr->mr->addr); + iom.sge.length = bswap_32(iomr->mr->length); + iom.sge.key = bswap_32(iomr->mr->rkey); + } + + if (rs->sq_inline >= sizeof iom) { + sge.addr = (uintptr_t) &iom; + sge.length = sizeof iom; + sge.lkey = 0; + ret = rs_write_iomap(rs, iomr, &sge, 1, IBV_SEND_INLINE); + } else if (rs_sbuf_left(rs) >= sizeof iom) { + memcpy((void *) (uintptr_t) rs->ssgl[0].addr, &iom, sizeof iom); + rs->ssgl[0].length = sizeof iom; + ret = rs_write_iomap(rs, iomr, rs->ssgl, 1, 0); + if (rs_sbuf_left(rs) > sizeof iom) + rs->ssgl[0].addr += sizeof iom; + else + rs->ssgl[0].addr = (uintptr_t) rs->sbuf; + } else { + rs->ssgl[0].length = rs_sbuf_left(rs); + memcpy((void *) (uintptr_t) rs->ssgl[0].addr, &iom, + rs->ssgl[0].length); + rs->ssgl[1].length = sizeof iom - rs->ssgl[0].length; + memcpy(rs->sbuf, ((void *) &iom) + rs->ssgl[0].length, + rs->ssgl[1].length); + ret = rs_write_iomap(rs, iomr, rs->ssgl, 2, 0); + rs->ssgl[0].addr = (uintptr_t) rs->sbuf + rs->ssgl[1].length; + } + dlist_remove(&iomr->entry); + dlist_insert_tail(&iomr->entry, &rs->iomap_list); + if (ret) + break; + } + + rs->iomap_pending = !dlist_empty(&rs->iomap_queue); + fastlock_release(&rs->map_lock); + return ret; +} + +static ssize_t ds_sendv_udp(struct rsocket *rs, const struct iovec *iov, + int iovcnt, int flags, uint8_t op) +{ + struct ds_udp_header hdr; + struct msghdr msg; + struct iovec miov[8]; + ssize_t ret; + + if (iovcnt > 8) + return ERR(ENOTSUP); + + hdr.tag = htonl(DS_UDP_TAG); + hdr.version = rs->conn_dest->qp->hdr.version; + hdr.op = op; + hdr.reserved = 0; + hdr.qpn = htonl(rs->conn_dest->qp->cm_id->qp->qp_num & 0xFFFFFF); + if (rs->conn_dest->qp->hdr.version == 4) { + hdr.length = DS_UDP_IPV4_HDR_LEN; + hdr.addr.ipv4 = rs->conn_dest->qp->hdr.addr.ipv4; + } else { + hdr.length = DS_UDP_IPV6_HDR_LEN; + memcpy(hdr.addr.ipv6, &rs->conn_dest->qp->hdr.addr.ipv6, 16); + } + + miov[0].iov_base = &hdr; + miov[0].iov_len = hdr.length; + if (iov && iovcnt) + memcpy(&miov[1], iov, sizeof *iov * iovcnt); + + memset(&msg, 0, sizeof msg); + msg.msg_name = &rs->conn_dest->addr; + msg.msg_namelen = rdma_addrlen(&rs->conn_dest->addr.sa); + msg.msg_iov = miov; + msg.msg_iovlen = iovcnt + 1; + ret 
= sendmsg(rs->udp_sock, &msg, flags); + return ret > 0 ? ret - hdr.length : ret; +} + +static ssize_t ds_send_udp(struct rsocket *rs, const void *buf, size_t len, + int flags, uint8_t op) +{ + struct iovec iov; + if (buf && len) { + iov.iov_base = (void *) buf; + iov.iov_len = len; + return ds_sendv_udp(rs, &iov, 1, flags, op); + } else { + return ds_sendv_udp(rs, NULL, 0, flags, op); + } +} + +static ssize_t dsend(struct rsocket *rs, const void *buf, size_t len, int flags) +{ + struct ds_smsg *msg; + struct ibv_sge sge; + uint64_t offset; + int ret = 0; + + if (!rs->conn_dest->ah) + return ds_send_udp(rs, buf, len, flags, RS_OP_DATA); + + if (!ds_can_send(rs)) { + ret = ds_get_comp(rs, rs_nonblocking(rs, flags), ds_can_send); + if (ret) + return ret; + } + + msg = rs->smsg_free; + rs->smsg_free = msg->next; + rs->sqe_avail--; + + memcpy((void *) msg, &rs->conn_dest->qp->hdr, rs->conn_dest->qp->hdr.length); + memcpy((void *) msg + rs->conn_dest->qp->hdr.length, buf, len); + sge.addr = (uintptr_t) msg; + sge.length = rs->conn_dest->qp->hdr.length + len; + sge.lkey = rs->conn_dest->qp->smr->lkey; + offset = (uint8_t *) msg - rs->sbuf; + + ret = ds_post_send(rs, &sge, offset); + return ret ? ret : len; +} + +/* + * We overlap sending the data, by posting a small work request immediately, + * then increasing the size of the send on each iteration. + */ +ssize_t rsend(int socket, const void *buf, size_t len, int flags) +{ + struct rsocket *rs; + struct ibv_sge sge; + size_t left = len; + uint32_t xfer_size, olen = RS_OLAP_START_SIZE; + int ret = 0; + + rs = idm_at(&idm, socket); + if (rs->type == SOCK_DGRAM) { + fastlock_acquire(&rs->slock); + ret = dsend(rs, buf, len, flags); + fastlock_release(&rs->slock); + return ret; + } + + if (rs->state & rs_opening) { + ret = rs_do_connect(rs); + if (ret) { + if (errno == EINPROGRESS) + errno = EAGAIN; + return ret; + } + } + + fastlock_acquire(&rs->slock); + if (rs->iomap_pending) { + ret = rs_send_iomaps(rs, flags); + if (ret) + goto out; + } + for (; left; left -= xfer_size, buf += xfer_size) { + if (!rs_can_send(rs)) { + ret = rs_get_comp(rs, rs_nonblocking(rs, flags), + rs_conn_can_send); + if (ret) + break; + if (!(rs->state & rs_writable)) { + ret = ERR(ECONNRESET); + break; + } + } + + if (olen < left) { + xfer_size = olen; + if (olen < RS_MAX_TRANSFER) + olen <<= 1; + } else { + xfer_size = left; + } + + if (xfer_size > rs->sbuf_bytes_avail) + xfer_size = rs->sbuf_bytes_avail; + if (xfer_size > rs->target_sgl[rs->target_sge].length) + xfer_size = rs->target_sgl[rs->target_sge].length; + + if (xfer_size <= rs->sq_inline) { + sge.addr = (uintptr_t) buf; + sge.length = xfer_size; + sge.lkey = 0; + ret = rs_write_data(rs, &sge, 1, xfer_size, IBV_SEND_INLINE); + } else if (xfer_size <= rs_sbuf_left(rs)) { + memcpy((void *) (uintptr_t) rs->ssgl[0].addr, buf, xfer_size); + rs->ssgl[0].length = xfer_size; + ret = rs_write_data(rs, rs->ssgl, 1, xfer_size, 0); + if (xfer_size < rs_sbuf_left(rs)) + rs->ssgl[0].addr += xfer_size; + else + rs->ssgl[0].addr = (uintptr_t) rs->sbuf; + } else { + rs->ssgl[0].length = rs_sbuf_left(rs); + memcpy((void *) (uintptr_t) rs->ssgl[0].addr, buf, + rs->ssgl[0].length); + rs->ssgl[1].length = xfer_size - rs->ssgl[0].length; + memcpy(rs->sbuf, buf + rs->ssgl[0].length, rs->ssgl[1].length); + ret = rs_write_data(rs, rs->ssgl, 2, xfer_size, 0); + rs->ssgl[0].addr = (uintptr_t) rs->sbuf + rs->ssgl[1].length; + } + if (ret) + break; + } +out: + fastlock_release(&rs->slock); + + return (ret && left == len) ? 
ret : len - left; +} + +ssize_t rsendto(int socket, const void *buf, size_t len, int flags, + const struct sockaddr *dest_addr, socklen_t addrlen) +{ + struct rsocket *rs; + int ret; + + rs = idm_at(&idm, socket); + if (rs->type == SOCK_STREAM) { + if (dest_addr || addrlen) + return ERR(EISCONN); + + return rsend(socket, buf, len, flags); + } + + if (rs->state == rs_init) { + ret = ds_init_ep(rs); + if (ret) + return ret; + } + + fastlock_acquire(&rs->slock); + if (!rs->conn_dest || ds_compare_addr(dest_addr, &rs->conn_dest->addr)) { + ret = ds_get_dest(rs, dest_addr, addrlen, &rs->conn_dest); + if (ret) + goto out; + } + + ret = dsend(rs, buf, len, flags); +out: + fastlock_release(&rs->slock); + return ret; +} + +static void rs_copy_iov(void *dst, const struct iovec **iov, size_t *offset, size_t len) +{ + size_t size; + + while (len) { + size = (*iov)->iov_len - *offset; + if (size > len) { + memcpy (dst, (*iov)->iov_base + *offset, len); + *offset += len; + break; + } + + memcpy(dst, (*iov)->iov_base + *offset, size); + len -= size; + dst += size; + (*iov)++; + *offset = 0; + } +} + +static ssize_t rsendv(int socket, const struct iovec *iov, int iovcnt, int flags) +{ + struct rsocket *rs; + const struct iovec *cur_iov; + size_t left, len, offset = 0; + uint32_t xfer_size, olen = RS_OLAP_START_SIZE; + int i, ret = 0; + + rs = idm_at(&idm, socket); + if (rs->state & rs_opening) { + ret = rs_do_connect(rs); + if (ret) { + if (errno == EINPROGRESS) + errno = EAGAIN; + return ret; + } + } + + cur_iov = iov; + len = iov[0].iov_len; + for (i = 1; i < iovcnt; i++) + len += iov[i].iov_len; + left = len; + + fastlock_acquire(&rs->slock); + if (rs->iomap_pending) { + ret = rs_send_iomaps(rs, flags); + if (ret) + goto out; + } + for (; left; left -= xfer_size) { + if (!rs_can_send(rs)) { + ret = rs_get_comp(rs, rs_nonblocking(rs, flags), + rs_conn_can_send); + if (ret) + break; + if (!(rs->state & rs_writable)) { + ret = ERR(ECONNRESET); + break; + } + } + + if (olen < left) { + xfer_size = olen; + if (olen < RS_MAX_TRANSFER) + olen <<= 1; + } else { + xfer_size = left; + } + + if (xfer_size > rs->sbuf_bytes_avail) + xfer_size = rs->sbuf_bytes_avail; + if (xfer_size > rs->target_sgl[rs->target_sge].length) + xfer_size = rs->target_sgl[rs->target_sge].length; + + if (xfer_size <= rs_sbuf_left(rs)) { + rs_copy_iov((void *) (uintptr_t) rs->ssgl[0].addr, + &cur_iov, &offset, xfer_size); + rs->ssgl[0].length = xfer_size; + ret = rs_write_data(rs, rs->ssgl, 1, xfer_size, + xfer_size <= rs->sq_inline ? IBV_SEND_INLINE : 0); + if (xfer_size < rs_sbuf_left(rs)) + rs->ssgl[0].addr += xfer_size; + else + rs->ssgl[0].addr = (uintptr_t) rs->sbuf; + } else { + rs->ssgl[0].length = rs_sbuf_left(rs); + rs_copy_iov((void *) (uintptr_t) rs->ssgl[0].addr, &cur_iov, + &offset, rs->ssgl[0].length); + rs->ssgl[1].length = xfer_size - rs->ssgl[0].length; + rs_copy_iov(rs->sbuf, &cur_iov, &offset, rs->ssgl[1].length); + ret = rs_write_data(rs, rs->ssgl, 2, xfer_size, + xfer_size <= rs->sq_inline ? IBV_SEND_INLINE : 0); + rs->ssgl[0].addr = (uintptr_t) rs->sbuf + rs->ssgl[1].length; + } + if (ret) + break; + } +out: + fastlock_release(&rs->slock); + + return (ret && left == len) ? 
ret : len - left; +} + +ssize_t rsendmsg(int socket, const struct msghdr *msg, int flags) +{ + if (msg->msg_control && msg->msg_controllen) + return ERR(ENOTSUP); + + return rsendv(socket, msg->msg_iov, (int) msg->msg_iovlen, flags); +} + +ssize_t rwrite(int socket, const void *buf, size_t count) +{ + return rsend(socket, buf, count, 0); +} + +ssize_t rwritev(int socket, const struct iovec *iov, int iovcnt) +{ + return rsendv(socket, iov, iovcnt, 0); +} + +static struct pollfd *rs_fds_alloc(nfds_t nfds) +{ + static __thread struct pollfd *rfds; + static __thread nfds_t rnfds; + + if (nfds > rnfds) { + if (rfds) + free(rfds); + + rfds = malloc(sizeof *rfds * nfds); + rnfds = rfds ? nfds : 0; + } + + return rfds; +} + +static int rs_poll_rs(struct rsocket *rs, int events, + int nonblock, int (*test)(struct rsocket *rs)) +{ + struct pollfd fds; + short revents; + int ret; + +check_cq: + if ((rs->type == SOCK_STREAM) && ((rs->state & rs_connected) || + (rs->state == rs_disconnected) || (rs->state & rs_error))) { + rs_process_cq(rs, nonblock, test); + + revents = 0; + if ((events & POLLIN) && rs_conn_have_rdata(rs)) + revents |= POLLIN; + if ((events & POLLOUT) && rs_can_send(rs)) + revents |= POLLOUT; + if (!(rs->state & rs_connected)) { + if (rs->state == rs_disconnected) + revents |= POLLHUP; + else + revents |= POLLERR; + } + + return revents; + } else if (rs->type == SOCK_DGRAM) { + ds_process_cqs(rs, nonblock, test); + + revents = 0; + if ((events & POLLIN) && rs_have_rdata(rs)) + revents |= POLLIN; + if ((events & POLLOUT) && ds_can_send(rs)) + revents |= POLLOUT; + + return revents; + } + + if (rs->state == rs_listening) { + fds.fd = rs->cm_id->channel->fd; + fds.events = events; + fds.revents = 0; + poll(&fds, 1, 0); + return fds.revents; + } + + if (rs->state & rs_opening) { + ret = rs_do_connect(rs); + if (ret) { + if (errno == EINPROGRESS) { + errno = 0; + return 0; + } else { + return POLLOUT; + } + } + goto check_cq; + } + + if (rs->state == rs_connect_error) + return (rs->err && events & POLLOUT) ? 
POLLOUT : 0; + + return 0; +} + +static int rs_poll_check(struct pollfd *fds, nfds_t nfds) +{ + struct rsocket *rs; + int i, cnt = 0; + + for (i = 0; i < nfds; i++) { + rs = idm_lookup(&idm, fds[i].fd); + if (rs) + fds[i].revents = rs_poll_rs(rs, fds[i].events, 1, rs_poll_all); + else + poll(&fds[i], 1, 0); + + if (fds[i].revents) + cnt++; + } + return cnt; +} + +static int rs_poll_arm(struct pollfd *rfds, struct pollfd *fds, nfds_t nfds) +{ + struct rsocket *rs; + int i; + + for (i = 0; i < nfds; i++) { + rs = idm_lookup(&idm, fds[i].fd); + if (rs) { + fds[i].revents = rs_poll_rs(rs, fds[i].events, 0, rs_is_cq_armed); + if (fds[i].revents) + return 1; + + if (rs->type == SOCK_STREAM) { + if (rs->state >= rs_connected) + rfds[i].fd = rs->cm_id->recv_cq_channel->fd; + else + rfds[i].fd = rs->cm_id->channel->fd; + } else { + rfds[i].fd = rs->epfd; + } + rfds[i].events = POLLIN; + } else { + rfds[i].fd = fds[i].fd; + rfds[i].events = fds[i].events; + } + rfds[i].revents = 0; + } + return 0; +} + +static int rs_poll_events(struct pollfd *rfds, struct pollfd *fds, nfds_t nfds) +{ + struct rsocket *rs; + int i, cnt = 0; + + for (i = 0; i < nfds; i++) { + if (!rfds[i].revents) + continue; + + rs = idm_lookup(&idm, fds[i].fd); + if (rs) { + fastlock_acquire(&rs->cq_wait_lock); + if (rs->type == SOCK_STREAM) + rs_get_cq_event(rs); + else + ds_get_cq_event(rs); + fastlock_release(&rs->cq_wait_lock); + fds[i].revents = rs_poll_rs(rs, fds[i].events, 1, rs_poll_all); + } else { + fds[i].revents = rfds[i].revents; + } + if (fds[i].revents) + cnt++; + } + return cnt; +} + +/* + * We need to poll *all* fd's that the user specifies at least once. + * Note that we may receive events on an rsocket that may not be reported + * to the user (e.g. connection events or credit updates). Process those + * events, then return to polling until we find ones of interest. 
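Since rpoll() below mirrors poll(2), taking the same struct pollfd array and timeout and accepting a mix of rsockets and regular fds, a typical caller looks exactly like a poll-based accept loop. A hedged usage sketch, with a hypothetical wrapper name:

#include <poll.h>
#include <rdma/rsocket.h>

/* Sketch only: wait up to timeout_ms for a connection request on a
 * listening rsocket, then accept it. */
static int wait_and_accept(int listen_rsock, int timeout_ms)
{
        struct pollfd fds;

        fds.fd = listen_rsock;
        fds.events = POLLIN;
        fds.revents = 0;

        if (rpoll(&fds, 1, timeout_ms) <= 0)
                return -1;              /* timeout or error */

        return raccept(listen_rsock, NULL, NULL);
}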
+ */ +int rpoll(struct pollfd *fds, nfds_t nfds, int timeout) +{ + struct timeval s, e; + struct pollfd *rfds; + uint32_t poll_time = 0; + int ret; + + do { + ret = rs_poll_check(fds, nfds); + if (ret || !timeout) + return ret; + + if (!poll_time) + gettimeofday(&s, NULL); + + gettimeofday(&e, NULL); + poll_time = (e.tv_sec - s.tv_sec) * 1000000 + + (e.tv_usec - s.tv_usec) + 1; + } while (poll_time <= polling_time); + + rfds = rs_fds_alloc(nfds); + if (!rfds) + return ERR(ENOMEM); + + do { + ret = rs_poll_arm(rfds, fds, nfds); + if (ret) + break; + + ret = poll(rfds, nfds, timeout); + if (ret <= 0) + break; + + ret = rs_poll_events(rfds, fds, nfds); + } while (!ret); + + return ret; +} + +static struct pollfd * +rs_select_to_poll(int *nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds) +{ + struct pollfd *fds; + int fd, i = 0; + + fds = calloc(*nfds, sizeof *fds); + if (!fds) + return NULL; + + for (fd = 0; fd < *nfds; fd++) { + if (readfds && FD_ISSET(fd, readfds)) { + fds[i].fd = fd; + fds[i].events = POLLIN; + } + + if (writefds && FD_ISSET(fd, writefds)) { + fds[i].fd = fd; + fds[i].events |= POLLOUT; + } + + if (exceptfds && FD_ISSET(fd, exceptfds)) + fds[i].fd = fd; + + if (fds[i].fd) + i++; + } + + *nfds = i; + return fds; +} + +static int +rs_poll_to_select(int nfds, struct pollfd *fds, fd_set *readfds, + fd_set *writefds, fd_set *exceptfds) +{ + int i, cnt = 0; + + for (i = 0; i < nfds; i++) { + if (readfds && (fds[i].revents & (POLLIN | POLLHUP))) { + FD_SET(fds[i].fd, readfds); + cnt++; + } + + if (writefds && (fds[i].revents & POLLOUT)) { + FD_SET(fds[i].fd, writefds); + cnt++; + } + + if (exceptfds && (fds[i].revents & ~(POLLIN | POLLOUT))) { + FD_SET(fds[i].fd, exceptfds); + cnt++; + } + } + return cnt; +} + +static int rs_convert_timeout(struct timeval *timeout) +{ + return !timeout ? -1 : + timeout->tv_sec * 1000 + timeout->tv_usec / 1000; +} + +int rselect(int nfds, fd_set *readfds, fd_set *writefds, + fd_set *exceptfds, struct timeval *timeout) +{ + struct pollfd *fds; + int ret; + + fds = rs_select_to_poll(&nfds, readfds, writefds, exceptfds); + if (!fds) + return ERR(ENOMEM); + + ret = rpoll(fds, nfds, rs_convert_timeout(timeout)); + + if (readfds) + FD_ZERO(readfds); + if (writefds) + FD_ZERO(writefds); + if (exceptfds) + FD_ZERO(exceptfds); + + if (ret > 0) + ret = rs_poll_to_select(nfds, fds, readfds, writefds, exceptfds); + + free(fds); + return ret; +} + +/* + * For graceful disconnect, notify the remote side that we're + * disconnecting and wait until all outstanding sends complete, provided + * that the remote side has not sent a disconnect message. + */ +int rshutdown(int socket, int how) +{ + struct rsocket *rs; + int ctrl, ret = 0; + + rs = idm_at(&idm, socket); + if (rs->fd_flags & O_NONBLOCK) + rs_set_nonblocking(rs, 0); + + if (rs->state & rs_connected) { + if (how == SHUT_RDWR) { + ctrl = RS_CTRL_DISCONNECT; + rs->state &= ~(rs_readable | rs_writable); + } else if (how == SHUT_WR) { + rs->state &= ~rs_writable; + ctrl = (rs->state & rs_readable) ? 
+ RS_CTRL_SHUTDOWN : RS_CTRL_DISCONNECT; + } else { + rs->state &= ~rs_readable; + if (rs->state & rs_writable) + goto out; + ctrl = RS_CTRL_DISCONNECT; + } + if (!rs->ctrl_avail) { + ret = rs_process_cq(rs, 0, rs_conn_can_send_ctrl); + if (ret) + goto out; + } + + if ((rs->state & rs_connected) && rs->ctrl_avail) { + rs->ctrl_avail--; + ret = rs_post_msg(rs, rs_msg_set(RS_OP_CTRL, ctrl)); + } + } + + if (rs->state & rs_connected) + rs_process_cq(rs, 0, rs_conn_all_sends_done); + +out: + if ((rs->fd_flags & O_NONBLOCK) && (rs->state & rs_connected)) + rs_set_nonblocking(rs, rs->fd_flags); + + if (rs->state & rs_disconnected) { + /* Generate event by flushing receives to unblock rpoll */ + ibv_req_notify_cq(rs->cm_id->recv_cq, 0); + rdma_disconnect(rs->cm_id); + } + + return ret; +} + +static void ds_shutdown(struct rsocket *rs) +{ + if (rs->svcs) + rs_modify_svcs(rs, 0); + + if (rs->fd_flags & O_NONBLOCK) + rs_set_nonblocking(rs, 0); + + rs->state &= ~(rs_readable | rs_writable); + ds_process_cqs(rs, 0, ds_all_sends_done); + + if (rs->fd_flags & O_NONBLOCK) + rs_set_nonblocking(rs, rs->fd_flags); +} + +int rclose(int socket) +{ + struct rsocket *rs; + + rs = idm_at(&idm, socket); + if (rs->type == SOCK_STREAM) { + if (rs->state & rs_connected) + rshutdown(socket, SHUT_RDWR); + } else { + ds_shutdown(rs); + } + + rs_free(rs); + return 0; +} + +static void rs_copy_addr(struct sockaddr *dst, struct sockaddr *src, socklen_t *len) +{ + socklen_t size; + + if (src->sa_family == AF_INET) { + size = min(*len, sizeof(struct sockaddr_in)); + *len = sizeof(struct sockaddr_in); + } else { + size = min(*len, sizeof(struct sockaddr_in6)); + *len = sizeof(struct sockaddr_in6); + } + memcpy(dst, src, size); +} + +int rgetpeername(int socket, struct sockaddr *addr, socklen_t *addrlen) +{ + struct rsocket *rs; + + rs = idm_at(&idm, socket); + if (rs->type == SOCK_STREAM) { + rs_copy_addr(addr, rdma_get_peer_addr(rs->cm_id), addrlen); + return 0; + } else { + return getpeername(rs->udp_sock, addr, addrlen); + } +} + +int rgetsockname(int socket, struct sockaddr *addr, socklen_t *addrlen) +{ + struct rsocket *rs; + + rs = idm_at(&idm, socket); + if (rs->type == SOCK_STREAM) { + rs_copy_addr(addr, rdma_get_local_addr(rs->cm_id), addrlen); + return 0; + } else { + return getsockname(rs->udp_sock, addr, addrlen); + } +} + +int rsetsockopt(int socket, int level, int optname, + const void *optval, socklen_t optlen) +{ + struct rsocket *rs; + int ret, opt_on = 0; + uint64_t *opts = NULL; + + ret = ERR(ENOTSUP); + rs = idm_at(&idm, socket); + if (rs->type == SOCK_DGRAM && level != SOL_RDMA) { + ret = setsockopt(rs->udp_sock, level, optname, optval, optlen); + if (ret) + return ret; + } + + switch (level) { + case SOL_SOCKET: + opts = &rs->so_opts; + switch (optname) { + case SO_REUSEADDR: + if (rs->type == SOCK_STREAM) { + ret = rdma_set_option(rs->cm_id, RDMA_OPTION_ID, + RDMA_OPTION_ID_REUSEADDR, + (void *) optval, optlen); + if (ret && ((errno == ENOSYS) || ((rs->state != rs_init) && + rs->cm_id->context && + (rs->cm_id->verbs->device->transport_type == IBV_TRANSPORT_IB)))) + ret = 0; + } + opt_on = *(int *) optval; + break; + case SO_RCVBUF: + if ((rs->type == SOCK_STREAM && !rs->rbuf) || + (rs->type == SOCK_DGRAM && !rs->qp_list)) + rs->rbuf_size = (*(uint32_t *) optval) << 1; + ret = 0; + break; + case SO_SNDBUF: + if (!rs->sbuf) + rs->sbuf_size = (*(uint32_t *) optval) << 1; + if (rs->sbuf_size < RS_SNDLOWAT) + rs->sbuf_size = RS_SNDLOWAT << 1; + ret = 0; + break; + case SO_LINGER: + /* Invert value so 
default so_opt = 0 is on */ + opt_on = !((struct linger *) optval)->l_onoff; + ret = 0; + break; + case SO_KEEPALIVE: + opt_on = *(int *) optval; + ret = 0; + break; + case SO_OOBINLINE: + opt_on = *(int *) optval; + ret = 0; + break; + default: + break; + } + break; + case IPPROTO_TCP: + opts = &rs->tcp_opts; + switch (optname) { + case TCP_NODELAY: + opt_on = *(int *) optval; + ret = 0; + break; + case TCP_MAXSEG: + ret = 0; + break; + default: + break; + } + break; + case IPPROTO_IPV6: + opts = &rs->ipv6_opts; + switch (optname) { + case IPV6_V6ONLY: + if (rs->type == SOCK_STREAM) { + ret = rdma_set_option(rs->cm_id, RDMA_OPTION_ID, + RDMA_OPTION_ID_AFONLY, + (void *) optval, optlen); + } + opt_on = *(int *) optval; + break; + default: + break; + } + break; + case SOL_RDMA: + if (rs->state >= rs_opening) { + ret = ERR(EINVAL); + break; + } + + switch (optname) { + case RDMA_SQSIZE: + rs->sq_size = min((*(uint32_t *) optval), RS_QP_MAX_SIZE); + ret = 0; + break; + case RDMA_RQSIZE: + rs->rq_size = min((*(uint32_t *) optval), RS_QP_MAX_SIZE); + ret = 0; + break; + case RDMA_INLINE: + rs->sq_inline = min(*(uint32_t *) optval, RS_QP_MAX_SIZE); + if (rs->sq_inline < RS_MIN_INLINE) + rs->sq_inline = RS_MIN_INLINE; + ret = 0; + break; + case RDMA_IOMAPSIZE: + rs->target_iomap_size = (uint16_t) rs_scale_to_value( + (uint8_t) rs_value_to_scale(*(int *) optval, 8), 8); + ret = 0; + break; + case RDMA_ROUTE: + if ((rs->optval = calloc(optlen, 1))) { + memcpy(rs->optval, optval, optlen); + rs->optlen = optlen; + ret = 0; + } else { + ret = ERR(ENOMEM); + } + break; + default: + break; + } + break; + default: + break; + } + + if (!ret && opts) { + if (opt_on) + *opts |= (1 << optname); + else + *opts &= ~(1 << optname); + } + + return ret; +} + +int rgetsockopt(int socket, int level, int optname, + void *optval, socklen_t *optlen) +{ + struct rsocket *rs; + int ret = 0; + + rs = idm_at(&idm, socket); + switch (level) { + case SOL_SOCKET: + switch (optname) { + case SO_REUSEADDR: + case SO_KEEPALIVE: + case SO_OOBINLINE: + *((int *) optval) = !!(rs->so_opts & (1 << optname)); + *optlen = sizeof(int); + break; + case SO_RCVBUF: + *((int *) optval) = rs->rbuf_size; + *optlen = sizeof(int); + break; + case SO_SNDBUF: + *((int *) optval) = rs->sbuf_size; + *optlen = sizeof(int); + break; + case SO_LINGER: + /* Value is inverted so default so_opt = 0 is on */ + ((struct linger *) optval)->l_onoff = + !(rs->so_opts & (1 << optname)); + ((struct linger *) optval)->l_linger = 0; + *optlen = sizeof(struct linger); + break; + case SO_ERROR: + *((int *) optval) = rs->err; + *optlen = sizeof(int); + rs->err = 0; + break; + default: + ret = ENOTSUP; + break; + } + break; + case IPPROTO_TCP: + switch (optname) { + case TCP_NODELAY: + *((int *) optval) = !!(rs->tcp_opts & (1 << optname)); + *optlen = sizeof(int); + break; + case TCP_MAXSEG: + *((int *) optval) = (rs->cm_id && rs->cm_id->route.num_paths) ? 
+ 1 << (7 + rs->cm_id->route.path_rec->mtu) : + 2048; + *optlen = sizeof(int); + break; + default: + ret = ENOTSUP; + break; + } + break; + case IPPROTO_IPV6: + switch (optname) { + case IPV6_V6ONLY: + *((int *) optval) = !!(rs->ipv6_opts & (1 << optname)); + *optlen = sizeof(int); + break; + default: + ret = ENOTSUP; + break; + } + break; + case SOL_RDMA: + switch (optname) { + case RDMA_SQSIZE: + *((int *) optval) = rs->sq_size; + *optlen = sizeof(int); + break; + case RDMA_RQSIZE: + *((int *) optval) = rs->rq_size; + *optlen = sizeof(int); + break; + case RDMA_INLINE: + *((int *) optval) = rs->sq_inline; + *optlen = sizeof(int); + break; + case RDMA_IOMAPSIZE: + *((int *) optval) = rs->target_iomap_size; + *optlen = sizeof(int); + break; + default: + ret = ENOTSUP; + break; + } + break; + default: + ret = ENOTSUP; + break; + } + + return rdma_seterrno(ret); +} + +int rfcntl(int socket, int cmd, ... /* arg */ ) +{ + struct rsocket *rs; + va_list args; + long param; + int ret = 0; + + rs = idm_at(&idm, socket); + va_start(args, cmd); + switch (cmd) { + case F_GETFL: + ret = (int) rs->fd_flags; + break; + case F_SETFL: + param = va_arg(args, long); + if (param & O_NONBLOCK) + ret = rs_set_nonblocking(rs, O_NONBLOCK); + + if (!ret) + rs->fd_flags |= param; + break; + default: + ret = ERR(ENOTSUP); + break; + } + va_end(args); + return ret; +} + +static struct rs_iomap_mr *rs_get_iomap_mr(struct rsocket *rs) +{ + int i; + + if (!rs->remote_iomappings) { + rs->remote_iomappings = calloc(rs->remote_iomap.length, + sizeof(*rs->remote_iomappings)); + if (!rs->remote_iomappings) + return NULL; + + for (i = 0; i < rs->remote_iomap.length; i++) + rs->remote_iomappings[i].index = i; + } + + for (i = 0; i < rs->remote_iomap.length; i++) { + if (!rs->remote_iomappings[i].mr) + return &rs->remote_iomappings[i]; + } + return NULL; +} + +/* + * If an offset is given, we map to it. If offset is -1, then we map the + * offset to the address of buf. We do not check for conflicts, which must + * be fixed at some point. 
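riomap() below and riowrite() further down form a pair: the target maps a buffer for remote writes (passing offset -1 means "use the buffer address as the offset"), shares that offset with its peer out of band, and the peer then places data directly into the mapped region, bypassing the target's receive path. A usage sketch with hypothetical helper names:

#include <sys/types.h>
#include <sys/mman.h>
#include <rdma/rsocket.h>

/* Sketch only, target side: expose buf for direct remote writes and
 * return the offset the peer must use (here, the buffer address). */
static off_t expose_buffer(int rsock, void *buf, size_t len)
{
        return riomap(rsock, buf, len, PROT_WRITE, 0, -1);
}

/* Sketch only, initiator side: write len bytes straight into the peer's
 * mapped buffer at offset. */
static size_t push_buffer(int rsock, const void *buf, size_t len, off_t offset)
{
        return riowrite(rsock, buf, len, offset, 0);
}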
+ */ +off_t riomap(int socket, void *buf, size_t len, int prot, int flags, off_t offset) +{ + struct rsocket *rs; + struct rs_iomap_mr *iomr; + int access = IBV_ACCESS_LOCAL_WRITE; + + rs = idm_at(&idm, socket); + if (!rs->cm_id->pd || (prot & ~(PROT_WRITE | PROT_NONE))) + return ERR(EINVAL); + + fastlock_acquire(&rs->map_lock); + if (prot & PROT_WRITE) { + iomr = rs_get_iomap_mr(rs); + access |= IBV_ACCESS_REMOTE_WRITE; + } else { + iomr = calloc(1, sizeof *iomr); + iomr->index = -1; + } + if (!iomr) { + offset = ERR(ENOMEM); + goto out; + } + + iomr->mr = ibv_reg_mr(rs->cm_id->pd, buf, len, access); + if (!iomr->mr) { + if (iomr->index < 0) + free(iomr); + offset = -1; + goto out; + } + + if (offset == -1) + offset = (uintptr_t) buf; + iomr->offset = offset; + atomic_init(&iomr->refcnt); + atomic_set(&iomr->refcnt, 1); + + if (iomr->index >= 0) { + dlist_insert_tail(&iomr->entry, &rs->iomap_queue); + rs->iomap_pending = 1; + } else { + dlist_insert_tail(&iomr->entry, &rs->iomap_list); + } +out: + fastlock_release(&rs->map_lock); + return offset; +} + +int riounmap(int socket, void *buf, size_t len) +{ + struct rsocket *rs; + struct rs_iomap_mr *iomr; + dlist_entry *entry; + int ret = 0; + + rs = idm_at(&idm, socket); + fastlock_acquire(&rs->map_lock); + + for (entry = rs->iomap_list.next; entry != &rs->iomap_list; + entry = entry->next) { + iomr = container_of(entry, struct rs_iomap_mr, entry); + if (iomr->mr->addr == buf && iomr->mr->length == len) { + rs_release_iomap_mr(iomr); + goto out; + } + } + + for (entry = rs->iomap_queue.next; entry != &rs->iomap_queue; + entry = entry->next) { + iomr = container_of(entry, struct rs_iomap_mr, entry); + if (iomr->mr->addr == buf && iomr->mr->length == len) { + rs_release_iomap_mr(iomr); + goto out; + } + } + ret = ERR(EINVAL); +out: + fastlock_release(&rs->map_lock); + return ret; +} + +static struct rs_iomap *rs_find_iomap(struct rsocket *rs, off_t offset) +{ + int i; + + for (i = 0; i < rs->target_iomap_size; i++) { + if (offset >= rs->target_iomap[i].offset && + offset < rs->target_iomap[i].offset + rs->target_iomap[i].sge.length) + return &rs->target_iomap[i]; + } + return NULL; +} + +size_t riowrite(int socket, const void *buf, size_t count, off_t offset, int flags) +{ + struct rsocket *rs; + struct rs_iomap *iom = NULL; + struct ibv_sge sge; + size_t left = count; + uint32_t xfer_size, olen = RS_OLAP_START_SIZE; + int ret = 0; + + rs = idm_at(&idm, socket); + fastlock_acquire(&rs->slock); + if (rs->iomap_pending) { + ret = rs_send_iomaps(rs, flags); + if (ret) + goto out; + } + for (; left; left -= xfer_size, buf += xfer_size, offset += xfer_size) { + if (!iom || offset > iom->offset + iom->sge.length) { + iom = rs_find_iomap(rs, offset); + if (!iom) + break; + } + + if (!rs_can_send(rs)) { + ret = rs_get_comp(rs, rs_nonblocking(rs, flags), + rs_conn_can_send); + if (ret) + break; + if (!(rs->state & rs_writable)) { + ret = ERR(ECONNRESET); + break; + } + } + + if (olen < left) { + xfer_size = olen; + if (olen < RS_MAX_TRANSFER) + olen <<= 1; + } else { + xfer_size = left; + } + + if (xfer_size > rs->sbuf_bytes_avail) + xfer_size = rs->sbuf_bytes_avail; + if (xfer_size > iom->offset + iom->sge.length - offset) + xfer_size = iom->offset + iom->sge.length - offset; + + if (xfer_size <= rs->sq_inline) { + sge.addr = (uintptr_t) buf; + sge.length = xfer_size; + sge.lkey = 0; + ret = rs_write_direct(rs, iom, offset, &sge, 1, + xfer_size, IBV_SEND_INLINE); + } else if (xfer_size <= rs_sbuf_left(rs)) { + memcpy((void *) (uintptr_t) 
rs->ssgl[0].addr, buf, xfer_size); + rs->ssgl[0].length = xfer_size; + ret = rs_write_direct(rs, iom, offset, rs->ssgl, 1, xfer_size, 0); + if (xfer_size < rs_sbuf_left(rs)) + rs->ssgl[0].addr += xfer_size; + else + rs->ssgl[0].addr = (uintptr_t) rs->sbuf; + } else { + rs->ssgl[0].length = rs_sbuf_left(rs); + memcpy((void *) (uintptr_t) rs->ssgl[0].addr, buf, + rs->ssgl[0].length); + rs->ssgl[1].length = xfer_size - rs->ssgl[0].length; + memcpy(rs->sbuf, buf + rs->ssgl[0].length, rs->ssgl[1].length); + ret = rs_write_direct(rs, iom, offset, rs->ssgl, 2, xfer_size, 0); + rs->ssgl[0].addr = (uintptr_t) rs->sbuf + rs->ssgl[1].length; + } + if (ret) + break; + } +out: + fastlock_release(&rs->slock); + + return (ret && left == count) ? ret : count - left; +} + +static int rs_svc_grow_sets(void) +{ + struct rsocket **rss; + struct pollfd *fds; + void *set; + + set = calloc(svc_size + 2, sizeof(*rss) + sizeof(*fds)); + if (!set) + return ENOMEM; + + svc_size += 2; + rss = set; + fds = set + sizeof(*rss) * svc_size; + if (svc_cnt) { + memcpy(rss, svc_rss, sizeof(*rss) * svc_cnt); + memcpy(fds, svc_fds, sizeof(*fds) * svc_cnt); + } + + free(svc_rss); + free(svc_fds); + svc_rss = rss; + svc_fds = fds; + return 0; +} + +/* + * Index 0 is reserved for the service's communication socket. + */ +static int rs_svc_add_rs(struct rsocket *rs) +{ + int ret; + + if (svc_cnt >= svc_size - 1) { + ret = rs_svc_grow_sets(); + if (ret) + return ret; + } + + svc_rss[++svc_cnt] = rs; + svc_fds[svc_cnt].fd = rs->udp_sock; + svc_fds[svc_cnt].events = POLLIN; + svc_fds[svc_cnt].revents = 0; + return 0; +} + +static int rs_svc_rm_rs(struct rsocket *rs) +{ + int i; + + for (i = 1; i <= svc_cnt; i++) { + if (svc_rss[i] == rs) { + svc_cnt--; + svc_rss[i] = svc_rss[svc_cnt]; + svc_fds[i] = svc_fds[svc_cnt]; + return 0; + } + } + return EBADF; +} + +static void rs_svc_process_sock(void) +{ + struct rs_svc_msg msg; + + read(svc_sock[1], &msg, sizeof msg); + if (msg.svcs & RS_SVC_DGRAM) { + msg.status = rs_svc_add_rs(msg.rs); + } else if (!msg.svcs) { + msg.status = rs_svc_rm_rs(msg.rs); + } + + if (!msg.status) + msg.rs->svcs = msg.svcs; + write(svc_sock[1], &msg, sizeof msg); +} + +static uint8_t rs_svc_sgid_index(struct ds_dest *dest, union ibv_gid *sgid) +{ + union ibv_gid gid; + int i; + + for (i = 0; i < 16; i++) { + ibv_query_gid(dest->qp->cm_id->verbs, dest->qp->cm_id->port_num, + i, &gid); + if (!memcmp(sgid, &gid, sizeof gid)) + return i; + } + return 0; +} + +static uint8_t rs_svc_path_bits(struct ds_dest *dest) +{ + struct ibv_port_attr attr; + + if (!ibv_query_port(dest->qp->cm_id->verbs, dest->qp->cm_id->port_num, &attr)) + return (uint8_t) ((1 << attr.lmc) - 1); + return 0x7f; +} + +static void rs_svc_create_ah(struct rsocket *rs, struct ds_dest *dest, uint32_t qpn) +{ + union socket_addr saddr; + struct rdma_cm_id *id; + struct ibv_ah_attr attr; + int ret; + + if (dest->ah) { + fastlock_acquire(&rs->slock); + ibv_destroy_ah(dest->ah); + dest->ah = NULL; + fastlock_release(&rs->slock); + } + + ret = rdma_create_id(NULL, &id, NULL, dest->qp->cm_id->ps); + if (ret) + return; + + memcpy(&saddr, rdma_get_local_addr(dest->qp->cm_id), + rdma_addrlen(rdma_get_local_addr(dest->qp->cm_id))); + if (saddr.sa.sa_family == AF_INET) + saddr.sin.sin_port = 0; + else + saddr.sin6.sin6_port = 0; + ret = rdma_resolve_addr(id, &saddr.sa, &dest->addr.sa, 2000); + if (ret) + goto out; + + ret = rdma_resolve_route(id, 2000); + if (ret) + goto out; + + memset(&attr, 0, sizeof attr); + if (id->route.path_rec->hop_limit > 1) { + 
attr.is_global = 1; + attr.grh.dgid = id->route.path_rec->dgid; + attr.grh.flow_label = ntohl(id->route.path_rec->flow_label); + attr.grh.sgid_index = rs_svc_sgid_index(dest, &id->route.path_rec->sgid); + attr.grh.hop_limit = id->route.path_rec->hop_limit; + attr.grh.traffic_class = id->route.path_rec->traffic_class; + } + attr.dlid = ntohs(id->route.path_rec->dlid); + attr.sl = id->route.path_rec->sl; + attr.src_path_bits = id->route.path_rec->slid & rs_svc_path_bits(dest); + attr.static_rate = id->route.path_rec->rate; + attr.port_num = id->port_num; + + fastlock_acquire(&rs->slock); + dest->qpn = qpn; + dest->ah = ibv_create_ah(dest->qp->cm_id->pd, &attr); + fastlock_release(&rs->slock); +out: + rdma_destroy_id(id); +} + +static int rs_svc_valid_udp_hdr(struct ds_udp_header *udp_hdr, + union socket_addr *addr) +{ + return (udp_hdr->tag == ntohl(DS_UDP_TAG)) && + ((udp_hdr->version == 4 && addr->sa.sa_family == AF_INET && + udp_hdr->length == DS_UDP_IPV4_HDR_LEN) || + (udp_hdr->version == 6 && addr->sa.sa_family == AF_INET6 && + udp_hdr->length == DS_UDP_IPV6_HDR_LEN)); +} + +static void rs_svc_forward(struct rsocket *rs, void *buf, size_t len, + union socket_addr *src) +{ + struct ds_header hdr; + struct ds_smsg *msg; + struct ibv_sge sge; + uint64_t offset; + + if (!ds_can_send(rs)) { + if (ds_get_comp(rs, 0, ds_can_send)) + return; + } + + msg = rs->smsg_free; + rs->smsg_free = msg->next; + rs->sqe_avail--; + + ds_format_hdr(&hdr, src); + memcpy((void *) msg, &hdr, hdr.length); + memcpy((void *) msg + hdr.length, buf, len); + sge.addr = (uintptr_t) msg; + sge.length = hdr.length + len; + sge.lkey = rs->conn_dest->qp->smr->lkey; + offset = (uint8_t *) msg - rs->sbuf; + + ds_post_send(rs, &sge, offset); +} + +static void rs_svc_process_rs(struct rsocket *rs) +{ + struct ds_dest *dest, *cur_dest; + struct ds_udp_header *udp_hdr; + union socket_addr addr; + socklen_t addrlen = sizeof addr; + int len, ret; + + ret = recvfrom(rs->udp_sock, svc_buf, sizeof svc_buf, 0, &addr.sa, &addrlen); + if (ret < DS_UDP_IPV4_HDR_LEN) + return; + + udp_hdr = (struct ds_udp_header *) svc_buf; + if (!rs_svc_valid_udp_hdr(udp_hdr, &addr)) + return; + + len = ret - udp_hdr->length; + udp_hdr->tag = ntohl(udp_hdr->tag); + udp_hdr->qpn = ntohl(udp_hdr->qpn) & 0xFFFFFF; + ret = ds_get_dest(rs, &addr.sa, addrlen, &dest); + if (ret) + return; + + if (udp_hdr->op == RS_OP_DATA) { + fastlock_acquire(&rs->slock); + cur_dest = rs->conn_dest; + rs->conn_dest = dest; + ds_send_udp(rs, NULL, 0, 0, RS_OP_CTRL); + rs->conn_dest = cur_dest; + fastlock_release(&rs->slock); + } + + if (!dest->ah || (dest->qpn != udp_hdr->qpn)) + rs_svc_create_ah(rs, dest, udp_hdr->qpn); + + /* to do: handle when dest local ip address doesn't match udp ip */ + if (udp_hdr->op == RS_OP_DATA) { + fastlock_acquire(&rs->slock); + cur_dest = rs->conn_dest; + rs->conn_dest = &dest->qp->dest; + rs_svc_forward(rs, svc_buf + udp_hdr->length, len, &addr); + rs->conn_dest = cur_dest; + fastlock_release(&rs->slock); + } +} + +static void *rs_svc_run(void *arg) +{ + struct rs_svc_msg msg; + int i, ret; + + ret = rs_svc_grow_sets(); + if (ret) { + msg.status = ret; + write(svc_sock[1], &msg, sizeof msg); + return (void *) (uintptr_t) ret; + } + + svc_fds[0].fd = svc_sock[1]; + svc_fds[0].events = POLLIN; + do { + for (i = 0; i <= svc_cnt; i++) + svc_fds[i].revents = 0; + + poll(svc_fds, svc_cnt + 1, -1); + if (svc_fds[0].revents) + rs_svc_process_sock(); + + for (i = 1; i <= svc_cnt; i++) { + if (svc_fds[i].revents) + rs_svc_process_rs(svc_rss[i]); + } + } 
while (svc_cnt >= 1); + + return NULL; +} diff --git a/src/fabric.c b/src/fabric.c new file mode 100644 index 00000000000..dcb0e299392 --- /dev/null +++ b/src/fabric.c @@ -0,0 +1,306 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2006 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2013 Intel Corp., Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#if HAVE_CONFIG_H +# include <config.h> +#endif /* HAVE_CONFIG_H */ + +#include <errno.h> +#include <fcntl.h> +#include <pthread.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/types.h> +#include <unistd.h> + +#include <rdma/fabric.h> +#include <rdma/fi_arch.h> +#include <rdma/fi_atomic.h> +#include <rdma/fi_cm.h> +#include <rdma/fi_domain.h> +#include <rdma/fi_prov.h> +#include <rdma/fi_rdma.h> +#include <rdma/fi_socket.h> +#include <rdma/fi_tagged.h> +#include <rdma/fi_ucma.h> +#include <rdma/fi_umad.h> +#include <rdma/fi_uverbs.h> +#include <rdma/fi_errno.h> +#include "fi.h" + +static pthread_mutex_t mut = PTHREAD_MUTEX_INITIALIZER; +static int init; +static struct fi_prov *prov_head, *prov_tail; + + +const char *fi_sysfs_path(void) +{ + static char *sysfs_path; + char *env = NULL; + + if (sysfs_path) + return sysfs_path; + + /* + * Only follow path passed in through the calling user's + * environment if we're not running SUID. 
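The SUID guard described above is a small but easy-to-miss security pattern: only honor a path override from the environment when the real and effective IDs match, so a set-uid binary cannot be redirected to an attacker-controlled sysfs tree. A generic sketch of the same check follows; the helper name is hypothetical, and glibc's secure_getenv(3) packages a similar test where it is available.

#include <stdlib.h>
#include <unistd.h>

/* Sketch only: return an environment override only when it can be trusted. */
static const char *trusted_getenv(const char *name)
{
        if (getuid() != geteuid() || getgid() != getegid())
                return NULL;
        return getenv(name);
}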
+ */ + if (getuid() == geteuid()) + env = getenv("SYSFS_PATH"); + + if (env) { + int len; + + sysfs_path = strndup(env, FI_PATH_MAX); + len = strlen(sysfs_path); + while (len > 0 && sysfs_path[len - 1] == '/') { + --len; + sysfs_path[len] = '\0'; + } + } else { + sysfs_path = "/sys"; + } + + return sysfs_path; +} + +int fi_read_file(const char *dir, const char *file, char *buf, size_t size) +{ + char *path; + int fd, len; + + if (asprintf(&path, "%s/%s", dir, file) < 0) + return -1; + + fd = open(path, O_RDONLY); + if (fd < 0) { + free(path); + return -1; + } + + len = read(fd, buf, size); + close(fd); + free(path); + + if (len > 0 && buf[len - 1] == '\n') + buf[--len] = '\0'; + + return len; +} + +void fi_register(struct fi_ops_prov *ops) +{ + struct fi_prov *prov; + + prov = calloc(sizeof *prov, 1); + if (!prov) + return; + + prov->ops = ops; + if (prov_tail) + prov_tail->next = prov; + else + prov_head = prov; + prov_tail = prov; +} + +int ucma_init(void); +int fi_init() +{ + int ret = 0; + + pthread_mutex_lock(&mut); + if (init) + goto out; + + ret = uv_init(); + if (ret) + goto out; + + ret = ucma_init(); + if (ret) + goto out; + + init = 1; +out: + pthread_mutex_unlock(&mut); + return ret; +} + +static void __attribute__((constructor)) fi_ini(void) +{ + uv_ini(); + ibv_ini(); + ucma_ini(); + rdma_cm_ini(); + psmx_ini(); + mlx4_ini(); +} + +static void __attribute__((destructor)) fi_fini(void) +{ + mlx4_fini(); + psmx_fini(); + rdma_cm_fini(); + ucma_fini(); + ibv_fini(); + uv_fini(); +} + +int fi_getinfo(char *node, char *service, struct fi_info *hints, + struct fi_info **info) +{ + struct fi_prov *prov; + struct fi_info *tail, *cur; + int ret = -ENOSYS; + + if (!init) + fi_init(); + + *info = tail = NULL; + for (prov = prov_head; prov; prov = prov->next) { + if (!prov->ops->getinfo) + continue; + + ret = prov->ops->getinfo(node, service, hints, &cur); + if (ret) + continue; + + if (!*info) + *info = cur; + else + tail->next = cur; + for (tail = cur; tail->next; tail = tail->next) + ; + } + + return *info ? 
0 : ret; +} + +void __fi_freeinfo(struct fi_info *info) +{ + if (info->src_addr) + free(info->src_addr); + if (info->dst_addr) + free(info->dst_addr); +// if (info->src_canonname) +// free(info->src_canonname); +// if (info->dst_canonname) +// free(info->dst_canonname); + if (info->domain_name) + free(info->domain_name); + if (info->data) + free(info->data); + + free(info); +} + +void fi_freeinfo(struct fi_info *info) +{ + struct fi_prov *prov; + struct fi_info *next; + int ret; + + while (info) { + next = info->next; + for (prov = prov_head; prov && info; prov = prov->next) { + if (!prov->ops->freeinfo) + continue; + + ret = prov->ops->freeinfo(info); + if (!ret) + goto cont; + } + __fi_freeinfo(info); +cont: + info = next; + } +} + +int fi_open(char *name, struct fi_info *info, uint64_t flags, fid_t *fid, void *context) +{ + struct fi_prov *prov; + int ret = -ENOSYS; + + if (!init) + fi_init(); + + for (prov = prov_head; prov; prov = prov->next) { + if (!prov->ops->open) + continue; + + ret = prov->ops->open(name, info, flags, fid, context); + if (!ret) + break; + } + + return ret; +} + +int fi_socket(struct fi_info *info, fid_t *fid, void *context) +{ + struct fi_prov *prov; + int ret = -ENOSYS; + + if (!init) + fi_init(); + + for (prov = prov_head; prov; prov = prov->next) { + if (!prov->ops->socket) + continue; + + ret = prov->ops->socket(info, fid, context); + if (!ret) + break; + } + + return ret; +} + +#define FI_ERRNO_OFFSET 256 + +static const char *const errstr[] = { + [FI_EOTHER - FI_ERRNO_OFFSET] = "Unspecified error", + [FI_ETOOSMALL - FI_ERRNO_OFFSET] = "Provided buffer is too small" + +}; + +const char *fi_strerror(int errnum) +{ + if (errnum < FI_ERRNO_OFFSET) + return strerror(errnum); + else + return errstr[errnum - FI_ERRNO_OFFSET]; +} diff --git a/src/libfabric.map b/src/libfabric.map new file mode 100644 index 00000000000..75c1a048cd2 --- /dev/null +++ b/src/libfabric.map @@ -0,0 +1,38 @@ +FABRIC_1.0 { + global: + fi_getinfo; + fi_freeinfo; + fi_open; + fi_socket; + fi_strerror; + rsocket; + rbind; + rlisten; + raccept; + rconnect; + rshutdown; + rclose; + rrecv; + rrecvfrom; + rrecvmsg; + rsend; + rsendto; + rsendmsg; + rread; + rreadv; + rwrite; + rwritev; + rpoll; + rselect; + rgetpeername; + rgetsockname; + rsetsockopt; + rgetsockopt; + rfcntl; + rpoll; + rselect; + riomap; + riounmap; + riowrite; + local: *; +}; diff --git a/src/ucma.c b/src/ucma.c new file mode 100644 index 00000000000..1fc368bd419 --- /dev/null +++ b/src/ucma.c @@ -0,0 +1,497 @@ +/* + * Copyright (c) 2005-2012 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#if HAVE_CONFIG_H +# include <config.h> +#endif /* HAVE_CONFIG_H */ + +#include <stdlib.h> +#include <string.h> +#include <glob.h> +#include <stdio.h> +#include <fcntl.h> +#include <errno.h> +#include <stdint.h> +#include <poll.h> +#include <unistd.h> +#include <pthread.h> +#include <endian.h> +#include <byteswap.h> +#include <stddef.h> +#include <netdb.h> +#include <syslog.h> + +#include <rdma/fabric.h> +#include <rdma/fi_prov.h> +#include <rdma/fi_ucma.h> +#include "fi.h" + + +static int ucma_abi_ver = RDMA_USER_CM_MAX_ABI_VERSION; + +#define UCMA_INIT_CMD(req, req_size, op) \ +do { \ + (req)->cmd = UCMA_CMD_##op; \ + (req)->in = (req_size) - sizeof(struct ucma_abi_cmd_hdr); \ + (req)->out = 0; \ +} while (0) + +#define UCMA_INIT_CMD_RESP(req, req_size, op, resp, resp_size) \ +do { \ + (req)->cmd = UCMA_CMD_##op; \ + (req)->in = (req_size) - sizeof(struct ucma_abi_cmd_hdr); \ + (req)->out = (resp_size); \ + (req)->response = (uintptr_t) (resp); \ +} while (0) + +static int ucma_open(const char *name, struct fi_info *info, uint64_t flags, + fid_t *fid, void *context); + +static struct fi_ops_prov ucma_prov_ops = { + .size = sizeof(struct fi_ops_prov), + .getinfo = NULL, + .freeinfo = NULL, + .socket = NULL, + .open = ucma_open +}; + + +static int ucma_abi_version(void) +{ + char value[8]; + + if ((fi_read_file(fi_sysfs_path(), "class/misc/rdma_cm/abi_version", + value, sizeof value) < 0) && + (fi_read_file(fi_sysfs_path(), "class/infiniband_ucma/abi_version", + value, sizeof value) < 0)) { + return -ENOSYS; + } + + ucma_abi_ver = strtol(value, NULL, 10); + if (ucma_abi_ver < RDMA_USER_CM_MIN_ABI_VERSION || + ucma_abi_ver > RDMA_USER_CM_MAX_ABI_VERSION) { + fprintf(stderr, PFX "ucma kernel ABI version %d not supported (%d).\n", + ucma_abi_ver, RDMA_USER_CM_MAX_ABI_VERSION); + return -ENOSYS; + } + + return 0; +} + +int ucma_init(void) +{ + return ucma_abi_version(); +} + +void ucma_ini(void) +{ + fi_register(&ucma_prov_ops); +} + +void ucma_fini(void) +{ +} + +static int __ucma_create_id(fid_t fid, + struct ucma_abi_create_id *cmd, size_t cmd_size, + struct ucma_abi_create_id_resp *resp, size_t resp_size) +{ + struct fid_ucma *ucma; + + ucma = container_of(fid, struct fid_ucma, fid); + UCMA_INIT_CMD_RESP(cmd, cmd_size, CREATE_ID, resp, resp_size); + if (write(ucma->fd, cmd, cmd_size) != cmd_size) + return -errno; + + VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp); + return 0; +} + +static int __ucma_destroy_id(fid_t fid, + struct ucma_abi_destroy_id *cmd, size_t cmd_size, + struct ucma_abi_destroy_id_resp *resp, size_t resp_size) +{ + struct fid_ucma *ucma; + + ucma = container_of(fid, struct fid_ucma, fid); + UCMA_INIT_CMD_RESP(cmd, cmd_size, DESTROY_ID, resp, resp_size); + if (write(ucma->fd, cmd, cmd_size) != cmd_size) + return -errno; + + VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp); + return 0; +} + +static int __ucma_bind_ip(fid_t fid, + struct ucma_abi_bind_ip *cmd, size_t cmd_size) +{ + struct fid_ucma *ucma; + + ucma = container_of(fid, struct fid_ucma, fid); + UCMA_INIT_CMD(cmd, 
cmd_size, BIND_IP); + if (write(ucma->fd, cmd, cmd_size) != cmd_size) + return -errno; + + return 0; +} + +static int __ucma_bind(fid_t fid, + struct ucma_abi_bind *cmd, size_t cmd_size) +{ + struct fid_ucma *ucma; + + ucma = container_of(fid, struct fid_ucma, fid); + UCMA_INIT_CMD(cmd, cmd_size, BIND); + if (write(ucma->fd, cmd, cmd_size) != cmd_size) + return -errno; + + return 0; +} +static int __ucma_resolve_ip(fid_t fid, + struct ucma_abi_resolve_ip *cmd, size_t cmd_size) +{ + struct fid_ucma *ucma; + + ucma = container_of(fid, struct fid_ucma, fid); + UCMA_INIT_CMD(cmd, cmd_size, RESOLVE_IP); + if (write(ucma->fd, cmd, cmd_size) != cmd_size) + return -errno; + + return 0; +} + +static int __ucma_resolve_addr(fid_t fid, + struct ucma_abi_resolve_addr *cmd, size_t cmd_size) +{ + struct fid_ucma *ucma; + + ucma = container_of(fid, struct fid_ucma, fid); + UCMA_INIT_CMD(cmd, cmd_size, RESOLVE_ADDR); + if (write(ucma->fd, cmd, cmd_size) != cmd_size) + return -errno; + + return 0; +} + +static int __ucma_resolve_route(fid_t fid, + struct ucma_abi_resolve_route *cmd, size_t cmd_size) +{ + struct fid_ucma *ucma; + + ucma = container_of(fid, struct fid_ucma, fid); + UCMA_INIT_CMD(cmd, cmd_size, RESOLVE_ROUTE); + if (write(ucma->fd, cmd, cmd_size) != cmd_size) + return -errno; + + return 0; +} + +static int __ucma_query_route(fid_t fid, + struct ucma_abi_query *cmd, size_t cmd_size, + struct ucma_abi_query_route_resp *resp, size_t resp_size) +{ + struct fid_ucma *ucma; + + ucma = container_of(fid, struct fid_ucma, fid); + UCMA_INIT_CMD_RESP(cmd, cmd_size, QUERY_ROUTE, resp, resp_size); + if (write(ucma->fd, cmd, cmd_size) != cmd_size) + return -errno; + + VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp); + return 0; +} + +static int __ucma_query(fid_t fid, + struct ucma_abi_query *cmd, size_t cmd_size, + void *resp, size_t resp_size) +{ + struct fid_ucma *ucma; + + ucma = container_of(fid, struct fid_ucma, fid); + UCMA_INIT_CMD_RESP(cmd, cmd_size, QUERY, resp, resp_size); + if (write(ucma->fd, cmd, cmd_size) != cmd_size) + return -errno; + + VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp); + return 0; +} + +static int __ucma_connect(fid_t fid, + struct ucma_abi_connect *cmd, size_t cmd_size) +{ + struct fid_ucma *ucma; + + ucma = container_of(fid, struct fid_ucma, fid); + UCMA_INIT_CMD(cmd, cmd_size, CONNECT); + if (write(ucma->fd, cmd, cmd_size) != cmd_size) + return -errno; + + return 0; +} + +static int __ucma_listen(fid_t fid, + struct ucma_abi_listen *cmd, size_t cmd_size) +{ + struct fid_ucma *ucma; + + ucma = container_of(fid, struct fid_ucma, fid); + UCMA_INIT_CMD(cmd, cmd_size, LISTEN); + if (write(ucma->fd, cmd, cmd_size) != cmd_size) + return -errno; + + return 0; +} + +static int __ucma_accept(fid_t fid, + struct ucma_abi_accept *cmd, size_t cmd_size) +{ + struct fid_ucma *ucma; + + ucma = container_of(fid, struct fid_ucma, fid); + UCMA_INIT_CMD(cmd, cmd_size, ACCEPT); + if (write(ucma->fd, cmd, cmd_size) != cmd_size) + return -errno; + + return 0; +} + +static int __ucma_reject(fid_t fid, + struct ucma_abi_reject *cmd, size_t cmd_size) +{ + struct fid_ucma *ucma; + + ucma = container_of(fid, struct fid_ucma, fid); + UCMA_INIT_CMD(cmd, cmd_size, REJECT); + if (write(ucma->fd, cmd, cmd_size) != cmd_size) + return -errno; + + return 0; +} + +static int __ucma_disconnect(fid_t fid, + struct ucma_abi_disconnect *cmd, size_t cmd_size) +{ + struct fid_ucma *ucma; + + ucma = container_of(fid, struct fid_ucma, fid); + UCMA_INIT_CMD(cmd, cmd_size, DISCONNECT); + if (write(ucma->fd, cmd, 
cmd_size) != cmd_size) + return -errno; + + return 0; +} + +static int __ucma_init_qp_attr(fid_t fid, + struct ucma_abi_init_qp_attr *cmd, size_t cmd_size, + struct ibv_kern_qp_attr *resp, size_t resp_size) +{ + struct fid_ucma *ucma; + + ucma = container_of(fid, struct fid_ucma, fid); + UCMA_INIT_CMD_RESP(cmd, cmd_size, INIT_QP_ATTR, resp, resp_size); + if (write(ucma->fd, cmd, cmd_size) != cmd_size) + return -errno; + + VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp); + return 0; +} + +static int __ucma_get_event(fid_t fid, + struct ucma_abi_get_event *cmd, size_t cmd_size, + struct ucma_abi_event_resp *resp, size_t resp_size) +{ + struct fid_ucma *ucma; + + ucma = container_of(fid, struct fid_ucma, fid); + UCMA_INIT_CMD_RESP(cmd, cmd_size, GET_EVENT, resp, resp_size); + if (write(ucma->fd, cmd, cmd_size) != cmd_size) + return -errno; + + VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp); + return 0; +} + +static int __ucma_set_option(fid_t fid, + struct ucma_abi_set_option *cmd, size_t cmd_size) +{ + struct fid_ucma *ucma; + + ucma = container_of(fid, struct fid_ucma, fid); + UCMA_INIT_CMD(cmd, cmd_size, SET_OPTION); + if (write(ucma->fd, cmd, cmd_size) != cmd_size) + return -errno; + + return 0; +} + +static int __ucma_notify(fid_t fid, + struct ucma_abi_notify *cmd, size_t cmd_size) +{ + struct fid_ucma *ucma; + + ucma = container_of(fid, struct fid_ucma, fid); + UCMA_INIT_CMD(cmd, cmd_size, NOTIFY); + if (write(ucma->fd, cmd, cmd_size) != cmd_size) + return -errno; + + return 0; +} + +static int __ucma_join_ip_mcast(fid_t fid, + struct ucma_abi_join_ip_mcast *cmd, size_t cmd_size, + struct ucma_abi_create_id_resp *resp, size_t resp_size) +{ + struct fid_ucma *ucma; + + ucma = container_of(fid, struct fid_ucma, fid); + UCMA_INIT_CMD_RESP(cmd, cmd_size, JOIN_IP_MCAST, resp, resp_size); + if (write(ucma->fd, cmd, cmd_size) != cmd_size) + return -errno; + + VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp); + return 0; +} + +static int __ucma_join_mcast(fid_t fid, + struct ucma_abi_join_mcast *cmd, size_t cmd_size, + struct ucma_abi_create_id_resp *resp, size_t resp_size) +{ + struct fid_ucma *ucma; + + ucma = container_of(fid, struct fid_ucma, fid); + UCMA_INIT_CMD_RESP(cmd, cmd_size, JOIN_MCAST, resp, resp_size); + if (write(ucma->fd, cmd, cmd_size) != cmd_size) + return -errno; + + VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp); + return 0; +} + +static int __ucma_leave_mcast(fid_t fid, + struct ucma_abi_destroy_id *cmd, size_t cmd_size, + struct ucma_abi_destroy_id_resp *resp, size_t resp_size) +{ + struct fid_ucma *ucma; + + ucma = container_of(fid, struct fid_ucma, fid); + UCMA_INIT_CMD_RESP(cmd, cmd_size, LEAVE_MCAST, resp, resp_size); + if (write(ucma->fd, cmd, cmd_size) != cmd_size) + return -errno; + + VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp); + return 0; +} + +static int __ucma_migrate_id(fid_t fid, + struct ucma_abi_migrate_id *cmd, size_t cmd_size, + struct ucma_abi_migrate_resp *resp, size_t resp_size) +{ + struct fid_ucma *ucma; + + ucma = container_of(fid, struct fid_ucma, fid); + UCMA_INIT_CMD_RESP(cmd, cmd_size, MIGRATE_ID, resp, resp_size); + if (write(ucma->fd, cmd, cmd_size) != cmd_size) + return -errno; + + VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp); + return 0; +} + + +static struct fi_ops_ucma ops_ucma = { + .size = sizeof(struct fi_ops_ucma), + .create_id = __ucma_create_id, + .destroy_id = __ucma_destroy_id, + .bind_ip = __ucma_bind_ip, + .bind = __ucma_bind, + .resolve_ip = __ucma_resolve_ip, + .resolve_addr = __ucma_resolve_addr, + .resolve_route = 
__ucma_resolve_route, + .query_route = __ucma_query_route, + .query = __ucma_query, + .connect = __ucma_connect, + .listen = __ucma_listen, + .accept = __ucma_accept, + .reject = __ucma_reject, + .disconnect = __ucma_disconnect, + .init_qp_attr = __ucma_init_qp_attr, + .get_event = __ucma_get_event, + .set_option = __ucma_set_option, + .notify = __ucma_notify, + .join_ip_mcast = __ucma_join_ip_mcast, + .join_mcast = __ucma_join_mcast, + .leave_mcast = __ucma_leave_mcast, + .migrate_id = __ucma_migrate_id +}; + +static int ucma_close(fid_t fid) +{ + struct fid_ucma *ucma; + + ucma = container_of(fid, struct fid_ucma, fid); + close(ucma->fd); + free(ucma); + return 0; +} + +static struct fi_ops ops_fi = { + .size = sizeof(struct fi_ops), + .close = ucma_close +}; + +static int ucma_open(const char *name, struct fi_info *info, uint64_t flags, + fid_t *fid, void *context) +{ + struct fid_ucma *ucma; + + if (!name || strcmp(FI_UCMA_INTERFACE, name)) + return -ENOSYS; + + ucma = calloc(1, sizeof(*ucma)); + if (!ucma) + return -ENOMEM; + + ucma->fd = open("/dev/infiniband/rdma_cm", O_RDWR | O_CLOEXEC); + if (ucma->fd < 0) { + free(ucma); + return -errno; + } + + ucma->fid.fclass = FID_CLASS_INTERFACE; + ucma->fid.size = sizeof(*ucma); + ucma->fid.ops = &ops_fi; + ucma->fid.context = context; + ucma->ops = &ops_ucma; + + *fid = &ucma->fid; + return 0; +} diff --git a/src/uverbs.c b/src/uverbs.c new file mode 100644 index 00000000000..e3381d335b6 --- /dev/null +++ b/src/uverbs.c @@ -0,0 +1,710 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2006 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2013 Intel Corp., Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#if HAVE_CONFIG_H +# include <config.h> +#endif /* HAVE_CONFIG_H */ + +#include <dirent.h> +#include <errno.h> +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <strings.h> +#include <unistd.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <rdma/fabric.h> +#include <rdma/fi_prov.h> +#include <rdma/fi_uverbs.h> +#include "fi.h" + + +int uv_abi_ver; +struct uv_dev *udev_head, *udev_tail; + +#define UV_INIT_CMD(cmd, size, opcode) \ + do { \ + (cmd)->command = UVERBS_CMD_##opcode; \ + (cmd)->in_words = (size) / 4; \ + (cmd)->out_words = 0; \ + } while (0) + +#define UV_INIT_CMD_RESP(cmd, size, opcode, out, outsize) \ + do { \ + (cmd)->command = UVERBS_CMD_##opcode; \ + (cmd)->in_words = (size) / 4; \ + (cmd)->out_words = (outsize) / 4; \ + (cmd)->response = (uintptr_t) (out); \ + } while (0) + +static int uv_open(const char *name, struct fi_info *info, uint64_t flags, + fid_t *fid, void *context); + +static struct fi_ops_prov uv_prov_ops = { + .size = sizeof(struct fi_ops_prov), + .getinfo = NULL, + .freeinfo = NULL, + .socket = NULL, + .open = uv_open +}; + +static int uv_abi_version(void) +{ + char value[8]; + + if (fi_read_file(fi_sysfs_path(), "class/infiniband_verbs/abi_version", + value, sizeof value) < 0) { + return -ENOSYS; + } + + uv_abi_ver = strtol(value, NULL, 10); + if (uv_abi_ver < UVERBS_MIN_ABI_VERSION || + uv_abi_ver > UVERBS_MAX_ABI_VERSION) { + fprintf(stderr, PFX "uverbs kernel ABI version %d not supported (%d).\n", + uv_abi_ver, UVERBS_MAX_ABI_VERSION); + return -ENOSYS; + } + + return 0; +} + +int uv_init(void) +{ + char class_path[FI_PATH_MAX]; + DIR *class_dir; + struct dirent *dent; + struct uv_dev *udev = NULL; + struct stat buf; + int ret; + + ret = uv_abi_version(); + if (ret) + return ret; + + snprintf(class_path, sizeof class_path, "%s/class/infiniband_verbs", + fi_sysfs_path()); + + class_dir = opendir(class_path); + if (!class_dir) + return -ENOSYS; + + while ((dent = readdir(class_dir))) { + if (dent->d_name[0] == '.') + continue; + + if (!udev) + udev = calloc(sizeof *udev, 1); + if (!udev) { + ret = -ENOMEM; + break; + } + + snprintf(udev->sysfs_path, sizeof udev->sysfs_path, + "%s/%s", class_path, dent->d_name); + + if (stat(udev->sysfs_path, &buf)) { + fprintf(stderr, PFX "Warning: couldn't stat '%s'.\n", + udev->sysfs_path); + continue; + } + + if (!S_ISDIR(buf.st_mode)) + continue; + + snprintf(udev->sysfs_name, sizeof udev->sysfs_name, "%s", dent->d_name); + + if (fi_read_file(udev->sysfs_path, "ibdev", udev->dev_name, + sizeof udev->dev_name) < 0) { + fprintf(stderr, PFX "Warning: no dev class attr for '%s'.\n", + dent->d_name); + continue; + } + + snprintf(udev->dev_path, sizeof udev->dev_path, + "%s/class/infiniband/%s", fi_sysfs_path(), udev->dev_name); + + if (udev_tail) + udev_tail->next = udev; + else + udev_head = udev; + udev_tail = udev; + udev = NULL; + } + + if (udev) + free(udev); + + closedir(class_dir); + return ret; +} + +void uv_ini(void) +{ + fi_register(&uv_prov_ops); +} + +void uv_fini(void) +{ +} + +static int __uv_get_context(fid_t fid, + struct ibv_get_context *cmd, size_t cmd_size, + struct ibv_get_context_resp *resp, size_t resp_size) +{ + struct fid_uverbs *uv; + + uv = container_of(fid, struct fid_uverbs, fid); + UV_INIT_CMD_RESP(cmd, cmd_size, GET_CONTEXT, resp, resp_size); + if (write(uv->fd, cmd, cmd_size) != cmd_size) + return -errno; + + VALGRIND_MAKE_MEM_DEFINED(resp, resp_size); + return 0; +} + 
+static int __uv_query_device(fid_t fid, + struct ibv_query_device *cmd, size_t cmd_size, + struct ibv_query_device_resp *resp, size_t resp_size) +{ + struct fid_uverbs *uv; + + uv = container_of(fid, struct fid_uverbs, fid); + UV_INIT_CMD_RESP(cmd, cmd_size, QUERY_DEVICE, resp, resp_size); + if (write(uv->fd, cmd, cmd_size) != cmd_size) + return -errno; + + VALGRIND_MAKE_MEM_DEFINED(resp, resp_size); + return 0; +} + +static int __uv_query_port(fid_t fid, + struct ibv_query_port *cmd, size_t cmd_size, + struct ibv_query_port_resp *resp, size_t resp_size) +{ + struct fid_uverbs *uv; + + uv = container_of(fid, struct fid_uverbs, fid); + UV_INIT_CMD_RESP(cmd, cmd_size, QUERY_PORT, resp, resp_size); + if (write(uv->fd, cmd, cmd_size) != cmd_size) + return -errno; + + VALGRIND_MAKE_MEM_DEFINED(resp, resp_size); + return 0; +} + +static int __uv_alloc_pd(fid_t fid, + struct ibv_alloc_pd *cmd, size_t cmd_size, + struct ibv_alloc_pd_resp *resp, size_t resp_size) +{ + struct fid_uverbs *uv; + + uv = container_of(fid, struct fid_uverbs, fid); + UV_INIT_CMD_RESP(cmd, cmd_size, ALLOC_PD, resp, resp_size); + if (write(uv->fd, cmd, cmd_size) != cmd_size) + return -errno; + + VALGRIND_MAKE_MEM_DEFINED(resp, resp_size); + return 0; +} + +static int __uv_dealloc_pd(fid_t fid, + struct ibv_dealloc_pd *cmd, size_t cmd_size) +{ + struct fid_uverbs *uv; + + uv = container_of(fid, struct fid_uverbs, fid); + UV_INIT_CMD(cmd, cmd_size, DEALLOC_PD); + if (write(uv->fd, cmd, cmd_size) != cmd_size) + return -errno; + return 0; +} + +static int __uv_open_xrcd(fid_t fid, + struct ibv_open_xrcd *cmd, size_t cmd_size, + struct ibv_open_xrcd_resp *resp, size_t resp_size) +{ + struct fid_uverbs *uv; + + uv = container_of(fid, struct fid_uverbs, fid); + UV_INIT_CMD_RESP(cmd, cmd_size, OPEN_XRCD, resp, resp_size); + if (write(uv->fd, cmd, cmd_size) != cmd_size) + return -errno; + + VALGRIND_MAKE_MEM_DEFINED(resp, resp_size); + return 0; +} + +static int __uv_close_xrcd(fid_t fid, + struct ibv_close_xrcd *cmd, size_t cmd_size) +{ + struct fid_uverbs *uv; + + uv = container_of(fid, struct fid_uverbs, fid); + UV_INIT_CMD(cmd, cmd_size, CLOSE_XRCD); + if (write(uv->fd, cmd, cmd_size) != cmd_size) + return -errno; + return 0; +} + +static int __uv_reg_mr(fid_t fid, + struct ibv_reg_mr *cmd, size_t cmd_size, + struct ibv_reg_mr_resp *resp, size_t resp_size) +{ + struct fid_uverbs *uv; + + uv = container_of(fid, struct fid_uverbs, fid); + UV_INIT_CMD_RESP(cmd, cmd_size, REG_MR, resp, resp_size); + if (write(uv->fd, cmd, cmd_size) != cmd_size) + return -errno; + + VALGRIND_MAKE_MEM_DEFINED(resp, resp_size); + return 0; +} + +static int __uv_dereg_mr(fid_t fid, + struct ibv_dereg_mr *cmd, size_t cmd_size) +{ + struct fid_uverbs *uv; + + uv = container_of(fid, struct fid_uverbs, fid); + UV_INIT_CMD(cmd, cmd_size, DEREG_MR); + if (write(uv->fd, cmd, cmd_size) != cmd_size) + return -errno; + return 0; +} + +static int __uv_create_comp_channel(fid_t fid, + struct ibv_create_comp_channel *cmd, size_t cmd_size, + struct ibv_create_comp_channel_resp *resp, size_t resp_size) +{ + struct fid_uverbs *uv; + + uv = container_of(fid, struct fid_uverbs, fid); + UV_INIT_CMD_RESP(cmd, cmd_size, CREATE_COMP_CHANNEL, resp, resp_size); + if (write(uv->fd, cmd, cmd_size) != cmd_size) + return -errno; + + VALGRIND_MAKE_MEM_DEFINED(resp, resp_size); + return 0; +} + +static int __uv_create_cq(fid_t fid, + struct ibv_create_cq *cmd, size_t cmd_size, + struct ibv_create_cq_resp *resp, size_t resp_size) +{ + struct fid_uverbs *uv; + + uv = 
container_of(fid, struct fid_uverbs, fid); + UV_INIT_CMD_RESP(cmd, cmd_size, CREATE_CQ, resp, resp_size); + if (write(uv->fd, cmd, cmd_size) != cmd_size) + return -errno; + + VALGRIND_MAKE_MEM_DEFINED(resp, resp_size); + return 0; +} + +static int __uv_poll_cq(fid_t fid, + struct ibv_poll_cq *cmd, size_t cmd_size, + struct ibv_poll_cq_resp *resp, size_t resp_size) +{ + struct fid_uverbs *uv; + + uv = container_of(fid, struct fid_uverbs, fid); + UV_INIT_CMD_RESP(cmd, cmd_size, POLL_CQ, resp, resp_size); + if (write(uv->fd, cmd, cmd_size) != cmd_size) + return -errno; + + VALGRIND_MAKE_MEM_DEFINED(resp, resp_size); + return 0; +} + +static int __uv_req_notify_cq(fid_t fid, + struct ibv_req_notify_cq *cmd, size_t cmd_size) +{ + struct fid_uverbs *uv; + + uv = container_of(fid, struct fid_uverbs, fid); + UV_INIT_CMD(cmd, cmd_size, REQ_NOTIFY_CQ); + if (write(uv->fd, cmd, cmd_size) != cmd_size) + return -errno; + return 0; +} + +static int __uv_resize_cq(fid_t fid, + struct ibv_resize_cq *cmd, size_t cmd_size, + struct ibv_resize_cq_resp *resp, size_t resp_size) +{ + struct fid_uverbs *uv; + + uv = container_of(fid, struct fid_uverbs, fid); + UV_INIT_CMD_RESP(cmd, cmd_size, RESIZE_CQ, resp, resp_size); + if (write(uv->fd, cmd, cmd_size) != cmd_size) + return -errno; + + VALGRIND_MAKE_MEM_DEFINED(resp, resp_size); + return 0; +} + +static int __uv_destroy_cq(fid_t fid, + struct ibv_destroy_cq *cmd, size_t cmd_size, + struct ibv_destroy_cq_resp *resp, size_t resp_size) +{ + struct fid_uverbs *uv; + + uv = container_of(fid, struct fid_uverbs, fid); + UV_INIT_CMD_RESP(cmd, cmd_size, DESTROY_CQ, resp, resp_size); + if (write(uv->fd, cmd, cmd_size) != cmd_size) + return -errno; + + VALGRIND_MAKE_MEM_DEFINED(resp, resp_size); + return 0; +} + +static int __uv_create_srq(fid_t fid, + struct ibv_create_srq *cmd, size_t cmd_size, + struct ibv_create_srq_resp *resp, size_t resp_size) +{ + struct fid_uverbs *uv; + + uv = container_of(fid, struct fid_uverbs, fid); + UV_INIT_CMD_RESP(cmd, cmd_size, CREATE_SRQ, resp, resp_size); + if (write(uv->fd, cmd, cmd_size) != cmd_size) + return -errno; + + VALGRIND_MAKE_MEM_DEFINED(resp, resp_size); + return 0; +} + +static int __uv_modify_srq(fid_t fid, + struct ibv_modify_srq *cmd, size_t cmd_size) +{ + struct fid_uverbs *uv; + + uv = container_of(fid, struct fid_uverbs, fid); + UV_INIT_CMD(cmd, cmd_size, MODIFY_SRQ); + if (write(uv->fd, cmd, cmd_size) != cmd_size) + return -errno; + return 0; +} + +static int __uv_query_srq(fid_t fid, + struct ibv_query_srq *cmd, size_t cmd_size, + struct ibv_query_srq_resp *resp, size_t resp_size) +{ + struct fid_uverbs *uv; + + uv = container_of(fid, struct fid_uverbs, fid); + UV_INIT_CMD_RESP(cmd, cmd_size, QUERY_SRQ, resp, resp_size); + if (write(uv->fd, cmd, cmd_size) != cmd_size) + return -errno; + + VALGRIND_MAKE_MEM_DEFINED(resp, resp_size); + return 0; +} + +static int __uv_destroy_srq(fid_t fid, + struct ibv_destroy_srq *cmd, size_t cmd_size, + struct ibv_destroy_srq_resp *resp, size_t resp_size) +{ + struct fid_uverbs *uv; + + uv = container_of(fid, struct fid_uverbs, fid); + UV_INIT_CMD_RESP(cmd, cmd_size, DESTROY_SRQ, resp, resp_size); + if (write(uv->fd, cmd, cmd_size) != cmd_size) + return -errno; + + VALGRIND_MAKE_MEM_DEFINED(resp, resp_size); + return 0; +} + +static int __uv_create_qp(fid_t fid, + struct ibv_create_qp *cmd, size_t cmd_size, + struct ibv_create_qp_resp *resp, size_t resp_size) +{ + struct fid_uverbs *uv; + + uv = container_of(fid, struct fid_uverbs, fid); + UV_INIT_CMD_RESP(cmd, cmd_size, CREATE_QP, 
resp, resp_size); + if (write(uv->fd, cmd, cmd_size) != cmd_size) + return -errno; + + VALGRIND_MAKE_MEM_DEFINED(resp, resp_size); + return 0; +} + +static int __uv_open_qp(fid_t fid, + struct ibv_open_qp *cmd, size_t cmd_size, + struct ibv_create_qp_resp *resp, size_t resp_size) +{ + struct fid_uverbs *uv; + + uv = container_of(fid, struct fid_uverbs, fid); + UV_INIT_CMD_RESP(cmd, cmd_size, OPEN_QP, resp, resp_size); + if (write(uv->fd, cmd, cmd_size) != cmd_size) + return -errno; + + VALGRIND_MAKE_MEM_DEFINED(resp, resp_size); + return 0; +} + +static int __uv_query_qp(fid_t fid, + struct ibv_query_qp *cmd, size_t cmd_size, + struct ibv_query_qp_resp *resp, size_t resp_size) +{ + struct fid_uverbs *uv; + + uv = container_of(fid, struct fid_uverbs, fid); + UV_INIT_CMD_RESP(cmd, cmd_size, QUERY_QP, resp, resp_size); + if (write(uv->fd, cmd, cmd_size) != cmd_size) + return -errno; + + VALGRIND_MAKE_MEM_DEFINED(resp, resp_size); + return 0; +} + +static int __uv_modify_qp(fid_t fid, + struct ibv_modify_qp *cmd, size_t cmd_size) +{ + struct fid_uverbs *uv; + + uv = container_of(fid, struct fid_uverbs, fid); + UV_INIT_CMD(cmd, cmd_size, MODIFY_QP); + if (write(uv->fd, cmd, cmd_size) != cmd_size) + return -errno; + return 0; +} + +static int __uv_destroy_qp(fid_t fid, + struct ibv_destroy_qp *cmd, size_t cmd_size, + struct ibv_destroy_qp_resp *resp, size_t resp_size) +{ + struct fid_uverbs *uv; + + uv = container_of(fid, struct fid_uverbs, fid); + UV_INIT_CMD_RESP(cmd, cmd_size, DESTROY_QP, resp, resp_size); + if (write(uv->fd, cmd, cmd_size) != cmd_size) + return -errno; + + VALGRIND_MAKE_MEM_DEFINED(resp, resp_size); + return 0; +} + +static int __uv_post_send(fid_t fid, + struct ibv_post_send *cmd, size_t cmd_size, + struct ibv_post_send_resp *resp, size_t resp_size) +{ + struct fid_uverbs *uv; + + uv = container_of(fid, struct fid_uverbs, fid); + UV_INIT_CMD_RESP(cmd, cmd_size, POST_SEND, resp, resp_size); + if (write(uv->fd, cmd, cmd_size) != cmd_size) + return -errno; + + VALGRIND_MAKE_MEM_DEFINED(resp, resp_size); + return 0; +} + +static int __uv_post_recv(fid_t fid, + struct ibv_post_recv *cmd, size_t cmd_size, + struct ibv_post_recv_resp *resp, size_t resp_size) +{ + struct fid_uverbs *uv; + + uv = container_of(fid, struct fid_uverbs, fid); + UV_INIT_CMD_RESP(cmd, cmd_size, POST_RECV, resp, resp_size); + if (write(uv->fd, cmd, cmd_size) != cmd_size) + return -errno; + + VALGRIND_MAKE_MEM_DEFINED(resp, resp_size); + return 0; +} + +static int __uv_post_srq_recv(fid_t fid, + struct ibv_post_srq_recv *cmd, size_t cmd_size, + struct ibv_post_srq_recv_resp *resp, size_t resp_size) +{ + struct fid_uverbs *uv; + + uv = container_of(fid, struct fid_uverbs, fid); + UV_INIT_CMD_RESP(cmd, cmd_size, POST_SRQ_RECV, resp, resp_size); + if (write(uv->fd, cmd, cmd_size) != cmd_size) + return -errno; + + VALGRIND_MAKE_MEM_DEFINED(resp, resp_size); + return 0; +} + +static int __uv_create_ah(fid_t fid, + struct ibv_create_ah *cmd, size_t cmd_size, + struct ibv_create_ah_resp *resp, size_t resp_size) +{ + struct fid_uverbs *uv; + + uv = container_of(fid, struct fid_uverbs, fid); + UV_INIT_CMD_RESP(cmd, cmd_size, CREATE_AH, resp, resp_size); + if (write(uv->fd, cmd, cmd_size) != cmd_size) + return -errno; + + VALGRIND_MAKE_MEM_DEFINED(resp, resp_size); + return 0; +} + +static int __uv_destroy_ah(fid_t fid, + struct ibv_destroy_ah *cmd, size_t cmd_size) +{ + struct fid_uverbs *uv; + + uv = container_of(fid, struct fid_uverbs, fid); + UV_INIT_CMD(cmd, cmd_size, DESTROY_AH); + if (write(uv->fd, cmd, 
cmd_size) != cmd_size) + return -errno; + return 0; +} + +static int __uv_attach_mcast(fid_t fid, + struct ibv_attach_mcast *cmd, size_t cmd_size) +{ + struct fid_uverbs *uv; + + uv = container_of(fid, struct fid_uverbs, fid); + UV_INIT_CMD(cmd, cmd_size, ATTACH_MCAST); + if (write(uv->fd, cmd, cmd_size) != cmd_size) + return -errno; + return 0; +} + +static int __uv_detach_mcast(fid_t fid, + struct ibv_detach_mcast *cmd, size_t cmd_size) +{ + struct fid_uverbs *uv; + + uv = container_of(fid, struct fid_uverbs, fid); + UV_INIT_CMD(cmd, cmd_size, DETACH_MCAST); + if (write(uv->fd, cmd, cmd_size) != cmd_size) + return -errno; + return 0; +} + +static struct fi_ops_uverbs ops_uv = { + .size = sizeof(struct fi_ops_uverbs), + .get_context = __uv_get_context, + .query_device = __uv_query_device, + .query_port = __uv_query_port, + .alloc_pd = __uv_alloc_pd, + .dealloc_pd = __uv_dealloc_pd, + .open_xrcd = __uv_open_xrcd, + .close_xrcd = __uv_close_xrcd, + .reg_mr = __uv_reg_mr, + .dereg_mr = __uv_dereg_mr, + .create_comp_channel = __uv_create_comp_channel, + .create_cq = __uv_create_cq, + .poll_cq = __uv_poll_cq, + .req_notify_cq = __uv_req_notify_cq, + .resize_cq = __uv_resize_cq, + .destroy_cq = __uv_destroy_cq, + .create_srq = __uv_create_srq, + .modify_srq = __uv_modify_srq, + .query_srq = __uv_query_srq, + .destroy_srq = __uv_destroy_srq, + .create_qp = __uv_create_qp, + .open_qp = __uv_open_qp, + .query_qp = __uv_query_qp, + .modify_qp = __uv_modify_qp, + .destroy_qp = __uv_destroy_qp, + .post_send = __uv_post_send, + .post_recv = __uv_post_recv, + .post_srq_recv = __uv_post_srq_recv, + .create_ah = __uv_create_ah, + .destroy_ah = __uv_destroy_ah, + .attach_mcast = __uv_attach_mcast, + .detach_mcast = __uv_detach_mcast +}; + +static int uv_close(fid_t fid) +{ + struct fid_uverbs *uv; + + uv = container_of(fid, struct fid_uverbs, fid); + close(uv->fd); + free(uv); + return 0; +} + +static struct fi_ops ops_fi = { + .size = sizeof(struct fi_ops), + .close = uv_close +}; + +static int uv_open(const char *name, struct fi_info *info, uint64_t flags, + fid_t *fid, void *context) +{ + struct fid_uverbs *uv; + char *dev_path; + int ret = 0; + + if (!name || strncmp(FI_UVERBS_INTERFACE "/", name, 7)) + return -ENOSYS; + + if (asprintf(&dev_path, "/dev/infiniband%s", strstr(name, "/")) < 0) + return -ENOMEM; + + uv = calloc(1, sizeof(*uv)); + if (!uv) { + ret = -ENOMEM; + goto out; + } + + uv->fd = open(dev_path, O_RDWR | O_CLOEXEC); + if (uv->fd < 0) { + ret = -errno; + free(uv); + goto out; + } + + uv->fid.fclass = FID_CLASS_INTERFACE; + uv->fid.size = sizeof(*uv); + uv->fid.ops = &ops_fi; + uv->fid.context = context; + uv->ops = &ops_uv; + + *fid = &uv->fid; +out: + free(dev_path); + return ret; +}
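
For reference, both src/ucma.c and src/uverbs.c follow the same command/response convention: a small header carries the opcode and the request/response sizes, the user address of the response buffer travels inside the request itself, and the whole command is submitted to the kernel with a single write() on the opened character device, with a short write treated as failure. The sketch below illustrates that convention in isolation; every toy_* name is hypothetical and merely stands in for the ucma_abi_*/ibv_* structures and the UCMA_INIT_CMD*/UV_INIT_CMD* macros above, not for anything defined by this patch.

#include <errno.h>
#include <stdint.h>
#include <unistd.h>

/* Fixed header at the front of every request (sizes expressed in 32-bit
 * words, matching the uverbs variant of the macros above). */
struct toy_cmd_hdr {
	uint32_t command;	/* opcode understood by the kernel ABI */
	uint16_t in_words;	/* request size */
	uint16_t out_words;	/* response size */
};

struct toy_cmd {
	struct toy_cmd_hdr hdr;
	uint64_t response;	/* user address the kernel writes back into */
	uint32_t arg;
	uint32_t reserved;
};

struct toy_resp {
	uint32_t handle;
	uint32_t status;
};

static int toy_exec(int fd, uint32_t opcode, uint32_t arg,
		    struct toy_resp *resp)
{
	struct toy_cmd cmd = {
		.hdr = {
			.command   = opcode,
			.in_words  = sizeof(cmd) / 4,
			.out_words = sizeof(*resp) / 4,
		},
		.response = (uintptr_t) resp,
		.arg = arg,
	};

	/* One blocking write() submits the command; a short write is an
	 * error, exactly as in the wrappers above. */
	if (write(fd, &cmd, sizeof(cmd)) != sizeof(cmd))
		return -errno;
	return 0;
}

Embedding the response address in the request is what lets a single blocking write() behave as a synchronous call into the kernel; the VALGRIND_MAKE_MEM_DEFINED calls in the real code then simply tell the memory checker that the kernel, rather than user code, initialized the response buffer.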
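
The other pattern repeated throughout this patch is handle embedding: each interface object places a generic fid at the start of its private structure, only the fid is returned to the caller, and every entry point (including the close paths ucma_close() and uv_close()) recovers the wrapper with container_of before touching provider state. A minimal sketch of that pattern follows; the toy_* names are hypothetical, and the local macro only stands in for the framework's own container_of helper.

#include <errno.h>
#include <stddef.h>
#include <stdlib.h>

#define toy_container_of(ptr, type, member) \
	((type *) ((char *) (ptr) - offsetof(type, member)))

struct toy_fid {
	int	fclass;		/* what kind of object this handle names */
	void	*context;	/* caller's context, echoed back to it later */
};

struct toy_obj {
	struct toy_fid	fid;	/* generic handle handed to the caller */
	int		fd;	/* provider-private state stays hidden */
};

/* Allocate the wrapper but expose only the embedded generic handle. */
static int toy_open(void *context, struct toy_fid **fid_out)
{
	struct toy_obj *obj = calloc(1, sizeof(*obj));

	if (!obj)
		return -ENOMEM;
	obj->fid.context = context;
	obj->fd = -1;
	*fid_out = &obj->fid;
	return 0;
}

/* Recover the wrapper from the handle alone, then release it. */
static int toy_close(struct toy_fid *fid)
{
	struct toy_obj *obj = toy_container_of(fid, struct toy_obj, fid);

	free(obj);
	return 0;
}

Because the fid sits at a known offset inside the wrapper, the provider never exposes its private layout to the framework, and teardown can work entirely from the handle the application holds.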