From 87e59e19a46384eeba727589f74c3fafe4121fa5 Mon Sep 17 00:00:00 2001 From: hongchunhua Date: Fri, 7 Aug 2020 15:18:56 +0800 Subject: [PATCH] =?UTF-8?q?=E6=96=B0=E5=BB=BAarpc=E4=BB=A3=E7=A0=81?= =?UTF-8?q?=E4=BB=93=EF=BC=8C=E5=88=86=E7=A6=BBaprc=E4=BB=A3=E7=A0=81?= =?UTF-8?q?=E4=B8=8A=E5=BA=93?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CMakeLists.txt | 59 + README.md | 26 + common.cmake | 67 + demo/README.md | 16 + demo/client_file_send/CMakeLists.txt | 32 + demo/client_file_send/main.c | 132 + demo/server_file_rev/CMakeLists.txt | 30 + demo/server_file_rev/main.c | 190 + inc/arpc_api.h | 475 ++ open_src/xio/CMakeLists.txt | 69 + open_src/xio/include/libxio.h | 43 + open_src/xio/include/xio_base.h | 1545 +++++ open_src/xio/include/xio_kernel.h | 321 + open_src/xio/include/xio_predefs.h | 52 + open_src/xio/include/xio_user.h | 600 ++ open_src/xio/src/common/sys/hashtable.h | 213 + open_src/xio/src/common/xio_common.h | 345 ++ open_src/xio/src/common/xio_connection.c | 3343 +++++++++++ open_src/xio/src/common/xio_connection.h | 285 + open_src/xio/src/common/xio_context.h | 316 + open_src/xio/src/common/xio_error.c | 146 + open_src/xio/src/common/xio_hash.h | 215 + open_src/xio/src/common/xio_idr.c | 182 + open_src/xio/src/common/xio_idr.h | 57 + open_src/xio/src/common/xio_mbuf.h | 498 ++ open_src/xio/src/common/xio_msg_list.h | 145 + open_src/xio/src/common/xio_nexus.c | 2747 +++++++++ open_src/xio/src/common/xio_nexus.h | 402 ++ open_src/xio/src/common/xio_nexus_cache.c | 188 + open_src/xio/src/common/xio_nexus_cache.h | 75 + open_src/xio/src/common/xio_objpool.c | 179 + open_src/xio/src/common/xio_objpool.h | 85 + open_src/xio/src/common/xio_observer.c | 273 + open_src/xio/src/common/xio_observer.h | 139 + open_src/xio/src/common/xio_options.c | 472 ++ open_src/xio/src/common/xio_protocol.h | 462 ++ open_src/xio/src/common/xio_server.c | 428 ++ open_src/xio/src/common/xio_server.h | 73 + open_src/xio/src/common/xio_session.c | 2169 +++++++ open_src/xio/src/common/xio_session.h | 249 + open_src/xio/src/common/xio_session_client.c | 1091 ++++ open_src/xio/src/common/xio_session_priv.h | 318 + open_src/xio/src/common/xio_session_server.c | 707 +++ open_src/xio/src/common/xio_sessions_cache.c | 145 + open_src/xio/src/common/xio_sessions_cache.h | 58 + open_src/xio/src/common/xio_sg_table.h | 147 + open_src/xio/src/common/xio_task.h | 360 ++ open_src/xio/src/common/xio_transport.c | 148 + open_src/xio/src/common/xio_transport.h | 383 ++ open_src/xio/src/common/xio_utils.c | 471 ++ open_src/xio/src/common/xio_workqueue.h | 100 + .../src/kernel/transport/compat/Makefile.in | 53 + .../src/kernel/transport/compat/autogen.sh | 3 + .../src/kernel/transport/compat/configure.ac | 138 + .../src/kernel/transport/compat/install-sh | 527 ++ .../xio/src/kernel/transport/compat/missing | 331 ++ .../xio/src/kernel/transport/rdma/Makefile.in | 66 + .../xio/src/kernel/transport/rdma/autogen.sh | 3 + .../src/kernel/transport/rdma/configure.ac | 216 + .../xio/src/kernel/transport/rdma/install-sh | 520 ++ .../kernel/transport/rdma/xio_rdma_datapath.c | 5246 +++++++++++++++++ .../transport/rdma/xio_rdma_management.c | 3266 ++++++++++ .../kernel/transport/rdma/xio_rdma_memory.c | 1067 ++++ .../transport/rdma/xio_rdma_transport.h | 634 ++ .../kernel/transport/rdma/xio_rdma_utils.c | 240 + .../kernel/transport/rdma/xio_rdma_utils.h | 50 + .../kernel/transport/rdma/xio_rdma_verbs.c | 140 + .../xio/src/kernel/transport/tcp/Makefile.in | 63 + 
.../xio/src/kernel/transport/tcp/autogen.sh | 3 + .../xio/src/kernel/transport/tcp/configure.ac | 212 + .../xio/src/kernel/transport/tcp/install-sh | 520 ++ .../kernel/transport/tcp/xio_tcp_datapath.c | 3490 +++++++++++ .../kernel/transport/tcp/xio_tcp_management.c | 2762 +++++++++ .../kernel/transport/tcp/xio_tcp_transport.h | 511 ++ .../xio/src/kernel/transport/xio_ktransport.c | 74 + .../xio/src/kernel/transport/xio_ktransport.h | 59 + open_src/xio/src/kernel/xio/Makefile.in | 128 + open_src/xio/src/kernel/xio/autogen.sh | 3 + open_src/xio/src/kernel/xio/configure.ac | 216 + open_src/xio/src/kernel/xio/install-sh | 520 ++ open_src/xio/src/kernel/xio/xio_context.c | 767 +++ .../xio/src/kernel/xio/xio_context_priv.h | 43 + open_src/xio/src/kernel/xio/xio_ev_data.h | 44 + open_src/xio/src/kernel/xio/xio_ev_loop.c | 574 ++ open_src/xio/src/kernel/xio/xio_ev_loop.h | 104 + open_src/xio/src/kernel/xio/xio_init.c | 97 + .../xio/src/kernel/xio/xio_kernel_utils.c | 310 + open_src/xio/src/kernel/xio/xio_mem.c | 49 + open_src/xio/src/kernel/xio/xio_mem.h | 65 + open_src/xio/src/kernel/xio/xio_mempool.c | 272 + open_src/xio/src/kernel/xio/xio_mempool.h | 106 + open_src/xio/src/kernel/xio/xio_os.h | 150 + open_src/xio/src/kernel/xio/xio_sg_iov.c | 202 + open_src/xio/src/kernel/xio/xio_sg_iovptr.c | 201 + open_src/xio/src/kernel/xio/xio_sg_scatter.c | 291 + open_src/xio/src/kernel/xio/xio_sg_table.c | 235 + open_src/xio/src/kernel/xio/xio_task.c | 364 ++ open_src/xio/src/kernel/xio/xio_workqueue.c | 452 ++ .../xio/src/kernel/xio/xio_workqueue_priv.h | 103 + open_src/xio/src/kernel/xio_log.h | 95 + open_src/xio/src/libxio_os/linuxapp/xio_env.h | 523 ++ .../xio/src/libxio_os/linuxapp/xio_env_adv.h | 3 + .../src/libxio_os/linuxapp/xio_env_basic.h | 3 + .../xio/src/libxio_os/linuxkernel/xio_env.h | 76 + .../src/libxio_os/linuxkernel/xio_env_adv.h | 3 + .../src/libxio_os/linuxkernel/xio_env_basic.h | 3 + open_src/xio/src/libxio_os/winapp/list.h | 312 + open_src/xio/src/libxio_os/winapp/spinlock.h | 127 + open_src/xio/src/libxio_os/winapp/xio_env.h | 758 +++ .../xio/src/libxio_os/winapp/xio_env_adv.h | 95 + .../xio/src/libxio_os/winapp/xio_env_basic.h | 124 + open_src/xio/src/libxio_os/winapp/xio_os.h | 88 + open_src/xio/src/tools/usr/Makefile.am | 28 + open_src/xio/src/tools/usr/xio_if_numa_cpus.c | 258 + open_src/xio/src/tools/usr/xio_mem_usage.c | 127 + open_src/xio/src/usr/Makefile.am | 163 + open_src/xio/src/usr/libxio.map | 63 + open_src/xio/src/usr/linux/atomic.h | 197 + open_src/xio/src/usr/linux/bitops.h | 13 + open_src/xio/src/usr/linux/debugfs.h | 6 + open_src/xio/src/usr/linux/jiffies.h | 63 + open_src/xio/src/usr/linux/kernel.h | 183 + open_src/xio/src/usr/linux/kref.h | 124 + open_src/xio/src/usr/linux/list.h | 775 +++ open_src/xio/src/usr/linux/printk.h | 19 + open_src/xio/src/usr/linux/slab.h | 57 + open_src/xio/src/usr/linux/usr.h | 45 + open_src/xio/src/usr/transport/rdma/ib_cm.h | 74 + .../usr/transport/rdma/xio_rdma_datapath.c | 5181 ++++++++++++++++ .../usr/transport/rdma/xio_rdma_management.c | 3730 ++++++++++++ .../usr/transport/rdma/xio_rdma_transport.h | 580 ++ .../src/usr/transport/rdma/xio_rdma_utils.c | 247 + .../src/usr/transport/rdma/xio_rdma_utils.h | 59 + .../src/usr/transport/rdma/xio_rdma_verbs.c | 687 +++ .../src/usr/transport/tcp/xio_tcp_datapath.c | 3787 ++++++++++++ .../usr/transport/tcp/xio_tcp_management.c | 2663 +++++++++ .../src/usr/transport/tcp/xio_tcp_transport.h | 411 ++ open_src/xio/src/usr/transport/xio_mempool.c | 782 +++ 
open_src/xio/src/usr/transport/xio_mempool.h | 53 + .../xio/src/usr/transport/xio_usr_transport.c | 194 + .../xio/src/usr/transport/xio_usr_transport.h | 136 + open_src/xio/src/usr/xio/get_clock.c | 254 + open_src/xio/src/usr/xio/get_clock.h | 95 + open_src/xio/src/usr/xio/xio_context.c | 849 +++ open_src/xio/src/usr/xio/xio_context_priv.h | 41 + open_src/xio/src/usr/xio/xio_ev_data.h | 63 + open_src/xio/src/usr/xio/xio_ev_loop.c | 621 ++ open_src/xio/src/usr/xio/xio_ev_loop.h | 208 + open_src/xio/src/usr/xio/xio_init.c | 166 + open_src/xio/src/usr/xio/xio_init.h | 47 + open_src/xio/src/usr/xio/xio_log.c | 111 + open_src/xio/src/usr/xio/xio_log.h | 113 + open_src/xio/src/usr/xio/xio_mem.c | 197 + open_src/xio/src/usr/xio/xio_mem.h | 166 + open_src/xio/src/usr/xio/xio_netlink.c | 237 + open_src/xio/src/usr/xio/xio_os.h | 55 + open_src/xio/src/usr/xio/xio_sg_iov.c | 220 + open_src/xio/src/usr/xio/xio_sg_iovptr.c | 218 + open_src/xio/src/usr/xio/xio_sg_table.c | 232 + open_src/xio/src/usr/xio/xio_task.c | 346 ++ open_src/xio/src/usr/xio/xio_timers_list.h | 311 + open_src/xio/src/usr/xio/xio_tls.c | 74 + open_src/xio/src/usr/xio/xio_tls.h | 48 + open_src/xio/src/usr/xio/xio_usr_utils.c | 360 ++ open_src/xio/src/usr/xio/xio_usr_utils.h | 41 + open_src/xio/src/usr/xio/xio_workqueue.c | 489 ++ open_src/xio/src/usr/xio/xio_workqueue_priv.h | 83 + open_src/xio/version.c | 1 + src/common/base_log.h | 95 + src/common/queue.h | 108 + src/common/threadpool.c | 372 ++ src/common/threadpool.h | 63 + src/session/arpc_client.c | 467 ++ src/session/arpc_com.c | 336 ++ src/session/arpc_com.h | 185 + src/session/arpc_make_request.c | 483 ++ src/session/arpc_message.c | 126 + src/session/arpc_process_oneway.c | 86 + src/session/arpc_process_request.c | 157 + src/session/arpc_process_rsp.c | 34 + src/session/arpc_server.c | 399 ++ 181 files changed, 78233 insertions(+) create mode 100644 CMakeLists.txt create mode 100644 README.md create mode 100644 common.cmake create mode 100644 demo/README.md create mode 100644 demo/client_file_send/CMakeLists.txt create mode 100644 demo/client_file_send/main.c create mode 100644 demo/server_file_rev/CMakeLists.txt create mode 100644 demo/server_file_rev/main.c create mode 100644 inc/arpc_api.h create mode 100644 open_src/xio/CMakeLists.txt create mode 100644 open_src/xio/include/libxio.h create mode 100644 open_src/xio/include/xio_base.h create mode 100644 open_src/xio/include/xio_kernel.h create mode 100644 open_src/xio/include/xio_predefs.h create mode 100644 open_src/xio/include/xio_user.h create mode 100644 open_src/xio/src/common/sys/hashtable.h create mode 100644 open_src/xio/src/common/xio_common.h create mode 100644 open_src/xio/src/common/xio_connection.c create mode 100644 open_src/xio/src/common/xio_connection.h create mode 100644 open_src/xio/src/common/xio_context.h create mode 100644 open_src/xio/src/common/xio_error.c create mode 100644 open_src/xio/src/common/xio_hash.h create mode 100644 open_src/xio/src/common/xio_idr.c create mode 100644 open_src/xio/src/common/xio_idr.h create mode 100644 open_src/xio/src/common/xio_mbuf.h create mode 100644 open_src/xio/src/common/xio_msg_list.h create mode 100644 open_src/xio/src/common/xio_nexus.c create mode 100644 open_src/xio/src/common/xio_nexus.h create mode 100644 open_src/xio/src/common/xio_nexus_cache.c create mode 100644 open_src/xio/src/common/xio_nexus_cache.h create mode 100644 open_src/xio/src/common/xio_objpool.c create mode 100644 open_src/xio/src/common/xio_objpool.h create mode 100644 
open_src/xio/src/common/xio_observer.c create mode 100644 open_src/xio/src/common/xio_observer.h create mode 100644 open_src/xio/src/common/xio_options.c create mode 100644 open_src/xio/src/common/xio_protocol.h create mode 100644 open_src/xio/src/common/xio_server.c create mode 100644 open_src/xio/src/common/xio_server.h create mode 100644 open_src/xio/src/common/xio_session.c create mode 100644 open_src/xio/src/common/xio_session.h create mode 100644 open_src/xio/src/common/xio_session_client.c create mode 100644 open_src/xio/src/common/xio_session_priv.h create mode 100644 open_src/xio/src/common/xio_session_server.c create mode 100644 open_src/xio/src/common/xio_sessions_cache.c create mode 100644 open_src/xio/src/common/xio_sessions_cache.h create mode 100644 open_src/xio/src/common/xio_sg_table.h create mode 100644 open_src/xio/src/common/xio_task.h create mode 100644 open_src/xio/src/common/xio_transport.c create mode 100644 open_src/xio/src/common/xio_transport.h create mode 100644 open_src/xio/src/common/xio_utils.c create mode 100644 open_src/xio/src/common/xio_workqueue.h create mode 100644 open_src/xio/src/kernel/transport/compat/Makefile.in create mode 100644 open_src/xio/src/kernel/transport/compat/autogen.sh create mode 100644 open_src/xio/src/kernel/transport/compat/configure.ac create mode 100644 open_src/xio/src/kernel/transport/compat/install-sh create mode 100644 open_src/xio/src/kernel/transport/compat/missing create mode 100644 open_src/xio/src/kernel/transport/rdma/Makefile.in create mode 100644 open_src/xio/src/kernel/transport/rdma/autogen.sh create mode 100644 open_src/xio/src/kernel/transport/rdma/configure.ac create mode 100644 open_src/xio/src/kernel/transport/rdma/install-sh create mode 100644 open_src/xio/src/kernel/transport/rdma/xio_rdma_datapath.c create mode 100644 open_src/xio/src/kernel/transport/rdma/xio_rdma_management.c create mode 100644 open_src/xio/src/kernel/transport/rdma/xio_rdma_memory.c create mode 100644 open_src/xio/src/kernel/transport/rdma/xio_rdma_transport.h create mode 100644 open_src/xio/src/kernel/transport/rdma/xio_rdma_utils.c create mode 100644 open_src/xio/src/kernel/transport/rdma/xio_rdma_utils.h create mode 100644 open_src/xio/src/kernel/transport/rdma/xio_rdma_verbs.c create mode 100644 open_src/xio/src/kernel/transport/tcp/Makefile.in create mode 100644 open_src/xio/src/kernel/transport/tcp/autogen.sh create mode 100644 open_src/xio/src/kernel/transport/tcp/configure.ac create mode 100644 open_src/xio/src/kernel/transport/tcp/install-sh create mode 100644 open_src/xio/src/kernel/transport/tcp/xio_tcp_datapath.c create mode 100644 open_src/xio/src/kernel/transport/tcp/xio_tcp_management.c create mode 100644 open_src/xio/src/kernel/transport/tcp/xio_tcp_transport.h create mode 100644 open_src/xio/src/kernel/transport/xio_ktransport.c create mode 100644 open_src/xio/src/kernel/transport/xio_ktransport.h create mode 100644 open_src/xio/src/kernel/xio/Makefile.in create mode 100644 open_src/xio/src/kernel/xio/autogen.sh create mode 100644 open_src/xio/src/kernel/xio/configure.ac create mode 100644 open_src/xio/src/kernel/xio/install-sh create mode 100644 open_src/xio/src/kernel/xio/xio_context.c create mode 100644 open_src/xio/src/kernel/xio/xio_context_priv.h create mode 100644 open_src/xio/src/kernel/xio/xio_ev_data.h create mode 100644 open_src/xio/src/kernel/xio/xio_ev_loop.c create mode 100644 open_src/xio/src/kernel/xio/xio_ev_loop.h create mode 100644 open_src/xio/src/kernel/xio/xio_init.c create mode 100644 
open_src/xio/src/kernel/xio/xio_kernel_utils.c create mode 100644 open_src/xio/src/kernel/xio/xio_mem.c create mode 100644 open_src/xio/src/kernel/xio/xio_mem.h create mode 100644 open_src/xio/src/kernel/xio/xio_mempool.c create mode 100644 open_src/xio/src/kernel/xio/xio_mempool.h create mode 100644 open_src/xio/src/kernel/xio/xio_os.h create mode 100644 open_src/xio/src/kernel/xio/xio_sg_iov.c create mode 100644 open_src/xio/src/kernel/xio/xio_sg_iovptr.c create mode 100644 open_src/xio/src/kernel/xio/xio_sg_scatter.c create mode 100644 open_src/xio/src/kernel/xio/xio_sg_table.c create mode 100644 open_src/xio/src/kernel/xio/xio_task.c create mode 100644 open_src/xio/src/kernel/xio/xio_workqueue.c create mode 100644 open_src/xio/src/kernel/xio/xio_workqueue_priv.h create mode 100644 open_src/xio/src/kernel/xio_log.h create mode 100644 open_src/xio/src/libxio_os/linuxapp/xio_env.h create mode 100644 open_src/xio/src/libxio_os/linuxapp/xio_env_adv.h create mode 100644 open_src/xio/src/libxio_os/linuxapp/xio_env_basic.h create mode 100644 open_src/xio/src/libxio_os/linuxkernel/xio_env.h create mode 100644 open_src/xio/src/libxio_os/linuxkernel/xio_env_adv.h create mode 100644 open_src/xio/src/libxio_os/linuxkernel/xio_env_basic.h create mode 100644 open_src/xio/src/libxio_os/winapp/list.h create mode 100644 open_src/xio/src/libxio_os/winapp/spinlock.h create mode 100644 open_src/xio/src/libxio_os/winapp/xio_env.h create mode 100644 open_src/xio/src/libxio_os/winapp/xio_env_adv.h create mode 100644 open_src/xio/src/libxio_os/winapp/xio_env_basic.h create mode 100644 open_src/xio/src/libxio_os/winapp/xio_os.h create mode 100644 open_src/xio/src/tools/usr/Makefile.am create mode 100644 open_src/xio/src/tools/usr/xio_if_numa_cpus.c create mode 100644 open_src/xio/src/tools/usr/xio_mem_usage.c create mode 100644 open_src/xio/src/usr/Makefile.am create mode 100644 open_src/xio/src/usr/libxio.map create mode 100644 open_src/xio/src/usr/linux/atomic.h create mode 100644 open_src/xio/src/usr/linux/bitops.h create mode 100644 open_src/xio/src/usr/linux/debugfs.h create mode 100644 open_src/xio/src/usr/linux/jiffies.h create mode 100644 open_src/xio/src/usr/linux/kernel.h create mode 100644 open_src/xio/src/usr/linux/kref.h create mode 100644 open_src/xio/src/usr/linux/list.h create mode 100644 open_src/xio/src/usr/linux/printk.h create mode 100644 open_src/xio/src/usr/linux/slab.h create mode 100644 open_src/xio/src/usr/linux/usr.h create mode 100644 open_src/xio/src/usr/transport/rdma/ib_cm.h create mode 100644 open_src/xio/src/usr/transport/rdma/xio_rdma_datapath.c create mode 100644 open_src/xio/src/usr/transport/rdma/xio_rdma_management.c create mode 100644 open_src/xio/src/usr/transport/rdma/xio_rdma_transport.h create mode 100644 open_src/xio/src/usr/transport/rdma/xio_rdma_utils.c create mode 100644 open_src/xio/src/usr/transport/rdma/xio_rdma_utils.h create mode 100644 open_src/xio/src/usr/transport/rdma/xio_rdma_verbs.c create mode 100644 open_src/xio/src/usr/transport/tcp/xio_tcp_datapath.c create mode 100644 open_src/xio/src/usr/transport/tcp/xio_tcp_management.c create mode 100644 open_src/xio/src/usr/transport/tcp/xio_tcp_transport.h create mode 100644 open_src/xio/src/usr/transport/xio_mempool.c create mode 100644 open_src/xio/src/usr/transport/xio_mempool.h create mode 100644 open_src/xio/src/usr/transport/xio_usr_transport.c create mode 100644 open_src/xio/src/usr/transport/xio_usr_transport.h create mode 100644 open_src/xio/src/usr/xio/get_clock.c create mode 100644 
open_src/xio/src/usr/xio/get_clock.h create mode 100644 open_src/xio/src/usr/xio/xio_context.c create mode 100644 open_src/xio/src/usr/xio/xio_context_priv.h create mode 100644 open_src/xio/src/usr/xio/xio_ev_data.h create mode 100644 open_src/xio/src/usr/xio/xio_ev_loop.c create mode 100644 open_src/xio/src/usr/xio/xio_ev_loop.h create mode 100644 open_src/xio/src/usr/xio/xio_init.c create mode 100644 open_src/xio/src/usr/xio/xio_init.h create mode 100644 open_src/xio/src/usr/xio/xio_log.c create mode 100644 open_src/xio/src/usr/xio/xio_log.h create mode 100644 open_src/xio/src/usr/xio/xio_mem.c create mode 100644 open_src/xio/src/usr/xio/xio_mem.h create mode 100644 open_src/xio/src/usr/xio/xio_netlink.c create mode 100644 open_src/xio/src/usr/xio/xio_os.h create mode 100644 open_src/xio/src/usr/xio/xio_sg_iov.c create mode 100644 open_src/xio/src/usr/xio/xio_sg_iovptr.c create mode 100644 open_src/xio/src/usr/xio/xio_sg_table.c create mode 100644 open_src/xio/src/usr/xio/xio_task.c create mode 100644 open_src/xio/src/usr/xio/xio_timers_list.h create mode 100644 open_src/xio/src/usr/xio/xio_tls.c create mode 100644 open_src/xio/src/usr/xio/xio_tls.h create mode 100644 open_src/xio/src/usr/xio/xio_usr_utils.c create mode 100644 open_src/xio/src/usr/xio/xio_usr_utils.h create mode 100644 open_src/xio/src/usr/xio/xio_workqueue.c create mode 100644 open_src/xio/src/usr/xio/xio_workqueue_priv.h create mode 100644 open_src/xio/version.c create mode 100644 src/common/base_log.h create mode 100644 src/common/queue.h create mode 100644 src/common/threadpool.c create mode 100644 src/common/threadpool.h create mode 100644 src/session/arpc_client.c create mode 100644 src/session/arpc_com.c create mode 100644 src/session/arpc_com.h create mode 100644 src/session/arpc_make_request.c create mode 100644 src/session/arpc_message.c create mode 100644 src/session/arpc_process_oneway.c create mode 100644 src/session/arpc_process_request.c create mode 100644 src/session/arpc_process_rsp.c create mode 100644 src/session/arpc_server.c diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..c245a38 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,59 @@ +############################################################### +#*【项目】CA +#*【描述】 +#*【作者】hongchunhua +#*【时间】2020.07.22 +############################################################### + +cmake_minimum_required(VERSION 2.8) +project(arpc) + +set(COM_ROOT_PATH "${CMAKE_CURRENT_SOURCE_DIR}") + +#设置依赖的文件路径 +set(ARPC_ROOT_PATH "${CMAKE_CURRENT_SOURCE_DIR}") + + #开源 +set(ARPC_OPENSRC_PATH "${ARPC_ROOT_PATH}/open_src") + +set(ARPC_DEMO_PATH "${ARPC_ROOT_PATH}/demo") + +include("${COM_ROOT_PATH}/common.cmake") + +#设定源码 +set(XIO_INCLUDE ${OPENSRC_PATH}/xio/include) +set(SRC_COMMON ${COM_SRC_PATH}/common) +set(SRC_SESSION ${COM_SRC_PATH}/session) +set(ARPC_INCLUDE ${COM_ROOT_PATH}/inc) + +set(SOURCE_FILES "") +aux_source_directory(${SRC_COMMON} SOURCE_FILES) +aux_source_directory(${SRC_SESSION} SOURCE_FILES) + + +#设定头文件路径 +include_directories(${ARPC_INCLUDE} ${XIO_INCLUDE} ${SRC_COMMON} ${SRC_SESSION}) + + +#设定链接库的路径(一般使用第三方非系统目录下的库) +set(LINK_LIB_PATH ${DPENDENCY_LIB_PATH}) +LINK_DIRECTORIES(${LIBRARY_OUTPUT_PATH} ${LINK_LIB_PATH}) + + +#添加依赖项子目录 + +#动态库 +add_library(arpc SHARED ${SOURCE_FILES}) +#链接静态库 +target_link_libraries(arpc + "-Wl,--whole-archive" #之后的库使用--whole-archive选项 + xio +"-Wl,--no-whole-archive") #之后的库不使用--whole-archive选项 + +INSTALL(TARGETS arpc LIBRARY DESTINATION ${LINK_LIB_PATH}) +INSTALL(TARGETS arpc LIBRARY DESTINATION /usr/lib) 
+ +#子项目 +add_subdirectory("${ARPC_OPENSRC_PATH}/xio/") +add_subdirectory("${ARPC_DEMO_PATH}/client_file_send") +add_subdirectory("${ARPC_DEMO_PATH}/server_file_rev") \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..c8814a4 --- /dev/null +++ b/README.md @@ -0,0 +1,26 @@ +--- +# build +*【说明】编译工程采用cmake编译 + +* for linux (ubuntu) +* apt-get install cmake + + 1. cd code root; + 2. mkdir build + 3. cd build + 4. cmake ../ (default:debug mode) + 5. make -j + 6. make install + +--- +# file tree +* demo(demo测试应用) + +* open_src (开源代码xio) + +* src (arpc 源码) + +* inc (arpc 函数头) + +#说明 +生成库文件libarpc.so diff --git a/common.cmake b/common.cmake new file mode 100644 index 0000000..f9e0fda --- /dev/null +++ b/common.cmake @@ -0,0 +1,67 @@ +############################################################### +#*【项目】CA +#*【描述】 +#*【作者】hongchunhua +#*【时间】2020.07.22 +############################################################### + + #依赖库 +set(DPENDENCY_LIB_PATH "${COM_ROOT_PATH}/lib/") + #开源 +set(OPENSRC_PATH "${COM_ROOT_PATH}/open_src") + + #APP 应用目录 +set(COM_APP_PATH "${COM_ROOT_PATH}/demo") + #源码目录 +set(COM_SRC_PATH "${COM_ROOT_PATH}/src") + +#设定编译参数 +if (DEFINED CLANG) + SET (CMAKE_C_COMPILER "/usr/bin/clang") + SET (CMAKE_C_FLAGS "-Wall -std=c99") + SET (CMAKE_C_FLAGS_DEBUG "-g") + SET (CMAKE_C_FLAGS_MINSIZEREL "-Os -DNDEBUG") + SET (CMAKE_C_FLAGS_RELEASE "-O4 -DNDEBUG") + SET (CMAKE_C_FLAGS_RELWITHDEBINFO "-O2 -g") + + SET (CMAKE_CXX_COMPILER "/usr/bin/clang++") + SET (CMAKE_CXX_FLAGS "-Wall") + SET (CMAKE_CXX_FLAGS_DEBUG "-g") + SET (CMAKE_CXX_FLAGS_MINSIZEREL "-Os -DNDEBUG") + SET (CMAKE_CXX_FLAGS_RELEASE "-O4 -DNDEBUG") + SET (CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O2 -g") + + SET (CMAKE_AR "/usr/bin/llvm-ar") + SET (CMAKE_LINKER "/usr/bin/llvm-ld") + SET (CMAKE_NM "/usr/bin/llvm-nm") + SET (CMAKE_OBJDUMP "/usr/bin/llvm-objdump") + SET (CMAKE_RANLIB "/usr/bin/llvm-ranlib") +else() + set(CMAKE_CXX_FLAGS_DEBUG "$ENV{CXXFLAGS} -O0 -Wall -g -ggdb") + set(CMAKE_CXX_FLAGS_RELEASE "$ENV{CXXFLAGS} -O3 -Wall") + set(CMAKE_C_FLAGS_DEBUG "$ENV{CFLAGS} -O0 -Wall -g -ggdb3 -Werror -Wdeclaration-after-statement") + set(CMAKE_C_FLAGS_RELEASE "$ENV{CFLAGS} -O3 -Wall") + + if (CMAKE_BUILD_TYPE STREQUAL Release) + message("NOTE: project to build on [Release] version.") + set(CMAKE_BUILD_TYPE "Release") + set(DEBUG_FLAG ${CMAKE_C_FLAGS_RELEASE}) + else() + message("WARNING: project to build on [Debug] version.") + set(CMAKE_BUILD_TYPE "Debug") + set(DEBUG_FLAG ${CMAKE_C_FLAGS_DEBUG}) + endif() + SET(CA_WARNINGS_SETTING "-Wno-missing-field-initializers -Wno-deprecated -fno-omit-frame-pointer -Wno-unused-parameter -Wno-deprecated-declarations -Wno-unused-function -Wno-unused-variable") + SET(C_CPP_FLAGS_ "${C_CPP_FLAGS_} -DPIC -fPIC ${DEBUG_FLAG} -D_GNU_SOURCE -DUSE_COMMON_LIB ${OS_FLAG} ${CA_WARNINGS_SETTING}") + + SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${C_CPP_FLAGS_}") + SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${C_CPP_FLAGS_}") +endif() + +#设置输出路径 +SET(EXECUTABLE_OUTPUT_PATH ${COM_ROOT_PATH}/${CMAKE_BUILD_TYPE}_build_out/bin) #设置可执行文件的输出目录 +SET(LIBRARY_OUTPUT_PATH ${COM_ROOT_PATH}/${CMAKE_BUILD_TYPE}_build_out/lib) #设置库文件的输出目录 + +message("--cur path: ${CMAKE_CURRENT_SOURCE_DIR}") +message("--project : ${PROJECT_NAME}") +message("--out path: ${COM_ROOT_PATH}/${CMAKE_BUILD_TYPE}_build_out.") \ No newline at end of file diff --git a/demo/README.md b/demo/README.md new file mode 100644 index 0000000..1b00a72 --- /dev/null +++ b/demo/README.md @@ -0,0 +1,16 @@ +--- +# client demo 
+*【Note】client session usage example, simple message communication
+
+# server demo
+*【Note】server session usage example, simple message communication
+---
+
+# client_file_send
+*【Note】client session implementation, file sending
+---
+
+# server_file_rev
+*【Note】server session implementation, file receiving
+---
+
diff --git a/demo/client_file_send/CMakeLists.txt b/demo/client_file_send/CMakeLists.txt
new file mode 100644
index 0000000..c72db33
--- /dev/null
+++ b/demo/client_file_send/CMakeLists.txt
@@ -0,0 +1,32 @@
+cmake_minimum_required(VERSION 2.8)
+project(client_file)
+
+include("${COM_ROOT_PATH}/common.cmake")
+
+# source paths
+set(ARPC_INCLUDE ${COM_ROOT_PATH}/inc)
+set(SRC_COMMON ${COM_SRC_PATH}/common)
+set(SRC_SESSION ${COM_SRC_PATH}/session)
+
+set(SOURCE_FILES ${CMAKE_CURRENT_SOURCE_DIR}/main.c)
+aux_source_directory(${SRC_COMMON} SOURCE_FILES)
+aux_source_directory(${SRC_SESSION} SOURCE_FILES)
+
+
+# header include paths
+include_directories(${ARPC_INCLUDE} ${SRC_COMMON} ${SRC_SESSION})
+
+
+# link library paths (typically third-party libraries outside the system directories)
+set(LINK_LIB_PATH ${DPENDENCY_LIB_PATH})
+LINK_DIRECTORIES(${LIBRARY_OUTPUT_PATH} ${LINK_LIB_PATH})
+
+
+# dependency subdirectories
+
+# build the executable
+add_executable(client_file ${SOURCE_FILES})
+
+target_link_libraries(client_file -larpc -lnuma -ldl -lrt -lpthread)
+add_dependencies(client_file arpc)
+
diff --git a/demo/client_file_send/main.c b/demo/client_file_send/main.c
new file mode 100644
index 0000000..34930b2
--- /dev/null
+++ b/demo/client_file_send/main.c
@@ -0,0 +1,132 @@
+/*
+ * Copyright(C) 2020 Ruijie Network. All rights reserved.
+ */
+
+/*!
+* \file xxx.x
+* \brief xxx
+*
+* Contains the client-side demo that sends a file over an arpc session.
+*
+* \copyright 2020 Ruijie Network. All rights reserved.
+* \author hongchunhua@ruijie.com.cn
+* \version v1.0.0
+* \date 2020.08.05
+* \note none
+*/
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <unistd.h>
+
+#include "arpc_api.h"
+
+#define BUF_MAX_SIZE 1024
+/*---------------------------------------------------------------------------*/
+/* main */
+/*---------------------------------------------------------------------------*/
+int main(int argc, char *argv[])
+{
+    uint32_t i = 0;
+    int ret = 0;
+    uint64_t offset = 0, send_len = 0, file_len = 0;
+
+    struct arpc_client_session_param param;
+    struct arpc_msg *requst = NULL;
+    arpc_session_handle_t session_fd;
+    struct arpc_msg_param p;
+    char *file_path = NULL;
+    FILE *fp = NULL;
+
+    if (argc < 5) {
+        printf("Usage: %s <server_ip> <port> <file_path> <session_data>\n", argv[0]);
+        return 0;
+    }
+    printf("input:<%s> <%s> <%s> <%s>\n", argv[1], argv[2], argv[3], argv[4]);
+    file_path = argv[3];
+    fp = fopen(file_path, "rb");
+    if (!fp) {
+        printf("can not open this file[%s], or it does not exist!\n", file_path);
+        return 0;
+    }
+    fseek(fp, 0, SEEK_END);
+    file_len = ftell(fp);
+    rewind(fp);
+    printf("-----file_size:%lu\n", file_len);
+    arpc_init();
+    // create the session
+    param.con.type = ARPC_E_TRANS_TCP;
+    memcpy(param.con.ipv4.ip, argv[1], IPV4_MAX_LEN);
+    param.con.ipv4.port = atoi(argv[2]);
+    param.req_data = argv[4];
+    param.req_data_len = strlen(argv[4]);
+    session_fd = arpc_client_create_session(&param);
+    if (!session_fd){
+        printf("arpc_client_create_session fail\n");
+        goto end;
+    }
+
+    // create a new message
+    requst = arpc_new_msg(NULL);
+    while (offset < file_len){
+        send_len = ((file_len - offset) > DATA_DEFAULT_MAX_LEN)?
+                    DATA_DEFAULT_MAX_LEN: (file_len - offset);
+        printf("_____send_len:%lu, left_size:%lu____________\n", send_len, (file_len - offset));
+        requst->send.head_len = strlen(file_path);
+        requst->send.head = file_path;
+        requst->send.total_data = send_len;
+        requst->send.vec_num = (requst->send.total_data / IOV_DEFAULT_MAX_LEN) + 1;
+        requst->proc_rsp_cb = NULL;
+
+        // read the file into the IO vector
+        requst->send.vec = malloc(requst->send.vec_num * sizeof(struct arpc_iov));
+        for (i = 0; i < requst->send.vec_num - 1; i++) {
+            fseek(fp, i*IOV_DEFAULT_MAX_LEN + offset, SEEK_SET);
+            requst->send.vec[i].data = malloc(IOV_DEFAULT_MAX_LEN);
+            requst->send.vec[i].len = fread(requst->send.vec[i].data, 1, IOV_DEFAULT_MAX_LEN, fp);
+            if (requst->send.vec[i].len < IOV_DEFAULT_MAX_LEN){
+                if (feof(fp)){
+                    break;
+                }
+            }
+        }
+        fseek(fp, i*IOV_DEFAULT_MAX_LEN + offset, SEEK_SET);
+        offset += send_len;
+        send_len = send_len % IOV_DEFAULT_MAX_LEN;
+        requst->send.vec[i].data = malloc(send_len);
+        requst->send.vec[i].len = fread(requst->send.vec[i].data, 1, send_len, fp);
+        if (requst->send.vec[i].len < send_len){
+            printf("fread len fail\n");
+        }
+        //ret = arpc_do_request(session_fd, requst, -1);
+        ret = arpc_send_oneway_msg(session_fd, requst);
+        //usleep(500*1000);
+        if (ret != 0){
+            printf("arpc_do_request fail\n");
+        }
+
+        // release resources
+        for (i = 0; i < requst->send.vec_num; i++) {
+            if (requst->send.vec[i].data)
+                free(requst->send.vec[i].data);
+        }
+        free(requst->send.vec);
+        requst->send.vec = NULL;
+        arpc_msg_reset(requst);
+    }
+    arpc_delete_msg(&requst);
+    arpc_client_destroy_session(session_fd);
+    printf("file send complete:%s.\n", file_path);
+end:
+    if (fp)
+        fclose(fp);
+    fp = NULL;
+    arpc_finish();
+    return 0;
+}
+
diff --git a/demo/server_file_rev/CMakeLists.txt b/demo/server_file_rev/CMakeLists.txt
new file mode 100644
index 0000000..27e51e5
--- /dev/null
+++ b/demo/server_file_rev/CMakeLists.txt
@@ -0,0 +1,30 @@
+cmake_minimum_required(VERSION 2.8)
+project(server_file)
+
+include("${COM_ROOT_PATH}/common.cmake")
+
+# source paths
+set(ARPC_INCLUDE ${COM_ROOT_PATH}/inc)
+set(SRC_COMMON ${COM_SRC_PATH}/common)
+set(SRC_SESSION ${COM_SRC_PATH}/session)
+
+set(SOURCE_FILES ${CMAKE_CURRENT_SOURCE_DIR}/main.c)
+aux_source_directory(${SRC_COMMON} SOURCE_FILES)
+aux_source_directory(${SRC_SESSION} SOURCE_FILES)
+
+
+# header include paths
+include_directories(${ARPC_INCLUDE} ${SRC_COMMON} ${SRC_SESSION})
+
+
+# link library paths (typically third-party libraries outside the system directories)
+set(LINK_LIB_PATH ${DPENDENCY_LIB_PATH})
+LINK_DIRECTORIES(${LIBRARY_OUTPUT_PATH} ${LINK_LIB_PATH})
+
+
+# dependency subdirectories
+
+# build the executable
+add_executable(server_file ${SOURCE_FILES})
+target_link_libraries(server_file -larpc -lnuma -ldl -lrt -lpthread)
+add_dependencies(server_file arpc)
\ No newline at end of file
diff --git a/demo/server_file_rev/main.c b/demo/server_file_rev/main.c
new file mode 100644
index 0000000..7740d86
--- /dev/null
+++ b/demo/server_file_rev/main.c
@@ -0,0 +1,190 @@
+/*
+ * Copyright(C) 2020 Ruijie Network. All rights reserved.
+ */
+
+/*!
+* \file xxx.x
+* \brief xxx
+*
+* Contains the server-side demo that receives a file over an arpc session.
+*
+* \copyright 2020 Ruijie Network. All rights reserved.
+* \author hongchunhua@ruijie.com.cn
+* \version v1.0.0
+* \date 2020.08.05
+* \note none
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <unistd.h>
+
+#include "arpc_api.h"
+
+static char rsp_header[] = "";
+
+static void *mem_alloc(uint32_t size, void *usr_context)
+{
+    void *mem = malloc(size);
+    return mem;
+}
+static int mem_free(void *buf_ptr, void *usr_context)
+{
+    if (buf_ptr)
+        free(buf_ptr);
+    return 0;
+}
+
+static int process_rx_header(struct arpc_header_msg *header, void* usr_context, uint32_t *flag)
+{
+    SET_METHOD(*flag, METHOD_ALLOC_DATA_BUF);
+    SET_METHOD(*flag, METHOD_PROCESS_ASYNC);
+    return 0;
+}
+
+static int process_rx_data(const struct arpc_vmsg *req_iov, struct arpc_vmsg *rsp_iov, void *usr_context)
+{
+    char file_path[512] = {0};
+    FILE *fp = NULL;
+    uint32_t i;
+
+    if (rsp_iov){
+        rsp_iov->head = rsp_header;
+        rsp_iov->head_len = sizeof(rsp_header);
+
+        rsp_iov->vec_num = 0;
+        rsp_iov->vec = NULL;
+        rsp_iov->total_data = 0;
+    }
+
+    if (!req_iov || !usr_context){
+        printf("null input\n");
+        return 0;
+    }
+    sprintf(file_path, "./rev_%s", (char *)usr_context);
+
+    printf("------file:%s, receive len:%lu.\n", file_path, req_iov->total_data);
+
+    fp = fopen(file_path, "ab");
+    if (!fp){
+        printf("fopen path:%s fail.\n", file_path);
+        return 0;
+    }
+    for (i = 0; i < req_iov->vec_num; i++){
+        fwrite(req_iov->vec[i].data, 1, req_iov->vec[i].len, fp);
+        fseek(fp, 0, SEEK_END);
+    }
+    fclose(fp);
+    return 0;
+}
+static int release_rsp(struct arpc_vmsg *rsp_iov, void *usr_context)
+{
+    return 0;
+}
+
+int new_session_start(const struct arpc_new_session_req *client, struct arpc_new_session_rsp *param, void* usr_context)
+{
+    char file_path[512] = {0};
+    FILE *fp = NULL;
+    if (client && client->client_data.data){
+        memcpy(usr_context, client->client_data.data, client->client_data.len);
+        sprintf(file_path, "./rev_%s", (char *)usr_context);
+        fp = fopen(file_path, "w");
+        if (!fp){
+            printf("fopen path:%s fail.\n", file_path);
+            return 0;
+        }
+        fclose(fp);
+    }
+    return 0;
+}
+
+int new_session_end(arpc_session_handle_t fd, struct arpc_new_session_rsp *param, void* usr_context)
+{
+    return 0;
+}
+
+
+static int process_async(const struct arpc_vmsg *req_iov, void* usr_context)
+{
+    FILE *fp = NULL;
+    char file_path[512] = {0};
+    uint32_t i;
+    if (!req_iov || !usr_context){
+        printf("null input\n");
+        return 0;
+    }
+    sprintf(file_path, "./rev_%s", (char *)usr_context);
+
+    fp = fopen(file_path, "ab");
+    if (!fp){
+        printf("fopen path:%s fail.\n", file_path);
+        return 0;
+    }
+    for (i = 0; i < req_iov->vec_num; i++){
+        fwrite(req_iov->vec[i].data, 1, req_iov->vec[i].len, fp);
+        fseek(fp, 0, SEEK_END);
+    }
+    fclose(fp);
+
+    return 0;
+}
+/*---------------------------------------------------------------------------*/
+/* main */
+/*---------------------------------------------------------------------------*/
+int main(int argc, char *argv[])
+{
+    struct arpc_server_param param;
+    arpc_session_handle_t fd = NULL;
+    char file_name[256] = {0};
+    struct arpc_session_ops ops = {
+        .req_ops = {
+            .alloc_cb = &mem_alloc,
+            .free_cb = &mem_free,
+            .proc_head_cb = &process_rx_header,
+            .proc_data_cb = &process_rx_data,
+            .proc_async_cb = &process_async,
+            .release_rsp_cb = &release_rsp,
+        },
+        .oneway_ops = {
+            .alloc_cb = &mem_alloc,
+            .free_cb = &mem_free,
+            .proc_head_cb = &process_rx_header,
+            .proc_data_cb = &process_rx_data,
+            .proc_async_cb = NULL,
+        }
+    };
+
+    if (argc < 3) {
+        printf("Usage: %s <local_ip> <port>\n", argv[0]);
+        return 0;
+    }
+
+    arpc_init();
+    memset(&param, 0, sizeof(param));
+
param.con.type = ARPC_E_TRANS_TCP; + memcpy(param.con.ipv4.ip, argv[1], IPV4_MAX_LEN); + param.con.ipv4.port = atoi(argv[2]); + + param.default_ops = ops; + param.new_session_start = &new_session_start; + param.new_session_end = &new_session_end; + param.default_ops_usr_ctx = file_name; + + fd = arpc_server_create(¶m); + if(!fd){ + goto end; + } + arpc_server_loop(fd, -1); + +end: + if(fd) + arpc_server_destroy(fd); + arpc_finish(); + return 0; +} + diff --git a/inc/arpc_api.h b/inc/arpc_api.h new file mode 100644 index 0000000..146b6bd --- /dev/null +++ b/inc/arpc_api.h @@ -0,0 +1,475 @@ +/* + * Copyright(C) 2020 Ruijie Network. All rights reserved. + */ + +/*! +* \file async_rpc_api.h +* \brief 异步session接口 +* +* 包含异步session外部所用到的接口和结构体 +* +* \copyright 2020 Ruijie Network. All rights reserved. +* \author hongchunhua@ruijie.com.cn +* \version v1.0.0 +* \date 2020.08.05 +* \note none +*/ + +#ifndef _ARPC_API_H_ +#define _ARPC_API_H_ + +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef void* arpc_session_handle_t; /*! @brief session 句柄 */ +typedef void* arpc_server_t; /*! @brief server */ + +#define _DEF_SESSION_SERVER +#define _DEF_SESSION_CLIENT + +/*! + * @brief arpc消息框架全局初始化 + * + * 无论是client还是server,必须先全局初始化后,才能使用session功能。 + * 这里暂未实现配置入参,后续支持// todo + * + * @return int; (-1: fail ; ( 0: succeed + */ +int arpc_init(); + +/*! + * @brief arpc消息框架结束退出 + * + * 所有session都关闭和server资源都释放后,再调用这个,否则会有意想不到的错误 + * + * @return void; + */ +void arpc_finish(); + + +enum arpc_trans_type { + ARPC_E_TRANS_TCP = 0, +}; + +#define IPV4_MAX_LEN 16 +struct arpc_ipv4_addr { + char ip[IPV4_MAX_LEN]; + uint32_t port; +}; + +/** + * @brief 链路参数 + * + * @details + * 用于session的传输层的通信参数,如TCP/IP;这里暂时支持TCP模式, + * 后续可以拓展通信方式,如本地进程通信等 + */ +struct arpc_con_info{ + enum arpc_trans_type type; + union{ + struct arpc_ipv4_addr ipv4; + }; +}; + +/** + * @brief IOV结构 + * + * @details + * + */ +struct arpc_iov{ + void* data; + size_t len; +}; + +/** + * @brief arpc基础消息结构 + * + * @details + * arpc对外提供的数据结构 + */ +struct arpc_vmsg{ + uint32_t head_len; /*! @brief 头部长度 */ + void *head; /*! @brief 头部数据 */ + uint64_t total_data; /*! @brief 数据全部长度 */ + uint32_t vec_num; /*! @brief IO vector数量 */ + struct arpc_iov *vec; /*! @brief vector */ +}; + +/*! + * @brief 内存释放函数 + * + * @param[in] size ,消息头长度 + * @param[in] usr_context ,用户上下文,初始化时由调用者入参,由调用者使用 + * @return mem buf + */ +typedef int (*mem_free_cb_t)(void* buf_ptr, void* usr_context); + +#define METHOD_ALLOC_DATA_BUF 0 /*! @brief 申请自定义内存 */ +#define METHOD_PROCESS_ASYNC 1 /*! @brief 异步处理 */ + +/** + * @brief 标记位操作方法 + * + * @details + * 用于请求处理后,设置session主框架参数 + */ +#define SET_METHOD(flag, method) (flag=(flag|(1<NULL: fail ; ( 非NULL: succeed + */ +struct arpc_msg *arpc_new_msg(const struct arpc_msg_param *p); + +/*! + * @brief 释放一个消息体 + * + * 释放分配的内存; + * 注意:如果消息体处在发送待接收状态,即框架锁定,则会释放不成功,必须等待消息框架释放消息体。 + * 消息框架会确保消息体不会长期持有的,消息超时或者发送失败都是自动释放消息体 + * + * @param[inout] msg ,消息体指针的指针,如果释放成功,则句柄将会被置空 + * @return int; (-1: fail ; ( 0: succeed + */ +int arpc_delete_msg(struct arpc_msg **msg); + +/*! + * @brief 重置一个消息体 + * + * 重复利用一个消息体,避免发送数据不断申请资源。 + * 注意:处在发送状态的消息体无法重置,重置是用于释放接收回复消息的缓存buff + * + * @param[in] msg ,消息体指针 + * @return int; (-1: fail ; ( 0: succeed + */ +int arpc_msg_reset(struct arpc_msg *msg); + + +/*! 
+ * @brief 发送请求 + * + * 发送一个请求等待回复(发送消息并阻塞等待接收方回复) + * + * @param[in] fd ,a session handle + * @param[in] msg ,a data that will send + * @param[in] timeout_ms , 超时时间, -1则一直等待,若设置回调,则该值不生效,直接返回 + * @return receive .0,表示发送成功,小于0则失败 + */ +int arpc_do_request(const arpc_session_handle_t fd, struct arpc_msg *msg, int32_t timeout_ms); + +/*! + * @brief 发送单向消息 + * + * 发送一个单向消息(接收方无需回复) + * + * @param[in] fd ,a session handle + * @param[in] msg ,a data that will send + * @return receive .0,表示发送成功,小于0则失败 + */ +int arpc_send_oneway_msg(const arpc_session_handle_t fd, struct arpc_msg *msg); + +/********************************************************************************************** + * @name client + * @brif clien 客户端 + **********************************************************************************************/ +#ifdef _DEF_SESSION_CLIENT + +#define MAX_SESSION_REQ_DATA_LEN 1024 /*! @brief 申请session的数据长度 */ + +/** + * @brief 客户端session实例化参数 + * + * @details + * 用于实例化客户端session的参数 + */ +struct arpc_client_session_param { + struct arpc_con_info con; /*! @brief 每个连接的传输类型和参数 */ + struct arpc_session_ops *ops; /*! @brief 请求回复的回调函数 */ + void *ops_usr_ctx; /*! @brief 调用者的上下文参数,用于操作函数入参 */ + uint32_t req_data_len; /*! @brief 申请session时的数据 */ + void *req_data; /*! @brief 调用者新建session请求时,发给服务端数据 */ +}; + +/*! + * @brief 创建session客户端实例 + * + * @param[in] param session服务端实例化的参数 + * @return arpc_session_handle_t; (NULL: fail ; ( 非NULL: succeed + * + */ +arpc_session_handle_t arpc_client_create_session(const struct arpc_client_session_param *param); + +/*! + * @brief 销毁session客户端实例 + * + * @param[in] fd session句柄 + * @return int; (-1: fail ; ( 0: succeed + * + */ +int arpc_client_destroy_session(arpc_session_handle_t *fd); + +/** + * @brief session状态枚举 + * + * @details + * 用于获取session状态 + */ +enum arpc_session_status { + ARPC_SESSION_STA_NOT_EXISTED = -1, /*! @brief session不存在或者已经释放 */ + ARPC_SESSION_STA_ACTIVE = 0, /*! @brief 正常活跃 */ + ARPC_SESSION_STA_RE_CON, /*! @brief 断路,尝试重连 */ + ARPC_SESSION_STA_WAIT, /*! @brief 断路,周期尝试重连 */ +}; + +/*! + * @brief 获取session状态 + * + * @param[in] fd session句柄 + * @return arpc_session_status; + * + */ +enum arpc_session_status arpc_get_session_status(const arpc_session_handle_t fd); + +#endif + + +#ifdef _DEF_SESSION_SERVER + +/*! + * @brief 新建session请求消息 + * + * @details + * 该状态是由client端发给server端,用于请求新建session + * + */ +struct arpc_new_session_req{ + uint64_t session_id; + struct arpc_con_info client_con_info; + struct arpc_iov client_data; +}; + +/*! + * @brief 新建session的状态 + * + * @details + * 该状态是由server端发给client端,用于应答新建session状态 + * + */ +enum arpc_new_session_status{ + ARPC_E_STATUS_OK = 0, + ARPC_E_STATUS_INVALID_USER, /*! @brief 非法用户 */ +}; + +/*! + * @brief 新建session回复消息 + * + * @details + * 该状态是由server端发给client端,用于回复新建session的结果 + * + */ +struct arpc_new_session_rsp{ + void *rsp_data; + uint32_t rsp_data_len; + enum arpc_new_session_status ret_status; + struct arpc_session_ops *ops; + void *ops_new_ctx; +}; + +/*! + * @brief session服务端实例化参数 + * + * @details + * 参数用于配置session功能 + * + */ +struct arpc_server_param { + /*! @brief 每个连接的传输类型和参数 */ + struct arpc_con_info con; + + /*! @brief 收到申请建立session请求时回调,用于处理调用者逻辑和输出回复消息*/ + int (*new_session_start)(const struct arpc_new_session_req *, struct arpc_new_session_rsp *, void*); + + /*! @brief 消息框架新建session后回调,用于释放调用者回复消息的资源*/ + int (*new_session_end)(arpc_session_handle_t, struct arpc_new_session_rsp *, void*); + + /*! @brief session服务端默认的消息操作函数 */ + struct arpc_session_ops default_ops; + + /*! 
@brief 用户上下文数据,作为消息处理函数的入参 */ + void *default_ops_usr_ctx; + + /*! @brief IOV数据深度 选填*/ + uint32_t iov_max_len; +}; + +/*! + * @brief 创建session服务端实例 + * + * @param[in] param session服务端实例化的参数 + * @return int; (NULL: fail ; ( 非NULL: succeed + * + */ +arpc_server_t arpc_server_create(const struct arpc_server_param *param); + +/*! + * @brief 监听session建立请求 + * + * @param[in] fd session服务端实例化的参数 + * @param[in] timeout_ms 服务端消息框架阻塞超时时间,-1表示一直阻塞,永不超时。 + * @return int; (NULL: fail ; ( 非NULL: succeed + * + */ +int arpc_server_loop(arpc_server_t fd, int32_t timeout_ms); + +/*! + * @brief 释放session服务端实例 + * + * @param[inout] fd session服务端实例化的参数, 释放成功,则句柄会被置空 + * @return int; (NULL: fail ; ( 非NULL: succeed + * + */ +int arpc_server_destroy(arpc_server_t *fd); + +#endif + +#ifdef __cplusplus +} +#endif + +#endif /*XIO_API_H */ diff --git a/open_src/xio/CMakeLists.txt b/open_src/xio/CMakeLists.txt new file mode 100644 index 0000000..c5ea26e --- /dev/null +++ b/open_src/xio/CMakeLists.txt @@ -0,0 +1,69 @@ +cmake_minimum_required(VERSION 2.8) +project(xio) + +include("${COM_ROOT_PATH}/common.cmake") + +set(XIO_ROOT_PATH ${CMAKE_CURRENT_SOURCE_DIR}) +set(XIO_INC ${XIO_ROOT_PATH}/include) +set(XIO_COM ${XIO_ROOT_PATH}/src/common) +set(XIO_XIO ${XIO_ROOT_PATH}/src/usr) +set(XIO_LINUX ${XIO_ROOT_PATH}/src/usr) +if(XIO_TYPE STREQUAL kernel) + set(TPYE_SRC ${XIO_ROOT_PATH}/src/kernel) + set(XIO_OS ${XIO_ROOT_PATH}/src/libxio_os/linuxkernel) +else() + set(XIO_TYPE "user") + set(TPYE_SRC ${XIO_ROOT_PATH}/src/usr) + set(XIO_OS ${XIO_ROOT_PATH}/src/libxio_os/linuxapp) +endif() + +message("## NOTE ##: project to build for [${XIO_TYPE}].") + +#version文件 +set(VERSION_FILE_PATH ${XIO_ROOT_PATH}) + +#版本 +set(VERSION_NUM "xio_1.7.0_release") + +set(VERSION_MSG "const char XIO_VERSION_STRING[] = \"${VERSION_NUM}\"\;") +file(WRITE ${VERSION_FILE_PATH}/version.c ${VERSION_MSG}) + +#设定源码 +set(SOURCE_FILES ${VERSION_FILE_PATH}/version.c) +aux_source_directory(${TPYE_SRC}/transport SOURCE_FILES) +aux_source_directory(${TPYE_SRC}/transport/tcp SOURCE_FILES) +aux_source_directory(${TPYE_SRC}/xio SOURCE_FILES) +aux_source_directory(${XIO_COM} SOURCE_FILES) + +#设定头文件路径 +include_directories(${XIO_INC} + ${XIO_COM} + ${XIO_COM}/sys + ${XIO_OS} + ${XIO_XIO} + ${TPYE_SRC}/xio + ${TPYE_SRC}/transport + ${TPYE_SRC}/transport/tcp) + + +#设定链接库的路径(一般使用第三方非系统目录下的库) +set(LINK_LIB_PATH ${DPENDENCY_LIB_PATH}) +LINK_DIRECTORIES(${LIBRARY_OUTPUT_PATH} ${LINK_LIB_PATH}) + + +#添加依赖项子目录 +#add_dependencies(xio) +set(BUILD_STATIC true) + +if(BUILD_STATIC) + message("build static for xio") + add_library(xio STATIC ${SOURCE_FILES}) + INSTALL(TARGETS xio ARCHIVE DESTINATION ${LINK_LIB_PATH}) +else() + #动态库 + add_library(xio SHARED ${SOURCE_FILES}) + INSTALL(TARGETS xio LIBRARY DESTINATION ${LINK_LIB_PATH}) + INSTALL(TARGETS xio LIBRARY DESTINATION /usr/lib) +endif() + + diff --git a/open_src/xio/include/libxio.h b/open_src/xio/include/libxio.h new file mode 100644 index 0000000..4d34a90 --- /dev/null +++ b/open_src/xio/include/libxio.h @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. 
+ * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#if defined(__KERNEL__) +#include "xio_kernel.h" +#else +#include "xio_user.h" +#endif diff --git a/open_src/xio/include/xio_base.h b/open_src/xio/include/xio_base.h new file mode 100644 index 0000000..9044e77 --- /dev/null +++ b/open_src/xio/include/xio_base.h @@ -0,0 +1,1545 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef XIO_BASE_H +#define XIO_BASE_H + +#ifdef __cplusplus +extern "C" { +#endif +/*---------------------------------------------------------------------------*/ +/* preprocessor directives */ +/*---------------------------------------------------------------------------*/ +/** + * @def XIO_VERSION + * @brief accelio current api version number + */ +#define XIO_VERSION 0x0100 + +/** + * @def XIO_IOVLEN + * @brief array size of data IO vector in message + */ +#define XIO_IOVLEN 10 + +/** + * @def XIO_MAX_IOV + * @brief maximum size of data IO vector in message + */ +#define XIO_MAX_IOV 256 + +/*---------------------------------------------------------------------------*/ +/* opaque data structures */ +/*---------------------------------------------------------------------------*/ +struct xio_context; /* xio context */ +struct xio_server; /* server handle */ +struct xio_session; /* session handle */ +struct xio_connection; /* connection handle */ +struct xio_mr; /* registered memory handle */ + +/*---------------------------------------------------------------------------*/ +/* accelio extended errors */ +/*---------------------------------------------------------------------------*/ + +/** + * A number random enough not to collide with different errno ranges + * The assumption is that errno is at least 32-bit type + */ +#define XIO_BASE_STATUS 1247689300 + +/** + * @enum xio_status + * @brief accelio's extended error codes + */ +enum xio_status { + XIO_E_SUCCESS = 0, + XIO_E_NOT_SUPPORTED = XIO_BASE_STATUS, + XIO_E_NO_BUFS = (XIO_BASE_STATUS + 1), + XIO_E_CONNECT_ERROR = (XIO_BASE_STATUS + 2), + XIO_E_ROUTE_ERROR = (XIO_BASE_STATUS + 3), + XIO_E_ADDR_ERROR = (XIO_BASE_STATUS + 4), + XIO_E_UNREACHABLE = (XIO_BASE_STATUS + 5), + XIO_E_MSG_SIZE = (XIO_BASE_STATUS + 6), + XIO_E_PARTIAL_MSG = (XIO_BASE_STATUS + 7), + XIO_E_MSG_INVALID = (XIO_BASE_STATUS + 8), + XIO_E_MSG_UNKNOWN = (XIO_BASE_STATUS + 9), + XIO_E_SESSION_REFUSED = (XIO_BASE_STATUS + 10), + XIO_E_SESSION_ABORTED = (XIO_BASE_STATUS + 11), + XIO_E_SESSION_DISCONNECTED = (XIO_BASE_STATUS + 12), + XIO_E_SESSION_REJECTED = (XIO_BASE_STATUS + 13), + XIO_E_SESSION_REDIRECTED = (XIO_BASE_STATUS + 14), + XIO_E_SESSION_CLOSED = (XIO_BASE_STATUS + 15), + XIO_E_BIND_FAILED = (XIO_BASE_STATUS + 16), + XIO_E_TIMEOUT = (XIO_BASE_STATUS + 17), + XIO_E_IN_PORGRESS = (XIO_BASE_STATUS + 18), + XIO_E_INVALID_VERSION = (XIO_BASE_STATUS + 19), + XIO_E_NOT_SESSION = (XIO_BASE_STATUS + 20), + XIO_E_OPEN_FAILED = (XIO_BASE_STATUS + 21), + XIO_E_READ_FAILED = (XIO_BASE_STATUS + 22), + XIO_E_WRITE_FAILED = (XIO_BASE_STATUS + 23), + XIO_E_CLOSE_FAILED = (XIO_BASE_STATUS + 24), + XIO_E_UNSUCCESSFUL = (XIO_BASE_STATUS + 25), + XIO_E_MSG_CANCELED = (XIO_BASE_STATUS + 26), + XIO_E_MSG_CANCEL_FAILED = (XIO_BASE_STATUS + 27), + XIO_E_MSG_NOT_FOUND = (XIO_BASE_STATUS + 28), + XIO_E_MSG_FLUSHED = (XIO_BASE_STATUS + 29), + XIO_E_MSG_DISCARDED = (XIO_BASE_STATUS + 30), + XIO_E_STATE = (XIO_BASE_STATUS + 31), + XIO_E_NO_USER_BUFS 
= (XIO_BASE_STATUS + 32), + XIO_E_NO_USER_MR = (XIO_BASE_STATUS + 33), + XIO_E_USER_BUF_OVERFLOW = (XIO_BASE_STATUS + 34), + XIO_E_REM_USER_BUF_OVERFLOW = (XIO_BASE_STATUS + 35), + XIO_E_TX_QUEUE_OVERFLOW = (XIO_BASE_STATUS + 36), + XIO_E_USER_OBJ_NOT_FOUND = (XIO_BASE_STATUS + 37), + XIO_E_PEER_QUEUE_SIZE_MISMATCH = (XIO_BASE_STATUS + 38), + XIO_E_RSP_BUF_SIZE_MISMATCH = (XIO_BASE_STATUS + 39), + XIO_E_LAST_STATUS = (XIO_BASE_STATUS + 40) +}; + +/*---------------------------------------------------------------------------*/ +/* message data type */ +/*---------------------------------------------------------------------------*/ + +/** message request referred type */ +#define XIO_REQUEST (1 << 1) +/** message response referred type */ +#define XIO_RESPONSE (1 << 2) + +/** RDMA message family type */ +#define XIO_RDMA (1 << 3) +/** general message family type */ +#define XIO_MESSAGE (1 << 4) +/** one sided message family type */ +#define XIO_ONE_WAY (1 << 5) + +/** + * @enum xio_msg_type + * @brief supported message types + */ +enum xio_msg_type { + XIO_MSG_TYPE_REQ = (XIO_MESSAGE | XIO_REQUEST), + XIO_MSG_TYPE_RSP = (XIO_MESSAGE | XIO_RESPONSE), + XIO_MSG_TYPE_ONE_WAY = (XIO_ONE_WAY | XIO_REQUEST), + XIO_MSG_TYPE_RDMA = (XIO_RDMA) +}; + +/** + * @enum xio_msg_direction + * @brief message flow direction + */ +enum xio_msg_direction { + XIO_MSG_DIRECTION_OUT, + XIO_MSG_DIRECTION_IN +}; + +/** + * @enum xio_msg_flags + * @brief message level specific flags + */ +enum xio_msg_flags { + /** request read receipt. If the user wants to know that the msg was + * delivered to the recipient he can turn on this flag. Msg is + * considered delivered after callback on_msg (in case msg was + * delivered successfully)/on_msg_error (in case there was an error) + * was called for this msg. This flag can be set for one way msg or + * for request (but not for the response). In case of request: if the + * responder sends the response immediately (from within the on_msg + * callback), the receipt will be piggy backed on the response and the + * requester will receive on_msg_delivered callback, immediately + * followed by the on_msg callback. In case of one way msg or if the + * responder needs to do some asynchronous work before sending a + * response, a special inner msg will be sent to the requester + * triggering on_msg_delivered callback. + */ + XIO_MSG_FLAG_REQUEST_READ_RECEIPT = (1 << 0), + + /** force peer to rdma write. This flag should be enabled in case of + * request-response flow, when the application wants to enforce the + * response to be written using RDMA write (even if the response size + * is smaller than 8k) + */ + XIO_MSG_FLAG_PEER_WRITE_RSP = (1 << 1), + + /** force peer to rdma read. This flag should be enabled when the + * application wants to enforce the request/one way msg to be written + * using RDMA READ (even if the request size is smaller than 8k). + */ + XIO_MSG_FLAG_PEER_READ_REQ = (1 << 2), + + /** request an immediate send completion. Accelio batches cq signals + * to optimize performance. In order to decrease number of cq wake + * ups, on_msg_send_complete callback is called in batches of 16. + * Meaning that every 16th msg (or in case xio_connection was closed) + * will trigger on_msg_send_complete callback for itself and for the + * 15 msgs that preceded it. 
Meaning that if the user sent a number of + * msgs that is not divided by 16 (for example 13), he will not + * receive completion until 16 msgs will be sent or until the + * connection is closed (which ever happens first). In order to + * expedite signaling the user can enable this flag for the last msg + */ + XIO_MSG_FLAG_IMM_SEND_COMP = (1 << 3), + + /** last in batch. Typically, door bell to hardware indicating that + * there are msgs to be sent is rang for every msg. In case the user + * calls xio_send method several times in a row and wants to send the + * msgs in batch, this flag should be enabled for the last msg + */ + XIO_MSG_FLAG_LAST_IN_BATCH = (1 << 4), + + /* [1<<10 and above - reserved for library usage] */ +}; + +/** + * @enum xio_msg_hints + * @brief message level specific hints + */ +enum xio_msg_hints { + /**< message "in" assigned via assign_data_in_buf */ + XIO_MSG_HINT_ASSIGNED_DATA_IN_BUF = (1 << 0), + XIO_MSG_HINT_USER_ALLOC_DATA_BUF = (1 << 1) +}; + +/** + * @enum xio_receipt_result + * @brief message receipt result as sent by the message recipient + */ +enum xio_receipt_result { + XIO_READ_RECEIPT_ACCEPT, + XIO_READ_RECEIPT_REJECT, +}; + +/** + * @enum xio_sgl_type + * @brief message data scatter gather type + */ +enum xio_sgl_type { + XIO_SGL_TYPE_IOV = 0, + XIO_SGL_TYPE_IOV_PTR = 1, + XIO_SGL_TYPE_SCATTERLIST = 2, + XIO_SGL_TYPE_LAST +}; + +/** + * @struct xio_iovec + * @brief IO vector + */ +struct xio_iovec { + void *iov_base; /**< base address */ + size_t iov_len; /**< base length */ +}; + +/** + * @struct xio_msg_pdata + * @brief message private data structure used internally by the library + */ +struct xio_msg_pdata { + struct xio_msg *next; /**< internal library usage */ + struct xio_msg **prev; /**< internal library usage */ +}; + +/** + * @struct xio_sg_table + * @brief scatter gather table data structure + */ +struct xio_sg_table { + uint32_t nents; /**< number of entries */ + uint32_t max_nents; /**< maximum entries */ + /**< allowed */ + + void *sglist; /**< scatter list */ +}; + +struct xio_sge { + uint64_t addr; /* virtual address */ + uint32_t length; /* length */ + uint32_t stag; /* rkey */ +}; + +/** + * @struct xio_rdma_msg + * @brief Describes the source/target memory of an RDMA op + */ +struct xio_rdma_msg { + size_t length; + size_t nents; + struct xio_sge *rsg_list; + int is_read; + int pad; +}; + +/*---------------------------------------------------------------------------*/ +/* XIO context API */ +/*---------------------------------------------------------------------------*/ +/** + * @enum xio_context_attr_mask + * @brief supported context attributes to query/modify + */ +enum xio_context_attr_mask { + XIO_CONTEXT_ATTR_USER_CTX = 1 << 0 +}; + +/** + * @struct xio_context_attr + * @brief context attributes structure + */ +struct xio_context_attr { + void *user_context; /**< private user context to */ + /**< pass to connection */ + /**< oriented callbacks */ +}; + +/** + * closes the xio context and free its resources + * + * @param[in] ctx Pointer to the xio context handle + * + */ +void xio_context_destroy(struct xio_context *ctx); + +/** + * modify context parameters + * + * @param[in] ctx The xio context handle + * @param[in] attr The context attributes structure + * @param[in] attr_mask Attribute mask to modify + * + * @return 0 on success, or -1 on error. If an error occurs, call + * xio_errno function to get the failure reason. 
+ */ +int xio_modify_context(struct xio_context *ctx, + struct xio_context_attr *attr, + int attr_mask); + +/** + * get context attributes + * + * @param[in] ctx The xio context handle + * @param[in] attr The context attributes structure + * @param[in] attr_mask Attribute mask to query + * + * @return 0 on success, or -1 on error. If an error occurs, call + * xio_errno function to get the failure reason. + * + */ +int xio_query_context(struct xio_context *ctx, + struct xio_context_attr *attr, + int attr_mask); + +/** + * poll for events using direct access to the event signaling resources + * (e.g. hw event queues) associated with the context; + * polling is performed continuously (by busy-waiting) during the specified + * timeout period + * + * all events which become pending during that time are handled and the user + * callbacks are called as appropriate for those events + * + * as there is no way to interrupt the loop, infinite polling is unsupported; + * the polling period may be limited internally by an unspecified value + * + * @param[in] ctx Pointer to the xio context handle + * @param[in] timeout_us number of microseconds to poll for events + * 0 : just poll instantly, don't busy-wait; + * + * @return 0 on success, or -1 on error. If an error occurs, call + * xio_errno function to get the failure reason. + * + * @note supported only for RDMA + */ +int xio_context_poll_completions(struct xio_context *ctx, int timeout_us); + +/*---------------------------------------------------------------------------*/ +/* XIO session API */ +/*---------------------------------------------------------------------------*/ +/** + * @enum xio_session_type + * @brief session's type definition + */ +enum xio_session_type { + XIO_SESSION_CLIENT, /**< represents the active side that initiate */ + /**< connection */ + XIO_SESSION_SERVER, /**< represents the passive side that listen to */ + /**< incoming connections */ +}; + +/** + * @enum xio_proto + * @brief session's transport protocol as received on the server side upon + * new session request + */ +enum xio_proto { + XIO_PROTO_RDMA, /**< Infiniband's RDMA protocol */ + XIO_PROTO_TCP /**< TCP protocol - userspace only */ +}; + +/** + * @enum xio_session_event + * @brief session events + */ +enum xio_session_event { + XIO_SESSION_REJECT_EVENT, /**< session reject event */ + XIO_SESSION_TEARDOWN_EVENT, /**< session teardown event */ + XIO_SESSION_NEW_CONNECTION_EVENT, /**< new connection event */ + XIO_SESSION_CONNECTION_ESTABLISHED_EVENT, /**< connection established */ + XIO_SESSION_CONNECTION_TEARDOWN_EVENT, /**< connection teardown event*/ + XIO_SESSION_CONNECTION_CLOSED_EVENT, /**< connection closed event*/ + XIO_SESSION_CONNECTION_DISCONNECTED_EVENT, /**< disconnection event */ + XIO_SESSION_CONNECTION_REFUSED_EVENT, /**< connection refused event*/ + XIO_SESSION_CONNECTION_ERROR_EVENT, /**< connection error event */ + XIO_SESSION_ERROR_EVENT, /**< session error event */ + XIO_SESSION_CONNECTION_RECONNECTING_EVENT, /**< connection reconnecting event */ + XIO_SESSION_CONNECTION_RECONNECTED_EVENT, /**< connection reconnected event */ +}; + +/** + * @enum xio_session_attr_mask + * @brief supported session attributes to query/modify + */ +enum xio_session_attr_mask { + XIO_SESSION_ATTR_USER_CTX = 1 << 0, + XIO_SESSION_ATTR_SES_OPS = 1 << 1, + XIO_SESSION_ATTR_URI = 1 << 2 +}; + +/** + * @struct xio_session_params + * @brief session creation params + */ +struct xio_session_params { + enum xio_session_type type; /**< The type of the session */ + + uint32_t 
initial_sn; /**< initial serial number */ + /**< to start with */ + + struct xio_session_ops *ses_ops; /**< session's ops callbacks */ + void *user_context; /**< session user context */ + void *private_data; /**< private user data snt to */ + /**< server upon new session */ + size_t private_data_len; /**< private data length */ + const char *uri; /**< the uri */ +}; + +/** + * @struct xio_session_attr + * @brief session attributes + */ +struct xio_session_attr { + struct xio_session_ops *ses_ops; /**< session's ops callbacks */ + void *user_context; /**< session user context */ + char *uri; /**< the uri */ +}; + +/** + * @struct xio_session_event_data + * @brief session event callback parameters + */ +struct xio_session_event_data { + struct xio_connection *conn; /**< connection object */ + void *conn_user_context; /**< user context */ + enum xio_session_event event; /**< the specific event */ + enum xio_status reason; /**< elaborated message */ + void *private_data; /**< user private data */ + /**< relevant to reject */ + size_t private_data_len; /**< private length */ +}; + +/** + * @struct xio_new_session_req + * @brief new session request message + */ +struct xio_new_session_req { + char *uri; /**< the uri */ + void *private_data; /**< client private data */ + uint16_t uri_len; /**< uri length */ + uint16_t private_data_len; /**< private data length */ + enum xio_proto proto; /**< source protocol type */ + struct sockaddr_storage src_addr; /**< source address of */ + /**< requester */ +}; + +/** + * @struct xio_new_session_rsp + * @brief new session response message + */ +struct xio_new_session_rsp { + void *private_data; /**< server private data */ + uint16_t private_data_len;/**< private data length */ + uint16_t reserved[3]; /**< structure alignment */ +}; + +/** + * @struct xio_session_ops + * @brief user provided callback functions that handles various session events + */ +struct xio_session_ops { + /** + * generic error event notification + * + * @param[in] session the session + * @param[in] data session event data information + * @param[in] cb_user_context user private data provided in session + * open + * @return 0 + */ + int (*on_session_event)(struct xio_session *session, + struct xio_session_event_data *data, + void *cb_user_context); + + /** + * new session notification - server side only + * + * @param[in] session the session + * @param[in] req new session request information + * @param[in] cb_user_context user private data provided in session + * open + * @return 0 + */ + int (*on_new_session)(struct xio_session *session, + struct xio_new_session_req *req, + void *cb_user_context); + + /** + * session established notification - client side only + * + * @param[in] session the session + * @param[in] rsp new session's response information + * @param[in] cb_user_context user private data provided in session + * open + * @return 0 + */ + int (*on_session_established)(struct xio_session *session, + struct xio_new_session_rsp *rsp, + void *cb_user_context); + + /** + * send completion notification - responder only + * + * @param[in] session the session + * @param[in] rsp the response that was sent from + * responder + * @param[in] cb_user_context user private data provided in + * xio_bind + * @return 0 + */ + int (*on_msg_send_complete)(struct xio_session *session, + struct xio_msg *rsp, + void *conn_user_context); + + /** + * message head buf alloc + * + * @param[in] session the session + * @param[in] head_len head len + * @param[in] head_buf buf + * @param[in] 
conn_user_context user private data provided in + * connection open on which + * the message send + * @return 0 + */ + int (*rev_msg_head_alloc_buf)(struct xio_session *session, + struct xio_iovec *header, + void *conn_user_context); + + /** + * message data buf alloc + * + * @param[in] session the session + * @param[in] head_len head len + * @param[in] head_buf buf + * @param[in] conn_user_context user private data provided in + * connection open on which + * the message send + * @return 0 + */ + int (*rev_msg_data_alloc_buf)(struct xio_session *session, + struct xio_msg *msg, + void *conn_user_context); + + /** + * message arrived notification + * + * @param[in] session the session + * @param[in] msg the incoming message + * @param[in] last_in_rxq hint that more incoming messages + * are expected + * @param[in] conn_user_context user private data provided in + * connection open on which + * the message send + * @return 0 + */ + int (*on_msg)(struct xio_session *session, + struct xio_msg *msg, + int last_in_rxq, + void *conn_user_context); + + /** + * one way message delivery receipt notification + * + * @param[in] session the session + * @param[in] msg the incoming message + * @param[in] last_in_rxq hint that more incoming messages + * are expected + * @param[in] conn_user_context user private data provided in + * connection open on which + * the message send + * @return 0 + */ + int (*on_msg_delivered)(struct xio_session *session, + struct xio_msg *msg, + int last_in_rxq, + void *conn_user_context); + + /** + * message error notification + * + * @param[in] session the session + * @param[in] error the error code + * @param[in] msg the incoming message + * @param[in] conn_user_context user private data provided in + * connection open on which + * the message send + * @return 0 + */ + int (*on_msg_error)(struct xio_session *session, + enum xio_status error, + enum xio_msg_direction, + struct xio_msg *msg, + void *conn_user_context); + + /** + * requester's message cancellation notification + * + * @param[in] session the session + * @param[in] result the result code + * @param[in] msg the message to cancel + * @param[in] conn_user_context user private data provided in + * connection open on which + * the message send + * @return 0 + */ + int (*on_cancel)(struct xio_session *session, + struct xio_msg *msg, + enum xio_status result, + void *conn_user_context); + + /** + * responder's message cancellation notification + * + * @param[in] session the session + * @param[in] req the request to cancel + * @param[in] conn_user_context user private data provided in + * connection open on which + * the message send + * @return 0 + */ + int (*on_cancel_request)(struct xio_session *session, + struct xio_msg *msg, + void *conn_user_context); + + /** + * notify the user to assign a data buffer for incoming read + * + * @param[in] msg the incoming message + * @param[in] conn_user_context user private data provided in + * connection open on which + * the message send + * @return 0 + */ + int (*assign_data_in_buf)(struct xio_msg *msg, + void *conn_user_context); + + /** + * sender's send completion notification - one way message only + * + * @param[in] session the session + * @param[in] msg the sent message + * @param[in] conn_user_context user private data provided on + * connection creation + * + * @return 0 + * @note called only if "read receipt" was not requested + */ + int (*on_ow_msg_send_complete)(struct xio_session *session, + struct xio_msg *msg, + void *conn_user_context); + + /** + * RDMA direct 
completion notification + * + * @param[in] session the session + * @param[in] msg the sent message + * @param[in] conn_user_context user private data provided on + * connection creation + * + * @returns 0 + */ + int (*on_rdma_direct_complete)(struct xio_session *session, + struct xio_msg *msg, + void *conn_user_context); +}; + +/** + * creates new requester session + * + * @param[in] params session creations parameters + * + * @return xio session context, or NULL upon error + */ +struct xio_session *xio_session_create(struct xio_session_params *params); + +/** + * teardown an opened session + * + * @param[in] session The xio session handle + * + * @return 0 on success, or -1 on error. If an error occurs, call + * xio_errno function to get the failure reason. + */ +int xio_session_destroy(struct xio_session *session); + +/** + * query session parameters + * + * @param[in] session The xio session handle + * @param[in] attr The session attributes structure + * @param[in] attr_mask attribute mask to query + * + * @return 0 on success, or -1 on error. If an error occurs, call + * xio_errno function to get the failure reason. + */ +int xio_query_session(struct xio_session *session, + struct xio_session_attr *attr, + int attr_mask); + +/** + * modify session parameters + * + * @param[in] session The xio session handle + * @param[in] attr The session attributes structure + * @param[in] attr_mask attribute mask to query + * + * @return 0 on success, or -1 on error. If an error occurs, call + * xio_errno function to get the failure reason. + */ +int xio_modify_session(struct xio_session *session, + struct xio_session_attr *attr, + int attr_mask); + +/** + * maps session event code to event string + * + * @param[in] event The session event + * + * @return a string that describes the event code + */ +const char *xio_session_event_str(enum xio_session_event event); + +/*---------------------------------------------------------------------------*/ +/* XIO connection API */ +/*---------------------------------------------------------------------------*/ +/** + * @enum xio_connection_attr_mask + * @brief supported connection attributes to query/modify + */ +enum xio_connection_attr_mask { + XIO_CONNECTION_ATTR_CTX = 1 << 0, + XIO_CONNECTION_ATTR_USER_CTX = 1 << 1, + XIO_CONNECTION_ATTR_PROTO = 1 << 2, + XIO_CONNECTION_ATTR_PEER_ADDR = 1 << 3, + XIO_CONNECTION_ATTR_LOCAL_ADDR = 1 << 4, + XIO_CONNECTION_ATTR_DISCONNECT_TIMEOUT = 1 << 5, +}; + +/** + * @struct xio_connection_attr + * @brief connection attributes structure + */ +struct xio_connection_attr { + void *user_context; /**< private user context to */ + /**< pass to connection */ + /**< oriented callbacks */ + struct xio_context *ctx; /**< context data type */ + uint8_t tos; /**< type of service RFC 2474 */ + uint8_t pad; /**< padding */ + uint16_t disconnect_timeout_secs; + enum xio_proto proto; /**< protocol type */ + struct sockaddr_storage peer_addr; /**< address of peer */ + struct sockaddr_storage local_addr; /**< address of local */ +}; + +/** + * @struct xio_connection_params + * @brief connection attributes structure + */ +struct xio_connection_params { + struct xio_session *session; /**< xio session handle */ + struct xio_context *ctx; /**< xio context handle */ + uint32_t conn_idx; /**< Connection index greater */ + /**< then 0 if 0 - auto count */ + uint8_t enable_tos; /**< explicitly enable tos */ + uint8_t tos; /**< type of service RFC 2474 */ + + /**< disconnect timeout in seconds */ + uint16_t disconnect_timeout_secs; + + /**< bounded 
outgoing interface address and/or port - NULL if not */ + /**< specified in form: */ + /**< host:port, host:, host, :port. */ + /**< [host]:port, [host]:, [host]. [ipv6addr]:port, [ipv6addr]:, */ + /**< [ipv6addr]. */ + const char *out_addr; + + /**< Private data pointer to pass to each connection callback */ + void *conn_user_context; +}; + +/** + * creates connection handle + * + * @param[in] cparams The xio connection parameters structure + * + * @return xio connection, or NULL upon error + */ +struct xio_connection *xio_connect(struct xio_connection_params *cparams); + +/** + * teardown an opened connection + * + * @param[in] conn The xio connection handle + * + * @return 0 on success, or -1 on error. If an error occurs, call + * xio_errno function to get the failure reason. + */ +int xio_disconnect(struct xio_connection *conn); + +/** + * free connection object + * + * @param[in] conn The xio connection handle + * + * @return 0 on success, or -1 on error. If an error occurs, call + * xio_errno function to get the failure reason. + */ +int xio_connection_destroy(struct xio_connection *conn); + +/** + * modify connection parameters + * + * @param[in] conn The xio connection handle + * @param[in] attr The connection attributes structure + * @param[in] attr_mask Attribute mask to modify + * + * @return 0 on success, or -1 on error. If an error occurs, call + * xio_errno function to get the failure reason. + */ +int xio_modify_connection(struct xio_connection *conn, + struct xio_connection_attr *attr, + int attr_mask); +/** + * query connection parameters + * + * @param[in] conn The xio connection handle + * @param[in] attr The connection attributes structure + * @param[in] attr_mask attribute mask to modify + * + * @return 0 on success, or -1 on error. If an error occurs, call + * xio_errno function to get the failure reason. + */ +int xio_query_connection(struct xio_connection *conn, + struct xio_connection_attr *attr, + int attr_mask); + +/** + * @enum xio_connection_optname + * @brief connection option name + */ +enum xio_connection_optname { + XIO_CONNECTION_FIONWRITE_BYTES, /**< uint64_t: the number of bytes */ + /**< in send queue */ + XIO_CONNECTION_FIONWRITE_MSGS, /**< int: the number of msgs in */ + /**< send queue */ + XIO_CONNECTION_LEADING_CONN /**< int: check if connection is leading: */ + /**<1 for leading conn, 0 otherwise */ +}; + +/** + * get xio_connections's info + * + * @param[in] connection Pointer to xio_connection + * @param[in] con_optname Get value of this option. + * (@ref xio_connection_optname) + * @param[in,out] optval A pointer to the buffer in which the value + * for the requested option is specified + * @param[in,out] optlen The size, in bytes, of the buffer pointed to by + * the optval parameter + * + * @return 0 on success, or -1 on error. If an error occurs, call + * xio_errno function to get the failure reason. + */ +int xio_connection_ioctl(struct xio_connection *connection, int con_optname, + void *optval, int *optlen); +/** + * send request to responder + * + * @param[in] conn The xio connection handle + * @param[in] req request message to send + * + * @return 0 on success, or -1 on error. If an error occurs, call + * xio_errno function to get the failure reason. + */ +int xio_send_request(struct xio_connection *conn, + struct xio_msg *req); + +/** + * send response back to requester + * + * @param[in] rsp Response to send + * + * @return 0 on success, or -1 on error. If an error occurs, call + * xio_errno function to get the failure reason. 
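+ *
+ * Editorial example (not part of the original Accelio header): a minimal
+ * responder sketch built on the request/response pattern described above;
+ * my_on_msg and the statically allocated rsp are hypothetical, and response
+ * payload setup and error handling are omitted.
+ * @code
+ *	static struct xio_msg rsp;
+ *
+ *	static int my_on_msg(struct xio_session *session, struct xio_msg *req,
+ *			     int last_in_rxq, void *conn_user_context)
+ *	{
+ *		rsp.request = req;	/* attach the request being answered */
+ *		return xio_send_response(&rsp);
+ *	}
+ * @endcode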
+ */ +int xio_send_response(struct xio_msg *rsp); + +/** + * cancel an outstanding asynchronous I/O request + * + * @param[in] conn The xio connection handle on which the message was + * sent + * @param[in] req request message to cancel + * + * @return 0 on success, or -1 on error. If an error occurs, call + * xio_errno function to get the failure reason. + */ +int xio_cancel_request(struct xio_connection *conn, + struct xio_msg *req); +/** + * responder cancellation response + * + * @param[in] req the outstanding request to cancel + * @param[in] result responder cancellation code + * + * @return 0 on success, or -1 on error. If an error occurs, call + * xio_errno function to get the failure reason. + */ +int xio_cancel(struct xio_msg *req, enum xio_status result); + +/** + * release response resources back to xio + * + * @note the message itself is allocated by the application + * and is not freed by this function + * + * @param[in] rsp The released response + * + * @return 0 on success, or -1 on error. If an error occurs, call + * xio_errno function to get the failure reason. + */ +int xio_release_response(struct xio_msg *rsp); + +/** + * send one way message to remote peer + * + * @param[in] conn The xio connection handle + * @param[in] msg The message to send + * + * @return 0 on success, or -1 on error. If an error occurs, call + * xio_errno function to get the failure reason. + */ +int xio_send_msg(struct xio_connection *conn, + struct xio_msg *msg); + +/** + * send direct RDMA read/write command + * + * @param[in] conn The xio connection handle + * @param[in] msg The message describing the RDMA op + * + * @returns success (0), or a (negative) error value + */ +int xio_send_rdma(struct xio_connection *conn, + struct xio_msg *msg); + +/** + * release one way message resources back to xio when message is no longer + * needed + * + * @param[in] msg The released message + * + * @return 0 on success, or -1 on error. If an error occurs, call + * xio_errno function to get the failure reason. + */ +int xio_release_msg(struct xio_msg *msg); + +/*---------------------------------------------------------------------------*/ +/* XIO rkey management */ +/*---------------------------------------------------------------------------*/ + +struct xio_managed_rkey; + +/** + * Get raw rkey value from a managed rkey. + * + * @note Should only be used with the connection the managed key is + * registered with. + * + * @param[in] managed_rkey The managed rkey + * + * @return raw rkey + */ +uint32_t xio_managed_rkey_unwrap( + const struct xio_managed_rkey *managed_rkey); + +/** + * Register a remote rkey with a connection such that it will be automatically + * updated on reconnects, failovers, etc. + * + * @param[in] connection connection + * @param[in] raw_rkey A raw rkey received through connection from the + * other side. + * @return The managed rkey, or NULL if failed. + */ +struct xio_managed_rkey *xio_register_remote_rkey( + struct xio_connection *connection, uint32_t raw_rkey); + +/** + * Unregister a remote rkey from connection such that it will no longer + * be automatically updated on reconnects, failovers, etc. 
+ * + * @param[in] managed_rkey The managed rkey + */ +void xio_unregister_remote_key(struct xio_managed_rkey *managed_rkey); + +/*---------------------------------------------------------------------------*/ +/* XIO server API */ +/*---------------------------------------------------------------------------*/ +/** + * open a server listener object + * + * @param[in] ctx The xio context handle + * @param[in] ops Structure of server's event handlers + * @param[in] uri Uri to connect or to bind + * @param[in] src_port Returned listen port in host order, can be NULL + * if not needed + * @param[in] flags Message related flags as defined in enum xio_msg_flags + * @param[in] cb_user_context Private data pointer to pass to each callback + * + * @return xio server context, or NULL upon error + */ +struct xio_server *xio_bind(struct xio_context *ctx, + struct xio_session_ops *ops, + const char *uri, + uint16_t *src_port, + uint32_t flags, + void *cb_user_context); + +/** + * teardown a server + * + * @param[in] server The xio server handle + * + * @return 0 on success, or -1 on error. If an error occurs, call + * xio_errno function to get the failure reason. + */ +int xio_unbind(struct xio_server *server); + +/** + * accept new session or "light redirect" it to anther thread + * + * @param[in] session The xio session handle + * @param[in] portals_array string array of alternative portals to the + * resource in form of "rdma://host:port" + * "rdma://127.0.0.1:1234" + * @param[in] portals_array_len The string array length + * @param[in] private_data References a user-controlled data buffer + * The contents of the buffer are copied and + * transparently passed to the remote side as + * part of the communication request. May be + * NULL if user_context is not required + * @param[in] private_data_len Specifies the size of the user-controlled + * data buffer + * + * @return 0 on success, or -1 on error. If an error occurs, call + * xio_errno function to get the failure reason. + */ +int xio_accept(struct xio_session *session, + const char **portals_array, + size_t portals_array_len, + void *private_data, + size_t private_data_len); + +/** + * redirect connecting session to connect to alternative resources + * + * @param[in] session The xio session handle + * @param[in] portals_array string array of alternative portals to the + * resource in form of "rdma://host:port" + * "rdma://127.0.0.1:1234" + * @param[in] portals_array_len The string array length + * + * @return 0 on success, or -1 on error. If an error occurs, call + * xio_errno function to get the failure reason. + */ +int xio_redirect(struct xio_session *session, + const char **portals_array, + size_t portals_array_len); + +/** + * reject a connecting session + * + * + * @param[in] session The xio session handle + * @param[in] reason Reason for rejection + * @param[in] private_data References a user-controlled data buffer + * The contents of the buffer are copied and + * transparently passed to the peer as part + * of the communication request. May be NULL + * if user_context is not required + * @param[in] private_data_len Specifies the size of the user-controlled + * data buffer + * + * @return 0 on success, or -1 on error. If an error occurs, call + * xio_errno function to get the failure reason. 
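+ *
+ * Editorial example (not part of the original Accelio header): a minimal
+ * on_new_session sketch that either accepts the session on the listener
+ * portal or rejects it; my_on_new_session, peer_is_allowed() and
+ * my_reject_code are hypothetical application names.
+ * @code
+ *	static int my_on_new_session(struct xio_session *session,
+ *				     struct xio_new_session_req *req,
+ *				     void *cb_user_context)
+ *	{
+ *		if (peer_is_allowed(req))	/* hypothetical policy check */
+ *			return xio_accept(session, NULL, 0, NULL, 0);
+ *
+ *		/* my_reject_code is an application-chosen enum xio_status value */
+ *		return xio_reject(session, my_reject_code, NULL, 0);
+ *	}
+ * @endcode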
+ */ +int xio_reject(struct xio_session *session, + enum xio_status reason, + void *private_data, + size_t private_data_len); + +/*---------------------------------------------------------------------------*/ +/* XIO configuration tuning API */ +/*---------------------------------------------------------------------------*/ +/** + * @enum xio_log_level + * @brief logging levels + */ +enum xio_log_level { + XIO_LOG_LEVEL_FATAL, /**< fatal logging level */ + XIO_LOG_LEVEL_ERROR, /**< error logging level */ + XIO_LOG_LEVEL_WARN, /**< warnings logging level */ + XIO_LOG_LEVEL_INFO, /**< informational logging level */ + XIO_LOG_LEVEL_DEBUG, /**< debugging logging level */ + XIO_LOG_LEVEL_TRACE, /**< tracing logging level */ + XIO_LOG_LEVEL_LAST +}; + +/** + * @enum xio_optlevel + * @brief configuration tuning option level + */ +enum xio_optlevel { + XIO_OPTLEVEL_ACCELIO, /**< General library option level */ + XIO_OPTLEVEL_RDMA, /**< RDMA transport level */ + XIO_OPTLEVEL_TCP, /**< TCP transport level */ +}; + +/** + * @enum xio_optname + * @brief configuration tuning option name + */ +enum xio_optname { + /* XIO_OPTLEVEL_ACCELIO */ + /** disable huge pages allocation. This flag is disabled by default. + * Typically, accelio allocates memory in huge pages (usually 2M) and + * not in regular pages (4k). This flag will cause accelio to allocate + * memory in regular pages. + */ + XIO_OPTNAME_DISABLE_HUGETBL = 0, + /** set user log function */ + XIO_OPTNAME_LOG_FN, + /** set/get logging level */ + XIO_OPTNAME_LOG_LEVEL, + /** set customed allocators hooks */ + XIO_OPTNAME_MEM_ALLOCATOR, + /**< enables/disables connection's keep alive. type: int */ + XIO_OPTNAME_ENABLE_KEEPALIVE, + /**< configure keep alive variables.type: struct xio_options_keepalive*/ + XIO_OPTNAME_CONFIG_KEEPALIVE, + + /* XIO_OPTLEVEL_ACCELIO/RDMA/TCP */ + /** message's max in iovec. This flag indicates what will be the max + * in iovec for xio_msg. In case the in iovec size is smaller than the + * default, it is best to configure it in order to save memory. + */ + XIO_OPTNAME_MAX_IN_IOVLEN = 100, + /** message's max out iovec. This flag indicates what will be the max + * out iovec for xio_msg. It is best to configure it to the out iovec + * size that the application uses in order to save memory. + */ + XIO_OPTNAME_MAX_OUT_IOVLEN, + /** enables the dma latency. Disables the CPU hybernation, triggering + * a higher power consumption. Hybernating can also be prevented via a + * system CPU policy. This is an override to it. This flag will be + * deprecated soon + */ + XIO_OPTNAME_ENABLE_DMA_LATENCY, + /** enables reconnection. This flag is disabled by default and should + * only be activated it when bonding is turned on. Otherwise, it will + * not handle the failover in a timely fashion. (Will take a very long + * or exponential time to disconnect). + */ + XIO_OPTNAME_ENABLE_RECONNECT, + /** enables byte based flow control. Application-driven flow control, + * based on app releasing each message. Especially suitable for + * one-way messages. Also works on the initiator side, if he doesn't + * release messages, he will stop getting responses. When client sends + * multiple msgs to server and it takes a lot of time for the server + * to process them, it can cause the server side to run out of memory. + * This case is more common in one way msgs. User can configure flow + * control and the msgs will stay in queues on client side. Both sides + * need to configure the queue depth to be the same. 
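+ *
+ * Editorial example (not part of the original Accelio header): a sketch of
+ * enabling flow control and pinning the message queue depth via xio_set_opt
+ * (declared later in this file), assuming int-typed option values; both
+ * peers are expected to configure the same depth.
+ * @code
+ *	int on = 1, depth = 512;
+ *
+ *	xio_set_opt(NULL, XIO_OPTLEVEL_ACCELIO,
+ *		    XIO_OPTNAME_ENABLE_FLOW_CONTROL, &on, sizeof(on));
+ *	xio_set_opt(NULL, XIO_OPTLEVEL_ACCELIO,
+ *		    XIO_OPTNAME_SND_QUEUE_DEPTH_MSGS, &depth, sizeof(depth));
+ *	xio_set_opt(NULL, XIO_OPTLEVEL_ACCELIO,
+ *		    XIO_OPTNAME_RCV_QUEUE_DEPTH_MSGS, &depth, sizeof(depth));
+ * @endcode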
+ */ + XIO_OPTNAME_ENABLE_FLOW_CONTROL, + /** maximum tx queued msgs. Default value is 1024 */ + XIO_OPTNAME_SND_QUEUE_DEPTH_MSGS, + /** maximum rx queued msgs. Default value is 1024 */ + XIO_OPTNAME_RCV_QUEUE_DEPTH_MSGS, + /** maximum tx queued bytes. Default value is 64M */ + XIO_OPTNAME_SND_QUEUE_DEPTH_BYTES, + /** maximum rx queued bytes. Default value is 64M */ + XIO_OPTNAME_RCV_QUEUE_DEPTH_BYTES, + /** configure internal memory pool. In case the user wants to + * configure accelio's memory slab, he needs to pass this flag. + */ + XIO_OPTNAME_CONFIG_MEMPOOL, + + /** set/get max inline XIO header size. If the application sends small + * header this flag can be configured in order to save memory. Default + * value is 256 + */ + XIO_OPTNAME_MAX_INLINE_XIO_HEADER, + + /** set/get max inline XIO data size. This flag is used to set/get the + * max inline xio data. If the application sends small data this flag + * can be configured in order to save memory. Default value is 8k. + */ + XIO_OPTNAME_MAX_INLINE_XIO_DATA, + /** set/get alignment of data buffer address. Used to configure buffer + * alignment inside accelio's internal pool. + */ + XIO_OPTNAME_XFER_BUF_ALIGN, + /** set/get alignment of inline xio data buffer address */ + XIO_OPTNAME_INLINE_XIO_DATA_ALIGN, + + /* XIO_OPTLEVEL_RDMA/TCP */ + /** enables the internal transport memory pool. This flag is enabled + * by default. Accelio provides its own memory pool. In case the user + * knows that when sending large data (via RDMA read/write) the memory + * is always registered this pool can be disabled in order to save + * memory. This requires the user to implement the "assign_in_buffer" + * and take full ownership on memory registration. In case the user + * will send msg without filling "mr" error is expected. + */ + XIO_OPTNAME_ENABLE_MEM_POOL = 200, + /** time in milliseconds after which the nexus delayed call is + * is triggered, the nexus is released and the transport (tcp + * or rdma) closes. Default value is 60000 milliseconds, which are + * 1 minute. + */ + XIO_OPTNAME_TRANSPORT_CLOSE_TIMEOUT, + + /* XIO_OPTLEVEL_RDMA */ + /** number of RDMA-capable HCAs on the machine. Read only */ + XIO_OPTNAME_RDMA_NUM_DEVICES = 300, + /** Call ibv_fork_init(). Forking with RDMA requires a special + * synchronization. This is a wrapper over a correspondent ib verb, as + * raw verbs are not accessible (calls ibv_fork_init() ) + */ + XIO_OPTNAME_ENABLE_FORK_INIT, + /** Max number of data (bytes) that can be posted inline to the SQ + * passed to ib(v)_create_qp + */ + XIO_OPTNAME_QP_CAP_MAX_INLINE_DATA, + + /* XIO_OPTLEVEL_TCP */ + /** check tcp mr validity. Disable sanity check for proper MRs in case + * of TCP transport. In case this flag is enabled, a check is being + * done whether the application provided MRs. This flag should be + * used only for the development stage: in case the user writes the + * application above tcp and he want to make sure that it would work + * on rdma as well he should enable this flag. For production, or in + * case the development is done when rdma enabled the flag should be + * disabled. This flag is disabled by default. + */ + XIO_OPTNAME_TCP_ENABLE_MR_CHECK = 400, + /** turn-off Nagle algorithm. In case this flag is enabled, tcp socket + * that is created by accelio will have TCP_NODELAY flag. This will + * turn off Nagle algorithm which collects small outgoing packets to + * be sent all at once, thereby improving latency + */ + XIO_OPTNAME_TCP_NO_DELAY, + /** tcp socket send buffer. 
Sets maximum socket send buffer to this + * value (SO_SNDBUF socket option). + */ + XIO_OPTNAME_TCP_SO_SNDBUF, + /** tcp socket receive buffer. Sets maximum socket receive buffer to + * this value (SO_RCVUF socket option) + */ + XIO_OPTNAME_TCP_SO_RCVBUF, + /** performance boost for the price of two fd resources. The flag is + * enabled by default. This flag allows to open 2 sockets for each + * xio_connection. One is for internal accelio headers and the other + * for data. This causes performance boost. The downside: for each + * xio_connection 2 file descriptors are used. + */ + XIO_OPTNAME_TCP_DUAL_STREAM, +}; + +/** + * Callback prototype for libxio log message handler. + * The library user may wish to register their own logging function. + * By default errors go to stderr. + * Use xio_set_opt(NULL, XIO_OPTLEVEL_ACCELIO, XIO_OPTNAME_LOG_FN, NULL, 0) + * to restore the default log fn. + * + *@param[in] file file name from which the callback is called + *@param[in] line the line number in the above file + *@param[in] function name of the function in which the callback is called + *@param[in] level message level (@ref xio_log_level) + *@param[in] fmt printf() format string + * + */ +typedef void (*xio_log_fn)(const char *file, unsigned line, + const char *function, unsigned level, + const char *fmt, ...); + +/** + * @struct xio_mem_allocator + * @brief user provided costumed allocator hook functions for library usage + */ +struct xio_mem_allocator { + void *user_context; /**< user specific context */ + + /** + * allocates block of memory + * + * @param[in] size size in bytes to allocate + * @param[in] user_context user specific context + * + * @return pointer to allocated memory or NULL if allocate fails + */ + void * (*allocate)(size_t size, void *user_context); + + /** + * allocates aligned block of memory and zero it content + * + * @param[in] boundary memory size will be a multiple + * of boundary, which must be a + * power of two and a multiple of + * sizeof(void *) + * @param[in] size size in bytes to allocate + * @param[in] user_context user specific context + * + * @return pointer to allocated memory or NULL if allocate fails + */ + void * (*memalign)(size_t boundary, size_t size, void *user_context); + + /** + * deallocates block of memory + * + * @param[in] ptr pointer to allocated block + * @param[in] user_context user specific context + * + */ + void (*free)(void *ptr, void *user_context); + + /** + * allocates block of memory using huge page + * + * @param[in] size block size to allocate + * @param[in] user_context user specific context + * + * @return pointer to allocated memory or NULL if allocate fails + */ + void * (*malloc_huge_pages)(size_t size, void *user_context); + + /** + * deallocates block of memory previously allocated by + * malloc_huge_pages + * + * @param[in] ptr pointer to allocated block + * @param[in] user_context user specific context + * + * @return pointer to block or NULL if allocate fails + */ + void (*free_huge_pages)(void *ptr, void *user_context); + + /** + * allocates block of memory on specific numa node + * + * @param[in] size block size to allocate + * @param[in] node the numa node + * @param[in] user_context user specific context + * + * @return pointer to allocated memory or NULL if allocate fails + */ + void * (*numa_alloc)(size_t size, int node, void *user_context); + + /** + * deallocates block of memory previously allocated by + * numa_alloc + * + * @param[in] ptr pointer to allocated block + * @param[in] user_context user specific 
context + * + * @return pointer to block or NULL if allocate fails + */ + void (*numa_free)(void *ptr, void *user_context); +}; + +/** + * @struct xio_options_keepalive + * @brief user provided values for connection's keepalive + * Use xio_set_opt(NULL, XIO_OPTLEVEL_ACCELIO, XIO_OPTNAME_CONFIG_KEEPALIVE, + * &ka, sizeof(ka))) + */ +struct xio_options_keepalive { + /**< the number of unacknowledged probes to send before considering */ + /**< the connection dead and notifying the application layer */ + int probes; + + /**< the heartbeat interval in seconds between two initial */ + /**< keepalive probes. */ + int time; + + /**< the interval in seconds between subsequential keepalive probes, */ + /**< regardless of what the connection has exchanged in the meantime */ + int intvl; +}; + +#define XIO_MAX_SLABS_NR 6 + +/** + * @struct xio_mempool_config + * @brief tuning parameters for internal Accelio's memory pool + * + * Use: xio_set_opt(NULL, XIO_OPTLEVEL_ACCELIO, + * XIO_OPTNAME_CONFIG_MEMPOOL, &mempool_config, + * sizeof(mempool_config)); + * + */ +struct xio_mempool_config { + /**< number of slabs */ + size_t slabs_nr; + + /**< per slab configuration */ + struct xio_mempool_slab_config { + /**< slab's block memory size in bytes */ + size_t block_sz; + + /**< initial number of allocated blocks */ + size_t init_blocks_nr; + /**< growing quantum of block allocations */ + size_t grow_blocks_nr; + /**< maximum number of allocated blocks */ + size_t max_blocks_nr; + } slab_cfg[XIO_MAX_SLABS_NR]; +}; + +/** + * set xio's configuration tuning option + * + * @param[in] xio_obj Pointer to xio object or NULL + * @param[in] level The level at which the option is + * defined (@ref xio_optlevel) + * @param[in] optname The option for which the value is to be set. + * The optname parameter must be a socket option + * defined within the specified level, or behavior + * is undefined (@ref xio_optname) + * @param[in] optval A pointer to the buffer in which the value + * for the requested option is specified + * @param[in] optlen The size, in bytes, of the buffer pointed to by + * the optval parameter + * + * @return 0 on success, or -1 on error. If an error occurs, call + * xio_errno function to get the failure reason. + */ +int xio_set_opt(void *xio_obj, int level, int optname, + const void *optval, int optlen); + +/** + * set xio's configuration tuning option + * + * @param[in] xio_obj Pointer to xio object or NULL + * @param[in] level The level at which the option is + * defined (@ref xio_optlevel) + * @param[in] optname The option for which the value is to be set. + * The optname parameter must be a socket option + * defined within the specified level, or behavior + * is undefined (@ref xio_optname) + * @param[in,out] optval A pointer to the buffer in which the value + * for the requested option is specified + * @param[in,out] optlen The size, in bytes, of the buffer pointed to by + * the optval parameter + * + * @return 0 on success, or -1 on error. If an error occurs, call + * xio_errno function to get the failure reason. 
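+ *
+ * Editorial example (not part of the original Accelio header): a sketch of
+ * reading back the current logging level, assuming the option value is
+ * int-sized; error handling is abbreviated.
+ * @code
+ *	enum xio_log_level level;
+ *	int len = sizeof(level);
+ *
+ *	if (xio_get_opt(NULL, XIO_OPTLEVEL_ACCELIO, XIO_OPTNAME_LOG_LEVEL,
+ *			&level, &len) == 0)
+ *		printf("accelio log level: %d\n", (int)level);
+ * @endcode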
+ */ +int xio_get_opt(void *xio_obj, int level, int optname, + void *optval, int *optlen); + +/*---------------------------------------------------------------------------*/ +/* XIO errors */ +/*---------------------------------------------------------------------------*/ +/** + * resolves system errors and XIO errors to human-readable + * string + * + * @param[in] errnum The xio error code + * + * @return a string that describes the error code + */ +const char *xio_strerror(int errnum); + +/** + * return last xio error + * + * @return last xio error code + */ +int xio_errno(void); + +/** + * Get library version string. + * + * @return Pointer to static buffer in library that holds the version string. + */ +const char *xio_version(void); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/open_src/xio/include/xio_kernel.h b/open_src/xio/include/xio_kernel.h new file mode 100644 index 0000000..6d61899 --- /dev/null +++ b/open_src/xio/include/xio_kernel.h @@ -0,0 +1,321 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ +#ifndef XIO_API_H +#define XIO_API_H + +#include +#include +#include +#include +#include +#include +#include "xio_base.h" + +#define DRV_VERSION "0.1" +#define DRV_RELDATE "2013-Oct-01" + +/** + * @struct xio_reg_mem + * @brief registered memory buffer descriptor + * (Compatibility with user mode) + */ +struct xio_reg_mem { + void *addr; /**< buffer's memory address */ + size_t length; /**< buffer's memory length */ +}; + +/*---------------------------------------------------------------------------*/ +/* message data type */ +/*---------------------------------------------------------------------------*/ +/** + * @struct xio_iovec_ex + * @brief extended IO vector + */ +struct xio_iovec_ex { + void *iov_base; /**< base address */ + size_t iov_len; /**< base length */ + void *user_context; /**< private user data */ +}; + +/** + * @struct xio_sg_iov + * @brief scatter gather iovec vector data structure + */ +struct xio_sg_iov { + uint32_t nents; /**< number of entries */ + uint32_t max_nents; /**< maximum entries */ + /**< allowed */ + + struct xio_iovec_ex sglist[XIO_IOVLEN]; /**< scatter vec */ + +}; + +/** + * @struct xio_sg_iovptr + * @brief scatter gather iovec pointer data structure + */ +struct xio_sg_iovptr { + uint32_t nents; /**< number of entries */ + uint32_t max_nents; /**< maximum entries */ + /**< allowed */ + + struct xio_iovec_ex *sglist; /**< scatter list */ +}; + +/** + * @struct xio_vmsg + * @brief message sub element type + */ +struct xio_vmsg { + struct xio_iovec header; /**< header's io vector */ + enum xio_sgl_type sgl_type; + int pad; + struct sg_table data_tbl; /**< data table */ + void *user_context; /**< private user data */ +}; + +/** + * @struct xio_msg + * @brief accelio's message definition + * + * An object representing a message received from or to be sent to another + * peer. 
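+ *
+ * Editorial example (not part of the original Accelio header): a minimal
+ * kernel-side sketch that attaches one contiguous buffer to the outgoing
+ * side of a message using the helpers declared below; buf and len are
+ * hypothetical, and xio_fini_vmsg() releases the table when done.
+ * @code
+ *	struct xio_msg msg;
+ *	struct xio_reg_mem reg = { .addr = buf, .length = len };
+ *
+ *	memset(&msg, 0, sizeof(msg));
+ *	xio_init_vmsg(&msg.out, 1);		/* one scatterlist entry */
+ *	vmsg_sglist_set_by_reg_mem(&msg.out, &reg);
+ * @endcode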
+ */ +struct xio_msg { + union { + uint64_t sn; /* unique message serial number + * returned by the library + */ + struct xio_msg *request; /* on server side - attached + * request + */ + }; + struct xio_vmsg in; /**< incoming side of message */ + struct xio_vmsg out; + struct xio_rdma_msg rdma; /**< RDMA source/target */ + void *user_context; /* for user usage - not sent */ + + enum xio_msg_type type; + enum xio_receipt_result receipt_res; + uint64_t flags; + uint64_t timestamp; /**< submission timestamp */ + uint64_t hints; /**< hints flags from library */ + /**< to application */ + + struct xio_msg_pdata pdata; /**< accelio private data */ + struct xio_msg *next; /* internal use */ +}; + +#define vmsg_sglist_nents(vmsg) \ + (vmsg)->data_tbl.nents + +#define vmsg_sglist_set_nents(vmsg, n) \ + (vmsg)->data_tbl.nents = (n) + +static inline void vmsg_sglist_set_by_reg_mem(struct xio_vmsg *vmsg, + const struct xio_reg_mem *reg_mem) +{ + BUG_ON(vmsg->sgl_type != XIO_SGL_TYPE_SCATTERLIST); + vmsg_sglist_set_nents(vmsg, 1); + sg_init_one(vmsg->data_tbl.sgl, reg_mem->addr, reg_mem->length); +} + +static inline void *vmsg_sglist_one_base(const struct xio_vmsg *vmsg) +{ + struct scatterlist *sg = vmsg->data_tbl.sgl; + + return sg_virt(sg); +} + +static inline size_t vmsg_sglist_one_len(const struct xio_vmsg *vmsg) +{ + const struct scatterlist *sg = vmsg->data_tbl.sgl; + + return sg->length; +} + +static inline void vmsg_sglist_set_user_context(struct xio_vmsg *vmsg, + void *user_context) +{ + vmsg->user_context = user_context; +} + +static inline void *vmsg_sglist_get_user_context(struct xio_vmsg *vmsg) +{ + return vmsg->user_context; +} + +static inline int xio_init_vmsg(struct xio_vmsg *vmsg, unsigned int nents) +{ + int ret; + + vmsg->sgl_type = XIO_SGL_TYPE_SCATTERLIST; + ret = sg_alloc_table(&vmsg->data_tbl, nents, GFP_KERNEL); + vmsg_sglist_set_nents(vmsg, 0); + + return ret; +} + +static inline void xio_fini_vmsg(struct xio_vmsg *vmsg) +{ + sg_free_table(&vmsg->data_tbl); +} + +static inline void xio_init_vmsg_from_sg_table(struct xio_vmsg *vmsg, + const struct sg_table *tbl) +{ + vmsg->sgl_type = XIO_SGL_TYPE_SCATTERLIST; + vmsg->data_tbl = *tbl; + vmsg_sglist_set_nents(vmsg, 0); +} + +static inline void xio_reinit_msg(struct xio_msg *msg) +{ + const struct sg_table in_tbl = msg->in.data_tbl; + const struct sg_table out_tbl = msg->out.data_tbl; + + memset(msg, 0, sizeof(*msg)); + xio_init_vmsg_from_sg_table(&msg->in, &in_tbl); + xio_init_vmsg_from_sg_table(&msg->out, &out_tbl); +} + +/*---------------------------------------------------------------------------*/ +/* XIO context API */ +/*---------------------------------------------------------------------------*/ +#define XIO_LOOP_USER_LOOP 0 +#define XIO_LOOP_GIVEN_THREAD 1 +#define XIO_LOOP_TASKLET 2 +#define XIO_LOOP_WORKQUEUE 3 + +/** + * @typedef xio_ev_handler_t + * @brief event loop callback function + * + * @param[in] data user private data + */ +typedef void (*xio_ev_handler_t)(void *data); + +struct xio_ev_data { + xio_ev_handler_t handler; + void *data; + union { + struct llist_node ev_llist; + struct work_struct work; + }; + volatile unsigned long int states; /* xio private data */ +}; + +/** + * user provided function for adding an event to the event loop + * to be processed on ctx worker context + */ +struct xio_loop_ops { + void *ev_loop; + int (*run)(void *loop); + void (*stop)(void *loop); + int (*add_event)(void *loop, struct xio_ev_data *data); +}; + +/** + * @struct xio_context_params + * @brief context creation 
parameters structure + */ +struct xio_context_params { + + unsigned int flags; /**< creation flags */ + + /* User's structure of callbacks operations for this context + * (case flag XIO_LOOP_USER_LOOP) + */ + struct xio_loop_ops *loop_ops; + + /* kthread if flags XIO_LOOP_GIVEN_THREAD can be current + */ + struct task_struct *worker; + + /**< private user data passed saved on context can be queried/modified */ + /**< via xio_query_context/xio_modify_context */ + void *user_context; + + /**< preallocate and registers rdma inline buffers for send/recv */ + int prealloc_xio_inline_bufs; + + /**< number of connections that this context will handle */ + int max_conns_per_ctx; + + /** depth of receive queue in RDMA. + * pass 0 if want the depth to remain default (XIO_MAX_IOV + constant) */ + int rq_depth; +}; + +/** + * xio_context - creates xio context - a context is mapped internally to + * a cpu core. + * + * @ctx_params: context creation creation flags + * @polling_timeout: polling timeout in microsecs - 0 ignore + * @cpu_hint: -1 (current) + * + * RETURNS: xio context handle, or NULL upon error. + */ +struct xio_context *xio_context_create( + struct xio_context_params *ctx_params, + int polling_timeout, + int cpu_hint); + +/*---------------------------------------------------------------------------*/ +/* XIO default event loop API */ +/* */ +/* Note: xio provides default muxer implementation around epoll. */ +/* users are encouraged to utilize their own implementations and provides */ +/* appropriate services to xio via the xio's context open interface. */ +/*---------------------------------------------------------------------------*/ + +int xio_context_run_loop(struct xio_context *ctx); + +void xio_context_stop_loop(struct xio_context *ctx); + +int xio_context_add_event(struct xio_context *ctx, struct xio_ev_data *data); + +void xio_destroy_context_continue(struct work_struct *work); +/*---------------------------------------------------------------------------*/ +/* XIO debugfs facility */ +/*---------------------------------------------------------------------------*/ +struct dentry *xio_debugfs_root(void); + +#endif /*XIO_API_H */ diff --git a/open_src/xio/include/xio_predefs.h b/open_src/xio/include/xio_predefs.h new file mode 100644 index 0000000..3d1b52b --- /dev/null +++ b/open_src/xio/include/xio_predefs.h @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef XIO_PREDEFS_H +#define XIO_PREDEFS_H + +#if defined(_WIN32) +#include +#define inline __inline + +#else /* !defined(_WIN32) */ + +#include + +#endif /* !defined(_WIN32) */ + +#endif + diff --git a/open_src/xio/include/xio_user.h b/open_src/xio/include/xio_user.h new file mode 100644 index 0000000..fe1a300 --- /dev/null +++ b/open_src/xio/include/xio_user.h @@ -0,0 +1,600 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/** + * @file xio_user.h + * @brief interface file for accelio user space library + */ + +#ifndef XIO_API_H +#define XIO_API_H + +#include +#include +#include +#include "xio_predefs.h" +#include "xio_base.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * @struct xio_reg_mem + * @brief registered memory buffer descriptor + * used by all allocation and registration methods + * it's the user responsibility to save allocation type and use an + * appropriate free method appropriately + */ +struct xio_reg_mem { + void *addr; /**< buffer's memory address */ + size_t length; /**< buffer's memory length */ + struct xio_mr *mr; /**< xio specific memory region */ + void *priv; /**< xio private data */ +}; + +/*---------------------------------------------------------------------------*/ +/* message data type */ +/*---------------------------------------------------------------------------*/ +/** + * @struct xio_iovec_ex + * @brief extended IO vector + */ +struct xio_iovec_ex { + void *iov_base; /**< base address */ + size_t iov_len; /**< base length */ + struct xio_mr *mr; /**< rdma specific memory */ + /**< region */ + void *user_context; /**< private user data */ +}; + +/** + * @struct xio_sg_iov + * @brief scatter gather iovec vector data structure + */ +struct xio_sg_iov { + uint32_t nents; /**< number of entries */ + uint32_t max_nents; /**< maximum entries */ + /**< allowed */ + + struct xio_iovec_ex sglist[XIO_IOVLEN]; /**< scatter vec */ + +}; + +/** + * @struct xio_sg_iovptr + * @brief scatter gather iovec pointer data structure + */ +struct xio_sg_iovptr { + uint32_t nents; /**< number of entries */ + uint32_t max_nents; /**< maximum entries */ + /**< allowed */ + + struct xio_iovec_ex *sglist; /**< scatter list */ +}; + +/** + * @struct xio_sg_iovptr + * @brief scatter gather iovec pointer data structure + */ +struct xio_base_iovptr { + uint32_t nents; /**< number of entries */ + uint32_t max_nents; /**< maximum entries */ + /**< allowed */ + + struct xio_iovec *sglist; /**< iov list */ +}; + +/** + * @struct xio_vmsg + * @brief message sub element type + */ +struct xio_vmsg { + struct xio_iovec header; /**< header's io vector */ + uint64_t total_data_len; /**< data's total lenght */ + enum xio_sgl_type sgl_type; /**< sg list type enum */ + int pad; /**< padding */ + /**< union for different scatter gather representations */ + union { + struct xio_sg_table data_tbl; /**< data table */ + struct xio_sg_iov data_iov; /**< iov vector */ + struct xio_sg_iovptr pdata_iov; /**< iov pointer */ + }; +}; + +/** + * @struct xio_msg + * @brief accelio's message definition + * + * An object representing a message received from or to be sent to another + * peer. 
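+ *
+ * Editorial example (not part of the original Accelio header): a minimal
+ * user-space sketch that builds a request with a header and one data
+ * element and sends it; conn, hdr, hdr_len, data and data_len are
+ * hypothetical application variables.
+ * @code
+ *	struct xio_msg req;
+ *
+ *	memset(&req, 0, sizeof(req));
+ *	req.out.header.iov_base = hdr;
+ *	req.out.header.iov_len  = hdr_len;
+ *	req.out.sgl_type        = XIO_SGL_TYPE_IOV;
+ *	req.out.data_iov.max_nents = XIO_IOVLEN;
+ *	req.out.data_iov.nents     = 1;
+ *	req.out.data_iov.sglist[0].iov_base = data;
+ *	req.out.data_iov.sglist[0].iov_len  = data_len;
+ *	req.out.data_iov.sglist[0].mr       = NULL; /* library-managed */
+ *	xio_send_request(conn, &req);
+ * @endcode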
+ */ +struct xio_msg { + union { + uint64_t sn; /**< unique message serial */ + /**< number returned by the */ + /**< library */ + + struct xio_msg *request; /**< responder - attached */ + /**< the request */ + }; + struct xio_vmsg in; /**< incoming side of message */ + struct xio_vmsg out; /**< outgoing side of message */ + struct xio_rdma_msg rdma; /**< RDMA source/target */ + + void *user_context; /**< private user data */ + /**< not sent to the peer */ + enum xio_msg_type type; /**< message type */ + enum xio_receipt_result receipt_res; /**< the receipt result if */ + uint64_t flags; /**< message flags mask */ + uint64_t usr_flags; /**< message flags mask */ + uint64_t timestamp; /**< submission timestamp */ + uint64_t hints; /**< hints flags from library */ + /**< to application */ + + struct xio_msg_pdata pdata; /**< accelio private data */ + struct xio_msg *next; /**< send list of messages */ +}; + +/** + * helper macros to iterate over scatter lists + */ +#define vmsg_sglist(vmsg) \ + (((vmsg)->sgl_type == XIO_SGL_TYPE_IOV) ? \ + (vmsg)->data_iov.sglist : \ + (((vmsg)->sgl_type == XIO_SGL_TYPE_IOV_PTR) ? \ + (vmsg)->pdata_iov.sglist : NULL)) + +#define vmsg_base_sglist(vmsg) (vmsg)->data_tbl.sglist + + +#define vmsg_sglist_nents(vmsg) \ + (vmsg)->data_tbl.nents + +#define vmsg_sglist_set_nents(vmsg, n) \ + (vmsg)->data_tbl.nents = (n) + +static inline void vmsg_sglist_set_by_reg_mem(struct xio_vmsg *vmsg, + const struct xio_reg_mem *reg_mem) +{ + struct xio_iovec_ex *sgl = vmsg_sglist(vmsg); + + vmsg_sglist_set_nents(vmsg, 1); + sgl[0].iov_base = reg_mem->addr; + sgl[0].iov_len = reg_mem->length; + sgl[0].mr = reg_mem->mr; +} + +static inline void *vmsg_sglist_one_base(const struct xio_vmsg *vmsg) +{ + const struct xio_iovec_ex *sgl = vmsg_sglist(vmsg); + return sgl[0].iov_base; +} + +static inline size_t vmsg_sglist_one_len(const struct xio_vmsg *vmsg) +{ + const struct xio_iovec_ex *sgl = vmsg_sglist(vmsg); + + return sgl[0].iov_len; +} + +static inline void vmsg_sglist_set_user_context(struct xio_vmsg *vmsg, + void *user_context) +{ + struct xio_iovec_ex *sgl = vmsg_sglist(vmsg); + + sgl[0].user_context = user_context; +} + +static inline void *vmsg_sglist_get_user_context(struct xio_vmsg *vmsg) +{ + struct xio_iovec_ex *sgl = vmsg_sglist(vmsg); + + return sgl[0].user_context; +} + +static inline int xio_init_vmsg(struct xio_vmsg *vmsg, unsigned int nents) +{ + vmsg->sgl_type = XIO_SGL_TYPE_IOV; + return 0; +} + +static inline void xio_fini_vmsg(struct xio_vmsg *vmsg) +{ +} + +static inline void xio_reinit_msg(struct xio_msg *msg) +{ + memset(msg, 0, sizeof(*msg)); +} + +/*---------------------------------------------------------------------------*/ +/* XIO context API */ +/*---------------------------------------------------------------------------*/ +/** + * @def XIO_INFINITE + * @brief infinite time flag for event loop + */ +#define XIO_INFINITE -1 + +/** + * @struct xio_context_params + * @brief context creation parameters structure + */ +struct xio_context_params { + /**< private user data passed saved on context can be queried/modified */ + /**< via xio_query_context/xio_modify_context */ + void *user_context; + + /**< preallocate and registers rdma inline buffers for send/recv */ + int prealloc_xio_inline_bufs; + + /**< number of connections that this context will handle */ + int max_conns_per_ctx; + + /**< apply memory registration to internal accelio memory pool */ + int register_internal_mempool; + + /** depth of receive queue in RDMA. 
+ * pass 0 if want the depth to remain default (XIO_MAX_IOV + constant) */ + int rq_depth; + +}; + + +/** + * creates xio context - a context object represent concurrency unit + * + * @param[in] ctx_params: context creation parameters (can be NULL) + * @param[in] polling_timeout_us: Polling timeout in microsecs - 0 ignore + * @param[in] cpu_hint: -1 - don't care, n - core on which the cpu is bounded + * + * @return xio context handle, or NULL upon error + */ +struct xio_context *xio_context_create(struct xio_context_params *ctx_params, + int polling_timeout_us, + int cpu_hint); + +/** + * get context poll fd, which can be later passed to an external dispatcher + * + * @param[in] ctx The xio context handle + * + * @return fd (non-negative) on success, or -1 on error. If an error occurs, + * call xio_errno function to get the failure reason. + */ +int xio_context_get_poll_fd(struct xio_context *ctx); + +/** + * @enum xio_ev_loop_events + * @brief accelio's event dispatcher event types + */ +enum xio_ev_loop_events { + XIO_POLLIN = (1 << 0), + XIO_POLLOUT = (1 << 1), + XIO_POLLET = (1 << 2), /**< edge-triggered poll */ + XIO_ONESHOT = (1 << 3), + XIO_POLLRDHUP = (1 << 4), + XIO_POLLHUP = (1 << 5), + XIO_POLLERR = (1 << 6), +}; + +/** + * @typedef xio_ev_handler_t + * @brief event loop callback function + * + * @param[in] fd the signaled file descriptor + * @param[in] events the event signaled as defined in enum xio_ev_loop_events + * @param[in] data user private data + */ +typedef void (*xio_ev_handler_t)(int fd, int events, void *data); + +/** + * add external fd to be used by internal dispatcher + * + * @param[in] ctx The xio context handle + * @param[in] fd the file descriptor + * @param[in] events the event signaled as defined in + * enum xio_ev_loop_events + * @param[in] handler event handler that handles the event + * @param[in] data user private data + * + * @return 0 on success, or -1 on error. If an error occurs, call + * xio_errno function to get the failure reason. + */ +int xio_context_add_ev_handler(struct xio_context *ctx, + int fd, int events, + xio_ev_handler_t handler, + void *data); +/** + * change the event event associated with the target file descriptor fd. + * + * @param[in] ctx The xio context handle + * @param[in] fd the file descriptor + * @param[in] events the event signaled as defined in + * enum xio_ev_loop_events + * + * @return 0 on success, or -1 on error. If an error occurs, call + * xio_errno function to get the failure reason. + */ +int xio_context_modify_ev_handler(struct xio_context *ctx, + int fd, int events); + +/** + * removes external fd from internal dispatcher + * + * @param[in] ctx The xio context handle + * @param[in] fd the file descriptor + * + * @return 0 on success, or -1 on error. If an error occurs, call + * xio_errno function to get the failure reason. + */ +int xio_context_del_ev_handler(struct xio_context *ctx, + int fd); + +/** + * run event loop for a specified (possibly infinite) amount of time; + * + * this function relies on polling and waiting mechanisms applied to all file + * descriptors and other event signaling resources (e.g. 
hw event queues) + * associated with the context; these mechanisms are continuously invoked + * until either the specified timeout period expires or the loop is stopped; + * + * all events which become pending during that time are handled and the user + * callbacks are called as appropriate for those events + * + * @param[in] ctx Pointer to the xio context handle + * @param[in] timeout_ms number of milliseconds to run the loop + * before exiting, if not stopped. + * 0 : just poll instantly, don't wait + * XIO_INFINITE: run continuously until stopped + * + * @return 0 on success, or -1 on error. If an error occurs, call + * xio_errno function to get the failure reason. + */ +int xio_context_run_loop(struct xio_context *ctx, int timeout_ms); + +/** + * stops context's running event loop + * + * @param[in] ctx Pointer to the xio context handle + */ +void xio_context_stop_loop(struct xio_context *ctx); + +/** + * poll for events for a specified (possibly infinite) amount of time; + * + * this function relies on polling and waiting mechanisms applied to all file + * descriptors and other event signaling resources (e.g. hw event queues) + * associated with the context; these mechanisms are invoked until the first + * successful polling attempt is made; + * + * all events which became pending till then are handled and the user callbacks + * are called as appropriate for those events; then the functions exits + * + * the number of actual events handled originated by any source of events is + * guaranteed to be limited + * + * @param[in] ctx Pointer to the xio context handle + * @param[in] timeout_ms number of milliseconds to wait before exiting, + * with or without events handled + * 0 : just poll instantly, don't wait + * XIO_INFINITE: wait for at least a single event + * + * @return 0 on success, or -1 on error. If an error occurs, call + * xio_errno function to get the failure reason. + */ +int xio_context_poll_wait(struct xio_context *ctx, int timeout_ms); + + +/*---------------------------------------------------------------------------*/ +/* library initialization routines */ +/*---------------------------------------------------------------------------*/ + +/** + * Initiates use of the libxio.so by a process. MUST BE CALLED in the "main" + * method before any accelio methods are called + * + * Idempotent routine to initialize the library. + * + */ +void xio_init(void); + +/** + * Terminates use of the libxio.so by a process. + * + * Idempotent routine to shutdown the library. + * + */ +void xio_shutdown(void); + +/*---------------------------------------------------------------------------*/ +/* Memory registration/allocation API */ +/*---------------------------------------------------------------------------*/ +/** + * register pre allocated memory for RDMA operations + * + * @param[in] addr buffer's memory address + * @param[in] length buffer's memory length + * @param[out] reg_mem registered memory data structure + * + * @return 0 on success, or -1 on error. If an error occurs, call + * xio_errno function to get the failure reason. + */ +int xio_mem_register(void *addr, size_t length, struct xio_reg_mem *reg_mem); + +/** + * unregister registered memory region, create by @ref xio_mem_register + * + * @param[in,out] reg_mem - previously registered memory data structure. + * + * @return 0 on success, or -1 on error. If an error occurs, call + * xio_errno function to get the failure reason. 
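+ *
+ * A minimal register/deregister sketch; buf and len stand for an application
+ * buffer owned by the caller, and error handling is abbreviated:
+ *
+ *   struct xio_reg_mem reg_mem;
+ *
+ *   if (xio_mem_register(buf, len, &reg_mem) != 0)
+ *       return xio_errno();
+ *   // ... reg_mem.mr may now be placed in xio_iovec_ex entries for RDMA ...
+ *   xio_mem_dereg(&reg_mem);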
+ */ +int xio_mem_dereg(struct xio_reg_mem *reg_mem); + +/** + * extract the rkey of the registered message assisting the + * arrived request + * + * @param[in] reg_mem registered memory data structure + * @param[in] req the incoming request. + * + * @return rkey of the registered memory + */ +uint32_t xio_lookup_rkey_by_request(const struct xio_reg_mem *reg_mem, + const struct xio_msg *req); + +/** + * extract the rkey of the registered message assisting the + * arrived response + * + * @param[in] reg_mem registered memory data structure + * @param[in] rsp the incoming response. + * + * @return rkey of the registered memory + */ +uint32_t xio_lookup_rkey_by_response(const struct xio_reg_mem *reg_mem, + const struct xio_msg *rsp); + +/** + * allocate and register memory for RDMA operations + * + * @param[in] length length of required buffer memory. + * @param[out] reg_mem registered memory data structure + * + * @return 0 on success, or -1 on error. If an error occurs, call + * xio_errno function to get the failure reason. + */ +int xio_mem_alloc(size_t length, struct xio_reg_mem *reg_mem); + +/** + * free registered memory region, create by @ref xio_mem_alloc + * + * @param[in,out] reg_mem - previously registered memory data structure. + * + * @return 0 on success, or -1 on error. If an error occurs, call + * xio_errno function to get the failure reason. + */ +int xio_mem_free(struct xio_reg_mem *reg_mem); + +/*---------------------------------------------------------------------------*/ +/* XIO memory pool API */ +/*---------------------------------------------------------------------------*/ + +/** + * @enum xio_mempool_flag + * @brief creation flags for mempool + */ +enum xio_mempool_flag { + XIO_MEMPOOL_FLAG_NONE = 0x0000, + XIO_MEMPOOL_FLAG_REG_MR = 0x0001, + XIO_MEMPOOL_FLAG_HUGE_PAGES_ALLOC = 0x0002, + XIO_MEMPOOL_FLAG_NUMA_ALLOC = 0x0004, + XIO_MEMPOOL_FLAG_REGULAR_PAGES_ALLOC = 0x0008, + /**< do not allocate buffers from larger slabs, + * if the smallest slab is empty + */ + XIO_MEMPOOL_FLAG_USE_SMALLEST_SLAB = 0x0016 +}; + +/** + * create mempool with NO (!) slabs + * + * @param[in] nodeid numa node id. -1 if don't care + * @param[in] flags mask of mempool creation flags + * defined (@ref xio_mempool_flag) + * + * @return pointer to xio_mempool object or NULL upon failure + */ +struct xio_mempool *xio_mempool_create(int nodeid, uint32_t flags); + +/** + * add a slab to current set (setup only). This method is not thread safe. + * + * @param[in] mpool the memory pool + * @param[in] size slab memory size + * @param[in] min initial buffers to allocate + * @param[in] max maximum buffers to allocate + * @param[in] alloc_quantum_nr growing quantum + * @param[in] alignment if not 0, the address of the allocated + * memory will be a multiple of alignment, which + * must be a power of two and a multiple + * of sizeof(void *) + * + * @return 0 on success, or -1 on error. If an error occurs, call + * xio_errno function to get the failure reason. + */ +int xio_mempool_add_slab(struct xio_mempool *mpool, + size_t size, size_t min, size_t max, + size_t alloc_quantum_nr, int alignment); + +/** + * destroy memory pool + * + * @param[in] mpool the memory pool + * + */ +void xio_mempool_destroy(struct xio_mempool *mpool); + +/** + * allocate memory buffer from memory pool. This method is thread safe + * + * @param[in] mpool the memory pool + * @param[in] length buffer size to allocate + * @param[in] reg_mem registered memory data structure + * + * @return 0 on success, or -1 on error. 
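+ *
+ * A minimal pool usage sketch; the slab size and buffer counts below are
+ * illustrative only, and error checks are omitted:
+ *
+ *   struct xio_mempool *pool;
+ *   struct xio_reg_mem reg_mem;
+ *
+ *   pool = xio_mempool_create(-1, XIO_MEMPOOL_FLAG_REG_MR);
+ *   xio_mempool_add_slab(pool, 4096, 64, 1024, 64, 0);
+ *   if (xio_mempool_alloc(pool, 4096, &reg_mem) == 0) {
+ *       // ... use reg_mem.addr / reg_mem.mr ...
+ *       xio_mempool_free(&reg_mem);
+ *   }
+ *   xio_mempool_destroy(pool);
+ *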
If an error occurs, call + * xio_errno function to get the failure reason. + */ +int xio_mempool_alloc(struct xio_mempool *mpool, + size_t length, struct xio_reg_mem *reg_mem); + +/** + * free memory buffer back to memory pool. This method is thread safe. + * + * @param[in] reg_mem registered memory data structure + * + */ +void xio_mempool_free(struct xio_reg_mem *reg_mem); + +#ifdef __cplusplus +} +#endif + +#endif /*XIO_API_H */ diff --git a/open_src/xio/src/common/sys/hashtable.h b/open_src/xio/src/common/sys/hashtable.h new file mode 100644 index 0000000..09ece60 --- /dev/null +++ b/open_src/xio/src/common/sys/hashtable.h @@ -0,0 +1,213 @@ +#ifndef SYS_HASHTABLE_H +#define SYS_HASHTABLE_H + +typedef unsigned int hash_func_t(const void *key); +typedef int key_cmp_func_t (const void *key1, const void *key2); +typedef void key_cp_func_t(void *keydst, const void *keysrc); + +/* + * Generic hashtable template + */ + +#define HASHTABLE_PRIME_TINY 49 +#define HASHTABLE_PRIME_SMALL 149 +#define HASHTABLE_PRIME_MEDIUM 977 +#define HASHTABLE_PRIME_LARGE 1277 +#define HASHTABLE_PRIME_HUGE 2459 + + +#define HASHTABLE_LOOKUP_FROM_LIST(head, list, key, var, field, _tfield) do {\ + int _hifound = 0; \ + list_for_each_entry(var, list, field._tfield) { \ + if (head->cmpfunc(key, &var->field.keycopy)) { \ + _hifound = 1; \ + break; \ + } \ + } \ + if (!_hifound) \ + var = NULL; \ +} while (0) + +#define HASHTABLE_INSERT_INTO_LIST(head, list, key, var, field, _tfield) do {\ + head->cpfunc(&(var->field.keycopy), key); \ + head->count++; \ + list_add(&(var->field._tfield), list); \ +} while (0) + + +#define HASHTABLE_LIST(head, i) (&((head)->list[i])) + +#define HASHTABLE_LENGTH(head) (sizeof((head)->list)/sizeof((head)->list[0])) + +#define HASHTABLE_INDEX(head, key) (((head)->hfunc(key))%HASHTABLE_LENGTH(head)) + +#define HASHTABLE_HEAD(name, type, prime) \ +struct name { \ + int count; \ + unsigned int tmp_i; \ + struct type *tmp_v; \ + hash_func_t *hfunc; \ + key_cmp_func_t *cmpfunc; \ + key_cp_func_t *cpfunc; \ + struct list_head list[prime]; \ +} + + +#define HASHTABLE_ENTRY(type, keytype, _tfield) \ +struct { \ + struct list_head _tfield; \ + struct keytype keycopy; \ +} + + +#define HASHTABLE_INIT(head, _hfunc, _cmpfunc, _cpfunc) do { \ + unsigned int _hil; \ + (head)->hfunc = (hash_func_t *)_hfunc; \ + (head)->cmpfunc = (key_cmp_func_t *)_cmpfunc; \ + (head)->cpfunc = (key_cp_func_t *)_cpfunc; \ + (head)->count = 0; \ + for (_hil = 0; _hil < HASHTABLE_LENGTH(head); _hil++) { \ + INIT_LIST_HEAD(HASHTABLE_LIST(head, _hil)); \ + } \ +} while (0) + +#define HASHTABLE_EMPTY(head) ((head)->count == 0) + +#define HASHTABLE_KEY(var, field) (&(var)->field.keycopy) + + +#define HASHTABLE_INSERT(head, key, var, field, _tfield) do { \ + unsigned int _hil = HASHTABLE_INDEX(head, key); \ + HASHTABLE_INSERT_INTO_LIST((head), HASHTABLE_LIST(head, _hil), \ + key, var, field, _tfield); \ +} while (0) + + +#define HASHTABLE_FOREACH(var, head, field, _tfield) \ +for ((head)->tmp_i = 0; \ + (head)->tmp_i < HASHTABLE_LENGTH(head); \ + (head)->tmp_i++) \ + list_for_each_entry(var, HASHTABLE_LIST(head, (head)->tmp_i), \ + field._tfield) + + +#define HASHTABLE_FOREACH_SAFE(var, head, field, _tfield) \ +for ((head)->tmp_i = 0; \ + (head)->tmp_i < HASHTABLE_LENGTH(head); \ + (head)->tmp_i++) \ + list_for_each_entry_safe(var, (head)->tmp_v, \ + HASHTABLE_LIST(head, (head)->tmp_i), \ + field._tfield) + +#define HASHTABLE_REMOVE(head, var, type, field, _tfield) do { \ + list_del_init(&(var)->field._tfield); \ + 
(head)->count--; \ +} while (0) + + + +#define HASHTABLE_LOOKUP(head, key, var, field, _tfield) do { \ + unsigned int _hil = HASHTABLE_INDEX(head, key); \ + HASHTABLE_LOOKUP_FROM_LIST((head), HASHTABLE_LIST(head, _hil), \ + key, var, field, _tfield); \ +} while (0) + + +#define HASHTABLE_LOOKUP_FOREACH(h, key, var, field, _tfield) \ + list_for_each(var, HASHTABLE_LIST(h, \ + HASHTABLE_INDEX((h), (key))), field._tfield) \ + if ((h)->cmpfunc(key, &(var)->field.keycopy)) + +#define HASHTABLE_LOOKUP_FOREACH_SAFE(h, k, d, f, _tfield) \ + list_for_each_safe(d, (h)->tmp_v, \ + HASHTABLE_LIST(h, HASHTABLE_INDEX(h, k)), \ + f._tfield) \ + if ((h)->cmpfunc(k, &d->f.keycopy)) + +/* + * Hashtable definitions + */ + + +#define _HT_LFIELD mpfld + +#define HT_HEAD(name, type, prime) \ + HASHTABLE_HEAD(name, type, prime) + +#define HT_ENTRY(type, keytype) \ + HASHTABLE_ENTRY(type, keytype, _HT_LFIELD) + +#define HT_KEY(var, field) \ + HASHTABLE_KEY(var, field) + +#define HT_INIT(head, _hfunc, _cmpfunc, _cpfunc) \ + HASHTABLE_INIT(head, _hfunc, _cmpfunc, _cpfunc) + +#define HT_EMPTY(head) \ + HASHTABLE_EMPTY(head) + +#define HT_FOREACH(var, head, field) \ + HASHTABLE_FOREACH(var, head, field, _HT_LFIELD) + +#define HT_FOREACH_SAFE(var, head, field) \ + HASHTABLE_FOREACH_SAFE(var, head, field, _HT_LFIELD) + +#define HT_REMOVE(head, var, type, field) \ + HASHTABLE_REMOVE(head, var, type, field, _HT_LFIELD) + +#define HT_REMOVE_BY_KEY(head, key, type, field) \ + HASHTABLE_REMOVE_BY_KEY(head, key, type, field, _HT_LFIELD) + +#define HT_LOOKUP(head, key, var, field) \ + HASHTABLE_LOOKUP(head, key, var, field, _HT_LFIELD) + +#define HT_INSERT(head, key, var, field) \ + HASHTABLE_INSERT(head, key, var, field, _HT_LFIELD) + +/* + * Multi hashtable definitions + */ + +#define _MULTI_HT_LFIELD mmpfld + +#define MULTI_HT_HEAD(name, type, prime) \ + HASHTABLE_HEAD(name, type, prime) + +#define MULTI_HT_ENTRY(type, keytype) \ + HASHTABLE_ENTRY(type, keytype, _MULTI_HT_LFIELD) + +#define MULTI_HT_KEY(var, field) \ + HASHTABLE_KEY(var, field) + +#define MULTI_HT_INIT(head, _hfunc, _cmpfunc, _cpfunc) \ + HASHTABLE_INIT(head, _hfunc, _cmpfunc, _cpfunc) + +#define MULTI_HT_EMPTY(head) \ + HASHTABLE_EMPTY(head) + +#define MULTI_HT_FOREACH(var, head, field) \ + HASHTABLE_FOREACH(var, head, field, _MULTI_HT_LFIELD) + +#define MULTI_HT_FOREACH_SAFE(var, head, field) \ + HASHTABLE_FOREACH_SAFE(var, head, field, _MULTI_HT_LFIELD) + +#define MULTI_HT_REMOVE(head, var, type, field) \ + HASHTABLE_REMOVE(head, var, type, field, _MULTI_HT_LFIELD) + +#define MULTI_HT_REMOVE_BY_KEY(head, key, type, field) \ + HASHTABLE_REMOVE_BY_KEY(head, key, type, field, _MULTI_HT_LFIELD) + +#define MULTI_HT_LOOKUP(head, key, var, field) \ + HASHTABLE_LOOKUP(head, key, var, field, _MULTI_HT_LFIELD) + +#define MULTI_HT_LOOKUP_FOREACH(head, key, var, field) \ + HASHTABLE_LOOKUP_FOREACH(head, key, var, field, _MULTI_HT_LFIELD) + +#define MULTI_HT_LOOKUP_FOREACH_SAFE(head, key, var, field) \ + HASHTABLE_LOOKUP_FOREACH_SAFE(head, key, var, field, _MULTI_HT_LFIELD) + +#define MULTI_HT_INSERT(head, key, var, field) \ + HASHTABLE_INSERT(head, key, var, field, _MULTI_HT_LFIELD) + + +#endif /* SYS_HASHTABLE_H */ diff --git a/open_src/xio/src/common/xio_common.h b/open_src/xio/src/common/xio_common.h new file mode 100644 index 0000000..cdc04f9 --- /dev/null +++ b/open_src/xio/src/common/xio_common.h @@ -0,0 +1,345 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. 
+ * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ +#ifndef XIO_COMMON_H +#define XIO_COMMON_H + +#ifdef __cplusplus +extern "C" { +#endif + +/*---------------------------------------------------------------------------*/ +/* externals */ +/*---------------------------------------------------------------------------*/ +extern struct xio_options g_options; +extern double g_mhz; +extern struct xio_idr *usr_idr; +extern struct xio_mempool_config g_mempool_config; + +/*---------------------------------------------------------------------------*/ +/* defines */ +/*---------------------------------------------------------------------------*/ +/*#define XIO_SESSION_DEBUG*/ + +/* Macro for 64 bit variables to switch to from net */ +#define ntohll(x) (((uint64_t)(ntohl((int)((x << 32) >> 32))) << 32) | \ + (unsigned int)ntohl(((int)(x >> 32)))) +#define htonll(x) ntohll(x) + +#define uint64_from_ptr(p) (uint64_t)(uintptr_t)(p) +#define ptr_from_int64(p) (void *)(unsigned long)(p) + +#ifndef ARRAY_SIZE +#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0])) +#endif + +/*---------------------------------------------------------------------------*/ +/* debuging facilities */ +/*---------------------------------------------------------------------------*/ +void xio_set_error(int errnum); + +#define XIO_TLV_LEN sizeof(struct xio_tlv) +#define XIO_SESSION_HDR_LEN sizeof(struct xio_session_hdr) +#define XIO_TRANSPORT_OFFSET (XIO_TLV_LEN + XIO_SESSION_HDR_LEN) +#define MAX_PRIVATE_DATA_LEN 1024 + +/** + * extended message flags + */ +enum xio_msg_flags_ex { + /* [below 1<<10 - reserved for application usage] */ + /* [above 1<<10 - reserved for library usage] */ + XIO_MSG_FLAG_EX_IMM_READ_RECEIPT = BIT(10), /**< immediate receipt */ + XIO_MSG_FLAG_EX_RECEIPT_FIRST = BIT(11), /**< read receipt first */ + XIO_MSG_FLAG_EX_RECEIPT_LAST = BIT(12), /**< read receipt last */ +}; + +#define xio_clear_ex_flags(flag) \ + ((*(flag)) &= ~(XIO_MSG_FLAG_EX_RECEIPT_FIRST | \ + XIO_MSG_FLAG_EX_RECEIPT_LAST | \ + XIO_MSG_FLAG_EX_IMM_READ_RECEIPT)) + +#define xio_app_receipt_request(rq) \ + ((rq)->flags & (XIO_MSG_FLAG_EX_RECEIPT_FIRST | \ + XIO_MSG_FLAG_EX_RECEIPT_LAST)) + +#define xio_app_receipt_first_request(rq) \ + (((rq)->flags & XIO_MSG_FLAG_EX_RECEIPT_FIRST) == \ + XIO_MSG_FLAG_EX_RECEIPT_FIRST) + +#define xio_app_receipt_last_request(rq) \ + (((rq)->flags & XIO_MSG_FLAG_EX_RECEIPT_LAST) == \ + XIO_MSG_FLAG_EX_RECEIPT_LAST) + +/** + * TLV types + */ +#define XIO_NOP 1 + +#define XIO_CREDIT BIT(6) /* 0x40 */ +#define XIO_NEXUS_SETUP BIT(7) /* 0x80 */ +#define XIO_SESSION_SETUP BIT(8) /* 0x100 */ +#define XIO_CONNECTION_HELLO BIT(9) /* 0x200 */ +#define XIO_FIN BIT(10) /* 0x400 */ +#define XIO_CANCEL BIT(11) /* 0x800 */ +#define XIO_ACK BIT(12) /* 0x1000 */ +#define XIO_RDMA_READ BIT(13) /* 0x2000 */ +#define XIO_CONNECTION_KA BIT(14) /* 0x4000 */ + +#define XIO_MSG_REQ XIO_MSG_TYPE_REQ +#define XIO_MSG_RSP XIO_MSG_TYPE_RSP +#define XIO_CREDIT_NOP (XIO_CREDIT | XIO_NOP) +#define XIO_NEXUS_SETUP_REQ (XIO_NEXUS_SETUP | XIO_REQUEST) +#define XIO_NEXUS_SETUP_RSP (XIO_NEXUS_SETUP | XIO_RESPONSE) +#define XIO_SESSION_SETUP_REQ (XIO_SESSION_SETUP | XIO_REQUEST) +#define XIO_SESSION_SETUP_RSP (XIO_SESSION_SETUP | XIO_RESPONSE) +#define XIO_ONE_WAY_REQ XIO_MSG_TYPE_ONE_WAY +#define XIO_ONE_WAY_RSP (XIO_ONE_WAY | XIO_RESPONSE) +#define XIO_FIN_REQ (XIO_FIN | XIO_REQUEST) +#define XIO_FIN_RSP (XIO_FIN | XIO_RESPONSE) +#define XIO_CANCEL_REQ (XIO_CANCEL | XIO_REQUEST) +#define XIO_CANCEL_RSP (XIO_CANCEL | XIO_RESPONSE) +#define XIO_CONNECTION_HELLO_REQ (XIO_CONNECTION_HELLO 
| XIO_REQUEST) +#define XIO_CONNECTION_HELLO_RSP (XIO_CONNECTION_HELLO | XIO_RESPONSE) +#define XIO_CONNECTION_KA_REQ (XIO_CONNECTION_KA | XIO_REQUEST) +#define XIO_CONNECTION_KA_RSP (XIO_CONNECTION_KA | XIO_RESPONSE) +#define XIO_ACK_REQ (XIO_ACK | XIO_REQUEST) +#define XIO_RDMA_READ_ACK (XIO_RDMA_READ | XIO_RESPONSE) + +#define IS_REQUEST(type) ((type) & XIO_REQUEST) +#define IS_RESPONSE(type) ((type) & XIO_RESPONSE) +#define IS_NOP(type) ((type) & XIO_NOP) +#define IS_RDMA_RD_ACK(type) ((type) & XIO_RDMA_READ) +#define IS_MESSAGE(type) ((type) & XIO_MESSAGE) +#define IS_SESSION_SETUP(type) ((type) & XIO_SESSION_SETUP) +#define IS_NEXUS_SETUP(type) ((type) & XIO_NEXUS_SETUP) +#define IS_ONE_WAY(type) ((type) & XIO_ONE_WAY) +#define IS_FIN(type) ((type) & XIO_FIN) +#define IS_CANCEL(type) ((type) & XIO_CANCEL) +#define IS_CONNECTION_HELLO(type) ((type) & XIO_CONNECTION_HELLO) +#define IS_DIRECT_RDMA(type) ((type) & XIO_RDMA) +#define IS_APPLICATION_MSG(type) \ + (IS_MESSAGE(type) || IS_ONE_WAY(type) || IS_DIRECT_RDMA(type)) + +/** + * TLV magic + */ +#define XIO_MAGIC 0x58494F50 /* ascii of 'XIOP' */ + +/** + * TLV macros + */ +#define PACK_SVAL(src, trgt, attr) ((trgt)->attr = htons((src)->attr)) +#define PACK_LVAL(src, trgt, attr) ((trgt)->attr = htonl((src)->attr)) +#define PACK_LLVAL(src, trgt, attr) ((trgt)->attr = htonll((src)->attr)) + +#define UNPACK_SVAL(src, trgt, attr) ((trgt)->attr = ntohs((src)->attr)) +#define UNPACK_LVAL(src, trgt, attr) ((trgt)->attr = ntohl((src)->attr)) +#define UNPACK_LLVAL(src, trgt, attr) ((trgt)->attr = ntohll((src)->attr)) + +#define test_bits(mask, addr) (((*addr) & (mask)) != 0) +#define clr_bits(mask, addr) ((*addr) &= ~(mask)) +#define set_bits(mask, addr) ((*addr) |= (mask)) + +#define test_flag(flag, addr) (((*addr) & (flag)) == (flag)) + +/* header flags */ +#define XIO_HEADER_FLAG_NONE (0) +#define XIO_HEADER_FLAG_PEER_WRITE_RSP BIT(0) + +/*---------------------------------------------------------------------------*/ +/* structures */ +/*---------------------------------------------------------------------------*/ +struct xio_options { + int max_in_iovsz; + int max_out_iovsz; + int reconnect; + /* transport options needed globally */ + int max_inline_xio_hdr; + int max_inline_xio_data; + int enable_flow_control; + int snd_queue_depth_msgs; + int rcv_queue_depth_msgs; + uint64_t snd_queue_depth_bytes; + uint64_t rcv_queue_depth_bytes; + int xfer_buf_align; + int inline_xio_data_align; + int enable_keepalive; + int transport_close_timeout; + int pad; + + struct xio_options_keepalive ka; +}; + +/*---------------------------------------------------------------------------*/ +/* message headers */ +/*---------------------------------------------------------------------------*/ +PACKED_MEMORY(struct xio_tlv { + uint32_t magic; + uint32_t type; + uint64_t len; +}); + +#ifdef XIO_SESSION_DEBUG +PACKED_MEMORY(struct xio_session_hdr { + uint32_t dest_session_id; + uint32_t flags; + uint64_t serial_num; + uint16_t sn; /* serial number */ + uint16_t ack_sn; /* ack serial number */ + uint16_t credits_msgs; + uint16_t pad[3]; + uint32_t receipt_result; + uint64_t credits_bytes; + uint64_t connection; + uint64_t session; +}); +#else +PACKED_MEMORY(struct xio_session_hdr { + uint32_t dest_session_id; + uint32_t flags; + uint64_t serial_num; + uint16_t sn; /* serial number */ + uint16_t ack_sn; /* ack serial number */ + uint16_t credits_msgs; + uint16_t pad[3]; + uint32_t receipt_result; + uint64_t credits_bytes; +}); +#endif + +/* setup flags */ +#define 
XIO_CID 1 + +#define XIO_RECONNECT (XIO_CID) + +PACKED_MEMORY(struct xio_nexus_setup_req { + uint16_t version; + uint16_t flags; + uint32_t cid; +}); + +PACKED_MEMORY(struct xio_nexus_setup_rsp { + uint32_t cid; + uint32_t status; + uint16_t version; + uint16_t flags; +}); + +PACKED_MEMORY(struct xio_session_cancel_hdr { + uint32_t requester_session_id; + uint32_t responder_session_id; + uint64_t sn; +}); + +struct xio_msg; +struct xio_vmsg; +struct xio_iovec; +struct xio_iovec_ex; + +/*---------------------------------------------------------------------------*/ +/* enum */ +/*---------------------------------------------------------------------------*/ + +enum xio_wc_op { + XIO_WC_OP_UNKNOWN, + XIO_WC_OP_RECV, + XIO_WC_OP_SEND, + XIO_WC_OP_RDMA_READ, + XIO_WC_OP_RDMA_WRITE, +}; + +/*---------------------------------------------------------------------------*/ +/* structs */ +/*---------------------------------------------------------------------------*/ +union xio_sockaddr { + struct sockaddr sa; + struct sockaddr_in sa_in; + struct sockaddr_in6 sa_in6; + struct sockaddr_storage sa_stor; +}; + +/*---------------------------------------------------------------------------*/ +/* xio_utils.c */ +/*---------------------------------------------------------------------------*/ + +int xio_uri_get_proto(const char *uri, char *proto, + int proto_len); + +int xio_uri_get_portal(const char *uri, char *portal, + int portal_len); + +int xio_uri_get_resource(const char *uri, char *resource, + int resource_len); + +const char *xio_uri_get_resource_ptr(const char *uri); + +int xio_uri_to_ss(const char *uri, struct sockaddr_storage *ss); + +int xio_host_port_to_ss(const char *buf, + struct sockaddr_storage *ss); + +size_t xio_write_tlv(uint32_t type, uint64_t len, uint8_t *buffer); + +size_t xio_read_tlv(uint32_t *type, uint64_t *len, void **value, + uint8_t *buffer); + +size_t memcpyv(struct xio_iovec *dst, int dsize, + struct xio_iovec *src, int ssize); + +size_t memclonev(struct xio_iovec *dst, int dsize, + struct xio_iovec *src, int ssize); + +size_t xio_iov_length(const struct xio_iovec *iov, + unsigned long nr_segs); + +unsigned int xio_get_nodeid(unsigned int cpu_id); + +void xio_msg_dump(struct xio_msg *xio_msg); + +const char *xio_proto_str(enum xio_proto proto); + +/*---------------------------------------------------------------------------*/ +/* xio_options.c */ +/*---------------------------------------------------------------------------*/ +struct xio_options *xio_get_options(void); + +#ifdef __cplusplus +} +#endif + +#endif /*XIO_COMMON_H */ diff --git a/open_src/xio/src/common/xio_connection.c b/open_src/xio/src/common/xio_connection.c new file mode 100644 index 0000000..12d309b --- /dev/null +++ b/open_src/xio/src/common/xio_connection.c @@ -0,0 +1,3343 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#include +#include +#include "libxio.h" +#include "xio_log.h" +#include "xio_common.h" +#include "xio_hash.h" +#include "xio_protocol.h" +#include "xio_mbuf.h" +#include "xio_task.h" +#include "xio_observer.h" +#include "xio_transport.h" +#include "xio_msg_list.h" +#include "xio_ev_data.h" +#include "xio_objpool.h" +#include "xio_workqueue.h" +#include "xio_idr.h" +#include "xio_sg_table.h" +#include "xio_context.h" +#include "xio_nexus.h" +#include "xio_session.h" +#include "xio_connection.h" +#include + +#define MSG_POOL_SZ 1024 +#define XIO_IOV_THRESHOLD 20 + +static struct xio_transition xio_transition_table[][2] = { +/* INIT */ { + {/*valid*/ 0, /*next_state*/ XIO_CONNECTION_STATE_INVALID, /*send_flags*/ 0 }, + {/*valid*/ 0, /*next_state*/ XIO_CONNECTION_STATE_INVALID, /*send_flags*/ 0 }, + }, + +/* ESTABLISHED */ { + {/*valid*/ 0, /*next_state*/ XIO_CONNECTION_STATE_INVALID, /*send_flags*/ 0 }, + {/*valid*/ 0, /*next_state*/ XIO_CONNECTION_STATE_INVALID, /*send_flags*/ 0 }, + }, + +/* ONLINE */ { + {/*valid*/ 1, /*next_state*/ XIO_CONNECTION_STATE_CLOSE_WAIT, /*send_flags*/ SEND_ACK }, + {/*valid*/ 0, /*next_state*/ XIO_CONNECTION_STATE_INVALID, /*send_flags*/ 0 }, + }, + +/* FIN_WAIT_1 */ { + {/*valid*/ 1, /*next_state*/ XIO_CONNECTION_STATE_CLOSING, /*send_flags*/ SEND_ACK }, + {/*valid*/ 1, /*next_state*/ XIO_CONNECTION_STATE_FIN_WAIT_2, /*send_flags*/ 0 }, + }, +/* FIN_WAIT_2 */ { + {/*valid*/ 1, /*next_state*/ XIO_CONNECTION_STATE_TIME_WAIT, /*send_flags*/ SEND_ACK }, + {/*valid*/ 0, /*next_state*/ XIO_CONNECTION_STATE_INVALID, /*send_flags*/ 0 }, + }, +/* CLOSING */ { + {/*valid*/ 0, /*next_state*/ XIO_CONNECTION_STATE_INVALID, /*send_flags*/ 0 }, + {/*valid*/ 1, /*next_state*/ XIO_CONNECTION_STATE_TIME_WAIT, /*send_flags*/ 0 }, + }, +/* TIME_WAIT */ { + {/*valid*/ 0, /*next_state*/ XIO_CONNECTION_STATE_INVALID, /*send_flags*/ 0 }, + {/*valid*/ 0, /*next_state*/ XIO_CONNECTION_STATE_INVALID, /*send_flags*/ 0 }, + }, +/* CLOSE_WAIT */ { + {/*valid*/ 0, /*next_state*/ XIO_CONNECTION_STATE_INVALID, /*send_flags*/ 0 }, + {/*valid*/ 0, /*next_state*/ XIO_CONNECTION_STATE_INVALID, /*send_flags*/ 0 }, + }, +/* LAST_ACK */ { + {/*valid*/ 0, /*next_state*/ XIO_CONNECTION_STATE_INVALID, /*send_flags*/ 0 }, + {/*valid*/ 1, /*next_state*/ XIO_CONNECTION_STATE_CLOSED, 
/*send_flags*/ 0 }, + }, +/* CLOSED */ { + {/*valid*/ 0, /*next_state*/ XIO_CONNECTION_STATE_INVALID, /*send_flags*/ 0 }, + {/*valid*/ 0, /*next_state*/ XIO_CONNECTION_STATE_INVALID, /*send_flags*/ 0 }, + }, +/* DISCONNECTED */{ + {/*valid*/ 0, /*next_state*/ XIO_CONNECTION_STATE_INVALID, /*send_flags*/ 0 }, + {/*valid*/ 0, /*next_state*/ XIO_CONNECTION_STATE_INVALID, /*send_flags*/ 0 }, + }, +/* ERROR */ { + {/*valid*/ 0, /*next_state*/ XIO_CONNECTION_STATE_INVALID, /*send_flags*/ 0 }, + {/*valid*/ 0, /*next_state*/ XIO_CONNECTION_STATE_INVALID, /*send_flags*/ 0 }, + }, +/* INVALID */ { + {/*valid*/ 0, /*next_state*/ XIO_CONNECTION_STATE_INVALID, /*send_flags*/ 0,}, + {/*valid*/ 0, /*next_state*/ XIO_CONNECTION_STATE_INVALID, /*send_flags*/ 0 }, + }, +}; + +static void xio_connection_post_destroy(struct kref *kref); +static void xio_connection_teardown_handler(void *connection_); +static void xio_connection_keepalive_time(void *_connection); +static void xio_close_time_wait(void *data); + +struct xio_managed_rkey { + struct list_head list_entry; + uint32_t rkey; + uint32_t pad; +}; + +/*---------------------------------------------------------------------------*/ +/* xio_connection_next_transit */ +/*---------------------------------------------------------------------------*/ +struct xio_transition *xio_connection_next_transit( + enum xio_connection_state state, int fin_ack) +{ + return &xio_transition_table[state][fin_ack]; +} + +char *xio_connection_state_str(enum xio_connection_state state) +{ + switch (state) { + case XIO_CONNECTION_STATE_INIT: + return "INIT"; + case XIO_CONNECTION_STATE_ESTABLISHED: + return "ESTABLISHED"; + case XIO_CONNECTION_STATE_ONLINE: + return "ONLINE"; + case XIO_CONNECTION_STATE_FIN_WAIT_1: + return "FIN_WAIT_1"; + case XIO_CONNECTION_STATE_FIN_WAIT_2: + return "FIN_WAIT_2"; + case XIO_CONNECTION_STATE_CLOSING: + return "CLOSING"; + case XIO_CONNECTION_STATE_TIME_WAIT: + return "TIME_WAIT"; + case XIO_CONNECTION_STATE_CLOSE_WAIT: + return "CLOSE_WAIT"; + case XIO_CONNECTION_STATE_LAST_ACK: + return "LAST_ACK"; + case XIO_CONNECTION_STATE_CLOSED: + return "CLOSED"; + case XIO_CONNECTION_STATE_DISCONNECTED: + return "DISCONNECTED"; + case XIO_CONNECTION_STATE_ERROR: + return "ERROR"; + case XIO_CONNECTION_STATE_INVALID: + return "INVALID"; + } + + return NULL; +}; + +/*---------------------------------------------------------------------------*/ +/* xio_is_connection_online */ +/*---------------------------------------------------------------------------*/ +static int xio_is_connection_online(struct xio_connection *connection) +{ + return connection->session->state == XIO_SESSION_STATE_ONLINE && + connection->state == XIO_CONNECTION_STATE_ONLINE; +} + +/*---------------------------------------------------------------------------*/ +/* xio_connection_create */ +/*---------------------------------------------------------------------------*/ +struct xio_connection *xio_connection_create(struct xio_session *session, + struct xio_context *ctx, + int conn_idx, + void *cb_user_context) +{ + struct xio_connection *connection; + + if (!ctx || !session) { + xio_set_error(EINVAL); + return NULL; + } + + connection = (struct xio_connection *) + kcalloc(1, sizeof(*connection), GFP_KERNEL); + if (!connection) { + xio_set_error(ENOMEM); + return NULL; + } + + connection->session = session; + connection->nexus = NULL; + connection->ctx = ctx; + connection->req_ack_sn = ~0; + connection->rsp_ack_sn = ~0; + connection->rx_queue_watermark_msgs = + session->rcv_queue_depth_msgs 
/ 2; + connection->rx_queue_watermark_bytes = + session->rcv_queue_depth_bytes / 2; + connection->enable_flow_control = g_options.enable_flow_control; + + connection->conn_idx = conn_idx; + connection->cb_user_context = cb_user_context; + + connection->disconnect_timeout = XIO_DEF_CONNECTION_TIMEOUT; + + memcpy(&connection->ses_ops, &session->ses_ops, + sizeof(session->ses_ops)); + + INIT_LIST_HEAD(&connection->managed_rkey_list); + INIT_LIST_HEAD(&connection->io_tasks_list); + INIT_LIST_HEAD(&connection->post_io_tasks_list); + INIT_LIST_HEAD(&connection->pre_send_list); + + xio_msg_list_init(&connection->reqs_msgq); + xio_msg_list_init(&connection->rsps_msgq); + + xio_msg_list_init(&connection->in_flight_reqs_msgq); + xio_msg_list_init(&connection->in_flight_rsps_msgq); + + kref_init(&connection->kref); + spin_lock(&ctx->ctx_list_lock); + list_add_tail(&connection->ctx_list_entry, &ctx->ctx_list); + spin_unlock(&ctx->ctx_list_lock); + + return connection; +} + +/*---------------------------------------------------------------------------*/ +/* xio_connection_set_ow_send_comp_params */ +/*---------------------------------------------------------------------------*/ +static void xio_connection_set_ow_send_comp_params(struct xio_msg *msg) +{ + struct xio_sg_table_ops *sgtbl_ops; + void *sgtbl; + ssize_t data_len; + + /* messages that are planed to send via "SEND" operation can + * discard the read receipt for better performance + */ + + if ((msg->flags & XIO_MSG_FLAG_REQUEST_READ_RECEIPT) || + (msg->type != XIO_ONE_WAY_REQ)) + return; + + sgtbl = xio_sg_table_get(&msg->out); + sgtbl_ops = (struct xio_sg_table_ops *) + xio_sg_table_ops_get(msg->out.sgl_type); + data_len = tbl_length(sgtbl_ops, sgtbl); + + /* heuristics to guess in which cases the lower layer will not + * do "rdma read" but will use send/receive + */ + if (tbl_nents(sgtbl_ops, sgtbl) > XIO_IOV_THRESHOLD) { + clr_bits(XIO_MSG_FLAG_IMM_SEND_COMP, &msg->flags); + set_bits(XIO_MSG_FLAG_EX_IMM_READ_RECEIPT, &msg->flags); + return; + } + if (data_len > (ssize_t)g_options.max_inline_xio_data && data_len > 0) { + clr_bits(XIO_MSG_FLAG_IMM_SEND_COMP, &msg->flags); + set_bits(XIO_MSG_FLAG_EX_IMM_READ_RECEIPT, &msg->flags); + return; + } + set_bits(XIO_MSG_FLAG_IMM_SEND_COMP, &msg->flags); +} + +/*---------------------------------------------------------------------------*/ +/* xio_connection_send */ +/*---------------------------------------------------------------------------*/ +int xio_connection_send(struct xio_connection *connection, + struct xio_msg *msg) +{ + struct xio_task *task = NULL; + struct xio_task *req_task = NULL; + struct xio_session_hdr hdr = {0}; + struct xio_sg_table_ops *sgtbl_ops; + void *sgtbl; + size_t tx_bytes = 0; + int retval = 0; + int is_req = 0; + int rc = EFAULT; + int standalone_receipt = 0; + int is_control; + + /* is control message */ + is_control = !IS_APPLICATION_MSG(msg->type); + + /* flow control test */ + if (!is_control && connection->enable_flow_control) { + if (connection->peer_credits_msgs == 0) + return -EAGAIN; + + sgtbl = xio_sg_table_get(&msg->out); + sgtbl_ops = (struct xio_sg_table_ops *) + xio_sg_table_ops_get(msg->out.sgl_type); + + tx_bytes = msg->out.header.iov_len + + tbl_length(sgtbl_ops, sgtbl); + + /* message does not fit into remote queue */ + if (connection->session->peer_rcv_queue_depth_bytes < + tx_bytes) { + ERROR_LOG( + "message length %zd is bigger than peer " \ + "receive queue size %llu\n", tx_bytes, + connection->session->peer_rcv_queue_depth_bytes); + return 
-XIO_E_PEER_QUEUE_SIZE_MISMATCH; + } + + if (connection->peer_credits_bytes < tx_bytes) + return -EAGAIN; + } + + if (IS_RESPONSE(msg->type) && + (xio_app_receipt_request(msg) == XIO_MSG_FLAG_EX_RECEIPT_FIRST)) { + /* this is a receipt message */ + task = xio_nexus_get_primary_task(connection->nexus); + if (!task) { + ERROR_LOG("tasks pool is empty\n"); + return -ENOMEM; + } + req_task = container_of(msg->request, struct xio_task, imsg); + list_move_tail(&task->tasks_list_entry, + &connection->pre_send_list); + + task->sender_task = req_task; + task->omsg = msg; + task->rtid = req_task->rtid; + + hdr.serial_num = msg->request->sn; + hdr.receipt_result = msg->receipt_res; + is_req = 1; + standalone_receipt = 1; + } else { + if (IS_REQUEST(msg->type) || msg->type == XIO_MSG_TYPE_RDMA) { + task = xio_nexus_get_primary_task(connection->nexus); + if (!task) { + ERROR_LOG("tasks pool is empty\n"); + return -ENOMEM; + } + task->omsg = msg; + hdr.serial_num = task->omsg->sn; + is_req = 1; + /* save the message "in" side */ + if (msg->flags & XIO_MSG_FLAG_REQUEST_READ_RECEIPT) + memcpy(&task->in_receipt, + &msg->in, sizeof(task->in_receipt)); + + list_move_tail(&task->tasks_list_entry, + &connection->pre_send_list); + } else if (IS_RESPONSE(msg->type)) { + task = container_of(msg->request, + struct xio_task, imsg); + + list_move_tail(&task->tasks_list_entry, + &connection->pre_send_list); + + hdr.serial_num = msg->request->sn; + } else { + ERROR_LOG("Unknown message type %u\n", msg->type); + return -EINVAL; + } + } + /* reset the task mbuf */ + xio_mbuf_reset(&task->mbuf); + + /* set the mbuf to beginning of tlv */ + if (xio_mbuf_tlv_start(&task->mbuf) != 0) + goto cleanup; + + task->tlv_type = msg->type; + task->session = connection->session; + task->stag = uint64_from_ptr(task->session); + task->nexus = connection->nexus; + task->connection = connection; + task->omsg = msg; + task->omsg_flags = (uint16_t)msg->flags; + task->omsg->next = NULL; + task->last_in_rxq = 0; + + /* mark as a control message */ + task->is_control = is_control; + + /* optimize for send complete */ + if (msg->type == XIO_ONE_WAY_REQ && + connection->session->ses_ops.on_ow_msg_send_complete) + xio_connection_set_ow_send_comp_params(msg); + + if (msg->type != XIO_MSG_TYPE_RDMA) { + hdr.flags = (uint32_t)msg->flags; + hdr.dest_session_id = connection->session->peer_session_id; + if (!task->is_control || task->tlv_type == XIO_ACK_REQ) { + if (IS_REQUEST(msg->type)) { + hdr.sn = connection->req_sn++; + hdr.ack_sn = connection->req_ack_sn; + } else if (IS_RESPONSE(msg->type)) { + hdr.sn = connection->rsp_sn++; + hdr.ack_sn = connection->rsp_ack_sn; + } else { + ERROR_LOG("unknown message type %u\n", + msg->type); + return -EINVAL; + } + if (connection->enable_flow_control) { + hdr.credits_msgs = + connection->credits_msgs; + connection->credits_msgs = 0; + hdr.credits_bytes = + connection->credits_bytes; + connection->credits_bytes = 0; + if (!standalone_receipt) { + connection->peer_credits_msgs--; + connection->peer_credits_bytes -= + tx_bytes; + } + } + } +#ifdef XIO_SESSION_DEBUG + hdr.connection = uint64_from_ptr(connection); + hdr.session = uint64_from_ptr(connection->session); +#endif + xio_session_write_header(task, &hdr); + } + /* send it */ + retval = xio_nexus_send(connection->nexus, task); + if (retval != 0) { + ERROR_LOG("xio_nexus_send failed with %d\n", retval); + rc = (retval == -EAGAIN) ? 
EAGAIN : xio_errno(); + if (!task->is_control || task->tlv_type == XIO_ACK_REQ) { + if (connection->enable_flow_control) { + connection->credits_msgs = hdr.credits_msgs; + connection->credits_bytes = hdr.credits_bytes; + if (!standalone_receipt && + connection->enable_flow_control) { + connection->peer_credits_msgs++; + connection->peer_credits_bytes += + tx_bytes; + } + } + } + goto cleanup; + } + return 0; + +cleanup: + if (is_req) + xio_tasks_pool_put(task); + else + list_move(&task->tasks_list_entry, &connection->io_tasks_list); + + return -rc; +} + +/*---------------------------------------------------------------------------*/ +/* xio_connection_flush_msgs */ +/*---------------------------------------------------------------------------*/ +static int xio_connection_flush_msgs(struct xio_connection *connection) +{ + struct xio_msg *pmsg, *tmp_pmsg, *omsg = NULL; + + if (!xio_msg_list_empty(&connection->reqs_msgq)) + omsg = xio_msg_list_first(&connection->reqs_msgq); + xio_msg_list_foreach_safe(pmsg, &connection->in_flight_reqs_msgq, + tmp_pmsg, pdata) { + xio_msg_list_remove(&connection->in_flight_reqs_msgq, + pmsg, pdata); + if (omsg) + xio_msg_list_insert_before(omsg, pmsg, pdata); + else + xio_msg_list_insert_tail(&connection->reqs_msgq, + pmsg, pdata); + + if (connection->enable_flow_control && + (pmsg->type == XIO_MSG_TYPE_REQ || + pmsg->type == XIO_ONE_WAY_REQ)) { + struct xio_sg_table_ops *sgtbl_ops; + void *sgtbl; + size_t tx_bytes; + + sgtbl = xio_sg_table_get(&pmsg->out); + sgtbl_ops = (struct xio_sg_table_ops *) + xio_sg_table_ops_get(pmsg->out.sgl_type); + tx_bytes = pmsg->out.header.iov_len + + tbl_length(sgtbl_ops, sgtbl); + + connection->tx_queued_msgs--; + connection->tx_bytes -= tx_bytes; + + if (connection->tx_queued_msgs < 0) + ERROR_LOG("tx_queued_msgs:%d\n", + connection->tx_queued_msgs); + } + } + + if (!xio_msg_list_empty(&connection->rsps_msgq)) + omsg = xio_msg_list_first(&connection->rsps_msgq); + else + omsg = NULL; + + xio_msg_list_foreach_safe(pmsg, &connection->in_flight_rsps_msgq, + tmp_pmsg, pdata) { + xio_msg_list_remove(&connection->in_flight_rsps_msgq, + pmsg, pdata); + if (omsg) + xio_msg_list_insert_before(omsg, pmsg, pdata); + else + xio_msg_list_insert_tail(&connection->rsps_msgq, + pmsg, pdata); + } + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_connection_notify_req_msgs_flush */ +/*---------------------------------------------------------------------------*/ +static void xio_connection_notify_req_msgs_flush(struct xio_connection + *connection, + enum xio_status status) +{ + struct xio_msg *pmsg, *tmp_pmsg; + + xio_msg_list_foreach_safe(pmsg, &connection->reqs_msgq, + tmp_pmsg, pdata) { + xio_msg_list_remove(&connection->reqs_msgq, pmsg, pdata); + if (!IS_APPLICATION_MSG(pmsg->type)) { + if (pmsg->type == XIO_FIN_REQ && + connection->state != XIO_CONNECTION_STATE_DISCONNECTED) { + connection->fin_request_flushed = 1; + /* since fin req was not really sent, need to + * "undo" the kref updates done in + * xio_send_fin_req() */ + kref_put(&connection->kref, xio_connection_post_destroy); + kref_put(&connection->kref, xio_connection_post_destroy); + } + continue; + } + xio_session_notify_msg_error(connection, pmsg, + status, + XIO_MSG_DIRECTION_OUT); + } +} + +/*---------------------------------------------------------------------------*/ +/* xio_connection_notify_rsp_msgs_flush */ +/*---------------------------------------------------------------------------*/ +static void 
xio_connection_notify_rsp_msgs_flush(struct xio_connection + *connection, + enum xio_status status) +{ + struct xio_msg *pmsg, *tmp_pmsg; + + xio_msg_list_foreach_safe(pmsg, &connection->rsps_msgq, + tmp_pmsg, pdata) { + xio_msg_list_remove(&connection->rsps_msgq, pmsg, pdata); + if (pmsg->type == XIO_ONE_WAY_RSP) { + xio_context_msg_pool_put(pmsg); + continue; + } + + /* this is read receipt */ + if (IS_RESPONSE(pmsg->type) && + (xio_app_receipt_request(pmsg) == + XIO_MSG_FLAG_EX_RECEIPT_FIRST)) { + continue; + } + if (!IS_APPLICATION_MSG(pmsg->type)) + continue; + xio_session_notify_msg_error(connection, pmsg, + status, + XIO_MSG_DIRECTION_OUT); + } +} + +/*---------------------------------------------------------------------------*/ +/* xio_connection_notify_msgs_flush */ +/*---------------------------------------------------------------------------*/ +int xio_connection_notify_msgs_flush(struct xio_connection *connection) +{ + xio_connection_notify_req_msgs_flush(connection, XIO_E_MSG_FLUSHED); + + xio_connection_notify_rsp_msgs_flush(connection, XIO_E_MSG_FLUSHED); + + connection->is_flushed = 1; + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_connection_flush_tasks */ +/*---------------------------------------------------------------------------*/ +int xio_connection_flush_tasks(struct xio_connection *connection) +{ + struct xio_task *ptask, *pnext_task; + + if (!(connection->nexus)) + return 0; + + if (!list_empty(&connection->post_io_tasks_list)) { + TRACE_LOG("post_io_list not empty!\n"); + list_for_each_entry_safe(ptask, pnext_task, + &connection->post_io_tasks_list, + tasks_list_entry) { + TRACE_LOG("post_io_list: task %p" \ + "type 0x%x ltid:%d\n", + ptask, + ptask->tlv_type, ptask->ltid); + xio_tasks_pool_put(ptask); + } + } + + if (!list_empty(&connection->pre_send_list)) { + TRACE_LOG("pre_send_list not empty!\n"); + list_for_each_entry_safe(ptask, pnext_task, + &connection->pre_send_list, + tasks_list_entry) { + TRACE_LOG("pre_send_list: task %p, " \ + "type 0x%x ltid:%d\n", + ptask, + ptask->tlv_type, ptask->ltid); + if (ptask->sender_task) { + /* the tx task is returend back to pool */ + xio_tasks_pool_put(ptask->sender_task); + ptask->sender_task = NULL; + } + xio_tasks_pool_put(ptask); + } + } + + if (!list_empty(&connection->io_tasks_list)) { + TRACE_LOG("io_tasks_list not empty!\n"); + list_for_each_entry_safe(ptask, pnext_task, + &connection->io_tasks_list, + tasks_list_entry) { + TRACE_LOG("io_tasks_list: task %p, " \ + "type 0x%x ltid:%d\n", + ptask, + ptask->tlv_type, ptask->ltid); + } + } + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_connection_update_rkeys */ +/*---------------------------------------------------------------------------*/ +static int xio_connection_update_rkeys(struct xio_connection *connection) +{ + struct xio_managed_rkey *managed_rkey; + + if (!connection->nexus) + return 0; + + if (list_empty(&connection->managed_rkey_list)) + return 0; + + list_for_each_entry(managed_rkey, + &connection->managed_rkey_list, + list_entry) { + if (xio_nexus_update_rkey(connection->nexus, + &managed_rkey->rkey)) { + ERROR_LOG("update_rkey failed: rkey %u\n", + managed_rkey->rkey); + return -1; + } + } + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_connection_restart_tasks */ +/*---------------------------------------------------------------------------*/ +static int 
xio_connection_restart_tasks(struct xio_connection *connection) +{ + struct xio_task *ptask, *pnext_task; + int is_req; + + if (!connection->nexus) + return 0; + + /* tasks in io_tasks_lists belongs to the application and should not be + * touched, the application is assumed to retransmit + */ + + /* task in post_io_tasks_list are responses freed by the application + * but there TX complete was yet arrived, in reconnect use case the + * TX complete will never happen, so free them + */ + if (!list_empty(&connection->post_io_tasks_list)) { + TRACE_LOG("post_io_list not empty!\n"); + list_for_each_entry_safe(ptask, pnext_task, + &connection->post_io_tasks_list, + tasks_list_entry) { + TRACE_LOG("post_io_list: task %p" \ + "type 0x%x ltid:%d\n", + ptask, + ptask->tlv_type, ptask->ltid); + xio_tasks_pool_put(ptask); + } + } + + /* task in pre_send_list are either response or requests, or receipt + * repeat the logic of xio_connection_send w.r.t release logic + */ + + if (!list_empty(&connection->pre_send_list)) { + TRACE_LOG("pre_send_list not empty!\n"); + list_for_each_entry_safe(ptask, pnext_task, + &connection->pre_send_list, + tasks_list_entry) { + TRACE_LOG("pre_send_list: task %p, " \ + "type 0x%x ltid:%d\n", + ptask, + ptask->tlv_type, ptask->ltid); + if (IS_RESPONSE(ptask->tlv_type) && + ((ptask->omsg_flags & + (XIO_MSG_FLAG_EX_RECEIPT_FIRST | + XIO_MSG_FLAG_EX_RECEIPT_LAST)) == + XIO_MSG_FLAG_EX_RECEIPT_FIRST)) + /* this is a receipt message */ + is_req = 1; + else + is_req = IS_REQUEST(ptask->tlv_type) || + (ptask->tlv_type == XIO_MSG_TYPE_RDMA); + + if (is_req) + xio_tasks_pool_put(ptask); + else + list_move(&ptask->tasks_list_entry, + &connection->io_tasks_list); + } + } + + if (list_empty(&connection->io_tasks_list)) + return 0; + + /* Tasks may need to be updated by the transport layer, e.g. 
+ * if tasks in io_tasks_lists need to perform RDMA write then + * the r_keys may be changed if the underling device was changed + * in case of bonding for example + */ + list_for_each_entry(ptask, + &connection->io_tasks_list, + tasks_list_entry) { + if (xio_nexus_update_task(connection->nexus, ptask)) { + ERROR_LOG("update_task failed: task %p\n", ptask); + return -1; + } + } + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_connection_xmit_inl */ +/*---------------------------------------------------------------------------*/ +static inline int xio_connection_xmit_inl( + struct xio_connection *connection, + struct xio_msg_list *msgq, + struct xio_msg_list *in_flight_msgq, + void (*flush_msgq)(struct xio_connection *, enum xio_status), + int *retry_cnt) +{ + int retval = 0, rc = 0; + struct xio_task *t; + struct xio_tasks_pool *q; + struct xio_msg *msg; + + preempt_disable(); + + msg = xio_msg_list_first(msgq); + if (!msg) { + (*retry_cnt)++; + return rc; + } + + retval = xio_connection_send(connection, msg); + if (retval) { + if (retval == -EAGAIN) { + (*retry_cnt)++; + preempt_enable(); + return 1; + } else if (retval == -ENOMSG) { + /* message error was notified */ + DEBUG_LOG("xio_connection_send failed.\n"); + /* while error drain the messages */ + *retry_cnt = 0; + rc = 0; + } else if (retval == -XIO_E_PEER_QUEUE_SIZE_MISMATCH) { + /* message larger then remote receive + * queue - flush all messages */ + (*flush_msgq)(connection, + (enum xio_status)-retval); + (*retry_cnt)++; + rc = 1; + } else { + xio_msg_list_remove(msgq, msg, pdata); + rc = -1; + } + } else { + *retry_cnt = 0; + xio_msg_list_remove(msgq, msg, pdata); + if (IS_APPLICATION_MSG(msg->type)) { + xio_msg_list_insert_tail( + in_flight_msgq, msg, + pdata); + } + } + preempt_enable(); + + q = connection->nexus->primary_tasks_pool; + t = list_first_entry_or_null(&q->stack, struct xio_task, + tasks_list_entry); + if (unlikely(!t || list_is_last(&t->tasks_list_entry, &q->stack))) { + if (q->curr_used != q->params.max_nr - 1) + xio_tasks_pool_alloc_slab(q, connection->nexus->transport_hndl); + } + + return rc; +} + +/*---------------------------------------------------------------------------*/ +/* xio_connection_xmit */ +/*---------------------------------------------------------------------------*/ +static int xio_connection_xmit(struct xio_connection *connection) +{ + int retval = 0; + int retry_cnt = 0; + + struct xio_msg_list *msgq1, *in_flight_msgq1; + struct xio_msg_list *msgq2, *in_flight_msgq2; + void (*flush_msgq1)(struct xio_connection *, enum xio_status); + void (*flush_msgq2)(struct xio_connection *, enum xio_status); + + if (connection->send_req_toggle == 0) { + msgq1 = &connection->reqs_msgq; + in_flight_msgq1 = &connection->in_flight_reqs_msgq; + flush_msgq1 = &xio_connection_notify_req_msgs_flush; + msgq2 = &connection->rsps_msgq; + in_flight_msgq2 = &connection->in_flight_rsps_msgq; + flush_msgq2 = &xio_connection_notify_rsp_msgs_flush; + } else { + msgq1 = &connection->rsps_msgq; + in_flight_msgq1 = &connection->in_flight_rsps_msgq; + flush_msgq1 = &xio_connection_notify_rsp_msgs_flush; + msgq2 = &connection->reqs_msgq; + in_flight_msgq2 = &connection->in_flight_reqs_msgq; + flush_msgq2 = &xio_connection_notify_req_msgs_flush; + } + + while (retry_cnt < 2) { + retval = xio_connection_xmit_inl(connection, + msgq1, in_flight_msgq1, + flush_msgq1, + &retry_cnt); + if (retval < 0) { + connection->send_req_toggle = + 1 - connection->send_req_toggle; + 
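+			/*
+			 * a failure on this queue flips the toggle so that the
+			 * next xmit pass starts from the other queue (requests
+			 * vs. responses) and one side does not starve the other
+			 */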
break; + } + retval = xio_connection_xmit_inl(connection, + msgq2, in_flight_msgq2, + flush_msgq2, + &retry_cnt); + if (retval < 0) + break; + } + + if (retval < 0) { + xio_set_error(-retval); + ERROR_LOG("failed to send message - %s\n", + xio_strerror(-retval)); + return -1; + } else { + return 0; + } +} + +/*---------------------------------------------------------------------------*/ +/* xio_connection_remove_in_flight */ +/*---------------------------------------------------------------------------*/ +int xio_connection_remove_in_flight(struct xio_connection *connection, + struct xio_msg *msg) +{ + if (!IS_APPLICATION_MSG(msg->type)) + return 0; + + if (IS_REQUEST(msg->type) || msg->type == XIO_MSG_TYPE_RDMA) + xio_msg_list_remove( + &connection->in_flight_reqs_msgq, msg, pdata); + else if (IS_RESPONSE(msg->type)) + xio_msg_list_remove( + &connection->in_flight_rsps_msgq, msg, pdata); + else { + ERROR_LOG("unexpected message type %u\n", msg->type); + return -EINVAL; + } + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_connection_remove_msg_from_queue */ +/*---------------------------------------------------------------------------*/ +int xio_connection_remove_msg_from_queue(struct xio_connection *connection, + struct xio_msg *msg) +{ + if (!IS_APPLICATION_MSG(msg->type)) + return 0; + + if (IS_REQUEST(msg->type) || msg->type == XIO_MSG_TYPE_RDMA) + xio_msg_list_remove( + &connection->reqs_msgq, msg, pdata); + else if (IS_RESPONSE(msg->type)) + xio_msg_list_remove( + &connection->rsps_msgq, msg, pdata); + else { + ERROR_LOG("unexpected message type %u\n", msg->type); + return -EINVAL; + } + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_connection_reconnect */ +/*---------------------------------------------------------------------------*/ +int xio_connection_reconnect(struct xio_connection *connection) +{ + + connection->close_reason = XIO_E_SESSION_DISCONNECTED; + + /* Notify user on reconnection start */ + xio_session_notify_reconnecting(connection->session, + connection); + + return 0; + +} + +/*---------------------------------------------------------------------------*/ +/* xio_connection_restart */ +/*---------------------------------------------------------------------------*/ +int xio_connection_restart(struct xio_connection *connection) +{ + int retval; + + retval = xio_connection_flush_msgs(connection); + if (retval) + return retval; + + retval = xio_connection_update_rkeys(connection); + if (retval) + return retval; + + retval = xio_connection_restart_tasks(connection); + if (retval) + return retval; + + /* raise restart flag */ + connection->restarted = 1; + + /* Notify user on responses */ + xio_connection_notify_rsp_msgs_flush(connection, XIO_E_MSG_FLUSHED); + + /* Notify user on reconnection end */ + xio_session_notify_reconnected(connection->session, + connection); + + /* restart transmission */ + retval = xio_connection_xmit(connection); + if (retval) + return retval; + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_send_request */ +/*---------------------------------------------------------------------------*/ +int xio_send_request(struct xio_connection *connection, + struct xio_msg *msg) +{ + struct xio_msg_list reqs_msgq; + struct xio_msg *pmsg; + struct xio_sg_table_ops *sgtbl_ops; + void *sgtbl; + size_t tx_bytes; + int nr = -1; + int retval = 0; +#ifdef XIO_CFLAG_STAT_COUNTERS + struct 
xio_statistics *stats; +#endif +#ifdef XIO_CFLAG_EXTRA_CHECKS + int valid; +#endif + + if (!connection || !msg) { + xio_set_error(EINVAL); + return -1; + } + + if (unlikely(connection->disconnecting || + (connection->state != XIO_CONNECTION_STATE_ONLINE && + connection->state != XIO_CONNECTION_STATE_ESTABLISHED && + connection->state != XIO_CONNECTION_STATE_INIT))) { + xio_set_error(XIO_ESHUTDOWN); + return -1; + } +#ifdef XIO_THREAD_SAFE_DEBUG + xio_ctx_debug_thread_lock(connection->ctx); +#endif + + if (msg->next) { + xio_msg_list_init(&reqs_msgq); + nr = 0; + } + + pmsg = msg; +#ifdef XIO_CFLAG_STAT_COUNTERS + stats = &connection->ctx->stats; +#endif + while (pmsg) { + if (unlikely(connection->tx_queued_msgs > + connection->session->snd_queue_depth_msgs)) { + xio_set_error(XIO_E_TX_QUEUE_OVERFLOW); + DEBUG_LOG("send queue overflow %d\n", + connection->tx_queued_msgs); + retval = -1; + goto send; + } +#ifdef XIO_CFLAG_EXTRA_CHECKS + valid = xio_session_is_valid_in_req(connection->session, pmsg); + if (unlikely(!valid)) { + xio_set_error(EINVAL); + ERROR_LOG("invalid in message\n"); + retval = -1; + goto send; + } + valid = xio_session_is_valid_out_msg(connection->session, pmsg); + if (unlikely(!valid)) { + xio_set_error(EINVAL); + ERROR_LOG("invalid out message\n"); + retval = -1; + goto send; + } +#endif + + sgtbl = xio_sg_table_get(&pmsg->out); + sgtbl_ops = (struct xio_sg_table_ops *) + xio_sg_table_ops_get(pmsg->out.sgl_type); + tx_bytes = pmsg->out.header.iov_len + tbl_length( + sgtbl_ops, + sgtbl); + + if (unlikely(connection->tx_bytes + tx_bytes > + connection->session->snd_queue_depth_bytes)) { + xio_set_error(XIO_E_TX_QUEUE_OVERFLOW); + ERROR_LOG("send queue overflow. queued:%lu bytes\n", + connection->tx_bytes); + retval = -1; + goto send; + } +#ifdef XIO_CFLAG_STAT_COUNTERS + pmsg->timestamp = get_cycles(); + xio_stat_inc(stats, XIO_STAT_TX_MSG); + xio_stat_add(stats, XIO_STAT_TX_BYTES, tx_bytes); +#endif + + pmsg->sn = xio_session_get_sn(connection->session); + pmsg->type = XIO_MSG_TYPE_REQ; + + if (connection->enable_flow_control) { + connection->tx_queued_msgs++; + connection->tx_bytes += tx_bytes; + } + if (nr == -1) + xio_msg_list_insert_tail(&connection->reqs_msgq, pmsg, + pdata); + else { + nr++; + xio_msg_list_insert_tail(&reqs_msgq, pmsg, pdata); + } + pmsg = pmsg->next; + } + if (nr > 0) + xio_msg_list_concat(&connection->reqs_msgq, &reqs_msgq, pdata); + +send: + /* do not xmit until connection is assigned */ + if (xio_is_connection_online(connection)) + if (xio_connection_xmit(connection)) { +#ifdef XIO_THREAD_SAFE_DEBUG + xio_ctx_debug_thread_unlock(connection->ctx); +#endif + return -1; + } +#ifdef XIO_THREAD_SAFE_DEBUG + xio_ctx_debug_thread_unlock(connection->ctx); +#endif + + return retval; +} +EXPORT_SYMBOL(xio_send_request); + +/*---------------------------------------------------------------------------*/ +/* xio_send_response */ +/*---------------------------------------------------------------------------*/ +int xio_send_response(struct xio_msg *msg) +{ + struct xio_task *task; + struct xio_connection *connection = NULL; + struct xio_vmsg *vmsg; + struct xio_msg *pmsg = msg; + struct xio_sg_table_ops *sgtbl_ops; + void *sgtbl; + size_t bytes; + int retval = 0; + +#ifdef XIO_CFLAG_STAT_COUNTERS + struct xio_statistics *stats; +#endif +#ifdef XIO_CFLAG_EXTRA_CHECKS + int valid; +#endif + +#ifdef XIO_THREAD_SAFE_DEBUG + task = container_of(pmsg->request, struct xio_task, imsg); + xio_ctx_debug_thread_lock(task->connection->ctx); +#endif + + while (pmsg) { + 
task = container_of(pmsg->request, struct xio_task, imsg); + connection = task->connection; +#ifdef XIO_CFLAG_STAT_COUNTERS + stats = &connection->ctx->stats; +#endif + vmsg = &pmsg->out; + + if (task->imsg.sn != pmsg->request->sn) { + ERROR_LOG("match not found: request sn:%llu, " \ + "response sn:%llu\n", + task->imsg.sn, pmsg->request->sn); + xio_set_error(EINVAL); + retval = -1; + + goto send; + } + /* set type for notification */ + pmsg->type = XIO_MSG_TYPE_RSP; + if (unlikely( + connection->disconnecting || + (connection->state != XIO_CONNECTION_STATE_ONLINE && + connection->state != XIO_CONNECTION_STATE_ESTABLISHED && + connection->state != XIO_CONNECTION_STATE_INIT))) { + /* we discard the response as connection is not active + * anymore + */ + xio_set_error(XIO_ESHUTDOWN); + xio_tasks_pool_put(task); + + xio_session_notify_msg_error(connection, pmsg, + XIO_E_MSG_DISCARDED, + XIO_MSG_DIRECTION_OUT); + + pmsg = pmsg->next; + continue; + } + if (task->state != XIO_TASK_STATE_DELIVERED && + task->state != XIO_TASK_STATE_READ) { + ERROR_LOG("duplicate response send. request sn:%llu\n", + task->imsg.sn); + + xio_session_notify_msg_error(connection, pmsg, + XIO_E_MSG_INVALID, + XIO_MSG_DIRECTION_OUT); + pmsg = pmsg->next; + continue; + } +#ifdef XIO_CFLAG_STAT_COUNTERS + /* Server latency */ + xio_stat_add(stats, XIO_STAT_APPDELAY, + get_cycles() - task->imsg.timestamp); +#endif +#ifdef XIO_CFLAG_EXTRA_CHECKS + valid = xio_session_is_valid_out_msg(connection->session, pmsg); + if (!valid) { + xio_set_error(EINVAL); + ERROR_LOG("invalid out message\n"); + retval = -1; + goto send; + } +#endif + +#ifdef XIO_CFLAG_STAT_COUNTERS + sgtbl = xio_sg_table_get(vmsg); + sgtbl_ops = (struct xio_sg_table_ops *) + xio_sg_table_ops_get(vmsg->sgl_type); + bytes = vmsg->header.iov_len + + tbl_length(sgtbl_ops, sgtbl); + + xio_stat_inc(stats, XIO_STAT_TX_MSG); + xio_stat_add(stats, XIO_STAT_TX_BYTES, bytes); +#endif + pmsg->flags |= XIO_MSG_FLAG_EX_RECEIPT_LAST; + if ((pmsg->request->flags & + XIO_MSG_FLAG_REQUEST_READ_RECEIPT) && + (task->state == XIO_TASK_STATE_DELIVERED)) + pmsg->flags |= XIO_MSG_FLAG_EX_RECEIPT_FIRST; + task->state = XIO_TASK_STATE_READ; + if (connection->enable_flow_control) { + vmsg = &pmsg->request->in; + sgtbl = xio_sg_table_get(vmsg); + sgtbl_ops = (struct xio_sg_table_ops *) + xio_sg_table_ops_get(vmsg->sgl_type); + bytes = vmsg->header.iov_len + + tbl_length(sgtbl_ops, sgtbl); + + connection->credits_msgs++; + connection->credits_bytes += bytes; + } + xio_msg_list_insert_tail(&connection->rsps_msgq, pmsg, pdata); + + pmsg = pmsg->next; + } + +send: + + /* do not xmit until connection is assigned */ + if (connection && xio_is_connection_online(connection)) { + if (xio_connection_xmit(connection)) { +#ifdef XIO_THREAD_SAFE_DEBUG + xio_ctx_debug_thread_unlock(connection->ctx); +#endif + return -1; + } + } +#ifdef XIO_THREAD_SAFE_DEBUG + xio_ctx_debug_thread_unlock(connection->ctx); +#endif + + return retval; +} +EXPORT_SYMBOL(xio_send_response); + +/*---------------------------------------------------------------------------*/ +/* xio_connection_send_read_receipt */ +/*---------------------------------------------------------------------------*/ +int xio_connection_send_read_receipt(struct xio_connection *connection, + struct xio_msg *msg) +{ + struct xio_msg *rsp; + struct xio_task *task; + + task = container_of(msg, struct xio_task, imsg); + + rsp = (struct xio_msg *)xio_context_msg_pool_get(connection->ctx); + + rsp->type = (enum xio_msg_type) + (((unsigned)msg->type & 
~XIO_REQUEST) | XIO_RESPONSE); + rsp->request = msg; + + rsp->flags = XIO_MSG_FLAG_EX_RECEIPT_FIRST; + task->state = XIO_TASK_STATE_READ; + + rsp->out.header.iov_len = 0; + rsp->out.data_tbl.nents = 0; + rsp->in.header.iov_len = 0; + rsp->in.data_tbl.nents = 0; + + xio_msg_list_insert_tail(&connection->rsps_msgq, rsp, pdata); + + /* do not xmit until connection is assigned */ + if (xio_is_connection_online(connection)) + return xio_connection_xmit(connection); + + return 0; +} + +int xio_connection_release_read_receipt(struct xio_connection *connection, + struct xio_msg *msg) +{ + xio_context_msg_pool_put(msg); + return 0; +} + +static int xio_send_typed_msg(struct xio_connection *connection, + struct xio_msg *msg, + enum xio_msg_type msg_type) +{ + struct xio_msg_list reqs_msgq; + struct xio_msg *pmsg = msg; + struct xio_sg_table_ops *sgtbl_ops; + void *sgtbl; + size_t tx_bytes; + int nr = -1; + int retval = 0; +#ifdef XIO_CFLAG_STAT_COUNTERS + struct xio_statistics *stats = &connection->ctx->stats; +#endif +#ifdef XIO_CFLAG_EXTRA_CHECKS + int valid; +#endif + + if (unlikely(connection->disconnecting || + (connection->state != XIO_CONNECTION_STATE_ONLINE && + connection->state != XIO_CONNECTION_STATE_ESTABLISHED && + connection->state != XIO_CONNECTION_STATE_INIT))) { + xio_set_error(XIO_ESHUTDOWN); + return -1; + } + + if (msg->next) { + xio_msg_list_init(&reqs_msgq); + nr = 0; + } + + while (pmsg) { + if (unlikely(connection->tx_queued_msgs > + connection->session->snd_queue_depth_msgs)) { + xio_set_error(XIO_E_TX_QUEUE_OVERFLOW); + WARN_LOG("send queue overflow %d\n", + connection->tx_queued_msgs); + retval = -1; + goto send; + } + +#ifdef XIO_CFLAG_EXTRA_CHECKS + valid = xio_session_is_valid_out_msg(connection->session, pmsg); + if (unlikely(!valid)) { + xio_set_error(EINVAL); + ERROR_LOG("invalid out message\n"); + retval = -1; + goto send; + } +#endif + sgtbl = xio_sg_table_get(&pmsg->out); + sgtbl_ops = (struct xio_sg_table_ops *) + xio_sg_table_ops_get(pmsg->out.sgl_type); + tx_bytes = pmsg->out.header.iov_len + tbl_length( + sgtbl_ops, + sgtbl); + + if (unlikely(connection->tx_bytes + tx_bytes > + connection->session->snd_queue_depth_bytes)) { + xio_set_error(XIO_E_TX_QUEUE_OVERFLOW); + ERROR_LOG("send queue overflow. 
queued:%lu bytes\n", + connection->tx_bytes); + retval = -1; + goto send; + } +#ifdef XIO_CFLAG_STAT_COUNTERS + pmsg->timestamp = get_cycles(); + xio_stat_inc(stats, XIO_STAT_TX_MSG); + xio_stat_add(stats, XIO_STAT_TX_BYTES, tx_bytes); +#endif + pmsg->sn = xio_session_get_sn(connection->session); + pmsg->type = msg_type; + + if (connection->enable_flow_control) { + connection->tx_queued_msgs++; + connection->tx_bytes += tx_bytes; + TRACE_LOG( + "connection->tx_queued_msgs=%d, connection->tx_bytes=%zu\n", + connection->tx_queued_msgs, + connection->tx_bytes); + } + if (nr == -1) + xio_msg_list_insert_tail(&connection->reqs_msgq, pmsg, + pdata); + else { + nr++; + xio_msg_list_insert_tail(&reqs_msgq, pmsg, pdata); + } + + pmsg = pmsg->next; + } + if (nr > 0) + xio_msg_list_concat(&connection->reqs_msgq, &reqs_msgq, pdata); + +send: + /* do not xmit until connection is assigned */ + if (xio_is_connection_online(connection)) { + if (xio_connection_xmit(connection)) { + return -1; + } + } + return retval; +} + +/*---------------------------------------------------------------------------*/ +/* xio_send_msg */ +/*---------------------------------------------------------------------------*/ +int xio_send_msg(struct xio_connection *connection, + struct xio_msg *msg) +{ + int retval; +#ifdef XIO_THREAD_SAFE_DEBUG + xio_ctx_debug_thread_lock(connection->ctx); +#endif + retval = xio_send_typed_msg(connection, msg, XIO_ONE_WAY_REQ); +#ifdef XIO_THREAD_SAFE_DEBUG + xio_ctx_debug_thread_unlock(connection->ctx); +#endif + return retval; +} +EXPORT_SYMBOL(xio_send_msg); + +/*---------------------------------------------------------------------------*/ +/* xio_send_rdma */ +/*---------------------------------------------------------------------------*/ +int xio_send_rdma(struct xio_connection *connection, + struct xio_msg *msg) +{ + int retval; +#ifdef XIO_THREAD_SAFE_DEBUG + xio_ctx_debug_thread_lock(connection->ctx); +#endif + if (unlikely(connection->nexus->transport_hndl->proto != XIO_PROTO_RDMA)) { + xio_set_error(XIO_E_NOT_SUPPORTED); + ERROR_LOG("using xio_send_rdma over TCP transport"); +#ifdef XIO_THREAD_SAFE_DEBUG + xio_ctx_debug_thread_unlock(connection->ctx); +#endif + return -1; + } + retval = xio_send_typed_msg(connection, msg, XIO_MSG_TYPE_RDMA); +#ifdef XIO_THREAD_SAFE_DEBUG + xio_ctx_debug_thread_unlock(connection->ctx); +#endif + return retval; +} +EXPORT_SYMBOL(xio_send_rdma); + +/*---------------------------------------------------------------------------*/ +/* xio_connection_xmit_msgs */ +/*---------------------------------------------------------------------------*/ +int xio_connection_xmit_msgs(struct xio_connection *connection) +{ + if (connection->state == XIO_CONNECTION_STATE_ONLINE /*|| + connection->state == XIO_CONNECTION_STATE_FIN_WAIT_1*/) { + return xio_connection_xmit(connection); + } + + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_connection_post_close */ +/*---------------------------------------------------------------------------*/ +static void xio_connection_post_close(void *_connection) +{ + struct xio_connection *connection = (struct xio_connection *)_connection; + + xio_ctx_del_work(connection->ctx, &connection->hello_work); + + xio_ctx_del_delayed_work(connection->ctx, + &connection->fin_delayed_work); + + xio_ctx_del_delayed_work(connection->ctx, + &connection->fin_timeout_work); + + xio_ctx_del_delayed_work(connection->ctx, + &connection->ka.timer); + + xio_ctx_del_work(connection->ctx, 
&connection->fin_work); + + xio_ctx_del_work(connection->ctx, &connection->teardown_work); + spin_lock(&connection->ctx->ctx_list_lock); + list_del(&connection->ctx_list_entry); + spin_unlock(&connection->ctx->ctx_list_lock); + + kfree(connection); + +} + +/*---------------------------------------------------------------------------*/ +/* xio_connection_close */ +/*---------------------------------------------------------------------------*/ +int xio_connection_close(struct xio_connection *connection) +{ + if (xio_ctx_is_work_in_handler(connection->ctx, + &connection->teardown_work)) { + xio_ctx_set_work_destructor( + connection->ctx, connection, + xio_connection_post_close, + &connection->teardown_work); + } else { + xio_connection_post_close(connection); + } + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_connection_queue_io_task */ +/*---------------------------------------------------------------------------*/ +void xio_connection_queue_io_task(struct xio_connection *connection, + struct xio_task *task) +{ + list_move_tail(&task->tasks_list_entry, &connection->io_tasks_list); +} + +/*---------------------------------------------------------------------------*/ +/* xio_release_response_task */ +/*---------------------------------------------------------------------------*/ +void xio_release_response_task(struct xio_task *task) +{ + /* the tx task is returned back to pool */ + if (task->sender_task) { + xio_tasks_pool_put(task->sender_task); + task->sender_task = NULL; + } + + /* the rx task is returned back to pool */ + xio_tasks_pool_put(task); +} + +/*---------------------------------------------------------------------------*/ +/* xio_release_response */ +/*---------------------------------------------------------------------------*/ +int xio_release_response(struct xio_msg *msg) +{ + struct xio_task *task; + struct xio_connection *connection = NULL; + struct xio_msg *pmsg = msg; + + while (pmsg) { + if (unlikely(!IS_RESPONSE(pmsg->type))) { + ERROR_LOG("xio_release_rsp failed. invalid type:0x%x\n", + pmsg->type); + xio_set_error(EINVAL); + return -1; + } + task = container_of(pmsg->request, struct xio_task, imsg); + if (unlikely(!task->sender_task || + task->tlv_type != XIO_MSG_TYPE_RSP)) { + /* do not release response in responder */ + ERROR_LOG("xio_release_rsp failed. 
invalid type:0x%x\n", + task->tlv_type); + xio_set_error(EINVAL); + return -1; + } + connection = task->connection; +#ifdef XIO_THREAD_SAFE_DEBUG + xio_ctx_debug_thread_lock(connection->ctx); +#endif + + if (connection->enable_flow_control) { + struct xio_vmsg *vmsg; + struct xio_sg_table_ops *sgtbl_ops; + void *sgtbl; + size_t bytes; + + vmsg = &task->sender_task->omsg->out; + sgtbl = xio_sg_table_get(vmsg); + sgtbl_ops = (struct xio_sg_table_ops *) + xio_sg_table_ops_get(vmsg->sgl_type); + bytes = vmsg->header.iov_len + + tbl_length(sgtbl_ops, sgtbl); + + connection->tx_queued_msgs--; + connection->tx_bytes -= bytes; + + vmsg = &task->imsg.in; + sgtbl = xio_sg_table_get(vmsg); + sgtbl_ops = (struct xio_sg_table_ops *) + xio_sg_table_ops_get(vmsg->sgl_type); + bytes = vmsg->header.iov_len + + tbl_length(sgtbl_ops, sgtbl); + + connection->credits_msgs++; + connection->credits_bytes += bytes; + + if (connection->state == XIO_CONNECTION_STATE_ONLINE && + ((connection->credits_msgs >= + connection->rx_queue_watermark_msgs) || + (connection->credits_bytes >= + connection->rx_queue_watermark_bytes))) + xio_send_credits_ack(connection); + } + + list_move_tail(&task->tasks_list_entry, + &connection->post_io_tasks_list); + + xio_release_response_task(task); + + pmsg = pmsg->next; + } + if (connection && xio_is_connection_online(connection)) { +#ifdef XIO_THREAD_SAFE_DEBUG + xio_ctx_debug_thread_unlock(connection->ctx); +#endif + return xio_connection_xmit(connection); + } + +#ifdef XIO_THREAD_SAFE_DEBUG + xio_ctx_debug_thread_unlock(connection->ctx); +#endif + return 0; +} +EXPORT_SYMBOL(xio_release_response); + +/*---------------------------------------------------------------------------*/ +/* xio_release_msg */ +/*---------------------------------------------------------------------------*/ +int xio_release_msg(struct xio_msg *msg) +{ + struct xio_task *task; + struct xio_connection *connection = NULL; + struct xio_msg *pmsg = msg; + int retval; + +#ifdef XIO_THREAD_SAFE_DEBUG + task = container_of(pmsg, struct xio_task, imsg); + xio_ctx_debug_thread_lock(task->connection->ctx); +#endif + + while (pmsg) { + if (unlikely(pmsg->type != XIO_ONE_WAY_REQ)) { + ERROR_LOG("xio_release_msg failed. invalid type:0x%x\n", + pmsg->type); + xio_set_error(EINVAL); +#ifdef XIO_THREAD_SAFE_DEBUG + xio_ctx_debug_thread_unlock(task->connection->ctx); +#endif + return -1; + } + task = container_of(pmsg, struct xio_task, imsg); + if (unlikely(task->tlv_type != XIO_ONE_WAY_REQ)) { + ERROR_LOG("xio_release_msg failed. 
invalid type:0x%x\n", + task->tlv_type); + xio_set_error(EINVAL); +#ifdef XIO_THREAD_SAFE_DEBUG + xio_ctx_debug_thread_unlock(task->connection->ctx); +#endif + return -1; + } + connection = task->connection; + if (connection->enable_flow_control) { + struct xio_vmsg *vmsg; + struct xio_sg_table_ops *sgtbl_ops; + void *sgtbl; + size_t bytes; + + vmsg = &task->imsg.in; + sgtbl = xio_sg_table_get(vmsg); + sgtbl_ops = (struct xio_sg_table_ops *) + xio_sg_table_ops_get(vmsg->sgl_type); + bytes = vmsg->header.iov_len + + tbl_length(sgtbl_ops, sgtbl); + + connection->credits_msgs++; + connection->credits_bytes += bytes; + if (connection->state == XIO_CONNECTION_STATE_ONLINE && + ((connection->credits_msgs >= + connection->rx_queue_watermark_msgs) || + (connection->credits_bytes >= + connection->rx_queue_watermark_bytes))) + xio_send_credits_ack(connection); + } + + list_move_tail(&task->tasks_list_entry, + &connection->post_io_tasks_list); + + /* the rx task is returned back to pool */ + xio_tasks_pool_put(task); + + pmsg = pmsg->next; + } + + if (connection && xio_is_connection_online(connection)) { + retval = xio_connection_xmit(connection); +#ifdef XIO_THREAD_SAFE_DEBUG + xio_ctx_debug_thread_unlock(connection->ctx); +#endif + return retval; + } + +#ifdef XIO_THREAD_SAFE_DEBUG + xio_ctx_debug_thread_unlock(connection->ctx); +#endif + + return 0; +} +EXPORT_SYMBOL(xio_release_msg); + +/*---------------------------------------------------------------------------*/ +/* xio_poll_completions */ +/*---------------------------------------------------------------------------*/ +int xio_poll_completions(struct xio_connection *connection, + long min_nr, long nr, + struct timespec *timeout) +{ + if (connection->nexus) + return xio_nexus_poll(connection->nexus, min_nr, nr, timeout); + else + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_fin_req_timeout */ +/*---------------------------------------------------------------------------*/ +static void xio_fin_req_timeout(void *data) +{ + struct xio_connection *connection = (struct xio_connection *)data; + + if (connection->fin_req_timeout) + return; + + connection->fin_req_timeout++; + ERROR_LOG("connection close timeout. 
session:%p, connection:%p\n", + connection->session, connection); + + DEBUG_LOG("connection %p state change: current_state:%s, " \ + "next_state:%s\n", + connection, + xio_connection_state_str((enum xio_connection_state) + connection->state), + xio_connection_state_str(XIO_CONNECTION_STATE_CLOSED)); + + /* connection got disconnection during LAST ACK state - \ + * ignore request timeout */ + if (connection->state == XIO_CONNECTION_STATE_LAST_ACK) { + connection->state = XIO_CONNECTION_STATE_CLOSED; + goto exit; + } else if (connection->state == XIO_CONNECTION_STATE_FIN_WAIT_1) { + kref_put(&connection->kref, xio_connection_post_destroy); + } + + /* flush all messages from in flight message queue to in queue */ + xio_connection_flush_msgs(connection); + + /* flush all messages back to user */ + xio_connection_notify_msgs_flush(connection); + + connection->state = XIO_CONNECTION_STATE_CLOSED; + + if (!connection->disable_notify) + xio_ctx_add_work( + connection->ctx, + connection, + xio_connection_teardown_handler, + &connection->teardown_work); + else + xio_connection_destroy(connection); +exit: + kref_put(&connection->kref, xio_connection_post_destroy); +} + +/*---------------------------------------------------------------------------*/ +/* xio_send_fin_req */ +/*---------------------------------------------------------------------------*/ +int xio_send_fin_req(struct xio_connection *connection) +{ + struct xio_msg *msg; + int retval; + + msg = (struct xio_msg *)xio_context_msg_pool_get(connection->ctx); + + msg->type = (enum xio_msg_type)XIO_FIN_REQ; + msg->in.header.iov_len = 0; + msg->out.header.iov_len = 0; + msg->in.data_tbl.nents = 0; + msg->out.data_tbl.nents = 0; + + /* insert to the tail of the queue */ + xio_msg_list_insert_tail(&connection->reqs_msgq, msg, pdata); + + DEBUG_LOG("send fin request. session:%p, connection:%p\n", + connection->session, connection); + + /* trigger the timer */ + connection->fin_req_timeout = 0; + retval = xio_ctx_add_delayed_work( + connection->ctx, + connection->disconnect_timeout, connection, + xio_fin_req_timeout, + &connection->fin_timeout_work); + if (retval != 0) { + ERROR_LOG("xio_ctx_timer_add failed.\n"); + return retval; + } + + /* avoid race for recv and send completion and xio_connection_destroy */ + kref_get(&connection->kref); + kref_get(&connection->kref); + + /* do not xmit until connection is assigned */ + return xio_connection_xmit(connection); +} + +/*---------------------------------------------------------------------------*/ +/* xio_send_fin_ack */ +/*---------------------------------------------------------------------------*/ +int xio_send_fin_ack(struct xio_connection *connection, struct xio_task *task) +{ + struct xio_msg *msg; + + msg = (struct xio_msg *)xio_context_msg_pool_get(connection->ctx); + + msg->type = (enum xio_msg_type)XIO_FIN_RSP; + msg->request = &task->imsg; + msg->in.header.iov_len = 0; + msg->out.header.iov_len = 0; + msg->in.data_tbl.nents = 0; + msg->out.data_tbl.nents = 0; + + /* insert to the tail of the queue */ + xio_msg_list_insert_tail(&connection->rsps_msgq, msg, pdata); + + DEBUG_LOG("send fin response. 
session:%p, connection:%p\n", + connection->session, connection); + + /* add reference to avoid race */ + kref_get(&connection->kref); + + /* status is not important - just send */ + return xio_connection_xmit(connection); +} + +/*---------------------------------------------------------------------------*/ +/* xio_connection_release_fin */ +/*---------------------------------------------------------------------------*/ +int xio_connection_release_fin(struct xio_connection *connection, + struct xio_msg *msg) +{ + xio_context_msg_pool_put(msg); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_disconnect_initial_connection */ +/*---------------------------------------------------------------------------*/ +int xio_disconnect_initial_connection(struct xio_connection *connection) +{ + struct xio_msg *msg; + int retval; + + msg = (struct xio_msg *)xio_context_msg_pool_get(connection->ctx); + + msg->type = (enum xio_msg_type)XIO_FIN_REQ; + msg->in.header.iov_len = 0; + msg->out.header.iov_len = 0; + msg->in.data_tbl.nents = 0; + msg->out.data_tbl.nents = 0; + + DEBUG_LOG("send fin request. session:%p, connection:%p\n", + connection->session, connection); + + TRACE_LOG("connection %p state change: current_state:%s, " \ + "next_state:%s\n", + connection, + xio_connection_state_str((enum xio_connection_state) + connection->state), + xio_connection_state_str(XIO_CONNECTION_STATE_FIN_WAIT_1)); + + connection->state = XIO_CONNECTION_STATE_FIN_WAIT_1; + + /* avoid race for recv and send completion and xio_connection_destroy */ + kref_get(&connection->kref); /* for recv */ + kref_get(&connection->kref); /* for send comp */ + kref_get(&connection->kref); /* for time wait */ + + /* we don't want to send all queued messages yet - send directly */ + retval = xio_connection_send(connection, msg); + if (retval == -EAGAIN) + retval = 0; + + if (!connection->disable_notify) + xio_session_notify_connection_closed(connection->session, + connection); + return retval; +} + +static void xio_pre_disconnect(void *conn) +{ + struct xio_connection *connection = (struct xio_connection *)conn; + + /* now we are on the right context, reaffirm that in the mean time, + * state was not changed + */ + if (connection->state != XIO_CONNECTION_STATE_ONLINE) + return; + + kref_get(&connection->kref); /* for time wait */ + + /* on keep alive timeout, assume fin is also timeout and bypass */ + if (!connection->ka.timedout) { + connection->state = XIO_CONNECTION_STATE_FIN_WAIT_1; + + xio_send_fin_req(connection); + + if (!connection->disable_notify) { + connection->close_reason = XIO_E_SESSION_CLOSED; + xio_session_notify_connection_closed( + connection->session, connection); + } + } else { + if (!connection->disable_notify) { + connection->close_reason = XIO_E_TIMEOUT; + xio_session_notify_connection_closed( + connection->session, connection); + } + connection->state = XIO_CONNECTION_STATE_TIME_WAIT; + xio_close_time_wait(connection); + } +} + +/*---------------------------------------------------------------------------*/ +/* xio_disconnect */ +/*---------------------------------------------------------------------------*/ +int xio_disconnect(struct xio_connection *connection) +{ + int retval; + + /* active close state machine */ + + if (!connection || !connection->session) { + xio_set_error(EINVAL); + ERROR_LOG("xio_disconnect failed 'Invalid argument'\n"); + return -1; + } + + DEBUG_LOG("xio_disconnect. 
session:%p connection:%p state:%s\n", + connection->session, connection, + xio_connection_state_str((enum xio_connection_state) + connection->state)); + + if ((connection->state != XIO_CONNECTION_STATE_ONLINE && + connection->state != XIO_CONNECTION_STATE_ESTABLISHED) || + connection->disconnecting) { + /* delay the disconnection to when connection become online */ + connection->disconnecting = 1; + + return 0; + } + connection->disconnecting = 1; + retval = xio_ctx_add_work( + connection->ctx, + connection, + xio_pre_disconnect, + &connection->fin_work); + if (retval != 0) { + ERROR_LOG("xio_ctx_timer_add failed.\n"); + + return retval; + } + + return 0; +} +EXPORT_SYMBOL(xio_disconnect); + +/*---------------------------------------------------------------------------*/ +/* xio_cancel_request */ +/*---------------------------------------------------------------------------*/ +int xio_cancel_request(struct xio_connection *connection, + struct xio_msg *req) +{ + struct xio_msg *pmsg, *tmp_pmsg; + uint64_t stag; + struct xio_session_cancel_hdr hdr; + +#ifdef XIO_THREAD_SAFE_DEBUG + xio_ctx_debug_thread_lock(connection->ctx); +#endif + + /* search the tx */ + xio_msg_list_foreach_safe(pmsg, &connection->reqs_msgq, + tmp_pmsg, pdata) { + if (pmsg->sn == req->sn) { + ERROR_LOG("[%llu] - message found on reqs_msgq\n", + req->sn); + xio_msg_list_remove(&connection->reqs_msgq, + pmsg, pdata); + xio_session_notify_cancel( + connection, pmsg, XIO_E_MSG_CANCELED); +#ifdef XIO_THREAD_SAFE_DEBUG + xio_ctx_debug_thread_unlock(connection->ctx); +#endif + return 0; + } + } + hdr.sn = htonll(req->sn); + hdr.requester_session_id = + htonl(connection->session->session_id); + hdr.responder_session_id = + htonl(connection->session->peer_session_id); + stag = + uint64_from_ptr(connection->session); + + /* cancel request on tx */ + xio_nexus_cancel_req(connection->nexus, req, stag, &hdr, sizeof(hdr)); +#ifdef XIO_THREAD_SAFE_DEBUG + xio_ctx_debug_thread_unlock(connection->ctx); +#endif + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_connection_send_cancel_response */ +/*---------------------------------------------------------------------------*/ +int xio_connection_send_cancel_response(struct xio_connection *connection, + struct xio_msg *msg, + struct xio_task *task, + enum xio_status result) +{ + struct xio_session_cancel_hdr hdr; + + hdr.sn = htonll(msg->sn); + hdr.responder_session_id = htonl(connection->session->session_id); + hdr.requester_session_id = htonl(connection->session->peer_session_id); + + xio_nexus_cancel_rsp(connection->nexus, task, result, + &hdr, sizeof(hdr)); + + return 0; +} + +struct xio_task *xio_connection_find_io_task(struct xio_connection *connection, + uint64_t msg_sn) +{ + struct xio_task *ptask; + + /* look in the tx_comp */ + list_for_each_entry(ptask, &connection->io_tasks_list, + tasks_list_entry) { + if (ptask->imsg.sn == msg_sn) + return ptask; + } + + return NULL; +} + +/*---------------------------------------------------------------------------*/ +/* xio_cancel */ +/*---------------------------------------------------------------------------*/ +int xio_cancel(struct xio_msg *req, enum xio_status result) +{ + struct xio_task *task; + + if (result != XIO_E_MSG_CANCELED && result != XIO_E_MSG_CANCEL_FAILED) { + xio_set_error(EINVAL); + ERROR_LOG("invalid status\n"); + return -1; + } + + task = container_of(req, struct xio_task, imsg); +#ifdef XIO_THREAD_SAFE_DEBUG + xio_ctx_debug_thread_lock(task->connection->ctx); 
+#endif + xio_connection_send_cancel_response(task->connection, &task->imsg, + task, result); + /* release the message */ + if (result == XIO_E_MSG_CANCELED) { + /* the rx task is returned back to pool */ + xio_tasks_pool_put(task); + } + +#ifdef XIO_THREAD_SAFE_DEBUG + xio_ctx_debug_thread_unlock(task->connection->ctx); +#endif + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_modify_connection */ +/*---------------------------------------------------------------------------*/ +int xio_modify_connection(struct xio_connection *connection, + struct xio_connection_attr *attr, + int attr_mask) +{ + /* + int retval = 0; + int nexus_modify = 0; + struct xio_nexus_attr nattr; + int nattr_mask = 0; + */ + + if (!connection || !attr) { + xio_set_error(EINVAL); + ERROR_LOG("invalid parameters\n"); + return -1; + } + if (test_bits(XIO_CONNECTION_ATTR_USER_CTX, &attr_mask)) + connection->cb_user_context = attr->user_context; + if (test_bits(XIO_CONNECTION_ATTR_DISCONNECT_TIMEOUT, &attr_mask)) { + if (attr->disconnect_timeout_secs) { + if (attr->disconnect_timeout_secs < XIO_MIN_CONNECTION_TIMEOUT) + connection->disconnect_timeout = XIO_MIN_CONNECTION_TIMEOUT; + else + connection->disconnect_timeout = attr->disconnect_timeout_secs * 1000; + } else { + connection->disconnect_timeout = XIO_DEF_CONNECTION_TIMEOUT; + } + } + /* + memset(&nattr, 0, sizeof(nattr)); + if (test_bits(XIO_CONNECTION_ATTR_TOS, &attr_mask)) { + nattr.tos = attr->tos; + set_bits(XIO_NEXUS_ATTR_TOS, &nattr_mask); + nexus_modify = 1; + } + + if (!nexus_modify) + goto exit; + + if (nexus_modify && !connection->nexus) { + xio_set_error(EINVAL); + return -1; + } + + retval = xio_nexus_modify(connection->nexus, + &nattr, nattr_mask); + +exit: + return retval; + */ + return 0; +} +EXPORT_SYMBOL(xio_modify_connection); + +/*---------------------------------------------------------------------------*/ +/* xio_query_connection */ +/*---------------------------------------------------------------------------*/ +int xio_query_connection(struct xio_connection *connection, + struct xio_connection_attr *attr, + int attr_mask) +{ + /* + int retval = 0; + int nexus_query = 0; + struct xio_nexus_attr nattr; + int nattr_mask = 0; + */ + + if (!connection || !attr) { + xio_set_error(EINVAL); + ERROR_LOG("invalid parameters\n"); + return -1; + } + if (attr_mask & XIO_CONNECTION_ATTR_USER_CTX) + attr->user_context = connection->cb_user_context; + + if (attr_mask & XIO_CONNECTION_ATTR_CTX) + attr->ctx = connection->ctx; + + if (test_bits(XIO_CONNECTION_ATTR_DISCONNECT_TIMEOUT, &attr_mask)) + attr->disconnect_timeout_secs = connection->disconnect_timeout/1000; + + if (attr_mask & XIO_CONNECTION_ATTR_PROTO) + attr->proto = (enum xio_proto) + xio_nexus_get_proto(connection->nexus); + + if (attr_mask & XIO_CONNECTION_ATTR_PEER_ADDR) + xio_nexus_get_peer_addr(connection->nexus, + &attr->peer_addr, + sizeof(attr->peer_addr)); + + if (attr_mask & XIO_CONNECTION_ATTR_LOCAL_ADDR) + xio_nexus_get_local_addr(connection->nexus, + &attr->local_addr, + sizeof(attr->local_addr)); + + /* + memset(&nattr, 0, sizeof(nattr)); + if (test_bits(XIO_CONNECTION_ATTR_TOS, &attr_mask)) { + set_bits(XIO_NEXUS_ATTR_TOS, &nattr_mask); + nexus_query = 1; + } + + if (!nexus_query) + goto exit; + + if (nexus_query && !connection->nexus) { + xio_set_error(EINVAL); + return -1; + } + + retval = xio_nexus_query(connection->nexus, + &nattr, nattr_mask); + if (retval != 0) + return -1; + + if (test_bits(XIO_CONNECTION_ATTR_TOS, 
&attr_mask)) + attr->tos = nattr.tos; +exit: + */ + + return 0; +} +EXPORT_SYMBOL(xio_query_connection); + +/*---------------------------------------------------------------------------*/ +/* xio_connection_send_hello_req */ +/*---------------------------------------------------------------------------*/ +int xio_connection_send_hello_req(struct xio_connection *connection) +{ + struct xio_msg *msg; + int retval; + + DEBUG_LOG("send hello request. session:%p, connection:%p\n", + connection->session, connection); + + msg = (struct xio_msg *)xio_context_msg_pool_get(connection->ctx); + + msg->type = (enum xio_msg_type)XIO_CONNECTION_HELLO_REQ; + msg->in.header.iov_len = 0; + msg->out.header.iov_len = 0; + msg->in.data_tbl.nents = 0; + msg->out.data_tbl.nents = 0; + + /* we don't want to send all queued messages yet - send directly */ + retval = xio_connection_send(connection, msg); + if (retval == -EAGAIN) + retval = 0; + + return retval; +} + +/*---------------------------------------------------------------------------*/ +/* xio_connection_send_hello_rsp */ +/*---------------------------------------------------------------------------*/ +int xio_connection_send_hello_rsp(struct xio_connection *connection, + struct xio_task *task) +{ + struct xio_msg *msg; + int retval; + + TRACE_LOG("send hello response. session:%p, connection:%p\n", + connection->session, connection); + + msg = (struct xio_msg *)xio_context_msg_pool_get(connection->ctx); + + msg->type = (enum xio_msg_type)XIO_CONNECTION_HELLO_RSP; + msg->request = &task->imsg; + msg->in.header.iov_len = 0; + msg->out.header.iov_len = 0; + msg->in.data_tbl.nents = 0; + msg->out.data_tbl.nents = 0; + + /* we don't want to send all queued messages yet - send directly */ + retval = xio_connection_send(connection, msg); + if (retval == -EAGAIN) + retval = 0; + + return retval; +} + +/*---------------------------------------------------------------------------*/ +/* xio_connection_release_hello */ +/*---------------------------------------------------------------------------*/ +static inline void xio_connection_release_hello( + struct xio_connection *connection, struct xio_msg *msg) +{ + xio_context_msg_pool_put(msg); +} + +/*---------------------------------------------------------------------------*/ +/* xio_connection_post_destroy */ +/*---------------------------------------------------------------------------*/ +static void xio_connection_post_destroy(struct kref *kref) +{ + int retval; + struct xio_session *session; + struct xio_context *ctx; + int destroy_session = 0; + int close_reason; + struct xio_connection *tmp_connection = NULL; + + struct xio_connection *connection = container_of(kref, + struct xio_connection, + kref); + session = connection->session; + ctx = connection->ctx; + close_reason = connection->close_reason; + + DEBUG_LOG("xio_connection_post_destroy. 
session:%p, connection:%p " \ + "conn:%p nr:%d\n", + session, connection, connection->nexus, + session->connections_nr); + + /* remove the connection from the session's connections list */ + if (connection->nexus) { + xio_connection_flush_tasks(connection); + /* for race condition between connection teardown and transport closed */ + if (connection->state != XIO_CONNECTION_STATE_DISCONNECTED) + xio_nexus_close(connection->nexus, &session->observer); + } + + /* leading connection */ + spin_lock(&session->connections_list_lock); + if (session->lead_connection && + session->lead_connection->nexus == connection->nexus) { + if ((connection->state == XIO_CONNECTION_STATE_INIT || + connection->state == XIO_CONNECTION_STATE_DISCONNECTED || + connection->state == XIO_CONNECTION_STATE_ERROR) && + session->connections_nr) { + session->connections_nr--; + list_del(&connection->connections_list_entry); + } + tmp_connection = session->lead_connection; + session->lead_connection = NULL; + TRACE_LOG("lead connection is closed\n"); + } else if (session->redir_connection && + session->redir_connection->nexus == connection->nexus) { + tmp_connection = session->redir_connection; + session->redir_connection = NULL; + TRACE_LOG("redirected connection is closed\n"); + } else { + session->connections_nr--; + list_del(&connection->connections_list_entry); + tmp_connection = connection; + } + destroy_session = ((session->connections_nr == 0) && + !session->lead_connection && + !session->redir_connection); + spin_unlock(&session->connections_list_lock); + retval = xio_connection_close(tmp_connection); + + if (retval != 0) { + ERROR_LOG("failed to close connection"); + return; + } + if (session->disable_teardown) + return; + + if (destroy_session) + xio_session_init_teardown(session, ctx, close_reason); +} + +/*---------------------------------------------------------------------------*/ +/* xio_connection_destroy */ +/*---------------------------------------------------------------------------*/ +int xio_connection_destroy(struct xio_connection *connection) +{ + int retval = 0; + int found; + struct xio_session *session; + + if (!connection) { + xio_set_error(EINVAL); + return -1; + } +#ifdef XIO_THREAD_SAFE_DEBUG + if (connection != connection->session->lead_connection) + /*not locking for inner accelio lead connection */ + xio_ctx_debug_thread_lock(connection->ctx); +#endif + found = xio_idr_lookup_uobj(usr_idr, connection); + if (found) { + if (!list_empty(&connection->io_tasks_list)) + WARN_LOG("tasks still pending. connection:%p\n", + connection); + xio_idr_remove_uobj(usr_idr, connection); + } else { + ERROR_LOG("connection not found:%p\n", connection); + xio_set_error(XIO_E_USER_OBJ_NOT_FOUND); +#ifdef XIO_THREAD_SAFE_DEBUG + if (connection != connection->session->lead_connection) + xio_ctx_debug_thread_unlock(connection->ctx); +#endif + return -1; + } + + session = connection->session; + + DEBUG_LOG("xio_connection_destroy. 
session:%p, connection:%p " \ + "nexus:%p nr:%d, state:%s\n", + session, connection, connection->nexus, + session->connections_nr, + xio_connection_state_str((enum xio_connection_state) + connection->state)); + + switch (connection->state) { + case XIO_CONNECTION_STATE_INIT: + case XIO_CONNECTION_STATE_CLOSED: + case XIO_CONNECTION_STATE_DISCONNECTED: + case XIO_CONNECTION_STATE_ERROR: + break; + default: + ERROR_LOG("connection %p : current_state:%s, " \ + "invalid destroy state\n", + connection, + xio_connection_state_str((enum xio_connection_state) + connection->state)); + xio_set_error(EPERM); +#ifdef XIO_THREAD_SAFE_DEBUG + if (connection != connection->session->lead_connection) + xio_ctx_debug_thread_unlock(connection->ctx); +#endif + return -1; + } + /* if there is any delayed timeout - stop it. + * users may call this function at any stage + **/ + xio_ctx_del_work(connection->ctx, &connection->hello_work); + xio_ctx_del_work(connection->ctx, &connection->fin_work); + xio_ctx_del_work(connection->ctx, &connection->teardown_work); + + xio_ctx_del_delayed_work(connection->ctx, + &connection->fin_delayed_work); + xio_ctx_del_delayed_work(connection->ctx, + &connection->fin_timeout_work); + xio_ctx_del_delayed_work(connection->ctx, + &connection->ka.timer); + + kref_put(&connection->kref, xio_connection_post_destroy); +#ifdef XIO_THREAD_SAFE_DEBUG + if (connection != connection->session->lead_connection) + xio_ctx_debug_thread_unlock(connection->ctx); +#endif + + return retval; +} +EXPORT_SYMBOL(xio_connection_destroy); + +/*---------------------------------------------------------------------------*/ +/* xio_connection_teardown_handler */ +/*---------------------------------------------------------------------------*/ +static void xio_connection_teardown_handler(void *connection_) +{ + struct xio_connection *connection = + (struct xio_connection *)connection_; + + xio_session_notify_connection_teardown(connection->session, + connection); +} + +/*---------------------------------------------------------------------------*/ +/* xio_connection_disconnected */ +/*---------------------------------------------------------------------------*/ +int xio_connection_disconnected(struct xio_connection *connection) +{ + int close = 0; + + DEBUG_LOG("connection disconnected: connection:%p\n", connection); + + /* stop all pending timers */ + xio_ctx_del_work(connection->ctx, &connection->hello_work); + + xio_ctx_del_delayed_work(connection->ctx, + &connection->fin_delayed_work); + + xio_ctx_del_delayed_work(connection->ctx, + &connection->fin_timeout_work); + + xio_ctx_del_work(connection->ctx, &connection->fin_work); + + if (!connection->disable_notify && !connection->disconnecting) { + xio_session_notify_connection_disconnected( + connection->session, connection, + (enum xio_status)connection->close_reason); + } else if (connection->state == XIO_CONNECTION_STATE_INIT && + connection->disconnecting) { + connection->disable_notify = 0; + xio_session_notify_connection_disconnected( + connection->session, connection, + (enum xio_status)connection->close_reason); + } + connection->state = XIO_CONNECTION_STATE_DISCONNECTED; + + /* flush all messages from in flight message queue to in queue */ + xio_connection_flush_msgs(connection); + + /* flush all messages back to user */ + xio_connection_notify_msgs_flush(connection); + + if (connection->nexus) { + if (connection->session->lead_connection && + connection->session->lead_connection->nexus == + connection->nexus) { + connection->session->lead_connection 
= NULL; + close = 1; + } + if (connection->session->redir_connection && + connection->session->redir_connection->nexus == + connection->nexus) { + connection->session->redir_connection = NULL; + close = 1; + } + /* free nexus and tasks pools */ + if (close) { + xio_connection_flush_tasks(connection); + xio_nexus_close(connection->nexus, + &connection->session->observer); + } + } + + if (!connection->disable_notify) + xio_ctx_add_work( + connection->ctx, + connection, + xio_connection_teardown_handler, + &connection->teardown_work); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_connection_refused */ +/*---------------------------------------------------------------------------*/ +int xio_connection_refused(struct xio_connection *connection) +{ + connection->close_reason = XIO_E_CONNECT_ERROR; + + xio_session_notify_connection_refused( + connection->session, connection, + XIO_E_CONNECT_ERROR); + + /* flush all messages from in flight message queue to in queue */ + xio_connection_flush_msgs(connection); + + /* flush all messages back to user */ + xio_connection_notify_msgs_flush(connection); + + connection->state = XIO_CONNECTION_STATE_ERROR; + + xio_ctx_add_work( + connection->ctx, + connection, + xio_connection_teardown_handler, + &connection->teardown_work); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_connection_error_event */ +/*---------------------------------------------------------------------------*/ +int xio_connection_error_event(struct xio_connection *connection, + enum xio_status reason) +{ + connection->close_reason = reason; + + xio_session_notify_connection_error(connection->session, connection, + reason); + + /* flush all messages from in flight message queue to in queue */ + xio_connection_flush_msgs(connection); + + /* flush all messages back to user */ + xio_connection_notify_msgs_flush(connection); + + connection->state = XIO_CONNECTION_STATE_ERROR; + + xio_ctx_add_work( + connection->ctx, + connection, + xio_connection_teardown_handler, + &connection->teardown_work); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_on_fin_req_send_comp */ +/*---------------------------------------------------------------------------*/ +int xio_on_fin_req_send_comp(struct xio_connection *connection, + struct xio_task *task) +{ + DEBUG_LOG("got fin request send completion. 
session:%p, " \ + "connection:%p\n", + connection->session, connection); + + kref_put(&connection->kref, xio_connection_post_destroy); + + return 0; +} + +static void xio_close_time_wait(void *data) +{ + struct xio_connection *connection = (struct xio_connection *)data; + + DEBUG_LOG("connection %p state change: current_state:%s, " \ + "next_state:%s\n", + connection, + xio_connection_state_str((enum xio_connection_state) + connection->state), + xio_connection_state_str(XIO_CONNECTION_STATE_CLOSED)); + + if (connection->session->state == XIO_SESSION_STATE_REJECTED) + connection->close_reason = XIO_E_SESSION_REJECTED; + else if (connection->ka.timedout) + connection->close_reason = XIO_E_TIMEOUT; + else + connection->close_reason = XIO_E_SESSION_CLOSED; + + /* flush all messages from in flight message queue to in queue */ + xio_connection_flush_msgs(connection); + + /* flush all messages back to user */ + xio_connection_notify_msgs_flush(connection); + + connection->state = XIO_CONNECTION_STATE_CLOSED; + + if (!connection->disable_notify) + xio_ctx_add_work( + connection->ctx, + connection, + xio_connection_teardown_handler, + &connection->teardown_work); + else + xio_connection_destroy(connection); + + kref_put(&connection->kref, xio_connection_post_destroy); +} + +static void xio_handle_last_ack(void *data) +{ + struct xio_connection *connection = (struct xio_connection *)data; + + DEBUG_LOG("connection %p state change: current_state:%s, " \ + "next_state:%s\n", + connection, + xio_connection_state_str((enum xio_connection_state) + connection->state), + xio_connection_state_str(XIO_CONNECTION_STATE_CLOSED)); + + connection->close_reason = XIO_E_SESSION_DISCONNECTED; + + /* flush all messages from in flight message + * queue to in queue */ + xio_connection_flush_msgs(connection); + + /* flush all messages back to user */ + xio_connection_notify_msgs_flush(connection); + + connection->state = XIO_CONNECTION_STATE_CLOSED; + + if (!connection->disable_notify) + xio_ctx_add_work( + connection->ctx, + connection, + xio_connection_teardown_handler, + &connection->teardown_work); + else + xio_connection_destroy(connection); + + /* xio_connection_destroy(connection); */ +} + +/*---------------------------------------------------------------------------*/ +/* xio_on_fin_ack_recv */ +/*---------------------------------------------------------------------------*/ +int xio_on_fin_ack_recv(struct xio_connection *connection, + struct xio_task *task) +{ + struct xio_transition *transition; + int retval = 0; + + DEBUG_LOG("got fin ack. session:%p, connection:%p\n", + connection->session, connection); + + if (connection->fin_req_timeout) + return 0; + + connection->fin_req_timeout++; + + /* cancel the timer */ + xio_ctx_del_delayed_work(connection->ctx, + &connection->fin_timeout_work); + + xio_connection_release_fin(connection, task->sender_task->omsg); + + /* recycle the task */ + xio_tasks_pool_put(task->sender_task); + task->sender_task = NULL; + xio_tasks_pool_put(task); + + transition = xio_connection_next_transit((enum xio_connection_state) + connection->state, + 1 /*ack*/); + + if (!transition->valid) { + ERROR_LOG("invalid transition. 
session:%p, connection:%p, " \ + "state:%s\n", + connection->session, connection, + xio_connection_state_str((enum xio_connection_state) + connection->state)); + retval = -1; + goto cleanup; + } + if (connection->state == XIO_CONNECTION_STATE_LAST_ACK) { + xio_handle_last_ack(connection); + goto cleanup; + } + + DEBUG_LOG("connection %p state change: current_state:%s, " \ + "next_state:%s\n", + connection, + xio_connection_state_str((enum xio_connection_state) + connection->state), + xio_connection_state_str(transition->next_state)); + + connection->state = transition->next_state; + + if (connection->state == XIO_CONNECTION_STATE_TIME_WAIT) { + int retval = xio_ctx_add_delayed_work( + connection->ctx, + 2, connection, + xio_close_time_wait, + &connection->fin_delayed_work); + if (retval != 0) { + ERROR_LOG("xio_ctx_timer_add failed.\n"); + goto cleanup; + } + } + +cleanup: + kref_put(&connection->kref, xio_connection_post_destroy); + + return retval; +} + +/*---------------------------------------------------------------------------*/ +/* xio_on_fin_req_recv */ +/*---------------------------------------------------------------------------*/ +int xio_on_fin_req_recv(struct xio_connection *connection, + struct xio_task *task) +{ + struct xio_transition *transition; + + DEBUG_LOG("fin request received. session:%p, connection:%p\n", + connection->session, connection); + + transition = xio_connection_next_transit((enum xio_connection_state) + connection->state, + 0 /*fin*/); + + if (!transition->valid) { + ERROR_LOG("invalid transition. session:%p, connection:%p, " \ + "state:%s\n", + connection->session, connection, + xio_connection_state_str((enum xio_connection_state) + connection->state)); + return -1; + } + /* flush all pending requests */ + xio_connection_notify_req_msgs_flush(connection, XIO_E_MSG_FLUSHED); + /*fin req was flushed. need to send it again */ + if (connection->fin_request_flushed) + xio_send_fin_req(connection); + + if (transition->send_flags & SEND_ACK) + xio_send_fin_ack(connection, task); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_on_fin_ack_send_comp */ +/*---------------------------------------------------------------------------*/ +int xio_on_fin_ack_send_comp(struct xio_connection *connection, + struct xio_task *task) +{ + struct xio_transition *transition; + int retval = 0; + + DEBUG_LOG("fin ack send completion received. " \ + "session:%p, connection:%p\n", + connection->session, connection); + + xio_connection_release_fin(connection, task->omsg); + task->sender_task = NULL; + xio_tasks_pool_put(task); + + transition = xio_connection_next_transit((enum xio_connection_state) + connection->state, + 0 /*fin*/); + if (!transition->valid) { + ERROR_LOG("invalid transition. 
session:%p, connection:%p, " \ + "state:%s\n", + connection->session, connection, + xio_connection_state_str((enum xio_connection_state) + connection->state)); + return -1; + } + + DEBUG_LOG("connection %p state change: current_state:%s, " \ + "next_state:%s\n", + connection, + xio_connection_state_str((enum xio_connection_state) + connection->state), + xio_connection_state_str(transition->next_state)); + + connection->state = transition->next_state; + + /* transition from online to close_wait - notify the application */ + if (connection->state == XIO_CONNECTION_STATE_CLOSE_WAIT) { + connection->disconnecting = 1; + xio_send_fin_req(connection); + + connection->close_reason = XIO_E_SESSION_DISCONNECTED; + if (!connection->disable_notify) + xio_session_notify_connection_closed( + connection->session, + connection); + + DEBUG_LOG("connection %p state change: current_state:%s, " \ + "next_state:%s\n", + connection, + xio_connection_state_str((enum xio_connection_state) + connection->state), + xio_connection_state_str( + XIO_CONNECTION_STATE_LAST_ACK)); + connection->state = XIO_CONNECTION_STATE_LAST_ACK; + } + + if (connection->state == XIO_CONNECTION_STATE_TIME_WAIT) { + retval = xio_ctx_add_delayed_work( + connection->ctx, + 2, connection, + xio_close_time_wait, + &connection->fin_delayed_work); + if (retval != 0) { + ERROR_LOG("xio_ctx_timer_add failed.\n"); + goto cleanup; + } + } + +cleanup: + kref_put(&connection->kref, xio_connection_post_destroy); + + return retval; +} + +/*---------------------------------------------------------------------------*/ +/* xio_xmit_messages */ +/*---------------------------------------------------------------------------*/ +static inline void xio_xmit_messages(void *connection) +{ + xio_connection_xmit_msgs((struct xio_connection *)connection); +} + +/*---------------------------------------------------------------------------*/ +/* xio_on_connection_hello_rsp_recv */ +/*---------------------------------------------------------------------------*/ +int xio_on_connection_hello_rsp_recv(struct xio_connection *connection, + struct xio_task *task) +{ + struct xio_session *session = connection->session; + + DEBUG_LOG("recv hello response. session:%p, connection:%p\n", + session, connection); + + if (task) { + DEBUG_LOG("got hello response. 
session:%p, connection:%p\n", + session, connection); + + xio_connection_release_hello(connection, + task->sender_task->omsg); + /* recycle the task */ + xio_tasks_pool_put(task->sender_task); + task->sender_task = NULL; + xio_tasks_pool_put(task); + } + connection->peer_credits_msgs = session->peer_rcv_queue_depth_msgs; + connection->credits_msgs = 0; + connection->peer_credits_bytes = session->peer_rcv_queue_depth_bytes; + connection->credits_bytes = 0; + + /* from now - no need to disable */ + connection->disable_notify = 0; + + /* delayed disconnect request should be done now */ + if (connection->state == XIO_CONNECTION_STATE_INIT && + connection->disconnecting) { + connection->disconnecting = 0; + xio_connection_set_state(connection, + XIO_CONNECTION_STATE_ONLINE); + xio_disconnect(connection); + return 0; + } + + /* set the new connection to ESTABLISHED */ + xio_connection_set_state(connection, + XIO_CONNECTION_STATE_ESTABLISHED); + xio_session_notify_connection_established(session, connection); + + if (connection->state == XIO_CONNECTION_STATE_ESTABLISHED) { + /* set the new connection to online */ + xio_connection_set_state( + connection, + XIO_CONNECTION_STATE_ONLINE); + + xio_connection_keepalive_start(connection); + + xio_ctx_add_work( + connection->ctx, + connection, + xio_xmit_messages, + &connection->hello_work); + } + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_on_connection_hello_req_recv */ +/*---------------------------------------------------------------------------*/ +int xio_on_connection_hello_req_recv(struct xio_connection *connection, + struct xio_task *task) +{ + /* delayed disconnect request should be done now */ + DEBUG_LOG("recv hello request. session:%p, connection:%p\n", + connection->session, connection); + + /* from now - no need to disable */ + connection->disable_notify = 0; + + /* temporarily set the state to init to delay disconnection */ + connection->state = XIO_CONNECTION_STATE_INIT; + xio_session_notify_new_connection(task->session, connection); + + if (connection->disconnecting == 0) { + connection->session->state = XIO_SESSION_STATE_ONLINE; + connection->session->disable_teardown = 0; + connection->peer_credits_msgs = + connection->session->peer_rcv_queue_depth_msgs; + connection->credits_msgs = 0; + connection->peer_credits_bytes = + connection->session->peer_rcv_queue_depth_bytes; + connection->credits_bytes = 0; + + TRACE_LOG("session state is now ONLINE. 
session:%p\n", + connection->session); + + xio_connection_set_state(connection, + XIO_CONNECTION_STATE_ONLINE); + + xio_connection_keepalive_start(connection); + } + xio_connection_send_hello_rsp(connection, task); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_on_connection_hello_rsp_send_comp */ +/*---------------------------------------------------------------------------*/ +int xio_on_connection_hello_rsp_send_comp(struct xio_connection *connection, + struct xio_task *task) +{ + xio_connection_release_hello(connection, task->omsg); + xio_tasks_pool_put(task); + + /* deferred disconnect should take place now */ + if (connection->disconnecting) { + connection->disconnecting = 0; + xio_connection_set_state(connection, + XIO_CONNECTION_STATE_ONLINE); + xio_disconnect(connection); + return 0; + } + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_send_credits_ack */ +/*---------------------------------------------------------------------------*/ +int xio_send_credits_ack(struct xio_connection *connection) +{ + struct xio_msg *msg; + + msg = (struct xio_msg *)xio_context_msg_pool_get(connection->ctx); + + msg->type = (enum xio_msg_type)XIO_ACK_REQ; + msg->in.header.iov_len = 0; + msg->out.header.iov_len = 0; + msg->in.data_tbl.nents = 0; + msg->out.data_tbl.nents = 0; + + /* insert to the head of the queue */ + xio_msg_list_insert_tail(&connection->reqs_msgq, msg, pdata); + + DEBUG_LOG("send credits_msgs ack. session:%p, connection:%p\n", + connection->session, connection); + + /* do not xmit until connection is assigned */ + return xio_connection_xmit(connection); +} + +/*---------------------------------------------------------------------------*/ +/* xio_on_credits_ack_send_comp */ +/*---------------------------------------------------------------------------*/ +int xio_on_credits_ack_send_comp(struct xio_connection *connection, + struct xio_task *task) +{ + xio_connection_release_hello(connection, task->omsg); + xio_tasks_pool_put(task); + + return xio_connection_xmit(connection); +} + +/*---------------------------------------------------------------------------*/ +/* xio_managed_rkey_unwrap */ +/*---------------------------------------------------------------------------*/ +uint32_t xio_managed_rkey_unwrap( + const struct xio_managed_rkey *managed_rkey) +{ + return managed_rkey->rkey; +} +EXPORT_SYMBOL(xio_managed_rkey_unwrap); + +/*---------------------------------------------------------------------------*/ +/* xio_register_remote_rkey */ +/*---------------------------------------------------------------------------*/ +struct xio_managed_rkey *xio_register_remote_rkey( + struct xio_connection *connection, uint32_t raw_rkey) +{ + struct xio_managed_rkey *managed_rkey = (struct xio_managed_rkey *) + kcalloc(1, sizeof(*managed_rkey), GFP_KERNEL); + if (!managed_rkey) + return NULL; + + managed_rkey->rkey = raw_rkey; + list_add(&managed_rkey->list_entry, &connection->managed_rkey_list); + return managed_rkey; +} +EXPORT_SYMBOL(xio_register_remote_rkey); + +/*---------------------------------------------------------------------------*/ +/* xio_unregister_remote_key */ +/*---------------------------------------------------------------------------*/ +void xio_unregister_remote_key(struct xio_managed_rkey *managed_rkey) +{ + list_del(&managed_rkey->list_entry); + kfree(managed_rkey); +} +EXPORT_SYMBOL(xio_unregister_remote_key); + 
+/*---------------------------------------------------------------------------*/ +/* xio_req_to_transport_base */ +/*---------------------------------------------------------------------------*/ +const struct xio_transport_base *xio_req_to_transport_base( + const struct xio_msg *req) +{ + struct xio_task *task = container_of(req, struct xio_task, imsg); + + return task->connection->nexus->transport_hndl; +} + +/*---------------------------------------------------------------------------*/ +/* xio_connection_ioctl */ +/*---------------------------------------------------------------------------*/ +int xio_connection_ioctl(struct xio_connection *connection, int con_optname, + void *optval, int *optlen) +{ + if (!connection) { + xio_set_error(EINVAL); + return -1; + } + switch (con_optname) { + case XIO_CONNECTION_FIONWRITE_BYTES: + *optlen = sizeof(uint64_t); + *((uint64_t *)optval) = + connection->session->snd_queue_depth_bytes - + connection->tx_bytes; + return 0; + case XIO_CONNECTION_FIONWRITE_MSGS: + *optlen = sizeof(int); + *((int *)optval) = + connection->session->snd_queue_depth_msgs - + connection->tx_queued_msgs; + return 0; + case XIO_CONNECTION_LEADING_CONN: + *optlen = sizeof(int); + if (connection->session->connection_srv_first == connection) + *((int *)optval) = 1; + else + *((int *)optval) = 0; + return 0; + default: + break; + } + xio_set_error(XIO_E_NOT_SUPPORTED); + return -1; +} +EXPORT_SYMBOL(xio_connection_ioctl); + +/*---------------------------------------------------------------------------*/ +/* xio_connection_send_ka_req */ +/*---------------------------------------------------------------------------*/ +int xio_connection_send_ka_req(struct xio_connection *connection) +{ + struct xio_msg *msg; + + if (connection->state != XIO_CONNECTION_STATE_ONLINE || + connection->ka.req_sent) + return 0; + + TRACE_LOG("send keepalive request. session:%p, connection:%p\n", + connection->session, connection); + + msg = (struct xio_msg *)xio_context_msg_pool_get(connection->ctx); + + msg->type = (enum xio_msg_type)XIO_CONNECTION_KA_REQ; + msg->in.header.iov_len = 0; + msg->out.header.iov_len = 0; + msg->in.data_tbl.nents = 0; + msg->out.data_tbl.nents = 0; + + connection->ka.req_sent = 1; + + /* insert to the tail of the queue */ + xio_msg_list_insert_head(&connection->reqs_msgq, msg, pdata); + + /* do not xmit until connection is assigned */ + return xio_connection_xmit(connection); +} + +/*---------------------------------------------------------------------------*/ +/* xio_connection_send_ka_rsp */ +/*---------------------------------------------------------------------------*/ +int xio_connection_send_ka_rsp(struct xio_connection *connection, + struct xio_task *task) +{ + struct xio_msg *msg; + + TRACE_LOG("send keepalive response. 
session:%p, connection:%p\n", + connection->session, connection); + + msg = (struct xio_msg *)xio_context_msg_pool_get(connection->ctx); + + msg->type = (enum xio_msg_type)XIO_CONNECTION_KA_RSP; + msg->request = &task->imsg; + msg->in.header.iov_len = 0; + msg->out.header.iov_len = 0; + msg->in.data_tbl.nents = 0; + msg->out.data_tbl.nents = 0; + + /* insert to the tail of the queue */ + xio_msg_list_insert_head(&connection->rsps_msgq, msg, pdata); + + /* do not xmit until connection is assigned */ + return xio_connection_xmit(connection); +} + +/*---------------------------------------------------------------------------*/ +/* xio_on_connection_ka_rsp_recv */ +/*---------------------------------------------------------------------------*/ +int xio_on_connection_ka_rsp_recv(struct xio_connection *connection, + struct xio_task *task) +{ + int retval; + + TRACE_LOG("recv keepalive response. session:%p, connection:%p\n", + connection->session, connection); + + xio_ctx_del_delayed_work(connection->ctx, + &connection->ka.timer); + + connection->ka.probes = 0; + connection->ka.req_sent = 0; + connection->ka.timedout = 0; + + retval = xio_ctx_add_delayed_work( + connection->ctx, + 1000 * g_options.ka.time, connection, + xio_connection_keepalive_time, + &connection->ka.timer); + if (retval != 0) { + ERROR_LOG("periodic keepalive failed - abort\n"); + return -1; + } + + xio_context_msg_pool_put(task->sender_task->omsg); + /* recycle the task */ + xio_tasks_pool_put(task->sender_task); + task->sender_task = NULL; + xio_tasks_pool_put(task); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_on_connection_ka_req_recv */ +/*---------------------------------------------------------------------------*/ +int xio_on_connection_ka_req_recv(struct xio_connection *connection, + struct xio_task *task) +{ + /* delayed disconnect request should be done now */ + TRACE_LOG("recv keepalive request. 
session:%p, connection:%p\n", + connection->session, connection); + + connection->ka.timedout = 0; + + /* optimization: reschedule local timer if request received */ + if (g_options.enable_keepalive && !connection->ka.probes && + !connection->ka.req_sent) { + int retval; + + xio_ctx_del_delayed_work(connection->ctx, + &connection->ka.timer); + retval = xio_ctx_add_delayed_work( + connection->ctx, + 1000 * g_options.ka.time, connection, + xio_connection_keepalive_start, + &connection->ka.timer); + if (retval != 0) { + ERROR_LOG("periodic keepalive failed - abort\n"); + return -1; + } + } + + xio_connection_send_ka_rsp(connection, task); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_on_connection_ka_rsp_send_comp */ +/*---------------------------------------------------------------------------*/ +int xio_on_connection_ka_rsp_send_comp(struct xio_connection *connection, + struct xio_task *task) +{ + xio_context_msg_pool_put(task->omsg); + xio_tasks_pool_put(task); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_connection_keepalive_intvl */ +/*---------------------------------------------------------------------------*/ +void xio_connection_keepalive_intvl(void *_connection) +{ + struct xio_connection *connection = + (struct xio_connection *)_connection; + int retval; + + xio_ctx_del_delayed_work(connection->ctx, + &connection->ka.timer); + + connection->ka.timedout = 1; + + if (++connection->ka.probes == g_options.ka.probes) { + ERROR_LOG("connection keepalive timeout. connection:%p probes:[%d]\n", + connection, connection->ka.probes); + connection->ka.probes = 0; + connection->ka.req_sent = 0; + + retval = xio_ctx_add_delayed_work( + connection->ctx, + 1000 * g_options.ka.time, connection, + xio_connection_keepalive_start, + &connection->ka.timer); + if (retval != 0) { + ERROR_LOG("periodic keepalive failed - abort\n"); + return; + } + /* notify the application of connection error */ + xio_session_notify_connection_error( + connection->session, connection, XIO_E_TIMEOUT); + if ((!connection->disconnecting) && (!g_options.reconnect)) + xio_disconnect(connection); + return; + } + WARN_LOG("connection keepalive timeout. 
connection:%p probes:[%d]\n", + connection, connection->ka.probes); + retval = xio_ctx_add_delayed_work( + connection->ctx, + 1000 * g_options.ka.intvl, connection, + xio_connection_keepalive_intvl, + &connection->ka.timer); + if (retval != 0) { + ERROR_LOG("keepalive timeout failed - abort\n"); + return; + } +} + +/*---------------------------------------------------------------------------*/ +/* xio_connection_keepalive_time */ +/*---------------------------------------------------------------------------*/ +static void xio_connection_keepalive_time(void *_connection) +{ + struct xio_connection *connection = + (struct xio_connection *)_connection; + int retval; + + xio_ctx_del_delayed_work(connection->ctx, + &connection->ka.timer); + + retval = xio_ctx_add_delayed_work( + connection->ctx, + 1000 * g_options.ka.intvl, connection, + xio_connection_keepalive_intvl, + &connection->ka.timer); + if (retval != 0) { + ERROR_LOG("keepalive timeout failed - abort\n"); + return; + } + xio_connection_send_ka_req(connection); +} + +/*---------------------------------------------------------------------------*/ +/* xio_connection_keepalive_start */ +/*---------------------------------------------------------------------------*/ +void xio_connection_keepalive_start(void *_connection) +{ + struct xio_connection *connection = + (struct xio_connection *)_connection; + int retval; + + xio_ctx_del_delayed_work(connection->ctx, + &connection->ka.timer); + + if (!g_options.enable_keepalive) + return; + + retval = xio_ctx_add_delayed_work( + connection->ctx, + 1000 * g_options.ka.time, connection, + xio_connection_keepalive_time, + &connection->ka.timer); + if (retval != 0) { + ERROR_LOG("keepalive timeout failed - abort\n"); + return; + } +} + diff --git a/open_src/xio/src/common/xio_connection.h b/open_src/xio/src/common/xio_connection.h new file mode 100644 index 0000000..58cfda4 --- /dev/null +++ b/open_src/xio/src/common/xio_connection.h @@ -0,0 +1,285 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef XIO_CONNECTION_H +#define XIO_CONNECTION_H + +enum xio_connection_state { + XIO_CONNECTION_STATE_INIT, + XIO_CONNECTION_STATE_ESTABLISHED, + XIO_CONNECTION_STATE_ONLINE, + XIO_CONNECTION_STATE_FIN_WAIT_1, /* tcp state machine */ + XIO_CONNECTION_STATE_FIN_WAIT_2, /* tcp state machine */ + XIO_CONNECTION_STATE_CLOSING, /* tcp state machine */ + XIO_CONNECTION_STATE_TIME_WAIT, /* tcp state machine */ + XIO_CONNECTION_STATE_CLOSE_WAIT, /* tcp state machine */ + XIO_CONNECTION_STATE_LAST_ACK, /* tcp state machine */ + XIO_CONNECTION_STATE_CLOSED, /* user close */ + XIO_CONNECTION_STATE_DISCONNECTED, /* disconnect */ + XIO_CONNECTION_STATE_ERROR, /* error */ + XIO_CONNECTION_STATE_INVALID +}; + +#define SEND_ACK 0x0001 +#define SEND_FIN 0x0002 + +#define XIO_MIN_CONNECTION_TIMEOUT 1000 +#define XIO_DEF_CONNECTION_TIMEOUT 300000 + +struct xio_transition { + int valid; + enum xio_connection_state next_state; + int send_flags; +}; + +struct xio_ka { + xio_delayed_work_handle_t timer; + int probes; + int req_sent; + int timedout; + int pad; +}; + +struct xio_connection { + struct xio_ka ka; + struct xio_nexus *nexus; + struct xio_session *session; + struct xio_context *ctx; /* connection context */ + struct xio_session_ops ses_ops; + + /* server's session may have multiple connections each has + * private data assigned by bind + */ + uint16_t enable_flow_control; + uint16_t req_sn; + uint16_t req_exp_sn; + uint16_t req_ack_sn; + + uint16_t rsp_sn; + uint16_t rsp_exp_sn; + uint16_t rsp_ack_sn; + uint16_t credits_msgs; + + uint16_t peer_credits_msgs; + uint16_t rx_queue_watermark_msgs; + uint16_t conn_idx; + uint16_t state; + + uint16_t fin_req_timeout; + uint16_t disable_notify; + uint16_t disconnecting; + uint16_t restarted; + + uint64_t latest_delivered; + + uint16_t is_flushed; + uint16_t send_req_toggle; + uint16_t cd_bit; /*close disconnect bit */ + uint16_t fin_request_flushed; + + uint32_t close_reason; + int32_t tx_queued_msgs; + struct kref kref; + uint32_t disconnect_timeout; + + struct xio_msg_list reqs_msgq; + struct xio_msg_list rsps_msgq; + struct xio_msg_list in_flight_reqs_msgq; + struct xio_msg_list in_flight_rsps_msgq; + + xio_work_handle_t hello_work; + xio_work_handle_t fin_work; + xio_delayed_work_handle_t fin_delayed_work; + xio_delayed_work_handle_t fin_timeout_work; + + struct list_head managed_rkey_list; + struct list_head io_tasks_list; + struct list_head post_io_tasks_list; + struct list_head pre_send_list; + struct list_head connections_list_entry; + struct list_head ctx_list_entry; + void *cb_user_context; + + size_t tx_bytes; + uint64_t credits_bytes; + uint64_t peer_credits_bytes; + uint64_t rx_queue_watermark_bytes; + + uint32_t nexus_attr_mask; + struct xio_nexus_init_attr nexus_attr; + + xio_work_handle_t teardown_work; + +#ifdef XIO_SESSION_DEBUG + uint64_t peer_connection; + uint64_t peer_session; +#endif +}; + +struct xio_connection *xio_connection_create( + struct xio_session *session, + struct xio_context *ctx, int 
conn_idx, + void *cb_user_context); + +int xio_connection_close(struct xio_connection *connection); + +static inline void xio_connection_set( + struct xio_connection *connection, + struct xio_nexus *nexus) +{ + connection->nexus = nexus; +} + +static inline void xio_connection_set_ops( + struct xio_connection *connection, + struct xio_session_ops *ses_ops) +{ + memcpy(&connection->ses_ops, ses_ops, sizeof(*ses_ops)); +} + +int xio_connection_send(struct xio_connection *connection, + struct xio_msg *msg); + +int xio_connection_xmit_msgs(struct xio_connection *connection); + +void xio_connection_queue_io_task(struct xio_connection *connection, + struct xio_task *task); + +struct xio_task *xio_connection_find_io_task(struct xio_connection *connection, + uint64_t msg_sn); + +static inline void xio_connection_set_state( + struct xio_connection *connection, + enum xio_connection_state state) +{ + connection->state = state; +} + +struct xio_transition *xio_connection_next_transit( + enum xio_connection_state state, + int fin_ack); + +int xio_connection_send_read_receipt(struct xio_connection *connection, + struct xio_msg *msg); + +int xio_connection_release_read_receipt(struct xio_connection *connection, + struct xio_msg *msg); + +void xio_release_response_task(struct xio_task *task); + +int xio_send_fin_ack(struct xio_connection *connection, + struct xio_task *task); + +int xio_disconnect_initial_connection( + struct xio_connection *connection); + +int xio_connection_disconnected(struct xio_connection *connection); + +int xio_connection_refused(struct xio_connection *connection); + +int xio_connection_error_event(struct xio_connection *connection, + enum xio_status reason); + +int xio_connection_remove_in_flight(struct xio_connection *connection, + struct xio_msg *msg); + +int xio_connection_remove_msg_from_queue(struct xio_connection *connection, + struct xio_msg *msg); + +int xio_connection_send_cancel_response( + struct xio_connection *connection, + struct xio_msg *msg, + struct xio_task *task, + enum xio_status result); + +int xio_connection_send_hello_req(struct xio_connection *connection); + +int xio_connection_send_hello_rsp(struct xio_connection *connection, + struct xio_task *task); + +char *xio_connection_state_str(enum xio_connection_state state); + +int xio_connection_reconnect(struct xio_connection *connection); + +int xio_connection_restart(struct xio_connection *connection); + +int xio_on_fin_req_send_comp(struct xio_connection *connection, + struct xio_task *task); + +int xio_on_fin_ack_send_comp(struct xio_connection *connection, + struct xio_task *task); + +int xio_on_fin_req_recv(struct xio_connection *connection, + struct xio_task *task); + +int xio_on_fin_ack_recv(struct xio_connection *connection, + struct xio_task *task); + +int xio_on_connection_hello_req_recv(struct xio_connection *connection, + struct xio_task *task); + +int xio_on_connection_hello_rsp_send_comp(struct xio_connection *connection, + struct xio_task *task); +int xio_on_connection_hello_rsp_recv(struct xio_connection *connection, + struct xio_task *task); + +int xio_send_credits_ack(struct xio_connection *connection); + +int xio_on_credits_ack_send_comp(struct xio_connection *connection, + struct xio_task *task); + +int xio_on_credits_ack_recv(struct xio_connection *connection, + struct xio_task *task); + +const struct xio_transport_base *xio_req_to_transport_base( + const struct xio_msg *req); + +int xio_connection_ioctl(struct xio_connection *connection, int con_optname, + void *optval, int *optlen); + 
+int xio_on_connection_ka_req_recv(struct xio_connection *connection, + struct xio_task *task); + +int xio_on_connection_ka_rsp_send_comp(struct xio_connection *connection, + struct xio_task *task); + +int xio_on_connection_ka_rsp_recv(struct xio_connection *connection, + struct xio_task *task); + +void xio_connection_keepalive_start(void *_connection); + +#endif /*XIO_CONNECTION_H */ diff --git a/open_src/xio/src/common/xio_context.h b/open_src/xio/src/common/xio_context.h new file mode 100644 index 0000000..4a42668 --- /dev/null +++ b/open_src/xio/src/common/xio_context.h @@ -0,0 +1,316 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ +#ifndef XIO_CONTEXT_H +#define XIO_CONTEXT_H + +#define xio_ctx_work_t xio_work_handle_t +#define xio_ctx_delayed_work_t xio_delayed_work_handle_t + +#define XIO_PROTO_LAST 2 /* from enum xio_proto */ + +#ifdef XIO_THREAD_SAFE_DEBUG +#define BACKTRACE_BUFFER_SIZE 2048 +#endif + +/*---------------------------------------------------------------------------*/ +/* enum */ +/*---------------------------------------------------------------------------*/ +enum xio_context_event { + XIO_CONTEXT_EVENT_CLOSE, + XIO_CONTEXT_EVENT_POST_CLOSE +}; + +enum xio_context_pool_class { + XIO_CONTEXT_POOL_CLASS_INITIAL, + XIO_CONTEXT_POOL_CLASS_PRIMARY +}; + +enum xio_counters { + XIO_STAT_TX_MSG, + XIO_STAT_RX_MSG, + XIO_STAT_TX_BYTES, + XIO_STAT_RX_BYTES, + XIO_STAT_DELAY, + XIO_STAT_APPDELAY, + /* user can register 10 more messages */ + XIO_STAT_USER_FIRST, + XIO_STAT_LAST = 16 +}; + +typedef int (*poll_completions_fn_t)(void *, int); + +/*---------------------------------------------------------------------------*/ +/* structs */ +/*---------------------------------------------------------------------------*/ +struct xio_statistics { + uint64_t hertz; + uint64_t counter[XIO_STAT_LAST]; + char *name[XIO_STAT_LAST]; +}; + +struct xio_context { + void *ev_loop; + void *mempool; + /* pools per transport */ + struct xio_tasks_pool *primary_tasks_pool[XIO_PROTO_LAST]; + struct xio_tasks_pool_ops *primary_pool_ops[XIO_PROTO_LAST]; + + struct xio_tasks_pool *initial_tasks_pool[XIO_PROTO_LAST]; + struct xio_tasks_pool_ops *initial_pool_ops[XIO_PROTO_LAST]; + + /* pool per connection */ + struct xio_objpool *msg_pool; + + void *poll_completions_ctx; + poll_completions_fn_t poll_completions_fn; + + int cpuid; + int nodeid; + int polling_timeout; + unsigned int flags; + uint64_t worker; + + int32_t run_private; + + uint32_t is_running:1; + uint32_t defered_destroy:1; + uint32_t prealloc_xio_inline_bufs:1; + uint32_t register_internal_mempool:1; + uint32_t resereved:28; + + struct xio_statistics stats; + void *user_context; + struct xio_workqueue *workqueue; + struct list_head ctx_list; /* per context storage */ + + /* list of sessions using this connection */ + struct xio_observable observable; + void *netlink_sock; + xio_work_handle_t destroy_ctx_work; + spinlock_t ctx_list_lock; + + int max_conns_per_ctx; + int rq_depth; + int pad; +#ifdef XIO_THREAD_SAFE_DEBUG + int nptrs; + int pad1; + pthread_mutex_t dbg_thread_mutex; + void *buffer[BACKTRACE_BUFFER_SIZE]; +#endif + +}; + +/*---------------------------------------------------------------------------*/ +/* xio_context_reg_observer */ +/*---------------------------------------------------------------------------*/ +int xio_context_reg_observer(struct xio_context *context, + struct xio_observer *observer); + +/*---------------------------------------------------------------------------*/ +/* xio_context_unreg_observer */ +/*---------------------------------------------------------------------------*/ +void xio_context_unreg_observer(struct xio_context *conn, + struct xio_observer *observer); + +/*---------------------------------------------------------------------------*/ +/* xio_add_counter */ +/*---------------------------------------------------------------------------*/ +int xio_add_counter(struct xio_context *ctx, char *name); + +/*---------------------------------------------------------------------------*/ +/* xio_del_counter */ +/*---------------------------------------------------------------------------*/ +int xio_del_counter(struct xio_context 
*ctx, int counter); + +/*---------------------------------------------------------------------------*/ +/* xio_ctx_stat_add */ +/*---------------------------------------------------------------------------*/ +static inline void xio_ctx_stat_add(struct xio_context *ctx, + int counter, uint64_t val) +{ + ctx->stats.counter[counter] += val; +} + +/*---------------------------------------------------------------------------*/ +/* xio_ctx_stat_inc */ +/*---------------------------------------------------------------------------*/ +static inline void xio_ctx_stat_inc(struct xio_context *ctx, int counter) +{ + ctx->stats.counter[counter]++; +} + +/*---------------------------------------------------------------------------*/ +/* xio_stat_add */ +/*---------------------------------------------------------------------------*/ +static inline void xio_stat_add(struct xio_statistics *stats, + int counter, uint64_t val) +{ + stats->counter[counter] += val; +} + +/*---------------------------------------------------------------------------*/ +/* xio_stat_inc */ +/*---------------------------------------------------------------------------*/ +static inline void xio_stat_inc(struct xio_statistics *stats, int counter) +{ + stats->counter[counter]++; +} + +/*---------------------------------------------------------------------------*/ +/* xio_ctx_add_delayed_work */ +/*---------------------------------------------------------------------------*/ +int xio_ctx_add_delayed_work(struct xio_context *ctx, + int msec_duration, void *data, + void (*timer_fn)(void *data), + xio_ctx_delayed_work_t *work); + +/*---------------------------------------------------------------------------*/ +/* xio_ctx_del_delayed_work */ +/*---------------------------------------------------------------------------*/ +int xio_ctx_del_delayed_work(struct xio_context *ctx, + xio_ctx_delayed_work_t *work); + +/*---------------------------------------------------------------------------*/ +/* xio_ctx_add_work */ +/*---------------------------------------------------------------------------*/ +int xio_ctx_add_work(struct xio_context *ctx, void *data, + void (*function)(void *data), + xio_ctx_work_t *work); + +/*---------------------------------------------------------------------------*/ +/* xio_ctx_set_work_destructor */ +/*---------------------------------------------------------------------------*/ +int xio_ctx_set_work_destructor( + struct xio_context *ctx, void *data, + void (*destructor)(void *data), + xio_ctx_work_t *work); + +/*---------------------------------------------------------------------------*/ +/* xio_ctx_is_work_in_handler */ +/*---------------------------------------------------------------------------*/ +int xio_ctx_is_work_in_handler(struct xio_context *ctx, xio_ctx_work_t *work); + +/*---------------------------------------------------------------------------*/ +/* xio_ctx_del_work */ +/*---------------------------------------------------------------------------*/ +int xio_ctx_del_work(struct xio_context *ctx, + xio_ctx_work_t *work); + +/*---------------------------------------------------------------------------*/ +/* xio_ctx_add_event */ +/*---------------------------------------------------------------------------*/ +int xio_context_add_event(struct xio_context *ctx, struct xio_ev_data *evt); + +/*---------------------------------------------------------------------------*/ +/* xio_context_disable_event */ +/*---------------------------------------------------------------------------*/ +void 
xio_context_disable_event(struct xio_ev_data *evt); + +/*---------------------------------------------------------------------------*/ +/* xio_context_is_pending_event */ +/*---------------------------------------------------------------------------*/ +int xio_context_is_pending_event(struct xio_ev_data *evt); + +/*---------------------------------------------------------------------------*/ +/* xio_context_is_loop_stopping */ +/*---------------------------------------------------------------------------*/ +int xio_context_is_loop_stopping(struct xio_context *ctx); + +/*---------------------------------------------------------------------------*/ +/* xio_context_set_poll_completions_fn */ +/*---------------------------------------------------------------------------*/ +void xio_context_set_poll_completions_fn( + struct xio_context *ctx, + poll_completions_fn_t poll_completions_fn, + void *poll_completions_ctx); + +/*---------------------------------------------------------------------------*/ +/* xio_context_modify_ev_handler */ +/*---------------------------------------------------------------------------*/ +int xio_context_modify_ev_handler(struct xio_context *ctx, + int fd, int events); + +/* + * should be called only from context_shutdown event context + */ +/*---------------------------------------------------------------------------*/ +/* xio_context_destroy_wait */ +/*---------------------------------------------------------------------------*/ +static inline void xio_context_destroy_wait(struct xio_context *ctx) +{ + ctx->run_private++; +} + +/* + * should be called only from loop context + */ +/*---------------------------------------------------------------------------*/ +/* xio_context_destroy_resume */ +/*---------------------------------------------------------------------------*/ +void xio_context_destroy_resume(struct xio_context *ctx); + +/*---------------------------------------------------------------------------*/ +/* xio_context_msg_pool_get */ +/*---------------------------------------------------------------------------*/ +static inline void *xio_context_msg_pool_get(struct xio_context *ctx) +{ + return xio_objpool_alloc(ctx->msg_pool); +} + +/*---------------------------------------------------------------------------*/ +/* xio_context_msg_pool_put */ +/*---------------------------------------------------------------------------*/ +static inline void xio_context_msg_pool_put(void *obj) +{ + xio_objpool_free(obj); +} +/*---------------------------------------------------------------------------*/ +/* xio_ctx_pool_create */ +/*---------------------------------------------------------------------------*/ +int xio_ctx_pool_create(struct xio_context *ctx, enum xio_proto proto, + enum xio_context_pool_class pool_cls); + + +#ifdef XIO_THREAD_SAFE_DEBUG +int xio_ctx_debug_thread_lock(struct xio_context *ctx); +int xio_ctx_debug_thread_unlock(struct xio_context *ctx); +#endif + +#endif /*XIO_CONTEXT_H */ + diff --git a/open_src/xio/src/common/xio_error.c b/open_src/xio/src/common/xio_error.c new file mode 100644 index 0000000..49c3fa1 --- /dev/null +++ b/open_src/xio/src/common/xio_error.c @@ -0,0 +1,146 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. 
+ * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#include +#include "libxio.h" +#include "xio_common.h" + +/*---------------------------------------------------------------------------*/ +/* xio_gen_status_str */ +/*---------------------------------------------------------------------------*/ +static const char *xio_gen_status_str(enum xio_status ev) +{ + switch (ev) { + case XIO_E_NOT_SUPPORTED: + return "Not supported"; + case XIO_E_NO_BUFS: + return "No buffer space available"; + case XIO_E_CONNECT_ERROR: + return "Connect error"; + case XIO_E_ROUTE_ERROR: + return "Route error"; + case XIO_E_ADDR_ERROR: + return "Address error"; + case XIO_E_UNREACHABLE: + return "No route to host"; + case XIO_E_PARTIAL_MSG: + return "Partial message"; + case XIO_E_MSG_SIZE: + return "Message too long"; + case XIO_E_MSG_INVALID: + return "Message is invalid"; + case XIO_E_MSG_UNKNOWN: + return "Message unknown"; + case XIO_E_SESSION_REFUSED: + return "Session refused"; + case XIO_E_SESSION_ABORTED: + return "Session aborted"; + case XIO_E_SESSION_DISCONNECTED: + return "Session disconnected"; + case XIO_E_SESSION_REJECTED: + return "Session rejected"; + case XIO_E_SESSION_REDIRECTED: + return "Session redirected"; + case XIO_E_SESSION_CLOSED: + return "Session closed"; + case XIO_E_BIND_FAILED: + return "Bind failed"; + case XIO_E_TIMEOUT: + return "Timeout"; + case XIO_E_IN_PORGRESS: + return "Operation now in progress"; + case XIO_E_INVALID_VERSION: + return "Invalid version"; + case XIO_E_NOT_SESSION: + return "Not a session"; + case XIO_E_OPEN_FAILED: + return "Open failed"; + case XIO_E_READ_FAILED: + return "Read failed"; + case XIO_E_WRITE_FAILED: + return "Write failed"; + case XIO_E_CLOSE_FAILED: + return "Close failed"; + case XIO_E_UNSUCCESSFUL: + return "Operation unsuccessful"; + case 
XIO_E_MSG_CANCELED: + return "Message canceled"; + case XIO_E_MSG_CANCEL_FAILED: + return "Message cancel failed"; + case XIO_E_MSG_NOT_FOUND: + return "Message not found"; + case XIO_E_MSG_FLUSHED: + return "Message flushed"; + case XIO_E_MSG_DISCARDED: + return "Message discarded"; + case XIO_E_STATE: + return "Operation not permitted in current state"; + case XIO_E_NO_USER_BUFS: + return "User buffers not available"; + case XIO_E_NO_USER_MR: + return "User mr not available"; + case XIO_E_USER_BUF_OVERFLOW: + return "Local user buffers overflow"; + case XIO_E_REM_USER_BUF_OVERFLOW: + return "Remote user buffers overflow"; + case XIO_E_TX_QUEUE_OVERFLOW: + return "Send queue overflow"; + case XIO_E_USER_OBJ_NOT_FOUND: + return "User object not found"; + case XIO_E_PEER_QUEUE_SIZE_MISMATCH: + return "Peer receive queue is smaller then message size"; + case XIO_E_RSP_BUF_SIZE_MISMATCH: + return "Response buffer is smaller then actual response"; + default: + return "Unknown error"; + }; +} + +/*---------------------------------------------------------------------------*/ +/* xio_strerror */ +/*---------------------------------------------------------------------------*/ +const char *xio_strerror(int errnum) +{ + if (errnum < XIO_BASE_STATUS) + return strerror(errnum); + + if (errnum >= XIO_E_NOT_SUPPORTED && errnum < XIO_E_LAST_STATUS) + return xio_gen_status_str((enum xio_status)errnum); + + return "Unknown error"; +} +EXPORT_SYMBOL(xio_strerror); diff --git a/open_src/xio/src/common/xio_hash.h b/open_src/xio/src/common/xio_hash.h new file mode 100644 index 0000000..ea00f77 --- /dev/null +++ b/open_src/xio/src/common/xio_hash.h @@ -0,0 +1,215 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef XIO_HASH_H +#define XIO_HASH_H + +struct xio_key_ptr { + void *id; +}; + +struct xio_key_int8 { + uint8_t id; + uint8_t pad[7]; +}; + +struct xio_key_int16 { + uint16_t id; + uint8_t pad[6]; +}; + +struct xio_key_int32 { + uint32_t id; + uint8_t pad[4]; +}; + +struct xio_key_int64 { + uint64_t id; +}; + +struct xio_key_str { + char *id; +}; + +static inline unsigned int int8_hash(uint8_t key8) +{ + unsigned int key = key8; + + key += ~(key << 15); + key ^= (key >> 10); + key += (key << 3); + key ^= (key >> 6); + key += ~(key << 11); + key ^= (key >> 16); + return key; +} + +/* Thomas Wang's 32 Bit Mix Function: + * http://www.cris.com/~Ttwang/tech/inthash.htm + */ +static inline unsigned int int16_hash(uint16_t key16) +{ + unsigned int key = key16; + + key += ~(key << 15); + key ^= (key >> 10); + key += (key << 3); + key ^= (key >> 6); + key += ~(key << 11); + key ^= (key >> 16); + return key; +} + +/* Thomas Wang's 32 Bit Mix Function: + * http://www.cris.com/~Ttwang/tech/inthash.htm + */ +static inline unsigned int int32_hash(uint32_t key) +{ + key += ~(key << 15); + key ^= (key >> 10); + key += (key << 3); + key ^= (key >> 6); + key += ~(key << 11); + key ^= (key >> 16); + return key; +} + +/* Thomas Wang's 32 Bit Mix Function: + * http://www.cris.com/~Ttwang/tech/inthash.htm + */ +static inline unsigned int int64_hash(uint64_t key) +{ + key += ~(key << 32); + key ^= (key >> 22); + key += ~(key << 13); + key ^= (key >> 8); + key += (key << 3); + key ^= (key >> 15); + key += ~(key << 27); + key ^= (key >> 31); + return (unsigned int)key; +} + +static inline unsigned int str_hash(const char *s) +{ + unsigned int key = 0; + + while (*s) + key = key*37 + *s++; + + return key; +} + +static inline unsigned int xio_int8_hash( + const struct xio_key_int8 *k) +{ + return int8_hash(k->id); +} + +static inline unsigned int xio_int16_hash( + const struct xio_key_int16 *k) +{ + return int16_hash(k->id); +} + +static inline unsigned int xio_int32_hash( + const struct xio_key_int32 *k) +{ + return int32_hash(k->id); +} + +static inline unsigned int xio_int64_hash( + const struct xio_key_int64 *k) +{ + return int64_hash(k->id); +} + +static inline unsigned int xio_str_hash( + const struct xio_key_str *k) +{ + return str_hash(k->id); +} + +static inline unsigned int xio_ptr_hash( + const struct xio_key_ptr *k) +{ + return int64_hash((uint64_t)(uintptr_t)(k->id)); +} + +static inline int xio_int32_cmp( + const struct xio_key_int32 *k1, + const struct xio_key_int32 *k2) +{ + return (k1->id == k2->id); +} + +static inline void xio_int32_cp( + struct xio_key_int32 *dst, + const struct xio_key_int32 *src) +{ + dst->id = src->id; +} + +static inline int xio_int64_cmp( + const struct xio_key_int64 *k1, + const struct xio_key_int64 *k2) +{ + return (k1->id == k2->id); +} + +static inline void xio_int64_cp( + struct xio_key_int64 *dst, + const struct xio_key_int64 *src) +{ + dst->id = src->id; +} + +static inline int xio_ptr_cmp( + const struct xio_key_ptr *k1, + 
const struct xio_key_ptr *k2) +{ + return (k1->id == k2->id); +} + +static inline void xio_ptr_cp( + struct xio_key_ptr *dst, + const struct xio_key_ptr *src) +{ + dst->id = src->id; +} + +#endif diff --git a/open_src/xio/src/common/xio_idr.c b/open_src/xio/src/common/xio_idr.c new file mode 100644 index 0000000..ced8dd5 --- /dev/null +++ b/open_src/xio/src/common/xio_idr.c @@ -0,0 +1,182 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ +#include +#include +#include "xio_log.h" +#include "xio_common.h" +#include "xio_hash.h" +#include +#include "xio_idr.h" +#include + +struct xio_idr_entry { + void *key; + char *name; + + HT_ENTRY(xio_idr_entry, xio_key_int64) idr_ht_entry; +}; + +struct xio_idr { + HT_HEAD(, xio_idr_entry, HASHTABLE_PRIME_MEDIUM) cache; + spinlock_t lock; /* idr lock */ + int pad; +}; + +/*---------------------------------------------------------------------------*/ +/* xio_idr_remove_uobj */ +/*---------------------------------------------------------------------------*/ +int xio_idr_remove_uobj(struct xio_idr *idr, void *uobj) +{ + struct xio_idr_entry *idr_entry = NULL; + struct xio_key_int64 key; + + if (!idr || !uobj) + return -1; + + spin_lock(&idr->lock); + key.id = uint64_from_ptr(uobj); + HT_LOOKUP(&idr->cache, &key, idr_entry, idr_ht_entry); + if (!idr_entry) { + spin_unlock(&idr->lock); + return -1; + } + + HT_REMOVE(&idr->cache, idr_entry, xio_idr_entry, idr_ht_entry); + spin_unlock(&idr->lock); + + kfree(idr_entry->name); + kfree(idr_entry); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_idr_lookup_uobj */ +/*---------------------------------------------------------------------------*/ +int xio_idr_lookup_uobj(struct xio_idr *idr, void *uobj) +{ + struct xio_idr_entry *idr_entry = NULL; + struct xio_key_int64 key; + + if (!idr || !uobj) + return 0; + + spin_lock(&idr->lock); + key.id = uint64_from_ptr(uobj); + HT_LOOKUP(&idr->cache, &key, idr_entry, idr_ht_entry); + spin_unlock(&idr->lock); + + return idr_entry ? 1 : 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_sessions_cache_add */ +/*---------------------------------------------------------------------------*/ +int xio_idr_add_uobj(struct xio_idr *idr, void *uobj, const char *obj_name) +{ + struct xio_idr_entry *idr1_entry = NULL, *idr_entry; + struct xio_key_int64 key; + int retval = -1; + char *pname = NULL; + + if (!idr || !uobj) + return -1; + + idr_entry = (struct xio_idr_entry *) + kcalloc(1, sizeof(*idr_entry), GFP_KERNEL); + if (!idr_entry) + return -1; + + pname = kstrdup(obj_name, GFP_KERNEL); + + spin_lock(&idr->lock); + key.id = uint64_from_ptr(uobj); + HT_LOOKUP(&idr->cache, &key, idr1_entry, idr_ht_entry); + if (idr1_entry) + goto exit; + + idr_entry->key = uobj; + idr_entry->name = pname; + HT_INSERT(&idr->cache, &key, idr_entry, idr_ht_entry); + retval = 0; +exit: + spin_unlock(&idr->lock); + if (retval) { + kfree(pname); + kfree(idr_entry); + } + return retval; +} + +/*---------------------------------------------------------------------------*/ +/* xio_idr_create */ +/*---------------------------------------------------------------------------*/ +struct xio_idr *xio_idr_create(void) +{ + struct xio_idr *idr; + + idr = (struct xio_idr *)kcalloc(1, sizeof(*idr), GFP_KERNEL); + if (!idr) + return NULL; + + HT_INIT(&idr->cache, xio_int64_hash, xio_int64_cmp, xio_int64_cp); + spin_lock_init(&idr->lock); + + return idr; +} + +/*---------------------------------------------------------------------------*/ +/* xio_idr_destroy */ +/*---------------------------------------------------------------------------*/ +void xio_idr_destroy(struct xio_idr *idr) +{ + struct xio_idr_entry *idr_entry = NULL; + + if (!idr) + return; + + HT_FOREACH_SAFE(idr_entry, &idr->cache, idr_ht_entry) { + HT_REMOVE(&idr->cache, idr_entry, xio_idr_entry, idr_ht_entry); + ERROR_LOG("user object leaked: %p, type:struct %s\n", + idr_entry->key, 
idr_entry->name); + kfree(idr_entry->name); + kfree(idr_entry); + } + kfree(idr); +} + diff --git a/open_src/xio/src/common/xio_idr.h b/open_src/xio/src/common/xio_idr.h new file mode 100644 index 0000000..c544a85 --- /dev/null +++ b/open_src/xio/src/common/xio_idr.h @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef XIO_IDR_H +#define XIO_IDR_H + +struct xio_idr; + +/*---------------------------------------------------------------------------*/ +/* user object cache */ +/*---------------------------------------------------------------------------*/ +struct xio_idr *xio_idr_create(void); + +int xio_idr_add_uobj(struct xio_idr *cache, void *uobj, const char *obj_name); + +int xio_idr_remove_uobj(struct xio_idr *cache, void *uobj); + +int xio_idr_lookup_uobj(struct xio_idr *cache, void *uobj); + +void xio_idr_destroy(struct xio_idr *cache); + +#endif /*XIO_IDR_H */ + diff --git a/open_src/xio/src/common/xio_mbuf.h b/open_src/xio/src/common/xio_mbuf.h new file mode 100644 index 0000000..e568583 --- /dev/null +++ b/open_src/xio/src/common/xio_mbuf.h @@ -0,0 +1,498 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. 
+ * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef XIO_MBUF_H +#define XIO_MBUF_H + + struct xio_mbuf_buf { + void *head; + void *tail; + uint32_t buflen; + uint32_t datalen; + }; + + struct xio_mbuf_tlv { + void *head; + void *tail; + uint64_t len; + uint32_t type; + uint32_t pad; + void *val; + }; + +struct xio_mbuf { + void *curr; + struct xio_mbuf_buf buf; + struct xio_mbuf_tlv tlv; + void *marker; +}; + +#define xio_mbuf_set_tlv_hdr(mbuf) \ + ((mbuf)->curr = ((mbuf)->tlv.head)) + +#define xio_mbuf_set_val_start(mbuf) \ + ((mbuf)->curr = ((char *)(mbuf)->tlv.head + XIO_TLV_LEN)) + +#define xio_mbuf_set_session_hdr(mbuf) \ + ((mbuf)->curr = sum_to_ptr((mbuf)->tlv.head, XIO_TLV_LEN)) + +#define xio_mbuf_set_trans_hdr(mbuf) \ + ((mbuf)->curr = sum_to_ptr((mbuf)->tlv.head, \ + XIO_TLV_LEN + XIO_SESSION_HDR_LEN)) + +#define xio_mbuf_tlv_head(mbuf) ((mbuf)->tlv.head) + +#define xio_mbuf_tlv_val_ptr(mbuf) ((mbuf)->tlv.val) + +#define xio_mbuf_tlv_type(mbuf) ((mbuf)->tlv.type) + +#define xio_mbuf_data_length(mbuf) ((mbuf)->buf.datalen) + +#define xio_mbuf_tlv_len(mbuf) \ + ((char *)(mbuf)->curr - (char *)(mbuf)->tlv.head) + +#define xio_mbuf_tlv_payload_len(mbuf) \ + ((char *)(mbuf)->curr - (char *)(mbuf)->tlv.val) + +#define xio_mbuf_reset(mbuf) \ + ((mbuf)->curr = (mbuf)->buf.head) + +#define xio_mbuf_tlv_space_left(mbuf) \ + ((mbuf)->buf.tail - (mbuf)->curr) + +#define xio_mbuf_get_curr_ptr(mbuf) ((mbuf)->curr) + +#define xio_mbuf_get_curr_offset(mbuf) \ + ((char *)(mbuf)->curr - (char *)(mbuf)->buf.head) + +#define xio_mbuf_inc(mbuf, len) \ + ((mbuf)->curr = ((char *)(mbuf)->curr + (len))) + +#define xio_mbuf_dec(mbuf, len) \ + ((mbuf)->curr = ((mbuf)->curr - (len))) + +#define xio_mbuf_push(mbuf) ((mbuf)->marker = (mbuf)->curr) + +#define xio_mbuf_pop(mbuf) ((mbuf)->curr = 
(mbuf)->marker) + +/*---------------------------------------------------------------------------*/ +/* xio_mbuf_dump */ +/*---------------------------------------------------------------------------*/ +static inline void xio_mbuf_dump(struct xio_mbuf *mbuf) +{ + DEBUG_LOG("########################################################" \ + "#############\n"); + DEBUG_LOG("buf: mbuf:%p head:%p, tail:%p, buflen:%u, datalen:%u\n", + mbuf, mbuf->buf.head, mbuf->buf.tail, mbuf->buf.buflen, + mbuf->buf.datalen); + DEBUG_LOG("tlv: mbuf:%p head:%p, tail:%p, type:%d, len:%llu, val:%p\n", + mbuf, mbuf->tlv.head, mbuf->tlv.tail, mbuf->tlv.type, + mbuf->tlv.len, mbuf->tlv.val); + DEBUG_LOG("curr: mbuf:%p curr:%p\n", mbuf, mbuf->curr); + DEBUG_LOG("#########################################################" \ + "############\n"); +} + +/*---------------------------------------------------------------------------*/ +/* xio_mbuf_init */ +/*---------------------------------------------------------------------------*/ +static inline void xio_mbuf_init(struct xio_mbuf *mbuf, void *buf, + uint32_t buflen, uint32_t datalen) +{ + struct xio_mbuf_buf *pbuf = &mbuf->buf; + struct xio_mbuf_tlv *tlv = &mbuf->tlv; + + mbuf->curr = buf; + pbuf->head = buf; + pbuf->tail = sum_to_ptr(buf, buflen); + pbuf->buflen = buflen; + pbuf->datalen = datalen; + + memset(tlv, 0, sizeof(*tlv)); +} + +/*---------------------------------------------------------------------------*/ +/* xio_mbuf_tlv_start */ +/*---------------------------------------------------------------------------*/ +static inline int xio_mbuf_tlv_start(struct xio_mbuf *mbuf) +{ + struct xio_mbuf_buf *buf = &mbuf->buf; + struct xio_mbuf_tlv *tlv = &mbuf->tlv; + + if (((uint64_t)((char *)buf->tail - (char *)mbuf->curr)) <= + XIO_TLV_LEN) { + ERROR_LOG("xio_mbuf_tlv start failed. buf.tail:%p, " \ + "len:%zd, curr:%p\n", + buf->tail, XIO_TLV_LEN, mbuf->curr); + return -1; + } + + tlv->head = mbuf->curr; + tlv->tail = buf->tail; + tlv->val = sum_to_ptr(buf->head, XIO_TLV_LEN); + mbuf->curr = tlv->val; + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_mbuf_read_first_tlv */ +/*---------------------------------------------------------------------------*/ +static inline int xio_mbuf_read_first_tlv(struct xio_mbuf *mbuf) +{ + int len; + struct xio_mbuf_tlv *tlv = &mbuf->tlv; + + tlv->head = mbuf->buf.head; + + len = xio_read_tlv(&tlv->type, &tlv->len, + &tlv->val, (uint8_t *)tlv->head); + if (len == -1 || (sum_to_ptr(tlv->head, len) > mbuf->buf.tail)) { + ERROR_LOG("xio_mbuf_first_read_tlv failed. tlv.head:%p, " \ + "len:%d, buf.tail:%p\n", + tlv->head, len, mbuf->buf.tail); + return -1; + } + tlv->tail = sum_to_ptr(tlv->head, len); + mbuf->curr = tlv->val; + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_mbuf_read_next_tlv */ +/*---------------------------------------------------------------------------*/ +static inline int xio_mbuf_read_next_tlv(struct xio_mbuf *mbuf) +{ + int len; + + mbuf->tlv.head = mbuf->tlv.tail; + + len = xio_read_tlv(&mbuf->tlv.type, &mbuf->tlv.len, + &mbuf->tlv.val, (uint8_t *)mbuf->tlv.head); + if (len == -1 || (sum_to_ptr(mbuf->tlv.head, len) > mbuf->buf.tail)) { + ERROR_LOG("xio_mbuf_next_read_tlv failed. 
tlv.head:%p, " \ + "len:%d, buf.tail:%p\n", + mbuf->tlv.head, len, mbuf->buf.tail); + return -1; + } + mbuf->tlv.tail = sum_to_ptr(mbuf->tlv.head, len); + mbuf->curr = mbuf->tlv.val; + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_mbuf_write_tlv */ +/*---------------------------------------------------------------------------*/ +static inline int xio_mbuf_write_tlv(struct xio_mbuf *mbuf, uint32_t type, + uint16_t len) +{ + int retval; + + mbuf->tlv.type = type; + mbuf->tlv.len = len; + + retval = xio_write_tlv(mbuf->tlv.type, mbuf->tlv.len, + (uint8_t *)mbuf->tlv.head); + if (retval == -1 || (sum_to_ptr(mbuf->tlv.head, retval) > + mbuf->buf.tail)) { + ERROR_LOG("xio_mbuf_write_tlv failed. tlv.head:%p, " \ + "len:%d, buf.tail:%p\n", + mbuf->tlv.head, retval, mbuf->buf.tail); + return -1; + } + mbuf->tlv.tail = sum_to_ptr(mbuf->tlv.head, retval); + mbuf->buf.datalen = (char *)mbuf->curr - (char *)mbuf->tlv.head; + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_read_tlv */ +/*---------------------------------------------------------------------------*/ +static inline uint32_t xio_read_tlv_type(struct xio_mbuf *mbuf) +{ + struct xio_tlv *tlv; + static uint32_t magic; + + if (magic == 0) + magic = ntohl(XIO_MAGIC); + + tlv = (struct xio_tlv *)mbuf->tlv.head; + if (tlv->magic != magic) + return -1; + + return ntohl(tlv->type); +} + +/*---------------------------------------------------------------------------*/ +/* xio_mbuf_write_u8 */ +/*---------------------------------------------------------------------------*/ +static inline int xio_mbuf_write_u8(struct xio_mbuf *mbuf, uint8_t val) +{ + if (sum_to_ptr(mbuf->curr, sizeof(uint8_t)) <= mbuf->buf.tail) { + inc_ptr(mbuf->curr, + xio_write_uint8(val, 0, (uint8_t *)mbuf->curr)); + return 0; + } + ERROR_LOG("xio_mbuf_write_u8 failed. curr:%p, " \ + "len:%zd, buf.tail:%p\n", + mbuf->curr, sizeof(uint8_t), mbuf->buf.tail); + + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_mbuf_read_u8 */ +/*---------------------------------------------------------------------------*/ +static inline int xio_mbuf_read_u8(struct xio_mbuf *mbuf, uint8_t *val) +{ + if (sum_to_ptr(mbuf->curr, sizeof(uint8_t)) <= mbuf->tlv.tail) { + inc_ptr(mbuf->curr, + xio_read_uint8(val, 0, (const uint8_t *)mbuf->curr)); + return 0; + } + ERROR_LOG("xio_mbuf_read_u8 failed. curr:%p, " \ + "len:%zd, tlv.tail:%p\n", + mbuf->curr, sizeof(uint8_t), mbuf->tlv.tail); + + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_mbuf_write_u16 */ +/*---------------------------------------------------------------------------*/ +static inline int xio_mbuf_write_u16(struct xio_mbuf *mbuf, uint16_t val) +{ + if (sum_to_ptr(mbuf->curr, sizeof(uint16_t)) <= mbuf->buf.tail) { + inc_ptr(mbuf->curr, + xio_write_uint16(val, 0, (uint8_t *)mbuf->curr)); + return 0; + } + ERROR_LOG("xio_mbuf_write_u16 failed. 
curr:%p, " \ + "len:%zd, buf.tail:%p\n", + mbuf->curr, sizeof(uint16_t), mbuf->buf.tail); + + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_mbuf_read_u16 */ +/*---------------------------------------------------------------------------*/ +static inline int xio_mbuf_read_u16(struct xio_mbuf *mbuf, uint16_t *val) +{ + if (sum_to_ptr(mbuf->curr, sizeof(uint16_t)) <= mbuf->tlv.tail) { + inc_ptr(mbuf->curr, + xio_read_uint16(val, 0, (const uint8_t *)mbuf->curr)); + return 0; + } + ERROR_LOG("xio_mbuf_read_u16 failed. curr:%p, " \ + "len:%zd, tlv.tail:%p\n", + mbuf->curr, sizeof(uint16_t), mbuf->tlv.tail); + + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_mbuf_write_u32 */ +/*---------------------------------------------------------------------------*/ +static inline int xio_mbuf_write_u32(struct xio_mbuf *mbuf, uint32_t val) +{ + if (sum_to_ptr(mbuf->curr, sizeof(uint32_t)) <= mbuf->buf.tail) { + inc_ptr(mbuf->curr, + xio_write_uint32(val, 0, (uint8_t *)mbuf->curr)); + return 0; + } + ERROR_LOG("xio_mbuf_write_u32 failed. curr:%p, " \ + "len:%zd, buf.tail:%p\n", + mbuf->curr, sizeof(uint32_t), mbuf->buf.tail); + + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_mbuf_read_u32 */ +/*---------------------------------------------------------------------------*/ +static inline int xio_mbuf_read_u32(struct xio_mbuf *mbuf, uint32_t *val) +{ + if (sum_to_ptr(mbuf->curr, sizeof(uint32_t)) <= mbuf->tlv.tail) { + inc_ptr(mbuf->curr, + xio_read_uint32(val, 0, (const uint8_t *)mbuf->curr)); + return 0; + } + ERROR_LOG("xio_mbuf_read_u32 failed. curr:%p, " \ + "len:%zd, tlv.tail:%p\n", + mbuf->curr, sizeof(uint32_t), mbuf->tlv.tail); + + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_mbuf_write_u64 */ +/*---------------------------------------------------------------------------*/ +static inline int xio_mbuf_write_u64(struct xio_mbuf *mbuf, uint64_t val) +{ + if (sum_to_ptr(mbuf->curr, sizeof(uint64_t)) <= mbuf->buf.tail) { + inc_ptr(mbuf->curr, + xio_write_uint64(val, 0, (uint8_t *)mbuf->curr)); + return 0; + } + ERROR_LOG("xio_mbuf_write_u64 failed. curr:%p, len:%zd, " \ + "buf.tail:%p\n", mbuf->curr, sizeof(uint64_t), + mbuf->buf.tail); + + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_mbuf_read_u64 */ +/*---------------------------------------------------------------------------*/ +static inline int xio_mbuf_read_u64(struct xio_mbuf *mbuf, uint64_t *val) +{ + if ((uint64_t)((char *)mbuf->tlv.tail - (char *)mbuf->curr) > + sizeof(uint64_t)) { + inc_ptr(mbuf->curr, + xio_read_uint64(val, 0, (const uint8_t *)mbuf->curr)); + return 0; + } + ERROR_LOG("xio_mbuf_read_u64 failed. curr:%p, " \ + "len:%zd, tlv.tail:%p\n", + mbuf->curr, sizeof(uint64_t), mbuf->tlv.tail); + + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_mbuf_write_array */ +/*---------------------------------------------------------------------------*/ +static inline int xio_mbuf_write_array(struct xio_mbuf *mbuf, void *array, + size_t len) +{ + if (sum_to_ptr(mbuf->curr, len) <= mbuf->buf.tail) { + inc_ptr(mbuf->curr, + xio_write_array((const uint8_t *)array, len, + 0, (uint8_t *)mbuf->curr)); + return 0; + } + ERROR_LOG("xio_mbuf_write_array failed. 
curr:%p, " \ + "len:%zd, buf.tail:%p\n", + mbuf->curr, len, mbuf->buf.tail); + + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_mbuf_read_array */ +/*---------------------------------------------------------------------------*/ +static inline int xio_mbuf_read_array(struct xio_mbuf *mbuf, void *array, + size_t len) +{ + if (sum_to_ptr(mbuf->curr, len) <= mbuf->tlv.tail) { + inc_ptr(mbuf->curr, + xio_read_array((uint8_t *)array, len, 0, + (const uint8_t *)mbuf->curr)); + return 0; + } + ERROR_LOG("xio_mbuf_read_array failed. curr:%p, len:%zd, " \ + "tlv.tail:%p\n", + mbuf->curr, len, mbuf->tlv.tail); + + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_mbuf_write_string */ +/*---------------------------------------------------------------------------*/ +static inline int xio_mbuf_write_string(struct xio_mbuf *mbuf, + const char *str, size_t maxlen) +{ + size_t len = strnlen(str, maxlen); + + if (sum_to_ptr(mbuf->curr, len) <= mbuf->buf.tail) { + inc_ptr(mbuf->curr, + xio_write_string(str, maxlen, 0, + (uint8_t *)mbuf->curr)); + return 0; + } + + ERROR_LOG("xio_mbuf_write_string failed. curr:%p, " \ + "len:%zd, buf.tail:%p\n", + mbuf->curr, len, mbuf->buf.tail); + + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_mbuf_read_string */ +/*---------------------------------------------------------------------------*/ +static inline int xio_mbuf_read_string(struct xio_mbuf *mbuf, char *str, + uint16_t maxlen, size_t *len) +{ + *len = xio_read_string(str, maxlen, 0, (const uint8_t *)mbuf->curr); + inc_ptr(mbuf->curr, *len); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_mbuf_set_data_length */ +/*---------------------------------------------------------------------------*/ +static inline int xio_mbuf_set_data_length(struct xio_mbuf *mbuf, + size_t datalen) +{ + if (likely(datalen <= mbuf->buf.buflen)) { + mbuf->buf.datalen = datalen; + return 0; + } + ERROR_LOG("xio_mbuf_set_data_length failed. datalen:%zd, " \ + "buf.buflen:%u\n", datalen, mbuf->buf.buflen); + + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_mbuf_read_first_tlv */ +/*---------------------------------------------------------------------------*/ +static inline int xio_mbuf_read_type(struct xio_mbuf *mbuf) +{ + struct xio_tlv *tlv = (struct xio_tlv *)mbuf->buf.head; + + return ntohl(tlv->type); +} + +#endif + diff --git a/open_src/xio/src/common/xio_msg_list.h b/open_src/xio/src/common/xio_msg_list.h new file mode 100644 index 0000000..fb9eddb --- /dev/null +++ b/open_src/xio/src/common/xio_msg_list.h @@ -0,0 +1,145 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef XIO_MSG_LIST_H +#define XIO_MSG_LIST_H + +struct xio_msg_list { + struct xio_msg *first; /* first element */ + struct xio_msg **last; /* addr of last next element */ +}; + +#define XIO_MSG_LIST_HEAD_INITIALIZER(head) \ + { NULL, &(head).first } + +/* + * msg list functions. + */ +#define xio_msg_list_init(head) do { \ + (head)->first = NULL; \ + (head)->last = &(head)->first; \ +} while (/*CONSTCOND*/0) + +#define xio_msg_list_insert_head(head, elm, field) do { \ + if (((elm)->field.next = (head)->first) != NULL) \ + (head)->first->field.prev = \ + &(elm)->field.next; \ + else \ + (head)->last = &(elm)->field.next; \ + (head)->first = (elm); \ + (elm)->field.prev = &(head)->first; \ +} while (/*CONSTCOND*/0) + +#define xio_msg_list_insert_tail(head, elm, field) do { \ + (elm)->field.next = NULL; \ + (elm)->field.prev = (head)->last; \ + *(head)->last = (elm); \ + (head)->last = &(elm)->field.next; \ +} while (/*CONSTCOND*/0) + +#define xio_msg_list_insert_after(head, listelm, elm, field) do { \ + if (((elm)->field.next = (listelm)->field.next) != NULL) \ + (elm)->field.next->field.prev = \ + &(elm)->field.next; \ + else \ + (head)->last = &(elm)->field.next; \ + (listelm)->field.next = (elm); \ + (elm)->field.prev = &(listelm)->field.next; \ +} while (/*CONSTCOND*/0) + +#define xio_msg_list_insert_before(listelm, elm, field) do { \ + (elm)->field.prev = (listelm)->field.prev; \ + (elm)->field.next = (listelm); \ + *(listelm)->field.prev = (elm); \ + (listelm)->field.prev = &(elm)->field.next; \ +} while (/*CONSTCOND*/0) + +#define xio_msg_list_remove(head, elm, field) do { \ + if (((elm)->field.next) != NULL) \ + (elm)->field.next->field.prev = \ + (elm)->field.prev; \ + else \ + (head)->last = (elm)->field.prev; \ + *(elm)->field.prev = (elm)->field.next; \ +} while (/*CONSTCOND*/0) + +#define xio_msg_list_foreach(var, head, field) \ + for ((var) = ((head)->first); \ + (var); \ + (var) = ((var)->field.next)) + +#define xio_msg_list_foreach_reverse(var, head, headname, field) \ + for ((var) = (*(((struct headname *)((head)->last))->last)); \ + (var); \ + (var) = (*(((struct headname *)((var)->field.prev))->last))) + +#define xio_msg_list_foreach_safe(var, head, tvar, field) \ + for ((var) = xio_msg_list_first((head)); \ + (var) && ((tvar) = 
xio_msg_list_next((var), field), 1); \
+ (var) = (tvar))
+
+#define xio_msg_list_concat(head1, head2, field) do { \
+ if (!xio_msg_list_empty(head2)) { \
+ *(head1)->last = (head2)->first; \
+ (head2)->first->field.prev = (head1)->last; \
+ (head1)->last = (head2)->last; \
+ xio_msg_list_init((head2)); \
+ } \
+} while (/*CONSTCOND*/0)
+
+#define xio_msg_list_splice(head, elm, field) do { \
+ struct xio_msg *curelm = (elm), *nextelm; \
+ do { \
+ nextelm = (curelm)->field.next; \
+ xio_msg_list_insert_tail((head), (curelm), field); \
+ curelm = nextelm; \
+ } while (curelm); \
+} while (/*CONSTCOND*/0)
+
+/*
+ * message list access methods.
+ */
+#define xio_msg_list_empty(head) (!(head)->first)
+#define xio_msg_list_first(head) ((head)->first)
+#define xio_msg_list_next(elm, field) ((elm)->field.next)
+
+#define xio_msg_list_last(head, headname) \
+ (*(((struct headname *)((head)->last))->last))
+#define xio_msg_list_prev(elm, headname, field) \
+ (*(((struct headname *)((elm)->field.prev))->last))
+
+#endif /* XIO_MSG_LIST_H */
diff --git a/open_src/xio/src/common/xio_nexus.c b/open_src/xio/src/common/xio_nexus.c
new file mode 100644
index 0000000..0212503
--- /dev/null
+++ b/open_src/xio/src/common/xio_nexus.c
@@ -0,0 +1,2747 @@
+/*
+ * Copyright (c) 2013 Mellanox Technologies®. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two licenses.
+ * You may choose to be licensed under the terms of the GNU General Public
+ * License (GPL) Version 2, available from the file COPYING in the main
+ * directory of this source tree, or the Mellanox Technologies® BSD license
+ * below:
+ *
+ * - Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * - Neither the name of the Mellanox Technologies® nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */ +#include +#include +#include "libxio.h" +#include "xio_log.h" +#include "xio_common.h" +#include "xio_protocol.h" +#include "xio_hash.h" +#include "xio_observer.h" +#include "xio_ev_data.h" +#include "xio_objpool.h" +#include "xio_workqueue.h" +#include "xio_protocol.h" +#include "xio_mbuf.h" +#include "xio_task.h" +#include "xio_transport.h" +#include "xio_context.h" +#include "xio_nexus_cache.h" +#include "xio_server.h" +#include "xio_session.h" +#include "xio_nexus.h" +#include + +/*---------------------------------------------------------------------------*/ +/* private structures */ +/*---------------------------------------------------------------------------*/ +struct xio_observers_htbl_node { + struct xio_observer *observer; + uint32_t id; + uint32_t pad; + struct list_head observers_htbl_node; + +}; + +struct xio_event_params { + struct xio_nexus *nexus; + union xio_transport_event_data event_data; +}; + +struct xio_nexus_observer_work { + struct xio_observer_event observer_event; + xio_work_handle_t observer_work; + struct xio_context *ctx; +}; + +static int xio_msecs[] = {60000, 30000, 15000, 0}; + +#define XIO_SERVER_GRACE_PERIOD 1000 +#define XIO_SERVER_TIMEOUT (60000 + 30000 + 15000 + XIO_SERVER_GRACE_PERIOD) + +/*---------------------------------------------------------------------------*/ +/* forward declarations */ +/*---------------------------------------------------------------------------*/ +static int xio_nexus_primary_pool_create(struct xio_nexus *nexus); +static int xio_nexus_primary_pool_recreate(struct xio_nexus *nexus); +static int xio_nexus_on_transport_event(void *observer, void *sender, + int event, void *event_data); +static void xio_nexus_on_transport_closed(struct xio_nexus *nexus, + union xio_transport_event_data + *event_data); +static int xio_nexus_flush_tx_queue(struct xio_nexus *nexus); +static int xio_nexus_destroy(struct xio_nexus *nexus); +static int xio_nexus_xmit(struct xio_nexus *nexus); +static void xio_nexus_destroy_handler(void *nexus_); +static void xio_nexus_disconnected(void *nexus_); +static void xio_nexus_trans_error_handler(void *ev_params_); + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_server_reconnect */ +/*---------------------------------------------------------------------------*/ +static int xio_nexus_server_reconnect(struct xio_nexus *nexus); + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_client_reconnect */ +/*---------------------------------------------------------------------------*/ +static int xio_nexus_client_reconnect(struct xio_nexus *nexus); + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_client_reconnect_timeout */ +/*---------------------------------------------------------------------------*/ +static void xio_nexus_client_reconnect_failed(void *data); + +static void xio_nexus_cancel_dwork(struct xio_nexus *nexus) +{ + xio_ctx_del_delayed_work(nexus->transport_hndl->ctx, + &nexus->close_time_hndl); +} + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_init_observers_htbl */ +/*---------------------------------------------------------------------------*/ +static inline void xio_nexus_init_observers_htbl(struct xio_nexus *nexus) +{ + INIT_LIST_HEAD(&nexus->observers_htbl); +} + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_free_observers_htbl */ 
+/*---------------------------------------------------------------------------*/ +static void xio_nexus_free_observers_htbl(struct xio_nexus *nexus) +{ + struct xio_observers_htbl_node *node, *next_node; + + list_for_each_entry_safe(node, next_node, + &nexus->observers_htbl, + observers_htbl_node) { + list_del(&node->observers_htbl_node); + kfree(node); + } +} + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_hash_observer */ +/*---------------------------------------------------------------------------*/ +static int xio_nexus_hash_observer(struct xio_nexus *nexus, + struct xio_observer *observer, + uint32_t id) +{ + struct xio_observers_htbl_node *node; + + node = (struct xio_observers_htbl_node *) + kcalloc(1, sizeof(*node), GFP_KERNEL); + if (!node) { + xio_set_error(ENOMEM); + ERROR_LOG("kcalloc failed. %m\n"); + return -1; + } + node->observer = observer; + node->id = id; + + list_add_tail(&node->observers_htbl_node, + &nexus->observers_htbl); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_delete_observer */ +/*---------------------------------------------------------------------------*/ +static int xio_nexus_delete_observer(struct xio_nexus *nexus, + struct xio_observer *observer) +{ + struct xio_observers_htbl_node *node, *next_node; + + list_for_each_entry_safe(node, next_node, + &nexus->observers_htbl, + observers_htbl_node) { + if (node->observer == observer) { + list_del(&node->observers_htbl_node); + kfree(node); + return 0; + } + } + + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_observer_lookup */ +/*---------------------------------------------------------------------------*/ +struct xio_observer *xio_nexus_observer_lookup(struct xio_nexus *nexus, + uint32_t id) +{ + struct xio_observers_htbl_node *node, *next_node; + + list_for_each_entry_safe(node, next_node, + &nexus->observers_htbl, + observers_htbl_node) { + if (node->id == id) + return node->observer; + } + + return NULL; +} + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_reg_observer */ +/*---------------------------------------------------------------------------*/ +void xio_nexus_reg_observer(struct xio_nexus *nexus, + struct xio_observer *observer, + uint32_t oid) +{ + spin_lock(&nexus->nexus_obs_lock); + xio_observable_reg_observer(&nexus->observable, observer); + xio_nexus_hash_observer(nexus, observer, oid); + spin_unlock(&nexus->nexus_obs_lock); +} + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_unreg_observer */ +/*---------------------------------------------------------------------------*/ +void xio_nexus_unreg_observer(struct xio_nexus *nexus, + struct xio_observer *observer) +{ + spin_lock(&nexus->nexus_obs_lock); + xio_nexus_delete_observer(nexus, observer); + xio_observable_unreg_observer(&nexus->observable, observer); + spin_unlock(&nexus->nexus_obs_lock); +} + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_get_primary_task */ +/*---------------------------------------------------------------------------*/ +struct xio_task *xio_nexus_get_primary_task(struct xio_nexus *nexus) +{ + struct xio_task *task = xio_tasks_pool_get( + nexus->primary_tasks_pool, nexus->transport_hndl); + + if (!task) + return NULL; + task->nexus = nexus; + + return task; +} + 
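+/*
+ * Editor's note - illustrative sketch only, not part of the upstream
+ * sources: a caller of xio_nexus_get_primary_task() is expected to hand
+ * the returned task to the transport and to recycle it with
+ * xio_tasks_pool_put() on failure, mirroring what
+ * xio_nexus_send_setup_req() below does.  "tlv_type" here stands for a
+ * caller-chosen message type and is an assumption for the example.
+ *
+ *	struct xio_task *task = xio_nexus_get_primary_task(nexus);
+ *
+ *	if (!task)
+ *		return -1;
+ *	task->tlv_type = tlv_type;
+ *	list_add(&task->tasks_list_entry, &nexus->tx_queue);
+ *	if (nexus->transport->send(nexus->transport_hndl, task) != 0) {
+ *		xio_tasks_pool_put(task);
+ *		return -1;
+ *	}
+ */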
+/*---------------------------------------------------------------------------*/ +/* xio_nexus_task_lookup */ +/*---------------------------------------------------------------------------*/ +static inline struct xio_task *xio_nexus_task_lookup(void *nexus, int id) +{ + return xio_tasks_pool_lookup( + ((struct xio_nexus *)nexus)->primary_tasks_pool, id); +} + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_notify_server */ +/*---------------------------------------------------------------------------*/ +static void xio_nexus_notify_server(struct xio_nexus *nexus, int event, + void *event_data) +{ + if (nexus->server) + xio_observable_notify_observer(&nexus->observable, + &nexus->server->observer, + event, event_data); +} + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_write_setup_req */ +/*---------------------------------------------------------------------------*/ +static int xio_nexus_write_setup_req(struct xio_task *task, + struct xio_nexus_setup_req *req) +{ + struct xio_nexus_setup_req *tmp_req; + + /* reset the whole mbuf before building a message */ + xio_mbuf_reset(&task->mbuf); + + /* set start of the tlv */ + if (xio_mbuf_tlv_start(&task->mbuf) != 0) + return -1; + + /* set the mbuf after tlv header */ + xio_mbuf_set_val_start(&task->mbuf); + + tmp_req = (struct xio_nexus_setup_req *) + xio_mbuf_get_curr_ptr(&task->mbuf); + + /* fill request */ + PACK_SVAL(req, tmp_req, version); + PACK_SVAL(req, tmp_req, flags); + PACK_LVAL(req, tmp_req, cid); + + xio_mbuf_inc(&task->mbuf, sizeof(struct xio_nexus_setup_req)); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_read_setup_req */ +/*---------------------------------------------------------------------------*/ +static int xio_nexus_read_setup_req(struct xio_task *task, + struct xio_nexus_setup_req *req) +{ + struct xio_nexus_setup_req *tmp_req; + + /* reset the whole mbuf before building a message */ + xio_mbuf_reset(&task->mbuf); + + /* set start of the tlv */ + if (xio_mbuf_tlv_start(&task->mbuf) != 0) + return -1; + + tmp_req = (struct xio_nexus_setup_req *) + xio_mbuf_get_curr_ptr(&task->mbuf); + + /* fill request */ + UNPACK_SVAL(tmp_req, req, version); + UNPACK_SVAL(tmp_req, req, flags); + UNPACK_LVAL(tmp_req, req, cid); + + xio_mbuf_inc(&task->mbuf, sizeof(struct xio_nexus_setup_req)); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_write_setup_rsp */ +/*---------------------------------------------------------------------------*/ +static int xio_nexus_write_setup_rsp(struct xio_task *task, + struct xio_nexus_setup_rsp *rsp) +{ + struct xio_nexus_setup_rsp *tmp_rsp; + + /* set start of the tlv */ + if (xio_mbuf_tlv_start(&task->mbuf) != 0) + return -1; + + tmp_rsp = (struct xio_nexus_setup_rsp *) + xio_mbuf_get_curr_ptr(&task->mbuf); + + /* fill request */ + PACK_LVAL(rsp, tmp_rsp, cid); + PACK_LVAL(rsp, tmp_rsp, status); + PACK_SVAL(rsp, tmp_rsp, version); + PACK_SVAL(rsp, tmp_rsp, flags); + + xio_mbuf_inc(&task->mbuf, sizeof(struct xio_nexus_setup_rsp)); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_read_setup_rsp */ +/*---------------------------------------------------------------------------*/ +static int xio_nexus_read_setup_rsp(struct xio_task *task, + struct xio_nexus_setup_rsp *rsp) +{ + struct xio_nexus_setup_rsp *tmp_rsp; + + 
/* set start of the tlv */ + if (xio_mbuf_tlv_start(&task->mbuf) != 0) + return -1; + + tmp_rsp = (struct xio_nexus_setup_rsp *) + xio_mbuf_get_curr_ptr(&task->mbuf); + + /* fill request */ + UNPACK_LVAL(tmp_rsp, rsp, cid); + UNPACK_LVAL(tmp_rsp, rsp, status); + UNPACK_SVAL(tmp_rsp, rsp, version); + UNPACK_SVAL(tmp_rsp, rsp, flags); + + xio_mbuf_inc(&task->mbuf, sizeof(struct xio_nexus_setup_rsp)); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_send_setup_req */ +/*---------------------------------------------------------------------------*/ +static int xio_nexus_send_setup_req(struct xio_nexus *nexus) +{ + struct xio_task *task; + struct xio_nexus_setup_req req = {0}; + struct xio_transport_base *trans_hndl; + int retval = 0; + struct xio_tasks_pool *pool; + + TRACE_LOG("send setup request\n"); + + if (!nexus->transport->send) { + ERROR_LOG("transport does not implement \"send\"\n"); + xio_set_error(ENOSYS); + return -1; + } + /* when reconnecting before the dup2 send is done via new handle */ + if (nexus->state == XIO_NEXUS_STATE_RECONNECT) { + req.flags = XIO_RECONNECT; + req.cid = nexus->server_cid; + trans_hndl = nexus->new_transport_hndl; + } else { + req.flags = 0; + req.cid = 0; + trans_hndl = nexus->transport_hndl; + } + + if (nexus->srq_enabled) + pool = nexus->primary_tasks_pool; + else + pool = nexus->initial_tasks_pool; + task = xio_tasks_pool_get(pool, trans_hndl); + if (!task) { + ERROR_LOG("%s task pool is empty\n", pool->params.pool_name); + return -1; + } + task->nexus = nexus; + task->tlv_type = XIO_NEXUS_SETUP_REQ; + task->omsg = NULL; + + req.version = XIO_VERSION; + + retval = xio_nexus_write_setup_req(task, &req); + if (retval) + goto cleanup; + + /* always add it to the top */ + list_add(&task->tasks_list_entry, &nexus->tx_queue); + + if (!trans_hndl) { + ERROR_LOG("null transport handle state=%d\n", nexus->state); + xio_tasks_pool_put(task); + return -1; + } + TRACE_LOG("%s: nexus:%p, rdma_hndl:%p\n", __func__, + nexus, trans_hndl); + retval = nexus->transport->send(trans_hndl, task); + if (retval != 0) { + ERROR_LOG("send setup request failed\n"); + xio_tasks_pool_put(task); + return -1; + } + + return 0; + +cleanup: + xio_tasks_pool_put(task); + xio_set_error(XIO_E_MSG_INVALID); + ERROR_LOG("receiving setup request failed\n"); + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_swap */ +/*---------------------------------------------------------------------------*/ +static int xio_nexus_swap(struct xio_nexus *old, struct xio_nexus *_new) +{ + struct xio_transport *transport; + struct xio_tasks_pool *initial_tasks_pool; + + if (old->transport != _new->transport) { + ERROR_LOG("can't swap not the same transport\n"); + return -1; + } + + transport = old->transport; + + if (!transport->dup2) { + ERROR_LOG("transport doesn't support dup2\n"); + return -ENOSYS; + } + + /* SWAP observers */ + /* disconnect observers */ + xio_observable_unreg_observer( + &_new->transport_hndl->observable, + &_new->trans_observer); + + xio_observable_unreg_observer( + &old->transport_hndl->observable, + &old->trans_observer); + + /* reconnect observers (swapped) */ + xio_observable_reg_observer( + &_new->transport_hndl->observable, + &old->trans_observer); + + xio_observable_reg_observer( + &old->transport_hndl->observable, + &_new->trans_observer); + + /* Swap the initial pool as the setup request arrived on the a task + * from the initial pool and should be 
answered using the same task + */ + initial_tasks_pool = old->initial_tasks_pool; + old->initial_tasks_pool = _new->initial_tasks_pool; + _new->initial_tasks_pool = initial_tasks_pool; + + xio_tasks_pool_remap(old->primary_tasks_pool, _new->transport_hndl); + /* make old_nexus->transport_hndl copy of new_nexus->transport_hndl + * old_nexus->trasport_hndl will be closed, note that observers were + * swapped + */ + if (transport->dup2(_new->transport_hndl, &old->transport_hndl)) { + ERROR_LOG("dup2 transport failed\n"); + return -1; + } + + /* + * Unregister the new_nexus (it was temporary) from the context. + */ + xio_context_unreg_observer(_new->transport_hndl->ctx, &_new->ctx_observer); + + /* silently destroy new_nexus (it was temporary) but do not close + * its transport handler since it was copied from _new to old, + * _new->transport_hndl is now used as old_nexus->transport_hndl. + * + * if the failure is on the client side, destroy the temporary new_nexus. + * if the failure is on the server side, the temporary new_nexus will be + * destroyed after the transport closes (by calling xio_nexus_on_transport_closed + * after a XIO_TRANSPORT_EVENT_CLOSED occurs on the server side. + */ + _new->transport_hndl = NULL; + if (old->transport_hndl->is_client) xio_nexus_destroy(_new); + + /* TODO what about messages held by the application */ + + /* be ready to receive messages */ + if (xio_nexus_primary_pool_recreate(old)) { + ERROR_LOG("recreate primary pool failed\n"); + return -1; + } + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_on_recv_setup_req */ +/*---------------------------------------------------------------------------*/ +static int xio_nexus_on_recv_setup_req(struct xio_nexus *new_nexus, + struct xio_task *task) +{ + struct xio_nexus_setup_req req; + struct xio_nexus_setup_rsp rsp; + struct xio_nexus *nexus; + uint32_t status = 0; + uint32_t cid; + int retval = 0; + uint16_t flags = 0; + + if (new_nexus->state == XIO_NEXUS_STATE_CLOSED) { + ERROR_LOG("got a request for a closing nexus %p\n", new_nexus); + } + + TRACE_LOG("receiving setup request\n"); + retval = xio_nexus_read_setup_req(task, &req); + if (retval != 0) + goto cleanup; + + /* verify version */ + if (req.version != XIO_VERSION) { + ERROR_LOG("client invalid version.cver:0x%x, sver::0x%x\n", + req.version, XIO_VERSION); + xio_set_error(XIO_E_INVALID_VERSION); + return -1; + } + + /* by default nexus is the new nexus */ + nexus = new_nexus; + if (req.flags & XIO_RECONNECT) { + struct xio_nexus *dis_nexus; + /* Server side reconnect strategy, use new transport with the + * old nexus + */ + cid = req.cid; + flags = XIO_RECONNECT; + dis_nexus = xio_nexus_cache_lookup(cid); + if (dis_nexus && dis_nexus != new_nexus) { + /* stop timer */ + xio_nexus_cancel_dwork(dis_nexus); + + retval = xio_nexus_swap(dis_nexus, new_nexus); + if (retval != 0) { + ERROR_LOG("swap nexus failed\n"); + return -1; + } + /* retransmission will start after setup response is + * transmitted - xio_nexus_on_send_setup_rsp_comp + */ + nexus = dis_nexus; + } else { + flags = XIO_CID; + status = XIO_E_UNSUCCESSFUL; + } + goto send_response; + } + + cid = nexus->cid; + /* time to prepare the primary pool if srq is disabled. 
In case + * srq was enabled, it was created in order to send the nexus setup */ + if (!nexus->srq_enabled) { + retval = xio_nexus_primary_pool_create(nexus); + if (retval != 0) { + ERROR_LOG("create primary pool failed\n"); + status = ENOMEM; + goto send_response; + } + } + +send_response: + /* reset mbuf */ + xio_mbuf_reset(&task->mbuf); + + /* write response */ + task->tlv_type = XIO_NEXUS_SETUP_RSP; + task->omsg = NULL; + task->nexus = nexus; + + rsp.cid = cid; + rsp.status = status; + rsp.version = XIO_VERSION; + rsp.flags = flags; + + retval = xio_nexus_write_setup_rsp(task, &rsp); + if (retval != 0) + goto cleanup; + + /* send it */ + TRACE_LOG("%s: nexus:%p, trans_hndl:%p\n", __func__, + nexus, nexus->transport_hndl); + list_move(&task->tasks_list_entry, &nexus->tx_queue); + retval = nexus->transport->send(nexus->transport_hndl, task); + if (retval != 0) { + ERROR_LOG("send setup response failed\n"); + return -1; + } + + return 0; +cleanup: + xio_set_error(XIO_E_MSG_INVALID); + ERROR_LOG("receiving setup request failed\n"); + + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_prep_new_transport */ +/*---------------------------------------------------------------------------*/ +static int xio_nexus_prep_new_transport(struct xio_nexus *nexus) +{ + int retval; + + /* ignore close event on transport_hndl (part of dup2) */ + xio_observable_unreg_observer( + &nexus->transport_hndl->observable, + &nexus->trans_observer); + + /* nexus is an observer of the new transport (see open API) + * no need to register + */ + xio_tasks_pool_remap(nexus->primary_tasks_pool, + nexus->new_transport_hndl); + /* make nexus->transport_hndl copy of nexus->new_transport_hndl + * old nexus->trasport_hndl will be closed + */ + if (nexus->transport->dup2(nexus->new_transport_hndl, + &nexus->transport_hndl)) { + ERROR_LOG("dup2 transport failed\n"); + return -1; + } + + /* TODO: what about messages held by the application */ + /* be ready to receive messages */ + retval = xio_nexus_primary_pool_recreate(nexus); + if (retval != 0) { + ERROR_LOG("recreate primary pool failed\n"); + return -1; + } + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_on_recv_setup_rsp */ +/*---------------------------------------------------------------------------*/ +static int xio_nexus_on_recv_setup_rsp(struct xio_nexus *nexus, + struct xio_task *task) +{ + struct xio_nexus_setup_rsp rsp; + int retval; + + TRACE_LOG("receiving setup response. 
nexus:%p\n", nexus); + retval = xio_nexus_read_setup_rsp(task, &rsp); + if (retval != 0) + goto cleanup; + + if (rsp.status) { + xio_set_error(rsp.status); + ERROR_LOG("remote peer reported status %d - [%s]\n", + rsp.status, xio_strerror(rsp.status)); + if (rsp.flags & XIO_CID) { + /* reconnection is impossible since remote + * CID was not found on server side + */ + /* Stop timer */ + xio_nexus_cancel_dwork(nexus); + if (nexus->state == XIO_NEXUS_STATE_RECONNECT) { + retval = xio_nexus_prep_new_transport(nexus); + if (retval != 0) { + ERROR_LOG( + "prep new transport failed\n"); + return -1; + } + } + + /* Kill nexus */ + nexus->state = XIO_NEXUS_STATE_DISCONNECTED; + TRACE_LOG("nexus state changed to disconnected\n"); + xio_observable_notify_all_observers( + &nexus->observable, + XIO_NEXUS_EVENT_DISCONNECTED, + NULL); + } else { + union xio_nexus_event_data nexus_event_data; + + nexus_event_data.error.reason = XIO_E_CONNECT_ERROR; + xio_observable_notify_all_observers( + &nexus->observable, + XIO_NEXUS_EVENT_ERROR, + &nexus_event_data); + } + xio_tasks_pool_put(task->sender_task); + task->sender_task = NULL; + xio_tasks_pool_put(task); + + return 0; + } + if (rsp.version != XIO_VERSION) { + xio_set_error(XIO_E_INVALID_VERSION); + ERROR_LOG("client invalid version.cver:0x%x, sver::0x%x\n", + XIO_VERSION, rsp.version); + return -1; + } + TRACE_LOG("%s: nexus:%p, trans_hndl:%p\n", __func__, + nexus, nexus->transport_hndl); + /* recycle the tasks */ + xio_tasks_pool_put(task->sender_task); + task->sender_task = NULL; + xio_tasks_pool_put(task); + + if (nexus->state != XIO_NEXUS_STATE_RECONNECT) { + if (!nexus->srq_enabled) { + /* create the primary */ + retval = xio_nexus_primary_pool_create(nexus); + if (retval != 0) { + ERROR_LOG("create primary pool failed\n"); + return -1; + } + } + nexus->state = XIO_NEXUS_STATE_CONNECTED; + + xio_observable_notify_all_observers(&nexus->observable, + XIO_NEXUS_EVENT_ESTABLISHED, + NULL); + /* remember server cid for reconnect */ + nexus->server_cid = rsp.cid; + } else { + /* Stop reconnect timer */ + xio_nexus_cancel_dwork(nexus); + + retval = xio_nexus_prep_new_transport(nexus); + if (retval != 0) { + ERROR_LOG("prep new transport failed\n"); + return -1; + } + nexus->state = XIO_NEXUS_STATE_CONNECTED; + + /* Tell session to re-initiate transmission */ + xio_observable_notify_all_observers(&nexus->observable, + XIO_NEXUS_EVENT_RECONNECTED, + NULL); + } + + return 0; +cleanup: + xio_set_error(XIO_E_MSG_INVALID); + ERROR_LOG("receiving setup request failed\n"); + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_on_send_setup_rsp_comp */ +/*---------------------------------------------------------------------------*/ +static int xio_nexus_on_send_setup_rsp_comp(struct xio_nexus *nexus, + struct xio_task *task) +{ + enum xio_nexus_event nexus_event; + + if (nexus->state == XIO_NEXUS_STATE_RECONNECT) + /* Tell session to re-initiate transmission */ + nexus_event = XIO_NEXUS_EVENT_RECONNECTED; + else + nexus_event = XIO_NEXUS_EVENT_ESTABLISHED; + + /* Set new state */ + nexus->state = XIO_NEXUS_STATE_CONNECTED; + xio_observable_notify_all_observers(&nexus->observable, + nexus_event, + NULL); + + /* recycle the task */ + xio_tasks_pool_put(task); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_on_recv_session_setup_req */ +/*---------------------------------------------------------------------------*/ +static int 
xio_nexus_on_recv_session_setup_req(struct xio_nexus *nexus, + struct xio_task *task) +{ + union xio_nexus_event_data nexus_event_data; + + task->nexus = nexus; + nexus_event_data.msg.task = task; + nexus_event_data.msg.op = XIO_WC_OP_RECV; + + /* add reference count to opened nexus that new + * session is join in */ + if (!nexus->is_first_req) + xio_nexus_addref(nexus); + else + nexus->is_first_req = 0; + + /* always route "hello" to server */ + xio_nexus_notify_server( + nexus, + XIO_NEXUS_EVENT_NEW_MESSAGE, + &nexus_event_data); + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_on_recv_req */ +/*---------------------------------------------------------------------------*/ +static int xio_nexus_on_recv_req(struct xio_nexus *nexus, + struct xio_task *task) +{ + union xio_nexus_event_data nexus_event_data; + + task->nexus = nexus; + nexus_event_data.msg.task = task; + nexus_event_data.msg.op = XIO_WC_OP_RECV; + + /* route the message to any of observer */ + xio_observable_notify_any_observer( + &nexus->observable, + XIO_NEXUS_EVENT_NEW_MESSAGE, + &nexus_event_data); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_on_recv_rsp */ +/*---------------------------------------------------------------------------*/ +static int xio_nexus_on_recv_rsp(struct xio_nexus *nexus, + struct xio_task *task) +{ + union xio_nexus_event_data nexus_event_data; + + task->nexus = nexus; + + nexus_event_data.msg.task = task; + nexus_event_data.msg.op = XIO_WC_OP_RECV; + if (likely(task->sender_task)) { + if (unlikely(task->sender_task->nexus != nexus)) { + DEBUG_LOG("spurious event\n"); + return 0; + } + /* route the response to the sender session */ + xio_observable_notify_observer( + &nexus->observable, + &task->sender_task->session->observer, + XIO_NEXUS_EVENT_NEW_MESSAGE, + &nexus_event_data); + } else { + /* route the message to any of observer */ + xio_observable_notify_any_observer( + &nexus->observable, + XIO_NEXUS_EVENT_NEW_MESSAGE, + &nexus_event_data); + } + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_on_send_msg_comp */ +/*---------------------------------------------------------------------------*/ +static int xio_nexus_on_send_msg_comp(struct xio_nexus *nexus, + struct xio_task *task) +{ + union xio_nexus_event_data nexus_event_data; + + nexus_event_data.msg.task = task; + nexus_event_data.msg.op = XIO_WC_OP_SEND; + + xio_observable_notify_observer( + &nexus->observable, + &task->session->observer, + XIO_NEXUS_EVENT_SEND_COMPLETION, + &nexus_event_data); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_initial_pool_create */ +/*---------------------------------------------------------------------------*/ +static int xio_nexus_initial_pool_create(struct xio_nexus *nexus) +{ + struct xio_tasks_pool_ops *pool_ops; + struct xio_transport_base *transport_hndl; + struct xio_tasks_pool_cls pool_cls; + struct xio_context *ctx; + enum xio_proto proto; + int retval; + + if (nexus->state == XIO_NEXUS_STATE_RECONNECT) + transport_hndl = nexus->new_transport_hndl; + else + transport_hndl = nexus->transport_hndl; + + proto = transport_hndl->proto; + ctx = transport_hndl->ctx; + + retval = xio_ctx_pool_create(ctx, proto, + XIO_CONTEXT_POOL_CLASS_INITIAL); + if (retval) { + ERROR_LOG("Failed to create initial pool. 
nexus:%p\n", nexus); + return -1; + } + + /* set pool helpers to the transport */ + if (nexus->transport->set_pools_cls) { + pool_cls.pool = NULL; + pool_cls.task_get = (struct xio_task *(*)(void *, void *)) + xio_tasks_pool_get; + pool_cls.task_lookup = (struct xio_task * (*)(void *, int)) + xio_tasks_pool_lookup; + pool_cls.task_put = (void (*)(struct xio_task *)) + xio_tasks_pool_put; + + nexus->transport->set_pools_cls(transport_hndl, + &pool_cls, NULL); + } + pool_ops = ctx->initial_pool_ops[proto]; + + if (pool_ops->pool_post_create) + pool_ops->pool_post_create( + transport_hndl, + ctx->initial_tasks_pool[proto], + ctx->initial_tasks_pool[proto]->dd_data); + + nexus->initial_tasks_pool = ctx->initial_tasks_pool[proto]; + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_initial_pool_create */ +/*---------------------------------------------------------------------------*/ +static int xio_nexus_primary_pool_create(struct xio_nexus *nexus) +{ + struct xio_tasks_pool_ops *pool_ops; + struct xio_transport_base *transport_hndl; + struct xio_tasks_pool_cls pool_cls; + struct xio_context *ctx; + enum xio_proto proto; + int retval; + struct xio_task *task; + + transport_hndl = nexus->transport_hndl; + proto = transport_hndl->proto; + ctx = transport_hndl->ctx; + + retval = xio_ctx_pool_create(ctx, proto, + XIO_CONTEXT_POOL_CLASS_PRIMARY); + if (retval) { + ERROR_LOG("Failed to create primary pool. nexus:%p\n", nexus); + return -1; + } + + /* set pool helpers to the transport */ + if (nexus->transport->set_pools_cls) { + pool_cls.pool = NULL; + pool_cls.task_get = (struct xio_task *(*)(void *, void *)) + xio_tasks_pool_get; + pool_cls.task_lookup = (struct xio_task * (*)(void *, int)) + xio_tasks_pool_lookup; + pool_cls.task_put = (void (*)(struct xio_task *)) + xio_tasks_pool_put; + nexus->transport->set_pools_cls(transport_hndl, + NULL, &pool_cls); + } + pool_ops = ctx->primary_pool_ops[proto]; + + if (pool_ops->pool_post_create) + pool_ops->pool_post_create( + transport_hndl, + ctx->primary_tasks_pool[proto], + ctx->primary_tasks_pool[proto]->dd_data); + + nexus->primary_tasks_pool = ctx->primary_tasks_pool[proto]; + + /* set pool context as the nexus's transport handler */ + nexus->primary_tasks_pool->params.pool_hooks.context = nexus->transport_hndl; + + list_for_each_entry(task, &nexus->primary_tasks_pool->stack, tasks_list_entry) { + xio_task_reinit(nexus->transport_hndl, task); + } + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_primary_pool_recreate */ +/*---------------------------------------------------------------------------*/ +static int xio_nexus_primary_pool_recreate(struct xio_nexus *nexus) +{ + struct xio_tasks_pool_cls pool_cls; + struct xio_tasks_pool_ops *pool_ops; + struct xio_context *ctx; + enum xio_proto proto; + + proto = nexus->transport_hndl->proto; + ctx = nexus->transport_hndl->ctx; + pool_ops = ctx->primary_pool_ops[proto]; + + if (!pool_ops || !nexus->primary_tasks_pool) + return -1; + + /* set pool helpers to the transport */ + if (nexus->transport->set_pools_cls) { + pool_cls.pool = NULL; + pool_cls.task_get = (struct xio_task *(*)(void *, void *)) + xio_tasks_pool_get; + pool_cls.task_lookup = (struct xio_task * (*)(void *, int)) + xio_tasks_pool_lookup; + pool_cls.task_put = xio_tasks_pool_put; + + nexus->transport->set_pools_cls(nexus->transport_hndl, + NULL, + &pool_cls); + } + /* Equivalent to old xio_rdma_primary_pool_run, + 
* will call xio_rdma_rearm_rq + */ + if (pool_ops->pool_post_create) + pool_ops->pool_post_create( + nexus->transport_hndl, + nexus->primary_tasks_pool, + nexus->primary_tasks_pool->dd_data); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_release_cb */ +/*---------------------------------------------------------------------------*/ +static void xio_nexus_release_cb(void *data) +{ + struct xio_nexus *nexus = (struct xio_nexus *)data; + + TRACE_LOG("physical nexus close. nexus:%p rdma_hndl:%p\n", + nexus, nexus->transport_hndl); + + if (!nexus->is_listener) + xio_nexus_cache_remove(nexus->cid); + + if (nexus->state != XIO_NEXUS_STATE_DISCONNECTED) { + nexus->state = XIO_NEXUS_STATE_CLOSED; + TRACE_LOG("nexus state changed to closed\n"); + } + + /* now it is zero */ + if (nexus->transport && nexus->transport->close) + nexus->transport->close(nexus->transport_hndl); +} + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_release */ +/*---------------------------------------------------------------------------*/ +static void xio_nexus_release(void *data) +{ + struct xio_nexus *nexus = (struct xio_nexus *)data; + + TRACE_LOG("physical nexus close. nexus:%p rdma_hndl:%p\n", + nexus, nexus->transport_hndl); + + xio_ctx_del_delayed_work(nexus->transport_hndl->ctx, + &nexus->close_time_hndl); + + xio_nexus_release_cb(data); +} + +/*---------------------------------------------------------------------------*/ +/* xio_on_context_close */ +/*---------------------------------------------------------------------------*/ +static void xio_on_context_close(struct xio_nexus *nexus, + struct xio_context *ctx) +{ + TRACE_LOG("xio_on_context_close. nexus:%p, ctx:%p\n", nexus, ctx); + + /* remove the nexus from table */ + xio_nexus_cache_remove(nexus->cid); + + xio_ctx_del_delayed_work(ctx, &nexus->close_time_hndl); + + /* shut down the context and its dependent without waiting */ + if (nexus->transport->context_shutdown) + nexus->transport->context_shutdown(nexus->transport_hndl, ctx); + + /* at that stage the nexus may no longer exist */ +} + +/*---------------------------------------------------------------------------*/ +/* xio_on_context_event */ +/*---------------------------------------------------------------------------*/ +static int xio_on_context_event(void *observer, void *sender, int event, + void *event_data) +{ + TRACE_LOG("xio_on_context_event\n"); + if (event == XIO_CONTEXT_EVENT_CLOSE) { + TRACE_LOG("context: [close] ctx:%p\n", sender); + xio_on_context_close((struct xio_nexus *)observer, + (struct xio_context *)sender); + } + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_on_server_close */ +/*---------------------------------------------------------------------------*/ +static void xio_on_server_close(struct xio_nexus *nexus, + struct xio_server *server) +{ + TRACE_LOG("xio_on_server_close. 
nexus:%p, server:%p\n", nexus, server); + if (nexus->server) { + xio_server_unreg_observer(nexus->server, + &nexus->srv_observer); + nexus->server = NULL; + } +} + +/*---------------------------------------------------------------------------*/ +/* xio_on_server_event */ +/*---------------------------------------------------------------------------*/ +static int xio_on_server_event(void *observer, void *sender, int event, + void *event_data) +{ + TRACE_LOG("xio_on_server_event\n"); + if (event == XIO_SERVER_EVENT_CLOSE) { + TRACE_LOG("server: [close] server:%p\n", sender); + xio_on_server_close((struct xio_nexus *)observer, + (struct xio_server *)sender); + } + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_create */ +/*---------------------------------------------------------------------------*/ +struct xio_nexus *xio_nexus_create(struct xio_nexus *parent_nexus, + struct xio_transport_base *transport_hndl) +{ + struct xio_nexus *nexus; + int retval; + + if (parent_nexus->transport_hndl->is_client) + return NULL; + + /* allocate nexus */ + nexus = (struct xio_nexus *) + kcalloc(1, sizeof(struct xio_nexus), GFP_KERNEL); + if (!nexus) { + xio_set_error(ENOMEM); + ERROR_LOG("kcalloc failed. %m\n"); + return NULL; + } + + XIO_OBSERVER_INIT(&nexus->trans_observer, nexus, + xio_nexus_on_transport_event); + + /* start listen to server events */ + XIO_OBSERVER_INIT(&nexus->srv_observer, nexus, + xio_on_server_event); + + spin_lock_init(&nexus->nexus_obs_lock); + + XIO_OBSERVABLE_INIT(&nexus->observable, nexus); + + xio_nexus_init_observers_htbl(nexus); + + /* start listen to context events */ + XIO_OBSERVER_INIT(&nexus->ctx_observer, nexus, + xio_on_context_event); + + INIT_LIST_HEAD(&nexus->tx_queue); + + xio_context_reg_observer(transport_hndl->ctx, &nexus->ctx_observer); + + /* add the nexus to temporary list */ + nexus->transport_hndl = transport_hndl; + nexus->transport = parent_nexus->transport; + nexus->server = parent_nexus->server; + nexus->srq_enabled = parent_nexus->srq_enabled; + kref_init(&nexus->kref); + nexus->state = XIO_NEXUS_STATE_OPEN; + nexus->is_first_req = 1; + mutex_init(&nexus->lock_connect); + + xio_nexus_cache_add(nexus, &nexus->cid); + + /* add the new nexus as observer to server */ + if (nexus->server) + xio_server_reg_observer(nexus->server, + &nexus->srv_observer); + + /* add the new nexus as observer to transport */ + xio_transport_reg_observer(nexus->transport_hndl, + &nexus->trans_observer); + + if (nexus->transport->get_pools_setup_ops) { + struct xio_context *ctx = nexus->transport_hndl->ctx; + enum xio_proto proto = nexus->transport_hndl->proto; + + if (!ctx->primary_pool_ops[proto] || + !ctx->initial_pool_ops[proto]) + nexus->transport->get_pools_setup_ops( + nexus->transport_hndl, + &ctx->initial_pool_ops[proto], + &ctx->primary_pool_ops[proto]); + } else { + ERROR_LOG("transport does not implement \"add_observer\"\n"); + goto cleanup; + } + if (nexus->srq_enabled) + retval = xio_nexus_primary_pool_create(nexus); + else + retval = xio_nexus_initial_pool_create(nexus); + + if (retval != 0) { + ERROR_LOG("failed to setup pool\n"); + goto cleanup; + } + nexus->destroy_event.handler = xio_nexus_destroy_handler; + nexus->destroy_event.data = nexus; + + nexus->trans_error_event.handler = xio_nexus_trans_error_handler; + nexus->trans_error_event.data = NULL; + + TRACE_LOG("nexus: [new] ptr:%p, transport_hndl:%p\n", nexus, + nexus->transport_hndl); + + return nexus; + +cleanup: + 
xio_nexus_destroy(nexus); + return NULL; +} + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_on_message_error */ +/*---------------------------------------------------------------------------*/ +static void xio_nexus_on_message_error(struct xio_nexus *nexus, + union xio_transport_event_data + *event_data) +{ + union xio_nexus_event_data nexus_event_data; + + nexus_event_data.msg_error.reason = event_data->msg_error.reason; + nexus_event_data.msg_error.direction = event_data->msg_error.direction; + nexus_event_data.msg_error.task = event_data->msg_error.task; + + xio_observable_notify_any_observer(&nexus->observable, + XIO_NEXUS_EVENT_MESSAGE_ERROR, + &nexus_event_data); +} + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_on_new_transport */ +/*---------------------------------------------------------------------------*/ +static void xio_nexus_on_new_transport(struct xio_nexus *nexus, + union xio_transport_event_data + *event_data) +{ + union xio_nexus_event_data nexus_event_data; + struct xio_nexus *child_nexus; + + child_nexus = xio_nexus_create( + nexus, + event_data->new_connection.child_trans_hndl); + + TRACE_LOG("%s: nexus:%p, trans_hndl:%p\n", __func__, + child_nexus, event_data->new_connection.child_trans_hndl); + nexus_event_data.new_nexus.child_nexus = child_nexus; + if (!child_nexus) { + ERROR_LOG("failed to create child nexus\n"); + goto exit; + } + + /* notify of new child to server */ + xio_nexus_notify_server( + nexus, + XIO_NEXUS_EVENT_NEW_CONNECTION, + &nexus_event_data); + + return; +exit: + xio_nexus_notify_server( + nexus, + XIO_NEXUS_EVENT_ERROR, + &nexus_event_data); +} + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_on_transport_closed */ +/*---------------------------------------------------------------------------*/ +static void xio_nexus_on_transport_closed(struct xio_nexus *nexus, + union xio_transport_event_data + *event_data) +{ + xio_nexus_destroy(nexus); +} + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_on_transport_error */ +/*---------------------------------------------------------------------------*/ +static void xio_nexus_on_transport_error(struct xio_nexus *nexus, + union xio_transport_event_data + *event_data) +{ + union xio_nexus_event_data nexus_event_data; + + nexus_event_data.error.reason = event_data->error.reason; + + xio_nexus_state_set(nexus, XIO_NEXUS_STATE_ERROR); + xio_observable_notify_all_observers(&nexus->observable, + XIO_NEXUS_EVENT_ERROR, + &nexus_event_data); +} + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_on_transport_established */ +/*---------------------------------------------------------------------------*/ +static void xio_nexus_on_transport_established(struct xio_nexus *nexus, + union xio_transport_event_data + *event_data) +{ + int retval; + + if (!nexus->transport_hndl->is_client) + return; + + if (nexus->srq_enabled) + retval = xio_nexus_primary_pool_create(nexus); + else + retval = xio_nexus_initial_pool_create(nexus); + + if (retval) + ERROR_LOG("creation of task pool failed\n"); + + xio_nexus_send_setup_req(nexus); +} + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_destroy_handler */ +/*---------------------------------------------------------------------------*/ +static void xio_nexus_destroy_handler(void *nexus_) +{ + struct 
xio_nexus *nexus = (struct xio_nexus *)nexus_; + + xio_nexus_release(nexus); +} + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_disconnected */ +/*---------------------------------------------------------------------------*/ +static void xio_nexus_disconnected(void *nexus_) +{ + struct xio_nexus *nexus = (struct xio_nexus *)nexus_; + int ret; + + /* Try to reconnect */ + if (g_options.reconnect) { + if (nexus->transport_hndl->is_client) + ret = xio_nexus_client_reconnect(nexus); + else + ret = xio_nexus_server_reconnect(nexus); + + if (!ret) { + TRACE_LOG("reconnect attempt nexus:%p\n", nexus); + return; + } + ERROR_LOG("can't reconnect nexus:%p\n", nexus); + } + + /* Can't reconnect */ + + nexus->state = XIO_NEXUS_STATE_DISCONNECTED; + TRACE_LOG("nexus state changed to disconnected nexus:%p\n", nexus); + + xio_nexus_flush_tx_queue(nexus); + if (!xio_observable_is_empty(&nexus->observable)) { + xio_observable_notify_all_observers( + &nexus->observable, + XIO_NEXUS_EVENT_DISCONNECTED, + NULL); + } else { + xio_context_add_event(nexus->transport_hndl->ctx, + &nexus->destroy_event); + } +} + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_trans_error_handler */ +/*---------------------------------------------------------------------------*/ +static void xio_nexus_trans_error_handler(void *ev_params_) +{ + struct xio_event_params *ev_params = + (struct xio_event_params *)ev_params_; + + ev_params->nexus->trans_error_event.data = NULL; + + xio_context_disable_event(&ev_params->nexus->trans_error_event); + + if (ev_params->nexus->state == XIO_NEXUS_STATE_RECONNECT) + xio_nexus_client_reconnect_failed(ev_params->nexus); + else + xio_nexus_on_transport_error(ev_params->nexus, + &ev_params->event_data); + + kfree(ev_params); +} + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_on_transport_disconnected */ +/*---------------------------------------------------------------------------*/ +static void xio_nexus_on_transport_disconnected(struct xio_nexus *nexus, + union xio_transport_event_data + *event_data) +{ + /* cancel old timers */ + xio_ctx_del_delayed_work(nexus->transport_hndl->ctx, + &nexus->close_time_hndl); + + xio_nexus_disconnected(nexus); +} + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_on_new_message */ +/*---------------------------------------------------------------------------*/ +static int xio_nexus_on_new_message(struct xio_nexus *nexus, + union xio_transport_event_data *event_data) +{ + int retval = -1; + struct xio_task *task = event_data->msg.task; + + task->nexus = nexus; + switch (task->tlv_type) { + case XIO_NEXUS_SETUP_RSP: + retval = xio_nexus_on_recv_setup_rsp(nexus, task); + break; + case XIO_NEXUS_SETUP_REQ: + retval = xio_nexus_on_recv_setup_req(nexus, task); + break; + case XIO_CONNECTION_HELLO_REQ: + case XIO_SESSION_SETUP_REQ: + retval = xio_nexus_on_recv_session_setup_req(nexus, task); + break; + default: + if (IS_REQUEST(task->tlv_type)) + retval = xio_nexus_on_recv_req(nexus, task); + else if (IS_RESPONSE(task->tlv_type)) + retval = xio_nexus_on_recv_rsp(nexus, task); + else + ERROR_LOG("unexpected message type %u\n", + task->tlv_type); + break; + }; + + if (retval != 0) { + ERROR_LOG("failed to handle message. 
" \ + "nexus:%p tlv_type:0x%x op:%d\n", + nexus, task->tlv_type, event_data->msg.op); + } + + return retval; +} + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_on_send_completion */ +/*---------------------------------------------------------------------------*/ +static int xio_nexus_on_send_completion(struct xio_nexus *nexus, + union xio_transport_event_data + *event_data) +{ + int retval = -1; + struct xio_task *task = event_data->msg.task; + + switch (task->tlv_type) { + case XIO_NEXUS_SETUP_RSP: + retval = xio_nexus_on_send_setup_rsp_comp(nexus, task); + break; + case XIO_NEXUS_SETUP_REQ: + retval = 0; + break; + default: + retval = xio_nexus_on_send_msg_comp(nexus, task); + break; + }; + + if (retval != 0) { + ERROR_LOG("failed to handle message. " \ + "nexus:%p tlv_type:%d op:%d\n", + nexus, task->tlv_type, event_data->msg.op); + } + + return retval; +} + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_on_direct_rdma_completion */ +/*---------------------------------------------------------------------------*/ +static int xio_nexus_on_direct_rdma_completion( + struct xio_nexus *nexus, + union xio_transport_event_data *event_data) +{ + struct xio_task *task = event_data->msg.task; + union xio_nexus_event_data nexus_event_data; + + nexus_event_data.msg.task = task; + nexus_event_data.msg.op = event_data->msg.op; + + xio_observable_notify_observer( + &nexus->observable, + &task->session->observer, + XIO_NEXUS_EVENT_DIRECT_RDMA_COMPLETION, + &nexus_event_data); + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_on_assign_in_buf */ +/*---------------------------------------------------------------------------*/ +static int xio_nexus_on_assign_in_buf(struct xio_nexus *nexus, + union xio_transport_event_data + *event_data) +{ + int retval = 0; + struct xio_task *task = event_data->msg.task; + union xio_nexus_event_data nexus_event_data; + + nexus_event_data.assign_in_buf.task = event_data->msg.task; + task->nexus = nexus; + + xio_observable_notify_any_observer( + &nexus->observable, + XIO_NEXUS_EVENT_ASSIGN_IN_BUF, + &nexus_event_data); + + event_data->assign_in_buf.is_assigned = + nexus_event_data.assign_in_buf.is_assigned; + + return retval; +} + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_on_alloc_head_buf */ +/*---------------------------------------------------------------------------*/ +static int xio_nexus_on_alloc_head_buf(struct xio_nexus *nexus, + union xio_transport_event_data + *event_data) +{ + int retval = 0; + struct xio_task *task = event_data->msg.task; + union xio_nexus_event_data nexus_event_data; + + nexus_event_data.alloc_head_buf.task = event_data->msg.task; + task->nexus = nexus; + + xio_observable_notify_any_observer( + &nexus->observable, + XIO_NEXUS_EVENT_ALLOC_HEAD_BUF, + &nexus_event_data); + + event_data->alloc_head_buf.is_assigned = + nexus_event_data.alloc_head_buf.is_assigned; + + return retval; +} + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_on_alloc_data_buf */ +/*---------------------------------------------------------------------------*/ +static int xio_nexus_on_alloc_data_buf(struct xio_nexus *nexus, + union xio_transport_event_data + *event_data) +{ + int retval = 0; + struct xio_task *task = event_data->msg.task; + union xio_nexus_event_data nexus_event_data; + + 
nexus_event_data.alloc_data_buf.task = event_data->msg.task; + task->nexus = nexus; + + xio_observable_notify_any_observer( + &nexus->observable, + XIO_NEXUS_EVENT_ALLOC_DATA_BUF, + &nexus_event_data); + + event_data->alloc_data_buf.is_assigned = + nexus_event_data.alloc_data_buf.is_assigned; + + return retval; +} + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_on_cancel_request */ +/*---------------------------------------------------------------------------*/ +static int xio_nexus_on_cancel_request(struct xio_nexus *nexus, + union xio_transport_event_data + *event_data) +{ + union xio_nexus_event_data nexus_event_data = {}; + + nexus_event_data.cancel.ulp_msg = event_data->cancel.ulp_msg; + nexus_event_data.cancel.ulp_msg_sz = event_data->cancel.ulp_msg_sz; + nexus_event_data.cancel.task = event_data->cancel.task; + nexus_event_data.cancel.result = event_data->cancel.result; + + /* route the message to any of the sessions */ + xio_observable_notify_any_observer( + &nexus->observable, + XIO_NEXUS_EVENT_CANCEL_REQUEST, + &nexus_event_data); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_on_cancel_response */ +/*---------------------------------------------------------------------------*/ +static int xio_nexus_on_cancel_response(struct xio_nexus *nexus, + union xio_transport_event_data + *event_data) +{ + union xio_nexus_event_data nexus_event_data = {}; + + nexus_event_data.cancel.ulp_msg = event_data->cancel.ulp_msg; + nexus_event_data.cancel.ulp_msg_sz = event_data->cancel.ulp_msg_sz; + nexus_event_data.cancel.task = event_data->cancel.task; + nexus_event_data.cancel.result = event_data->cancel.result; + + /* route the message to any of the sessions */ + xio_observable_notify_any_observer( + &nexus->observable, + XIO_NEXUS_EVENT_CANCEL_RESPONSE, + &nexus_event_data); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_on_transport_event */ +/*---------------------------------------------------------------------------*/ +static int xio_nexus_on_transport_event(void *observer, void *sender, + int event, void *event_data) +{ + struct xio_nexus *nexus = (struct xio_nexus *)observer; + struct xio_event_params *ev_params; + int tx = 1; + union xio_transport_event_data *ev_data = + (union xio_transport_event_data *)event_data; + + switch (event) { + case XIO_TRANSPORT_EVENT_NEW_MESSAGE: +/* + TRACE_LOG("nexus: [notification] - new message. " \ + "nexus:%p, transport:%p\n", observer, sender); +*/ + xio_nexus_on_new_message(nexus, ev_data); + break; + case XIO_TRANSPORT_EVENT_SEND_COMPLETION: +/* + TRACE_LOG("nexus: [notification] - send completion. " \ + "nexus:%p, transport:%p\n", observer, sender); +*/ + xio_nexus_on_send_completion(nexus, ev_data); + break; + case XIO_TRANSPORT_EVENT_DIRECT_RDMA_COMPLETION: + xio_nexus_on_direct_rdma_completion(nexus, ev_data); + break; + case XIO_TRANSPORT_EVENT_ASSIGN_IN_BUF: + xio_nexus_on_assign_in_buf(nexus, ev_data); + break; + + case XIO_TRANSPORT_EVENT_ALLOC_HEAD_BUF: + xio_nexus_on_alloc_head_buf(nexus, ev_data); + break; + + case XIO_TRANSPORT_EVENT_ALLOC_DATA_BUF: + xio_nexus_on_alloc_data_buf(nexus, ev_data); + break; + + case XIO_TRANSPORT_EVENT_MESSAGE_ERROR: + DEBUG_LOG("nexus: [notification] - message error. 
" \ + "nexus:%p, transport:%p\n", observer, sender); + xio_nexus_on_message_error(nexus, ev_data); + break; + case XIO_TRANSPORT_EVENT_CANCEL_REQUEST: + DEBUG_LOG("nexus: [notification] - cancel request. " \ + "nexus:%p, transport:%p\n", observer, sender); + xio_nexus_on_cancel_request(nexus, ev_data); + break; + case XIO_TRANSPORT_EVENT_CANCEL_RESPONSE: + DEBUG_LOG("nexus: [notification] - cancel respnose. " \ + "nexus:%p, transport:%p\n", observer, sender); + xio_nexus_on_cancel_response(nexus, ev_data); + break; + case XIO_TRANSPORT_EVENT_NEW_CONNECTION: + DEBUG_LOG("nexus: [notification] - new transport. " \ + "nexus:%p, transport:%p\n", observer, sender); + xio_nexus_on_new_transport(nexus, ev_data); + break; + case XIO_TRANSPORT_EVENT_ESTABLISHED: + DEBUG_LOG("nexus: [notification] - transport established. " \ + "nexus:%p, transport:%p\n", observer, sender); + xio_nexus_on_transport_established(nexus, ev_data); + break; + case XIO_TRANSPORT_EVENT_DISCONNECTED: + DEBUG_LOG("nexus: [notification] - transport disconnected. " \ + "nexus:%p, transport:%p\n", observer, sender); + xio_nexus_on_transport_disconnected(nexus, ev_data); + tx = 0; + break; + case XIO_TRANSPORT_EVENT_CLOSED: + DEBUG_LOG("nexus: [notification] - transport closed. " \ + "nexus:%p, transport:%p\n", observer, sender); + xio_nexus_on_transport_closed(nexus, ev_data); + tx = 0; + return 0; + case XIO_TRANSPORT_EVENT_REFUSED: + DEBUG_LOG("nexus: [notification] - transport refused. " \ + "nexus:%p, transport:%p\n", observer, sender); + if (nexus->state == XIO_NEXUS_STATE_RECONNECT) { + xio_nexus_client_reconnect_failed(nexus); + } else { + nexus->state = XIO_NEXUS_STATE_DISCONNECTED; + TRACE_LOG("nexus state changed to disconnected\n"); + xio_nexus_flush_tx_queue(nexus); + xio_observable_notify_all_observers( + &nexus->observable, + XIO_NEXUS_EVENT_REFUSED, + &event_data); + } + tx = 0; + break; + case XIO_TRANSPORT_EVENT_ERROR: + DEBUG_LOG("nexus: [notification] - transport error. 
" \ + "nexus:%p, transport:%p\n", observer, sender); + /* event still pending */ + if (nexus->trans_error_event.data) + return 0; + ev_params = (struct xio_event_params *) + kmalloc(sizeof(*ev_params), GFP_KERNEL); + if (!ev_params) { + ERROR_LOG("failed to allocate memory\n"); + return -1; + } + ev_params->nexus = nexus; + memcpy(&ev_params->event_data, ev_data, sizeof(*ev_data)); + nexus->trans_error_event.data = ev_params; + + xio_context_add_event(nexus->transport_hndl->ctx, + &nexus->trans_error_event); + + tx = 0; + break; + }; + + if (tx && !list_empty(&nexus->tx_queue)) + xio_nexus_xmit(nexus); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_destroy */ +/*---------------------------------------------------------------------------*/ +static int xio_nexus_destroy(struct xio_nexus *nexus) +{ + DEBUG_LOG("nexus:%p - close complete\n", nexus); + + xio_context_disable_event(&nexus->destroy_event); + xio_context_disable_event(&nexus->trans_error_event); + + kfree(nexus->trans_error_event.data); + nexus->trans_error_event.data = NULL; + if (nexus->server) + xio_server_unreg_observer(nexus->server, + &nexus->srv_observer); + + if (nexus->transport_hndl) + xio_transport_unreg_observer(nexus->transport_hndl, + &nexus->trans_observer); + + spin_lock(&nexus->nexus_obs_lock); + xio_nexus_free_observers_htbl(nexus); + xio_observable_unreg_all_observers(&nexus->observable); + spin_unlock(&nexus->nexus_obs_lock); + + if (nexus->transport_hndl) + xio_ctx_del_delayed_work( + nexus->transport_hndl->ctx, + &nexus->close_time_hndl); + + xio_nexus_flush_tx_queue(nexus); + + xio_nexus_cache_remove(nexus->cid); + + if (nexus->transport_hndl) + xio_context_unreg_observer(nexus->transport_hndl->ctx, + &nexus->ctx_observer); + + kfree(nexus->portal_uri); + nexus->portal_uri = NULL; + + kfree(nexus->out_if_addr); + nexus->out_if_addr = NULL; + + XIO_OBSERVER_DESTROY(&nexus->trans_observer); + + XIO_OBSERVABLE_DESTROY(&nexus->observable); + + XIO_OBSERVER_DESTROY(&nexus->ctx_observer); + XIO_OBSERVER_DESTROY(&nexus->srv_observer); + mutex_destroy(&nexus->lock_connect); + + kfree(nexus); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_open */ +/*---------------------------------------------------------------------------*/ +struct xio_nexus *xio_nexus_open(struct xio_context *ctx, + const char *portal_uri, + struct xio_observer *observer, uint32_t oid, + uint32_t attr_mask, + struct xio_nexus_init_attr *init_attr) + +{ + struct xio_transport *transport; + struct xio_nexus *nexus; + char proto[8]; + struct xio_transport_init_attr *ptrans_init_attr = NULL; + struct xio_nexus_query_params query; + + /* look for opened nexus */ + query.ctx = ctx; + query.portal_uri = portal_uri; + query.tos = 0; + query.tos_enabled = 0; + if (attr_mask && init_attr) { + if (test_bits(XIO_NEXUS_ATTR_TOS, &attr_mask)) { + query.tos = init_attr->tos; + query.tos_enabled = 1; + } + } + + nexus = xio_nexus_cache_find(&query); + if (nexus && + (nexus->state == XIO_NEXUS_STATE_CONNECTED || + nexus->state == XIO_NEXUS_STATE_CONNECTING || + nexus->state == XIO_NEXUS_STATE_OPEN || + nexus->state == XIO_NEXUS_STATE_LISTEN || + nexus->state == XIO_NEXUS_STATE_INIT)) { + if (observer) { + spin_lock(&nexus->nexus_obs_lock); + xio_observable_reg_observer(&nexus->observable, + observer); + xio_nexus_hash_observer(nexus, observer, oid); + spin_unlock(&nexus->nexus_obs_lock); + } + + return nexus; + } + + /* extract 
portal from uri */ + if (xio_uri_get_proto(portal_uri, proto, sizeof(proto)) != 0) { + xio_set_error(XIO_E_ADDR_ERROR); + ERROR_LOG("parsing uri failed. uri: %s\n", portal_uri); + return NULL; + } + /* get the transport's proto */ + transport = xio_get_transport(proto); + if (!transport) { + ERROR_LOG("failed to load %s transport layer.\n", proto); + ERROR_LOG("validate that your system supports %s " \ + "and that accelio's %s module is loaded\n", + proto, proto); + xio_set_error(ENOPROTOOPT); + return NULL; + } + + if (!transport->open) { + ERROR_LOG("transport %s does not implement \"open\"\n", + proto); + xio_set_error(ENOSYS); + return NULL; + } + /* allocate nexus */ + nexus = (struct xio_nexus *) + kcalloc(1, sizeof(struct xio_nexus), GFP_KERNEL); + if (!nexus) { + xio_set_error(ENOMEM); + ERROR_LOG("kcalloc failed. %m\n"); + return NULL; + } + + XIO_OBSERVER_INIT(&nexus->trans_observer, nexus, + xio_nexus_on_transport_event); + XIO_OBSERVABLE_INIT(&nexus->observable, nexus); + INIT_LIST_HEAD(&nexus->tx_queue); + mutex_init(&nexus->lock_connect); + + xio_nexus_init_observers_htbl(nexus); + + if (observer) { + spin_lock(&nexus->nexus_obs_lock); + xio_observable_reg_observer(&nexus->observable, observer); + xio_nexus_hash_observer(nexus, observer, oid); + spin_unlock(&nexus->nexus_obs_lock); + } + + /* start listen to server events */ + XIO_OBSERVER_INIT(&nexus->srv_observer, nexus, + xio_on_server_event); + + /* start listen to context events */ + XIO_OBSERVER_INIT(&nexus->ctx_observer, nexus, + xio_on_context_event); + + xio_context_reg_observer(ctx, &nexus->ctx_observer); + + if (attr_mask && init_attr) { + if (test_bits(XIO_NEXUS_ATTR_TOS, &attr_mask)) { + set_bits(XIO_TRANSPORT_ATTR_TOS, + &nexus->trans_attr_mask); + nexus->trans_attr.tos = init_attr->tos; + ptrans_init_attr = &nexus->trans_attr; + } + } + + nexus->transport_hndl = transport->open( + transport, ctx, + &nexus->trans_observer, + nexus->trans_attr_mask, + ptrans_init_attr); + if (!nexus->transport_hndl) { + ERROR_LOG("transport open failed\n"); + goto cleanup; + } + nexus->transport = transport; + kref_init(&nexus->kref); + nexus->state = XIO_NEXUS_STATE_OPEN; + +#ifdef XIO_SRQ_ENABLE + if (nexus->transport_hndl->proto == XIO_PROTO_RDMA) + nexus->srq_enabled = 1; + else + nexus->srq_enabled = 0; +#else + nexus->srq_enabled = 0; +#endif + + if (nexus->transport->get_pools_setup_ops) { + struct xio_context *ctx = nexus->transport_hndl->ctx; + enum xio_proto proto = nexus->transport_hndl->proto; + + if (!ctx->primary_pool_ops[proto] || + !ctx->initial_pool_ops[proto]) + nexus->transport->get_pools_setup_ops( + nexus->transport_hndl, + &ctx->initial_pool_ops[proto], + &ctx->primary_pool_ops[proto]); + } else { + ERROR_LOG("transport does not implement \"get_pools_setup_ops\"\n"); + goto cleanup; + } + nexus->destroy_event.handler = xio_nexus_destroy_handler; + nexus->destroy_event.data = nexus; + + nexus->trans_error_event.handler = xio_nexus_trans_error_handler; + nexus->trans_error_event.data = NULL; + + xio_nexus_cache_add(nexus, &nexus->cid); + + TRACE_LOG("nexus: [new] nexus:%p, transport_hndl:%p\n", nexus, + nexus->transport_hndl); + + return nexus; +cleanup: + xio_nexus_destroy(nexus); + + return NULL; +} + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_reconnect */ +/* client side reconnection */ +/*---------------------------------------------------------------------------*/ +int xio_nexus_reconnect(struct xio_nexus *nexus) +{ + struct xio_transport *transport; + struct 
xio_context *ctx; + int retval; + + if (nexus->state != XIO_NEXUS_STATE_RECONNECT) { + xio_set_error(XIO_E_STATE); + ERROR_LOG("reconnect not permitted in current state(%d)\n", + nexus->state); + return -1; + } + + transport = nexus->transport; + ctx = nexus->transport_hndl->ctx; + + nexus->new_transport_hndl = transport->open(nexus->transport, ctx, + &nexus->trans_observer, + nexus->trans_attr_mask, + &nexus->trans_attr); + + if (!nexus->new_transport_hndl) { + ERROR_LOG("transport open failed\n"); + return -1; + } + + retval = transport->connect(nexus->new_transport_hndl, + nexus->portal_uri, + nexus->out_if_addr); + + if (retval != 0) { + /* ignore close notification */ + xio_observable_unreg_observer( + &nexus->new_transport_hndl->observable, + &nexus->trans_observer); + + transport->close(nexus->new_transport_hndl); + nexus->new_transport_hndl = NULL; + ERROR_LOG("transport reconnect failed\n"); + return -1; + } + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_notify_observer_work */ +/*---------------------------------------------------------------------------*/ +static void xio_nexus_notify_observer_work(void *_work_params) +{ + struct xio_nexus_observer_work *work_params = + (struct xio_nexus_observer_work *) _work_params; + xio_observable_notify_observer(work_params->observer_event.observable, + work_params->observer_event.observer, + work_params->observer_event.event, + work_params->observer_event.event_data); + xio_ctx_set_work_destructor(work_params->ctx, + work_params, + (void (*)(void *))kfree, + &work_params->observer_work); +} + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_connect */ +/*---------------------------------------------------------------------------*/ +int xio_nexus_connect(struct xio_nexus *nexus, const char *portal_uri, + struct xio_observer *observer, const char *out_if) +{ + int retval; + struct xio_nexus_observer_work *work_params; + + if (!nexus->transport->connect) { + ERROR_LOG("transport does not implement \"connect\"\n"); + xio_set_error(ENOSYS); + return -1; + } + mutex_lock(&nexus->lock_connect); + switch (nexus->state) { + case XIO_NEXUS_STATE_OPEN: + /* for reconnect */ + nexus->portal_uri = kstrdup(portal_uri, GFP_KERNEL); + if (!nexus->portal_uri) { + ERROR_LOG("memory alloc failed\n"); + xio_set_error(ENOMEM); + goto cleanup1; + } + if (out_if) { + nexus->out_if_addr = kstrdup(out_if, GFP_KERNEL); + if (!nexus->out_if_addr) { + ERROR_LOG("memory alloc failed\n"); + xio_set_error(ENOMEM); + goto cleanup2; + } + } + TRACE_LOG("%s: nexus:%p, rdma_hndl:%p, portal:%s\n", __func__, + nexus, nexus->transport_hndl, portal_uri); + retval = nexus->transport->connect(nexus->transport_hndl, + portal_uri, + out_if); + if (retval != 0) + goto cleanup3; + nexus->state = XIO_NEXUS_STATE_CONNECTING; + break; + case XIO_NEXUS_STATE_CONNECTED: + /* moving the notification to the ctx the nexus is running on + * to avoid session_setup_request from being sent on another thread + */ + work_params = (struct xio_nexus_observer_work *) + kmalloc(sizeof(*work_params), GFP_KERNEL); + if (unlikely(!work_params)) { + ERROR_LOG("failed to allocate memory\n"); + goto cleanup1; + } + work_params->observer_event.observer = observer; + work_params->observer_event.observable = &nexus->observable; + work_params->observer_event.event = XIO_NEXUS_EVENT_ESTABLISHED; + work_params->observer_event.event_data = NULL; + work_params->ctx = nexus->transport_hndl->ctx; + 
xio_ctx_add_work(nexus->transport_hndl->ctx, + work_params, + xio_nexus_notify_observer_work, + &work_params->observer_work); + break; + default: + break; + } + mutex_unlock(&nexus->lock_connect); + + return 0; + +cleanup3: + kfree(nexus->out_if_addr); + nexus->out_if_addr = NULL; +cleanup2: + kfree(nexus->portal_uri); + nexus->portal_uri = NULL; +cleanup1: + ERROR_LOG("transport connect failed\n"); + mutex_unlock(&nexus->lock_connect); + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_listen */ +/*---------------------------------------------------------------------------*/ +int xio_nexus_listen(struct xio_nexus *nexus, const char *portal_uri, + uint16_t *src_port, int backlog) +{ + int retval; + + if (!nexus->transport->listen) { + ERROR_LOG("transport does not implement \"listen\"\n"); + xio_set_error(ENOSYS); + return -1; + } + if (nexus->state == XIO_NEXUS_STATE_OPEN) { + /* do not hold the listener nexus in storage */ + xio_nexus_cache_remove(nexus->cid); + retval = nexus->transport->listen(nexus->transport_hndl, + portal_uri, src_port, + backlog); + if (retval != 0) { + DEBUG_LOG("transport listen failed. uri:[%s]\n", + portal_uri); + return -1; + } + nexus->state = XIO_NEXUS_STATE_LISTEN; + nexus->is_listener = 1; + } + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_accept */ +/*---------------------------------------------------------------------------*/ +int xio_nexus_accept(struct xio_nexus *nexus) +{ + int retval; + + if (!nexus->transport->accept) { + ERROR_LOG("transport does not implement \"accept\"\n"); + xio_set_error(ENOSYS); + return -1; + } + if (nexus->state == XIO_NEXUS_STATE_OPEN) { + retval = nexus->transport->accept(nexus->transport_hndl); + if (retval != 0) { + ERROR_LOG("transport accept failed.\n"); + return -1; + } + } + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_reject */ +/*---------------------------------------------------------------------------*/ +int xio_nexus_reject(struct xio_nexus *nexus) +{ + int retval; + + if (!nexus->transport->reject) { + ERROR_LOG("transport does not implement \"reject\"\n"); + xio_set_error(ENOSYS); + return -1; + } + if (nexus->state == XIO_NEXUS_STATE_OPEN) { + retval = nexus->transport->reject(nexus->transport_hndl); + if (retval != 0) { + ERROR_LOG("transport reject failed.\n"); + return -1; + } + } + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_delayed_close */ +/*---------------------------------------------------------------------------*/ +static void xio_nexus_delayed_close(struct kref *kref) +{ + struct xio_nexus *nexus = container_of(kref, + struct xio_nexus, + kref); + int retval; + + TRACE_LOG("xio_nexus_delayed close. 
nexus:%p, state:%d\n", + nexus, nexus->state); + + switch (nexus->state) { + case XIO_NEXUS_STATE_LISTEN: + /* the listener nexus, called from xio_unbind */ + case XIO_NEXUS_STATE_ERROR: + case XIO_NEXUS_STATE_DISCONNECTED: + xio_nexus_release(nexus); + break; + default: + /* only client shall cause disconnection */ + retval = xio_ctx_add_delayed_work( + nexus->transport_hndl->ctx, + g_options.transport_close_timeout, nexus, + xio_nexus_release_cb, + &nexus->close_time_hndl); + if (retval) + ERROR_LOG("xio_nexus_delayed_close failed\n"); + break; + } +} + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_close */ +/*---------------------------------------------------------------------------*/ +void xio_nexus_close(struct xio_nexus *nexus, struct xio_observer *observer) +{ + TRACE_LOG("nexus: [putref] ptr:%p, refcnt:%d\n", nexus, + atomic_read(&nexus->kref.refcount)); + + if (observer) { + xio_nexus_notify_observer( + nexus, observer, + XIO_NEXUS_EVENT_CLOSED, NULL); + xio_nexus_unreg_observer(nexus, observer); + } + kref_put(&nexus->kref, xio_nexus_delayed_close); +} + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_flush_tx_queue */ +/*---------------------------------------------------------------------------*/ +static int xio_nexus_flush_tx_queue(struct xio_nexus *nexus) +{ + struct xio_task *ptask, *next_ptask; + + list_for_each_entry_safe(ptask, next_ptask, &nexus->tx_queue, + tasks_list_entry) { + TRACE_LOG("flushing task %p type 0x%x\n", + ptask, ptask->tlv_type); + if (ptask->sender_task) { + xio_tasks_pool_put(ptask->sender_task); + ptask->sender_task = NULL; + } + xio_tasks_pool_put(ptask); + } + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_xmit */ +/*---------------------------------------------------------------------------*/ +static int xio_nexus_xmit(struct xio_nexus *nexus) +{ + int retval = 0; + struct xio_task *task; + + if (!nexus->transport) { + ERROR_LOG("transport not initialized\n"); + return -1; + } + if (!nexus->transport->send) + return 0; + + while (1) { + if (list_empty(&nexus->tx_queue)) + break; + + task = list_first_entry(&nexus->tx_queue, + struct xio_task, tasks_list_entry); + retval = nexus->transport->send(nexus->transport_hndl, task); + if (retval != 0) { + union xio_nexus_event_data nexus_event_data; + + if (xio_errno() == EAGAIN) + return 0; + + ERROR_LOG("transport send failed err:%d\n", + xio_errno()); + nexus_event_data.msg_error.reason = + (enum xio_status)xio_errno(); + nexus_event_data.msg_error.direction = + XIO_MSG_DIRECTION_OUT; + nexus_event_data.msg_error.task = task; + + /* special error for connection */ + xio_set_error(ENOMSG); + retval = -ENOMSG; + + xio_observable_notify_any_observer( + &nexus->observable, + XIO_NEXUS_EVENT_MESSAGE_ERROR, + &nexus_event_data); + break; + } + } + + return retval; +} + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_send */ +/*---------------------------------------------------------------------------*/ +int xio_nexus_send(struct xio_nexus *nexus, struct xio_task *task) +{ + int retval; + + if (!nexus->transport) { + ERROR_LOG("transport not initialized\n"); + return -1; + } + if (!nexus->transport->send) + return 0; + + /* push to end of the queue */ + list_move_tail(&task->tasks_list_entry, &nexus->tx_queue); + + /* xmit it to the transport */ + retval = xio_nexus_xmit(nexus); + + return retval; +} + 
+/*---------------------------------------------------------------------------*/ +/* xio_nexus_poll */ +/*---------------------------------------------------------------------------*/ +int xio_nexus_poll(struct xio_nexus *nexus, long min_nr, long nr, + struct timespec *timeout) +{ + int retval = 0; + + if (nexus->transport->poll) { + retval = nexus->transport->poll(nexus->transport_hndl, + min_nr, nr, timeout); + if (retval < 0) { + ERROR_LOG("transport poll failed\n"); + return -1; + } + } + + return retval; +} + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_set_opt */ +/*---------------------------------------------------------------------------*/ +int xio_nexus_set_opt(struct xio_nexus *nexus, int optname, const void *optval, + int optlen) +{ + if (nexus->transport->set_opt) + return nexus->transport->set_opt(nexus->transport_hndl, + optname, optval, optlen); + + xio_set_error(XIO_E_NOT_SUPPORTED); + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_get_opt */ +/*---------------------------------------------------------------------------*/ +int xio_nexus_get_opt(struct xio_nexus *nexus, int optname, void *optval, + int *optlen) +{ + if (nexus->transport->get_opt) + return nexus->transport->get_opt(nexus->transport_hndl, + optname, optval, optlen); + + xio_set_error(XIO_E_NOT_SUPPORTED); + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_modify */ +/*---------------------------------------------------------------------------*/ +int xio_nexus_modify(struct xio_nexus *nexus, + struct xio_nexus_attr *attr, int attr_mask) +{ + int tattr_mask = 0; + struct xio_transport_attr tattr; + + if (!nexus->transport->modify) + goto not_supported; + + memset(&tattr, 0, sizeof(tattr)); + if (test_flag(XIO_NEXUS_ATTR_TOS, &attr_mask)) { + tattr_mask |= XIO_TRANSPORT_ATTR_TOS; + tattr.tos = attr->tos; + } + if (tattr_mask == 0) + goto not_supported; + + return nexus->transport->modify(nexus->transport_hndl, + &tattr, tattr_mask); +not_supported: + xio_set_error(XIO_E_NOT_SUPPORTED); + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_query */ +/*---------------------------------------------------------------------------*/ +int xio_nexus_query(struct xio_nexus *nexus, + struct xio_nexus_attr *attr, int attr_mask) +{ + int tattr_mask = 0, retval; + struct xio_transport_attr tattr; + + if (!nexus->transport->query) + goto not_supported; + + memset(&tattr, 0, sizeof(tattr)); + if (test_flag(XIO_NEXUS_ATTR_TOS, &attr_mask)) + tattr_mask |= XIO_TRANSPORT_ATTR_TOS; + + if (tattr_mask == 0) + goto not_supported; + + retval = nexus->transport->query(nexus->transport_hndl, + &tattr, tattr_mask); + if (retval) + return -1; + + if (test_flag(XIO_NEXUS_ATTR_TOS, &attr_mask)) + attr->tos = tattr.tos; + + return 0; + +not_supported: + xio_set_error(XIO_E_NOT_SUPPORTED); + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_get_peer_addr */ +/*---------------------------------------------------------------------------*/ +int xio_nexus_get_peer_addr(struct xio_nexus *nexus, + struct sockaddr_storage *sa, socklen_t len) +{ + memcpy(sa, &nexus->transport_hndl->peer_addr, len); + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_get_local_addr */ 
+/*---------------------------------------------------------------------------*/ +int xio_nexus_get_local_addr(struct xio_nexus *nexus, + struct sockaddr_storage *sa, socklen_t len) +{ + memcpy(sa, &nexus->transport_hndl->local_addr, len); + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_cancel_req */ +/*---------------------------------------------------------------------------*/ +int xio_nexus_cancel_req(struct xio_nexus *nexus, struct xio_msg *req, + uint64_t stag, void *ulp_msg, size_t ulp_msg_sz) +{ + if (nexus->transport->cancel_req) + return nexus->transport->cancel_req(nexus->transport_hndl, + req, stag, + ulp_msg, ulp_msg_sz); + + xio_set_error(XIO_E_NOT_SUPPORTED); + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_cancel_rsp */ +/*---------------------------------------------------------------------------*/ +int xio_nexus_cancel_rsp(struct xio_nexus *nexus, struct xio_task *task, + enum xio_status result, void *ulp_msg, + size_t ulp_msg_sz) +{ + if (nexus->transport->cancel_rsp) + return nexus->transport->cancel_rsp(nexus->transport_hndl, + task, result, + ulp_msg, ulp_msg_sz); + xio_set_error(XIO_E_NOT_SUPPORTED); + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_server_reconnect_timeout */ +/*---------------------------------------------------------------------------*/ +static void xio_nexus_server_reconnect_timeout(void *data) +{ + struct xio_nexus *nexus = (struct xio_nexus *)data; + + /* No reconnect within timeout */ + nexus->state = XIO_NEXUS_STATE_DISCONNECTED; + TRACE_LOG("nexus state changed to disconnected\n"); + xio_nexus_flush_tx_queue(nexus); + xio_observable_notify_all_observers(&nexus->observable, + XIO_NEXUS_EVENT_DISCONNECTED, + NULL); +} + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_server_reconnect */ +/*---------------------------------------------------------------------------*/ +static int xio_nexus_server_reconnect(struct xio_nexus *nexus) +{ + int retval; + + if (nexus->state != XIO_NEXUS_STATE_CONNECTED) + return -1; + + xio_nexus_state_set(nexus, XIO_NEXUS_STATE_RECONNECT); + + xio_observable_notify_all_observers(&nexus->observable, + XIO_NEXUS_EVENT_RECONNECTING, + NULL); + + /* Just wait and see if some client tries to reconnect */ + retval = xio_ctx_add_delayed_work(nexus->transport_hndl->ctx, + XIO_SERVER_TIMEOUT, nexus, + xio_nexus_server_reconnect_timeout, + &nexus->close_time_hndl); + return retval; +} + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_client_reconnect_timeout */ +/*---------------------------------------------------------------------------*/ +static void xio_nexus_client_reconnect_timeout(void *data) +{ + struct xio_nexus *nexus = (struct xio_nexus *)data; + int retval; + + /* Try to reconnect after the waiting period */ + retval = xio_nexus_reconnect(nexus); + if (!retval) { + TRACE_LOG("reconnect succeeded\n"); + return; + } + + if (nexus->reconnect_retries) { + nexus->reconnect_retries--; + retval = xio_ctx_add_delayed_work( + nexus->transport_hndl->ctx, + xio_msecs[nexus->reconnect_retries], + nexus, + xio_nexus_client_reconnect_timeout, + &nexus->close_time_hndl); + } else { + /* retries number exceeded */ + nexus->state = XIO_NEXUS_STATE_DISCONNECTED; + TRACE_LOG("nexus state changed to disconnected\n"); + xio_nexus_flush_tx_queue(nexus); + 
xio_observable_notify_all_observers( + &nexus->observable, + XIO_NEXUS_EVENT_DISCONNECTED, + NULL); + } +} + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_client_reconnect_failed */ +/*---------------------------------------------------------------------------*/ +static void xio_nexus_client_reconnect_failed(void *data) +{ + struct xio_nexus *nexus = (struct xio_nexus *)data; + int retval; + + retval = xio_nexus_prep_new_transport(nexus); + if (retval != 0) + ERROR_LOG("prep new transport failed\n"); + + /* Failed to reconnect (connect was called) */ + if (nexus->reconnect_retries) { + nexus->reconnect_retries--; + retval = xio_ctx_add_delayed_work( + nexus->transport_hndl->ctx, + xio_msecs[nexus->reconnect_retries], + nexus, + xio_nexus_client_reconnect_timeout, + &nexus->close_time_hndl); + if (retval) + ERROR_LOG("adding delayed work failed\n"); + } else { + /* retries number exceeded */ + nexus->state = XIO_NEXUS_STATE_DISCONNECTED; + TRACE_LOG("nexus state changed to disconnected\n"); + xio_nexus_flush_tx_queue(nexus); + xio_observable_notify_all_observers( + &nexus->observable, + XIO_NEXUS_EVENT_DISCONNECTED, + NULL); + } +} + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_client_reconnect */ +/*---------------------------------------------------------------------------*/ +static int xio_nexus_client_reconnect(struct xio_nexus *nexus) +{ + /* With client we do an exponential back-off first delay is 0 */ + int retval; + + if (nexus->state != XIO_NEXUS_STATE_CONNECTED) + return -1; + + if (!nexus->transport->dup2) + return -1; + + if (nexus->state == XIO_NEXUS_STATE_RECONNECT){ + return 0; + } + + xio_nexus_state_set(nexus, XIO_NEXUS_STATE_RECONNECT); + + xio_observable_notify_all_observers(&nexus->observable, + XIO_NEXUS_EVENT_RECONNECTING, + NULL); + + /* All portal_uri and out_if were saved in the nexus + * observer is not used in this flow + */ + + /* Three retries but vector start from 0 */ + nexus->reconnect_retries = 3; + /* Try to reconnect immediately + * Note connect may succeed but we may get a reject */ + retval = xio_nexus_reconnect(nexus); + if (!retval) + return 0; + + nexus->reconnect_retries = 2; + retval = xio_ctx_add_delayed_work(nexus->transport_hndl->ctx, + xio_msecs[nexus->reconnect_retries], + nexus, + xio_nexus_client_reconnect_timeout, + &nexus->close_time_hndl); + + return retval; +} + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_update_task */ +/*---------------------------------------------------------------------------*/ +int xio_nexus_update_task(struct xio_nexus *nexus, struct xio_task *task) +{ + /* transport may not need to update tasks */ + if (!nexus->transport->update_task) + return 0; + + if (nexus->transport->update_task(nexus->transport_hndl, task)) + return -1; + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_update_rkey */ +/*---------------------------------------------------------------------------*/ +int xio_nexus_update_rkey(struct xio_nexus *nexus, + uint32_t *rkey) +{ + if (!nexus->transport->update_rkey) + return 0; + + if (nexus->transport->update_rkey(nexus->transport_hndl, rkey)) + return -1; + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_set_server */ +/*---------------------------------------------------------------------------*/ +void 
xio_nexus_set_server(struct xio_nexus *nexus, struct xio_server *server) +{ + nexus->server = server; + if (server) + xio_server_reg_observer(server, &nexus->srv_observer); +} diff --git a/open_src/xio/src/common/xio_nexus.h b/open_src/xio/src/common/xio_nexus.h new file mode 100644 index 0000000..d9b6989 --- /dev/null +++ b/open_src/xio/src/common/xio_nexus.h @@ -0,0 +1,402 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ +#ifndef XIO_NEXUS_H +#define XIO_NEXUS_H + +#ifdef __cplusplus +extern "C" { +#endif + +/*---------------------------------------------------------------------------*/ +/* typedefs */ +/*---------------------------------------------------------------------------*/ +struct xio_nexus; + +/*---------------------------------------------------------------------------*/ +/* enum */ +/*---------------------------------------------------------------------------*/ +enum xio_nexus_event { + XIO_NEXUS_EVENT_NEW_CONNECTION, + XIO_NEXUS_EVENT_ESTABLISHED, + XIO_NEXUS_EVENT_DISCONNECTED, + XIO_NEXUS_EVENT_RECONNECTING, + XIO_NEXUS_EVENT_RECONNECTED, + XIO_NEXUS_EVENT_CLOSED, + XIO_NEXUS_EVENT_REFUSED, + XIO_NEXUS_EVENT_NEW_MESSAGE, + XIO_NEXUS_EVENT_SEND_COMPLETION, + XIO_NEXUS_EVENT_ASSIGN_IN_BUF, + XIO_NEXUS_EVENT_ALLOC_HEAD_BUF, + XIO_NEXUS_EVENT_ALLOC_DATA_BUF, + XIO_NEXUS_EVENT_CANCEL_REQUEST, + XIO_NEXUS_EVENT_CANCEL_RESPONSE, + XIO_NEXUS_EVENT_ERROR, + XIO_NEXUS_EVENT_MESSAGE_ERROR, + XIO_NEXUS_EVENT_DIRECT_RDMA_COMPLETION, +}; + +enum xio_nexus_state { + XIO_NEXUS_STATE_INIT, + XIO_NEXUS_STATE_OPEN, + XIO_NEXUS_STATE_LISTEN, + XIO_NEXUS_STATE_CONNECTING, + XIO_NEXUS_STATE_CONNECTED, + XIO_NEXUS_STATE_REJECTED, + XIO_NEXUS_STATE_CLOSED, + XIO_NEXUS_STATE_DISCONNECTED, + XIO_NEXUS_STATE_RECONNECT, + XIO_NEXUS_STATE_ERROR +}; + +enum xio_nexus_attr_mask { + XIO_NEXUS_ATTR_TOS = 1 << 0 +}; + +/*---------------------------------------------------------------------------*/ +/* structs */ +/*---------------------------------------------------------------------------*/ +union xio_nexus_event_data { + struct { + struct xio_task *task; + enum xio_wc_op op; + int pad; + } msg; + struct { + struct xio_task *task; + int is_assigned; + int pad; + } assign_in_buf; + struct { + struct xio_task *task; + enum xio_status reason; + enum xio_msg_direction direction; + } msg_error; + struct { + struct xio_nexus *child_nexus; + } new_nexus; + struct { + enum xio_status reason; + enum xio_msg_direction direction; + } error; + struct { + struct xio_task *task; + enum xio_status result; + int pad; + void *ulp_msg; + size_t ulp_msg_sz; + } cancel; + struct { + struct xio_task *task; + struct xio_iovec *header; + int is_assigned; + int pad; + } alloc_head_buf; + struct { + struct xio_task *task; + struct xio_vmsg *in; /**< incoming side of message */; + int is_assigned; + int pad; + } alloc_data_buf; +}; + +struct xio_nexus_attr { + uint8_t tos; /**< type of service RFC 2474 */ + uint8_t pad[3]; +}; + +struct xio_nexus_init_attr { + uint8_t tos; /**< type of service RFC 2474 */ + uint8_t pad[3]; +}; + +/** + * Connection data type + */ +struct xio_nexus { + struct xio_transport *transport; + struct xio_transport_base *transport_hndl; + + struct xio_tasks_pool *primary_tasks_pool; + struct xio_tasks_pool *initial_tasks_pool; + + struct xio_observer trans_observer; + struct xio_observer ctx_observer; + struct xio_observer srv_observer; + struct xio_observable observable; + struct kref kref; + + int cid; + enum xio_nexus_state state; + short is_first_req; + short reconnect_retries; + int is_listener; + int srq_enabled; + xio_delayed_work_handle_t close_time_hndl; + + struct list_head observers_htbl; + struct list_head tx_queue; + struct xio_server *server; + + /* Client side for reconnect */ + int server_cid; + int server_cid_pad; + struct xio_transport_base *new_transport_hndl; + char *portal_uri; + char *out_if_addr; + uint32_t trans_attr_mask; + struct xio_transport_init_attr trans_attr; + struct xio_ev_data 
destroy_event; + struct xio_ev_data trans_error_event; + spinlock_t nexus_obs_lock; + int pad2; + struct mutex lock_connect; /* lock nexus connect */ + + HT_ENTRY(xio_nexus, xio_key_int32) nexus_htbl; +}; + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_close */ +/*---------------------------------------------------------------------------*/ +void xio_nexus_close(struct xio_nexus *nexus, struct xio_observer *observer); + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_open */ +/*---------------------------------------------------------------------------*/ +struct xio_nexus *xio_nexus_open(struct xio_context *ctx, + const char *portal_uri, + struct xio_observer *observer, + uint32_t oid, + uint32_t attr_mask, + struct xio_nexus_init_attr *init_attr); + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_connect */ +/*---------------------------------------------------------------------------*/ +int xio_nexus_connect(struct xio_nexus *nexus, const char *portal_uri, + struct xio_observer *observer, + const char *out_if); + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_listen */ +/*---------------------------------------------------------------------------*/ +int xio_nexus_listen(struct xio_nexus *nexus, const char *portal_uri, + uint16_t *src_port, int backlog); + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_accept */ +/*---------------------------------------------------------------------------*/ +int xio_nexus_accept(struct xio_nexus *nexus); + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_reject */ +/*---------------------------------------------------------------------------*/ +int xio_nexus_reject(struct xio_nexus *nexus); + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_poll */ +/*---------------------------------------------------------------------------*/ +int xio_nexus_poll(struct xio_nexus *nexus, + long min_nr, long nr, struct timespec *timeout); + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_send */ +/*---------------------------------------------------------------------------*/ +int xio_nexus_send(struct xio_nexus *nexus, struct xio_task *task); + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_cancel_req */ +/*---------------------------------------------------------------------------*/ +int xio_nexus_cancel_req(struct xio_nexus *nexus, + struct xio_msg *req, uint64_t stag, + void *ulp_msg, size_t ulp_msg_sz); +/*---------------------------------------------------------------------------*/ +/* xio_nexus_cancel_rsp */ +/*---------------------------------------------------------------------------*/ +int xio_nexus_cancel_rsp(struct xio_nexus *nexus, + struct xio_task *task, enum xio_status result, + void *ulp_msg, size_t ulp_msg_sz); + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_set_opt */ +/*---------------------------------------------------------------------------*/ +int xio_nexus_set_opt(struct xio_nexus *nexus, int optname, + const void *optval, int optlen); + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_get_opt */ 
+/*---------------------------------------------------------------------------*/ +int xio_nexus_get_opt(struct xio_nexus *nexus, int optname, + void *optval, int *optlen); + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_get_primary_task */ +/*---------------------------------------------------------------------------*/ +struct xio_task *xio_nexus_get_primary_task(struct xio_nexus *nexus); + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_primary_free_tasks */ +/*---------------------------------------------------------------------------*/ +int xio_nexus_primary_free_tasks(struct xio_nexus *nexus); + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_set_server */ +/*---------------------------------------------------------------------------*/ +void xio_nexus_set_server(struct xio_nexus *nexus, struct xio_server *server); + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_reg_observer */ +/*---------------------------------------------------------------------------*/ +void xio_nexus_reg_observer(struct xio_nexus *nexus, + struct xio_observer *observer, + uint32_t oid); + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_unreg_observer */ +/*---------------------------------------------------------------------------*/ +void xio_nexus_unreg_observer(struct xio_nexus *nexus, + struct xio_observer *observer); + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_observer_lookup */ +/*---------------------------------------------------------------------------*/ +struct xio_observer *xio_nexus_observer_lookup(struct xio_nexus *nexus, + uint32_t id); + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_notify_observer */ +/*---------------------------------------------------------------------------*/ +static inline void xio_nexus_notify_observer( + struct xio_nexus *nexus, + struct xio_observer *observer, + int event, void *event_data) +{ + xio_observable_notify_observer(&nexus->observable, observer, + event, event_data); +} + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_get_peer_addr */ +/*---------------------------------------------------------------------------*/ +int xio_nexus_get_peer_addr(struct xio_nexus *nexus, + struct sockaddr_storage *sa, socklen_t len); + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_get_local_addr */ +/*---------------------------------------------------------------------------*/ +int xio_nexus_get_local_addr(struct xio_nexus *nexus, + struct sockaddr_storage *sa, socklen_t len); + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_get_validators_cls */ +/*---------------------------------------------------------------------------*/ +static inline +struct xio_transport_msg_validators_cls *xio_nexus_get_validators_cls( + struct xio_nexus *nexus) +{ + return &nexus->transport->validators_cls; +} + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_get_proto */ +/*---------------------------------------------------------------------------*/ +static inline int xio_nexus_get_proto(struct xio_nexus *nexus) +{ + return nexus->transport_hndl->proto; +} + 
+/*---------------------------------------------------------------------------*/ +/* xio_nexus_addref */ +/*---------------------------------------------------------------------------*/ +static inline void xio_nexus_addref(struct xio_nexus *nexus) +{ + if (xio_is_delayed_work_pending(&nexus->close_time_hndl)) { + kref_init(&nexus->kref); + xio_ctx_del_delayed_work(nexus->transport_hndl->ctx, + &nexus->close_time_hndl); + } else { + kref_get(&nexus->kref); + } +} + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_state_get */ +/*---------------------------------------------------------------------------*/ +static inline enum xio_nexus_state xio_nexus_state_get(struct xio_nexus *nexus) +{ + return nexus->state; +} + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_state_set */ +/*---------------------------------------------------------------------------*/ +static inline void xio_nexus_state_set(struct xio_nexus *nexus, + enum xio_nexus_state state) +{ + nexus->state = state; +} + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_update_task */ +/*---------------------------------------------------------------------------*/ +int xio_nexus_update_task(struct xio_nexus *nexus, struct xio_task *task); + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_update_rkey */ +/*---------------------------------------------------------------------------*/ +int xio_nexus_update_rkey(struct xio_nexus *nexus, + uint32_t *rkey); + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_modify */ +/*---------------------------------------------------------------------------*/ +int xio_nexus_modify(struct xio_nexus *nexus, + struct xio_nexus_attr *attr, + int attr_mask); + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_query */ +/*---------------------------------------------------------------------------*/ +int xio_nexus_query(struct xio_nexus *nexus, + struct xio_nexus_attr *attr, + int attr_mask); + +#ifdef __cplusplus +} +#endif + +#endif /*XIO_NEXUS_H */ diff --git a/open_src/xio/src/common/xio_nexus_cache.c b/open_src/xio/src/common/xio_nexus_cache.c new file mode 100644 index 0000000..3c0cd16 --- /dev/null +++ b/open_src/xio/src/common/xio_nexus_cache.c @@ -0,0 +1,188 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#include +#include "libxio.h" +#include +#include "xio_log.h" +#include "xio_common.h" +#include "xio_hash.h" +#include "xio_protocol.h" +#include "xio_mbuf.h" +#include "xio_task.h" +#include "xio_observer.h" +#include "xio_objpool.h" +#include "xio_workqueue.h" +#include "xio_ev_data.h" +#include "xio_context.h" +#include "xio_transport.h" +#include "xio_transport.h" +#include "xio_nexus.h" +#include "xio_nexus_cache.h" + +static HT_HEAD(, xio_nexus, HASHTABLE_PRIME_SMALL) nexus_cache; +static spinlock_t cs_lock; + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_cache_add */ +/*---------------------------------------------------------------------------*/ +static int nexus_cache_add(struct xio_nexus *nexus, int nexus_id) +{ + struct xio_nexus *c; + struct xio_key_int32 key = { + .id = nexus_id, + .pad = {0}, + }; + + HT_LOOKUP(&nexus_cache, &key, c, nexus_htbl); + if (c) + return -1; + + HT_INSERT(&nexus_cache, &key, nexus, nexus_htbl); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_cache_remove */ +/*---------------------------------------------------------------------------*/ +int xio_nexus_cache_remove(int nexus_id) +{ + struct xio_nexus *c; + struct xio_key_int32 key; + + spin_lock(&cs_lock); + key.id = nexus_id; + HT_LOOKUP(&nexus_cache, &key, c, nexus_htbl); + if (!c) { + spin_unlock(&cs_lock); + return -1; + } + + HT_REMOVE(&nexus_cache, c, xio_nexus, nexus_htbl); + spin_unlock(&cs_lock); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_cache_lookup */ +/*---------------------------------------------------------------------------*/ +struct xio_nexus *xio_nexus_cache_lookup(int nexus_id) +{ + struct xio_nexus *c; + struct xio_key_int32 key; + + spin_lock(&cs_lock); + key.id = nexus_id; + HT_LOOKUP(&nexus_cache, &key, c, nexus_htbl); + spin_unlock(&cs_lock); + + return c; +} + +/*---------------------------------------------------------------------------*/ +/* xio_nexus_cache_add */ +/*---------------------------------------------------------------------------*/ +int xio_nexus_cache_add(struct xio_nexus *nexus, + int *nexus_id) +{ + static int cid; /* = 0 global nexus provider */ + int retval; + + spin_lock(&cs_lock); + retval = nexus_cache_add(nexus, cid); + if (retval == 0) + *nexus_id = cid++; + spin_unlock(&cs_lock); + + return retval; +} + 
+/*---------------------------------------------------------------------------*/ +/* xio_nexus_cache_find */ +/*---------------------------------------------------------------------------*/ +struct xio_nexus *xio_nexus_cache_find(struct xio_nexus_query_params *query) +{ + struct xio_nexus *nexus; + int tos_enabled; + + spin_lock(&cs_lock); + HT_FOREACH(nexus, &nexus_cache, nexus_htbl) { + if (nexus->transport_hndl->portal_uri) { + if ((strcmp(nexus->transport_hndl->portal_uri, + query->portal_uri) != 0) || + (nexus->transport_hndl->ctx != query->ctx)) + continue; + + tos_enabled = test_bits(XIO_NEXUS_ATTR_TOS, + &nexus->trans_attr_mask); + if (tos_enabled != query->tos_enabled) + continue; + if (tos_enabled && nexus->trans_attr.tos != query->tos) + continue; + + /* match found */ + xio_nexus_addref(nexus); + + TRACE_LOG("nexus: [addref] nexus:%p, refcnt:%d\n", nexus, + atomic_read(&nexus->kref.refcount)); + goto done; + } + } + nexus = NULL; + +done: + spin_unlock(&cs_lock); + return nexus; +} + +/*---------------------------------------------------------------------------*/ +/* nexus_cache_construct */ +/*---------------------------------------------------------------------------*/ +void nexus_cache_construct(void) +{ + HT_INIT(&nexus_cache, xio_int32_hash, xio_int32_cmp, xio_int32_cp); + spin_lock_init(&cs_lock); +} + +/* +void nexus_cache_destruct(void) +{ +} +*/ + diff --git a/open_src/xio/src/common/xio_nexus_cache.h b/open_src/xio/src/common/xio_nexus_cache.h new file mode 100644 index 0000000..f5a2678 --- /dev/null +++ b/open_src/xio/src/common/xio_nexus_cache.h @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ +#ifndef XIO_NEXUS_CACHE_H +#define XIO_NEXUS_CACHE_H + +/*---------------------------------------------------------------------------*/ +/* forward declarations */ +/*---------------------------------------------------------------------------*/ +struct xio_nexus_mgr; +struct xio_nexus; + +struct xio_session; + +struct xio_nexus_query_params { + struct xio_context *ctx; + const char *portal_uri; + int tos_enabled; + uint8_t tos; + uint8_t reserved[3]; +}; + +/*---------------------------------------------------------------------------*/ +/* nexus_cache_construct */ +/*---------------------------------------------------------------------------*/ +void nexus_cache_construct(void); + +int xio_nexus_cache_add( + struct xio_nexus *nexus, + int *nexus_id); + +int xio_nexus_cache_remove( + int session_id); + +struct xio_nexus *xio_nexus_cache_lookup( + int nexus_id); + +struct xio_nexus *xio_nexus_cache_find(struct xio_nexus_query_params *query); + +#endif /*XIO_NEXUS_CACHE_H */ + diff --git a/open_src/xio/src/common/xio_objpool.c b/open_src/xio/src/common/xio_objpool.c new file mode 100644 index 0000000..ba14706 --- /dev/null +++ b/open_src/xio/src/common/xio_objpool.c @@ -0,0 +1,179 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ +#include +#include "xio_objpool.h" +#include +/*---------------------------------------------------------------------------*/ +/* structures */ +/*---------------------------------------------------------------------------*/ +struct xio_mem_chunk { + struct list_head chunk_entry; +}; + +struct xio_mem_obj { + void *obj; + struct list_head chain_entry; + struct xio_objpool *pool; +}; + +struct xio_objpool { + struct list_head free_list; /* list of xio_mem_obj */ + struct list_head used_list; /* list of xio_mem_obj */ + struct list_head chunks_list; /* list of mem chunks */ + uint64_t obj_size; /* obj size */ + uint64_t grow_nr; /* obj to realloc in pool */ + uint64_t total_nr; /* total objs in pool */ +}; + +/*---------------------------------------------------------------------------*/ +/* xio_objpool_realloc */ +/*---------------------------------------------------------------------------*/ +static int xio_objpool_realloc(struct xio_objpool *p, int size, int n) +{ + struct xio_mem_obj *obj; + struct xio_mem_chunk *chunk; + size_t alloc_sz; + char *buf; + + p->total_nr += n; + + alloc_sz = sizeof(*chunk) + + n*(sizeof(*obj) + sizeof(obj) + size); + + buf = (char *)vzalloc(alloc_sz); + if (!buf) + goto err; + + chunk = (struct xio_mem_chunk *)buf; + + list_add(&chunk->chunk_entry, &p->chunks_list); + + inc_ptr(buf, sizeof(*chunk)); + + obj = (struct xio_mem_obj *)buf; + while (n--) { + obj->obj = sum_to_ptr((void *)obj, sizeof(*obj)); + obj->pool = p; + ((void **)obj->obj)[0] = obj; + inc_ptr(obj->obj, sizeof(void *)); + list_add(&obj->chain_entry, &p->free_list); + obj = (struct xio_mem_obj *)sum_to_ptr((void *)obj->obj, size); + } + + return 0; + +err: + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_objpool_realloc */ +/*---------------------------------------------------------------------------*/ +struct xio_objpool *xio_objpool_create(int size, int init_nr, int grow_nr) +{ + struct xio_objpool *p; + int retval; + + p = (struct xio_objpool *)kcalloc(1, sizeof(*p), GFP_KERNEL); + if (!p) + return NULL; + + p->grow_nr = grow_nr; + p->obj_size = size; + + INIT_LIST_HEAD(&p->free_list); + INIT_LIST_HEAD(&p->used_list); + INIT_LIST_HEAD(&p->chunks_list); + + retval = xio_objpool_realloc(p, size, init_nr); + if (retval == -1) + return NULL; + + return p; +} + +/*---------------------------------------------------------------------------*/ +/* xio_objpool_destroy */ +/*---------------------------------------------------------------------------*/ +void xio_objpool_destroy(struct xio_objpool *p) +{ + struct xio_mem_chunk *chunk; + struct xio_mem_chunk *tmp_chunk; + + list_for_each_entry_safe(chunk, tmp_chunk, + &p->chunks_list, chunk_entry) { + list_del(&chunk->chunk_entry); + vfree(chunk); + } + kfree(p); +} + +/*---------------------------------------------------------------------------*/ +/* xio_objpool_alloc */ +/*---------------------------------------------------------------------------*/ +void *xio_objpool_alloc(struct xio_objpool *p) +{ + struct xio_mem_obj *obj; + struct xio_mem_obj *tmp_obj; + + if (list_empty(&p->free_list) && + xio_objpool_realloc(p, p->obj_size, p->grow_nr) == -1) { + return NULL; + } + /* get first free item from the allocated objs */ + list_for_each_entry_safe(obj, tmp_obj, &p->free_list, chain_entry) { + list_move(&obj->chain_entry, &p->used_list); + return obj->obj; + } + + return NULL; +} + +/*---------------------------------------------------------------------------*/ +/* xio_objpool_free */ 
+/*---------------------------------------------------------------------------*/ +void xio_objpool_free(void *o) +{ + struct xio_mem_obj *obj; + + if (!o) + return; + obj = (struct xio_mem_obj *)(((void **)o)[-1]); + list_move(&obj->chain_entry, &obj->pool->free_list); +} + diff --git a/open_src/xio/src/common/xio_objpool.h b/open_src/xio/src/common/xio_objpool.h new file mode 100644 index 0000000..9c1ecc6 --- /dev/null +++ b/open_src/xio/src/common/xio_objpool.h @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef XIO_OBJPOOL_H +#define XIO_OBJPOOL_H + +/*---------------------------------------------------------------------------*/ +/* opaque data type */ +/*---------------------------------------------------------------------------*/ +struct xio_objpool; + +/** + * create dynamically growing objects pool + * + * @param[in] size size of the object managed by the pool + * @param[in] init_nr initial number of objects to allocate + * @param[in] grow_nr growing number of objects each time the pool + * is empty. 
+ * + * @return pointer to object pool + */ +struct xio_objpool *xio_objpool_create(int size, int init_nr, int grow_nr); + +/** + * destroy objects pool + * + * @param[in] opool pointer to objects pool to be destroyed + * + */ +void xio_objpool_destroy(struct xio_objpool *opool); + +/** + * allocate object from the pool + * + * @param[in] opool pointer to objects pool + * + * @return pointer to single object from the pool + */ +void *xio_objpool_alloc(struct xio_objpool *opool); + +/** + * free object back to the pool + * + * @param[in] obj pointer to object + * + */ +void xio_objpool_free(void *obj); + +#endif /* XIO_OBJPOOL_H */ + diff --git a/open_src/xio/src/common/xio_observer.c b/open_src/xio/src/common/xio_observer.c new file mode 100644 index 0000000..b1713b3 --- /dev/null +++ b/open_src/xio/src/common/xio_observer.c @@ -0,0 +1,273 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ +#include +#include +#include "xio_log.h" +#include "xio_common.h" +#include "xio_observer.h" +#include + +/*---------------------------------------------------------------------------*/ +/* xio_observer_create */ +/*---------------------------------------------------------------------------*/ +struct xio_observer *xio_observer_create(void *impl, notify_fn_t notify) +{ + struct xio_observer *observer; + + observer = (struct xio_observer *) + kcalloc(1, sizeof(struct xio_observer), GFP_KERNEL); + if (!observer) { + xio_set_error(ENOMEM); + return NULL; + } + + observer->impl = impl; + observer->notify = notify; + + return observer; +} + +/*---------------------------------------------------------------------------*/ +/* xio_observer_destroy */ +/*---------------------------------------------------------------------------*/ +void xio_observer_destroy(struct xio_observer *observer) +{ + observer->impl = NULL; + observer->notify = NULL; + + kfree(observer); +} + +/*---------------------------------------------------------------------------*/ +/* xio_observerable_create */ +/*---------------------------------------------------------------------------*/ +struct xio_observable *xio_observable_create(void *impl) +{ + struct xio_observable *observable; + + observable = (struct xio_observable *) + kcalloc(1, sizeof(struct xio_observable), GFP_KERNEL); + if (!observable) { + xio_set_error(ENOMEM); + return NULL; + } + + INIT_LIST_HEAD(&observable->observers_list); + + observable->impl = impl; + + return observable; +} + +/*---------------------------------------------------------------------------*/ +/* xio_observable_destroy */ +/*---------------------------------------------------------------------------*/ +void xio_observable_destroy(struct xio_observable *observable) +{ + INIT_LIST_HEAD(&observable->observers_list); + + observable->impl = NULL; + + kfree(observable); +} + +/*---------------------------------------------------------------------------*/ +/* xio_observable_unreg_all_observers */ +/*---------------------------------------------------------------------------*/ +static struct xio_observer_node *xio_observable_find( + struct xio_observable *observable, + struct xio_observer *observer) +{ + struct xio_observer_node *observer_node, *tmp_observer_node; + + if (observable->observer_node && + observable->observer_node->observer == observer) { + ERROR_LOG("already exist: " \ + "observable:%p, observer:%p\n", + observable, observable->observer_node->observer); + return observable->observer_node; + } + + list_for_each_entry_safe(observer_node, tmp_observer_node, + &observable->observers_list, + observers_list_node) { + if (observer_node->observer == observer) { + ERROR_LOG("already exist: " \ + "observable:%p, observer:%p\n", + observable, observer_node->observer); + observable->observer_node = observer_node; + return observer_node; + } + } + return NULL; +} + +/*---------------------------------------------------------------------------*/ +/* xio_observable_reg_observer */ +/*---------------------------------------------------------------------------*/ +void xio_observable_reg_observer(struct xio_observable *observable, + struct xio_observer *observer) +{ + struct xio_observer_node *observer_node; + + if (xio_observable_find(observable, observer)) { + ERROR_LOG("double registration is forbidden\n"); + return; + } + + observer_node = (struct xio_observer_node *)kcalloc(1, + sizeof(struct xio_observer_node), GFP_KERNEL); + if (!observer_node) { + xio_set_error(ENOMEM); + return; + } + 
observer_node->observer = observer; + + if (list_empty(&observable->observers_list)) + observable->observer_node = observer_node; + else + observable->observer_node = NULL; + + list_add(&observer_node->observers_list_node, + &observable->observers_list); +} +EXPORT_SYMBOL(xio_observable_reg_observer); + +/*---------------------------------------------------------------------------*/ +/* xio_observable_unreg_observer */ +/*---------------------------------------------------------------------------*/ +void xio_observable_unreg_observer(struct xio_observable *observable, + struct xio_observer *observer) +{ + struct xio_observer_node *observer_node, *tmp_observer_node; + + list_for_each_entry_safe(observer_node, tmp_observer_node, + &observable->observers_list, + observers_list_node) { + if (observer == observer_node->observer) { + if (observable->observer_node == observer_node) + observable->observer_node = NULL; + + list_del(&observer_node->observers_list_node); + kfree(observer_node); + break; + } + } +} +EXPORT_SYMBOL(xio_observable_unreg_observer); + +/*---------------------------------------------------------------------------*/ +/* xio_observable_notify_observer */ +/*---------------------------------------------------------------------------*/ +void xio_observable_notify_observer(struct xio_observable *observable, + struct xio_observer *observer, + int event, void *event_data) +{ + if (likely(observable->impl && observer->impl)) + observer->notify(observer->impl, observable->impl, + event, event_data); + else + DEBUG_LOG("spurious notification " \ + "observable:%p, observer:%p\n", + observable, observer); +} +EXPORT_SYMBOL(xio_observable_notify_observer); + +/*---------------------------------------------------------------------------*/ +/* xio_observable_notify_all_observers */ +/*---------------------------------------------------------------------------*/ +void xio_observable_notify_all_observers(struct xio_observable *observable, + int event, void *event_data) +{ + struct xio_observer_node *observer_node, *tmp_observer_node; + + list_for_each_entry_safe(observer_node, tmp_observer_node, + &observable->observers_list, + observers_list_node) { + if(likely(observable->impl && observer_node->observer->impl)) + observer_node->observer->notify( + observer_node->observer->impl, + observable->impl, event, event_data); + } +} +EXPORT_SYMBOL(xio_observable_notify_all_observers); + +/*---------------------------------------------------------------------------*/ +/* xio_observable_notify_any_observer */ +/*---------------------------------------------------------------------------*/ +void xio_observable_notify_any_observer(struct xio_observable *observable, + int event, void *event_data) +{ + struct xio_observer_node *observer_node, *tmp_observer_node; + + if (likely(observable->observer_node)) { + observable->observer_node->observer->notify( + NULL, + observable->impl, event, event_data); + return; + } + + list_for_each_entry_safe(observer_node, tmp_observer_node, + &observable->observers_list, + observers_list_node) { + observer_node->observer->notify( + NULL, + observable->impl, event, event_data); + observable->observer_node = observer_node; + break; + } +} +EXPORT_SYMBOL(xio_observable_notify_any_observer); + +/*---------------------------------------------------------------------------*/ +/* xio_observable_unreg_all_observers */ +/*---------------------------------------------------------------------------*/ +void xio_observable_unreg_all_observers(struct xio_observable *observable) +{ + 
struct xio_observer_node *observer_node, *tmp_observer_node; + + list_for_each_entry_safe(observer_node, tmp_observer_node, + &observable->observers_list, + observers_list_node) { + list_del(&observer_node->observers_list_node); + kfree(observer_node); + } + observable->observer_node = NULL; +} +EXPORT_SYMBOL(xio_observable_unreg_all_observers); + diff --git a/open_src/xio/src/common/xio_observer.h b/open_src/xio/src/common/xio_observer.h new file mode 100644 index 0000000..97d1ede --- /dev/null +++ b/open_src/xio/src/common/xio_observer.h @@ -0,0 +1,139 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ +#ifndef XIO_OBSERVER_H +#define XIO_OBSERVER_H + +/*---------------------------------------------------------------------------*/ +/* typedefs */ +/*---------------------------------------------------------------------------*/ +typedef int (*notify_fn_t)(void *observer_impl, + void *observable_impl, + int event, void *event_data); + +/*---------------------------------------------------------------------------*/ +/* xio_observer */ +/*---------------------------------------------------------------------------*/ +struct xio_observer { + void *impl; + notify_fn_t notify; +}; + +#define XIO_OBSERVER_INIT(name, obj, notify_fn) \ + { (name)->impl = obj; (name)->notify = notify_fn; } + +#define XIO_OBSERVER_DESTROY(name) \ + { (name)->impl = NULL; (name)->notify = NULL; } + +/*---------------------------------------------------------------------------*/ +/* xio_observer_node */ +/*---------------------------------------------------------------------------*/ +struct xio_observer_node { + struct xio_observer *observer; + struct list_head observers_list_node; +}; + +/*---------------------------------------------------------------------------*/ +/* xio_observerable */ +/*---------------------------------------------------------------------------*/ +struct xio_observable { + void *impl; + struct list_head observers_list; + struct xio_observer_node *observer_node; /* for one observer */ +}; + +struct xio_observer_event{ + struct xio_observer *observer; + struct xio_observable *observable; + void *event_data; + int event; + int pad; +}; + +#define XIO_OBSERVABLE_INIT(name, obj) \ + { (name)->impl = obj; INIT_LIST_HEAD(&(name)->observers_list); \ + (name)->observer_node = NULL; } + +#define XIO_OBSERVABLE_DESTROY(name) \ + { (name)->impl = NULL; INIT_LIST_HEAD(&(name)->observers_list); \ + (name)->observer_node = NULL; } + +/*---------------------------------------------------------------------------*/ +/* xio_observable_reg_observer */ +/*---------------------------------------------------------------------------*/ +void xio_observable_reg_observer(struct xio_observable *observable, + struct xio_observer *observer); + +/*---------------------------------------------------------------------------*/ +/* xio_observable_unreg_observer */ +/*---------------------------------------------------------------------------*/ +void xio_observable_unreg_observer(struct xio_observable *observable, + struct xio_observer *observer); + +/*---------------------------------------------------------------------------*/ +/* xio_observable_notify_observer */ +/*---------------------------------------------------------------------------*/ +void xio_observable_notify_observer(struct xio_observable *observable, + struct xio_observer *observer, + int event, void *event_data); + +/*---------------------------------------------------------------------------*/ +/* xio_observable_notify_all_observers */ +/*---------------------------------------------------------------------------*/ +void xio_observable_notify_all_observers(struct xio_observable *observable, + int event, void *event_data); + +/*---------------------------------------------------------------------------*/ +/* xio_observable_notify_any_observer */ +/*---------------------------------------------------------------------------*/ +void xio_observable_notify_any_observer(struct xio_observable *observable, + int event, void *event_data); + +/*---------------------------------------------------------------------------*/ +/* xio_observable_unreg_all_observers */ 
+/*---------------------------------------------------------------------------*/ +void xio_observable_unreg_all_observers(struct xio_observable *observable); + +/*---------------------------------------------------------------------------*/ +/* xio_observable_is_empty */ +/*---------------------------------------------------------------------------*/ +static inline int xio_observable_is_empty(struct xio_observable *observable) +{ + return list_empty(&observable->observers_list); +} + +#endif /* XIO_OBSERVER_H */ diff --git a/open_src/xio/src/common/xio_options.c b/open_src/xio/src/common/xio_options.c new file mode 100644 index 0000000..22202ce --- /dev/null +++ b/open_src/xio/src/common/xio_options.c @@ -0,0 +1,472 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include +#include "libxio.h" +#include "xio_common.h" +#include "xio_mem.h" +#include "xio_observer.h" +#include "xio_transport.h" +#include "xio_log.h" + +#define XIO_OPTVAL_DEF_MAX_IN_IOVSZ XIO_IOVLEN +#define XIO_OPTVAL_DEF_MAX_OUT_IOVSZ XIO_IOVLEN +#define XIO_OPTVAL_DEF_ENABLE_RECONNECT 0 +#define XIO_OPTVAL_DEF_ENABLE_FLOW_CONTROL 0 +#define XIO_OPTVAL_DEF_SND_QUEUE_DEPTH_MSGS 1024 +#define XIO_OPTVAL_DEF_RCV_QUEUE_DEPTH_MSGS 1024 +#define XIO_OPTVAL_DEF_SND_QUEUE_DEPTH_BYTES (128 * 1024 * 1024) +#define XIO_OPTVAL_DEF_RCV_QUEUE_DEPTH_BYTES (128 * 1024 * 1024) +#define XIO_OPTVAL_DEF_MAX_INLINE_XIO_HEADER 1024 +#define XIO_OPTVAL_DEF_MAX_INLINE_XIO_DATA (64 * 1024) +#define XIO_OPTVAL_DEF_XFER_BUF_ALIGN (64) +#define XIO_OPTVAL_DEF_INLINE_XIO_DATA_ALIGN (0) +#define XIO_OPTVAL_DEF_ENABLE_KEEPALIVE 1 +#define XIO_OPTVAL_DEF_KEEPALIVE_PROBES 3 +#define XIO_OPTVAL_DEF_KEEPALIVE_INTVL 20 +#define XIO_OPTVAL_DEF_KEEPALIVE_TIME 60 +#define XIO_OPTVAL_DEF_TRANSPORT_CLOSE_TIMEOUT 60000 +#define XIO_OPTVAL_DEF_PAD 0 + +/* xio options */ +struct xio_options g_options = { + XIO_OPTVAL_DEF_MAX_IN_IOVSZ, /*max_in_iovsz*/ + XIO_OPTVAL_DEF_MAX_OUT_IOVSZ, /*max_out_iovsz*/ + XIO_OPTVAL_DEF_ENABLE_RECONNECT, /*reconnect*/ + XIO_OPTVAL_DEF_MAX_INLINE_XIO_HEADER, /*max_inline_xio_hdr*/ + XIO_OPTVAL_DEF_MAX_INLINE_XIO_DATA, /*max_inline_xio_data*/ + XIO_OPTVAL_DEF_ENABLE_FLOW_CONTROL, /*enable_flow_control*/ + XIO_OPTVAL_DEF_SND_QUEUE_DEPTH_MSGS, /*snd_queue_depth_msgs*/ + XIO_OPTVAL_DEF_RCV_QUEUE_DEPTH_MSGS, /*rcv_queue_depth_msgs*/ + XIO_OPTVAL_DEF_SND_QUEUE_DEPTH_BYTES, /*snd_queue_depth_bytes*/ + XIO_OPTVAL_DEF_RCV_QUEUE_DEPTH_BYTES, /*rcv_queue_depth_bytes*/ + XIO_OPTVAL_DEF_XFER_BUF_ALIGN, /* xfer_buf_align */ + XIO_OPTVAL_DEF_INLINE_XIO_DATA_ALIGN, /* inline_xio_data_align */ + XIO_OPTVAL_DEF_ENABLE_KEEPALIVE, + XIO_OPTVAL_DEF_TRANSPORT_CLOSE_TIMEOUT, /* transport_close_timeout */ + XIO_OPTVAL_DEF_PAD, + { + XIO_OPTVAL_DEF_KEEPALIVE_PROBES, + XIO_OPTVAL_DEF_KEEPALIVE_TIME, + XIO_OPTVAL_DEF_KEEPALIVE_INTVL + } +}; + +/*---------------------------------------------------------------------------*/ +/* xio_get_options */ +/*---------------------------------------------------------------------------*/ +struct xio_options *xio_get_options(void) +{ + return &g_options; +} +EXPORT_SYMBOL(xio_get_options); + +/*---------------------------------------------------------------------------*/ +/* xio_set_opt */ +/*---------------------------------------------------------------------------*/ +static int xio_general_set_opt(void *xio_obj, int optname, + const void *optval, int optlen) +{ + int tmp; + + switch (optname) { + case XIO_OPTNAME_LOG_FN: + if (optlen == 0 && !optval) + return xio_set_log_fn(NULL); + else if (optlen == sizeof(xio_log_fn)) + return xio_set_log_fn((xio_log_fn)optval); + break; + case XIO_OPTNAME_LOG_LEVEL: + if (optlen != sizeof(enum xio_log_level)) + return -1; + return xio_set_log_level(*((enum xio_log_level *)optval)); + case XIO_OPTNAME_DISABLE_HUGETBL: + xio_disable_huge_pages(*((int *)optval)); + return 0; + case XIO_OPTNAME_MEM_ALLOCATOR: + if (optlen == sizeof(struct xio_mem_allocator)) + return xio_set_mem_allocator( + (struct xio_mem_allocator *)optval); + break; + case XIO_OPTNAME_CONFIG_MEMPOOL: + if (optlen == sizeof(struct xio_mempool_config)) { + memcpy(&g_mempool_config, + (struct xio_mempool_config *)optval, optlen); + return 0; + } + break; + case XIO_OPTNAME_MAX_IN_IOVLEN: + if (optlen == sizeof(int)) { + struct xio_transport *rdma_transport = + 
xio_get_transport("rdma"); + struct xio_transport *tcp_transport = + xio_get_transport("tcp"); + int retval = 0; + + if (*((int *)optval) > XIO_IOVLEN && + *((int *)optval) <= XIO_MAX_IOV) { + g_options.max_in_iovsz = *((int *)optval); + if (rdma_transport && + rdma_transport->set_opt) + retval |= rdma_transport->set_opt( + xio_obj, optname, + optval, optlen); + if (tcp_transport && + tcp_transport->set_opt) + retval |= tcp_transport->set_opt( + xio_obj, optname, + optval, optlen); + } + return retval; + } + break; + case XIO_OPTNAME_MAX_OUT_IOVLEN: + if (optlen == sizeof(int)) { + struct xio_transport *rdma_transport = + xio_get_transport("rdma"); + struct xio_transport *tcp_transport = + xio_get_transport("tcp"); + int retval = 0; + + if (*((int *)optval) > XIO_IOVLEN && + *((int *)optval) <= XIO_MAX_IOV) { + g_options.max_out_iovsz = *((int *)optval); + if (rdma_transport && + rdma_transport->set_opt) + retval |= rdma_transport->set_opt( + xio_obj, optname, + optval, optlen); + if (tcp_transport && + tcp_transport->set_opt) + retval |= tcp_transport->set_opt( + xio_obj, optname, + optval, optlen); + } + return retval; + } + break; + case XIO_OPTNAME_ENABLE_DMA_LATENCY: + if (optlen == sizeof(int)) { + struct xio_transport *rdma_transport = + xio_get_transport("rdma"); + struct xio_transport *tcp_transport = + xio_get_transport("tcp"); + int retval = 0; + + if (rdma_transport && + rdma_transport->set_opt) + retval |= rdma_transport->set_opt( + xio_obj, optname, + optval, optlen); + if (tcp_transport && + tcp_transport->set_opt) + retval |= tcp_transport->set_opt( + xio_obj, optname, + optval, optlen); + + return retval; + } + break; + case XIO_OPTNAME_ENABLE_RECONNECT: + g_options.reconnect = *((int *)optval); + if (g_options.reconnect){ + g_options.enable_keepalive = 0; + } + return 0; + case XIO_OPTNAME_ENABLE_FLOW_CONTROL: + g_options.enable_flow_control = *((int *)optval); + return 0; + case XIO_OPTNAME_SND_QUEUE_DEPTH_MSGS: + if (*((int *)optval) < 1) + break; + g_options.snd_queue_depth_msgs = (int)*((uint64_t *)optval); + return 0; + case XIO_OPTNAME_RCV_QUEUE_DEPTH_MSGS: + if (*((int *)optval) < 1) + break; + g_options.rcv_queue_depth_msgs = *((int *)optval); + return 0; + case XIO_OPTNAME_SND_QUEUE_DEPTH_BYTES: + if (*((uint64_t *)optval) < 1) + break; + g_options.snd_queue_depth_bytes = *((uint64_t *)optval); + return 0; + case XIO_OPTNAME_RCV_QUEUE_DEPTH_BYTES: + if (*((uint64_t *)optval) < 1) + break; + g_options.rcv_queue_depth_bytes = *((uint64_t *)optval); + return 0; + case XIO_OPTNAME_MAX_INLINE_XIO_HEADER: + if (optlen != sizeof(int)) + break; + if (*((int *)optval) < 0) + break; + g_options.max_inline_xio_hdr = *((int *)optval); + return 0; + case XIO_OPTNAME_MAX_INLINE_XIO_DATA: + if (optlen != sizeof(int)) + break; + if (*((int *)optval) < 0) + break; + g_options.max_inline_xio_data = *((int *)optval); + return 0; + case XIO_OPTNAME_XFER_BUF_ALIGN: + if (optlen != sizeof(int)) + break; + tmp = *(int *)optval; + if (!is_power_of_2(tmp) || !(tmp % sizeof(void *) == 0)) { + xio_set_error(EINVAL); + return -1; + } + g_options.xfer_buf_align = tmp; + return 0; + case XIO_OPTNAME_INLINE_XIO_DATA_ALIGN: + if (optlen != sizeof(int)) + break; + tmp = *(int *)optval; + if (!tmp) { + g_options.inline_xio_data_align = tmp; + return 0; + } + if (!is_power_of_2(tmp) || !(tmp % sizeof(void *) == 0)) { + xio_set_error(EINVAL); + return -1; + } + g_options.inline_xio_data_align = tmp; + return 0; + case XIO_OPTNAME_ENABLE_KEEPALIVE: + g_options.enable_keepalive = *((int *)optval); + 
return 0; + case XIO_OPTNAME_CONFIG_KEEPALIVE: + if (optlen == sizeof(struct xio_options_keepalive)) { + memcpy(&g_options.ka, optval, optlen); + return 0; + } else { + xio_set_error(EINVAL); + return -1; + } + break; + case XIO_OPTNAME_TRANSPORT_CLOSE_TIMEOUT: + if (optlen != sizeof(int)) + break; + if (*((int *)optval) < 0) + break; + g_options.transport_close_timeout = *((int *)optval); + return 0; + default: + break; + } + xio_set_error(XIO_E_NOT_SUPPORTED); + return -1; +} +EXPORT_SYMBOL(xio_set_opt); + +/*---------------------------------------------------------------------------*/ +/* xio_general_get_opt */ +/*---------------------------------------------------------------------------*/ +static int xio_general_get_opt(void *xio_obj, int optname, + void *optval, int *optlen) +{ + switch (optname) { + case XIO_OPTNAME_LOG_LEVEL: + *((enum xio_log_level *)optval) = xio_get_log_level(); + *optlen = sizeof(enum xio_log_level); + return 0; + case XIO_OPTNAME_MAX_IN_IOVLEN: + *optlen = sizeof(int); + *((int *)optval) = g_options.max_in_iovsz; + return 0; + case XIO_OPTNAME_MAX_OUT_IOVLEN: + *optlen = sizeof(int); + *((int *)optval) = g_options.max_out_iovsz; + return 0; + case XIO_OPTNAME_ENABLE_RECONNECT: + *optlen = sizeof(int); + *((int *)optval) = g_options.reconnect; + return 0; + case XIO_OPTNAME_ENABLE_FLOW_CONTROL: + *optlen = sizeof(int); + *((int *)optval) = g_options.enable_flow_control; + return 0; + case XIO_OPTNAME_SND_QUEUE_DEPTH_MSGS: + *optlen = sizeof(int); + *((int *)optval) = g_options.snd_queue_depth_msgs; + return 0; + case XIO_OPTNAME_RCV_QUEUE_DEPTH_MSGS: + *optlen = sizeof(int); + *((int *)optval) = g_options.rcv_queue_depth_msgs; + return 0; + case XIO_OPTNAME_SND_QUEUE_DEPTH_BYTES: + *optlen = sizeof(uint64_t); + *((uint64_t *)optval) = g_options.snd_queue_depth_bytes; + return 0; + case XIO_OPTNAME_RCV_QUEUE_DEPTH_BYTES: + *optlen = sizeof(uint64_t); + *((uint64_t *)optval) = g_options.rcv_queue_depth_bytes; + return 0; + case XIO_OPTNAME_MAX_INLINE_XIO_HEADER: + *optlen = sizeof(int); + *((int *)optval) = g_options.max_inline_xio_hdr; + return 0; + case XIO_OPTNAME_MAX_INLINE_XIO_DATA: + *optlen = sizeof(int); + *((int *)optval) = g_options.max_inline_xio_data; + return 0; + case XIO_OPTNAME_INLINE_XIO_DATA_ALIGN: + *optlen = sizeof(int); + *((int *)optval) = g_options.inline_xio_data_align; + return 0; + case XIO_OPTNAME_XFER_BUF_ALIGN: + *optlen = sizeof(int); + *((int *)optval) = g_options.xfer_buf_align; + return 0; + case XIO_OPTNAME_ENABLE_KEEPALIVE: + *optlen = sizeof(int); + *((int *)optval) = g_options.enable_keepalive; + return 0; + case XIO_OPTNAME_CONFIG_KEEPALIVE: + if (*optlen == sizeof(struct xio_options_keepalive)) { + memcpy(optval, &g_options.ka, *optlen); + return 0; + } else { + xio_set_error(EINVAL); + return -1; + } + case XIO_OPTNAME_TRANSPORT_CLOSE_TIMEOUT: + *optlen = sizeof(int); + *((int *)optval) = g_options.transport_close_timeout; + return 0; + default: + break; + } + xio_set_error(XIO_E_NOT_SUPPORTED); + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_set_opt */ +/*---------------------------------------------------------------------------*/ +int xio_set_opt(void *xio_obj, int level, int optname, + const void *optval, int optlen) +{ + static struct xio_transport *rdma_transport; + static struct xio_transport *tcp_transport; + + switch (level) { + case XIO_OPTLEVEL_ACCELIO: + return xio_general_set_opt(xio_obj, optname, optval, optlen); + case XIO_OPTLEVEL_RDMA: + if 
(!rdma_transport) { + rdma_transport = xio_get_transport("rdma"); + if (!rdma_transport) { + xio_set_error(EFAULT); + return -1; + } + } + if (!rdma_transport->set_opt) + break; + return rdma_transport->set_opt(xio_obj, + optname, optval, optlen); + break; + case XIO_OPTLEVEL_TCP: + if (!tcp_transport) { + tcp_transport = xio_get_transport("tcp"); + if (!tcp_transport) { + xio_set_error(EFAULT); + return -1; + } + } + if (!tcp_transport->set_opt) + break; + return tcp_transport->set_opt(xio_obj, + optname, optval, optlen); + break; + default: + break; + } + + xio_set_error(XIO_E_NOT_SUPPORTED); + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_get_opt */ +/*---------------------------------------------------------------------------*/ +int xio_get_opt(void *xio_obj, int level, int optname, + void *optval, int *optlen) +{ + static struct xio_transport *rdma_transport; + static struct xio_transport *tcp_transport; + + switch (level) { + case XIO_OPTLEVEL_ACCELIO: + return xio_general_get_opt(xio_obj, optname, optval, optlen); + case XIO_OPTLEVEL_RDMA: + if (!rdma_transport) { + rdma_transport = xio_get_transport("rdma"); + if (!rdma_transport) { + xio_set_error(EFAULT); + return -1; + } + } + if (!rdma_transport->get_opt) + break; + return rdma_transport->get_opt(xio_obj, + optname, optval, optlen); + break; + case XIO_OPTLEVEL_TCP: + if (!tcp_transport) { + tcp_transport = xio_get_transport("tcp"); + if (!tcp_transport) { + xio_set_error(EFAULT); + return -1; + } + } + if (!tcp_transport->get_opt) + break; + return tcp_transport->get_opt(xio_obj, + optname, optval, optlen); + break; + default: + break; + } + + xio_set_error(XIO_E_NOT_SUPPORTED); + return -1; +} +EXPORT_SYMBOL(xio_get_opt); diff --git a/open_src/xio/src/common/xio_protocol.h b/open_src/xio/src/common/xio_protocol.h new file mode 100644 index 0000000..dc2547f --- /dev/null +++ b/open_src/xio/src/common/xio_protocol.h @@ -0,0 +1,462 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef XIO_PROTOCOL_H +#define XIO_PROTOCOL_H + +union generic_16bit { + uint8_t b[2]; + int16_t s; +}; + +union generic_32bit { + uint8_t b[4]; + float f; + int32_t i; + int16_t s; +}; + +union generic_64bit { + uint8_t b[8]; + int64_t ll; /* Long long (64 bit) */ + double d; /* IEEE-754 double precision floating point */ +}; + +/** + * @brief Place an unsigned byte into the buffer + * + * @param b the byte to add + * @param bindex the position in the packet + * @param buffer the packet buffer + * @return the new position of the last used byte in the buffer + */ +static inline size_t xio_write_uint8(uint8_t b, int bindex, uint8_t *buffer) +{ + *(buffer + bindex) = b; + return sizeof(b); +} + +/** + * @brief Get an unsigned byte from the buffer + * + * @param b the byte to get + * @param bindex the position in the packet + * @param buffer the packet buffer + * @return the new position of the last used byte in the buffer + */ +static inline size_t xio_read_uint8(uint8_t *b, int bindex, + const uint8_t *buffer) +{ + *b = *(buffer + bindex); + return sizeof(*b); +} + +/** + * @brief Place a signed byte into the buffer + * + * @param b the byte to add + * @param buffer the packet buffer + * @return the new position of the last used byte in the buffer + */ +static inline size_t xio_write_int8(int8_t b, int bindex, uint8_t *buffer) +{ + *(buffer + bindex) = (uint8_t)b; + return sizeof(b); +} + +/** + * @brief Get a signed byte from the buffer + * + * @param b the byte to get + * @param buffer the packet buffer + * @return the new position of the last used byte in the buffer + */ +static inline size_t xio_read_int8(int8_t *b, int bindex, + const uint8_t *buffer) +{ + *b = (int8_t)*(buffer + bindex); + return sizeof(*b); +} + +/** + * @brief Place two unsigned bytes into the buffer + * + * @param b the bytes to add + * @param buffer the packet buffer + * @return the new position of the last used byte in the buffer + */ +static inline size_t xio_write_uint16(uint16_t b, const int bindex, + uint8_t *buffer) +{ + buffer[bindex] = (b >> 8) & 0xff; + buffer[bindex+1] = (b) & 0xff; + + return sizeof(b); +} + +/** + * @brief Get two unsigned bytes from the buffer + * + * @param b the bytes to get + * @param buffer the packet buffer + * @return the new position of the last used byte in the buffer + */ +static inline size_t xio_read_uint16(uint16_t *b, const int bindex, + const uint8_t *buffer) +{ + *b = ((((uint32_t)buffer[bindex]) << 8) + | ((uint32_t)buffer[bindex+1])); + + return sizeof(*b); +} + +/** + * @brief Place two signed bytes into the buffer + * + * @param b the bytes to add + * @param buffer the packet buffer + * @return the new position of the last used byte in the buffer + */ +static inline size_t xio_write_int16(int16_t b, int bindex, uint8_t *buffer) +{ + return xio_write_uint16(b, bindex, buffer); +} + +/** + * @brief Get two signed bytes from the buffer + * + * @param b the bytes to add + * @param buffer the packet buffer + * @return the new 
position of the last used byte in the buffer + */ +static inline size_t xio_read_int16(int16_t *b, int bindex, + const uint8_t *buffer) +{ + return xio_read_uint16((uint16_t *)b, bindex, buffer); +} + +/** + * @brief Place four unsigned bytes into the buffer + * + * @param b the bytes to add + * @param buffer the packet buffer + * @return the new position of the last used byte in the buffer + */ +static inline size_t xio_write_uint32(uint32_t b, const int bindex, + uint8_t *buffer) +{ + buffer[bindex] = (b >> 24) & 0xff; + buffer[bindex+1] = (b >> 16) & 0xff; + buffer[bindex+2] = (b >> 8) & 0xff; + buffer[bindex+3] = (b) & 0xff; + return sizeof(b); +} + +/** + * @brief Get four unsigned bytes from the buffer + * + * @param b the bytes to add + * @param buffer the packet buffer + * @return the new position of the last used byte in the buffer + */ +static inline size_t xio_read_uint32(uint32_t *b, const int bindex, + const uint8_t *buffer) +{ + *b = (uint32_t)(buffer[bindex]) << 24 | + (uint32_t)(buffer[bindex+1]) << 16 | + (uint32_t)(buffer[bindex+2]) << 8 | + (uint32_t)(buffer[bindex+3]); + + return sizeof(*b); +} + +/** + * @brief Place four signed bytes into the buffer + * + * @param b the bytes to add + * @param buffer the packet buffer + * @return the new position of the last used byte in the buffer + */ +static inline size_t xio_write_int32(int32_t b, int bindex, uint8_t *buffer) +{ + buffer[bindex] = (b >> 24) & 0xff; + buffer[bindex+1] = (b >> 16) & 0xff; + buffer[bindex+2] = (b >> 8) & 0xff; + buffer[bindex+3] = (b) & 0xff; + return sizeof(b); +} + +/** + * @brief Get four signed bytes from the buffer + * + * @param b the bytes to add + * @param buffer the packet buffer + * @return the new position of the last used byte in the buffer + */ +static inline size_t xio_read_int32(int32_t *b, int bindex, + const uint8_t *buffer) +{ + *b = ((((uint32_t)buffer[bindex]) << 24) | + (((uint32_t)buffer[bindex+1]) << 16) | + (((uint32_t)buffer[bindex+2]) << 8) | + ((uint32_t)buffer[bindex+3])); + + return sizeof(*b); +} + +/** + * @brief Place four unsigned bytes form the buffer + * + * @param b the bytes to add + * @param buffer the packet buffer + * @return the new position of the last used byte in the buffer + */ +static inline size_t xio_write_uint64(uint64_t b, const int bindex, + uint8_t *buffer) +{ + buffer[bindex] = (b >> 56) & 0xff; + buffer[bindex+1] = (b >> 48) & 0xff; + buffer[bindex+2] = (b >> 40) & 0xff; + buffer[bindex+3] = (b >> 32) & 0xff; + buffer[bindex+4] = (b >> 24) & 0xff; + buffer[bindex+5] = (b >> 16) & 0xff; + buffer[bindex+6] = (b >> 8) & 0xff; + buffer[bindex+7] = (b) & 0xff; + return sizeof(b); +} + +/** + * @brief Get four unsigned bytes from the buffer + * + * @param b the bytes to get + * @param buffer the packet buffer + * @return the new position of the last used byte in the buffer + */ +static inline size_t xio_read_uint64(uint64_t *b, const int bindex, + const uint8_t *buffer) +{ + *b = ((((uint64_t)buffer[bindex]) << 56) + | (((uint64_t)buffer[bindex+1]) << 48) + | (((uint64_t)buffer[bindex+2]) << 40) + | (((uint64_t)buffer[bindex+3]) << 32) + | (((uint64_t)buffer[bindex+4]) << 24) + | (((uint64_t)buffer[bindex+5]) << 16) + | (((uint64_t)buffer[bindex+6]) << 8) + | ((uint64_t)buffer[bindex+7])); + + return sizeof(*b); +} + +/** + * @brief Place four signed bytes into the buffer + * + * @param b the bytes to add + * @param buffer the packet buffer + * @return the new position of the last used byte in the buffer + */ +static inline size_t 
xio_write_int64(int64_t b, int bindex, uint8_t *buffer) +{ + return xio_write_uint64(b, bindex, buffer); +} + +/** + * @brief Get four signed bytes from the buffer + * + * @param b the bytes to get + * @param buffer the packet buffer + * @return the new position of the last used byte in the buffer + */ +static inline size_t xio_read_int64(int64_t *b, int bindex, + const uint8_t *buffer) +{ + return xio_read_uint64((uint64_t *)b, bindex, buffer); +} + +/** + * @brief Place a float into the buffer + * + * @param b the float to add + * @param buffer the packet buffer + * @return the new position of the last used byte in the buffer + */ +static inline size_t xio_write_float(float b, int bindex, uint8_t *buffer) +{ + union generic_32bit g; + + g.f = b; + return xio_write_int32(g.i, bindex, buffer); +} + +/** + * @brief Get a float from the buffer + * + * @param b the float to get + * @param buffer the packet buffer + * @return the new position of the last used byte in the buffer + */ +static inline size_t xio_read_float(float *b, int bindex, + const uint8_t *buffer) +{ + union generic_32bit g; + size_t len = xio_read_int32(&g.i, bindex, buffer); + + *b = g.f; + + return len; +} + +/** + * @brief Place a double into the buffer + * + * @param b the double to add + * @param buffer the packet buffer + * @return the new position of the last used byte in the buffer + */ +static inline size_t xio_write_double(double b, int bindex, uint8_t *buffer) +{ + union generic_64bit g; + + g.d = b; + return xio_write_int64(g.ll, bindex, buffer); +} + +/** + * @brief Get a double from the buffer + * + * @param b the double to get + * @param buffer the packet buffer + * @return the new position of the last used byte in the buffer + */ +static inline size_t xio_read_double(double *b, int bindex, + const uint8_t *buffer) +{ + union generic_64bit g; + size_t len = xio_read_int64(&g.ll, bindex, buffer); + + *b = g.d; + + return len; +} + +/** + * @brief Place an array into the buffer + * + * @param b the array to add + * @param length size of the array (for strings: length WITH '\0' char) + * @param buffer packet buffer + * @return new position of the last used byte in the buffer + */ +static inline size_t xio_write_array(const uint8_t *b, size_t length, + int bindex, uint8_t *buffer) +{ + memcpy(buffer+bindex, b, length); + return length; +} + +/** + * @brief get an array from the buffer + * + * @param b the array to add + * @param length size of the array (for strings: length WITH '\0' char) + * @param buffer packet buffer + * @return new position of the last used byte in the buffer + */ +static inline size_t xio_read_array(uint8_t *b, size_t length, + int bindex, const uint8_t *buffer) +{ + memcpy(b, buffer+bindex, length); + return length; +} + +/** + * @brief Place a string into the buffer + * + * @param b the string to add + * @param maxlength size of the array (for strings: length WITHOUT '\0' char) + * @param buffer packet buffer + * @return new position of the last used byte in the buffer + */ +static inline size_t xio_write_string(const char *b, size_t maxlength, + int bindex, uint8_t *buffer) +{ + size_t length = 0; + + /* Copy string into buffer, ensuring not to exceed the buffer size */ + unsigned int i; + + for (i = 2; i < maxlength - 1 || (b[i] == '\0'); i++) + buffer[bindex+i] = b[i]; + + length = i - 2; + /* Enforce null termination at end of buffer */ + buffer[maxlength - 1] = '\0'; + + /* Write length into first field */ + xio_write_uint16(length, bindex, buffer); + + return length; +} + +/** + * 
@brief Get a string from the buffer + * + * @param b the string to get + * @param maxlength size of the array (for strings: length WITHOUT '\0' char) + * @param buffer packet buffer + * @return new position of the last used byte in the buffer + */ +static inline size_t xio_read_string(char *b, size_t maxlength, + int bindex, const uint8_t *buffer) +{ + uint16_t length = 0; + unsigned int i; + + /* Read length from first field */ + xio_read_uint16(&length, bindex, buffer); + + /* Copy string into buffer, ensuring not to exceed the buffer size */ + for (i = 0; i < min(((size_t)length), maxlength); i++) + b[i] = buffer[bindex+i+2]; + + /* Enforce null termination at end of buffer */ + b[maxlength-1] = '\0'; + + return length; +} + +#endif /* XIO_PROTOCOL_H */ + diff --git a/open_src/xio/src/common/xio_server.c b/open_src/xio/src/common/xio_server.c new file mode 100644 index 0000000..6c5c610 --- /dev/null +++ b/open_src/xio/src/common/xio_server.c @@ -0,0 +1,428 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ +#include +#include +#include "libxio.h" +#include "xio_log.h" +#include "xio_common.h" +#include "xio_observer.h" +#include "xio_hash.h" +#include "xio_transport.h" +#include "xio_protocol.h" +#include "xio_mbuf.h" +#include "xio_task.h" +#include "xio_idr.h" +#include "xio_msg_list.h" +#include "xio_ev_data.h" +#include "xio_objpool.h" +#include "xio_workqueue.h" +#include "xio_context.h" +#include "xio_session.h" +#include "xio_nexus.h" +#include "xio_connection.h" +#include "xio_server.h" +#include + +static int xio_on_nexus_event(void *observer, void *notifier, int event, + void *event_data); +static void xio_server_destroy(struct kref *kref); + +/*---------------------------------------------------------------------------*/ +/* xio_server_reg_observer */ +/*---------------------------------------------------------------------------*/ +int xio_server_reg_observer(struct xio_server *server, + struct xio_observer *observer) +{ + kref_get(&server->kref); + xio_observable_reg_observer(&server->nexus_observable, observer); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_server_unreg_observer */ +/*---------------------------------------------------------------------------*/ +void xio_server_unreg_observer(struct xio_server *server, + struct xio_observer *observer) +{ + xio_observable_unreg_observer(&server->nexus_observable, observer); + kref_put(&server->kref, xio_server_destroy); +} + +/*---------------------------------------------------------------------------*/ +/* xio_on_new_nexus */ +/*---------------------------------------------------------------------------*/ +static int xio_on_new_nexus(struct xio_server *server, + struct xio_nexus *nexus, + union xio_nexus_event_data *event_data) +{ + int retval; + + /* set the server as observer */ + retval = xio_nexus_accept(event_data->new_nexus.child_nexus); + if (retval != 0) { + ERROR_LOG("failed to accept connection\n"); + return -1; + } + + return 0; +} + +/* first message after new connection are going trough the server */ +static int xio_on_new_message(struct xio_server *server, + struct xio_nexus *nexus, + int event, + union xio_nexus_event_data *event_data) +{ + struct xio_session *session = NULL; + struct xio_connection *connection = NULL; + struct xio_connection *connection1 = NULL; + struct xio_task *task; + uint32_t tlv_type; + struct xio_session_params params; + int locked = 0; + + if (!server || !nexus || !event_data || !event_data->msg.task) { + ERROR_LOG("server [new session]: failed " \ + "invalid parameter\n"); + return -1; + } + if (nexus->state == XIO_NEXUS_STATE_CLOSED) { + ERROR_LOG("got a request for server %p but the corresponding nexus %p is closing\n", + server, nexus); + return -1; + } + + task = event_data->msg.task; + + params.type = XIO_SESSION_SERVER; + params.initial_sn = 0; + params.ses_ops = &server->ops; + params.uri = server->uri; + params.private_data = NULL; + params.private_data_len = 0; + params.user_context = server->cb_private_data; + + /* read the first message type */ + tlv_type = xio_read_tlv_type(&event_data->msg.task->mbuf); + + if (tlv_type == XIO_SESSION_SETUP_REQ) { + /* create new session */ + session = xio_session_create(¶ms); + if (!session) { + ERROR_LOG("server [new session]: failed " \ + " allocating session failed\n"); + return -1; + } + DEBUG_LOG("server [new session]: server:%p, " \ + "session:%p, nexus:%p ,session_id:%d\n", + server, session, nexus, session->session_id); + + /* get transport class routines */ + 
session->validators_cls = xio_nexus_get_validators_cls(nexus); + + connection = + xio_session_alloc_connection(session, + server->ctx, 0, + server->cb_private_data); + if (!connection) { + ERROR_LOG("server failed to allocate new connection\n"); + goto cleanup; + } + connection1 = xio_session_assign_nexus(session, nexus); + if (!connection1) { + ERROR_LOG("server failed to assign new connection\n"); + goto cleanup1; + } + connection = connection1; + + xio_idr_add_uobj(usr_idr, session, "xio_session"); + xio_idr_add_uobj(usr_idr, connection, "xio_connection"); + xio_connection_set_state(connection, + XIO_CONNECTION_STATE_ONLINE); + + xio_connection_keepalive_start(connection); + + task->session = session; + task->connection = connection; + } else if (tlv_type == XIO_CONNECTION_HELLO_REQ) { + struct xio_session *session1; + /* find the old session without lock */ + session = xio_find_session(event_data->msg.task); + if (!session) { + ERROR_LOG("server [new connection]: failed " \ + "session not found. server:%p\n", + server); + xio_nexus_close(nexus, NULL); + return -1; + } + /* lock it and retry find */ + mutex_lock(&session->lock); + /* session before destruction - try to lock before continue */ + session1 = xio_find_session(event_data->msg.task); + if (!session1) { + ERROR_LOG("server [new connection]: failed " \ + "session not found. server:%p\n", + server); + xio_nexus_close(nexus, NULL); + mutex_unlock(&session->lock); + return -1; + } + locked = 1; + task->session = session; + + DEBUG_LOG("server [new connection]: server:%p, " \ + "session:%p, nexus:%p, session_id:%d\n", + server, session, nexus, session->session_id); + + connection = xio_session_alloc_connection( + task->session, + server->ctx, 0, + server->cb_private_data); + + if (!connection) { + ERROR_LOG("server failed to allocate new connection\n"); + goto cleanup; + } + connection1 = xio_session_assign_nexus(task->session, nexus); + if (!connection1) { + ERROR_LOG("server failed to assign new connection\n"); + goto cleanup1; + } + connection = connection1; + + /* copy the server attributes to the connection */ + xio_connection_set_ops(connection, &server->ops); + + task->connection = connection; + + /* This in a multiple-portal situation */ + session->state = XIO_SESSION_STATE_ONLINE; + xio_connection_set_state(connection, + XIO_CONNECTION_STATE_ONLINE); + + xio_connection_keepalive_start(connection); + + xio_idr_add_uobj(usr_idr, connection, "xio_connection"); + } else { + ERROR_LOG("server unexpected message\n"); + return -1; + } + + /* route the message to the session */ + if (session) + xio_nexus_notify_observer(nexus, &session->observer, + event, event_data); + if (locked) + mutex_unlock(&session->lock); + + return 0; + +cleanup1: + if (connection) + xio_session_free_connection(connection); + +cleanup: + if (session) + xio_session_destroy(session); + + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_on_nexus_event */ +/*---------------------------------------------------------------------------*/ +static int xio_on_nexus_event(void *observer, void *notifier, int event, + void *event_data) +{ + struct xio_server *server = (struct xio_server *)observer; + struct xio_nexus *nexus = (struct xio_nexus *)notifier; + int retval = 0; + + switch (event) { + case XIO_NEXUS_EVENT_NEW_MESSAGE: + case XIO_NEXUS_EVENT_ASSIGN_IN_BUF: + TRACE_LOG("server: [notification] - new message. 
" \ + "server:%p, nexus:%p\n", observer, notifier); + + xio_on_new_message(server, nexus, event, + (union xio_nexus_event_data *)event_data); + break; + case XIO_NEXUS_EVENT_NEW_CONNECTION: + DEBUG_LOG("server: [notification] - new connection. " \ + "server:%p, nexus:%p\n", observer, notifier); + xio_on_new_nexus(server, nexus, + (union xio_nexus_event_data *)event_data); + break; + + case XIO_NEXUS_EVENT_DISCONNECTED: + case XIO_NEXUS_EVENT_CLOSED: + case XIO_NEXUS_EVENT_ESTABLISHED: + break; + + case XIO_NEXUS_EVENT_ERROR: + ERROR_LOG("server: [notification] - connection error. " \ + "server:%p, nexus:%p\n", observer, notifier); + break; + default: + ERROR_LOG("server: [notification] - unexpected event :%d. " \ + "server:%p, nexus:%p\n", event, observer, notifier); + break; + } + + return retval; +} + +/*---------------------------------------------------------------------------*/ +/* xio_bind */ +/*---------------------------------------------------------------------------*/ +struct xio_server *xio_bind(struct xio_context *ctx, + struct xio_session_ops *ops, + const char *uri, + uint16_t *src_port, + uint32_t session_flags, + void *cb_private_data) +{ + struct xio_server *server; + int retval; + int backlog = 4; + + if (!ctx || !ops || !uri) { + ERROR_LOG("invalid parameters ctx:%p, ops:%p, uri:%p\n", + ctx, ops, uri); + xio_set_error(EINVAL); + return NULL; + } + + TRACE_LOG("bind to %s\n", uri); + + /* create the server */ + server = (struct xio_server *) + kcalloc(1, sizeof(struct xio_server), GFP_KERNEL); + if (!server) { + xio_set_error(ENOMEM); + return NULL; + } + kref_init(&server->kref); + + /* fill server data*/ + server->ctx = ctx; + server->cb_private_data = cb_private_data; + server->uri = kstrdup(uri, GFP_KERNEL); + + server->session_flags = session_flags; + memcpy(&server->ops, ops, sizeof(*ops)); + + XIO_OBSERVER_INIT(&server->observer, server, xio_on_nexus_event); + + XIO_OBSERVABLE_INIT(&server->nexus_observable, server); + + server->listener = xio_nexus_open(ctx, uri, NULL, 0, 0, NULL); + if (!server->listener) { + ERROR_LOG("failed to create connection\n"); + goto cleanup; + } + retval = xio_nexus_listen(server->listener, + uri, src_port, backlog); + if (retval != 0) { + ERROR_LOG("connection listen failed\n"); + goto cleanup1; + } + xio_nexus_set_server(server->listener, server); + xio_idr_add_uobj(usr_idr, server, "xio_server"); + + return server; + +cleanup1: + xio_nexus_close(server->listener, NULL); +cleanup: + kfree(server->uri); + kfree(server); + + return NULL; +} +EXPORT_SYMBOL(xio_bind); + +/*---------------------------------------------------------------------------*/ +/* xio_server_destroy */ +/*---------------------------------------------------------------------------*/ +static void xio_server_destroy(struct kref *kref) +{ + struct xio_server *server = container_of(kref, + struct xio_server, kref); + + DEBUG_LOG("xio_server_destroy - server:%p\n", server); + xio_observable_unreg_all_observers(&server->nexus_observable); + + xio_nexus_close(server->listener, NULL); + + XIO_OBSERVER_DESTROY(&server->observer); + XIO_OBSERVABLE_DESTROY(&server->nexus_observable); + + kfree(server->uri); + kfree(server); +} + +/*---------------------------------------------------------------------------*/ +/* xio_unbind */ +/*---------------------------------------------------------------------------*/ +int xio_unbind(struct xio_server *server) +{ + int retval = 0; + int found; + + if (!server) + return -1; + + found = xio_idr_lookup_uobj(usr_idr, server); + if (found) { + 
xio_idr_remove_uobj(usr_idr, server); + } else { + ERROR_LOG("server not found:%p\n", server); + xio_set_error(XIO_E_USER_OBJ_NOT_FOUND); + return -1; + } + /* notify all observers that the server wishes to exit */ + xio_observable_notify_all_observers(&server->nexus_observable, + XIO_SERVER_EVENT_CLOSE, NULL); + + kref_put(&server->kref, xio_server_destroy); + + return retval; +} +EXPORT_SYMBOL(xio_unbind); diff --git a/open_src/xio/src/common/xio_server.h b/open_src/xio/src/common/xio_server.h new file mode 100644 index 0000000..f64ebe4 --- /dev/null +++ b/open_src/xio/src/common/xio_server.h @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ +#ifndef XIO_SERVER_H +#define XIO_SERVER_H + +/*---------------------------------------------------------------------------*/ +/* enum */ +/*---------------------------------------------------------------------------*/ +enum xio_server_event { + XIO_SERVER_EVENT_CLOSE +}; + +struct xio_server { + struct xio_nexus *listener; + struct xio_observer observer; + char *uri; + struct xio_context *ctx; + struct xio_session_ops ops; + uint32_t session_flags; + struct kref kref; + void *cb_private_data; + struct xio_observable nexus_observable; +}; + +/*---------------------------------------------------------------------------*/ +/* xio_server_reg_observer */ +/*---------------------------------------------------------------------------*/ +int xio_server_reg_observer(struct xio_server *server, + struct xio_observer *observer); + +/*---------------------------------------------------------------------------*/ +/* xio_server_unreg_observer */ +/*---------------------------------------------------------------------------*/ +void xio_server_unreg_observer(struct xio_server *server, + struct xio_observer *observer); + +#endif /*XIO_SERVER_H */ + diff --git a/open_src/xio/src/common/xio_session.c b/open_src/xio/src/common/xio_session.c new file mode 100644 index 0000000..5f07758 --- /dev/null +++ b/open_src/xio/src/common/xio_session.c @@ -0,0 +1,2169 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ +#include +#include +#include "libxio.h" +#include "xio_log.h" +#include "xio_common.h" +#include "xio_protocol.h" +#include "xio_observer.h" +#include "xio_mbuf.h" +#include "xio_task.h" +#include "xio_transport.h" +#include "xio_hash.h" +#include "xio_sg_table.h" +#include "xio_idr.h" +#include "xio_msg_list.h" +#include "xio_ev_data.h" +#include "xio_objpool.h" +#include "xio_workqueue.h" +#include "xio_context.h" +#include "xio_nexus.h" +#include "xio_connection.h" +#include "xio_sessions_cache.h" +#include "xio_session.h" +#include "xio_session_priv.h" +#include + +/*---------------------------------------------------------------------------*/ +/* forward declarations */ +/*---------------------------------------------------------------------------*/ +static int xio_on_req_recv(struct xio_connection *connection, + struct xio_task *task); +static int xio_on_rsp_recv(struct xio_connection *nexusetion, + struct xio_task *task); +static int xio_on_ow_req_send_comp(struct xio_connection *connection, + struct xio_task *task); +static int xio_on_rsp_send_comp(struct xio_connection *connection, + struct xio_task *task); +/*---------------------------------------------------------------------------*/ +/* xio_session_alloc_connection */ +/*---------------------------------------------------------------------------*/ +struct xio_connection *xio_session_alloc_connection( + struct xio_session *session, + struct xio_context *ctx, + uint32_t connection_idx, + void *connection_user_context) +{ + struct xio_connection *connection; + + /* allocate and initialize connection */ + connection = xio_connection_create(session, ctx, connection_idx, + connection_user_context); + if (!connection) { + ERROR_LOG("failed to initialize connection. " \ + "seesion:%p, ctx:%p, connection_idx:%d\n", + session, ctx, connection_idx); + return NULL; + } + /* add the connection to the session's connections list */ + spin_lock(&session->connections_list_lock); + list_add_tail(&connection->connections_list_entry, + &session->connections_list); + session->connections_nr++; + spin_unlock(&session->connections_list_lock); + + return connection; +} + +/*---------------------------------------------------------------------------*/ +/* xio_session_free_connection */ +/*---------------------------------------------------------------------------*/ +int xio_session_free_connection(struct xio_connection *connection) +{ + int retval; + + spin_lock(&connection->session->connections_list_lock); + connection->session->connections_nr--; + list_del(&connection->connections_list_entry); + spin_unlock(&connection->session->connections_list_lock); + + retval = xio_connection_close(connection); + if (retval != 0) { + ERROR_LOG("failed to close connection"); + return -1; + } + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_session_assign_nexus */ +/*---------------------------------------------------------------------------*/ +struct xio_connection *xio_session_assign_nexus( + struct xio_session *session, + struct xio_nexus *nexus) +{ + struct xio_connection *connection; + + spin_lock(&session->connections_list_lock); + /* find free slot */ + list_for_each_entry(connection, &session->connections_list, + connections_list_entry) { + if ((connection->ctx == nexus->transport_hndl->ctx) && + (!connection->nexus || + (connection->nexus == nexus))) { + /* remove old observer if exist */ + spin_unlock(&session->connections_list_lock); + xio_connection_set_nexus(connection, nexus); + return 
connection; + } + } + spin_unlock(&session->connections_list_lock); + + return NULL; +} + +/*---------------------------------------------------------------------------*/ +/* xio_session_find_connection */ +/*---------------------------------------------------------------------------*/ +struct xio_connection *xio_session_find_connection( + struct xio_session *session, + struct xio_nexus *nexus) +{ + struct xio_connection *connection; + struct xio_context *ctx = nexus->transport_hndl->ctx; + + list_for_each_entry(connection, &ctx->ctx_list, ctx_list_entry) { + if (connection->nexus == nexus && + connection->session == session) + return connection; + } + + return NULL; +} + +/*---------------------------------------------------------------------------*/ +/* xio_session_find_connection_by_ctx */ +/*---------------------------------------------------------------------------*/ +struct xio_connection *xio_session_find_connection_by_ctx( + struct xio_session *session, + struct xio_context *ctx) +{ + struct xio_connection *connection; + + spin_lock(&session->connections_list_lock); + list_for_each_entry(connection, &session->connections_list, + connections_list_entry) { + if (connection->ctx == ctx) { + spin_unlock(&session->connections_list_lock); + return connection; + } + } + spin_unlock(&session->connections_list_lock); + + return NULL; +} + +/*---------------------------------------------------------------------------*/ +/* xio_find_session */ +/*---------------------------------------------------------------------------*/ +struct xio_session *xio_find_session(struct xio_task *task) +{ + struct xio_session_hdr *tmp_hdr; + struct xio_observer *observer; + struct xio_session *session; + uint32_t dest_session_id; + + xio_mbuf_push(&task->mbuf); + + /* set start of the session header */ + tmp_hdr = (struct xio_session_hdr *) + xio_mbuf_set_session_hdr(&task->mbuf); + + xio_mbuf_pop(&task->mbuf); + + dest_session_id = ntohl(tmp_hdr->dest_session_id); + + observer = xio_nexus_observer_lookup(task->nexus, dest_session_id); + if (observer && observer->impl) + return (struct xio_session *)observer->impl; + + /* fall back to cache - this is should only happen when new connection + * message arrive to a portal on the server - just for the first + * message + */ + session = xio_sessions_cache_lookup(dest_session_id); + if (!session) + ERROR_LOG("failed to find session %d\n", dest_session_id); + + return session; +} + +/*---------------------------------------------------------------------------*/ +/* xio_session_write_header */ +/*---------------------------------------------------------------------------*/ +void xio_session_write_header(struct xio_task *task, + struct xio_session_hdr *hdr) +{ + struct xio_session_hdr *tmp_hdr; + + /* set start of the session header */ + tmp_hdr = + (struct xio_session_hdr *)xio_mbuf_set_session_hdr(&task->mbuf); + + /* fill header */ + PACK_LVAL(hdr, tmp_hdr, dest_session_id); + PACK_LVAL(hdr, tmp_hdr, flags); + PACK_LLVAL(hdr, tmp_hdr, serial_num); + PACK_SVAL(hdr, tmp_hdr, sn); + PACK_SVAL(hdr, tmp_hdr, ack_sn); + PACK_SVAL(hdr, tmp_hdr, credits_msgs); + PACK_LVAL(hdr, tmp_hdr, receipt_result); + PACK_LLVAL(hdr, tmp_hdr, credits_bytes); +#ifdef XIO_SESSION_DEBUG + PACK_LLVAL(hdr, tmp_hdr, connection); + PACK_LLVAL(hdr, tmp_hdr, session); +#endif + + xio_mbuf_inc(&task->mbuf, sizeof(struct xio_session_hdr)); +} + +/*---------------------------------------------------------------------------*/ +/* xio_session_read_header */ 
+/*---------------------------------------------------------------------------*/ +void xio_session_read_header(struct xio_task *task, + struct xio_session_hdr *hdr) +{ + struct xio_session_hdr *tmp_hdr; + + /* set start of the session header */ + tmp_hdr = (struct xio_session_hdr *) + xio_mbuf_set_session_hdr(&task->mbuf); + + /* fill request */ + UNPACK_LVAL(tmp_hdr, hdr, dest_session_id); + UNPACK_LVAL(tmp_hdr, hdr, flags); + UNPACK_LLVAL(tmp_hdr, hdr, serial_num); + UNPACK_SVAL(tmp_hdr, hdr, sn); + UNPACK_SVAL(tmp_hdr, hdr, ack_sn); + UNPACK_SVAL(tmp_hdr, hdr, credits_msgs); + UNPACK_LVAL(tmp_hdr, hdr, receipt_result); + UNPACK_LLVAL(tmp_hdr, hdr, credits_bytes); +#ifdef XIO_SESSION_DEBUG + UNPACK_LLVAL(tmp_hdr, hdr, connection); + UNPACK_LLVAL(tmp_hdr, hdr, session); +#endif + + xio_mbuf_inc(&task->mbuf, sizeof(struct xio_session_hdr)); +} + +/*---------------------------------------------------------------------------*/ +/* xio_session_notify_teardown */ +/*---------------------------------------------------------------------------*/ +void xio_session_notify_teardown(struct xio_session *session, int reason) +{ + struct xio_session_event_data event = { + .conn = NULL, + .conn_user_context = NULL, + .event = XIO_SESSION_TEARDOWN_EVENT, + .reason = (enum xio_status)reason, + .private_data = NULL, + .private_data_len = 0, + }; + + if (session->ses_ops.on_session_event) { +#ifdef XIO_THREAD_SAFE_DEBUG + xio_ctx_debug_thread_unlock(session->teardown_work_ctx); +#endif + session->ses_ops.on_session_event( + session, &event, + session->cb_user_context); +#ifdef XIO_THREAD_SAFE_DEBUG + xio_ctx_debug_thread_lock(session->teardown_work_ctx); +#endif + } +} + +/*---------------------------------------------------------------------------*/ +/* xio_session_notify_new_connection */ +/*---------------------------------------------------------------------------*/ +void xio_session_notify_new_connection(struct xio_session *session, + struct xio_connection *connection) +{ + struct xio_session_event_data event = { + .conn = connection, + .conn_user_context = connection->cb_user_context, + .event = XIO_SESSION_NEW_CONNECTION_EVENT, + .reason = XIO_E_SUCCESS, + .private_data = NULL, + .private_data_len = 0, + }; + + if (session->ses_ops.on_session_event) { +#ifdef XIO_THREAD_SAFE_DEBUG + xio_ctx_debug_thread_unlock(connection->ctx); +#endif + session->ses_ops.on_session_event( + session, &event, + session->cb_user_context); +#ifdef XIO_THREAD_SAFE_DEBUG + xio_ctx_debug_thread_lock(connection->ctx); +#endif + } +} + +/*---------------------------------------------------------------------------*/ +/* xio_session_notify_connection_established */ +/*---------------------------------------------------------------------------*/ +void xio_session_notify_connection_established( + struct xio_session *session, + struct xio_connection *connection) +{ + struct xio_session_event_data event = { + .conn = connection, + .conn_user_context = connection->cb_user_context, + .event = XIO_SESSION_CONNECTION_ESTABLISHED_EVENT, + .reason = XIO_E_SUCCESS, + .private_data = NULL, + .private_data_len = 0, + }; + + if (session->ses_ops.on_session_event) { +#ifdef XIO_THREAD_SAFE_DEBUG + xio_ctx_debug_thread_unlock(connection->ctx); +#endif + session->ses_ops.on_session_event( + session, &event, + session->cb_user_context); +#ifdef XIO_THREAD_SAFE_DEBUG + xio_ctx_debug_thread_lock(connection->ctx); +#endif + } +} + +/*---------------------------------------------------------------------------*/ +/* 
xio_session_notify_connection_closed */ +/*---------------------------------------------------------------------------*/ +void xio_session_notify_connection_closed(struct xio_session *session, + struct xio_connection *connection) +{ + struct xio_session_event_data event = { + .conn = connection, + .conn_user_context = connection->cb_user_context, + .event = XIO_SESSION_CONNECTION_CLOSED_EVENT, + .reason = (enum xio_status)connection->close_reason, + .private_data = NULL, + .private_data_len = 0, + }; + + if (connection->cd_bit) + return; + + connection->cd_bit = 1; + + if (session->ses_ops.on_session_event) { +#ifdef XIO_THREAD_SAFE_DEBUG + xio_ctx_debug_thread_unlock(connection->ctx); +#endif + session->ses_ops.on_session_event( + session, &event, + session->cb_user_context); +#ifdef XIO_THREAD_SAFE_DEBUG + xio_ctx_debug_thread_lock(connection->ctx); +#endif + } +} + +/*---------------------------------------------------------------------------*/ +/* xio_session_notify_connection_disconnected */ +/*---------------------------------------------------------------------------*/ +void xio_session_notify_connection_disconnected( + struct xio_session *session, + struct xio_connection *connection, + enum xio_status reason) +{ + struct xio_session_event_data event = { + .conn = connection, + .conn_user_context = connection->cb_user_context, + .event = XIO_SESSION_CONNECTION_DISCONNECTED_EVENT, + .private_data = NULL, + .private_data_len = 0, + .reason = reason, + }; + + if (connection->cd_bit) + return; + + connection->cd_bit = 1; + + if (session->ses_ops.on_session_event) { +#ifdef XIO_THREAD_SAFE_DEBUG + xio_ctx_debug_thread_unlock(connection->ctx); +#endif + session->ses_ops.on_session_event( + session, &event, + session->cb_user_context); +#ifdef XIO_THREAD_SAFE_DEBUG + xio_ctx_debug_thread_lock(connection->ctx); +#endif + } +} + +/*---------------------------------------------------------------------------*/ +/* xio_session_notify_connection_refused */ +/*---------------------------------------------------------------------------*/ +void xio_session_notify_connection_refused(struct xio_session *session, + struct xio_connection *connection, + enum xio_status reason) +{ + struct xio_session_event_data event = { + .conn = connection, + .conn_user_context = connection->cb_user_context, + .event = XIO_SESSION_CONNECTION_REFUSED_EVENT, + .reason = reason, + .private_data = NULL, + .private_data_len = 0, + }; + + if (session->ses_ops.on_session_event) { +#ifdef XIO_THREAD_SAFE_DEBUG + xio_ctx_debug_thread_unlock(connection->ctx); +#endif + session->ses_ops.on_session_event( + session, &event, + session->cb_user_context); +#ifdef XIO_THREAD_SAFE_DEBUG + xio_ctx_debug_thread_lock(connection->ctx); +#endif + } +} + +/*---------------------------------------------------------------------------*/ +/* xio_session_notify_connection_teardown */ +/*---------------------------------------------------------------------------*/ +void xio_session_notify_connection_teardown(struct xio_session *session, + struct xio_connection *connection) +{ + struct xio_session_event_data event = { + .conn = connection, + .conn_user_context = connection->cb_user_context, + .event = XIO_SESSION_CONNECTION_TEARDOWN_EVENT, + .reason = (enum xio_status)connection->close_reason, + .private_data = NULL, + .private_data_len = 0, + }; + + if (session->ses_ops.on_session_event) { +#ifdef XIO_THREAD_SAFE_DEBUG + xio_ctx_debug_thread_unlock(connection->ctx); +#endif + session->ses_ops.on_session_event( + session, &event, + 
session->cb_user_context); +#ifdef XIO_THREAD_SAFE_DEBUG + xio_ctx_debug_thread_lock(connection->ctx); +#endif + } +} + +/*---------------------------------------------------------------------------*/ +/* xio_session_notify_connection_error */ +/*---------------------------------------------------------------------------*/ +void xio_session_notify_connection_error(struct xio_session *session, + struct xio_connection *connection, + enum xio_status reason) +{ + struct xio_session_event_data event = { + .conn = connection, + .conn_user_context = connection->cb_user_context, + .event = XIO_SESSION_CONNECTION_ERROR_EVENT, + .reason = reason, + .private_data = NULL, + .private_data_len = 0, + }; + + if (session->ses_ops.on_session_event) { +#ifdef XIO_THREAD_SAFE_DEBUG + xio_ctx_debug_thread_unlock(connection->ctx); +#endif + session->ses_ops.on_session_event( + session, &event, + session->cb_user_context); +#ifdef XIO_THREAD_SAFE_DEBUG + xio_ctx_debug_thread_lock(connection->ctx); +#endif + } +} + +/*---------------------------------------------------------------------------*/ +/* xio_session_notify_reconnecting */ +/*---------------------------------------------------------------------------*/ +void xio_session_notify_reconnecting(struct xio_session *session, + struct xio_connection *connection) +{ + struct xio_session_event_data event = { + .conn = connection, + .conn_user_context = connection->cb_user_context, + .event = XIO_SESSION_CONNECTION_RECONNECTING_EVENT, + .reason = (enum xio_status)connection->close_reason, + .private_data = NULL, + .private_data_len = 0, + }; + + if (session->ses_ops.on_session_event) { +#ifdef XIO_THREAD_SAFE_DEBUG + xio_ctx_debug_thread_unlock(connection->ctx); +#endif + session->ses_ops.on_session_event( + session, &event, + session->cb_user_context); +#ifdef XIO_THREAD_SAFE_DEBUG + xio_ctx_debug_thread_lock(connection->ctx); +#endif + } +} + +/*---------------------------------------------------------------------------*/ +/* xio_session_notify_reconnected */ +/*---------------------------------------------------------------------------*/ +void xio_session_notify_reconnected(struct xio_session *session, + struct xio_connection *connection) +{ + struct xio_session_event_data event = { + .conn = connection, + .conn_user_context = connection->cb_user_context, + .event = XIO_SESSION_CONNECTION_RECONNECTED_EVENT, + .reason = XIO_E_SUCCESS, + .private_data = NULL, + .private_data_len = 0, + }; + + if (session->ses_ops.on_session_event) { +#ifdef XIO_THREAD_SAFE_DEBUG + xio_ctx_debug_thread_unlock(connection->ctx); +#endif + session->ses_ops.on_session_event( + session, &event, + session->cb_user_context); +#ifdef XIO_THREAD_SAFE_DEBUG + xio_ctx_debug_thread_lock(connection->ctx); +#endif + } +} + +/*---------------------------------------------------------------------------*/ +/* xio_on_req_recv */ +/*---------------------------------------------------------------------------*/ +static int xio_on_req_recv(struct xio_connection *connection, + struct xio_task *task) +{ + struct xio_session_hdr hdr; + struct xio_msg *msg = &task->imsg; +#ifdef XIO_CFLAG_STAT_COUNTERS + struct xio_statistics *stats = &connection->ctx->stats; + struct xio_vmsg *vmsg = &msg->in; + struct xio_sg_table_ops *sgtbl_ops; + void *sgtbl; + + sgtbl = xio_sg_table_get(&msg->in); + sgtbl_ops = (struct xio_sg_table_ops *) + xio_sg_table_ops_get(msg->in.sgl_type); +#endif + + /* read session header */ + xio_session_read_header(task, &hdr); + + if (connection->req_exp_sn == hdr.sn) { + 
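+		/*
+		 * in-order request: advance the expected sequence number,
+		 * remember it as the last sn to acknowledge and, when flow
+		 * control is enabled, credit back the message/byte budget
+		 * the peer advertised in its session header
+		 */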
connection->req_exp_sn++; + connection->req_ack_sn = hdr.sn; + if (connection->enable_flow_control) { + connection->peer_credits_msgs += hdr.credits_msgs; + connection->peer_credits_bytes += hdr.credits_bytes; + } + connection->restarted = 0; + } else { + if (unlikely(connection->restarted)) { + connection->req_exp_sn = hdr.sn + 1; + connection->restarted = 0; + } else { + ERROR_LOG("ERROR: sn expected:%d, sn arrived:%d\n", + connection->req_exp_sn, hdr.sn); + } + } + /* + DEBUG_LOG("[%s] sn:%d, exp:%d, ack:%d, credits:%d, peer_credits:%d\n", + __func__, + connection->req_sn, connection->req_exp_sn, + connection->req_ack_sn, + connection->credits_msgs, connection->peer_credits_msgs); + */ +#ifdef XIO_SESSION_DEBUG + connection->peer_connection = hdr.connection; + connection->peer_session = hdr.session; +#endif + msg->sn = hdr.serial_num; + msg->flags = 0; + msg->next = NULL; + + if (test_bits(XIO_MSG_FLAG_LAST_IN_BATCH, &task->imsg_flags)) + set_bits(XIO_MSG_FLAG_LAST_IN_BATCH, &msg->flags); + + xio_connection_queue_io_task(connection, task); + + task->state = XIO_TASK_STATE_DELIVERED; + + /* add reference count to protect against release in callback */ + /* add ref to task avoiding race when user call release or send + * completion + */ + if (hdr.flags & XIO_MSG_FLAG_REQUEST_READ_RECEIPT) + xio_task_addref(task); + +#ifdef XIO_CFLAG_STAT_COUNTERS + msg->timestamp = get_cycles(); + xio_stat_inc(stats, XIO_STAT_RX_MSG); + xio_stat_add(stats, XIO_STAT_RX_BYTES, + vmsg->header.iov_len + tbl_length(sgtbl_ops, sgtbl)); +#endif + if (test_bits(XIO_MSG_FLAG_EX_IMM_READ_RECEIPT, &hdr.flags)) { + xio_task_addref(task); + /* send receipt before calling the callback */ + xio_connection_send_read_receipt(connection, msg); + } + + /* notify the upper layer */ + if (task->status) { + xio_session_notify_msg_error(connection, msg, + (enum xio_status)task->status, + XIO_MSG_DIRECTION_IN); + task->status = 0; + } else { + /* check for repeated msgs */ + /* repeated msgs will not be delivered to the application since they were already delivered */ + if (connection->latest_delivered < msg->sn || connection->latest_delivered == 0) { +#ifdef XIO_THREAD_SAFE_DEBUG + xio_ctx_debug_thread_unlock(connection->ctx); +#endif + connection->ses_ops.on_msg( + connection->session, msg, + task->last_in_rxq, + connection->cb_user_context); +#ifdef XIO_THREAD_SAFE_DEBUG + xio_ctx_debug_thread_lock(connection->ctx); +#endif + connection->latest_delivered = msg->sn; + } + } + + if (hdr.flags & XIO_MSG_FLAG_REQUEST_READ_RECEIPT) { + if (task->state == XIO_TASK_STATE_DELIVERED) { + xio_connection_send_read_receipt(connection, msg); + } else { + /* free the ref added in this function */ + xio_tasks_pool_put(task); + } + } + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_on_rsp_recv */ +/*---------------------------------------------------------------------------*/ +static int xio_on_rsp_recv(struct xio_connection *connection, + struct xio_task *task) +{ + struct xio_session_hdr hdr; + struct xio_msg *msg = &task->imsg; + struct xio_msg *omsg; + struct xio_task *sender_task = task->sender_task; + int standalone_receipt = 0; +#ifdef XIO_CFLAG_STAT_COUNTERS + struct xio_statistics *stats = &connection->ctx->stats; +#endif + + if ((connection->state != XIO_CONNECTION_STATE_ONLINE) && + (connection->state != XIO_CONNECTION_STATE_FIN_WAIT_1)) { + DEBUG_LOG("responses received while connection is offline\n"); + /* for various reasons, responses can arrive while connection + * is 
already offline + * release the response, and let it be flushed via "flush" + * mechanism + */ + xio_release_response_task(task); + goto exit; + } + + /* read session header */ + xio_session_read_header(task, &hdr); + + /* standalone receipt */ + if (xio_app_receipt_request(&hdr) == + XIO_MSG_FLAG_EX_RECEIPT_FIRST) + standalone_receipt = 1; + + /* update receive + send window */ + if (connection->rsp_exp_sn == hdr.sn) { + connection->rsp_exp_sn++; + connection->rsp_ack_sn = hdr.sn; + connection->restarted = 0; + if (connection->enable_flow_control) { + connection->peer_credits_msgs += hdr.credits_msgs; + connection->peer_credits_bytes += hdr.credits_bytes; + } + } else { + if (unlikely(connection->restarted)) { + connection->rsp_exp_sn = hdr.sn + 1; + connection->restarted = 0; + } else { + ERROR_LOG("ERROR: expected sn:%d, arrived sn:%d\n", + connection->rsp_exp_sn, hdr.sn); + } + } + /* + DEBUG_LOG("[%s] sn:%d, exp:%d, ack:%d, credits:%d, peer_credits:%d\n", + __func__, + connection->rsp_sn, connection->rsp_exp_sn, + connection->rsp_ack_sn, + connection->credits_msgs, connection->peer_credits_msgs); + */ +#ifdef XIO_SESSION_DEBUG + connection->peer_connection = hdr.connection; + connection->peer_session = hdr.session; +#endif + + msg->sn = hdr.serial_num; + + omsg = sender_task->omsg; + +#ifdef XIO_CFLAG_STAT_COUNTERS + xio_stat_add(stats, XIO_STAT_DELAY, + get_cycles() - omsg->timestamp); + xio_stat_inc(stats, XIO_STAT_RX_MSG); +#endif + omsg->next = NULL; + + xio_clear_ex_flags(&omsg->flags); + + task->connection = connection; + task->session = connection->session; + + /* remove only if not response with "read receipt" */ + if (!standalone_receipt) { + xio_connection_remove_in_flight(connection, omsg); + } else { + if (task->tlv_type == XIO_ONE_WAY_RSP) + if (xio_app_receipt_first_request(&hdr)) + xio_connection_remove_in_flight(connection, + omsg); + } + + omsg->type = (enum xio_msg_type)task->tlv_type; + + /* cache the task in io queue */ + xio_connection_queue_io_task(connection, task); + + /* remove the message from in flight queue */ + + if (task->tlv_type == XIO_ONE_WAY_RSP) { + /* one way message with "read receipt" */ + if (!xio_app_receipt_first_request(&hdr)) + ERROR_LOG("protocol requires first flag to be set. 
" \ + "flags:0x%x\n", hdr.flags); + + if (connection->enable_flow_control) { + struct xio_sg_table_ops *sgtbl_ops; + void *sgtbl; + + sgtbl = xio_sg_table_get(&omsg->out); + sgtbl_ops = (struct xio_sg_table_ops *) + xio_sg_table_ops_get(omsg->out.sgl_type); + + connection->tx_queued_msgs--; + connection->tx_bytes -= + (omsg->out.header.iov_len + + tbl_length(sgtbl_ops, sgtbl)); + } + + omsg->sn = msg->sn; /* one way do have response */ + omsg->receipt_res = (enum xio_receipt_result)hdr.receipt_result; + + if (omsg->flags & + XIO_MSG_FLAG_REQUEST_READ_RECEIPT) { + if (connection->ses_ops.on_msg_delivered) { +#ifdef XIO_THREAD_SAFE_DEBUG + xio_ctx_debug_thread_unlock(connection->ctx); +#endif + connection->ses_ops.on_msg_delivered( + connection->session, + omsg, + task->last_in_rxq, + connection->cb_user_context); +#ifdef XIO_THREAD_SAFE_DEBUG + xio_ctx_debug_thread_lock(connection->ctx); +#endif + } + } else { + if (connection->ses_ops.on_ow_msg_send_complete) { +#ifdef XIO_THREAD_SAFE_DEBUG + xio_ctx_debug_thread_unlock(connection->ctx); +#endif + connection->ses_ops.on_ow_msg_send_complete( + connection->session, omsg, + connection->cb_user_context); +#ifdef XIO_THREAD_SAFE_DEBUG + xio_ctx_debug_thread_lock(connection->ctx); +#endif + } + } + sender_task->omsg = NULL; + xio_release_response_task(task); + } else { + if (xio_app_receipt_first_request(&hdr)) { + if (connection->ses_ops.on_msg_delivered) { + omsg->receipt_res = + (enum xio_receipt_result)hdr.receipt_result; + omsg->sn = hdr.serial_num; +#ifdef XIO_THREAD_SAFE_DEBUG + xio_ctx_debug_thread_unlock(connection->ctx); +#endif + connection->ses_ops.on_msg_delivered( + connection->session, + omsg, + task->last_in_rxq, + connection->cb_user_context); +#ifdef XIO_THREAD_SAFE_DEBUG + xio_ctx_debug_thread_lock(connection->ctx); +#endif + } + /* standalone receipt */ + if (standalone_receipt) { + /* after receipt delivered reproduce the + * original "in" side */ + memcpy(&omsg->in, &sender_task->in_receipt, + sizeof(omsg->in)); + + /* recycle the receipt */ + xio_tasks_pool_put(task); + } + } + if (xio_app_receipt_last_request(&hdr)) { +#ifdef XIO_CFLAG_STAT_COUNTERS + struct xio_vmsg *vmsg = &msg->in; + struct xio_sg_table_ops *sgtbl_ops; + void *sgtbl; + + sgtbl = xio_sg_table_get(&msg->in); + sgtbl_ops = (struct xio_sg_table_ops *) + xio_sg_table_ops_get(msg->in.sgl_type); + xio_stat_add(stats, XIO_STAT_RX_BYTES, + vmsg->header.iov_len + + tbl_length(sgtbl_ops, sgtbl)); +#endif + omsg->request = msg; + if (task->status) { + xio_session_notify_msg_error( + connection, omsg, + (enum xio_status)task->status, + XIO_MSG_DIRECTION_IN); + task->status = 0; + } else { +#ifdef XIO_THREAD_SAFE_DEBUG + xio_ctx_debug_thread_unlock(connection->ctx); +#endif + /*if (connection->ses_ops.on_msg) */ + connection->ses_ops.on_msg( + connection->session, + omsg, + task->last_in_rxq, + connection->cb_user_context); +#ifdef XIO_THREAD_SAFE_DEBUG + xio_ctx_debug_thread_lock(connection->ctx); +#endif + } + } + } + +exit: + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_on_rsp_send_comp */ +/*---------------------------------------------------------------------------*/ +static int xio_on_rsp_send_comp( + struct xio_connection *connection, + struct xio_task *task) +{ + if (connection->is_flushed) { + xio_tasks_pool_put(task); + goto exit; + } + + /* remove the message from in flight queue */ + xio_connection_remove_in_flight(connection, task->omsg); + + /* + * completion of receipt + */ + if 
((task->omsg_flags & + (XIO_MSG_FLAG_EX_RECEIPT_FIRST | XIO_MSG_FLAG_EX_RECEIPT_LAST)) == + XIO_MSG_FLAG_EX_RECEIPT_FIRST) { + xio_connection_release_read_receipt(connection, task->omsg); + xio_release_response_task(task); + } else { + /* send completion notification only to responder to + * release responses + */ + xio_clear_ex_flags(&task->omsg->flags); + if (connection->ses_ops.on_msg_send_complete) { +#ifdef XIO_THREAD_SAFE_DEBUG + xio_ctx_debug_thread_unlock(connection->ctx); +#endif + connection->ses_ops.on_msg_send_complete( + connection->session, task->omsg, + connection->cb_user_context); +#ifdef XIO_THREAD_SAFE_DEBUG + xio_ctx_debug_thread_lock(connection->ctx); +#endif + } + /* recycle the task */ + xio_tasks_pool_put(task); + } + +exit: + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_on_credits_ack_recv */ +/*---------------------------------------------------------------------------*/ +int xio_on_credits_ack_recv(struct xio_connection *connection, + struct xio_task *task) +{ + struct xio_session_hdr hdr; + + if (connection->enable_flow_control == 0) + return 0; + + /* read session header */ + xio_session_read_header(task, &hdr); + + if (connection->req_exp_sn == hdr.sn) { + connection->req_exp_sn++; + connection->req_ack_sn = hdr.sn; + connection->peer_credits_msgs += hdr.credits_msgs; + connection->peer_credits_bytes += hdr.credits_bytes; + } else { + ERROR_LOG("ERROR: sn expected:%d, sn arrived:%d\n", + connection->req_exp_sn, hdr.sn); + } + connection->credits_msgs++; + xio_tasks_pool_put(task); + /* + DEBUG_LOG("[%s] sn:%d, exp:%d, ack:%d, credits:%d, peer_credits:%d\n", + __func__, + connection->sn, connection->exp_sn, connection->ack_sn, + connection->credits_msgs, connection->peer_credits_msgs); + */ + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_on_ow_req_send_comp */ +/*---------------------------------------------------------------------------*/ +static int xio_on_ow_req_send_comp( + struct xio_connection *connection, + struct xio_task *task) +{ +#ifdef XIO_CFLAG_STAT_COUNTERS + struct xio_statistics *stats = &connection->ctx->stats; +#endif + struct xio_msg *omsg = task->omsg; + + if (connection->is_flushed) { + xio_tasks_pool_put(task); + goto exit; + } + + if (!omsg || omsg->flags & XIO_MSG_FLAG_REQUEST_READ_RECEIPT || + task->omsg_flags & XIO_MSG_FLAG_REQUEST_READ_RECEIPT || + task->omsg->flags & XIO_MSG_FLAG_EX_IMM_READ_RECEIPT) + return 0; +#ifdef XIO_CFLAG_STAT_COUNTERS + xio_stat_add(stats, XIO_STAT_DELAY, + get_cycles() - omsg->timestamp); + xio_stat_inc(stats, XIO_STAT_RX_MSG); /* need to replace with + * TX_COMP + */ +#endif + xio_connection_remove_in_flight(connection, omsg); + omsg->flags = task->omsg_flags; + xio_clear_ex_flags(&omsg->flags); + + if (connection->enable_flow_control) { + struct xio_sg_table_ops *sgtbl_ops; + void *sgtbl; + + sgtbl = xio_sg_table_get(&omsg->out); + sgtbl_ops = (struct xio_sg_table_ops *) + xio_sg_table_ops_get(omsg->out.sgl_type); + + connection->tx_queued_msgs--; + connection->tx_bytes -= + (omsg->out.header.iov_len + + tbl_length(sgtbl_ops, sgtbl)); + } + + /* send completion notification to + * release request + */ + if (connection->ses_ops.on_ow_msg_send_complete) { +#ifdef XIO_THREAD_SAFE_DEBUG + xio_ctx_debug_thread_unlock(connection->ctx); +#endif + connection->ses_ops.on_ow_msg_send_complete( + connection->session, omsg, + connection->cb_user_context); +#ifdef XIO_THREAD_SAFE_DEBUG + 
xio_ctx_debug_thread_lock(connection->ctx); +#endif + } + xio_tasks_pool_put(task); + +exit: + return 0; +} + +int xio_on_rdma_direct_comp(struct xio_session *session, + struct xio_nexus *nexus, + union xio_nexus_event_data *event_data) +{ + struct xio_task *task = event_data->msg.task; + struct xio_msg *omsg = task->omsg; + struct xio_connection *connection = task->connection; + + if (unlikely(task->tlv_type != XIO_MSG_TYPE_RDMA)) { + ERROR_LOG("Unexpected message type %u\n", + task->tlv_type); + return 0; + } + + if (connection->is_flushed) { + xio_tasks_pool_put(task); + goto xmit; + } + + if (!omsg) + return 0; + + xio_connection_remove_in_flight(connection, omsg); + omsg->flags = task->omsg_flags; + connection->tx_queued_msgs--; + + if (connection->ses_ops.on_rdma_direct_complete) { +#ifdef XIO_THREAD_SAFE_DEBUG + xio_ctx_debug_thread_unlock(connection->ctx); +#endif + connection->ses_ops.on_rdma_direct_complete( + connection->session, omsg, + connection->cb_user_context); +#ifdef XIO_THREAD_SAFE_DEBUG + xio_ctx_debug_thread_lock(connection->ctx); +#endif + } + xio_tasks_pool_put(task); + +xmit: + /* now try to send */ + xio_connection_xmit_msgs(connection); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_on_nexus_disconnected */ +/*---------------------------------------------------------------------------*/ +int xio_on_nexus_disconnected(struct xio_session *session, + struct xio_nexus *nexus, + union xio_nexus_event_data *event_data) +{ + struct xio_connection *connection; + + DEBUG_LOG("xio_session_on_nexus_disconnected. session:%p, nexus:%p\n", + session, nexus); + + if (session->lead_connection && + session->lead_connection->nexus == nexus) { + connection = session->lead_connection; + connection->close_reason = XIO_E_SESSION_DISCONNECTED; + xio_connection_disconnected(connection); + } else if (session->redir_connection && + session->redir_connection->nexus == nexus) { + connection = session->redir_connection; + connection->close_reason = XIO_E_SESSION_DISCONNECTED; + xio_connection_disconnected(connection); + } else { + spin_lock(&session->connections_list_lock); + connection = xio_session_find_connection(session, nexus); + spin_unlock(&session->connections_list_lock); + connection->close_reason = XIO_E_SESSION_DISCONNECTED; + + /* disconnection arrive during active closing phase */ + if (connection->state != XIO_CONNECTION_STATE_CLOSED) { + kref_init(&connection->kref); + xio_connection_disconnected(connection); + } + } + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_on_nexus_reconnecting */ +/*---------------------------------------------------------------------------*/ +int xio_on_nexus_reconnecting(struct xio_session *session, + struct xio_nexus *nexus) +{ + struct xio_connection *connection; + + if (session->lead_connection && + session->lead_connection->nexus == nexus) + connection = session->lead_connection; + else + connection = xio_session_find_connection(session, nexus); + + if (connection) + xio_connection_reconnect(connection); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_on_nexus_reconnected */ +/*---------------------------------------------------------------------------*/ +int xio_on_nexus_reconnected(struct xio_session *session, + struct xio_nexus *nexus) +{ + struct xio_connection *connection; + + if (session->lead_connection && + session->lead_connection->nexus == nexus) + 
connection = session->lead_connection; + else + connection = xio_session_find_connection(session, nexus); + + if (connection) + xio_connection_restart(connection); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_on_nexus_closed */ +/*---------------------------------------------------------------------------*/ +int xio_on_nexus_closed(struct xio_session *session, + struct xio_nexus *nexus, + union xio_nexus_event_data *event_data) +{ + struct xio_connection *connection; + + TRACE_LOG("session:%p - nexus:%p close complete\n", session, nexus); + + /* no more notifications */ + xio_nexus_unreg_observer(nexus, &session->observer); + + if (session->lead_connection && + session->lead_connection->nexus == nexus) + connection = session->lead_connection; + else + connection = xio_session_find_connection(session, nexus); + if (connection) + connection->nexus = NULL; + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_on_nexus_message_error */ +/*---------------------------------------------------------------------------*/ +int xio_on_nexus_message_error(struct xio_session *session, + struct xio_nexus *nexus, + union xio_nexus_event_data *event_data) +{ + struct xio_task *task = event_data->msg_error.task; + + xio_connection_remove_msg_from_queue(task->connection, task->omsg); + xio_connection_queue_io_task(task->connection, task); + + if (task->session->ses_ops.on_msg_error && IS_APPLICATION_MSG(task->tlv_type)) { +#ifdef XIO_THREAD_SAFE_DEBUG + xio_ctx_debug_thread_unlock(task->connection->ctx); +#endif + task->session->ses_ops.on_msg_error( + task->session, + event_data->msg_error.reason, + event_data->msg_error.direction, + task->omsg, + task->connection->cb_user_context); +#ifdef XIO_THREAD_SAFE_DEBUG + xio_ctx_debug_thread_lock(task->connection->ctx); +#endif + } + + if (IS_REQUEST(task->tlv_type) || task->tlv_type == XIO_MSG_TYPE_RDMA) + xio_tasks_pool_put(task); + else + xio_release_response_task(task); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_on_nexus_error */ +/*---------------------------------------------------------------------------*/ +int xio_on_nexus_error(struct xio_session *session, struct xio_nexus *nexus, + union xio_nexus_event_data *event_data) +{ + struct xio_connection *connection, *next_connection; + + /* disable the teardown */ + session->disable_teardown = 0; + + switch (session->state) { + case XIO_SESSION_STATE_CONNECT: + case XIO_SESSION_STATE_REDIRECTED: + session->state = XIO_SESSION_STATE_REFUSED; + list_for_each_entry_safe( + connection, next_connection, + &session->connections_list, + connections_list_entry) { + xio_connection_error_event(connection, + event_data->error.reason); + } + + break; + default: + connection = xio_session_find_connection(session, nexus); + xio_connection_error_event(connection, + event_data->error.reason); + break; + } + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_on_new_message */ +/*---------------------------------------------------------------------------*/ +int xio_on_new_message(struct xio_session *s, + struct xio_nexus *nexus, + union xio_nexus_event_data *event_data) +{ + struct xio_task *task = event_data->msg.task; + struct xio_connection *connection = NULL; + struct xio_session *session = s; + int retval = -1; + int xmit = 0; + + if (task->sender_task) { + session = 
task->sender_task->session; + connection = task->sender_task->connection; + } + + if (!session) { + session = xio_find_session(task); + if (!session) { + ERROR_LOG("failed to find session\n"); + xio_tasks_pool_put(task); + return -1; + } + } + + if (!connection) { + connection = xio_session_find_connection(session, nexus); + if (!connection) { + /* leading connection is refused */ + if (session->lead_connection && + session->lead_connection->nexus == nexus) { + connection = session->lead_connection; + } else if (session->redir_connection && + session->redir_connection->nexus == nexus) { + /* redirected connection is refused */ + connection = session->redir_connection; + } else { + ERROR_LOG("failed to find connection\n"); + xio_tasks_pool_put(task); + return -1; + } + } + } + + task->session = session; + task->connection = connection; + + switch (task->tlv_type) { + case XIO_MSG_REQ: + case XIO_ONE_WAY_REQ: + retval = xio_on_req_recv(connection, task); + xmit = 1; + break; + case XIO_MSG_RSP: + case XIO_ONE_WAY_RSP: + retval = xio_on_rsp_recv(connection, task); + xmit = 1; + break; + case XIO_ACK_REQ: + retval = xio_on_credits_ack_recv(connection, task); + xmit = 1; + break; + case XIO_FIN_REQ: + retval = xio_on_fin_req_recv(connection, task); + break; + case XIO_FIN_RSP: + retval = xio_on_fin_ack_recv(connection, task); + break; + case XIO_SESSION_SETUP_REQ: + retval = xio_on_setup_req_recv(connection, task); + xmit = 1; + break; + case XIO_SESSION_SETUP_RSP: + retval = xio_on_setup_rsp_recv(connection, task); + xmit = 1; + break; + case XIO_CONNECTION_HELLO_REQ: + retval = xio_on_connection_hello_req_recv(connection, task); + xmit = 1; + break; + case XIO_CONNECTION_HELLO_RSP: + retval = xio_on_connection_hello_rsp_recv(connection, task); + xmit = 1; + break; + case XIO_CONNECTION_KA_REQ: + retval = xio_on_connection_ka_req_recv(connection, task); + break; + case XIO_CONNECTION_KA_RSP: + retval = xio_on_connection_ka_rsp_recv(connection, task); + break; + default: + retval = -1; + break; + } + + /* now try to send */ + if (xmit) + xio_connection_xmit_msgs(connection); + + if (retval != 0) + ERROR_LOG("receiving new message failed. 
type:0x%x\n", + task->tlv_type); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_on_send_completion */ +/*---------------------------------------------------------------------------*/ +int xio_on_send_completion(struct xio_session *session, + struct xio_nexus *nexus, + union xio_nexus_event_data *event_data) +{ + struct xio_task *task = event_data->msg.task; + struct xio_connection *connection; + int retval = -1; + int xmit = 0; + + connection = task->connection; + + switch (task->tlv_type) { + case XIO_MSG_REQ: + case XIO_SESSION_SETUP_REQ: + retval = 0; + break; + case XIO_MSG_RSP: + case XIO_ONE_WAY_RSP: + retval = xio_on_rsp_send_comp(connection, task); + xmit = 1; + break; + case XIO_ONE_WAY_REQ: + retval = xio_on_ow_req_send_comp(connection, task); + xmit = 1; + break; + case XIO_ACK_REQ: + retval = xio_on_credits_ack_send_comp(connection, task); + xmit = 1; + break; + case XIO_FIN_REQ: + retval = xio_on_fin_req_send_comp(connection, task); + break; + case XIO_FIN_RSP: + retval = xio_on_fin_ack_send_comp(connection, task); + break; + case XIO_SESSION_SETUP_RSP: + retval = xio_on_setup_rsp_send_comp(connection, task); + break; + case XIO_CONNECTION_HELLO_REQ: + retval = 0; + break; + case XIO_CONNECTION_HELLO_RSP: + retval = xio_on_connection_hello_rsp_send_comp(connection, + task); + xmit = 1; + break; + case XIO_CONNECTION_KA_REQ: + retval = 0; + break; + case XIO_CONNECTION_KA_RSP: + retval = xio_on_connection_ka_rsp_send_comp(connection, + task); + break; + default: + break; + } + /* now try to send */ + if (xmit) + xio_connection_xmit_msgs(connection); + + if (retval != 0) + ERROR_LOG("message send completion failed. type:0x%x\n", + task->tlv_type); + + return 0; +} +/*---------------------------------------------------------------------------*/ +/* xio_on_head_alloc_buf */ +/*---------------------------------------------------------------------------*/ +int xio_on_head_alloc_buf(struct xio_session *session, + struct xio_nexus *nexus, + union xio_nexus_event_data *event_data) +{ + struct xio_task *task = event_data->assign_in_buf.task; + struct xio_connection *connection; + int retval; + + if (!session) + session = xio_find_session(task); + + connection = xio_session_find_connection(session, nexus); + if (!connection) { + connection = xio_session_assign_nexus(session, nexus); + if (!connection) { + ERROR_LOG("failed to find connection :%p. 
" \ + "dropping message:%d\n", nexus, + event_data->msg.op); + return -1; + } + } + + if (connection->ses_ops.rev_msg_head_alloc_buf) { +#ifdef XIO_THREAD_SAFE_DEBUG + xio_ctx_debug_thread_unlock(connection->ctx); +#endif + retval = connection->ses_ops.rev_msg_head_alloc_buf( + session, + event_data->alloc_head_buf.header, + connection->cb_user_context); +#ifdef XIO_THREAD_SAFE_DEBUG + xio_ctx_debug_thread_lock(connection->ctx); +#endif + event_data->alloc_head_buf.is_assigned = (retval == 0); + return 0; + } + event_data->alloc_head_buf.is_assigned = 0; + return 0; +} +/*---------------------------------------------------------------------------*/ +/* xio_on_data_alloc_buf */ +/*---------------------------------------------------------------------------*/ +int xio_on_data_alloc_buf(struct xio_session *session, + struct xio_nexus *nexus, + union xio_nexus_event_data *event_data) +{ + struct xio_task *task = event_data->assign_in_buf.task; + struct xio_connection *connection; + int retval; + + if (!session) + session = xio_find_session(task); + + connection = xio_session_find_connection(session, nexus); + if (!connection) { + connection = xio_session_assign_nexus(session, nexus); + if (!connection) { + ERROR_LOG("failed to find connection :%p. " \ + "dropping message:%d\n", nexus, + event_data->msg.op); + return -1; + } + } + + if (connection->ses_ops.rev_msg_data_alloc_buf) { +#ifdef XIO_THREAD_SAFE_DEBUG + xio_ctx_debug_thread_unlock(connection->ctx); +#endif + retval = connection->ses_ops.rev_msg_data_alloc_buf( + session, + &task->imsg, + connection->cb_user_context); +#ifdef XIO_THREAD_SAFE_DEBUG + xio_ctx_debug_thread_lock(connection->ctx); +#endif + event_data->alloc_data_buf.is_assigned = (retval == 0); + return 0; + } + event_data->alloc_data_buf.is_assigned = 0; + return 0; +} +/*---------------------------------------------------------------------------*/ +/* xio_on_assign_in_buf */ +/*---------------------------------------------------------------------------*/ +int xio_on_assign_in_buf(struct xio_session *session, + struct xio_nexus *nexus, + union xio_nexus_event_data *event_data) +{ + struct xio_task *task = event_data->assign_in_buf.task; + struct xio_connection *connection; + int retval; + + if (!session) + session = xio_find_session(task); + + connection = xio_session_find_connection(session, nexus); + if (!connection) { + connection = xio_session_assign_nexus(session, nexus); + if (!connection) { + ERROR_LOG("failed to find connection :%p. 
" \ + "dropping message:%d\n", nexus, + event_data->msg.op); + return -1; + } + } + + if (connection->ses_ops.assign_data_in_buf) { +#ifdef XIO_THREAD_SAFE_DEBUG + xio_ctx_debug_thread_unlock(connection->ctx); +#endif + retval = connection->ses_ops.assign_data_in_buf( + &task->imsg, + connection->cb_user_context); +#ifdef XIO_THREAD_SAFE_DEBUG + xio_ctx_debug_thread_lock(connection->ctx); +#endif + event_data->assign_in_buf.is_assigned = (retval == 0); + return 0; + } + event_data->assign_in_buf.is_assigned = 0; + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_on_cancel_request */ +/*---------------------------------------------------------------------------*/ +int xio_on_cancel_request(struct xio_session *sess, + struct xio_nexus *nexus, + union xio_nexus_event_data *event_data) +{ + struct xio_session_cancel_hdr hdr; + struct xio_msg *req = NULL; + struct xio_session_cancel_hdr *tmp_hdr; + struct xio_session *session; + struct xio_connection *connection; + struct xio_task *task; + struct xio_observer *observer; + + tmp_hdr = (struct xio_session_cancel_hdr *) + event_data->cancel.ulp_msg; + hdr.sn = ntohll(tmp_hdr->sn); + hdr.responder_session_id = ntohl(tmp_hdr->responder_session_id); + + observer = xio_nexus_observer_lookup(nexus, hdr.responder_session_id); + if (!observer) { + ERROR_LOG("failed to find session\n"); + return -1; + } + + session = (struct xio_session *)observer->impl; + + connection = xio_session_find_connection(session, nexus); + if (!connection) { + ERROR_LOG("failed to find session\n"); + return -1; + } + + /* lookup for task in io list */ + task = xio_connection_find_io_task(connection, hdr.sn); + if (task) { + if (connection->ses_ops.on_cancel_request) { +#ifdef XIO_THREAD_SAFE_DEBUG + xio_ctx_debug_thread_unlock(connection->ctx); +#endif + connection->ses_ops.on_cancel_request( + connection->session, + &task->imsg, + connection->cb_user_context); + return 0; +#ifdef XIO_THREAD_SAFE_DEBUG + xio_ctx_debug_thread_lock(connection->ctx); +#endif + } + WARN_LOG("cancel is not supported on responder\n"); + } + TRACE_LOG("message to cancel not found %llu\n", hdr.sn); + + req = (struct xio_msg *)kcalloc(1, sizeof(*req), GFP_KERNEL); + if (!req) { + ERROR_LOG("req allocation failed\n"); + return -1; + } + + req->sn = hdr.sn; + xio_connection_send_cancel_response(connection, req, NULL, + XIO_E_MSG_NOT_FOUND); + kfree(req); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_on_cancel_response */ +/*---------------------------------------------------------------------------*/ +int xio_on_cancel_response(struct xio_session *sess, + struct xio_nexus *nexus, + union xio_nexus_event_data *event_data) +{ + struct xio_session_cancel_hdr hdr; + struct xio_session_cancel_hdr *tmp_hdr; + struct xio_observer *observer; + struct xio_session *session; + struct xio_connection *connection; + struct xio_msg *msg = NULL; + struct xio_msg *pmsg; + + if (!event_data) { + xio_set_error(EINVAL); + ERROR_LOG("null event_data\n"); + return -1; + } + + if (!event_data->cancel.task) { + tmp_hdr = (struct xio_session_cancel_hdr *) + event_data->cancel.ulp_msg; + hdr.sn = ntohll(tmp_hdr->sn); + hdr.requester_session_id = ntohl(tmp_hdr->requester_session_id); + + observer = xio_nexus_observer_lookup(nexus, + hdr.requester_session_id); + if (!observer) { + ERROR_LOG("failed to find session\n"); + return -1; + } + session = (struct xio_session *)observer->impl; + + /* large object - allocate 
it */ + msg = (struct xio_msg *)kcalloc(1, sizeof(*msg), GFP_KERNEL); + if (!msg) { + ERROR_LOG("msg allocation failed\n"); + return -1; + } + + pmsg = msg; /* fake a message */ + msg->sn = hdr.sn; + } else { + session = event_data->cancel.task->session; + pmsg = event_data->cancel.task->omsg; + hdr.sn = pmsg->sn; + } + + connection = xio_session_find_connection(session, nexus); + if (!connection) { + ERROR_LOG("failed to find session\n"); + kfree(msg); + return -1; + } + + /* need to release the last reference since answer is not expected */ + if (event_data->cancel.result == XIO_E_MSG_CANCELED && + event_data->cancel.task) + xio_tasks_pool_put(event_data->cancel.task); + + if (connection->ses_ops.on_cancel) + connection->ses_ops.on_cancel( + session, + pmsg, + event_data->cancel.result, + connection->cb_user_context); + else + ERROR_LOG("cancel is not supported\n"); + + kfree(msg); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_session_create */ +/*---------------------------------------------------------------------------*/ +struct xio_session *xio_session_create(struct xio_session_params *params) +{ + struct xio_session *session = NULL; + int retval; + int uri_len = 0; + + /* input validation */ + if (!params || !params->uri) { + xio_set_error(EINVAL); + ERROR_LOG("xio_session_open: invalid parameter\n"); + return NULL; + } + uri_len = strlen(params->uri); + + /* extract portal from uri */ + /* create the session */ + session = (struct xio_session *) + kcalloc(1, sizeof(struct xio_session), GFP_KERNEL); + if (!session) { + ERROR_LOG("failed to create session\n"); + xio_set_error(ENOMEM); + return NULL; + } + + XIO_OBSERVER_INIT(&session->observer, session, + (params->type == XIO_SESSION_SERVER) ? 
+ xio_server_on_nexus_event : + xio_client_on_nexus_event); + + INIT_LIST_HEAD(&session->connections_list); + + session->hs_private_data_len = params->private_data_len; + + /* copy private data if exist */ + if (session->hs_private_data_len) { + session->hs_private_data = kmalloc(session->hs_private_data_len, + GFP_KERNEL); + if (!session->hs_private_data) { + xio_set_error(ENOMEM); + goto cleanup; + } + memcpy(session->hs_private_data, params->private_data, + session->hs_private_data_len); + } + mutex_init(&session->lock); + spin_lock_init(&session->connections_list_lock); + + /* fill session data*/ + session->type = params->type; + session->cb_user_context = params->user_context; + + session->trans_sn = params->initial_sn; + session->state = XIO_SESSION_STATE_INIT; + session->snd_queue_depth_msgs = g_options.snd_queue_depth_msgs; + session->rcv_queue_depth_msgs = g_options.rcv_queue_depth_msgs; + session->snd_queue_depth_bytes = g_options.snd_queue_depth_bytes; + session->rcv_queue_depth_bytes = g_options.rcv_queue_depth_bytes; + session->connection_srv_first = NULL; + + memcpy(&session->ses_ops, params->ses_ops, + sizeof(*params->ses_ops)); + + session->uri_len = uri_len; + session->uri = kstrdup(params->uri, GFP_KERNEL); + if (!session->uri) { + xio_set_error(ENOMEM); + goto cleanup2; + } + + /* add the session to storage */ + retval = xio_sessions_cache_add(session, &session->session_id); + if (retval != 0) { + ERROR_LOG("adding session to sessions cache failed :%p\n", + session); + goto cleanup3; + } + xio_idr_add_uobj(usr_idr, session, "xio_session"); + + return session; + +cleanup3: + kfree(session->uri); +cleanup2: + kfree(session->hs_private_data); +cleanup: + kfree(session); + + ERROR_LOG("session creation failed\n"); + + return NULL; +} +EXPORT_SYMBOL(xio_session_create); + +/*---------------------------------------------------------------------------*/ +/* xio_session_post_destroy */ +/*---------------------------------------------------------------------------*/ +void xio_session_post_destroy(void *_session) +{ + int found; + int i; + struct xio_session *session = (struct xio_session *)_session; + + + if (session->teardown_work_ctx) { + xio_context_unreg_observer(session->teardown_work_ctx, + &session->ctx_observer); + xio_ctx_del_work(session->teardown_work_ctx, + &session->teardown_work); + } + + if (!list_empty(&session->connections_list)) { + xio_set_error(EBUSY); + ERROR_LOG("xio_session_destroy failed: " \ + "connections are still open\n"); + return; + } + + found = xio_idr_lookup_uobj(usr_idr, session); + if (found) { + xio_idr_remove_uobj(usr_idr, session); + } else { + ERROR_LOG("session not found:%p\n", session); + xio_set_error(XIO_E_USER_OBJ_NOT_FOUND); + return; + } + + TRACE_LOG("session destroy:%p\n", session); + + session->state = XIO_SESSION_STATE_CLOSED; + + /* unregister session from context */ + xio_sessions_cache_remove(session->session_id); + for (i = 0; i < session->services_array_len; i++) + kfree(session->services_array[i]); + for (i = 0; i < session->portals_array_len; i++) + kfree(session->portals_array[i]); + kfree(session->services_array); + kfree(session->portals_array); + kfree(session->hs_private_data); + kfree(session->uri); + XIO_OBSERVER_DESTROY(&session->observer); + XIO_OBSERVER_DESTROY(&session->ctx_observer); + + mutex_destroy(&session->lock); + kfree(session); + + return; +} + +/*---------------------------------------------------------------------------*/ +/* xio_session_destroy */ 
+/*---------------------------------------------------------------------------*/ +int xio_session_destroy(struct xio_session *session) +{ + if (!session) + return 0; + +#ifdef XIO_THREAD_SAFE_DEBUG + if (session->teardown_work_ctx) + /* not locking if the session did not contain active conn */ + xio_ctx_debug_thread_lock(session->teardown_work_ctx); +#endif + + TRACE_LOG("xio_post_destroy_session seesion:%p\n", session); + + if (session->teardown_work_ctx && + xio_ctx_is_work_in_handler(session->teardown_work_ctx, + &session->teardown_work)) { + xio_context_unreg_observer(session->teardown_work_ctx, + &session->ctx_observer); + + xio_ctx_set_work_destructor( + session->teardown_work_ctx, session, + xio_session_post_destroy, + &session->teardown_work); + } else { + xio_session_post_destroy(session); + } +#ifdef XIO_THREAD_SAFE_DEBUG + if (session->teardown_work_ctx) + xio_ctx_debug_thread_unlock(session->teardown_work_ctx); +#endif + + return 0; +} +EXPORT_SYMBOL(xio_session_destroy); + +/*---------------------------------------------------------------------------*/ +/* xio_session_assign_ops */ +/*---------------------------------------------------------------------------*/ +void xio_session_assign_ops(struct xio_session *session, + struct xio_session_ops *ops) +{ + memcpy(&session->ses_ops, ops, sizeof(*ops)); +} + +/*---------------------------------------------------------------------------*/ +/* xio_session_event_str */ +/*---------------------------------------------------------------------------*/ +const char *xio_session_event_str(enum xio_session_event event) +{ + switch (event) { + case XIO_SESSION_REJECT_EVENT: + return "session reject"; + case XIO_SESSION_TEARDOWN_EVENT: + return "session teardown"; + case XIO_SESSION_NEW_CONNECTION_EVENT: + return "new connection"; + case XIO_SESSION_CONNECTION_ESTABLISHED_EVENT: + return "connection established"; + case XIO_SESSION_CONNECTION_CLOSED_EVENT: + return "connection closed"; + case XIO_SESSION_CONNECTION_DISCONNECTED_EVENT: + return "connection disconnected"; + case XIO_SESSION_CONNECTION_REFUSED_EVENT: + return "connection refused"; + case XIO_SESSION_CONNECTION_TEARDOWN_EVENT: + return "connection teardown"; + case XIO_SESSION_CONNECTION_ERROR_EVENT: + return "connection error"; + case XIO_SESSION_ERROR_EVENT: + return "session error"; + case XIO_SESSION_CONNECTION_RECONNECTING_EVENT: + return "connection reconnecting"; + case XIO_SESSION_CONNECTION_RECONNECTED_EVENT: + return "connection reconnected"; + }; + return "unknown session event"; +} +EXPORT_SYMBOL(xio_session_event_str); + +/*---------------------------------------------------------------------------*/ +/* xio_query_session */ +/*---------------------------------------------------------------------------*/ +int xio_query_session(struct xio_session *session, + struct xio_session_attr *attr, + int attr_mask) +{ + if (!session || !attr) { + xio_set_error(EINVAL); + ERROR_LOG("invalid parameters\n"); + return -1; + } + if (attr_mask & XIO_SESSION_ATTR_USER_CTX) + attr->user_context = session->cb_user_context; + + if (attr_mask & XIO_SESSION_ATTR_SES_OPS) + attr->ses_ops = &session->ses_ops; + + if (attr_mask & XIO_SESSION_ATTR_URI) + attr->uri = session->uri; + + return 0; +} +EXPORT_SYMBOL(xio_query_session); + +/*---------------------------------------------------------------------------*/ +/* xio_modify_session */ +/*---------------------------------------------------------------------------*/ +int xio_modify_session(struct xio_session *session, + struct xio_session_attr 
*attr, + int attr_mask) +{ + if (!session || !attr) { + xio_set_error(EINVAL); + ERROR_LOG("invalid parameters\n"); + return -1; + } + + if (attr_mask & XIO_SESSION_ATTR_USER_CTX) + session->cb_user_context = attr->user_context; + + return 0; +} +EXPORT_SYMBOL(xio_modify_session); + +/*---------------------------------------------------------------------------*/ +/* xio_get_connection */ +/*---------------------------------------------------------------------------*/ +struct xio_connection *xio_get_connection(struct xio_session *session, + struct xio_context *ctx) +{ + ERROR_LOG("%s function have been deprecated. " \ + "That means it have been replaced by new function or" \ + "is no longer supported, and may be removed" \ + "from future versions. " \ + "All code that uses the functions should" \ + "be converted to use its replacement if one exists.\n", + __func__); + return xio_session_find_connection_by_ctx(session, ctx); +} + +/*---------------------------------------------------------------------------*/ +/* xio_session_notify_cancel */ +/*---------------------------------------------------------------------------*/ +int xio_session_notify_cancel(struct xio_connection *connection, + struct xio_msg *req, enum xio_status result) +{ + /* notify the upper layer */ + if (connection->ses_ops.on_cancel) { +#ifdef XIO_THREAD_SAFE_DEBUG + xio_ctx_debug_thread_unlock(connection->ctx); +#endif + connection->ses_ops.on_cancel( + connection->session, req, + result, + connection->cb_user_context); +#ifdef XIO_THREAD_SAFE_DEBUG + xio_ctx_debug_thread_lock(connection->ctx); +#endif + } + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_session_notify_msg_error */ +/*---------------------------------------------------------------------------*/ +int xio_session_notify_msg_error(struct xio_connection *connection, + struct xio_msg *msg, enum xio_status result, + enum xio_msg_direction direction) +{ + /* notify the upper layer */ + if (connection->ses_ops.on_msg_error && IS_APPLICATION_MSG(msg->type)) { +#ifdef XIO_THREAD_SAFE_DEBUG + xio_ctx_debug_thread_unlock(connection->ctx); +#endif + connection->ses_ops.on_msg_error( + connection->session, + result, direction, msg, + connection->cb_user_context); +#ifdef XIO_THREAD_SAFE_DEBUG + xio_ctx_debug_thread_lock(connection->ctx); +#endif + } + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_session_on_context_event */ +/*---------------------------------------------------------------------------*/ +static int xio_session_on_context_event(void *observer, void *sender, int event, + void *event_data) +{ + struct xio_session *session = (struct xio_session *)observer; + + if (event == XIO_CONTEXT_EVENT_CLOSE) { + TRACE_LOG("context: [close] ctx:%p\n", sender); + + xio_context_unreg_observer(session->teardown_work_ctx, + &session->ctx_observer); + /* clean the context so that upon session destroy do not + * do not handle workqueue + */ + xio_ctx_del_work(session->teardown_work_ctx, + &session->teardown_work); + + session->teardown_work_ctx = NULL; + } + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_session_pre_teardown */ +/*---------------------------------------------------------------------------*/ +static void xio_session_pre_teardown(void *_session) +{ + struct xio_session *session = (struct xio_session *)_session; + int destroy_session = 0; + int reason; + + switch (session->state) { + 
case XIO_SESSION_STATE_REJECTED: + reason = XIO_E_SESSION_REJECTED; + break; + case XIO_SESSION_STATE_ACCEPTED: + if (session->type == XIO_SESSION_SERVER) + reason = XIO_E_SESSION_DISCONNECTED; + else + reason = XIO_E_SESSION_REFUSED; + break; + default: + reason = session->teardown_reason; + break; + } + mutex_lock(&session->lock); + + spin_lock(&session->connections_list_lock); + destroy_session = ((session->connections_nr == 0) && + !session->lead_connection && + !session->redir_connection); + + spin_unlock(&session->connections_list_lock); + + /* last chance to teardown */ + if (destroy_session) { + /* remove the session from cache */ + xio_sessions_cache_remove(session->session_id); + mutex_unlock(&session->lock); + session->state = XIO_SESSION_STATE_CLOSING; + session->teardown_reason = reason; + + /* start listen to context events - context can be destroyed + * while session still alive */ + xio_context_unreg_observer(session->teardown_work_ctx, + &session->ctx_observer); + + XIO_OBSERVER_INIT(&session->ctx_observer, session, + xio_session_on_context_event); + xio_context_reg_observer(session->teardown_work_ctx, + &session->ctx_observer); + + xio_session_notify_teardown(session, session->teardown_reason); + } else { + mutex_unlock(&session->lock); + } +} + +/*---------------------------------------------------------------------------*/ +/* xio_session_init_teardown */ +/*---------------------------------------------------------------------------*/ +void xio_session_init_teardown(struct xio_session *session, + struct xio_context *ctx, + int close_reason) +{ + session->teardown_reason = close_reason; + session->teardown_work_ctx = ctx; + + xio_ctx_add_work( + ctx, + session, + xio_session_pre_teardown, + &session->teardown_work); +} + diff --git a/open_src/xio/src/common/xio_session.h b/open_src/xio/src/common/xio_session.h new file mode 100644 index 0000000..65ace1f --- /dev/null +++ b/open_src/xio/src/common/xio_session.h @@ -0,0 +1,249 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef XIO_SESSION_H +#define XIO_SESSION_H + +/*---------------------------------------------------------------------------*/ +/* forward declarations */ +/*---------------------------------------------------------------------------*/ +struct xio_session; + +/*---------------------------------------------------------------------------*/ +/* enums */ +/*---------------------------------------------------------------------------*/ +enum xio_session_state { + XIO_SESSION_STATE_INIT, + XIO_SESSION_STATE_CONNECT, + XIO_SESSION_STATE_ONLINE, + XIO_SESSION_STATE_REDIRECTED, + XIO_SESSION_STATE_ACCEPTED, + XIO_SESSION_STATE_REJECTED, + XIO_SESSION_STATE_REFUSED, + XIO_SESSION_STATE_CLOSING, + XIO_SESSION_STATE_CLOSED, +}; + +/*---------------------------------------------------------------------------*/ +/* structures */ +/*---------------------------------------------------------------------------*/ +struct xio_session { + struct xio_transport_msg_validators_cls *validators_cls; + struct xio_session_ops ses_ops; + + uint64_t trans_sn; /* transaction sn */ + uint32_t session_id; + uint32_t peer_session_id; + uint32_t connections_nr; + uint16_t snd_queue_depth_msgs; + uint16_t rcv_queue_depth_msgs; + uint16_t peer_snd_queue_depth_msgs; + uint16_t peer_rcv_queue_depth_msgs; + uint16_t pad[2]; + uint64_t snd_queue_depth_bytes; + uint64_t rcv_queue_depth_bytes; + uint64_t peer_snd_queue_depth_bytes; + uint64_t peer_rcv_queue_depth_bytes; + struct list_head sessions_list_entry; + struct list_head connections_list; + + HT_ENTRY(xio_session, xio_key_int32) sessions_htbl; + + struct xio_msg *setup_req; + struct xio_observer observer; + struct xio_observer ctx_observer; + + enum xio_session_type type; + + volatile enum xio_session_state state; + + struct xio_new_session_rsp new_ses_rsp; + char *uri; + char **portals_array; + char **services_array; + + /* + * References a user-controlled data buffer. The contents of + * the buffer are copied and transparently passed to the remote side + * as part of the communication request. Maybe NULL if private_data + * is not required. + */ + void *hs_private_data; + void *cb_user_context; + + /* + * Specifies the size of the user-controlled data buffer. 
+ */ + uint16_t hs_private_data_len; + uint16_t uri_len; + uint16_t portals_array_len; + uint16_t services_array_len; + uint16_t last_opened_portal; + uint16_t last_opened_service; + + uint32_t teardown_reason; + uint32_t reject_reason; + uint32_t pad1; + struct mutex lock; /* lock open connection */ + spinlock_t connections_list_lock; + int disable_teardown; + struct xio_connection *lead_connection; + struct xio_connection *redir_connection; + /* server: represents the leading connection on server side */ + struct xio_connection *connection_srv_first; + struct xio_context *teardown_work_ctx; + xio_work_handle_t teardown_work; + +}; + +/*---------------------------------------------------------------------------*/ +/* functions */ +/*---------------------------------------------------------------------------*/ +void xio_session_write_header( + struct xio_task *task, + struct xio_session_hdr *hdr); + +static inline uint64_t xio_session_get_sn( + struct xio_session *session) +{ + return xio_sync_fetch_and_add64(&session->trans_sn, 1); +} + +struct xio_session *xio_find_session( + struct xio_task *task); + +struct xio_connection *xio_session_find_connection( + struct xio_session *session, + struct xio_nexus *nexus); + +struct xio_connection *xio_session_alloc_connection( + struct xio_session *session, + struct xio_context *ctx, + uint32_t connection_idx, + void *connection_user_context); + +int xio_session_free_connection( + struct xio_connection *connection); + +struct xio_connection *xio_session_assign_nexus( + struct xio_session *session, + struct xio_nexus *nexus); + +void xio_session_assign_ops( + struct xio_session *session, + struct xio_session_ops *ops); + +struct xio_connection *xio_server_create_accepted_connection( + struct xio_session *session, + struct xio_nexus *nexus); + +int xio_session_reconnect( + struct xio_session *session, + struct xio_connection *connection); + +/*---------------------------------------------------------------------------*/ +/* xio_session_is_valid_in_req */ +/*---------------------------------------------------------------------------*/ +static inline int xio_session_is_valid_in_req(struct xio_session *session, + struct xio_msg *msg) +{ + return session->validators_cls->is_valid_in_req(msg); +} + +/*---------------------------------------------------------------------------*/ +/* xio_session_is_valid_out_msg */ +/*---------------------------------------------------------------------------*/ +static inline int xio_session_is_valid_out_msg(struct xio_session *session, + struct xio_msg *msg) +{ + return session->validators_cls->is_valid_out_msg(msg); +} + +int xio_session_notify_cancel(struct xio_connection *connection, + struct xio_msg *req, enum xio_status result); + +void xio_session_notify_new_connection(struct xio_session *session, + struct xio_connection *connection); + +void xio_session_notify_connection_established( + struct xio_session *session, + struct xio_connection *connection); + +void xio_session_notify_connection_closed( + struct xio_session *session, + struct xio_connection *connection); + +void xio_session_notify_connection_disconnected( + struct xio_session *session, + struct xio_connection *connection, + enum xio_status reason); + +void xio_session_notify_connection_refused( + struct xio_session *session, + struct xio_connection *connection, + enum xio_status reason); + +void xio_session_notify_connection_error( + struct xio_session *session, + struct xio_connection *connection, + enum xio_status reason); + +void 
xio_session_notify_connection_teardown( + struct xio_session *session, + struct xio_connection *connection); + +int xio_session_notify_msg_error(struct xio_connection *connection, + struct xio_msg *msg, enum xio_status result, + enum xio_msg_direction direction); + +void xio_session_notify_teardown(struct xio_session *session, int reason); + +void xio_session_notify_rejected(struct xio_session *session); + +void xio_session_notify_reconnecting( + struct xio_session *session, + struct xio_connection *connection); + +void xio_session_notify_reconnected( + struct xio_session *session, + struct xio_connection *connection); + +void xio_session_init_teardown(struct xio_session *session, + struct xio_context *ctx, int close_reason); + +#endif /*XIO_SESSION_H */ + diff --git a/open_src/xio/src/common/xio_session_client.c b/open_src/xio/src/common/xio_session_client.c new file mode 100644 index 0000000..ba1c440 --- /dev/null +++ b/open_src/xio/src/common/xio_session_client.c @@ -0,0 +1,1091 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include +#include +#include "libxio.h" +#include "xio_log.h" +#include "xio_common.h" +#include "xio_protocol.h" +#include "xio_observer.h" +#include "xio_mbuf.h" +#include "xio_task.h" +#include "xio_transport.h" +#include "xio_sessions_cache.h" +#include "xio_idr.h" +#include "xio_hash.h" +#include "xio_msg_list.h" +#include "xio_ev_data.h" +#include "xio_objpool.h" +#include "xio_workqueue.h" +#include "xio_context.h" +#include "xio_nexus.h" +#include "xio_session.h" +#include "xio_connection.h" +#include "xio_session_priv.h" +#include + +/*---------------------------------------------------------------------------*/ +/* xio_session_write_setup_req */ +/*---------------------------------------------------------------------------*/ +struct xio_msg *xio_session_write_setup_req(struct xio_session *session) +{ + struct xio_msg *msg; + void *buf; + uint8_t *ptr; + uint16_t len; + + /* allocate message */ + buf = kcalloc(SETUP_BUFFER_LEN + sizeof(struct xio_msg), + sizeof(uint8_t), GFP_KERNEL); + if (unlikely(!buf)) { + ERROR_LOG("message allocation failed\n"); + xio_set_error(ENOMEM); + return NULL; + } + + /* fill the message */ + msg = (struct xio_msg *)buf; + buf = sum_to_ptr(buf, sizeof(*msg)); + msg->out.header.iov_base = buf; + msg->out.header.iov_len = 0; + msg->out.sgl_type = XIO_SGL_TYPE_IOV_PTR; + msg->in.sgl_type = XIO_SGL_TYPE_IOV_PTR; + /* All other in/out parameters are zero because of kcalloc anyway */ + + msg->type = (enum xio_msg_type)XIO_SESSION_SETUP_REQ; + + ptr = (uint8_t *)msg->out.header.iov_base; + len = 0; + + /* serialize message on the buffer */ + len = xio_write_uint32(session->session_id, 0, ptr); + ptr = ptr + len; + + /* tx queue depth bytes*/ + len = xio_write_uint64(session->snd_queue_depth_bytes, 0, ptr); + ptr = ptr + len; + + /* rx queue depth bytes*/ + len = xio_write_uint64(session->rcv_queue_depth_bytes, 0, ptr); + ptr = ptr + len; + + /* tx queue depth msgs*/ + len = xio_write_uint16((uint16_t)session->snd_queue_depth_msgs, + 0, ptr); + ptr = ptr + len; + + /* rx queue depth msgs*/ + len = xio_write_uint16((uint16_t)session->rcv_queue_depth_msgs, + 0, ptr); + ptr = ptr + len; + + /* uri length */ + len = xio_write_uint16((uint16_t)session->uri_len, 0, ptr); + ptr = ptr + len; + + /* private length */ + len = xio_write_uint16((uint16_t)(session->hs_private_data_len), + 0, ptr); + ptr = ptr + len; + + if (session->uri_len) { + len = xio_write_array((uint8_t *)session->uri, + session->uri_len, 0, ptr); + ptr = ptr + len; + } + if (session->hs_private_data_len) { + len = xio_write_array((const uint8_t *)session->hs_private_data, + session->hs_private_data_len, + 0, ptr); + ptr = ptr + len; + } + msg->out.header.iov_len = ptr - (uint8_t *)msg->out.header.iov_base; + + if (msg->out.header.iov_len > SETUP_BUFFER_LEN) { + ERROR_LOG("primary task pool is empty\n"); + xio_set_error(XIO_E_MSG_SIZE); + kfree(msg); + return NULL; + } + + return msg; +} + +/*---------------------------------------------------------------------------*/ +/* xio_session_accept_connections */ +/*---------------------------------------------------------------------------*/ +int xio_session_accept_connections(struct xio_session *session) +{ + struct xio_connection *connection, *tmp_connection; + struct xio_nexus *nexus; + int retval = 0; + char *portal; + + list_for_each_entry_safe(connection, tmp_connection, + &session->connections_list, + connections_list_entry) { + if (!connection->nexus) { + if (connection->conn_idx == 0) { + portal = session->portals_array[ + 
session->last_opened_portal++]; + if (session->last_opened_portal == + session->portals_array_len) + session->last_opened_portal = 0; + } else { + int pid = (connection->conn_idx % + session->portals_array_len); + portal = session->portals_array[pid]; + } + nexus = xio_nexus_open(connection->ctx, portal, + &session->observer, + session->session_id, + connection->nexus_attr_mask, + &connection->nexus_attr); + + if (unlikely(!nexus)) { + ERROR_LOG("failed to open connection to %s\n", + portal); + retval = -1; + break; + } + connection = xio_session_assign_nexus(session, nexus); + if (unlikely(!connection)) { + ERROR_LOG("failed to assign connection\n"); + retval = -1; + break; + } + connection->cd_bit = 0; + DEBUG_LOG("reconnecting to %s. connection:%p, " \ + "nexus:%p\n", + portal, connection, nexus); + retval = xio_nexus_connect(nexus, portal, + &session->observer, NULL); + if (unlikely(retval != 0)) { + ERROR_LOG("connection connect failed\n"); + retval = -1; + break; + } + } + } + + return retval; +} + +/*---------------------------------------------------------------------------*/ +/* xio_session_redirect_connection */ +/*---------------------------------------------------------------------------*/ +int xio_session_redirect_connection(struct xio_session *session) +{ + struct xio_nexus *nexus, *tmp_nexus; + int retval; + char *service; + + service = session->services_array[session->last_opened_service++]; + if (session->last_opened_service == session->services_array_len) + session->last_opened_service = 0; + + nexus = xio_nexus_open(session->lead_connection->ctx, service, + NULL, 0, + session->lead_connection->nexus_attr_mask, + &session->lead_connection->nexus_attr); + if (unlikely(!nexus)) { + ERROR_LOG("failed to open connection to %s\n", + service); + return -1; + } + /* initialize the redirected connection */ + tmp_nexus = session->lead_connection->nexus; + session->redir_connection = session->lead_connection; + session->redir_connection->cd_bit = 0; + xio_connection_set_nexus(session->redir_connection, nexus); + + ERROR_LOG("connection redirected to %s\n", service); + retval = xio_nexus_connect(nexus, service, &session->observer, NULL); + if (unlikely(retval != 0)) { + ERROR_LOG("connection connect failed\n"); + goto cleanup; + } + + kfree(session->uri); + session->uri = kstrdup(service, GFP_KERNEL); + + /* prep the lead connection for close */ + session->lead_connection = xio_connection_create( + session, + session->lead_connection->ctx, + session->lead_connection->conn_idx, + session->lead_connection->cb_user_context); + xio_connection_set_nexus(session->lead_connection, tmp_nexus); + + return 0; + +cleanup: + xio_nexus_close(nexus, &session->observer); + + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_on_session_rejected */ +/*---------------------------------------------------------------------------*/ +int xio_on_session_rejected(struct xio_session *session) +{ + struct xio_connection *pconnection, *tmp_connection; + + /* also send disconnect to connections that do no have nexus */ + list_for_each_entry_safe(pconnection, tmp_connection, + &session->connections_list, + connections_list_entry) { + session->disable_teardown = 0; + pconnection->disable_notify = 0; + pconnection->close_reason = XIO_E_SESSION_REJECTED; + if (pconnection->nexus) + xio_disconnect_initial_connection(pconnection); + else + xio_connection_disconnected(pconnection); + } + + return 0; +} + 
+/*---------------------------------------------------------------------------*/ +/* xio_read_setup_rsp */ +/*---------------------------------------------------------------------------*/ +int xio_read_setup_rsp(struct xio_connection *connection, + struct xio_task *task, + uint16_t *action) +{ + struct xio_msg *msg = &task->imsg; + struct xio_session_hdr hdr; + struct xio_session *session = connection->session; + struct xio_new_session_rsp *rsp = &session->new_ses_rsp; + uint8_t *ptr; + uint16_t len; + int i = 0; + uint16_t str_len; + + /* read session header */ + xio_session_read_header(task, &hdr); +#ifdef XIO_SESSION_DEBUG + connection->peer_connection = hdr.connection; + connection->peer_session = hdr.session; +#endif + task->imsg.sn = hdr.serial_num; + + /* free the outgoing message */ + kfree(task->sender_task->omsg); + task->sender_task->omsg = NULL; + + /* read the message */ + ptr = (uint8_t *)msg->in.header.iov_base; + + /* read the payload */ + len = xio_read_uint32(&session->peer_session_id, 0, ptr); + ptr = ptr + len; + + len = xio_read_uint16(action, 0, ptr); + ptr = ptr + len; + + switch (*action) { + case XIO_ACTION_ACCEPT: + /* read the peer tx queue depth bytes */ + len = xio_read_uint64(&session->peer_snd_queue_depth_bytes, + 0, ptr); + ptr = ptr + len; + + /* read the peer rx queue depth bytes */ + len = xio_read_uint64(&session->peer_rcv_queue_depth_bytes, + 0, ptr); + ptr = ptr + len; + + /* read the peer tx queue depth msgs */ + len = xio_read_uint16(&session->peer_snd_queue_depth_msgs, + 0, ptr); + ptr = ptr + len; + + /* read the peer rx queue depth msgs */ + len = xio_read_uint16(&session->peer_rcv_queue_depth_msgs, + 0, ptr); + ptr = ptr + len; + + len = xio_read_uint16(&session->portals_array_len, 0, ptr); + ptr = ptr + len; + + len = xio_read_uint16(&rsp->private_data_len, 0, ptr); + ptr = ptr + len; + + if (session->portals_array_len) { + session->portals_array = (char **)kcalloc( + session->portals_array_len, + sizeof(char *), GFP_KERNEL); + if (unlikely(!session->portals_array)) { + ERROR_LOG("allocation failed\n"); + xio_set_error(ENOMEM); + return -1; + } + for (i = 0; i < session->portals_array_len; i++) { + len = xio_read_uint16(&str_len, 0, ptr); + ptr = ptr + len; + + session->portals_array[i] = + kstrndup((char *)ptr, str_len, + GFP_KERNEL); + session->portals_array[i][str_len] = 0; + ptr = ptr + str_len; + } + + } else { + session->portals_array = NULL; + } + + if (session->new_ses_rsp.private_data_len) { + rsp->private_data = kcalloc(rsp->private_data_len, + sizeof(uint8_t), GFP_KERNEL); + if (unlikely(!rsp->private_data)) { + ERROR_LOG("allocation failed\n"); + xio_set_error(ENOMEM); + return -1; + } + + len = xio_read_array((uint8_t *)rsp->private_data, + rsp->private_data_len, 0, ptr); + ptr = ptr + len; + } else { + rsp->private_data = NULL; + } + break; + case XIO_ACTION_REDIRECT: + len = xio_read_uint16(&session->services_array_len, 0, ptr); + ptr = ptr + len; + + len = xio_read_uint16(&rsp->private_data_len, 0, ptr); + ptr = ptr + len; + + if (session->services_array_len) { + session->services_array = (char **)kcalloc( + session->services_array_len, + sizeof(char *), GFP_KERNEL); + if (unlikely(!session->services_array)) { + ERROR_LOG("allocation failed\n"); + xio_set_error(ENOMEM); + return -1; + } + + for (i = 0; i < session->services_array_len; i++) { + len = xio_read_uint16(&str_len, 0, ptr); + ptr = ptr + len; + + session->services_array[i] = + kstrndup((char *)ptr, str_len, + GFP_KERNEL); + session->services_array[i][str_len] = 0; + ptr 
= ptr + str_len; + } + + } else { + session->services_array = NULL; + } + break; + + case XIO_ACTION_REJECT: + len = xio_read_uint32(&session->reject_reason, 0, ptr); + ptr = ptr + len; + + len = xio_read_uint16(&rsp->private_data_len, 0, ptr); + ptr = ptr + len; + + if (session->new_ses_rsp.private_data_len) { + rsp->private_data = kcalloc( + rsp->private_data_len, + sizeof(uint8_t), GFP_KERNEL); + if (unlikely(!rsp->private_data)) { + ERROR_LOG("allocation failed\n"); + xio_set_error(ENOMEM); + return -1; + } + + len = xio_read_array((uint8_t *)rsp->private_data, + rsp->private_data_len, 0, ptr); + ptr = ptr + len; + } else { + rsp->private_data = NULL; + } + break; + default: + break; + } + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_session_fill_portals_array */ +/*---------------------------------------------------------------------------*/ +static int xio_session_fill_portals_array(struct xio_session *session) +{ + char portal[64]; + + /* extract portal from uri */ + if (xio_uri_get_portal(session->uri, portal, sizeof(portal)) != 0) { + xio_set_error(EADDRNOTAVAIL); + ERROR_LOG("parsing uri failed. uri: %s\n", session->uri); + return -1; + } + session->portals_array = (char **)kcalloc( + 1, + sizeof(char *), GFP_KERNEL); + if (unlikely(!session->portals_array)) { + ERROR_LOG("allocation failed\n"); + xio_set_error(ENOMEM); + return -1; + } + session->portals_array_len = 1; + session->portals_array[0] = kstrdup(portal, GFP_KERNEL); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_on_setup_rsp_recv */ +/*---------------------------------------------------------------------------*/ +int xio_on_setup_rsp_recv(struct xio_connection *connection, + struct xio_task *task) +{ + uint16_t action = 0; + struct xio_session *session = connection->session; + struct xio_new_session_rsp *rsp = &session->new_ses_rsp; + int retval = 0, fill_portals = 0; + struct xio_connection *tmp_connection; + + retval = xio_read_setup_rsp(connection, task, &action); + + /* the tx task is returend back to pool */ + xio_tasks_pool_put(task->sender_task); + task->sender_task = NULL; + + xio_tasks_pool_put(task); + DEBUG_LOG("task recycled\n"); + + if (unlikely(retval != 0)) { + ERROR_LOG("failed to read setup response\n"); + return -1; + } + + switch (action) { + case XIO_ACTION_ACCEPT: + if (!session->portals_array) { + xio_session_fill_portals_array(session); + fill_portals = 1; + } + session->state = XIO_SESSION_STATE_ONLINE; + TRACE_LOG("session state is now ONLINE. 
session:%p\n", session); + /* notify the upper layer */ + if (session->ses_ops.on_session_established) { +#ifdef XIO_THREAD_SAFE_DEBUG + xio_ctx_debug_thread_unlock(connection->ctx); +#endif + session->ses_ops.on_session_established( + session, rsp, + session->cb_user_context); +#ifdef XIO_THREAD_SAFE_DEBUG + xio_ctx_debug_thread_lock(connection->ctx); +#endif + } + + kfree(rsp->private_data); + rsp->private_data = NULL; + + if (fill_portals) { + xio_on_connection_hello_rsp_recv(connection, NULL); + /* insert the connection into list */ + xio_session_assign_nexus(session, connection->nexus); + session->lead_connection = NULL; + session->redir_connection = NULL; + session->disable_teardown = 0; + + if (session->connections_nr > 1) { + /* open new connections */ + retval = xio_session_accept_connections( + session); + if (unlikely(retval != 0)) { + ERROR_LOG( + "failed to accept connection\n"); + return -1; + } + } + } else { /* reconnect to peer other session */ + TRACE_LOG("session state is now ACCEPT. session:%p\n", + session); + + /* clone temporary connection */ + tmp_connection = xio_connection_create( + session, + session->lead_connection->ctx, + session->lead_connection->conn_idx, + session->lead_connection->cb_user_context); + + xio_connection_set_nexus(tmp_connection, + connection->nexus); + connection->nexus = NULL; + session->lead_connection = tmp_connection; + + /* close the lead/redirected connection */ + /* temporary disable teardown */ + session->disable_teardown = 1; + session->lead_connection->disable_notify = 1; + session->lead_connection->state = + XIO_CONNECTION_STATE_ONLINE; + + /* temporary account it as user object */ + xio_idr_add_uobj(usr_idr, session->lead_connection, + "xio_connection"); + xio_disconnect_initial_connection( + session->lead_connection); + + /* open new connections */ + retval = xio_session_accept_connections(session); + if (unlikely(retval != 0)) { + ERROR_LOG("failed to accept connection\n"); + return -1; + } + } + return 0; + case XIO_ACTION_REDIRECT: + TRACE_LOG("session state is now REDIRECT. session:%p\n", + session); + + session->state = XIO_SESSION_STATE_REDIRECTED; + + /* open new connections */ + retval = xio_session_redirect_connection(session); + if (unlikely(retval != 0)) { + ERROR_LOG("failed to redirect connection\n"); + return -1; + } + + /* close the lead connection */ + session->disable_teardown = 1; + session->lead_connection->disable_notify = 1; + session->lead_connection->state = XIO_CONNECTION_STATE_ONLINE; + xio_disconnect_initial_connection(session->lead_connection); + + return 0; + case XIO_ACTION_REJECT: + xio_connection_set_state(connection, + XIO_CONNECTION_STATE_ESTABLISHED); + xio_session_notify_connection_established(session, + connection); + + session->state = XIO_SESSION_STATE_REJECTED; + session->disable_teardown = 0; + session->lead_connection = NULL; + + TRACE_LOG("session state is now REJECT. 
session:%p\n", + session); + + retval = xio_on_session_rejected(session); + if (unlikely(retval != 0)) + ERROR_LOG("failed to reject session\n"); + + kfree(rsp->private_data); + rsp->private_data = NULL; + + return retval; + } + + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_on_nexus_refused */ +/*---------------------------------------------------------------------------*/ +int xio_on_nexus_refused(struct xio_session *session, + struct xio_nexus *nexus, + union xio_nexus_event_data *event_data) +{ + struct xio_connection *connection, *next_connection; + + /* enable the teardown */ + session->disable_teardown = 0; + + switch (session->state) { + case XIO_SESSION_STATE_CONNECT: + case XIO_SESSION_STATE_REDIRECTED: + session->state = XIO_SESSION_STATE_REFUSED; + list_for_each_entry_safe( + connection, next_connection, + &session->connections_list, + connections_list_entry) { + xio_connection_refused(connection); + } + break; + default: + connection = xio_session_find_connection(session, nexus); + if (connection) + xio_connection_refused(connection); + break; + } + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_on_client_nexus_established */ +/*---------------------------------------------------------------------------*/ +int xio_on_client_nexus_established(struct xio_session *session, + struct xio_nexus *nexus, + union xio_nexus_event_data *event_data) +{ + int retval = 0; + struct xio_connection *connection; + struct xio_msg *msg; + struct xio_session_event_data ev_data = { + .conn = NULL, + .conn_user_context = NULL, + .event = XIO_SESSION_ERROR_EVENT, + .private_data = NULL, + .private_data_len = 0, + .reason = XIO_E_SESSION_REFUSED, + }; + + switch (session->state) { + case XIO_SESSION_STATE_CONNECT: + msg = xio_session_write_setup_req(session); + if (unlikely(!msg)) { + ERROR_LOG("setup request creation failed\n"); + return -1; + } + + retval = xio_connection_send(session->lead_connection, + msg); + if (retval && retval != -EAGAIN) { + TRACE_LOG("failed to send session "\ + "setup request\n"); + ev_data.conn = session->lead_connection; + ev_data.conn_user_context = + session->lead_connection->cb_user_context; + if (session->ses_ops.on_session_event) { +#ifdef XIO_THREAD_SAFE_DEBUG + xio_ctx_debug_thread_unlock(session->lead_connection->ctx); +#endif + session->ses_ops.on_session_event( + session, &ev_data, + session->cb_user_context); +#ifdef XIO_THREAD_SAFE_DEBUG + xio_ctx_debug_thread_lock(session->lead_connection->ctx); +#endif + } + } + + break; + case XIO_SESSION_STATE_REDIRECTED: + msg = xio_session_write_setup_req(session); + if (unlikely(!msg)) { + ERROR_LOG("setup request creation failed\n"); + return -1; + } + session->state = XIO_SESSION_STATE_CONNECT; + + retval = xio_connection_send(session->redir_connection, + msg); + if (retval && retval != -EAGAIN) { + TRACE_LOG("failed to send session setup request\n"); + ev_data.conn = session->redir_connection; + ev_data.conn_user_context = + session->redir_connection->cb_user_context; + if (session->ses_ops.on_session_event) { +#ifdef XIO_THREAD_SAFE_DEBUG + xio_ctx_debug_thread_unlock(session->redir_connection->ctx); +#endif + session->ses_ops.on_session_event( + session, &ev_data, + session->cb_user_context); +#ifdef XIO_THREAD_SAFE_DEBUG + xio_ctx_debug_thread_lock(session->redir_connection->ctx); +#endif + } + + } + break; + case XIO_SESSION_STATE_ONLINE: + case XIO_SESSION_STATE_ACCEPTED: + connection = 
xio_session_find_connection(session, nexus); + if (unlikely(!connection)) { + ERROR_LOG("failed to find connection session:%p," \ + "nexus:%p\n", session, nexus); + return -1; + } + session->disable_teardown = 0; + if (connection->state == XIO_CONNECTION_STATE_INIT) { + /* introduce the connection to the session */ + xio_connection_send_hello_req(connection); + } else { + xio_connection_set_state(connection, + XIO_CONNECTION_STATE_ONLINE); + xio_connection_keepalive_start(connection); + xio_connection_xmit_msgs(connection); + } + break; + default: + break; + } + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_client_on_nexus_event */ +/*---------------------------------------------------------------------------*/ +int xio_client_on_nexus_event(void *observer, void *sender, int event, + void *_event_data) +{ + struct xio_session *session = (struct xio_session *)observer; + struct xio_nexus *nexus = (struct xio_nexus *)sender; + union xio_nexus_event_data *event_data = + (union xio_nexus_event_data *)_event_data; + + switch (event) { + case XIO_NEXUS_EVENT_ALLOC_HEAD_BUF: + TRACE_LOG("session: [notification] - assign in buf. " \ + "session:%p, nexus:%p\n", observer, sender); + + xio_on_head_alloc_buf(session, nexus, event_data); + break; + + case XIO_NEXUS_EVENT_ALLOC_DATA_BUF: + TRACE_LOG("session: [notification] - assign in buf. " \ + "session:%p, nexus:%p\n", observer, sender); + + xio_on_data_alloc_buf(session, nexus, event_data); + break; + + case XIO_NEXUS_EVENT_NEW_MESSAGE: +/* + TRACE_LOG("session: [notification] - new message. " \ + "session:%p, nexus:%p\n", observer, sender); + +*/ xio_on_new_message(session, nexus, event_data); + break; + case XIO_NEXUS_EVENT_SEND_COMPLETION: +/* TRACE_LOG("session: [notification] - send_completion. " \ + "session:%p, nexus:%p\n", observer, sender); +*/ + xio_on_send_completion(session, nexus, event_data); + break; + case XIO_NEXUS_EVENT_DIRECT_RDMA_COMPLETION: + xio_on_rdma_direct_comp(session, nexus, event_data); + break; + case XIO_NEXUS_EVENT_ASSIGN_IN_BUF: +/* TRACE_LOG("session: [notification] - assign in buf. " \ + "session:%p, nexus:%p\n", observer, sender); +*/ + xio_on_assign_in_buf(session, nexus, event_data); + break; + case XIO_NEXUS_EVENT_CANCEL_REQUEST: + DEBUG_LOG("session: [notification] - cancel request. " \ + "session:%p, nexus:%p\n", observer, sender); + xio_on_cancel_request(session, nexus, event_data); + break; + case XIO_NEXUS_EVENT_CANCEL_RESPONSE: + DEBUG_LOG("session: [notification] - cancel response. " \ + "session:%p, nexus:%p\n", observer, sender); + xio_on_cancel_response(session, nexus, event_data); + break; + case XIO_NEXUS_EVENT_ESTABLISHED: + DEBUG_LOG("session: [notification] - nexus established. 
" \ + "session:%p, nexus:%p\n", observer, sender); + xio_on_client_nexus_established(session, nexus, event_data); + break; + case XIO_NEXUS_EVENT_DISCONNECTED: + DEBUG_LOG("session: [notification] - nexus disconnected" \ + " session:%p, nexus:%p\n", observer, sender); + xio_on_nexus_disconnected(session, nexus, event_data); + break; + case XIO_NEXUS_EVENT_RECONNECTING: + DEBUG_LOG("session: [notification] - connection reconnecting" \ + " session:%p, nexus:%p\n", observer, sender); + xio_on_nexus_reconnecting(session, nexus); + break; + case XIO_NEXUS_EVENT_RECONNECTED: + DEBUG_LOG("session: [notification] - connection reconnected" \ + " session:%p, nexus:%p\n", observer, sender); + xio_on_nexus_reconnected(session, nexus); + break; + case XIO_NEXUS_EVENT_CLOSED: + DEBUG_LOG("session: [notification] - nexus closed. " \ + "session:%p, nexus:%p\n", observer, sender); + xio_on_nexus_closed(session, nexus, event_data); + break; + case XIO_NEXUS_EVENT_REFUSED: + DEBUG_LOG("session: [notification] - nexus refused. " \ + "session:%p, nexus:%p\n", observer, sender); + xio_on_nexus_refused(session, nexus, event_data); + break; + case XIO_NEXUS_EVENT_ERROR: + DEBUG_LOG("session: [notification] - nexus error. " \ + "session:%p, nexus:%p\n", observer, sender); + xio_on_nexus_error(session, nexus, event_data); + break; + case XIO_NEXUS_EVENT_MESSAGE_ERROR: + DEBUG_LOG("session: [notification] - nexus message error. " \ + "session:%p, nexus:%p\n", observer, sender); + xio_on_nexus_message_error(session, nexus, event_data); + break; + default: + DEBUG_LOG("session: [notification] - unexpected event. " \ + "event:%d, session:%p, nexus:%p\n", + event, observer, sender); + xio_on_nexus_error(session, nexus, event_data); + break; + } + + return 0; +} + +static inline void xio_session_refuse_connection(void *conn) +{ + struct xio_connection *connection = (struct xio_connection *)conn; + + xio_connection_refused(connection); +} + +/*---------------------------------------------------------------------------*/ +/* xio_connect */ +/*---------------------------------------------------------------------------*/ +struct xio_connection *xio_connect(struct xio_connection_params *cparams) +{ + struct xio_session *session; + struct xio_context *ctx; + struct xio_session *psession = NULL; + struct xio_connection *connection = NULL, *tmp_connection; + struct xio_nexus *nexus = NULL; + int retval; + int attr_mask = 0; + struct xio_nexus_init_attr *pattr = NULL; + struct xio_nexus_init_attr attr; + + if (!cparams) { + ERROR_LOG("invalid parameter\n"); + xio_set_error(EINVAL); + return NULL; + } + + if (!cparams->ctx || !cparams->session) { + ERROR_LOG("invalid parameters ctx:%p, session:%p\n", + cparams->ctx, cparams->session); + xio_set_error(EINVAL); + return NULL; + } + ctx = cparams->ctx; + session = cparams->session; + if (cparams->enable_tos) { + attr.tos = cparams->tos; + attr_mask = XIO_NEXUS_ATTR_TOS; + pattr = &attr; + } + + /* lookup for session in cache */ + psession = xio_sessions_cache_lookup(session->session_id); + if (!psession) { + ERROR_LOG("failed to find session\n"); + xio_set_error(EINVAL); + return NULL; + } + + mutex_lock(&session->lock); + + /* only one connection per context allowed */ + connection = xio_session_find_connection_by_ctx(session, ctx); + if (connection) { + ERROR_LOG("context:%p, already assigned connection:%p\n", + ctx, connection); + goto cleanup2; + } + if (session->state == XIO_SESSION_STATE_INIT) { + char portal[64]; + /* extract portal from uri */ + if 
(xio_uri_get_portal(session->uri, portal, + sizeof(portal)) != 0) { + xio_set_error(EADDRNOTAVAIL); + ERROR_LOG("parsing uri failed. uri: %s\n", + session->uri); + goto cleanup; + } + nexus = xio_nexus_open(ctx, portal, &session->observer, + session->session_id, + attr_mask, pattr); + if (!nexus) { + ERROR_LOG("failed to create connection\n"); + goto cleanup; + } + /* initialize the lead connection */ + session->lead_connection = xio_session_alloc_connection( + session, ctx, + cparams->conn_idx, + cparams->conn_user_context); + session->lead_connection->nexus = nexus; + + connection = session->lead_connection; + + /* get transport class routines */ + session->validators_cls = xio_nexus_get_validators_cls(nexus); + + session->state = XIO_SESSION_STATE_CONNECT; + + retval = xio_nexus_connect(nexus, portal, + &session->observer, + cparams->out_addr); + if (retval != 0) { + ERROR_LOG("connection connect failed\n"); + session->state = XIO_SESSION_STATE_INIT; + goto cleanup; + } + } else if ((session->state == XIO_SESSION_STATE_CONNECT) || + (session->state == XIO_SESSION_STATE_REDIRECTED)) { + connection = xio_session_alloc_connection( + session, ctx, + cparams->conn_idx, + cparams->conn_user_context); + if (session->state == XIO_SESSION_STATE_REFUSED || + session->state == XIO_SESSION_STATE_REJECTED) { + xio_idr_add_uobj(usr_idr, connection, "xio_connection"); + mutex_unlock(&session->lock); + retval = xio_ctx_add_work( + connection->ctx, + connection, + xio_session_refuse_connection, + &connection->fin_work); + if (retval != 0) + ERROR_LOG("xio_ctx_timer_add failed.\n"); + + return connection; + } else if (session->state == XIO_SESSION_STATE_CLOSING || + session->state == XIO_SESSION_STATE_CLOSED) { + DEBUG_LOG("refusing connection %p - " \ + "session is closing\n", connection); + goto cleanup; + } + } else if (session->state == XIO_SESSION_STATE_ONLINE || + session->state == XIO_SESSION_STATE_ACCEPTED) { + struct xio_nexus *nexus; + char *portal; + + if (cparams->conn_idx == 0) { + portal = session->portals_array[ + session->last_opened_portal++]; + if (session->last_opened_portal == + session->portals_array_len) + session->last_opened_portal = 0; + } else { + int pid = + (cparams->conn_idx % + session->portals_array_len); + + portal = session->portals_array[pid]; + } + connection = xio_session_alloc_connection( + session, ctx, + cparams->conn_idx, + cparams->conn_user_context); + + nexus = xio_nexus_open(ctx, portal, &session->observer, + session->session_id, + attr_mask, pattr); + if (!nexus) { + ERROR_LOG("failed to open connection\n"); + goto cleanup; + } + tmp_connection = xio_session_assign_nexus(session, nexus); + if (tmp_connection != connection) { + ERROR_LOG("failed to open connection nexus:%p, %p %p\n", + nexus, tmp_connection, connection); + goto cleanup; + } + retval = xio_nexus_connect(nexus, portal, + &session->observer, + cparams->out_addr); + if (retval != 0) { + ERROR_LOG("connection connect failed\n"); + goto cleanup; + } + } else if (session->state == XIO_SESSION_STATE_REFUSED || + session->state == XIO_SESSION_STATE_REJECTED || + session->state == XIO_SESSION_STATE_CLOSING || + session->state == XIO_SESSION_STATE_CLOSED) { + goto cleanup2; + } + + xio_idr_add_uobj(usr_idr, connection, "xio_connection"); + + if (cparams->enable_tos) { + connection->nexus_attr_mask = attr_mask; + connection->nexus_attr = attr; + } + + if (cparams->disconnect_timeout_secs) { + if (cparams->disconnect_timeout_secs < XIO_MIN_CONNECTION_TIMEOUT) + connection->disconnect_timeout = 
XIO_MIN_CONNECTION_TIMEOUT; + else + connection->disconnect_timeout = cparams->disconnect_timeout_secs * 1000; + } else { + connection->disconnect_timeout = XIO_DEF_CONNECTION_TIMEOUT; + } + + mutex_unlock(&session->lock); + + DEBUG_LOG("xio_connect: session:%p, connection:%p, " \ + "ctx:%p, nexus:%p\n", + session, connection, ctx, + ((connection) ? connection->nexus : NULL)); + + return connection; + +cleanup: + if (nexus) + xio_nexus_close(nexus, &session->observer); + + if (connection) + xio_session_free_connection(connection); + +cleanup2: + mutex_unlock(&session->lock); + + return NULL; +} +EXPORT_SYMBOL(xio_connect); diff --git a/open_src/xio/src/common/xio_session_priv.h b/open_src/xio/src/common/xio_session_priv.h new file mode 100644 index 0000000..1881ba9 --- /dev/null +++ b/open_src/xio/src/common/xio_session_priv.h @@ -0,0 +1,318 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
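For reference, a minimal client-side sketch of how xio_connect() above is typically driven. It only assumes that a session, a context and an application pointer already exist (created elsewhere); the field names come from the parameter checks in xio_connect() itself, and the optional ToS/timeout fields are shown commented out.

#include <string.h>
#include "libxio.h"

/* Illustrative sketch, not part of this patch: open a connection on an
 * existing session. "session", "ctx" and "app" are assumed to have been
 * created by the caller beforehand. */
static struct xio_connection *open_connection(struct xio_session *session,
                                              struct xio_context *ctx,
                                              void *app)
{
        struct xio_connection_params cparams;
        struct xio_connection *connection;

        memset(&cparams, 0, sizeof(cparams));
        cparams.session           = session;
        cparams.ctx               = ctx;
        cparams.conn_idx          = 0;    /* let the library pick a portal */
        cparams.conn_user_context = app;
        /* optional: cparams.enable_tos = 1; cparams.tos = <value>;
         *           cparams.disconnect_timeout_secs = <seconds>; */

        connection = xio_connect(&cparams);
        if (!connection)
                return NULL;    /* reason is available via xio_errno() */

        return connection;
}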
+ */ + +#ifndef XIO_SESSION_PRIV_H +#define XIO_SESSION_PRIV_H + +#define XIO_ACTION_ACCEPT 1 +#define XIO_ACTION_REDIRECT 2 +#define XIO_ACTION_REJECT 3 + +#define MAX_PORTAL_LEN 192 +#define MAX_RESOURCE_LEN 1024 +#define SETUP_BUFFER_LEN 3840 /* 4096-256 */ + +/* Common API */ + +/*---------------------------------------------------------------------------*/ +/* xio_on_nexus_disconnected */ +/*---------------------------------------------------------------------------*/ +int xio_on_nexus_disconnected(struct xio_session *session, + struct xio_nexus *nexus, + union xio_nexus_event_data *event_data); + +/*---------------------------------------------------------------------------*/ +/* xio_on_nexus_closed */ +/*---------------------------------------------------------------------------*/ +int xio_on_nexus_closed(struct xio_session *session, + struct xio_nexus *nexus, + union xio_nexus_event_data *event_data); + +/*---------------------------------------------------------------------------*/ +/* xio_on_nexus_error */ +/*---------------------------------------------------------------------------*/ +int xio_on_nexus_error(struct xio_session *session, + struct xio_nexus *nexus, + union xio_nexus_event_data *event_data); + +/*---------------------------------------------------------------------------*/ +/* xio_on_new_message */ +/*---------------------------------------------------------------------------*/ +int xio_on_new_message(struct xio_session *session, + struct xio_nexus *nexus, + union xio_nexus_event_data *event_data); + +/*---------------------------------------------------------------------------*/ +/* xio_on_send_completion */ +/*---------------------------------------------------------------------------*/ +int xio_on_send_completion(struct xio_session *session, + struct xio_nexus *nexus, + union xio_nexus_event_data *event_data); + +/*---------------------------------------------------------------------------*/ +/* xio_on_rdma_direct_comp */ +/*---------------------------------------------------------------------------*/ +int xio_on_rdma_direct_comp(struct xio_session *session, + struct xio_nexus *nexus, + union xio_nexus_event_data *event_data); + +/*---------------------------------------------------------------------------*/ +/* xio_on_head_alloc_buf */ +/*---------------------------------------------------------------------------*/ +int xio_on_head_alloc_buf(struct xio_session *session, + struct xio_nexus *nexus, + union xio_nexus_event_data *event_data); + +/*---------------------------------------------------------------------------*/ +/* xio_on_data_alloc_buf */ +/*---------------------------------------------------------------------------*/ +int xio_on_data_alloc_buf(struct xio_session *session, + struct xio_nexus *nexus, + union xio_nexus_event_data *event_data); + +/*---------------------------------------------------------------------------*/ +/* xio_on_assign_in_buf */ +/*---------------------------------------------------------------------------*/ +int xio_on_assign_in_buf(struct xio_session *session, + struct xio_nexus *nexus, + union xio_nexus_event_data *event_data); + +/*---------------------------------------------------------------------------*/ +/* xio_on_cancel_request */ +/*---------------------------------------------------------------------------*/ +int xio_on_cancel_request(struct xio_session *sess, + struct xio_nexus *nexus, + union xio_nexus_event_data *event_data); + +/*---------------------------------------------------------------------------*/ +/* 
xio_on_cancel_response */ +/*---------------------------------------------------------------------------*/ +int xio_on_cancel_response(struct xio_session *sess, + struct xio_nexus *nexus, + union xio_nexus_event_data *event_data); + +/*---------------------------------------------------------------------------*/ +/* xio_on_nexus_message_error */ +/*---------------------------------------------------------------------------*/ +int xio_on_nexus_message_error(struct xio_session *session, + struct xio_nexus *nexus, + union xio_nexus_event_data *event_data); + +/*---------------------------------------------------------------------------*/ +/* xio_session_read_header */ +/*---------------------------------------------------------------------------*/ +void xio_session_read_header(struct xio_task *task, + struct xio_session_hdr *hdr); + +/*---------------------------------------------------------------------------*/ +/* xio_session_notify_teardown */ +/*---------------------------------------------------------------------------*/ +void xio_session_notify_teardown(struct xio_session *session, int reason); + +/*---------------------------------------------------------------------------*/ +/* xio_session_notify_connection_closed */ +/*---------------------------------------------------------------------------*/ +void xio_session_notify_connection_closed(struct xio_session *session, + struct xio_connection *connection); + +/*---------------------------------------------------------------------------*/ +/* xio_session_find_connection_by_ctx */ +/*---------------------------------------------------------------------------*/ +struct xio_connection *xio_session_find_connection_by_ctx( + struct xio_session *session, + struct xio_context *ctx); + +/* Server API */ + +/*---------------------------------------------------------------------------*/ +/* xio_session_notify_reconnecting */ +/*---------------------------------------------------------------------------*/ +void xio_session_notify_reconnecting(struct xio_session *session, + struct xio_connection *connection); + +/*---------------------------------------------------------------------------*/ +/* xio_session_notify_reconnected */ +/*---------------------------------------------------------------------------*/ +void xio_session_notify_reconnected(struct xio_session *session, + struct xio_connection *connection); + +/*---------------------------------------------------------------------------*/ +/* xio_on_setup_req_recv */ +/*---------------------------------------------------------------------------*/ +int xio_on_setup_req_recv(struct xio_connection *connection, + struct xio_task *task); + +/*---------------------------------------------------------------------------*/ +/* xio_session_write_accept_rsp */ +/*---------------------------------------------------------------------------*/ +struct xio_msg *xio_session_write_accept_rsp( + struct xio_session *session, + uint16_t action, + const char **portals_array, + uint16_t portals_array_len, + void *user_context, + uint16_t user_context_len); + +/*---------------------------------------------------------------------------*/ +/* xio_session_write_reject_rsp */ +/*---------------------------------------------------------------------------*/ +struct xio_msg *xio_session_write_reject_rsp( + struct xio_session *session, + enum xio_status reason, + void *user_context, + uint16_t user_context_len); + +/*---------------------------------------------------------------------------*/ +/* xio_on_setup_rsp_send_comp */ 
+/*---------------------------------------------------------------------------*/ +int xio_on_setup_rsp_send_comp(struct xio_connection *connection, + struct xio_task *task); + +/*---------------------------------------------------------------------------*/ +/* xio_on_server_nexus_established */ +/*---------------------------------------------------------------------------*/ +int xio_on_server_nexus_established(struct xio_session *session, + struct xio_nexus *nexus, + union xio_nexus_event_data *event_data); + +/*---------------------------------------------------------------------------*/ +/* xio_on_nexus_event_server */ +/*---------------------------------------------------------------------------*/ +int xio_server_on_nexus_event(void *observer, void *sender, int event, + void *event_data); + +/* Client API */ + +/*---------------------------------------------------------------------------*/ +/* xio_session_write_setup_req */ +/*---------------------------------------------------------------------------*/ +struct xio_msg *xio_session_write_setup_req(struct xio_session *session); + +/*---------------------------------------------------------------------------*/ +/* xio_session_accept_connection */ +/*---------------------------------------------------------------------------*/ +int xio_session_accept_connection(struct xio_session *session); + +/*---------------------------------------------------------------------------*/ +/* xio_session_redirect_connection */ +/*---------------------------------------------------------------------------*/ +int xio_session_redirect_connection(struct xio_session *session); + +/*---------------------------------------------------------------------------*/ +/* xio_on_connection_rejected */ +/*---------------------------------------------------------------------------*/ +int xio_on_connection_rejected(struct xio_session *session, + struct xio_connection *connection); + +/*---------------------------------------------------------------------------*/ +/* xio_read_setup_rsp */ +/*---------------------------------------------------------------------------*/ +int xio_read_setup_rsp(struct xio_connection *connection, + struct xio_task *task, + uint16_t *action); + +/*---------------------------------------------------------------------------*/ +/* xio_on_setup_rsp_recv */ +/*---------------------------------------------------------------------------*/ +int xio_on_setup_rsp_recv(struct xio_connection *connection, + struct xio_task *task); + +/*---------------------------------------------------------------------------*/ +/* xio_on_fin_rsp_recv */ +/*---------------------------------------------------------------------------*/ +int xio_on_fin_rsp_recv(struct xio_connection *connection, + struct xio_task *task); + +/*---------------------------------------------------------------------------*/ +/* xio_on_nexus_refused */ +/*---------------------------------------------------------------------------*/ +int xio_on_nexus_refused(struct xio_session *session, + struct xio_nexus *nexus, + union xio_nexus_event_data *event_data); + +/*---------------------------------------------------------------------------*/ +/* xio_on_client_nexus_established */ +/*---------------------------------------------------------------------------*/ +int xio_on_client_nexus_established(struct xio_session *session, + struct xio_nexus *nexus, + union xio_nexus_event_data *event_data); + +/*---------------------------------------------------------------------------*/ +/* xio_on_nexus_event_client */ 
+/*---------------------------------------------------------------------------*/ +int xio_client_on_nexus_event(void *observer, void *sender, int event, + void *event_data); + +/* Should be in xio_ connection.h but it doesn't compile if moved there */ +/*---------------------------------------------------------------------------*/ +/* xio_connection_set_nexus */ +/*---------------------------------------------------------------------------*/ +static inline void xio_connection_set_nexus(struct xio_connection *connection, + struct xio_nexus *nexus) +{ + if (connection->nexus && connection->nexus == nexus) + return; + + if (connection->nexus) + xio_nexus_unreg_observer(connection->nexus, + &connection->session->observer); + + if (nexus) { + xio_nexus_unreg_observer(nexus, + &connection->session->observer); + xio_nexus_reg_observer(nexus, + &connection->session->observer, + connection->session->session_id); + } + + connection->nexus = nexus; +} + +/*---------------------------------------------------------------------------*/ +/* xio_on_nexus_reconnecting */ +/*---------------------------------------------------------------------------*/ +int xio_on_nexus_reconnecting(struct xio_session *session, + struct xio_nexus *nexus); + +/*---------------------------------------------------------------------------*/ +/* xio_on_nexus_reconnected */ +/*---------------------------------------------------------------------------*/ +int xio_on_nexus_reconnected(struct xio_session *session, + struct xio_nexus *nexus); + +#endif /* XIO_SESSION_PRIV_H */ diff --git a/open_src/xio/src/common/xio_session_server.c b/open_src/xio/src/common/xio_session_server.c new file mode 100644 index 0000000..d296ac6 --- /dev/null +++ b/open_src/xio/src/common/xio_session_server.c @@ -0,0 +1,707 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#include +#include +#include "libxio.h" +#include "xio_log.h" +#include "xio_common.h" +#include "xio_protocol.h" +#include "xio_observer.h" +#include "xio_mbuf.h" +#include "xio_task.h" +#include "xio_transport.h" +#include "xio_sessions_cache.h" +#include "xio_hash.h" +#include "xio_msg_list.h" +#include "xio_ev_data.h" +#include "xio_objpool.h" +#include "xio_workqueue.h" +#include "xio_context.h" +#include "xio_nexus.h" +#include "xio_session.h" +#include "xio_connection.h" +#include "xio_session_priv.h" +#include + +/*---------------------------------------------------------------------------*/ +/* xio_on_setup_req_recv */ +/*---------------------------------------------------------------------------*/ +int xio_on_setup_req_recv(struct xio_connection *connection, + struct xio_task *task) +{ + struct xio_msg *msg = &task->imsg; + struct xio_new_session_req req; + uint8_t *ptr; + uint16_t len; + struct xio_session_hdr hdr; + struct xio_session *session = connection->session; + int retval; + struct xio_session_event_data error_event = { + .conn = NULL, + .conn_user_context = NULL, + .event = XIO_SESSION_ERROR_EVENT, + .reason = XIO_E_SUCCESS, + .private_data = NULL, + .private_data_len = 0, + }; + + /* read session header */ + xio_session_read_header(task, &hdr); +#ifdef XIO_SESSION_DEBUG + connection->peer_connection = hdr.connection; + connection->peer_session = hdr.session; +#endif + task->imsg.sn = hdr.serial_num; + task->connection = connection; + task->session = session; + connection->session->setup_req = msg; + connection->session->connection_srv_first = connection; + + /* read the header */ + ptr = (uint8_t *)msg->in.header.iov_base; + + memset(&req, 0, sizeof(req)); + + /* session id */ + len = xio_read_uint32(&session->peer_session_id, 0, ptr); + ptr = ptr + len; + + /* queue depth bytes */ + len = xio_read_uint64(&session->peer_snd_queue_depth_bytes, 0, ptr); + ptr = ptr + len; + + len = xio_read_uint64(&session->peer_rcv_queue_depth_bytes, 0, ptr); + ptr = ptr + len; + + /* queue depth msgs */ + len = xio_read_uint16((uint16_t *)&session->peer_snd_queue_depth_msgs, + 0, ptr); + ptr = ptr + len; + + len = xio_read_uint16((uint16_t *)&session->peer_rcv_queue_depth_msgs, + 0, ptr); + ptr = ptr + len; + + /* uri length */ + len = xio_read_uint16(&req.uri_len, 0, ptr); + ptr = ptr + len; + + /* private length */ + len = xio_read_uint16(&req.private_data_len, 0, ptr); + ptr = ptr + len; + + if (req.uri_len) { + req.uri = + (char *)kcalloc(req.uri_len, sizeof(char), GFP_KERNEL); + if (unlikely(!req.uri)) { + xio_set_error(ENOMEM); + ERROR_LOG("uri allocation failed. len:%d\n", + req.uri_len); + goto cleanup1; + } + + len = xio_read_array((uint8_t *)req.uri, + req.uri_len, 0, ptr); + ptr = ptr + len; + } + if (req.private_data_len) { + req.private_data = kcalloc(req.private_data_len, + sizeof(uint8_t), GFP_KERNEL); + if (unlikely(!req.private_data)) { + xio_set_error(ENOMEM); + ERROR_LOG("private data allocation failed. 
len:%d\n", + req.private_data_len); + goto cleanup2; + } + len = xio_read_array((uint8_t *)req.private_data, + req.private_data_len, + 0, ptr); + ptr = ptr + len; + } + + req.proto = (enum xio_proto)xio_nexus_get_proto(connection->nexus); + xio_nexus_get_peer_addr(connection->nexus, + &req.src_addr, sizeof(req.src_addr)); + + /* cache the task in io queue*/ + xio_connection_queue_io_task(connection, task); + + /* notify the upper layer */ + if (connection->ses_ops.on_new_session) { +#ifdef XIO_THREAD_SAFE_DEBUG + xio_ctx_debug_thread_unlock(connection->ctx); +#endif + retval = connection->ses_ops.on_new_session( + session, &req, + connection->cb_user_context); +#ifdef XIO_THREAD_SAFE_DEBUG + xio_ctx_debug_thread_lock(connection->ctx); +#endif + if (retval) + goto cleanup2; + } else { + retval = xio_accept(session, NULL, 0, NULL, 0); + if (retval) { + ERROR_LOG("failed to auto accept session. session:%p\n", + session); + goto cleanup2; + } + } + + /* Don't move session state to ONLINE. In case of multiple portals + * the accept moves the state to ACCEPTED until the first "HELLO" + * message arrives. Note that the "upper layer" may call redirect or + * reject. + */ + + xio_session_notify_new_connection(session, connection); + + kfree(req.private_data); + kfree(req.uri); + + return 0; + +cleanup2: + kfree(req.private_data); + +cleanup1: + kfree(req.uri); + + if (session->ses_ops.on_session_event) { +#ifdef XIO_THREAD_SAFE_DEBUG + xio_ctx_debug_thread_unlock(connection->ctx); +#endif + error_event.reason = (enum xio_status)xio_errno(); + session->ses_ops.on_session_event( + session, &error_event, + session->cb_user_context); +#ifdef XIO_THREAD_SAFE_DEBUG + xio_ctx_debug_thread_lock(connection->ctx); +#endif + } + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_session_write_accept_rsp */ +/*---------------------------------------------------------------------------*/ +struct xio_msg *xio_session_write_accept_rsp(struct xio_session *session, + uint16_t action, + const char **portals_array, + uint16_t portals_array_len, + void *user_context, + uint16_t user_context_len) +{ + struct xio_msg *msg; + uint8_t *buf; + uint8_t *ptr; + uint16_t len, i, str_len, tot_len; + + /* calculate length */ + tot_len = 5*sizeof(uint16_t) + sizeof(uint32_t) + 2*sizeof(uint64_t); + for (i = 0; i < portals_array_len; i++) + tot_len += strlen(portals_array[i]) + sizeof(uint16_t); + tot_len += user_context_len; + + if (tot_len > SETUP_BUFFER_LEN) { + ERROR_LOG("buffer is too small\n"); + xio_set_error(EMSGSIZE); + return NULL; + } + + /* allocate message */ + buf = (uint8_t *)kcalloc(SETUP_BUFFER_LEN + sizeof(struct xio_msg), + sizeof(uint8_t), GFP_KERNEL); + if (unlikely(!buf)) { + ERROR_LOG("message allocation failed\n"); + xio_set_error(ENOMEM); + return NULL; + } + + /* fill the message */ + msg = (struct xio_msg *)buf; + msg->out.header.iov_base = buf + sizeof(struct xio_msg); + msg->out.header.iov_len = 0; + + ptr = (uint8_t *)msg->out.header.iov_base; + len = 0; + + /* serialize message into the buffer */ + + /* session_id */ + len = xio_write_uint32(session->session_id, 0, ptr); + ptr = ptr + len; + + /* action */ + len = xio_write_uint16(action, 0, ptr); + ptr = ptr + len; + + if (action == XIO_ACTION_ACCEPT) { + /* tx queue depth bytes */ + len = xio_write_uint64(session->snd_queue_depth_bytes, 0, ptr); + ptr = ptr + len; + + /* rx queue depth bytes */ + len = xio_write_uint64(session->rcv_queue_depth_bytes, 0, ptr); + ptr = ptr + len; + + /* tx queue 
depth msgs */ + len = xio_write_uint16(session->snd_queue_depth_msgs, 0, ptr); + ptr = ptr + len; + + /* rx queue depth msgs */ + len = xio_write_uint16(session->rcv_queue_depth_msgs, 0, ptr); + ptr = ptr + len; + } + + /* portals_array_len */ + len = xio_write_uint16(portals_array_len, 0, ptr); + ptr = ptr + len; + + /* user_context_len */ + len = xio_write_uint16(user_context_len, 0, ptr); + ptr = ptr + len; + + for (i = 0; i < portals_array_len; i++) { + str_len = strlen(portals_array[i]); + + len = xio_write_uint16(str_len, 0, ptr); + ptr = ptr + len; + + len = xio_write_array((uint8_t *)portals_array[i], + str_len, 0, ptr); + ptr = ptr + len; + } + + if (user_context_len) { + len = xio_write_array((const uint8_t *)user_context, + user_context_len, + 0, ptr); + ptr = ptr + len; + } + + msg->out.header.iov_len = ptr - (uint8_t *)msg->out.header.iov_base; + + if (msg->out.header.iov_len != tot_len) { + ERROR_LOG("calculated length %d != actual length %zd\n", + tot_len, msg->out.header.iov_len); + } + + return msg; +} + +/*---------------------------------------------------------------------------*/ +/* xio_session_write_reject_rsp */ +/*---------------------------------------------------------------------------*/ +struct xio_msg *xio_session_write_reject_rsp(struct xio_session *session, + enum xio_status reason, + void *user_context, + uint16_t user_context_len) +{ + struct xio_msg *msg; + uint8_t *buf; + uint8_t *ptr; + uint16_t len, tot_len; + uint16_t action = XIO_ACTION_REJECT; + + /* calclate length */ + tot_len = 2*sizeof(uint16_t) + 2*sizeof(uint32_t); + tot_len += user_context_len; + + if (tot_len > SETUP_BUFFER_LEN) { + ERROR_LOG("buffer is too small\n"); + xio_set_error(EMSGSIZE); + return NULL; + } + + /* allocate message */ + buf = (uint8_t *)kcalloc(SETUP_BUFFER_LEN + sizeof(struct xio_msg), + sizeof(uint8_t), GFP_KERNEL); + if (!buf) { + ERROR_LOG("message allocation failed\n"); + xio_set_error(ENOMEM); + return NULL; + } + + /* fill the message */ + msg = (struct xio_msg *)buf; + msg->out.header.iov_base = buf + sizeof(struct xio_msg); + msg->out.header.iov_len = 0; + + ptr = (uint8_t *)msg->out.header.iov_base; + len = 0; + + /* serialize message into the buffer */ + + /* session_id */ + len = xio_write_uint32(session->session_id, 0, ptr); + ptr = ptr + len; + + /* action */ + len = xio_write_uint16(action, 0, ptr); + ptr = ptr + len; + + /* reason */ + len = xio_write_uint32(reason, 0, ptr); + ptr = ptr + len; + + /* user_context_len */ + len = xio_write_uint16(user_context_len, 0, ptr); + ptr = ptr + len; + + if (user_context_len) { + len = xio_write_array((const uint8_t *)user_context, + user_context_len, + 0, ptr); + ptr = ptr + len; + } + + msg->out.header.iov_len = ptr - (uint8_t *)msg->out.header.iov_base; + + if (msg->out.header.iov_len != tot_len) { + ERROR_LOG("calculated length %d != actual length %zd\n", + tot_len, msg->out.header.iov_len); + } + + return msg; +} + +/*---------------------------------------------------------------------------*/ +/* xio_accept */ +/*---------------------------------------------------------------------------*/ +int xio_accept(struct xio_session *session, + const char **portals_array, + size_t portals_array_len, + void *user_context, + size_t user_context_len) +{ + int retval = 0; + struct xio_msg *msg; + struct xio_task *task; + + msg = xio_session_write_accept_rsp(session, + XIO_ACTION_ACCEPT, + portals_array, + portals_array_len, + user_context, + user_context_len); + if (!msg) { + ERROR_LOG("setup request creation failed\n"); 
+ return -1; + } + + msg->request = session->setup_req; + msg->type = (enum xio_msg_type)XIO_SESSION_SETUP_RSP; + + task = container_of(msg->request, + struct xio_task, imsg); + + if (portals_array_len != 0) { + /* server side state is changed to ACCEPT, will be move to + * ONLINE state when first "hello" message arrives + */ + session->state = XIO_SESSION_STATE_ACCEPTED; + /* temporary disable teardown */ + session->disable_teardown = 1; + TRACE_LOG("session state is now ACCEPT. session:%p\n", + session); + } else { + /* initialize credits */ + task->connection->peer_credits_msgs = + session->peer_rcv_queue_depth_msgs; + task->connection->credits_msgs = 0; + task->connection->peer_credits_bytes = + session->peer_rcv_queue_depth_bytes; + task->connection->credits_bytes = 0; + + /* server side state is changed to ONLINE, immediately */ + session->state = XIO_SESSION_STATE_ONLINE; + TRACE_LOG("session state changed to ONLINE. session:%p\n", + session); + } + retval = xio_connection_send(task->connection, msg); + if (retval && retval != -EAGAIN) { + ERROR_LOG("failed to send message. errno:%d\n", -retval); + xio_set_error(-retval); + return -1; + } + + return 0; +} +EXPORT_SYMBOL(xio_accept); + +/*---------------------------------------------------------------------------*/ +/* xio_redirect */ +/*---------------------------------------------------------------------------*/ +int xio_redirect(struct xio_session *session, + const char **portals_array, + size_t portals_array_len) +{ + int retval = 0; + struct xio_msg *msg; + struct xio_task *task; + + if (portals_array_len == 0 || !portals_array) { + xio_set_error(EINVAL); + ERROR_LOG("portals array for redirect is mandatory\n"); + return -1; + } + + msg = xio_session_write_accept_rsp(session, + XIO_ACTION_REDIRECT, + portals_array, + portals_array_len, + NULL, + 0); + if (unlikely(!msg)) { + ERROR_LOG("setup request creation failed\n"); + return -1; + } + if (portals_array_len != 0) { + /* server side state is changed to ACCEPT */ + session->state = XIO_SESSION_STATE_REDIRECTED; + TRACE_LOG("session state is now REDIRECTED. session:%p\n", + session); + } + msg->request = session->setup_req; + msg->type = (enum xio_msg_type)XIO_SESSION_SETUP_RSP; + + task = container_of(msg->request, + struct xio_task, imsg); + + retval = xio_connection_send(task->connection, msg); + if (retval && retval != -EAGAIN) { + ERROR_LOG("failed to send message errno:%d\n", -retval); + xio_set_error(-retval); + return -1; + } + + return 0; +} +EXPORT_SYMBOL(xio_redirect); + +/*---------------------------------------------------------------------------*/ +/* xio_reject */ +/*---------------------------------------------------------------------------*/ +int xio_reject(struct xio_session *session, + enum xio_status reason, + void *user_context, + size_t user_context_len) +{ + int retval = 0; + struct xio_msg *msg; + struct xio_task *task; + + msg = xio_session_write_reject_rsp(session, reason, user_context, + user_context_len); + if (!msg) { + ERROR_LOG("setup request creation failed\n"); + return -1; + } + /* server side state is changed to REJECTED */ + session->state = XIO_SESSION_STATE_REJECTED; + TRACE_LOG("session state is now REJECT. 
session:%p\n", + session); + + msg->request = session->setup_req; + msg->type = (enum xio_msg_type)XIO_SESSION_SETUP_RSP; + + task = container_of(msg->request, + struct xio_task, imsg); + + task->connection->close_reason = XIO_E_SESSION_REJECTED; + + retval = xio_connection_send(task->connection, msg); + if (retval && retval != -EAGAIN) { + ERROR_LOG("failed to send message. errno:%d\n", -retval); + xio_set_error(-retval); + return -1; + } + + return 0; +} +EXPORT_SYMBOL(xio_reject); + +/*---------------------------------------------------------------------------*/ +/* xio_on_setup_rsp_send_comp */ +/*---------------------------------------------------------------------------*/ +int xio_on_setup_rsp_send_comp(struct xio_connection *connection, + struct xio_task *task) +{ + TRACE_LOG("got session setup response comp. session:%p, " \ + "connection:%p\n", + connection->session, connection); + + kfree(task->omsg); + + /* recycle the task */ + xio_tasks_pool_put(task); + + /* time to set new callback */ + DEBUG_LOG("task recycled\n"); + + switch (connection->session->state) { + case XIO_SESSION_STATE_ACCEPTED: + case XIO_SESSION_STATE_REJECTED: + case XIO_SESSION_STATE_REDIRECTED: + xio_disconnect_initial_connection(connection); + break; + default: + /* try to transmit now */ + xio_connection_xmit_msgs(connection); + break; + } + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_on_server_nexus_established */ +/*---------------------------------------------------------------------------*/ +int xio_on_server_nexus_established(struct xio_session *session, + struct xio_nexus *nexus, + union xio_nexus_event_data *event_data) +{ + struct xio_connection *connection = xio_session_find_connection(session, nexus); + connection->restarted = 1; + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_server_on_nexus_event */ +/*---------------------------------------------------------------------------*/ +int xio_server_on_nexus_event(void *observer, void *sender, int event, + void *_event_data) +{ + struct xio_session *session = (struct xio_session *)observer; + struct xio_nexus *nexus = (struct xio_nexus *)sender; + int retval = 0; + union xio_nexus_event_data *event_data = (union xio_nexus_event_data *) + _event_data; + switch (event) { + case XIO_NEXUS_EVENT_ALLOC_HEAD_BUF: + TRACE_LOG("session: [notification] - assign in buf. " \ + "session:%p, nexus:%p\n", observer, sender); + + xio_on_head_alloc_buf(session, nexus, event_data); + break; + + case XIO_NEXUS_EVENT_ALLOC_DATA_BUF: + TRACE_LOG("session: [notification] - assign in buf. " \ + "session:%p, nexus:%p\n", observer, sender); + + xio_on_data_alloc_buf(session, nexus, event_data); + break; + case XIO_NEXUS_EVENT_NEW_MESSAGE: +/* + TRACE_LOG("session: [notification] - new message. " \ + "session:%p, nexus:%p\n", observer, sender); + +*/ xio_on_new_message(session, nexus, event_data); + break; + case XIO_NEXUS_EVENT_SEND_COMPLETION: +/* TRACE_LOG("session: [notification] - send_completion. " \ + "session:%p, nexus:%p\n", observer, sender); +*/ + xio_on_send_completion(session, nexus, event_data); + break; + case XIO_NEXUS_EVENT_DIRECT_RDMA_COMPLETION: + xio_on_rdma_direct_comp(session, nexus, event_data); + break; + case XIO_NEXUS_EVENT_ASSIGN_IN_BUF: +/* TRACE_LOG("session: [notification] - assign in buf. 
" \ + "session:%p, nexus:%p\n", observer, sender); +*/ + xio_on_assign_in_buf(session, nexus, event_data); + break; + case XIO_NEXUS_EVENT_CANCEL_REQUEST: + DEBUG_LOG("session: [notification] - cancel request. " \ + "session:%p, nexus:%p\n", observer, sender); + xio_on_cancel_request(session, nexus, event_data); + break; + case XIO_NEXUS_EVENT_CANCEL_RESPONSE: + DEBUG_LOG("session: [notification] - cancel response. " \ + "session:%p, nexus:%p\n", observer, sender); + xio_on_cancel_response(session, nexus, event_data); + break; + case XIO_NEXUS_EVENT_ESTABLISHED: + DEBUG_LOG("session: [notification] - connection established. " \ + "session:%p, nexus:%p\n", observer, sender); + xio_on_server_nexus_established(session, nexus, event_data); + break; + case XIO_NEXUS_EVENT_DISCONNECTED: + DEBUG_LOG("session: [notification] - connection disconnected" \ + " session:%p, nexus:%p\n", observer, sender); + xio_on_nexus_disconnected(session, nexus, event_data); + break; + case XIO_NEXUS_EVENT_RECONNECTING: + DEBUG_LOG("session: [notification] - connection reconnecting" \ + " session:%p, nexus:%p\n", observer, sender); + xio_on_nexus_reconnecting(session, nexus); + break; + case XIO_NEXUS_EVENT_RECONNECTED: + DEBUG_LOG("session: [notification] - connection reconnected" \ + " session:%p, nexus:%p\n", observer, sender); + xio_on_nexus_reconnected(session, nexus); + break; + case XIO_NEXUS_EVENT_CLOSED: + DEBUG_LOG("session: [notification] - connection closed. " \ + "session:%p, nexus:%p\n", observer, sender); + xio_on_nexus_closed(session, nexus, event_data); + break; + case XIO_NEXUS_EVENT_ERROR: + DEBUG_LOG("session: [notification] - connection error. " \ + "session:%p, nexus:%p\n", observer, sender); + xio_on_nexus_error(session, nexus, event_data); + break; + case XIO_NEXUS_EVENT_MESSAGE_ERROR: + DEBUG_LOG("session: [notification] - nexus message error. " \ + "session:%p, nexus:%p\n", observer, sender); + xio_on_nexus_message_error(session, nexus, event_data); + break; + default: + DEBUG_LOG("session: [notification] - unexpected event. " \ + "event:%d, session:%p, nexus:%p\n", + event, observer, sender); + xio_on_nexus_error(session, nexus, event_data); + break; + } + + return retval; +} diff --git a/open_src/xio/src/common/xio_sessions_cache.c b/open_src/xio/src/common/xio_sessions_cache.c new file mode 100644 index 0000000..ae7faea --- /dev/null +++ b/open_src/xio/src/common/xio_sessions_cache.c @@ -0,0 +1,145 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#include +#include +#include "libxio.h" +#include "xio_log.h" +#include "xio_common.h" +#include "xio_hash.h" +#include "xio_observer.h" +#include "xio_transport.h" +#include "xio_protocol.h" +#include "xio_mbuf.h" +#include "xio_task.h" +#include "xio_workqueue.h" +#include "xio_session.h" +#include "xio_sessions_cache.h" + +static HT_HEAD(, xio_session, HASHTABLE_PRIME_SMALL) sessions_cache; +static spinlock_t ss_lock; + +/*---------------------------------------------------------------------------*/ +/* sessions_cache_add */ +/*---------------------------------------------------------------------------*/ +static int sessions_cache_add(struct xio_session *session, + uint32_t session_id) +{ + struct xio_session *s; + struct xio_key_int32 key = { + .id = session_id, + .pad = {0}, + }; + HT_LOOKUP(&sessions_cache, &key, s, sessions_htbl); + if (s) + return -1; + + HT_INSERT(&sessions_cache, &key, session, sessions_htbl); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_sessions_cache_remove */ +/*---------------------------------------------------------------------------*/ +int xio_sessions_cache_remove(uint32_t session_id) +{ + struct xio_session *s; + struct xio_key_int32 key; + + spin_lock(&ss_lock); + key.id = session_id; + HT_LOOKUP(&sessions_cache, &key, s, sessions_htbl); + if (!s) { + spin_unlock(&ss_lock); + return -1; + } + + HT_REMOVE(&sessions_cache, s, xio_session, sessions_htbl); + spin_unlock(&ss_lock); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_sessions_cache_lookup */ +/*---------------------------------------------------------------------------*/ +struct xio_session *xio_sessions_cache_lookup(uint32_t session_id) +{ + struct xio_session *s; + struct xio_key_int32 key; + + spin_lock(&ss_lock); + key.id = session_id; + HT_LOOKUP(&sessions_cache, &key, s, sessions_htbl); + spin_unlock(&ss_lock); + + return s; +} + +/*---------------------------------------------------------------------------*/ +/* xio_sessions_cache_add */ +/*---------------------------------------------------------------------------*/ +int xio_sessions_cache_add(struct xio_session *session, + uint32_t *session_id) +{ + static uint32_t sid; /* = 0 global session provider */ + int retval; + + spin_lock(&ss_lock); + retval = sessions_cache_add(session, sid); + if (retval == 0) + *session_id = sid++; + spin_unlock(&ss_lock); + + return retval; +} + 
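A short sketch of the cache lifecycle implemented by the helpers above, as it is used elsewhere in this patch: session setup registers the session and receives an id, xio_connect() looks it up, and teardown removes it. It assumes the internal headers (xio_session.h, xio_sessions_cache.h) are in scope.

/* Illustrative sketch, not part of this patch. "s" stands in for a
 * freshly allocated session. */
static int cache_session_example(struct xio_session *s)
{
        struct xio_session *found;

        if (xio_sessions_cache_add(s, &s->session_id))  /* assigns a new id */
                return -1;

        found = xio_sessions_cache_lookup(s->session_id);
        if (found != s)
                return -1;

        return xio_sessions_cache_remove(s->session_id);
}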
+/*---------------------------------------------------------------------------*/ +/* sessions_cache_construct */ +/*---------------------------------------------------------------------------*/ +void sessions_cache_construct(void) +{ + HT_INIT(&sessions_cache, xio_int32_hash, xio_int32_cmp, xio_int32_cp); + spin_lock_init(&ss_lock); +} + +/* +void sessions_cache_destruct(void) +{ +} +*/ diff --git a/open_src/xio/src/common/xio_sessions_cache.h b/open_src/xio/src/common/xio_sessions_cache.h new file mode 100644 index 0000000..c44180d --- /dev/null +++ b/open_src/xio/src/common/xio_sessions_cache.h @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ +#ifndef XIO_SESSIONS_CACHE_H +#define XIO_SESSIONS_CACHE_H + +/*---------------------------------------------------------------------------*/ +/* forward declarations */ +/*---------------------------------------------------------------------------*/ +struct xio_session; + +/*---------------------------------------------------------------------------*/ +/* sessions_cache_construct */ +/*---------------------------------------------------------------------------*/ +void sessions_cache_construct(void); + +int xio_sessions_cache_add(struct xio_session *session, uint32_t *session_id); + +int xio_sessions_cache_remove(uint32_t session_id); + +struct xio_session *xio_sessions_cache_lookup(uint32_t session_id); + +#endif /*XIO_SESSIONS_CACHE_H */ + diff --git a/open_src/xio/src/common/xio_sg_table.h b/open_src/xio/src/common/xio_sg_table.h new file mode 100644 index 0000000..0fbe304 --- /dev/null +++ b/open_src/xio/src/common/xio_sg_table.h @@ -0,0 +1,147 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef XIO_SG_TABLE_OPS +#define XIO_SG_TABLE_OPS + +#ifdef __cplusplus +extern "C" { +#endif + +typedef void (*sge_set_buf_fn)(void *sge, const void *buf, + uint32_t buflen, void *mr); +typedef void *(*sge_addr_fn)(void *sge); +typedef void (*sge_set_addr_fn)(void *sge, void *addr); +typedef void *(*sge_mr_fn)(void *sge); +typedef void (*sge_set_mr_fn)(void *sge, void *mr); +typedef size_t (*sge_length_fn)(void *sge); +typedef void (*sge_set_length_fn)(void *sge, size_t len); + +typedef void *(*sge_first_fn)(void *tbl); +typedef void *(*sge_last_fn)(void *tbl); +typedef void *(*sge_next_fn)(void *tbl, void *sge); + +typedef int (*tbl_empty_fn)(void *tbl); +typedef void *(*tbl_sglist_fn)(void *tbl); +typedef uint32_t (*tbl_nents_fn)(void *tbl); +typedef void (*tbl_set_nents_fn)(void *tbl, uint32_t nents); +typedef uint32_t (*tbl_max_nents_fn)(void *tbl); +typedef void (*tbl_set_max_nents_fn)(void *tbl, uint32_t max_nents); +typedef size_t (*tbl_length_fn)(void *tbl); + +struct xio_sg_table_ops { + sge_set_buf_fn sge_set_buf; + sge_addr_fn sge_addr; + sge_set_addr_fn sge_set_addr; + sge_mr_fn sge_mr; + sge_set_mr_fn sge_set_mr; + sge_length_fn sge_length; + sge_set_length_fn sge_set_length; + + sge_first_fn sge_first; + sge_last_fn sge_last; + sge_next_fn sge_next; + + tbl_empty_fn tbl_empty; + tbl_sglist_fn tbl_sglist; + tbl_nents_fn tbl_nents; + tbl_set_nents_fn tbl_set_nents; + tbl_max_nents_fn tbl_max_nents; + tbl_set_max_nents_fn tbl_set_max_nents; + tbl_length_fn tbl_length; +}; + +int tbl_copy(struct xio_sg_table_ops *dtbl_ops, void *dtbl, + struct xio_sg_table_ops *stbl_ops, void *stbl); + +int tbl_copy_sg(struct xio_sg_table_ops *dtbl_ops, void *dtbl, + struct xio_sg_table_ops *stbl_ops, void *stbl); + +int tbl_clone(struct xio_sg_table_ops *dtbl_ops, void *dtbl, + struct xio_sg_table_ops *stbl_ops, void *stbl); + +#define sge_set_buf(ops, sge, buf, buflen, mr) \ + ((ops)->sge_set_buf((sge), (buf), (buflen), (mr))) +#define sge_addr(ops, sge) \ + ((ops)->sge_addr((sge))) +#define sge_set_addr(ops, sge, addr) \ + ((ops)->sge_set_addr((sge), (addr))) +#define sge_mr(ops, sge) \ + ((ops)->sge_mr((sge))) +#define sge_set_mr(ops, sge, mr) \ + ((ops)->sge_set_mr((sge), (mr))) +#define sge_length(ops, sge) \ + ((ops)->sge_length((sge))) +#define sge_set_length(ops, sge, len) \ + ((ops)->sge_set_length((sge), (len))) +#define sge_first(ops, tbl) \ + ((ops)->sge_first((tbl))) +#define sge_last(ops, tbl) \ + ((ops)->sge_last((tbl))) +#define sge_next(ops, tbl, sge) \ + ((ops)->sge_next((tbl), (sge))) +#define tbl_empty(ops, tbl) \ + ((ops)->tbl_empty((tbl))) +#define tbl_nents(ops, tbl) \ + ((ops)->tbl_nents((tbl))) +#define tbl_sglist(ops, tbl) \ + ((ops)->tbl_sglist((tbl))) +#define tbl_max_nents(ops, tbl) \ + ((ops)->tbl_max_nents((tbl))) +#define tbl_set_nents(ops, tbl, nents) \ + ((ops)->tbl_set_nents((tbl), nents)) +#define tbl_set_max_nents(ops, tbl, max_nents) \ + ((ops)->tbl_set_max_nents((tbl), max_nents)) +#define tbl_length(ops, tbl) \ + ((ops)->tbl_length((tbl))) + +#define for_each_sge(sgtbl, ops, sg, __i) \ + for ((__i) = 0, (sg) = sge_first((ops), (sgtbl)); \ + (__i) < tbl_nents((ops), (sgtbl)); \ + (__i)++, (sg) = sge_next((ops), (sgtbl), (sg))) + +#define xio_sg_table_get(vmsg) \ + ((void *)&((vmsg)->data_tbl)) + +void *xio_sg_table_ops_get(enum xio_sgl_type sgl_type); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/open_src/xio/src/common/xio_task.h b/open_src/xio/src/common/xio_task.h new file mode 100644 index 0000000..e2edb7b --- /dev/null +++ 
b/open_src/xio/src/common/xio_task.h @@ -0,0 +1,360 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
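To show how the scatter-gather ops table and accessor macros above are meant to be used, here is a hedged sketch that walks the "in" side of a message and sums its buffer lengths. It assumes msg->in.sgl_type names the scatter-list representation in use, as it does elsewhere in libxio.

#include "libxio.h"
/* also needs xio_sg_table.h from this patch */

/* Illustrative sketch, not part of this patch. */
static size_t count_in_bytes(struct xio_msg *msg)
{
        struct xio_sg_table_ops *ops;
        void *sgtbl, *sg;
        unsigned int i;
        size_t total = 0;

        ops   = (struct xio_sg_table_ops *)
                        xio_sg_table_ops_get(msg->in.sgl_type);
        sgtbl = xio_sg_table_get(&msg->in);

        for_each_sge(sgtbl, ops, sg, i)
                total += sge_length(ops, sg);   /* sge_addr(ops, sg) is the buffer */

        return total;
}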
+ */ +#ifndef XIO_TASK_H +#define XIO_TASK_H + +#ifndef list_last_entry +#define list_last_entry(ptr, type, member) \ + list_entry((ptr)->prev, type, member) +#endif + +enum xio_task_state { + XIO_TASK_STATE_INIT, + XIO_TASK_STATE_DELIVERED, + XIO_TASK_STATE_READ, + XIO_TASK_STATE_RESPONSE_RECV, /* mark the sender task */ + XIO_TASK_STATE_CANCEL_PENDING, /* mark for rdma read task */ +}; + +/*---------------------------------------------------------------------------*/ +/* forward declarations */ +/*---------------------------------------------------------------------------*/ +struct xio_tasks_pool; + +/*---------------------------------------------------------------------------*/ +/* structs */ +/*---------------------------------------------------------------------------*/ +struct xio_task { + struct list_head tasks_list_entry; + void *dd_data; + struct xio_task *sender_task; /* client only on receiver */ + struct xio_mbuf mbuf; + + enum xio_task_state state; /* task state enum */ + struct kref kref; + uint64_t stag; /* session unique tag */ + uint32_t tlv_type; + uint16_t is_control; + uint16_t omsg_flags; + uint16_t imsg_flags; + uint16_t last_in_rxq; + uint32_t ltid; /* local task id */ + uint32_t rtid; /* remote task id */ + uint32_t magic; + int32_t status; + int32_t pad1; + + void *pool; + void *slab; + void *context; + struct xio_session *session; + struct xio_connection *connection; + struct xio_nexus *nexus; + + struct xio_vmsg in_receipt; /* save in of message with */ + /* receipt */ + struct xio_msg *omsg; /* pointer from user */ + struct xio_msg imsg; /* message to the user */ +}; + +struct xio_tasks_pool_hooks { + void *context; + int (*slab_pre_create)(void *context, int alloc_nr, + void *pool_dd_data, + void *slab_dd_data); + int (*slab_destroy)(void *context, + void *pool_dd_data, + void *slab_dd_data); + int (*slab_init_task)(void *context, + void *pool_dd_data, + void *slab_dd_data, + int tid, struct xio_task *task); + int (*slab_uninit_task)(void *context, + void *pool_dd_data, + void *slab_dd_data, + struct xio_task *task); + int (*slab_remap_task)(void *old_context, + void *new_context, + void *pool_dd_data, + void *slab_dd_data, + struct xio_task *task); + int (*slab_post_create)(void *context, + void *pool_dd_data, + void *slab_dd_data); + int (*pool_pre_create)(void *context, void *pool, + void *pool_dd_data); + int (*pool_post_create)(void *context, void *pool, + void *pool_dd_data); + int (*pool_destroy)(void *context, void *pool, + void *pool_dd_data); + int (*task_pre_put)(void *context, struct xio_task *task); + int (*task_post_get)(void *context, struct xio_task *task); +}; + +struct xio_tasks_pool_params { + struct xio_tasks_pool_hooks pool_hooks; + char *pool_name; + unsigned int start_nr; + unsigned int max_nr; + unsigned int alloc_nr; + int pool_dd_data_sz; + int slab_dd_data_sz; + int task_dd_data_sz; +}; + +struct xio_tasks_slab { + struct list_head slabs_list_entry; + /* pool of tasks */ + struct xio_task **array; + uint32_t start_idx; + uint32_t end_idx; + uint32_t nr; + uint32_t huge_alloc; + void *dd_data; +}; + +struct xio_tasks_pool { + /* LIFO */ + struct xio_tasks_pool_params params; + struct list_head stack; + unsigned int curr_used; + unsigned int curr_alloced; + unsigned int max_used; + unsigned int curr_idx; + unsigned int node_id; /* numa node id */ + unsigned int pad; + struct list_head slabs_list; + void *dd_data; +}; + +/*---------------------------------------------------------------------------*/ +/* xio_task_reset */ 
+/*---------------------------------------------------------------------------*/ +static inline void xio_task_reset(struct xio_task *task) +{ + /* user responsibility to reset after receive */ + /* + if (task->imsg.user_context) + task->imsg.user_context = 0; + + task->imsg.flags = 0; + task->tlv_type = 0xdead; + task->omsg_flags = 0; + task->state = XIO_TASK_STATE_INIT; + xio_mbuf_reset(&task->mbuf); + */ + + task->sender_task = NULL; + task->tlv_type = 0xdead; +} + +/*---------------------------------------------------------------------------*/ +/* xio_task_addref */ +/*---------------------------------------------------------------------------*/ +static inline void xio_task_addref( + struct xio_task *t) +{ + kref_get(&t->kref); +} + +/*---------------------------------------------------------------------------*/ +/* xio_task_reinit */ +/*---------------------------------------------------------------------------*/ +static int xio_task_reinit(void *context, struct xio_task *task) +{ + struct xio_tasks_pool *pool = (struct xio_tasks_pool *)task->pool; + struct xio_tasks_slab *slab = (struct xio_tasks_slab *)task->slab; + int retval = -1; + int i = task->ltid - slab->start_idx; + + if (pool->params.pool_hooks.slab_init_task && i >= 0) + retval = pool->params.pool_hooks.slab_init_task(context, + pool->dd_data, + slab->dd_data, + i, + task); + task->context = context; + + return retval; +} + +/*---------------------------------------------------------------------------*/ +/* xio_task_release */ +/*---------------------------------------------------------------------------*/ +static inline void xio_task_release(struct kref *kref) +{ + struct xio_task *task = container_of(kref, struct xio_task, kref); + struct xio_tasks_pool *pool; + + assert(task->pool); + + pool = (struct xio_tasks_pool *)task->pool; + + if (pool->params.pool_hooks.task_pre_put) + pool->params.pool_hooks.task_pre_put(task->context, task); + + xio_task_reset(task); + + pool->curr_used--; + + list_move(&task->tasks_list_entry, &pool->stack); +} + +/*---------------------------------------------------------------------------*/ +/* xio_tasks_pool_create */ +/*---------------------------------------------------------------------------*/ +struct xio_tasks_pool *xio_tasks_pool_create( + struct xio_tasks_pool_params *params); + +/*---------------------------------------------------------------------------*/ +/* xio_tasks_pool_destroy */ +/*---------------------------------------------------------------------------*/ +void xio_tasks_pool_destroy(struct xio_tasks_pool *q); + +/*---------------------------------------------------------------------------*/ +/* xio_tasks_pool_remap */ +/*---------------------------------------------------------------------------*/ +void xio_tasks_pool_remap(struct xio_tasks_pool *q, void *new_context); + +/*---------------------------------------------------------------------------*/ +/* xio_tasks_pool_alloc_slab */ +/*---------------------------------------------------------------------------*/ +int xio_tasks_pool_alloc_slab(struct xio_tasks_pool *q, void *context); + +/*---------------------------------------------------------------------------*/ +/* xio_tasks_pool_dump_used */ +/*---------------------------------------------------------------------------*/ +void xio_tasks_pool_dump_used(struct xio_tasks_pool *q); + +/*---------------------------------------------------------------------------*/ +/* xio_tasks_pool_get */ +/*---------------------------------------------------------------------------*/ 
+static inline struct xio_task *xio_tasks_pool_get( + struct xio_tasks_pool *q, void *context) +{ + struct xio_task *t; + + t = list_first_entry_or_null(&q->stack, struct xio_task, + tasks_list_entry); + + if (unlikely(!t || list_is_last(&t->tasks_list_entry, &q->stack))) { + if (q->curr_used == q->params.max_nr - 1) + goto pool_exhausted; + + xio_tasks_pool_alloc_slab(q, context); + if (unlikely(list_empty(&q->stack))) + goto pool_exhausted; + t = list_last_entry(&q->stack, struct xio_task, + tasks_list_entry); + } else { + t = list_first_entry(&q->stack, struct xio_task, + tasks_list_entry); + } + list_del_init(&t->tasks_list_entry); + q->curr_used++; + if (q->curr_used > q->max_used) + q->max_used = q->curr_used; + + kref_init(&t->kref); + t->tlv_type = 0xbeef; /* poison the type */ + + if (t->context != context) + xio_task_reinit(context, t); + + if (q->params.pool_hooks.task_post_get) + q->params.pool_hooks.task_post_get(context, t); + + return t; + +pool_exhausted: + ERROR_LOG("%s - pool exhausted. used:%d max_nr:%d\n", + q->params.pool_name, + q->curr_used , q->params.max_nr); + return NULL; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tasks_pool_put */ +/*---------------------------------------------------------------------------*/ +static inline void xio_tasks_pool_put(struct xio_task *task) +{ + kref_put(&task->kref, xio_task_release); +} + +/*---------------------------------------------------------------------------*/ +/* xio_tasks_pool_free_tasks */ +/*---------------------------------------------------------------------------*/ +static inline int xio_tasks_pool_free_tasks( + struct xio_tasks_pool *q) +{ + if (!q) + return 0; + + if (q->curr_used) { + ERROR_LOG("tasks inventory: %d/%d = missing:%d\n", + q->curr_alloced - q->curr_used, q->curr_alloced, + q->curr_used); + xio_tasks_pool_dump_used(q); + } + + return q->curr_alloced - q->curr_used; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tasks_pool_lookup */ +/*---------------------------------------------------------------------------*/ +static inline struct xio_task *xio_tasks_pool_lookup( + struct xio_tasks_pool *q, + unsigned int id) +{ + struct xio_tasks_slab *slab; + struct xio_task *task = NULL; + + list_for_each_entry(slab, &q->slabs_list, slabs_list_entry) { + if (id >= slab->start_idx && id <= slab->end_idx) { + task = slab->array[id - slab->start_idx]; + break; + } + } + if (likely(task && task->ltid == id)) + return task; + + return NULL; +} + +#endif diff --git a/open_src/xio/src/common/xio_transport.c b/open_src/xio/src/common/xio_transport.c new file mode 100644 index 0000000..b41e608 --- /dev/null +++ b/open_src/xio/src/common/xio_transport.c @@ -0,0 +1,148 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#include "libxio.h" +#include +#include "xio_log.h" +#include "xio_common.h" +#include "xio_protocol.h" +#include "xio_mbuf.h" +#include "xio_task.h" +#include "xio_observer.h" +#include "xio_transport.h" + +/*---------------------------------------------------------------------------*/ +/* globals */ +/*---------------------------------------------------------------------------*/ +static LIST_HEAD(transports_list); + +/*---------------------------------------------------------------------------*/ +/* xio_reg_transport */ +/*---------------------------------------------------------------------------*/ +int xio_reg_transport(struct xio_transport *transport) +{ + if (transport) + list_add(&transport->transports_list_entry, &transports_list); + + return 0; +} +EXPORT_SYMBOL(xio_reg_transport); + +/*---------------------------------------------------------------------------*/ +/* xio_unreg_transport */ +/*---------------------------------------------------------------------------*/ +void xio_unreg_transport(struct xio_transport *transport) +{ + list_del(&transport->transports_list_entry); +} +EXPORT_SYMBOL(xio_unreg_transport); + +/*---------------------------------------------------------------------------*/ +/* xio_get_transport */ +/*---------------------------------------------------------------------------*/ +struct xio_transport *xio_get_transport(const char *name) +{ + struct xio_transport *transport; + int found = 0; + + list_for_each_entry(transport, &transports_list, + transports_list_entry) { + if (!strcmp(name, transport->name)) { + found = 1; + break; + } + } + if (!found) + return NULL; + + /* lazy initialization of transport */ + if (transport->init) { + int retval = transport->init(transport); + + if (retval != 0) { + ERROR_LOG("%s transport initialization failed.\n", + name); + return NULL; + } + } + + return transport; +} + +/*---------------------------------------------------------------------------*/ +/* xio_transport_flush_task_list */ +/*---------------------------------------------------------------------------*/ +int xio_transport_flush_task_list(struct list_head *list) +{ + struct xio_task *ptask, *next_ptask; + + list_for_each_entry_safe(ptask, next_ptask, list, + tasks_list_entry) { + /* + TRACE_LOG("flushing task %p type 0x%x\n", + ptask, 
ptask->tlv_type); + */ + if (ptask->sender_task) { + xio_tasks_pool_put(ptask->sender_task); + ptask->sender_task = NULL; + } + xio_tasks_pool_put(ptask); + } + + return 0; +} +EXPORT_SYMBOL(xio_transport_flush_task_list); + +/*---------------------------------------------------------------------------*/ +/* xio_transport_assign_in_buf */ +/*---------------------------------------------------------------------------*/ +int xio_transport_assign_in_buf(struct xio_transport_base *trans_hndl, + struct xio_task *task, int *is_assigned) +{ + union xio_transport_event_data event_data = {}; + + event_data.assign_in_buf.task = task; + + xio_transport_notify_observer(trans_hndl, + XIO_TRANSPORT_EVENT_ASSIGN_IN_BUF, + &event_data); + + *is_assigned = event_data.assign_in_buf.is_assigned; + return 0; +} +EXPORT_SYMBOL(xio_transport_assign_in_buf); + diff --git a/open_src/xio/src/common/xio_transport.h b/open_src/xio/src/common/xio_transport.h new file mode 100644 index 0000000..f4b89e2 --- /dev/null +++ b/open_src/xio/src/common/xio_transport.h @@ -0,0 +1,383 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
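xio_transport.c above keeps a simple registry: xio_reg_transport links a transport into transports_list and xio_get_transport resolves it by name, calling its init callback before returning. A hedged sketch of a transport module plugging into that registry; "mytrans" and its stub init are illustrative only, and a real transport would also fill in the open/connect/listen/send operations declared in xio_transport.h below:

static int mytrans_init(struct xio_transport *self)
{
	(void)self;	/* transport-wide resources would be set up here */
	return 0;
}

static struct xio_transport mytrans = {
	.name = "mytrans",
	.init = mytrans_init,
	/* .open, .connect, .listen, .send, ... supplied by a real transport */
};

static int mytrans_register_and_lookup(void)
{
	struct xio_transport *t;

	xio_reg_transport(&mytrans);

	/* walks transports_list, matches on ->name, runs ->init if set */
	t = xio_get_transport("mytrans");

	return t ? 0 : -1;
}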
+ */ +#ifndef XIO_TRANSPORT_H +#define XIO_TRANSPORT_H + +/*---------------------------------------------------------------------------*/ +/* forward declarations */ +/*---------------------------------------------------------------------------*/ +struct xio_task; +struct xio_observer; +struct xio_observable; +struct xio_tasks_pool_ops; + +/*---------------------------------------------------------------------------*/ +/* enums */ +/*---------------------------------------------------------------------------*/ +enum xio_transport_event { + XIO_TRANSPORT_EVENT_NEW_CONNECTION, + XIO_TRANSPORT_EVENT_ESTABLISHED, + XIO_TRANSPORT_EVENT_DISCONNECTED, + XIO_TRANSPORT_EVENT_CLOSED, + XIO_TRANSPORT_EVENT_REFUSED, + XIO_TRANSPORT_EVENT_NEW_MESSAGE, + XIO_TRANSPORT_EVENT_SEND_COMPLETION, + XIO_TRANSPORT_EVENT_ASSIGN_IN_BUF, + XIO_TRANSPORT_EVENT_ALLOC_HEAD_BUF,// head + XIO_TRANSPORT_EVENT_ALLOC_DATA_BUF,// data + XIO_TRANSPORT_EVENT_CANCEL_REQUEST, + XIO_TRANSPORT_EVENT_CANCEL_RESPONSE, + XIO_TRANSPORT_EVENT_MESSAGE_ERROR, + XIO_TRANSPORT_EVENT_ERROR, + XIO_TRANSPORT_EVENT_DIRECT_RDMA_COMPLETION +}; + +enum xio_transport_opt { + XIO_TRANSPORT_OPT_MSG_ATTR, +}; + +enum xio_transport_attr_mask { + XIO_TRANSPORT_ATTR_TOS = 1 << 0, +}; + +/*---------------------------------------------------------------------------*/ +/* unions and structs */ +/*---------------------------------------------------------------------------*/ +union xio_transport_event_data { + struct { + struct xio_task *task; + enum xio_wc_op op; + int pad; + } msg; + struct { + struct xio_task *task; + int is_assigned; + int pad; + } assign_in_buf; + struct { + void *ulp_msg; + size_t ulp_msg_sz; + struct xio_task *task; + enum xio_status result; + int pad; + } cancel; + struct { + struct xio_transport_base *child_trans_hndl; + } new_connection; + struct { + uint32_t cid; + } established; + struct { + struct xio_task *task; + enum xio_status reason; + enum xio_msg_direction direction; + } msg_error; + struct { + enum xio_status reason; + } error; + struct { + struct xio_task *task; + struct xio_iovec *header; + int is_assigned; + int pad; + } alloc_head_buf; + struct { + struct xio_task *task; + int is_assigned; + int pad; + } alloc_data_buf; +}; + +struct xio_transport_base { + struct xio_observable observable; + uint32_t is_client; /* client or server */ + int pad; + char *portal_uri; + struct sockaddr_storage peer_addr; + struct sockaddr_storage local_addr; + enum xio_proto proto; + struct kref kref; + struct xio_context *ctx; +}; + +struct xio_transport_attr { + uint8_t tos; /**< type of service RFC 2474 */ + uint8_t pad[3]; /**< padding */ +}; + +struct xio_transport_init_attr { + uint8_t tos; /**< type of service RFC 2474 */ + uint8_t pad[3]; /**< padding */ +}; + +struct xio_transport_msg_validators_cls { + int (*is_valid_out_msg)(struct xio_msg *msg); + int (*is_valid_in_req)(struct xio_msg *msg); +}; + +struct xio_tasks_pool_ops { + void (*pool_get_params)(struct xio_transport_base *transport_hndl, + int *start_nr, + int *max_nr, + int *alloc_nr, + int *pool_dd_size, + int *slab_dd_size, + int *task_dd_size); + + int (*slab_pre_create)(struct xio_transport_base *trans_hndl, + int alloc_nr, + void *pool_dd_data, void *slab_dd_data); + int (*slab_destroy)(struct xio_transport_base *trans_hndl, + void *pool_dd_data, void *slab_dd_data); + int (*slab_init_task)(struct xio_transport_base *trans_hndl, + void *pool_dd_data, void *slab_dd_data, + int tid, struct xio_task *task); + int (*slab_uninit_task)(struct xio_transport_base 
*trans_hndl, + void *pool_dd_data, void *slab_dd_data, + struct xio_task *task); + int (*slab_remap_task)(struct xio_transport_base *old_th, + struct xio_transport_base *new_th, + void *pool_dd_data, void *slab_dd_data, + struct xio_task *task); + int (*slab_post_create)(struct xio_transport_base *trans_hndl, + void *pool_dd_data, void *slab_dd_data); + int (*pool_pre_create)(struct xio_transport_base *trans_hndl, + void *pool, void *pool_dd_data); + int (*pool_post_create)(struct xio_transport_base *trans_hndl, + void *pool, void *pool_dd_data); + int (*pool_destroy)(struct xio_transport_base *trans_hndl, + void *pool, void *pool_dd_data); + int (*task_pre_put)(struct xio_transport_base *trans_hndl, + struct xio_task *task); + int (*task_post_get)(struct xio_transport_base *trans_hndl, + struct xio_task *task); +}; + +struct xio_tasks_pool_cls { + void *pool; + struct xio_task * (*task_get)(void *pool, void *context); + void (*task_put)(struct xio_task *task); + + struct xio_task * (*task_lookup)(void *pool, int task_id); +}; + +struct xio_transport { + struct xio_transport_msg_validators_cls validators_cls; + + const char *name; + + /* transport ctor/dtor called right after registration */ + void (*ctor)(void); + void (*dtor)(void); + + /* transport initialization */ + int (*init)(struct xio_transport *self); + void (*release)(struct xio_transport *self); + + /* running thread (context) is going down */ + int (*context_shutdown)(struct xio_transport_base *trans_hndl, + struct xio_context *ctx); + + /* task pools management */ + void (*get_pools_setup_ops)( + struct xio_transport_base *trans_hndl, + struct xio_tasks_pool_ops **initial_pool_ops, + struct xio_tasks_pool_ops **primary_pool_ops); + + void (*set_pools_cls)(struct xio_transport_base *trans_hndl, + struct xio_tasks_pool_cls *initial_pool_cls, + struct xio_tasks_pool_cls *primary_pool_cls); + + /* connection */ + struct xio_transport_base *(*open)( + struct xio_transport *self, + struct xio_context *ctx, + struct xio_observer *observer, + uint32_t trans_attr_mask, + struct xio_transport_init_attr *attr); + + int (*connect)(struct xio_transport_base *trans_hndl, + const char *portal_uri, + const char *out_if); + + int (*listen)(struct xio_transport_base *trans_hndl, + const char *portal_uri, uint16_t *src_port, + int backlog); + + int (*accept)(struct xio_transport_base *trans_hndl); + + int (*poll)(struct xio_transport_base *trans_hndl, + long min_nr, long nr, + struct timespec *timeout); + + int (*reject)(struct xio_transport_base *trans_hndl); + + void (*close)(struct xio_transport_base *trans_hndl); + + int (*dup2)(struct xio_transport_base *old_trans_hndl, + struct xio_transport_base **new_trans_hndl); + + int (*update_task)(struct xio_transport_base *trans_hndl, + struct xio_task *task); + + int (*update_rkey)(struct xio_transport_base *trans_hndl, + uint32_t *rkey); + + int (*send)(struct xio_transport_base *trans_hndl, + struct xio_task *task); + + int (*set_opt)(void *xio_obj, + int optname, const void *optval, int optlen); + + int (*get_opt)(void *xio_obj, + int optname, void *optval, int *optlen); + + int (*cancel_req)(struct xio_transport_base *trans_hndl, + struct xio_msg *req, uint64_t stag, + void *ulp_msg, size_t ulp_msg_len); + + int (*cancel_rsp)(struct xio_transport_base *trans_hndl, + struct xio_task *task, enum xio_status result, + void *ulp_msg, size_t ulp_msg_len); + + int (*modify)(struct xio_transport_base *trans_hndl, + struct xio_transport_attr *attr, + int attr_mask); + + int (*query)(struct 
xio_transport_base *trans_hndl, + struct xio_transport_attr *attr, + int attr_mask); + + struct list_head transports_list_entry; +}; + +/*---------------------------------------------------------------------------*/ +/* xio_transport_reg_observer */ +/*---------------------------------------------------------------------------*/ +static inline void xio_transport_reg_observer( + struct xio_transport_base *trans_hndl, + struct xio_observer *observer) +{ + xio_observable_reg_observer(&trans_hndl->observable, observer); +} + +/*---------------------------------------------------------------------------*/ +/* xio_transport_unreg_observer */ +/*---------------------------------------------------------------------------*/ +static inline void xio_transport_unreg_observer( + struct xio_transport_base *trans_hndl, + struct xio_observer *observer) +{ + xio_observable_unreg_observer(&trans_hndl->observable, observer); +} + +/*---------------------------------------------------------------------------*/ +/* xio_transport_unreg_observer */ +/*---------------------------------------------------------------------------*/ +static inline void xio_transport_notify_observer( + struct xio_transport_base *trans_hndl, + int event, void *event_data) +{ + xio_observable_notify_all_observers(&trans_hndl->observable, + event, event_data); +} + +/*---------------------------------------------------------------------------*/ +/* xio_transport_notify_observer_error */ +/*---------------------------------------------------------------------------*/ +static inline void xio_transport_notify_observer_error( + struct xio_transport_base *trans_hndl, + int reason) +{ + union xio_transport_event_data ev_data = {}; + + ev_data.error.reason = (enum xio_status)reason; + + xio_observable_notify_all_observers(&trans_hndl->observable, + XIO_TRANSPORT_EVENT_ERROR, + &ev_data); +} + +/*---------------------------------------------------------------------------*/ +/* xio_transport_notify_message_error */ +/*---------------------------------------------------------------------------*/ +static inline void xio_transport_notify_message_error( + struct xio_transport_base *trans_hndl, + struct xio_task *task, + enum xio_status reason) +{ + union xio_transport_event_data ev_data; + + ev_data.msg_error.task = task; + ev_data.msg_error.reason = reason; + + xio_observable_notify_all_observers(&trans_hndl->observable, + XIO_TRANSPORT_EVENT_MESSAGE_ERROR, + &ev_data); +} + +int xio_transport_flush_task_list(struct list_head *list); + +int xio_transport_assign_in_buf(struct xio_transport_base *trans_hndl, + struct xio_task *task, + int *is_assigned); + +/*---------------------------------------------------------------------------*/ +/* xio_reg_transport */ +/*---------------------------------------------------------------------------*/ +int xio_reg_transport(struct xio_transport *transport); + +/*---------------------------------------------------------------------------*/ +/* xio_unreg_transport */ +/*---------------------------------------------------------------------------*/ +void xio_unreg_transport(struct xio_transport *transport); + +/*---------------------------------------------------------------------------*/ +/* xio_get_transport */ +/*---------------------------------------------------------------------------*/ +struct xio_transport *xio_get_transport(const char *name); + +int xio_rdma_cancel_req(struct xio_transport_base *transport, + struct xio_msg *req, uint64_t stag, + void *ulp_msg, size_t ulp_msg_sz); + +int 
xio_rdma_cancel_rsp(struct xio_transport_base *transport, + struct xio_task *task, enum xio_status result, + void *ulp_msg, size_t ulp_msg_sz); + +#endif /*XIO_TRANSPORT_H */ diff --git a/open_src/xio/src/common/xio_utils.c b/open_src/xio/src/common/xio_utils.c new file mode 100644 index 0000000..2982a07 --- /dev/null +++ b/open_src/xio/src/common/xio_utils.c @@ -0,0 +1,471 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ +#include +#include "libxio.h" +#include "xio_log.h" +#include "xio_common.h" +#include "xio_protocol.h" + +/*---------------------------------------------------------------------------*/ +/* xio_uri_get_proto */ +/*---------------------------------------------------------------------------*/ +int xio_uri_get_proto(const char *uri, char *proto, int proto_len) +{ + char *start = (char *)uri; + const char *end; + char *p; + int i; + + end = strstr(uri, "://"); + if (!end) + return -1; + + p = start; + for (i = 0; i < proto_len; i++) { + if (p == end) { + proto[i] = 0; + return 0; + } + proto[i] = *p; + p++; + } + + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_uri_get_resource_ptr */ +/*---------------------------------------------------------------------------*/ +const char *xio_uri_get_resource_ptr(const char *uri) +{ + const char *start; + const char *p1, *p2 = NULL; + + start = strstr(uri, "://"); + if (!start) + return NULL; + + if (*(start+3) == '[') { /* IPv6 */ + p1 = strstr(start + 4, "]:"); + if (!p1) + return NULL; + p2 = strchr(p1 + 2, '/'); + + return p2; + } + + p1 = (char *)uri + strlen(uri); + while (p1 != (start + 3)) { + if (*p1 == '/') + p2 = p1; + p1--; + } + + return p2; +} + +/*---------------------------------------------------------------------------*/ +/* xio_uri_get_portal */ +/*---------------------------------------------------------------------------*/ +int xio_uri_get_portal(const char *uri, char *portal, int portal_len) +{ + const char *res = xio_uri_get_resource_ptr(uri); + int len = (!res) ? strlen(uri) : (size_t)(res - uri); + + if (len < portal_len) { + strncpy(portal, uri, len); + portal[len] = 0; + return 0; + } + + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_uri_get_resource */ +/*---------------------------------------------------------------------------*/ +int xio_uri_get_resource(const char *uri, char *resource, int resource_len) +{ + const char *res = xio_uri_get_resource_ptr(uri); + + if (res) { + int len = strlen(res); + + if (len < resource_len) { + strcpy(resource, res); + return 0; + } + } + + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_write_tlv */ +/*---------------------------------------------------------------------------*/ +size_t xio_write_tlv(uint32_t type, uint64_t len, uint8_t *buffer) +{ + struct xio_tlv *tlv = (struct xio_tlv *)buffer; + + tlv->magic = htonl(XIO_MAGIC); + tlv->type = htonl(type); + tlv->len = htonll(len); + + return sizeof(struct xio_tlv) + (size_t)len; +} +EXPORT_SYMBOL(xio_write_tlv); + +/*---------------------------------------------------------------------------*/ +/* xio_read_tlv */ +/*---------------------------------------------------------------------------*/ +size_t xio_read_tlv(uint32_t *type, uint64_t *len, void **value, + uint8_t *buffer) +{ + struct xio_tlv *tlv; + + tlv = (struct xio_tlv *)buffer; + if (unlikely(tlv->magic != htonl(XIO_MAGIC))) + return -1; + + *type = ntohl(tlv->type); + *len = ntohll(tlv->len); + *value = buffer + sizeof(struct xio_tlv); + + return sizeof(struct xio_tlv) + (size_t)*len; +} +EXPORT_SYMBOL(xio_read_tlv); + +#ifndef SETIOV +#define SETIOV(_iov, _addr, _len) ((_iov)->iov_base = \ + (void *)(_addr), (_iov)->iov_len = (_len)) +#endif +#ifndef GETIOVBASE +#define GETIOVBASE(_iov) ((_iov)->iov_base) +#endif +#ifndef GETIOVLEN +#define GETIOVLEN(_iov) ((_iov)->iov_len) +#endif + 
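The URI helpers above split a portal URI into its protocol, portal and resource parts, failing rather than truncating when the supplied buffer is too small. A usage sketch with an illustrative address, assuming the declarations above are in scope:

#include <stdio.h>

static void my_parse_uri(void)
{
	const char *uri = "rdma://192.168.1.1:1234/data";
	char proto[16], portal[64], resource[64];

	if (xio_uri_get_proto(uri, proto, sizeof(proto)) == 0)
		printf("proto:    %s\n", proto);	/* "rdma"                    */

	if (xio_uri_get_portal(uri, portal, sizeof(portal)) == 0)
		printf("portal:   %s\n", portal);	/* "rdma://192.168.1.1:1234" */

	if (xio_uri_get_resource(uri, resource, sizeof(resource)) == 0)
		printf("resource: %s\n", resource);	/* "/data"                   */
}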
+/*---------------------------------------------------------------------------*/ +/* memclonev */ +/*---------------------------------------------------------------------------*/ +size_t memclonev(struct xio_iovec *dst, int dsize, + struct xio_iovec *src, int ssize) +{ + int nr = 0; + int sz; + + sz = (dsize < ssize) ? dsize : ssize; + + while (nr < sz) { + dst[nr].iov_base = src[nr].iov_base; + dst[nr].iov_len = src[nr].iov_len; + nr++; + } + + return sz; +} +EXPORT_SYMBOL(memclonev); + +/*---------------------------------------------------------------------------*/ +/* memclonev_ex */ +/*---------------------------------------------------------------------------*/ +size_t memclonev_ex(struct xio_iovec_ex *dst, int dsize, + struct xio_iovec_ex *src, int ssize) +{ + int nr = 0; + int sz; + + sz = (dsize < ssize) ? dsize : ssize; + + while (nr < sz) { + dst[nr].iov_base = src[nr].iov_base; + dst[nr].iov_len = src[nr].iov_len; + nr++; + } + + return sz; +} + +/* + * Total number of bytes covered by an iovec. + */ +inline size_t xio_iov_length(const struct xio_iovec *iov, + unsigned long nr_segs) +{ + size_t nbytes = 0; + const struct xio_iovec *piov = iov; + + while (nr_segs > 0) { + nbytes += GETIOVLEN(piov); + nr_segs--; + piov++; + } + + return nbytes; +} + +inline size_t xio_iovex_length(const struct xio_iovec_ex *iov, + unsigned long nr_segs) +{ + size_t nbytes = 0; + const struct xio_iovec_ex *piov = iov; + + while (nr_segs > 0) { + nbytes += GETIOVLEN(piov); + nr_segs--; + piov++; + } + + return nbytes; +} + +/* +void *xio_memcpy(void* dest, const void* src, size_t count) +{ + char* dst8 = (char*)dest; + char* src8 = (char*)src; + + if (count & 1) { + dst8[0] = src8[0]; + dst8 += 1; + src8 += 1; + } + + count /= 2; + while (count--) { + dst8[0] = src8[0]; + dst8[1] = src8[1]; + + dst8 += 2; + src8 += 2; + } + return dest; +} +*/ + +/** + * memcpyv + * + * Copy data from one iov to another. + * + * @dst: An array of iovec structures that you want to + * copy the data to. + * @dsize: The number of entries in the dst array. + * @src: An array of iovec structures that you want to + * copy the data from. + * @ssize: The number of entries in the src array. + */ +size_t memcpyv(struct xio_iovec *dst, int dsize, + struct xio_iovec *src, int ssize) +{ + void *daddr = dst[0].iov_base; + void *saddr = src[0].iov_base; + size_t dlen = dst[0].iov_len; + size_t slen = src[0].iov_len; + int d = 0, + s = 0, + dst_len = 0; + + if (dsize < 1 || ssize < 1) { + ERROR_LOG("iovec size < 1 dsize:%d, ssize:%d\n", + dsize, ssize); + return 0; + } + + while (1) { + if (slen < dlen) { + memcpy(daddr, saddr, slen); + dst_len += slen; + + s++; + if (s == ssize) { + dst[d].iov_len = dst_len; + d++; + break; + } + dlen -= slen; + inc_ptr(daddr, slen); + saddr = src[s].iov_base; + slen = src[s].iov_len; + } else if (dlen < slen) { + memcpy(daddr, saddr, dlen); + dst[d].iov_len = dst_len + dlen; + dst_len = 0; + + d++; + if (d == dsize) + break; + slen -= dlen; + inc_ptr(saddr, dlen); + daddr = dst[d].iov_base; + dlen = dst[d].iov_len; + + } else { + memcpy(daddr, saddr, dlen); + dst[d].iov_len = dst_len + dlen; + dst_len = 0; + + d++; + s++; + if ((d == dsize) || (s == ssize)) + break; + + daddr = dst[d].iov_base; + dlen = dst[d].iov_len; + saddr = src[s].iov_base; + slen = src[s].iov_len; + } + } + + /* not enough buffers to complete */ + if (s < ssize) { + ERROR_LOG("dest iovec exausted\n"); + return 0; + } + + return d; +} + +/** + * memcpyv_ex + * + * Copy data from one iov to another. 
+ * + * @dst: An array of iovec structures that you want to + * copy the data to. + * @dsize: The number of entries in the dst array. + * @src: An array of iovec structures that you want to + * copy the data from. + * @ssize: The number of entries in the src array. + */ +size_t memcpyv_ex(struct xio_iovec_ex *dst, int dsize, + struct xio_iovec_ex *src, int ssize) +{ + void *daddr = dst[0].iov_base; + void *saddr = src[0].iov_base; + size_t dlen = dst[0].iov_len; + size_t slen = src[0].iov_len; + int d = 0, + s = 0, + dst_len = 0; + + if (dsize < 1 || ssize < 1) { + ERROR_LOG("iovec size < 1 dsize:%d, ssize:%d\n", + dsize, ssize); + return 0; + } + + while (1) { + if (slen < dlen) { + memcpy(daddr, saddr, slen); + dst_len += slen; + + s++; + if (s == ssize) { + dst[d].iov_len = dst_len; + d++; + break; + } + dlen -= slen; + inc_ptr(daddr, slen); + saddr = src[s].iov_base; + slen = src[s].iov_len; + } else if (dlen < slen) { + memcpy(daddr, saddr, dlen); + dst[d].iov_len = dst_len + dlen; + dst_len = 0; + + d++; + if (d == dsize) + break; + slen -= dlen; + inc_ptr(saddr, dlen); + daddr = dst[d].iov_base; + dlen = dst[d].iov_len; + + } else { + memcpy(daddr, saddr, dlen); + dst[d].iov_len = dst_len + dlen; + dst_len = 0; + + d++; + s++; + if ((d == dsize) || (s == ssize)) + break; + + daddr = dst[d].iov_base; + dlen = dst[d].iov_len; + saddr = src[s].iov_base; + slen = src[s].iov_len; + } + } + + /* not enough buffers to complete */ + if (s < ssize) { + ERROR_LOG("dest iovec exhausted\n"); + return 0; + } + + return d; +} + +extern const char XIO_VERSION_STRING[]; + +/*---------------------------------------------------------------------------*/ +/* xio_version */ +/*---------------------------------------------------------------------------*/ +inline const char *xio_version(void) +{ + return XIO_VERSION_STRING; +} +EXPORT_SYMBOL(xio_version); + +/*---------------------------------------------------------------------------*/ +/* xio_proto_str */ +/*---------------------------------------------------------------------------*/ +const char *xio_proto_str(enum xio_proto proto) +{ + switch (proto) { + case XIO_PROTO_RDMA: return "rdma"; + case XIO_PROTO_TCP: return "tcp"; + default: return "proto_unknown"; + } +} +EXPORT_SYMBOL(xio_proto_str); + diff --git a/open_src/xio/src/common/xio_workqueue.h b/open_src/xio/src/common/xio_workqueue.h new file mode 100644 index 0000000..9c365bd --- /dev/null +++ b/open_src/xio/src/common/xio_workqueue.h @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
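memcpyv above gathers the bytes described by one iovec array into another, rewriting each destination iov_len to the amount actually written and returning the number of destination entries consumed (0 when the destination runs out before the source is fully copied). A short sketch, assuming struct xio_iovec from the public headers and the SETIOV macro defined earlier in this file; the buffers are illustrative:

static void my_gather_example(void)
{
	char a[] = "Hello, ";
	char b[] = "world!";
	char out[64];
	struct xio_iovec src[2], dst[1];
	size_t used;

	SETIOV(&src[0], a, 7);			/* two source fragments  */
	SETIOV(&src[1], b, 6);
	SETIOV(&dst[0], out, sizeof(out));	/* one large destination */

	used = memcpyv(dst, 1, src, 2);		/* copies 13 bytes       */
	/* used == 1 and dst[0].iov_len == 13 after the call */
	(void)used;
}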
+ * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef XIO_WORKQUEUE_H +#define XIO_WORKQUEUE_H + +#include "xio_workqueue_priv.h" + +/* opaque type */ +struct xio_workqueue; +struct xio_context; + +/*---------------------------------------------------------------------------*/ +/* xio_workqueue_create */ +/*---------------------------------------------------------------------------*/ +struct xio_workqueue *xio_workqueue_create(struct xio_context *ctx); + +/*---------------------------------------------------------------------------*/ +/* xio_workqueue_destroy */ +/*---------------------------------------------------------------------------*/ +int xio_workqueue_destroy(struct xio_workqueue *work_queue); + +/*---------------------------------------------------------------------------*/ +/* xio_workqueue_add_delayed_work */ +/*---------------------------------------------------------------------------*/ +int xio_workqueue_add_delayed_work(struct xio_workqueue *work_queue, + int msec_duration, void *data, + void (*function)(void *data), + xio_delayed_work_handle_t *work); + +/*---------------------------------------------------------------------------*/ +/* xio_workqueue_del_delayed_work */ +/*---------------------------------------------------------------------------*/ +int xio_workqueue_del_delayed_work(struct xio_workqueue *work_queue, + xio_delayed_work_handle_t *work); + +/*---------------------------------------------------------------------------*/ +/* xio_workqueue_add_work */ +/*---------------------------------------------------------------------------*/ +int xio_workqueue_add_work(struct xio_workqueue *work_queue, + void *data, + void (*function)(void *data), + xio_work_handle_t *work); + +/*---------------------------------------------------------------------------*/ +/* xio_workqueue_del_work */ +/*---------------------------------------------------------------------------*/ +int xio_workqueue_del_work(struct xio_workqueue *work_queue, + xio_work_handle_t *work); + +/*---------------------------------------------------------------------------*/ +/* xio_workqueue_set_work_destructor */ +/*---------------------------------------------------------------------------*/ +int xio_workqueue_set_work_destructor(struct xio_workqueue *work_queue, + void *data, + void (*destructor)(void *data), + xio_work_handle_t *work); + +/*---------------------------------------------------------------------------*/ +/* xio_workqueue_is_work_in_handler */ +/*---------------------------------------------------------------------------*/ +int 
xio_workqueue_is_work_in_handler(struct xio_workqueue *work_queue, + xio_work_handle_t *work); + +#endif /* XIO_WORKQUEUE_H */ + diff --git a/open_src/xio/src/kernel/transport/compat/Makefile.in b/open_src/xio/src/kernel/transport/compat/Makefile.in new file mode 100644 index 0000000..a555ab1 --- /dev/null +++ b/open_src/xio/src/kernel/transport/compat/Makefile.in @@ -0,0 +1,53 @@ +# Makefile.in for kernel module + +SHELL = /bin/sh +INSTALL = @INSTALL@ +mkdir_p = mkdir -p +VERSION = @PACKAGE_VERSION@ + +compatmoduledir = @kmoduledir@/extra/compat + +compatmodule := compat.ko + +all: all-@ENABLE_COMPAT_MODULE@ +install: install-@ENABLE_COMPAT_MODULE@ +uninstall: uninstall-@ENABLE_COMPAT_MODULE@ + +all-n: +install-n: +uninstall-n: + +all-y: all-spec + +SUBDIRS ?=`pwd` + +install-y: all + $(mkdir_p) $(DESTDIR)$(compatmoduledir) + $(INSTALL) -m 644 $(compatmodule) $(DESTDIR)$(compatmoduledir)/$(compatmodule) + -/sbin/depmod -a + +uninstall-y: + rm -f $(DESTDIR)$(compatmoduledir)/$(compatmodule) + -/sbin/depmod -a + +clean: + -rm -f $(compatmodule) *.o .*.cmd *.mod.c *.ko *.s */*.o *.order *.symvers *.unsigned + -cd ./compat && $(MAKE) clean && cd ../ + +distclean: clean + rm -f Makefile configure config.status + rm -f config.h config.log config.status config.cache + rm -rf .tmp_versions autom4te.cache + +maintainer-clean: distclean + +distdir: $(DISTFILES) + cp -p $(DISTFILES) $(distdir) + +compat/.git: + cd ../../../ && git submodule update --init + +all-spec: compat/.git + @export KLIB_BUILD=@kernelsrc@ @KERNELMAKE_PARAMS@ + cd ./compat && $(MAKE) + @cp -f ./compat/compat/$(compatmodule) ./ && cp -f ./compat/Module.symvers ./ diff --git a/open_src/xio/src/kernel/transport/compat/autogen.sh b/open_src/xio/src/kernel/transport/compat/autogen.sh new file mode 100644 index 0000000..28dd57d --- /dev/null +++ b/open_src/xio/src/kernel/transport/compat/autogen.sh @@ -0,0 +1,3 @@ +#! 
/bin/sh + +autoconf diff --git a/open_src/xio/src/kernel/transport/compat/configure.ac b/open_src/xio/src/kernel/transport/compat/configure.ac new file mode 100644 index 0000000..96c4bab --- /dev/null +++ b/open_src/xio/src/kernel/transport/compat/configure.ac @@ -0,0 +1,138 @@ +AC_INIT([xio-kernel],[2.0],[libxio@accellio.org]) + +AC_PROG_INSTALL + +runver=`uname -r` +bad_kernel_version=no +ENABLE_COMPAT_MODULE=y +# do not build against ofed until kernel module can be built out of kernel +# tree +KERNELCFLAGS= +KERNELMAKE_PARAMS= + +kernelsrc= +kernelbuild= +AC_ARG_WITH(kernel, + [ --with-kernel=PATH Specify location of kernel source ], + [kernelsrc="$withval"; kernelbuild="$withval"]) +AC_ARG_WITH(kernel-build, + [ --with-kernel-build=PATH Specify location of kernel build ], + [kernelbuild="$withval"]) +AC_ARG_ENABLE(kernel-module, + [ --enable-kernel-module Compile kernel module ]) + + +#build against installed OFED +AC_MSG_CHECKING([if ofed installed]) +MLNX_OFED=`if ofed_info 2>/dev/null | grep MLNX_OFED >/dev/null 2>/dev/null; then echo true; else echo false; fi` + +if test "$MLNX_OFED" = "true"; then + AC_MSG_RESULT(yes) +else + AC_MSG_RESULT(no) + +fi +if test -z "$enable_kernel_module" -a "$MLNX_OFED" = "true"; then + ENABLE_COMPAT_MODULE=n +fi + + +if test "$ENABLE_COMPAT_MODULE" = y; then + AC_MSG_CHECKING([kernel source directory]) + if test -z "$kernelsrc"; then + kernelbuild= + sourcelink=/lib/modules/${runver}/source + buildlink=/lib/modules/${runver}/build + + if test -e $sourcelink; then + kernelsrc=`(cd $sourcelink; /bin/pwd)` + fi + if test -e $buildlink; then + kernelbuild=`(cd $buildlink; /bin/pwd)` + fi + if test -z "$kernelsrc"; then + kernelsrc=$kernelbuild + fi + if test -z "$kernelsrc" -o -z "$kernelbuild"; then + AC_MSG_RESULT([Not found]) + AC_MSG_ERROR([ + *** Please specify the location of the kernel source with + *** the '--with-kernel=SRCDIR' option]) + fi + fi + AC_MSG_RESULT([$kernelsrc]) + AC_MSG_CHECKING([kernel build directory]) + AC_MSG_RESULT([$kernelbuild]) + + AC_MSG_CHECKING([kernel source version]) + if test -r $kernelbuild/include/linux/version.h && fgrep -q UTS_RELEASE $kernelbuild/include/linux/version.h; then + kernsrcver=`(echo "#include "; echo "kernsrcver=UTS_RELEASE") | cpp -I $kernelbuild/include | grep "^kernsrcver=" | cut -d \" -f 2` + elif test -r $kernelbuild/include/linux/utsrelease.h && fgrep -q UTS_RELEASE $kernelbuild/include/linux/utsrelease.h; then + kernsrcver=`(echo "#include "; echo "kernsrcver=UTS_RELEASE") | cpp -I $kernelbuild/include | grep "^kernsrcver=" | cut -d \" -f 2` + elif test -r $kernelbuild/include/generated/utsrelease.h && fgrep -q UTS_RELEASE $kernelbuild/include/generated/utsrelease.h; then + kernsrcver=`(echo "#include "; echo "kernsrcver=UTS_RELEASE") | cpp -I $kernelbuild/include | grep "^kernsrcver=" | cut -d \" -f 2` + fi + if test -z "$kernsrcver"; then + AC_MSG_RESULT([Not found]) + AC_MSG_ERROR([ + *** Cannot determine the version of the linux kernel source. Please + *** prepare the kernel before running this script]) + fi + AC_MSG_RESULT([$kernsrcver]) + kmoduledir=${INSTALL_MOD_PATH}/lib/modules/$kernsrcver + AC_SUBST(kernelsrc) + AC_SUBST(kmoduledir) + + if echo "$kernsrcver" | egrep -q ["^(2.4|2.6.[0-8]([^0-9]|\$))"]; then + bad_kernel_version=yes + AC_MSG_NOTICE([ +NOTE: Disabled building the kernel module, because this release only +NOTE: supports Linux versions 2.6.9 or later. 
You can use the kernel +NOTE: module from an earlier COMPAT release with the library from this +NOTE: release.]) + else + compat_configured=no + kernel_autoconf=$kernelbuild/include/linux/autoconf.h + AC_MSG_CHECKING([if COMPAT is configured in the kernel]) + if test -f $kernel_autoconf; then + if grep -q "^#define CONFIG_COMPAT 1" $kernel_autoconf || grep -q "^#define CONFIG_COMPAT_MODULE 1" $kernel_autoconf; then + compat_configured=yes + fi + fi + AC_MSG_RESULT([$compat_configured]) + if test -z "$enable_kernel_module" -a "$xio_configured" = yes; then + ENABLE_COMPAT_MODULE=n + fi + fi +fi + +if test "$ENABLE_COMPAT_MODULE" = n; then + AC_MSG_NOTICE([ +NOTE: Detected that COMPAT is already present in the kernel, so +NOTE: building of kernel module is disabled. To force building +NOTE: of kernel module use the '--enable-kernel-module' option.]) +fi + +if test "$enable_kernel_module" = no; then + ENABLE_COMPAT_MODULE=n +fi +if test "$bad_kernel_version" = yes; then + ENABLE_COMPAT_MODULE=n +fi + +AC_MSG_CHECKING([is ENABLE_COMPAT_MODULE defined]) +if test "$ENABLE_COMPAT_MODULE" = y; then + AC_MSG_RESULT([yes]) +else + AC_MSG_RESULT([no]) +fi + +AC_SUBST(ENABLE_COMPAT_MODULE) +AC_SUBST(KERNELMAKE_PARAMS) +AC_SUBST(KERNELCPPFLAGS) +AC_SUBST(KERNELCFLAGS) + + + +AC_CONFIG_FILES([Makefile]) +AC_OUTPUT diff --git a/open_src/xio/src/kernel/transport/compat/install-sh b/open_src/xio/src/kernel/transport/compat/install-sh new file mode 100644 index 0000000..a9244eb --- /dev/null +++ b/open_src/xio/src/kernel/transport/compat/install-sh @@ -0,0 +1,527 @@ +#!/bin/sh +# install - install a program, script, or datafile + +scriptversion=2011-01-19.21; # UTC + +# This originates from X11R5 (mit/util/scripts/install.sh), which was +# later released in X11R6 (xc/config/util/install.sh) with the +# following copyright and license. +# +# Copyright (C) 1994 X Consortium +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to +# deal in the Software without restriction, including without limitation the +# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +# sell copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# X CONSORTIUM BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN +# AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNEC- +# TION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +# +# Except as contained in this notice, the name of the X Consortium shall not +# be used in advertising or otherwise to promote the sale, use or other deal- +# ings in this Software without prior written authorization from the X Consor- +# tium. +# +# +# FSF changes to this file are in the public domain. +# +# Calling this script install-sh is preferred over install.sh, to prevent +# `make' implicit rules from creating a file called install from it +# when there is no Makefile. +# +# This script is compatible with the BSD install script, but was written +# from scratch. 
+ +nl=' +' +IFS=" "" $nl" + +# set DOITPROG to echo to test this script + +# Don't use :- since 4.3BSD and earlier shells don't like it. +doit=${DOITPROG-} +if test -z "$doit"; then + doit_exec=exec +else + doit_exec=$doit +fi + +# Put in absolute file names if you don't have them in your path; +# or use environment vars. + +chgrpprog=${CHGRPPROG-chgrp} +chmodprog=${CHMODPROG-chmod} +chownprog=${CHOWNPROG-chown} +cmpprog=${CMPPROG-cmp} +cpprog=${CPPROG-cp} +mkdirprog=${MKDIRPROG-mkdir} +mvprog=${MVPROG-mv} +rmprog=${RMPROG-rm} +stripprog=${STRIPPROG-strip} + +posix_glob='?' +initialize_posix_glob=' + test "$posix_glob" != "?" || { + if (set -f) 2>/dev/null; then + posix_glob= + else + posix_glob=: + fi + } +' + +posix_mkdir= + +# Desired mode of installed file. +mode=0755 + +chgrpcmd= +chmodcmd=$chmodprog +chowncmd= +mvcmd=$mvprog +rmcmd="$rmprog -f" +stripcmd= + +src= +dst= +dir_arg= +dst_arg= + +copy_on_change=false +no_target_directory= + +usage="\ +Usage: $0 [OPTION]... [-T] SRCFILE DSTFILE + or: $0 [OPTION]... SRCFILES... DIRECTORY + or: $0 [OPTION]... -t DIRECTORY SRCFILES... + or: $0 [OPTION]... -d DIRECTORIES... + +In the 1st form, copy SRCFILE to DSTFILE. +In the 2nd and 3rd, copy all SRCFILES to DIRECTORY. +In the 4th, create DIRECTORIES. + +Options: + --help display this help and exit. + --version display version info and exit. + + -c (ignored) + -C install only if different (preserve the last data modification time) + -d create directories instead of installing files. + -g GROUP $chgrpprog installed files to GROUP. + -m MODE $chmodprog installed files to MODE. + -o USER $chownprog installed files to USER. + -s $stripprog installed files. + -t DIRECTORY install into DIRECTORY. + -T report an error if DSTFILE is a directory. + +Environment variables override the default commands: + CHGRPPROG CHMODPROG CHOWNPROG CMPPROG CPPROG MKDIRPROG MVPROG + RMPROG STRIPPROG +" + +while test $# -ne 0; do + case $1 in + -c) ;; + + -C) copy_on_change=true;; + + -d) dir_arg=true;; + + -g) chgrpcmd="$chgrpprog $2" + shift;; + + --help) echo "$usage"; exit $?;; + + -m) mode=$2 + case $mode in + *' '* | *' '* | *' +'* | *'*'* | *'?'* | *'['*) + echo "$0: invalid mode: $mode" >&2 + exit 1;; + esac + shift;; + + -o) chowncmd="$chownprog $2" + shift;; + + -s) stripcmd=$stripprog;; + + -t) dst_arg=$2 + # Protect names problematic for `test' and other utilities. + case $dst_arg in + -* | [=\(\)!]) dst_arg=./$dst_arg;; + esac + shift;; + + -T) no_target_directory=true;; + + --version) echo "$0 $scriptversion"; exit $?;; + + --) shift + break;; + + -*) echo "$0: invalid option: $1" >&2 + exit 1;; + + *) break;; + esac + shift +done + +if test $# -ne 0 && test -z "$dir_arg$dst_arg"; then + # When -d is used, all remaining arguments are directories to create. + # When -t is used, the destination is already specified. + # Otherwise, the last argument is the destination. Remove it from $@. + for arg + do + if test -n "$dst_arg"; then + # $@ is not empty: it contains at least $arg. + set fnord "$@" "$dst_arg" + shift # fnord + fi + shift # arg + dst_arg=$arg + # Protect names problematic for `test' and other utilities. + case $dst_arg in + -* | [=\(\)!]) dst_arg=./$dst_arg;; + esac + done +fi + +if test $# -eq 0; then + if test -z "$dir_arg"; then + echo "$0: no input file specified." >&2 + exit 1 + fi + # It's OK to call `install-sh -d' without argument. + # This can happen when creating conditional directories. 
+ exit 0 +fi + +if test -z "$dir_arg"; then + do_exit='(exit $ret); exit $ret' + trap "ret=129; $do_exit" 1 + trap "ret=130; $do_exit" 2 + trap "ret=141; $do_exit" 13 + trap "ret=143; $do_exit" 15 + + # Set umask so as not to create temps with too-generous modes. + # However, 'strip' requires both read and write access to temps. + case $mode in + # Optimize common cases. + *644) cp_umask=133;; + *755) cp_umask=22;; + + *[0-7]) + if test -z "$stripcmd"; then + u_plus_rw= + else + u_plus_rw='% 200' + fi + cp_umask=`expr '(' 777 - $mode % 1000 ')' $u_plus_rw`;; + *) + if test -z "$stripcmd"; then + u_plus_rw= + else + u_plus_rw=,u+rw + fi + cp_umask=$mode$u_plus_rw;; + esac +fi + +for src +do + # Protect names problematic for `test' and other utilities. + case $src in + -* | [=\(\)!]) src=./$src;; + esac + + if test -n "$dir_arg"; then + dst=$src + dstdir=$dst + test -d "$dstdir" + dstdir_status=$? + else + + # Waiting for this to be detected by the "$cpprog $src $dsttmp" command + # might cause directories to be created, which would be especially bad + # if $src (and thus $dsttmp) contains '*'. + if test ! -f "$src" && test ! -d "$src"; then + echo "$0: $src does not exist." >&2 + exit 1 + fi + + if test -z "$dst_arg"; then + echo "$0: no destination specified." >&2 + exit 1 + fi + dst=$dst_arg + + # If destination is a directory, append the input filename; won't work + # if double slashes aren't ignored. + if test -d "$dst"; then + if test -n "$no_target_directory"; then + echo "$0: $dst_arg: Is a directory" >&2 + exit 1 + fi + dstdir=$dst + dst=$dstdir/`basename "$src"` + dstdir_status=0 + else + # Prefer dirname, but fall back on a substitute if dirname fails. + dstdir=` + (dirname "$dst") 2>/dev/null || + expr X"$dst" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \ + X"$dst" : 'X\(//\)[^/]' \| \ + X"$dst" : 'X\(//\)$' \| \ + X"$dst" : 'X\(/\)' \| . 2>/dev/null || + echo X"$dst" | + sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{ + s//\1/ + q + } + /^X\(\/\/\)[^/].*/{ + s//\1/ + q + } + /^X\(\/\/\)$/{ + s//\1/ + q + } + /^X\(\/\).*/{ + s//\1/ + q + } + s/.*/./; q' + ` + + test -d "$dstdir" + dstdir_status=$? + fi + fi + + obsolete_mkdir_used=false + + if test $dstdir_status != 0; then + case $posix_mkdir in + '') + # Create intermediate dirs using mode 755 as modified by the umask. + # This is like FreeBSD 'install' as of 1997-10-28. + umask=`umask` + case $stripcmd.$umask in + # Optimize common cases. + *[2367][2367]) mkdir_umask=$umask;; + .*0[02][02] | .[02][02] | .[02]) mkdir_umask=22;; + + *[0-7]) + mkdir_umask=`expr $umask + 22 \ + - $umask % 100 % 40 + $umask % 20 \ + - $umask % 10 % 4 + $umask % 2 + `;; + *) mkdir_umask=$umask,go-w;; + esac + + # With -d, create the new directory with the user-specified mode. + # Otherwise, rely on $mkdir_umask. + if test -n "$dir_arg"; then + mkdir_mode=-m$mode + else + mkdir_mode= + fi + + posix_mkdir=false + case $umask in + *[123567][0-7][0-7]) + # POSIX mkdir -p sets u+wx bits regardless of umask, which + # is incompatible with FreeBSD 'install' when (umask & 300) != 0. + ;; + *) + tmpdir=${TMPDIR-/tmp}/ins$RANDOM-$$ + trap 'ret=$?; rmdir "$tmpdir/d" "$tmpdir" 2>/dev/null; exit $ret' 0 + + if (umask $mkdir_umask && + exec $mkdirprog $mkdir_mode -p -- "$tmpdir/d") >/dev/null 2>&1 + then + if test -z "$dir_arg" || { + # Check for POSIX incompatibilities with -m. + # HP-UX 11.23 and IRIX 6.5 mkdir -m -p sets group- or + # other-writeable bit of parent directory when it shouldn't. + # FreeBSD 6.1 mkdir -m -p sets mode of existing directory. 
+ ls_ld_tmpdir=`ls -ld "$tmpdir"` + case $ls_ld_tmpdir in + d????-?r-*) different_mode=700;; + d????-?--*) different_mode=755;; + *) false;; + esac && + $mkdirprog -m$different_mode -p -- "$tmpdir" && { + ls_ld_tmpdir_1=`ls -ld "$tmpdir"` + test "$ls_ld_tmpdir" = "$ls_ld_tmpdir_1" + } + } + then posix_mkdir=: + fi + rmdir "$tmpdir/d" "$tmpdir" + else + # Remove any dirs left behind by ancient mkdir implementations. + rmdir ./$mkdir_mode ./-p ./-- 2>/dev/null + fi + trap '' 0;; + esac;; + esac + + if + $posix_mkdir && ( + umask $mkdir_umask && + $doit_exec $mkdirprog $mkdir_mode -p -- "$dstdir" + ) + then : + else + + # The umask is ridiculous, or mkdir does not conform to POSIX, + # or it failed possibly due to a race condition. Create the + # directory the slow way, step by step, checking for races as we go. + + case $dstdir in + /*) prefix='/';; + [-=\(\)!]*) prefix='./';; + *) prefix='';; + esac + + eval "$initialize_posix_glob" + + oIFS=$IFS + IFS=/ + $posix_glob set -f + set fnord $dstdir + shift + $posix_glob set +f + IFS=$oIFS + + prefixes= + + for d + do + test X"$d" = X && continue + + prefix=$prefix$d + if test -d "$prefix"; then + prefixes= + else + if $posix_mkdir; then + (umask=$mkdir_umask && + $doit_exec $mkdirprog $mkdir_mode -p -- "$dstdir") && break + # Don't fail if two instances are running concurrently. + test -d "$prefix" || exit 1 + else + case $prefix in + *\'*) qprefix=`echo "$prefix" | sed "s/'/'\\\\\\\\''/g"`;; + *) qprefix=$prefix;; + esac + prefixes="$prefixes '$qprefix'" + fi + fi + prefix=$prefix/ + done + + if test -n "$prefixes"; then + # Don't fail if two instances are running concurrently. + (umask $mkdir_umask && + eval "\$doit_exec \$mkdirprog $prefixes") || + test -d "$dstdir" || exit 1 + obsolete_mkdir_used=true + fi + fi + fi + + if test -n "$dir_arg"; then + { test -z "$chowncmd" || $doit $chowncmd "$dst"; } && + { test -z "$chgrpcmd" || $doit $chgrpcmd "$dst"; } && + { test "$obsolete_mkdir_used$chowncmd$chgrpcmd" = false || + test -z "$chmodcmd" || $doit $chmodcmd $mode "$dst"; } || exit 1 + else + + # Make a couple of temp file names in the proper directory. + dsttmp=$dstdir/_inst.$$_ + rmtmp=$dstdir/_rm.$$_ + + # Trap to clean up those temp files at exit. + trap 'ret=$?; rm -f "$dsttmp" "$rmtmp" && exit $ret' 0 + + # Copy the file name to the temp name. + (umask $cp_umask && $doit_exec $cpprog "$src" "$dsttmp") && + + # and set any options; do chmod last to preserve setuid bits. + # + # If any of these fail, we abort the whole thing. If we want to + # ignore errors from any of these, just make sure not to ignore + # errors from the above "$doit $cpprog $src $dsttmp" command. + # + { test -z "$chowncmd" || $doit $chowncmd "$dsttmp"; } && + { test -z "$chgrpcmd" || $doit $chgrpcmd "$dsttmp"; } && + { test -z "$stripcmd" || $doit $stripcmd "$dsttmp"; } && + { test -z "$chmodcmd" || $doit $chmodcmd $mode "$dsttmp"; } && + + # If -C, don't bother to copy if it wouldn't change the file. + if $copy_on_change && + old=`LC_ALL=C ls -dlL "$dst" 2>/dev/null` && + new=`LC_ALL=C ls -dlL "$dsttmp" 2>/dev/null` && + + eval "$initialize_posix_glob" && + $posix_glob set -f && + set X $old && old=:$2:$4:$5:$6 && + set X $new && new=:$2:$4:$5:$6 && + $posix_glob set +f && + + test "$old" = "$new" && + $cmpprog "$dst" "$dsttmp" >/dev/null 2>&1 + then + rm -f "$dsttmp" + else + # Rename the file to the real destination. 
+ $doit $mvcmd -f "$dsttmp" "$dst" 2>/dev/null || + + # The rename failed, perhaps because mv can't rename something else + # to itself, or perhaps because mv is so ancient that it does not + # support -f. + { + # Now remove or move aside any old file at destination location. + # We try this two ways since rm can't unlink itself on some + # systems and the destination file might be busy for other + # reasons. In this case, the final cleanup might fail but the new + # file should still install successfully. + { + test ! -f "$dst" || + $doit $rmcmd -f "$dst" 2>/dev/null || + { $doit $mvcmd -f "$dst" "$rmtmp" 2>/dev/null && + { $doit $rmcmd -f "$rmtmp" 2>/dev/null; :; } + } || + { echo "$0: cannot unlink or rename $dst" >&2 + (exit 1); exit 1 + } + } && + + # Now rename the file to the real destination. + $doit $mvcmd "$dsttmp" "$dst" + } + fi || exit 1 + + trap '' 0 + fi +done + +# Local variables: +# eval: (add-hook 'write-file-hooks 'time-stamp) +# time-stamp-start: "scriptversion=" +# time-stamp-format: "%:y-%02m-%02d.%02H" +# time-stamp-time-zone: "UTC" +# time-stamp-end: "; # UTC" +# End: diff --git a/open_src/xio/src/kernel/transport/compat/missing b/open_src/xio/src/kernel/transport/compat/missing new file mode 100644 index 0000000..86a8fc3 --- /dev/null +++ b/open_src/xio/src/kernel/transport/compat/missing @@ -0,0 +1,331 @@ +#! /bin/sh +# Common stub for a few missing GNU programs while installing. + +scriptversion=2012-01-06.13; # UTC + +# Copyright (C) 1996, 1997, 1999, 2000, 2002, 2003, 2004, 2005, 2006, +# 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc. +# Originally by Fran,cois Pinard , 1996. + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +# As a special exception to the GNU General Public License, if you +# distribute this file as part of a program that contains a +# configuration script generated by Autoconf, you may include it under +# the same distribution terms that you use for the rest of that program. + +if test $# -eq 0; then + echo 1>&2 "Try \`$0 --help' for more information" + exit 1 +fi + +run=: +sed_output='s/.* --output[ =]\([^ ]*\).*/\1/p' +sed_minuso='s/.* -o \([^ ]*\).*/\1/p' + +# In the cases where this matters, `missing' is being run in the +# srcdir already. +if test -f configure.ac; then + configure_ac=configure.ac +else + configure_ac=configure.in +fi + +msg="missing on your system" + +case $1 in +--run) + # Try to run requested program, and just exit if it succeeds. + run= + shift + "$@" && exit 0 + # Exit code 63 means version mismatch. This often happens + # when the user try to use an ancient version of a tool on + # a file that requires a minimum version. In this case we + # we should proceed has if the program had been absent, or + # if --run hadn't been passed. + if test $? = 63; then + run=: + msg="probably too old" + fi + ;; + + -h|--h|--he|--hel|--help) + echo "\ +$0 [OPTION]... PROGRAM [ARGUMENT]... + +Handle \`PROGRAM [ARGUMENT]...' 
for when PROGRAM is missing, or return an +error status if there is no known handling for PROGRAM. + +Options: + -h, --help display this help and exit + -v, --version output version information and exit + --run try to run the given command, and emulate it if it fails + +Supported PROGRAM values: + aclocal touch file \`aclocal.m4' + autoconf touch file \`configure' + autoheader touch file \`config.h.in' + autom4te touch the output file, or create a stub one + automake touch all \`Makefile.in' files + bison create \`y.tab.[ch]', if possible, from existing .[ch] + flex create \`lex.yy.c', if possible, from existing .c + help2man touch the output file + lex create \`lex.yy.c', if possible, from existing .c + makeinfo touch the output file + yacc create \`y.tab.[ch]', if possible, from existing .[ch] + +Version suffixes to PROGRAM as well as the prefixes \`gnu-', \`gnu', and +\`g' are ignored when checking the name. + +Send bug reports to ." + exit $? + ;; + + -v|--v|--ve|--ver|--vers|--versi|--versio|--version) + echo "missing $scriptversion (GNU Automake)" + exit $? + ;; + + -*) + echo 1>&2 "$0: Unknown \`$1' option" + echo 1>&2 "Try \`$0 --help' for more information" + exit 1 + ;; + +esac + +# normalize program name to check for. +program=`echo "$1" | sed ' + s/^gnu-//; t + s/^gnu//; t + s/^g//; t'` + +# Now exit if we have it, but it failed. Also exit now if we +# don't have it and --version was passed (most likely to detect +# the program). This is about non-GNU programs, so use $1 not +# $program. +case $1 in + lex*|yacc*) + # Not GNU programs, they don't have --version. + ;; + + *) + if test -z "$run" && ($1 --version) > /dev/null 2>&1; then + # We have it, but it failed. + exit 1 + elif test "x$2" = "x--version" || test "x$2" = "x--help"; then + # Could not run --version or --help. This is probably someone + # running `$TOOL --version' or `$TOOL --help' to check whether + # $TOOL exists and not knowing $TOOL uses missing. + exit 1 + fi + ;; +esac + +# If it does not exist, or fails to run (possibly an outdated version), +# try to emulate it. +case $program in + aclocal*) + echo 1>&2 "\ +WARNING: \`$1' is $msg. You should only need it if + you modified \`acinclude.m4' or \`${configure_ac}'. You might want + to install the \`Automake' and \`Perl' packages. Grab them from + any GNU archive site." + touch aclocal.m4 + ;; + + autoconf*) + echo 1>&2 "\ +WARNING: \`$1' is $msg. You should only need it if + you modified \`${configure_ac}'. You might want to install the + \`Autoconf' and \`GNU m4' packages. Grab them from any GNU + archive site." + touch configure + ;; + + autoheader*) + echo 1>&2 "\ +WARNING: \`$1' is $msg. You should only need it if + you modified \`acconfig.h' or \`${configure_ac}'. You might want + to install the \`Autoconf' and \`GNU m4' packages. Grab them + from any GNU archive site." + files=`sed -n 's/^[ ]*A[CM]_CONFIG_HEADER(\([^)]*\)).*/\1/p' ${configure_ac}` + test -z "$files" && files="config.h" + touch_files= + for f in $files; do + case $f in + *:*) touch_files="$touch_files "`echo "$f" | + sed -e 's/^[^:]*://' -e 's/:.*//'`;; + *) touch_files="$touch_files $f.in";; + esac + done + touch $touch_files + ;; + + automake*) + echo 1>&2 "\ +WARNING: \`$1' is $msg. You should only need it if + you modified \`Makefile.am', \`acinclude.m4' or \`${configure_ac}'. + You might want to install the \`Automake' and \`Perl' packages. + Grab them from any GNU archive site." + find . 
-type f -name Makefile.am -print | + sed 's/\.am$/.in/' | + while read f; do touch "$f"; done + ;; + + autom4te*) + echo 1>&2 "\ +WARNING: \`$1' is needed, but is $msg. + You might have modified some files without having the + proper tools for further handling them. + You can get \`$1' as part of \`Autoconf' from any GNU + archive site." + + file=`echo "$*" | sed -n "$sed_output"` + test -z "$file" && file=`echo "$*" | sed -n "$sed_minuso"` + if test -f "$file"; then + touch $file + else + test -z "$file" || exec >$file + echo "#! /bin/sh" + echo "# Created by GNU Automake missing as a replacement of" + echo "# $ $@" + echo "exit 0" + chmod +x $file + exit 1 + fi + ;; + + bison*|yacc*) + echo 1>&2 "\ +WARNING: \`$1' $msg. You should only need it if + you modified a \`.y' file. You may need the \`Bison' package + in order for those modifications to take effect. You can get + \`Bison' from any GNU archive site." + rm -f y.tab.c y.tab.h + if test $# -ne 1; then + eval LASTARG=\${$#} + case $LASTARG in + *.y) + SRCFILE=`echo "$LASTARG" | sed 's/y$/c/'` + if test -f "$SRCFILE"; then + cp "$SRCFILE" y.tab.c + fi + SRCFILE=`echo "$LASTARG" | sed 's/y$/h/'` + if test -f "$SRCFILE"; then + cp "$SRCFILE" y.tab.h + fi + ;; + esac + fi + if test ! -f y.tab.h; then + echo >y.tab.h + fi + if test ! -f y.tab.c; then + echo 'main() { return 0; }' >y.tab.c + fi + ;; + + lex*|flex*) + echo 1>&2 "\ +WARNING: \`$1' is $msg. You should only need it if + you modified a \`.l' file. You may need the \`Flex' package + in order for those modifications to take effect. You can get + \`Flex' from any GNU archive site." + rm -f lex.yy.c + if test $# -ne 1; then + eval LASTARG=\${$#} + case $LASTARG in + *.l) + SRCFILE=`echo "$LASTARG" | sed 's/l$/c/'` + if test -f "$SRCFILE"; then + cp "$SRCFILE" lex.yy.c + fi + ;; + esac + fi + if test ! -f lex.yy.c; then + echo 'main() { return 0; }' >lex.yy.c + fi + ;; + + help2man*) + echo 1>&2 "\ +WARNING: \`$1' is $msg. You should only need it if + you modified a dependency of a manual page. You may need the + \`Help2man' package in order for those modifications to take + effect. You can get \`Help2man' from any GNU archive site." + + file=`echo "$*" | sed -n "$sed_output"` + test -z "$file" && file=`echo "$*" | sed -n "$sed_minuso"` + if test -f "$file"; then + touch $file + else + test -z "$file" || exec >$file + echo ".ab help2man is required to generate this page" + exit $? + fi + ;; + + makeinfo*) + echo 1>&2 "\ +WARNING: \`$1' is $msg. You should only need it if + you modified a \`.texi' or \`.texinfo' file, or any other file + indirectly affecting the aspect of the manual. The spurious + call might also be the consequence of using a buggy \`make' (AIX, + DU, IRIX). You might want to install the \`Texinfo' package or + the \`GNU make' package. Grab either from any GNU archive site." + # The file to touch is that specified with -o ... + file=`echo "$*" | sed -n "$sed_output"` + test -z "$file" && file=`echo "$*" | sed -n "$sed_minuso"` + if test -z "$file"; then + # ... or it is the one specified with @setfilename ... + infile=`echo "$*" | sed 's/.* \([^ ]*\) *$/\1/'` + file=`sed -n ' + /^@setfilename/{ + s/.* \([^ ]*\) *$/\1/ + p + q + }' $infile` + # ... or it is derived from the source name (dir/f.texi becomes f.info) + test -z "$file" && file=`echo "$infile" | sed 's,.*/,,;s,.[^.]*$,,'`.info + fi + # If the file does not exist, the user really needs makeinfo; + # let's fail without touching anything. 
+ test -f $file || exit 1 + touch $file + ;; + + *) + echo 1>&2 "\ +WARNING: \`$1' is needed, and is $msg. + You might have modified some files without having the + proper tools for further handling them. Check the \`README' file, + it often tells you about the needed prerequisites for installing + this package. You may also peek at any GNU archive site, in case + some other package would contain this missing \`$1' program." + exit 1 + ;; +esac + +exit 0 + +# Local variables: +# eval: (add-hook 'write-file-hooks 'time-stamp) +# time-stamp-start: "scriptversion=" +# time-stamp-format: "%:y-%02m-%02d.%02H" +# time-stamp-time-zone: "UTC" +# time-stamp-end: "; # UTC" +# End: diff --git a/open_src/xio/src/kernel/transport/rdma/Makefile.in b/open_src/xio/src/kernel/transport/rdma/Makefile.in new file mode 100644 index 0000000..2abac65 --- /dev/null +++ b/open_src/xio/src/kernel/transport/rdma/Makefile.in @@ -0,0 +1,66 @@ +# Makefile.in for kernel module + +SHELL = /bin/sh +INSTALL = @INSTALL@ +mkdir_p = mkdir -p +VERSION = @PACKAGE_VERSION@ +OFED_CFLAGS = @OFED_CFLAGS@ +KSYMVERS = @RDMA_SYMVERS@ + +NOSTDINC_FLAGS += @OFED_CFLAGS@ + +DISTFILES = Makefile.in configure.ac configure ../install-sh \ + xio_log.h xio_mem.h xio_os.h xio_rdma_utils.h \ + xio_rdma_transport.h \ + xio_rdma_datapath.c xio_rdma_management.c xio_rdma_memory.c \ + xio_rdma_verbs.c xio_rdma_utils.c +xiomoduledir = @kmoduledir@/extra/net/xio + +xiomodule := xio_rdma.ko + +all: all-@ENABLE_XIO_MODULE@ +install: install-@ENABLE_XIO_MODULE@ +uninstall: uninstall-@ENABLE_XIO_MODULE@ + +all-n: +install-n: +uninstall-n: + +all-y: all-spec + +install-y: all + $(mkdir_p) $(DESTDIR)$(xiomoduledir) + $(INSTALL) -m 644 $(xiomodule) $(DESTDIR)$(xiomoduledir)/$(xiomodule) + -/sbin/depmod -a + +uninstall-y: + rm -f $(DESTDIR)$(xiomoduledir)/$(xiomodule) + -/sbin/depmod -a + +clean: + -rm -f $(xiomodule) *.o .*.cmd *.mod.c *.ko *.s */*.o *.order *.symvers *.unsigned + +distclean: clean + rm -f Makefile configure config.status + rm -f config.h config.log config.status config.cache + rm -rf .tmp_versions autom4te.cache + +maintainer-clean: distclean + +distdir: $(DISTFILES) + cp -p $(DISTFILES) $(distdir) + + +ccflags-y += $(OFED_CFLAGS) -I$(SUBDIRS) -I$(SUBDIRS)/.. -I$(SUBDIRS)/../../ -I$(SUBDIRS)/../../xio -I$(SUBDIRS)/../../../common -I$(SUBDIRS)/../../../../include -I$(SUBDIRS)/../../../libxio_os/linuxkernel + +obj-m := xio_rdma.o +xio_rdma-objs := \ + xio_rdma_datapath.o \ + xio_rdma_management.o \ + xio_rdma_memory.o \ + xio_rdma_verbs.o \ + xio_rdma_utils.o + +all-spec: + export NOSTDINC_FLAGS + $(MAKE) -C @kernelsrc@ SUBDIRS=`pwd` KBUILD_EXTRA_SYMBOLS="$(KSYMVERS)" @KERNELMAKE_PARAMS@ modules diff --git a/open_src/xio/src/kernel/transport/rdma/autogen.sh b/open_src/xio/src/kernel/transport/rdma/autogen.sh new file mode 100644 index 0000000..28dd57d --- /dev/null +++ b/open_src/xio/src/kernel/transport/rdma/autogen.sh @@ -0,0 +1,3 @@ +#! 
/bin/sh + +autoconf diff --git a/open_src/xio/src/kernel/transport/rdma/configure.ac b/open_src/xio/src/kernel/transport/rdma/configure.ac new file mode 100644 index 0000000..512e8e6 --- /dev/null +++ b/open_src/xio/src/kernel/transport/rdma/configure.ac @@ -0,0 +1,216 @@ +AC_INIT([xio-kernel],[2.0],[libxio@accellio.org]) + +AC_PROG_INSTALL + +runver=`uname -r` +bad_kernel_version=no +ENABLE_XIO_MODULE=y +# do not build against ofed until kernel module can be built out of kernel +# tree +ENABLE_OFED_BUILD=y +KERNELCFLAGS= + +kernelsrc= +kernelbuild= +AC_ARG_WITH(kernel, + [ --with-kernel=PATH Specify location of kernel source ], + [kernelsrc="$withval"; kernelbuild="$withval"]) +AC_ARG_WITH(kernel-build, + [ --with-kernel-build=PATH Specify location of kernel build ], + [kernelbuild="$withval"]) +AC_ARG_ENABLE(kernel-module, + [ --enable-kernel-module Compile kernel module ]) + +#build against installed OFED +RDMA_SYMVERS=`pwd`/../../xio/Module.symvers + +if test "$ENABLE_OFED_BUILD" = "y"; then +AC_MSG_CHECKING([if ofed installed]) +MLNX_OFED=`if ofed_info 2>/dev/null | grep MLNX_OFED >/dev/null 2>/dev/null; then echo true; else echo false; fi` +OFED_CFLAGS= + +if test "$MLNX_OFED" = "true"; then + AC_MSG_RESULT(yes) + + # Whether MLNX_OFED for ubuntu has been installed + MLNX_OFED_IB_UBUNTU_INSTALLED=`if dpkg -s mlnx-ofed-kernel-dkms >/dev/null 2>/dev/null; then echo true; else echo false; fi` + + # Whether MLNX_OFED for RedHat has been installed + MLNX_OFED_IB_RH_INSTALLED=`if rpm -q mlnx-ofa_kernel-devel >&/dev/null; then echo true; else echo false; fi` + + # Check if we have custom compiled kernel modules + if test "$MLNX_OFED_IB_RH_INSTALLED" = "false"; then + MLNX_OFED_IB_RH_INSTALLED=`if rpm -q kernel-ib-devel >&/dev/null; then echo true; else echo false; fi` + fi + + if test "$MLNX_OFED_IB_UBUNTU_INSTALLED" = "true"; then + OFED_VERS=`dpkg -s mlnx-ofed-kernel-dkms | awk -F\- '/Version/ {print $1}' | awk '{print $2}'` + OFED_CFLAGS=`echo -I/var/lib/dkms/mlnx-ofed-kernel/$OFED_VERS/build/include -include /var/lib/dkms/mlnx-ofed-kernel/$OFED_VERS/build/include/linux/compat-2.6.h` + RDMA_SYMVERS=`echo $RDMA_SYMVERS /var/lib/dkms/mlnx-ofed-kernel/$OFED_VERS/build/Module.symvers` + fi + + if test "$MLNX_OFED_IB_RH_INSTALLED" = "true"; then + OFED_CFLAGS=`echo -I/usr/src/ofa_kernel/default/include -include /usr/src/ofa_kernel/default/include/linux/compat-2.6.h` + RDMA_SYMVERS=`echo $RDMA_SYMVERS /usr/src/ofa_kernel/default/Module.symvers` + fi +else + AC_MSG_RESULT(no) + RDMA_SYMVERS=`echo $RDMA_SYMVERS ../compat/Module.symvers` + + # Whether or not the OFED kernel-ib-devel RPM has been installed. + OFED_KERNEL_IB_DEVEL_RPM_INSTALLED=`if rpm -q kernel-ib-devel 2>/dev/null | grep -q $(uname -r | sed 's/-/_/g'); then echo true; else echo false; fi` + + # Whether or not the OFED compat-rdma-devel RPM has been installed. + OFED_COMPAT_RDMA_DEVEL_RPM_INSTALLED=`if rpm -q compat-rdma-devel 2>/dev/null | grep -q $(uname -r | sed 's/-/_/g'); then echo true; else echo false; fi` + + if test "$OFED_KERNEL_IB_DEVEL_RPM_INSTALLED" = "true"; then + # Read OFED's config.mk, which contains the definition of the variable + # BACKPORT_INCLUDES. + cfile="/usr/src/ofa_kernel/config.mk" + if test -r "${cfile}"; then + echo "loading build-specific script '${cfile}'" + . "${cfile}" + else + cfile="/usr/src/ofa_kernel/default/config.mk" + if test -r "${cfile}"; then + echo "loading build-specific script '${cfile}'" + . 
"${cfile}" + fi + fi + + OFED_CFLAGS=`echo $BACKPORT_INCLUDES -I/usr/src/ofa_kernel/include` + RDMA_SYMVERS=`echo $RDMA_SYMVERS /usr/src/ofa_kernel/Module.symvers` + fi + + if test "$OFED_COMPAT_RDMA_DEVEL_RPM_INSTALLED" = "true"; then + OFED_CFLAGS=`echo -I/usr/src/compat-rdma/include -include /usr/src/compat-rdma/include/linux/compat-2.6.h` + RDMA_SYMVERS=`echo $RDMA_SYMVERS /usr/src/compat-rdma/Module.symvers` + fi +fi + +AC_MSG_NOTICE([ofed include files directory is ${OFED_CFLAGS}]) +AC_SUBST(OFED_CFLAGS) +AC_SUBST(RDMA_SYMVERS) +fi + +if test "$ENABLE_XIO_MODULE" = y; then + AC_MSG_CHECKING([kernel source directory]) + if test -z "$kernelsrc"; then + kernelbuild= + sourcelink=/lib/modules/${runver}/source + buildlink=/lib/modules/${runver}/build + + if test -e $sourcelink; then + kernelsrc=`(cd $sourcelink; /bin/pwd)` + fi + if test -e $buildlink; then + kernelbuild=`(cd $buildlink; /bin/pwd)` + fi + if test -z "$kernelsrc"; then + kernelsrc=$kernelbuild + fi + if test -z "$kernelsrc" -o -z "$kernelbuild"; then + AC_MSG_RESULT([Not found]) + AC_MSG_ERROR([ + *** Please specify the location of the kernel source with + *** the '--with-kernel=SRCDIR' option]) + fi + fi + AC_MSG_RESULT([$kernelsrc]) + AC_MSG_CHECKING([kernel build directory]) + AC_MSG_RESULT([$kernelbuild]) + + AC_MSG_CHECKING([kernel source version]) + if test -r $kernelbuild/include/linux/version.h && fgrep -q UTS_RELEASE $kernelbuild/include/linux/version.h; then + kernsrcver=`(echo "#include "; echo "kernsrcver=UTS_RELEASE") | cpp -I $kernelbuild/include | grep "^kernsrcver=" | cut -d \" -f 2` + elif test -r $kernelbuild/include/linux/utsrelease.h && fgrep -q UTS_RELEASE $kernelbuild/include/linux/utsrelease.h; then + kernsrcver=`(echo "#include "; echo "kernsrcver=UTS_RELEASE") | cpp -I $kernelbuild/include | grep "^kernsrcver=" | cut -d \" -f 2` + elif test -r $kernelbuild/include/generated/utsrelease.h && fgrep -q UTS_RELEASE $kernelbuild/include/generated/utsrelease.h; then + kernsrcver=`(echo "#include "; echo "kernsrcver=UTS_RELEASE") | cpp -I $kernelbuild/include | grep "^kernsrcver=" | cut -d \" -f 2` + fi + if test -z "$kernsrcver"; then + AC_MSG_RESULT([Not found]) + AC_MSG_ERROR([ + *** Cannot determine the version of the linux kernel source. Please + *** prepare the kernel before running this script]) + fi + AC_MSG_RESULT([$kernsrcver]) + kmoduledir=${INSTALL_MOD_PATH}/lib/modules/$kernsrcver + AC_SUBST(kernelsrc) + AC_SUBST(kmoduledir) + + if echo "$kernsrcver" | egrep -q ["^(2.4|2.6.[0-8]([^0-9]|\$))"]; then + bad_kernel_version=yes + AC_MSG_NOTICE([ +NOTE: Disabled building the kernel module, because this release only +NOTE: supports Linux versions 2.6.9 or later. You can use the kernel +NOTE: module from an earlier XIO release with the library from this +NOTE: release.]) + else + xio_configured=no + kernel_autoconf=$kernelbuild/include/linux/autoconf.h + AC_MSG_CHECKING([if XIO is configured in the kernel]) + if test -f $kernel_autoconf; then + if grep -q "^#define CONFIG_XIO 1" $kernel_autoconf || grep -q "^#define CONFIG_XIO_MODULE 1" $kernel_autoconf; then + xio_configured=yes + fi + fi + AC_MSG_RESULT([$xio_configured]) + if test -z "$enable_kernel_module" -a "$xio_configured" = yes; then + ENABLE_XIO_MODULE=n + fi + fi +fi + +if test "$ENABLE_XIO_MODULE" = n; then + AC_MSG_NOTICE([ +NOTE: Detected that XIO is already present in the kernel, so +NOTE: building of kernel module is disabled. 
To force building +NOTE: of kernel module use the '--enable-kernel-module' option.]) +fi + +if test "$enable_kernel_module" = no; then + ENABLE_XIO_MODULE=n +fi +if test "$bad_kernel_version" = yes; then + ENABLE_XIO_MODULE=n +fi + +AC_MSG_CHECKING([is ENABLE_XIO_MODULE defined]) +if test "$ENABLE_XIO_MODULE" = y; then + AC_MSG_RESULT([yes]) +else + AC_MSG_RESULT([no]) +fi + +AC_SUBST(ENABLE_XIO_MODULE) + +if test "$ENABLE_XIO_MODULE" = y; then + AC_MSG_CHECKING([if kernel defines kzalloc function]) + if egrep -qw "kzalloc" $kernelsrc/include/linux/slab.h; then + AC_DEFINE(HAVE_KZALLOC, 1, [kzalloc() is defined]) + AC_MSG_RESULT([yes]) + else + AC_MSG_RESULT([no]) + fi + + isuml=no + KERNELMAKE_PARAMS= + KERNELCPPFLAGS= + AC_MSG_CHECKING([if this is user mode linux]) + if test -f $kernelbuild/include/linux/autoconf.h && egrep -q "^#define CONFIG_(USERMODE|UML) 1" $kernelbuild/include/linux/autoconf.h; then + isuml=yes + KERNELMAKE_PARAMS="ARCH=um" + KERNELCPPFLAGS="-D__arch_um__ -DSUBARCH=\\\"i386\\\" -D_LARGEFILE64_SOURCE -I${kernelsrc}/arch/um/include -Derrno=kernel_errno -I${kernelsrc}/arch/um/kernel/tt/include -I${kernelsrc}/arch/um/kernel/skas/include" + fi + AC_MSG_RESULT([$isuml]) + if test "$kernelbuild" != "$kernelsrc"; then + KERNELMAKE_PARAMS="$KERNELMAKE_PARAMS O=$kernelbuild" + fi + AC_SUBST(KERNELMAKE_PARAMS) + AC_SUBST(KERNELCPPFLAGS) + AC_SUBST(KERNELCFLAGS) +fi + +AC_CONFIG_FILES([Makefile]) +AC_OUTPUT diff --git a/open_src/xio/src/kernel/transport/rdma/install-sh b/open_src/xio/src/kernel/transport/rdma/install-sh new file mode 100644 index 0000000..6781b98 --- /dev/null +++ b/open_src/xio/src/kernel/transport/rdma/install-sh @@ -0,0 +1,520 @@ +#!/bin/sh +# install - install a program, script, or datafile + +scriptversion=2009-04-28.21; # UTC + +# This originates from X11R5 (mit/util/scripts/install.sh), which was +# later released in X11R6 (xc/config/util/install.sh) with the +# following copyright and license. +# +# Copyright (C) 1994 X Consortium +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to +# deal in the Software without restriction, including without limitation the +# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +# sell copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# X CONSORTIUM BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN +# AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNEC- +# TION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +# +# Except as contained in this notice, the name of the X Consortium shall not +# be used in advertising or otherwise to promote the sale, use or other deal- +# ings in this Software without prior written authorization from the X Consor- +# tium. +# +# +# FSF changes to this file are in the public domain. +# +# Calling this script install-sh is preferred over install.sh, to prevent +# `make' implicit rules from creating a file called install from it +# when there is no Makefile. 
+# +# This script is compatible with the BSD install script, but was written +# from scratch. + +nl=' +' +IFS=" "" $nl" + +# set DOITPROG to echo to test this script + +# Don't use :- since 4.3BSD and earlier shells don't like it. +doit=${DOITPROG-} +if test -z "$doit"; then + doit_exec=exec +else + doit_exec=$doit +fi + +# Put in absolute file names if you don't have them in your path; +# or use environment vars. + +chgrpprog=${CHGRPPROG-chgrp} +chmodprog=${CHMODPROG-chmod} +chownprog=${CHOWNPROG-chown} +cmpprog=${CMPPROG-cmp} +cpprog=${CPPROG-cp} +mkdirprog=${MKDIRPROG-mkdir} +mvprog=${MVPROG-mv} +rmprog=${RMPROG-rm} +stripprog=${STRIPPROG-strip} + +posix_glob='?' +initialize_posix_glob=' + test "$posix_glob" != "?" || { + if (set -f) 2>/dev/null; then + posix_glob= + else + posix_glob=: + fi + } +' + +posix_mkdir= + +# Desired mode of installed file. +mode=0755 + +chgrpcmd= +chmodcmd=$chmodprog +chowncmd= +mvcmd=$mvprog +rmcmd="$rmprog -f" +stripcmd= + +src= +dst= +dir_arg= +dst_arg= + +copy_on_change=false +no_target_directory= + +usage="\ +Usage: $0 [OPTION]... [-T] SRCFILE DSTFILE + or: $0 [OPTION]... SRCFILES... DIRECTORY + or: $0 [OPTION]... -t DIRECTORY SRCFILES... + or: $0 [OPTION]... -d DIRECTORIES... + +In the 1st form, copy SRCFILE to DSTFILE. +In the 2nd and 3rd, copy all SRCFILES to DIRECTORY. +In the 4th, create DIRECTORIES. + +Options: + --help display this help and exit. + --version display version info and exit. + + -c (ignored) + -C install only if different (preserve the last data modification time) + -d create directories instead of installing files. + -g GROUP $chgrpprog installed files to GROUP. + -m MODE $chmodprog installed files to MODE. + -o USER $chownprog installed files to USER. + -s $stripprog installed files. + -t DIRECTORY install into DIRECTORY. + -T report an error if DSTFILE is a directory. + +Environment variables override the default commands: + CHGRPPROG CHMODPROG CHOWNPROG CMPPROG CPPROG MKDIRPROG MVPROG + RMPROG STRIPPROG +" + +while test $# -ne 0; do + case $1 in + -c) ;; + + -C) copy_on_change=true;; + + -d) dir_arg=true;; + + -g) chgrpcmd="$chgrpprog $2" + shift;; + + --help) echo "$usage"; exit $?;; + + -m) mode=$2 + case $mode in + *' '* | *' '* | *' +'* | *'*'* | *'?'* | *'['*) + echo "$0: invalid mode: $mode" >&2 + exit 1;; + esac + shift;; + + -o) chowncmd="$chownprog $2" + shift;; + + -s) stripcmd=$stripprog;; + + -t) dst_arg=$2 + shift;; + + -T) no_target_directory=true;; + + --version) echo "$0 $scriptversion"; exit $?;; + + --) shift + break;; + + -*) echo "$0: invalid option: $1" >&2 + exit 1;; + + *) break;; + esac + shift +done + +if test $# -ne 0 && test -z "$dir_arg$dst_arg"; then + # When -d is used, all remaining arguments are directories to create. + # When -t is used, the destination is already specified. + # Otherwise, the last argument is the destination. Remove it from $@. + for arg + do + if test -n "$dst_arg"; then + # $@ is not empty: it contains at least $arg. + set fnord "$@" "$dst_arg" + shift # fnord + fi + shift # arg + dst_arg=$arg + done +fi + +if test $# -eq 0; then + if test -z "$dir_arg"; then + echo "$0: no input file specified." >&2 + exit 1 + fi + # It's OK to call `install-sh -d' without argument. + # This can happen when creating conditional directories. + exit 0 +fi + +if test -z "$dir_arg"; then + trap '(exit $?); exit' 1 2 13 15 + + # Set umask so as not to create temps with too-generous modes. + # However, 'strip' requires both read and write access to temps. + case $mode in + # Optimize common cases. 
+ *644) cp_umask=133;; + *755) cp_umask=22;; + + *[0-7]) + if test -z "$stripcmd"; then + u_plus_rw= + else + u_plus_rw='% 200' + fi + cp_umask=`expr '(' 777 - $mode % 1000 ')' $u_plus_rw`;; + *) + if test -z "$stripcmd"; then + u_plus_rw= + else + u_plus_rw=,u+rw + fi + cp_umask=$mode$u_plus_rw;; + esac +fi + +for src +do + # Protect names starting with `-'. + case $src in + -*) src=./$src;; + esac + + if test -n "$dir_arg"; then + dst=$src + dstdir=$dst + test -d "$dstdir" + dstdir_status=$? + else + + # Waiting for this to be detected by the "$cpprog $src $dsttmp" command + # might cause directories to be created, which would be especially bad + # if $src (and thus $dsttmp) contains '*'. + if test ! -f "$src" && test ! -d "$src"; then + echo "$0: $src does not exist." >&2 + exit 1 + fi + + if test -z "$dst_arg"; then + echo "$0: no destination specified." >&2 + exit 1 + fi + + dst=$dst_arg + # Protect names starting with `-'. + case $dst in + -*) dst=./$dst;; + esac + + # If destination is a directory, append the input filename; won't work + # if double slashes aren't ignored. + if test -d "$dst"; then + if test -n "$no_target_directory"; then + echo "$0: $dst_arg: Is a directory" >&2 + exit 1 + fi + dstdir=$dst + dst=$dstdir/`basename "$src"` + dstdir_status=0 + else + # Prefer dirname, but fall back on a substitute if dirname fails. + dstdir=` + (dirname "$dst") 2>/dev/null || + expr X"$dst" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \ + X"$dst" : 'X\(//\)[^/]' \| \ + X"$dst" : 'X\(//\)$' \| \ + X"$dst" : 'X\(/\)' \| . 2>/dev/null || + echo X"$dst" | + sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{ + s//\1/ + q + } + /^X\(\/\/\)[^/].*/{ + s//\1/ + q + } + /^X\(\/\/\)$/{ + s//\1/ + q + } + /^X\(\/\).*/{ + s//\1/ + q + } + s/.*/./; q' + ` + + test -d "$dstdir" + dstdir_status=$? + fi + fi + + obsolete_mkdir_used=false + + if test $dstdir_status != 0; then + case $posix_mkdir in + '') + # Create intermediate dirs using mode 755 as modified by the umask. + # This is like FreeBSD 'install' as of 1997-10-28. + umask=`umask` + case $stripcmd.$umask in + # Optimize common cases. + *[2367][2367]) mkdir_umask=$umask;; + .*0[02][02] | .[02][02] | .[02]) mkdir_umask=22;; + + *[0-7]) + mkdir_umask=`expr $umask + 22 \ + - $umask % 100 % 40 + $umask % 20 \ + - $umask % 10 % 4 + $umask % 2 + `;; + *) mkdir_umask=$umask,go-w;; + esac + + # With -d, create the new directory with the user-specified mode. + # Otherwise, rely on $mkdir_umask. + if test -n "$dir_arg"; then + mkdir_mode=-m$mode + else + mkdir_mode= + fi + + posix_mkdir=false + case $umask in + *[123567][0-7][0-7]) + # POSIX mkdir -p sets u+wx bits regardless of umask, which + # is incompatible with FreeBSD 'install' when (umask & 300) != 0. + ;; + *) + tmpdir=${TMPDIR-/tmp}/ins$RANDOM-$$ + trap 'ret=$?; rmdir "$tmpdir/d" "$tmpdir" 2>/dev/null; exit $ret' 0 + + if (umask $mkdir_umask && + exec $mkdirprog $mkdir_mode -p -- "$tmpdir/d") >/dev/null 2>&1 + then + if test -z "$dir_arg" || { + # Check for POSIX incompatibilities with -m. + # HP-UX 11.23 and IRIX 6.5 mkdir -m -p sets group- or + # other-writeable bit of parent directory when it shouldn't. + # FreeBSD 6.1 mkdir -m -p sets mode of existing directory. 
+ ls_ld_tmpdir=`ls -ld "$tmpdir"` + case $ls_ld_tmpdir in + d????-?r-*) different_mode=700;; + d????-?--*) different_mode=755;; + *) false;; + esac && + $mkdirprog -m$different_mode -p -- "$tmpdir" && { + ls_ld_tmpdir_1=`ls -ld "$tmpdir"` + test "$ls_ld_tmpdir" = "$ls_ld_tmpdir_1" + } + } + then posix_mkdir=: + fi + rmdir "$tmpdir/d" "$tmpdir" + else + # Remove any dirs left behind by ancient mkdir implementations. + rmdir ./$mkdir_mode ./-p ./-- 2>/dev/null + fi + trap '' 0;; + esac;; + esac + + if + $posix_mkdir && ( + umask $mkdir_umask && + $doit_exec $mkdirprog $mkdir_mode -p -- "$dstdir" + ) + then : + else + + # The umask is ridiculous, or mkdir does not conform to POSIX, + # or it failed possibly due to a race condition. Create the + # directory the slow way, step by step, checking for races as we go. + + case $dstdir in + /*) prefix='/';; + -*) prefix='./';; + *) prefix='';; + esac + + eval "$initialize_posix_glob" + + oIFS=$IFS + IFS=/ + $posix_glob set -f + set fnord $dstdir + shift + $posix_glob set +f + IFS=$oIFS + + prefixes= + + for d + do + test -z "$d" && continue + + prefix=$prefix$d + if test -d "$prefix"; then + prefixes= + else + if $posix_mkdir; then + (umask=$mkdir_umask && + $doit_exec $mkdirprog $mkdir_mode -p -- "$dstdir") && break + # Don't fail if two instances are running concurrently. + test -d "$prefix" || exit 1 + else + case $prefix in + *\'*) qprefix=`echo "$prefix" | sed "s/'/'\\\\\\\\''/g"`;; + *) qprefix=$prefix;; + esac + prefixes="$prefixes '$qprefix'" + fi + fi + prefix=$prefix/ + done + + if test -n "$prefixes"; then + # Don't fail if two instances are running concurrently. + (umask $mkdir_umask && + eval "\$doit_exec \$mkdirprog $prefixes") || + test -d "$dstdir" || exit 1 + obsolete_mkdir_used=true + fi + fi + fi + + if test -n "$dir_arg"; then + { test -z "$chowncmd" || $doit $chowncmd "$dst"; } && + { test -z "$chgrpcmd" || $doit $chgrpcmd "$dst"; } && + { test "$obsolete_mkdir_used$chowncmd$chgrpcmd" = false || + test -z "$chmodcmd" || $doit $chmodcmd $mode "$dst"; } || exit 1 + else + + # Make a couple of temp file names in the proper directory. + dsttmp=$dstdir/_inst.$$_ + rmtmp=$dstdir/_rm.$$_ + + # Trap to clean up those temp files at exit. + trap 'ret=$?; rm -f "$dsttmp" "$rmtmp" && exit $ret' 0 + + # Copy the file name to the temp name. + (umask $cp_umask && $doit_exec $cpprog "$src" "$dsttmp") && + + # and set any options; do chmod last to preserve setuid bits. + # + # If any of these fail, we abort the whole thing. If we want to + # ignore errors from any of these, just make sure not to ignore + # errors from the above "$doit $cpprog $src $dsttmp" command. + # + { test -z "$chowncmd" || $doit $chowncmd "$dsttmp"; } && + { test -z "$chgrpcmd" || $doit $chgrpcmd "$dsttmp"; } && + { test -z "$stripcmd" || $doit $stripcmd "$dsttmp"; } && + { test -z "$chmodcmd" || $doit $chmodcmd $mode "$dsttmp"; } && + + # If -C, don't bother to copy if it wouldn't change the file. + if $copy_on_change && + old=`LC_ALL=C ls -dlL "$dst" 2>/dev/null` && + new=`LC_ALL=C ls -dlL "$dsttmp" 2>/dev/null` && + + eval "$initialize_posix_glob" && + $posix_glob set -f && + set X $old && old=:$2:$4:$5:$6 && + set X $new && new=:$2:$4:$5:$6 && + $posix_glob set +f && + + test "$old" = "$new" && + $cmpprog "$dst" "$dsttmp" >/dev/null 2>&1 + then + rm -f "$dsttmp" + else + # Rename the file to the real destination. 
+ $doit $mvcmd -f "$dsttmp" "$dst" 2>/dev/null || + + # The rename failed, perhaps because mv can't rename something else + # to itself, or perhaps because mv is so ancient that it does not + # support -f. + { + # Now remove or move aside any old file at destination location. + # We try this two ways since rm can't unlink itself on some + # systems and the destination file might be busy for other + # reasons. In this case, the final cleanup might fail but the new + # file should still install successfully. + { + test ! -f "$dst" || + $doit $rmcmd -f "$dst" 2>/dev/null || + { $doit $mvcmd -f "$dst" "$rmtmp" 2>/dev/null && + { $doit $rmcmd -f "$rmtmp" 2>/dev/null; :; } + } || + { echo "$0: cannot unlink or rename $dst" >&2 + (exit 1); exit 1 + } + } && + + # Now rename the file to the real destination. + $doit $mvcmd "$dsttmp" "$dst" + } + fi || exit 1 + + trap '' 0 + fi +done + +# Local variables: +# eval: (add-hook 'write-file-hooks 'time-stamp) +# time-stamp-start: "scriptversion=" +# time-stamp-format: "%:y-%02m-%02d.%02H" +# time-stamp-time-zone: "UTC" +# time-stamp-end: "; # UTC" +# End: diff --git a/open_src/xio/src/kernel/transport/rdma/xio_rdma_datapath.c b/open_src/xio/src/kernel/transport/rdma/xio_rdma_datapath.c new file mode 100644 index 0000000..59a725d --- /dev/null +++ b/open_src/xio/src/kernel/transport/rdma/xio_rdma_datapath.c @@ -0,0 +1,5246 @@ +/* + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "libxio.h" +#include +#include "xio_log.h" +#include "xio_observer.h" +#include "xio_common.h" +#include "xio_protocol.h" +#include "xio_mbuf.h" +#include "xio_task.h" +#include "xio_ktransport.h" +#include "xio_transport.h" +#include "xio_protocol.h" +#include "xio_mem.h" +#include "xio_mempool.h" +#include "xio_rdma_transport.h" +#include "xio_rdma_utils.h" +#include "xio_sg_table.h" +#include "xio_ev_data.h" +#include "xio_objpool.h" +#include "xio_workqueue.h" +#include "xio_context.h" + +/*---------------------------------------------------------------------------*/ +/* globals */ +/*---------------------------------------------------------------------------*/ + +/*---------------------------------------------------------------------------*/ +/* forward declarations */ +/*---------------------------------------------------------------------------*/ +static void xio_prep_rdma_wr_send_req( + struct xio_task *task, + struct xio_rdma_transport *rdma_hndl, + struct xio_work_req *next_wr, + int signaled); +static void xio_prep_rdma_rd_send_req( + struct xio_task *task, + struct xio_rdma_transport *rdma_hndl, + int signaled); +static int xio_rdma_on_recv_req(struct xio_rdma_transport *rdma_hndl, + struct xio_task *task); +static int xio_rdma_on_recv_rsp(struct xio_rdma_transport *rdma_hndl, + struct xio_task *task); +static int xio_rdma_on_setup_msg(struct xio_rdma_transport *rdma_hndl, + struct xio_task *task); +static int xio_rdma_on_req_send_comp(struct xio_rdma_transport *rdma_hndl, + struct xio_task *task); +static int xio_rdma_on_rsp_send_comp(struct xio_rdma_transport *rdma_hndl, + struct xio_task *task); +static int xio_rdma_on_direct_rdma_comp(struct xio_rdma_transport *rdma_hndl, + struct xio_task *task, + enum xio_wc_op op); +static int xio_rdma_on_recv_nop(struct xio_rdma_transport *rdma_hndl, + struct xio_task *task); +static int xio_rdma_send_nop(struct xio_rdma_transport *rdma_hndl); +static int xio_rdma_on_recv_cancel_req(struct xio_rdma_transport *rdma_hndl, + struct xio_task *task); +static int xio_rdma_on_recv_cancel_rsp(struct xio_rdma_transport *rdma_hndl, + struct xio_task *task); +static int xio_sched_rdma_rd(struct xio_rdma_transport *rdma_hndl, + struct xio_task *task); +static int xio_sched_rdma_wr_req(struct xio_rdma_transport *rdma_hndl, + struct xio_task *task); +void xio_cq_data_callback_cont(struct ib_cq *cq, void *cq_context); +static int xio_rdma_send_rdma_read_ack(struct xio_rdma_transport *rdma_hndl, + int rtid); +static int xio_rdma_on_recv_rdma_read_ack(struct xio_rdma_transport *rdma_hndl, + struct xio_task *task); +static int xio_sched_rdma_rd(struct xio_rdma_transport *rdma_hndl, + struct xio_task *task); +static int xio_rdma_post_recv_rsp(struct xio_task *task); +/*---------------------------------------------------------------------------*/ +/* xio_post_recv */ +/*---------------------------------------------------------------------------*/ +int xio_post_recv(struct xio_rdma_transport *rdma_hndl, + struct xio_task *task, int num_recv_bufs) +{ + XIO_TO_RDMA_TASK(task, rdma_task); + struct ib_recv_wr *bad_wr = NULL; + int retval, nr_posted; + + retval = ib_post_recv(rdma_hndl->qp, &rdma_task->rxd.recv_wr, &bad_wr); + if (likely(!retval)) { + nr_posted = num_recv_bufs; + } else { + struct ib_recv_wr *wr; + nr_posted = 0; + for (wr = &rdma_task->rxd.recv_wr; wr != bad_wr; wr = wr->next) + nr_posted++; + + xio_set_error(retval); + ERROR_LOG("ib_post_recv 
failed. (errno=%d %s)\n", + retval, strerror(retval)); + } + rdma_hndl->rqe_avail += nr_posted; + + /* credit updates */ + rdma_hndl->credits += nr_posted; + + return retval; +} + +/*---------------------------------------------------------------------------*/ +/* xio_post_send */ +/*---------------------------------------------------------------------------*/ +static int xio_post_send(struct xio_rdma_transport *rdma_hndl, + struct xio_work_req *xio_send, + int num_send_reqs) +{ + struct ib_send_wr *bad_wr, *wr; + int retval, nr_posted; + + /* + for (wr = &xio_send->send_wr; wr != NULL; wr = wr->next) + ERROR_LOG("wr_id:0x%llx, num_sge:%d, addr:0x%llx, len1:%d, " \ + "addr:0x%llx, len2:%d, send_flags:%d\n", + wr->wr_id, + wr->num_sge, + wr->sg_list[0].addr, + wr->sg_list[0].length, + wr->sg_list[1].addr, + wr->sg_list[1].length, + wr->send_flags); + */ + + retval = ib_post_send(rdma_hndl->qp, &xio_send->send_wr, &bad_wr); + if (likely(!retval)) { + nr_posted = num_send_reqs; + } else { + nr_posted = 0; + for (wr = &xio_send->send_wr; wr != bad_wr; wr = wr->next) + nr_posted++; + + xio_set_error(retval); + + ERROR_LOG("ib_post_send failed. (errno=%d %s) posted:%d/%d " \ + "sge_sz:%d, sqe_avail:%d\n", retval, strerror(retval), + nr_posted, num_send_reqs, xio_send->send_wr.num_sge, + rdma_hndl->sqe_avail); + } + rdma_hndl->sqe_avail -= nr_posted; + + return retval; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_write_sn */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_write_sn(struct xio_task *task, + uint16_t sn, uint16_t ack_sn, uint16_t credits) +{ + uint16_t *psn; + + /* save the current place */ + xio_mbuf_push(&task->mbuf); + /* goto to the first tlv */ + xio_mbuf_reset(&task->mbuf); + /* goto the first transport header*/ + xio_mbuf_set_trans_hdr(&task->mbuf); + + /* jump over the first uint32_t */ + xio_mbuf_inc(&task->mbuf, sizeof(uint32_t)); + + /* and set serial number */ + psn = xio_mbuf_get_curr_ptr(&task->mbuf); + *psn = htons(sn); + + xio_mbuf_inc(&task->mbuf, sizeof(uint16_t)); + + /* and set ack serial number */ + psn = xio_mbuf_get_curr_ptr(&task->mbuf); + *psn = htons(ack_sn); + + xio_mbuf_inc(&task->mbuf, sizeof(uint16_t)); + + /* and set credits */ + psn = xio_mbuf_get_curr_ptr(&task->mbuf); + *psn = htons(credits); + + /* pop to the original place */ + xio_mbuf_pop(&task->mbuf); + + return 0; +} + +static inline uint16_t tx_window_sz(struct xio_rdma_transport *rdma_hndl) +{ + return rdma_hndl->max_sn - rdma_hndl->sn; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_xmit */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_xmit(struct xio_rdma_transport *rdma_hndl) +{ + struct xio_task *task = NULL, *task1, *task2; + struct xio_rdma_task *rdma_task = NULL; + struct xio_rdma_task *prev_rdma_task = NULL; + struct xio_work_req *first_wr = NULL; + struct xio_work_req *curr_wr = NULL; + struct xio_work_req *last_wr = NULL; + struct xio_work_req *prev_wr = &rdma_hndl->dummy_wr; + uint16_t tx_window; + uint16_t window = 0; + uint16_t retval; + uint16_t req_nr = 0; + + tx_window = tx_window_sz(rdma_hndl); + /* save one credit for nop */ + if (rdma_hndl->peer_credits > 1) { + uint16_t peer_credits = rdma_hndl->peer_credits - 1; + + window = min(peer_credits, tx_window); + window = min(window, ((uint16_t)rdma_hndl->sqe_avail)); + } + /* + TRACE_LOG("XMIT: tx_window:%d, 
peer_credits:%d, sqe_avail:%d\n", + tx_window, + rdma_hndl->peer_credits, + rdma_hndl->sqe_avail); + */ + if (window == 0) { + xio_set_error(EAGAIN); + return -1; + } + + /* if "ready to send queue" is not empty */ + while (rdma_hndl->tx_ready_tasks_num) { + task = list_first_entry(&rdma_hndl->tx_ready_list, + struct xio_task, tasks_list_entry); + + rdma_task = task->dd_data; + + /* prefetch next buffer */ + if (rdma_hndl->tx_ready_tasks_num > 2) { + task1 = list_first_entry/*_or_null*/( + &task->tasks_list_entry, + struct xio_task, tasks_list_entry); + if (task1) { + xio_prefetch(task1->mbuf.buf.head); + task2 = list_first_entry/*_or_null*/( + &task1->tasks_list_entry, + struct xio_task, + tasks_list_entry); + if (task2) + xio_prefetch(task2->mbuf.buf.head); + } + } + + /* phantom task */ + if (rdma_task->phantom_idx) { + if (req_nr >= window) + break; + curr_wr = &rdma_task->rdmad; + + prev_wr->send_wr.next = &curr_wr->send_wr; + + prev_rdma_task = rdma_task; + prev_wr = curr_wr; + req_nr++; + rdma_hndl->tx_ready_tasks_num--; + + rdma_task->txd.send_wr.send_flags &= ~IB_SEND_SIGNALED; + + if (rdma_task->out_ib_op == XIO_IB_RDMA_WRITE || + rdma_task->out_ib_op == XIO_IB_RDMA_WRITE_DIRECT) { + /* + if (xio_map_txmad_work_req(rdma_hndl->dev, + curr_wr)) + ERROR_LOG("DMA map to device failed\n"); + */ + xio_prep_rdma_wr_send_req(task, rdma_hndl, + NULL /*no next*/, + 0 /* signaled */); + } + + if (rdma_task->out_ib_op == XIO_IB_RDMA_READ_DIRECT) { + xio_prep_rdma_rd_send_req(task, rdma_hndl, + 0 /* signaled */); + } + + list_move_tail(&task->tasks_list_entry, + &rdma_hndl->in_flight_list); + continue; + } + if (rdma_task->out_ib_op == XIO_IB_RDMA_WRITE) { + if (req_nr >= (window - 1)) + break; + + /* prepare it for rdma wr and concatenate the send + * wr to it */ + xio_prep_rdma_wr_send_req(task, rdma_hndl, + &rdma_task->txd, 1); + + rdma_task->rdmad.send_wr.next = &rdma_task->txd.send_wr; + rdma_task->txd.send_wr.send_flags |= IB_SEND_SIGNALED; + + /* prev wr will be linked to the RDMA */ + curr_wr = &rdma_task->rdmad; + last_wr = &rdma_task->txd; + + req_nr++; + } else if (rdma_task->out_ib_op == XIO_IB_RDMA_WRITE_DIRECT) { + if (req_nr >= window) + break; + xio_prep_rdma_wr_send_req(task, rdma_hndl, + NULL /*no next*/, + 1 /* signaled */); + curr_wr = &rdma_task->rdmad; + last_wr = curr_wr; + } else if (rdma_task->out_ib_op == XIO_IB_RDMA_READ_DIRECT) { + if (req_nr >= window) + break; + xio_prep_rdma_rd_send_req(task, rdma_hndl, + 1 /* signaled */); + curr_wr = &rdma_task->rdmad; + last_wr = curr_wr; + } else { + if (req_nr >= window) + break; + /* prev wr will be linked to the send */ + curr_wr = &rdma_task->txd; + last_wr = curr_wr; + } + if (rdma_task->out_ib_op != XIO_IB_RDMA_WRITE_DIRECT && + rdma_task->out_ib_op != XIO_IB_RDMA_READ_DIRECT) { + xio_rdma_write_sn(task, rdma_hndl->sn, + rdma_hndl->ack_sn, + rdma_hndl->credits); + rdma_task->sn = rdma_hndl->sn; + + /* set the length of the header */ + rdma_task->txd.sgt.sgl[0].length = + xio_mbuf_data_length(&task->mbuf); + + /* Map the send */ + if (unlikely(xio_map_tx_work_req(rdma_hndl->dev, + &rdma_task->txd))) { + ERROR_LOG("DMA map to device failed\n"); + return -1; + } + rdma_task->txd.send_wr.num_sge = rdma_task->txd.mapped; + + rdma_hndl->sn++; + rdma_hndl->sim_peer_credits += rdma_hndl->credits; + rdma_hndl->credits = 0; + rdma_hndl->peer_credits--; + } + if (IS_REQUEST(task->tlv_type) || + task->tlv_type == XIO_MSG_TYPE_RDMA) + rdma_hndl->reqs_in_flight_nr++; + else if (IS_RESPONSE(task->tlv_type)) + 
rdma_hndl->rsps_in_flight_nr++; + else + ERROR_LOG("Unexpected tlv_type %u\n", task->tlv_type); + + prev_wr->send_wr.next = &curr_wr->send_wr; + prev_wr = last_wr; + + prev_rdma_task = rdma_task; + req_nr++; + rdma_hndl->tx_ready_tasks_num--; + list_move_tail(&task->tasks_list_entry, + &rdma_hndl->in_flight_list); + } + + if (req_nr) { + first_wr = container_of(rdma_hndl->dummy_wr.send_wr.next, + struct xio_work_req, send_wr); + prev_rdma_task->txd.send_wr.next = NULL; + if (tx_window_sz(rdma_hndl) < 1 || + rdma_hndl->sqe_avail < req_nr + 1) + prev_rdma_task->txd.send_wr.send_flags |= + IB_SEND_SIGNALED; + retval = xio_post_send(rdma_hndl, first_wr, req_nr); + if (unlikely(retval != 0)) { + ERROR_LOG("xio_post_send failed\n"); + return -1; + } + } + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_xmit_rdma_rd */ +/*---------------------------------------------------------------------------*/ +static int xio_xmit_rdma_rd_(struct xio_rdma_transport *rdma_hndl, + struct list_head *rdma_rd_list, + struct list_head *rdma_rd_in_flight_list, + int *rdma_rd_in_flight, + int *kick_rdma_rd) +{ + struct xio_task *task = NULL; + struct xio_rdma_task *rdma_task = NULL; + struct xio_work_req *first_wr = NULL; + struct xio_work_req *prev_wr = &rdma_hndl->dummy_wr; + struct xio_work_req *curr_wr = NULL; + int num_reqs = 0; + int err; + + if (list_empty(rdma_rd_list) || + rdma_hndl->sqe_avail == 0) + goto exit; + + do { + task = list_first_entry( + rdma_rd_list, + struct xio_task, tasks_list_entry); + list_move_tail(&task->tasks_list_entry, + rdma_rd_in_flight_list); + rdma_task = task->dd_data; + + /* pending "sends" that were delayed for rdma read completion + * are moved to wait in the in_flight list + * because of the need to keep order + */ + if (rdma_task->out_ib_op == XIO_IB_RECV) { + (*rdma_rd_in_flight)++; + continue; + } + + BUG_ON(rdma_task->out_ib_op != XIO_IB_RDMA_READ); + /* prepare it for rdma read */ + xio_prep_rdma_rd_send_req(task, rdma_hndl, 1); + + curr_wr = &rdma_task->rdmad; + prev_wr->send_wr.next = &curr_wr->send_wr; + prev_wr = &rdma_task->rdmad; + + num_reqs++; + } while (!list_empty(rdma_rd_list) && + rdma_hndl->sqe_avail > num_reqs); + + rdma_hndl->kick_rdma_rd_req = 0; + if (num_reqs) { + first_wr = container_of(rdma_hndl->dummy_wr.send_wr.next, + struct xio_work_req, send_wr); + prev_wr->send_wr.next = NULL; + (*rdma_rd_in_flight) += num_reqs; + /* submit the chain of rdma-rd requests, start from the first */ + err = xio_post_send(rdma_hndl, first_wr, num_reqs); + if (unlikely(err)) + ERROR_LOG("xio_post_send failed\n"); + + /* ToDo: error handling */ + } +exit: + *kick_rdma_rd = !list_empty(rdma_rd_list); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_xmit_rdma_rd_req */ +/*---------------------------------------------------------------------------*/ +static inline int xio_xmit_rdma_rd_req(struct xio_rdma_transport *rdma_hndl) +{ + return xio_xmit_rdma_rd_(rdma_hndl, + &rdma_hndl->rdma_rd_req_list, + &rdma_hndl->rdma_rd_req_in_flight_list, + &rdma_hndl->rdma_rd_req_in_flight, + &rdma_hndl->kick_rdma_rd_req); +} + +/*---------------------------------------------------------------------------*/ +/* xio_xmit_rdma_rd_rsp */ +/*---------------------------------------------------------------------------*/ +static inline int xio_xmit_rdma_rd_rsp(struct xio_rdma_transport *rdma_hndl) +{ + return xio_xmit_rdma_rd_(rdma_hndl, + &rdma_hndl->rdma_rd_rsp_list, + 
&rdma_hndl->rdma_rd_rsp_in_flight_list, + &rdma_hndl->rdma_rd_rsp_in_flight, + &rdma_hndl->kick_rdma_rd_rsp); +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_rearm_rq */ +/*---------------------------------------------------------------------------*/ +int xio_rdma_rearm_rq(struct xio_rdma_transport *rdma_hndl) +{ + struct xio_task *first_task = NULL; + struct xio_task *task = NULL; + struct xio_task *prev_task = NULL; + struct xio_rdma_task *rdma_task = NULL; + struct xio_rdma_task *prev_rdma_task = NULL; + int num_to_post; + int i; + struct xio_work_req *rxd; + + num_to_post = rdma_hndl->actual_rq_depth - rdma_hndl->rqe_avail; + for (i = 0; i < num_to_post; i++) { + /* get ready to receive message */ + task = xio_rdma_primary_task_alloc(rdma_hndl); + if (unlikely(task == 0)) { + ERROR_LOG("primary tasks pool is empty\n"); + return -1; + } + rdma_task = task->dd_data; + /* map the receive address for dma + * Note other sge fields don't change + */ + + rxd = &rdma_task->rxd; + if (unlikely(xio_map_rx_work_req(rdma_hndl->dev, rxd))) { + ERROR_LOG("DMA map from device failed\n"); + return -1; + } + rxd->recv_wr.num_sge = rxd->mapped; + + if (!first_task) + first_task = task; + else + prev_rdma_task->rxd.recv_wr.next = + &rdma_task->rxd.recv_wr; + + prev_task = task; + prev_rdma_task = rdma_task; + rdma_task->out_ib_op = XIO_IB_RECV; + list_add_tail(&task->tasks_list_entry, &rdma_hndl->rx_list); + } + if (prev_task) { + prev_rdma_task->rxd.recv_wr.next = NULL; + xio_post_recv(rdma_hndl, first_task, num_to_post); + } + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_rx_error_handler */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_rx_error_handler(struct xio_rdma_transport *rdma_hndl, + struct xio_task *task) +{ + XIO_TO_RDMA_TASK(task, rdma_task); + + /* unmap dma */ + xio_unmap_rx_work_req(rdma_hndl->dev, &rdma_task->rxd); + + /* remove the task from rx list */ + xio_tasks_pool_put(task); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_tx_error_handler */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_tx_error_handler(struct xio_rdma_transport *rdma_hndl, + struct xio_task *task) +{ + XIO_TO_RDMA_TASK(task, rdma_task); + + /* unmap dma */ + xio_unmap_tx_work_req(rdma_hndl->dev, &rdma_task->txd); + + /* remove the task from in-flight list */ + xio_tasks_pool_put(task); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_rd_error_handler */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_rd_error_handler(struct xio_rdma_transport *rdma_hndl, + struct xio_task *task) +{ + /* remove the task from rdma rd in-flight list */ + xio_tasks_pool_put(task); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_wr_error_handler */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_wr_error_handler(struct xio_rdma_transport *rdma_hndl, + struct xio_task *task) +{ + XIO_TO_RDMA_TASK(task, rdma_task); + + if (rdma_task->out_ib_op == XIO_IB_RDMA_WRITE_DIRECT) + return 0; + + /* wait for the concatenated "send" */ + rdma_task->out_ib_op = XIO_IB_SEND; + + return 0; +} + 
+/*---------------------------------------------------------------------------*/ +/* xio_handle_task_error */ +/*---------------------------------------------------------------------------*/ +static void xio_handle_task_error(struct xio_task *task) +{ + XIO_TO_RDMA_TASK(task, rdma_task); + XIO_TO_RDMA_HNDL(task, rdma_hndl); + + switch (rdma_task->out_ib_op) { + case XIO_IB_RECV: + /* this should be the Flush, no task has been created yet */ + xio_rdma_rx_error_handler(rdma_hndl, task); + break; + case XIO_IB_SEND: + /* the task should be completed now */ + xio_rdma_tx_error_handler(rdma_hndl, task); + break; + case XIO_IB_RDMA_READ: + case XIO_IB_RDMA_READ_DIRECT: + xio_rdma_rd_error_handler(rdma_hndl, task); + break; + case XIO_IB_RDMA_WRITE: + case XIO_IB_RDMA_WRITE_DIRECT: + xio_rdma_wr_error_handler(rdma_hndl, task); + break; + default: + ERROR_LOG("unknown out_ib_op: task:%p, type:0x%x, " \ + "magic:0x%x, out_ib_op:0x%x\n", + task, task->tlv_type, + task->magic, rdma_task->out_ib_op); + break; + } +} + +/*---------------------------------------------------------------------------*/ +/* xio_handle_wc_error */ +/*---------------------------------------------------------------------------*/ +static void xio_handle_wc_error(struct ib_wc *wc) +{ + struct xio_task *task = NULL; + struct xio_rdma_task *rdma_task = NULL; + struct xio_rdma_transport *rdma_hndl = NULL; + int retval; + + task = (struct xio_task *)ptr_from_int64(wc->wr_id); + if (task && task->dd_data == ptr_from_int64(XIO_BEACON_WRID)) { + rdma_hndl = container_of(task, + struct xio_rdma_transport, + beacon_task); + rdma_hndl->beacon_sent = 0; + TRACE_LOG("beacon rdma_hndl:%p\n", rdma_hndl); + kref_put(&rdma_hndl->base.kref, xio_rdma_close_cb); + return; + } else if (task && task->dd_data == ptr_from_int64(XIO_FRWR_LI_WRID)) { + ERROR_LOG("frwr are not signaled rdma_hndl:%p\n", rdma_hndl); + return; + } + + if (wc->wr_id) { + task = ptr_from_int64(wc->wr_id); + rdma_task = (struct xio_rdma_task *)task->dd_data; + rdma_hndl = (struct xio_rdma_transport *)task->context; + rdma_hndl->sqe_avail += rdma_task->sqe_used; + rdma_task->sqe_used = 0; + } else { + task = NULL; + } + + if (wc->status == IB_WC_WR_FLUSH_ERR) { + TRACE_LOG("rdma_hndl:%p, rdma_task:%p, task:%p, " \ + "wr_id:0x%llx, " \ + "err:%s, vendor_err:0x%x\n", + rdma_hndl, rdma_task, task, + wc->wr_id, + xio_ib_wc_status_str(wc->status), + wc->vendor_err); + } else { + if (rdma_hndl) + ERROR_LOG("[%s] - state:%d, rdma_hndl:%p, " \ + "rdma_task:%p, task:%p, " \ + "wr_id:0x%llx, " \ + "err:%s, vendor_err:0x%x\n", + rdma_hndl->base.is_client ? + "client" : "server", + rdma_hndl->state, + rdma_hndl, rdma_task, task, + wc->wr_id, + xio_ib_wc_status_str(wc->status), + wc->vendor_err); + else + ERROR_LOG("wr_id:0x%llx, err:%s, vendor_err:0x%x\n", + wc->wr_id, + xio_ib_wc_status_str(wc->status), + wc->vendor_err); + + ERROR_LOG("byte_len=%u, immdata=%u, qp=%p, " \ + "qp_num=0x%x, src_qp=0x%x\n", + wc->byte_len, ntohl(wc->ex.imm_data), + wc->qp, wc->qp ? wc->qp->qp_num : 0xdeadbeaf, + wc->src_qp); + } + if (task && rdma_task) + xio_handle_task_error(task); + + /* temporary */ + if (wc->status != IB_WC_WR_FLUSH_ERR) { + if (rdma_hndl) { + ERROR_LOG("cq error reported. calling " \ + "rdma_disconnect. 
rdma_hndl:%p\n", + rdma_hndl); + retval = rdma_disconnect(rdma_hndl->cm_id); + if (retval) + ERROR_LOG("rdma_hndl:%p rdma_disconnect" \ + "failed, %d\n", rdma_hndl, retval); + } else { + /* TODO: handle each error specifically */ + ERROR_LOG("ASSERT: program abort\n"); + BUG(); + } + } +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_idle_handler */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_idle_handler(struct xio_rdma_transport *rdma_hndl) +{ + if (rdma_hndl->state != XIO_TRANSPORT_STATE_CONNECTED || + !rdma_hndl->primary_pool_cls.task_lookup) + return 0; + + /* Does the local have resources to send message? */ + if (!rdma_hndl->sqe_avail) + return 0; + + /* Try to do some useful work, want to spend time before calling the + * pool, this increase the chance that more messages will arrive + * and request notify will not be necessary + */ + + if (rdma_hndl->kick_rdma_rd_req) + xio_xmit_rdma_rd_req(rdma_hndl); + + if (rdma_hndl->kick_rdma_rd_rsp) + xio_xmit_rdma_rd_rsp(rdma_hndl); + + /* Does the local have resources to send message? + * xio_xmit_rdma_rd may consumed the sqe_avail + */ + if (!rdma_hndl->sqe_avail) + return 0; + + /* Can the peer receive messages? */ + if (!rdma_hndl->peer_credits) + return 0; + + /* If we have real messages to send there is no need for + * a special NOP message as credits are piggybacked + */ + if (rdma_hndl->tx_ready_tasks_num) { + xio_rdma_xmit(rdma_hndl); + return 0; + } + + /* Send NOP if messages are not queued */ + + /* Does the peer have already maximum credits? */ + if (rdma_hndl->sim_peer_credits >= MAX_RECV_WR) + return 0; + + /* Does the local have any credits to send? */ + if (!rdma_hndl->credits) + return 0; + + TRACE_LOG("peer_credits:%d, credits:%d sim_peer_credits:%d\n", + rdma_hndl->peer_credits, rdma_hndl->credits, + rdma_hndl->sim_peer_credits); + + xio_rdma_send_nop(rdma_hndl); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_rx_handler */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_rx_handler(struct xio_rdma_transport *rdma_hndl, + struct xio_task *task) +{ + XIO_TO_RDMA_TASK(task, rdma_task); + struct xio_task *task1, *task2; + int must_send = 0; + struct xio_work_req *rxd = &rdma_task->rxd; + struct list_head *task_prev; + int retval; + + /* prefetch next buffer */ + if (likely(task->tasks_list_entry.next != + task->tasks_list_entry.prev)) { + task1 = list_entry(task->tasks_list_entry.next, + struct xio_task, tasks_list_entry); + task_prev = task->tasks_list_entry.prev; + xio_prefetch(task1->mbuf.buf.head); + } else { + task1 = NULL; + task_prev = NULL; + } + + rdma_hndl->rqe_avail--; + rdma_hndl->sim_peer_credits--; + + /* unmap dma */ + xio_unmap_rx_work_req(rdma_hndl->dev, rxd); + if (rdma_task->read_mem_desc.nents && rdma_task->read_mem_desc.mapped) + xio_unmap_desc(rdma_hndl, &rdma_task->read_mem_desc, + DMA_FROM_DEVICE); + + if (rdma_task->write_mem_desc.nents && rdma_task->write_mem_desc.mapped) + xio_unmap_desc(rdma_hndl, &rdma_task->write_mem_desc, + DMA_TO_DEVICE); + + /* rearm the receive queue */ + /* + if ((rdma_hndl->state == XIO_TRANSPORT_STATE_CONNECTED) && + (rdma_hndl->rqe_avail <= rdma_hndl->rq_depth + 1)) + xio_rdma_rearm_rq(rdma_hndl); + */ + retval = xio_mbuf_read_first_tlv(&task->mbuf); + + task->tlv_type = xio_mbuf_tlv_type(&task->mbuf); + + list_move_tail(&task->tasks_list_entry, 
&rdma_hndl->io_list); + + /* call recv completion */ + switch (task->tlv_type) { + case XIO_CREDIT_NOP: + xio_rdma_on_recv_nop(rdma_hndl, task); + if (rdma_hndl->rqe_avail <= rdma_hndl->rq_depth + 1) + xio_rdma_rearm_rq(rdma_hndl); + must_send = 1; + break; + case XIO_RDMA_READ_ACK: + xio_rdma_on_recv_rdma_read_ack(rdma_hndl, task); + if (rdma_hndl->rqe_avail <= rdma_hndl->rq_depth + 1) + xio_rdma_rearm_rq(rdma_hndl); + must_send = 1; + break; + case XIO_NEXUS_SETUP_REQ: + case XIO_NEXUS_SETUP_RSP: + xio_rdma_on_setup_msg(rdma_hndl, task); + break; + case XIO_CANCEL_REQ: + xio_rdma_on_recv_cancel_req(rdma_hndl, task); + break; + case XIO_CANCEL_RSP: + xio_rdma_on_recv_cancel_rsp(rdma_hndl, task); + break; + default: + /* rearm the receive queue */ + if (rdma_hndl->rqe_avail <= rdma_hndl->rq_depth + 1) + xio_rdma_rearm_rq(rdma_hndl); + if (IS_REQUEST(task->tlv_type)) + xio_rdma_on_recv_req(rdma_hndl, task); + else if (IS_RESPONSE(task->tlv_type)) + xio_rdma_on_recv_rsp(rdma_hndl, task); + else + ERROR_LOG("unknown message type:0x%x\n", + task->tlv_type); + break; + } + /* + if (rdma_hndl->state != XIO_TRANSPORT_STATE_CONNECTED) + return retval; + */ + /* transmit ready packets */ + if (!must_send && rdma_hndl->tx_ready_tasks_num) + must_send = (tx_window_sz(rdma_hndl) >= SEND_THRESHOLD); + /* resource are now available and rdma rd requests are pending kick + * them + */ + if (rdma_hndl->kick_rdma_rd_req) + xio_xmit_rdma_rd_req(rdma_hndl); + + if (rdma_hndl->kick_rdma_rd_rsp) + xio_xmit_rdma_rd_rsp(rdma_hndl); + + if (must_send) + xio_rdma_xmit(rdma_hndl); + + /* prefetch next buffer */ + if (task1) { + if (task1->tasks_list_entry.next != task_prev) { + task2 = list_entry(task1->tasks_list_entry.next, + struct xio_task, tasks_list_entry); + xio_prefetch(task2); + } + } + + return retval; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_tx_comp_handler */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_tx_comp_handler(struct xio_rdma_transport *rdma_hndl, + struct xio_task *task) +{ + struct xio_task *ptask, *next_ptask; + struct xio_rdma_task *rdma_task; + int found = 0; + int removed = 0; + struct xio_work_req *txd, *rdmad; + + /* If we got a completion, it means all the previous tasks should've + been sent by now - due to ordering */ + list_for_each_entry_safe(ptask, next_ptask, &rdma_hndl->in_flight_list, + tasks_list_entry) { + list_move_tail(&ptask->tasks_list_entry, + &rdma_hndl->tx_comp_list); + removed++; + rdma_task = ptask->dd_data; + + txd = &rdma_task->txd; + + /* unmap dma */ + xio_unmap_tx_work_req(rdma_hndl->dev, txd); + + rdma_hndl->sqe_avail++; + rdma_hndl->sqe_avail += rdma_task->sqe_used; + rdma_task->sqe_used = 0; + + /* phantom task */ + if (rdma_task->phantom_idx) { + xio_tasks_pool_put(ptask); + continue; + } + + /* rdma wr utilizes two wqe but appears only once in the + * in flight list + */ + if (rdma_task->out_ib_op == XIO_IB_RDMA_WRITE) + rdma_hndl->sqe_avail++; + + if (IS_RDMA_RD_ACK(ptask->tlv_type)) { + rdma_hndl->rsps_in_flight_nr--; + xio_tasks_pool_put(ptask); + } else if (IS_REQUEST(ptask->tlv_type)) { + rdma_hndl->max_sn++; + rdma_hndl->reqs_in_flight_nr--; + xio_rdma_on_req_send_comp(rdma_hndl, ptask); + xio_tasks_pool_put(ptask); + } else if (IS_RESPONSE(ptask->tlv_type)) { + rdmad = &rdma_task->rdmad; + /* unmap dma */ + /* Need to handle FMR/FRWR */ + if (rdma_task->out_ib_op == XIO_IB_RDMA_WRITE) + xio_unmap_txmad_work_req(rdma_hndl->dev, rdmad); + 
else + xio_unmap_rxmad_work_req(rdma_hndl->dev, rdmad); + + if (rdma_task->read_mem_desc.nents && + rdma_task->read_mem_desc.mapped) + xio_unmap_desc(rdma_hndl, + &rdma_task->read_mem_desc, + DMA_FROM_DEVICE); + + if (rdma_task->write_mem_desc.nents && + rdma_task->write_mem_desc.mapped) + xio_unmap_desc(rdma_hndl, + &rdma_task->write_mem_desc, + DMA_TO_DEVICE); + + rdma_hndl->max_sn++; + rdma_hndl->rsps_in_flight_nr--; + xio_rdma_on_rsp_send_comp(rdma_hndl, ptask); + } else if (IS_NOP(ptask->tlv_type)) { + rdma_hndl->rsps_in_flight_nr--; + xio_tasks_pool_put(ptask); + } else if (ptask->tlv_type == XIO_MSG_TYPE_RDMA) { + rdma_hndl->reqs_in_flight_nr--; + rdmad = &rdma_task->rdmad; + if (rdma_task->out_ib_op == XIO_IB_RDMA_WRITE_DIRECT) { + xio_unmap_txmad_work_req(rdma_hndl->dev, rdmad); + xio_rdma_on_direct_rdma_comp( + rdma_hndl, ptask, + XIO_WC_OP_RDMA_WRITE); + xio_tasks_pool_put(ptask); + } + } else { + ERROR_LOG("unexpected task %p tlv %u type:0x%x id:%d " \ + "magic:0x%x\n", + ptask, ptask->tlv_type, rdma_task->out_ib_op, + ptask->ltid, ptask->magic); + continue; + } + if (ptask == task) { + found = 1; + break; + } + } + /* resource are now available and rdma rd requests are pending kick + * them + */ + if (rdma_hndl->kick_rdma_rd_req) + xio_xmit_rdma_rd_req(rdma_hndl); + + if (rdma_hndl->kick_rdma_rd_rsp) + xio_xmit_rdma_rd_rsp(rdma_hndl); + + if (rdma_hndl->tx_ready_tasks_num) + xio_rdma_xmit(rdma_hndl); + + if (!found && removed) + ERROR_LOG("not found but removed %d type:0x%x\n", + removed, task->tlv_type); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* unmap_rdma_rd_task */ +/*---------------------------------------------------------------------------*/ +static void unmap_rdma_rd_task(struct xio_rdma_transport *rdma_hndl, + struct xio_task *task) +{ + XIO_TO_RDMA_TASK(task, rdma_task); + if (rdma_task->rdmad.mapped) + xio_unmap_rxmad_work_req(rdma_hndl->dev, + &rdma_task->rdmad); + + if (rdma_task->read_mem_desc.nents && + rdma_task->read_mem_desc.mapped) + xio_unmap_desc(rdma_hndl, &rdma_task->read_mem_desc, + DMA_FROM_DEVICE); + + if (rdma_task->write_mem_desc.nents && + rdma_task->write_mem_desc.mapped) + xio_unmap_desc(rdma_hndl, &rdma_task->write_mem_desc, + DMA_TO_DEVICE); +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_rd_req_comp_handler */ +/*---------------------------------------------------------------------------*/ +static void xio_direct_rdma_rd_comp_handler( + struct xio_rdma_transport *rdma_hndl, + struct xio_task *task) +{ + XIO_TO_RDMA_TASK(task, rdma_task); + + rdma_hndl->sqe_avail++; + rdma_hndl->sqe_avail += rdma_task->sqe_used; + rdma_task->sqe_used = 0; + + if (rdma_task->phantom_idx == 0) { + rdma_hndl->reqs_in_flight_nr--; + xio_rdma_on_direct_rdma_comp(rdma_hndl, task, + XIO_WC_OP_RDMA_READ); + + unmap_rdma_rd_task(rdma_hndl, task); + } else { + xio_tasks_pool_put(task); + xio_xmit_rdma_rd_req(rdma_hndl); + } +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_rd_req_comp_handler */ +/*---------------------------------------------------------------------------*/ +static void xio_rdma_rd_req_comp_handler(struct xio_rdma_transport *rdma_hndl, + struct xio_task *task) +{ + XIO_TO_RDMA_TASK(task, rdma_task); + union xio_transport_event_data event_data; + struct xio_transport_base *transport = + (struct xio_transport_base *)rdma_hndl; + + rdma_hndl->rdma_rd_req_in_flight--; + + rdma_hndl->sqe_avail++; + 
rdma_hndl->sqe_avail += rdma_task->sqe_used; + rdma_task->sqe_used = 0; + + if (rdma_task->phantom_idx == 0) { + if (task->state == XIO_TASK_STATE_CANCEL_PENDING) { + TRACE_LOG("[%d] - **** message is canceled\n", + rdma_task->sn); + xio_rdma_cancel_rsp(transport, task, XIO_E_MSG_CANCELED, + NULL, 0); + xio_tasks_pool_put(task); + xio_xmit_rdma_rd_req(rdma_hndl); + if (rdma_task->rdmad.mapped) + xio_unmap_rxmad_work_req(rdma_hndl->dev, + &rdma_task->rdmad); + return; + } + + list_move_tail(&task->tasks_list_entry, &rdma_hndl->io_list); + + xio_xmit_rdma_rd_req(rdma_hndl); + + unmap_rdma_rd_task(rdma_hndl, task); + + /* fill notification event */ + event_data.msg.op = XIO_WC_OP_RECV; + event_data.msg.task = task; + + xio_transport_notify_observer(&rdma_hndl->base, + XIO_TRANSPORT_EVENT_NEW_MESSAGE, + &event_data); + + while (rdma_hndl->rdma_rd_req_in_flight) { + task = list_first_entry( + &rdma_hndl->rdma_rd_req_in_flight_list, + struct xio_task, tasks_list_entry); + + rdma_task = task->dd_data; + + if (rdma_task->out_ib_op != XIO_IB_RECV) + break; + + /* tasks that arrived in Send/Receive while pending + * "RDMA READ" tasks were in flight was fenced. + */ + rdma_hndl->rdma_rd_req_in_flight--; + list_move_tail(&task->tasks_list_entry, + &rdma_hndl->io_list); + event_data.msg.op = XIO_WC_OP_RECV; + event_data.msg.task = task; + + xio_transport_notify_observer( + &rdma_hndl->base, + XIO_TRANSPORT_EVENT_NEW_MESSAGE, + &event_data); + } + } else { + xio_tasks_pool_put(task); + xio_xmit_rdma_rd_req(rdma_hndl); + } +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_rd_rsp_comp_handler */ +/*---------------------------------------------------------------------------*/ +static void xio_rdma_rd_rsp_comp_handler(struct xio_rdma_transport *rdma_hndl, + struct xio_task *task) +{ + XIO_TO_RDMA_TASK(task, rdma_task); + union xio_transport_event_data event_data; + struct xio_transport_base *transport = + (struct xio_transport_base *)rdma_hndl; + + rdma_hndl->rdma_rd_rsp_in_flight--; + + rdma_hndl->sqe_avail++; + rdma_hndl->sqe_avail += rdma_task->sqe_used; + rdma_task->sqe_used = 0; + + if (rdma_task->phantom_idx == 0) { + if (task->state == XIO_TASK_STATE_CANCEL_PENDING) { + TRACE_LOG("[%d] - **** message is canceled\n", + rdma_task->sn); + xio_rdma_cancel_rsp(transport, task, XIO_E_MSG_CANCELED, + NULL, 0); + xio_tasks_pool_put(task); + xio_xmit_rdma_rd_rsp(rdma_hndl); + if (rdma_task->rdmad.mapped) + xio_unmap_rxmad_work_req(rdma_hndl->dev, + &rdma_task->rdmad); + return; + } + + list_move_tail(&task->tasks_list_entry, &rdma_hndl->io_list); + + /* notify the peer that it can free resources */ + xio_rdma_send_rdma_read_ack(rdma_hndl, task->rtid); + + xio_xmit_rdma_rd_rsp(rdma_hndl); + + unmap_rdma_rd_task(rdma_hndl, task); + + /* copy from task->in to sender_task->in */ + xio_rdma_post_recv_rsp(task); + + /* fill notification event */ + event_data.msg.op = XIO_WC_OP_RECV; + event_data.msg.task = task; + + xio_transport_notify_observer(&rdma_hndl->base, + XIO_TRANSPORT_EVENT_NEW_MESSAGE, + &event_data); + + while (rdma_hndl->rdma_rd_rsp_in_flight) { + task = list_first_entry( + &rdma_hndl->rdma_rd_rsp_in_flight_list, + struct xio_task, tasks_list_entry); + + rdma_task = task->dd_data; + + if (rdma_task->out_ib_op != XIO_IB_RECV) + break; + + /* tasks that arrived in Send/Receive while pending + * "RDMA READ" tasks were in flight was fenced. 
+ */ + rdma_hndl->rdma_rd_rsp_in_flight--; + list_move_tail(&task->tasks_list_entry, + &rdma_hndl->io_list); + event_data.msg.op = XIO_WC_OP_RECV; + event_data.msg.task = task; + + xio_transport_notify_observer( + &rdma_hndl->base, + XIO_TRANSPORT_EVENT_NEW_MESSAGE, + &event_data); + } + } else { + xio_tasks_pool_put(task); + xio_xmit_rdma_rd_rsp(rdma_hndl); + } +} + +/*---------------------------------------------------------------------------*/ +/* xio_handle_wc */ +/*---------------------------------------------------------------------------*/ +static inline void xio_handle_wc(struct ib_wc *wc, int last_in_rxq) +{ + struct xio_task *task = ptr_from_int64(wc->wr_id); + int opcode = wc->opcode; + + XIO_TO_RDMA_HNDL(task, rdma_hndl); + + /* + TRACE_LOG("received opcode :%s byte_len [%u]\n", + xio_ib_wc_opcode_str(wc->opcode), wc->byte_len); + */ + + switch (opcode) { + case IB_WC_RECV: + task->last_in_rxq = last_in_rxq; + xio_rdma_rx_handler(rdma_hndl, task); + break; + case IB_WC_SEND: + case IB_WC_RDMA_WRITE: + if (opcode == IB_WC_SEND || + (opcode == IB_WC_RDMA_WRITE && + task->tlv_type == XIO_MSG_TYPE_RDMA)) + xio_rdma_tx_comp_handler(rdma_hndl, task); + break; + case IB_WC_RDMA_READ: + task->last_in_rxq = last_in_rxq; + if (IS_REQUEST(task->tlv_type)) + xio_rdma_rd_req_comp_handler(rdma_hndl, task); + else if (IS_RESPONSE(task->tlv_type)) + xio_rdma_rd_rsp_comp_handler(rdma_hndl, task); + else if (task->tlv_type == XIO_MSG_TYPE_RDMA) + xio_direct_rdma_rd_comp_handler(rdma_hndl, task); + else + ERROR_LOG("Unexpected tlv_type %u\n", task->tlv_type); + break; + case IB_WC_LOCAL_INV: + case IB_WC_FAST_REG_MR: + break; + default: + ERROR_LOG("unknown opcode :%s [0x%x]\n", + xio_ib_wc_opcode_str(wc->opcode), wc->opcode); + break; + } +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_poll_completions */ +/*---------------------------------------------------------------------------*/ +void xio_rdma_poll_completions(struct xio_cq *tcq, int timeout_us) +{ + int retval; + int i; + struct xio_task *task; + int last_in_rxq = -1; + int tlv_type; + unsigned long timeout; + unsigned long start_time; + struct ib_wc *wc; + struct xio_rdma_task *rdma_task; + + timeout = usecs_to_jiffies(timeout_us); + + start_time = jiffies; + + while (1) { + retval = ib_poll_cq(tcq->cq, tcq->wc_array_len, tcq->wc_array); + if (likely(retval > 0)) { + wc = &tcq->wc_array[retval - 1]; + for (i = retval - 1; i >= 0; i--) { + if (((wc->opcode == IB_WC_RECV || wc->opcode == IB_WC_RDMA_READ)) && + wc->status == IB_WC_SUCCESS) { + task = (struct xio_task *) + ptr_from_int64(wc->wr_id); + rdma_task = (struct xio_rdma_task *)task->dd_data; + if (!rdma_task->phantom_idx) { + tlv_type = xio_mbuf_read_type( + &task->mbuf); + if (IS_APPLICATION_MSG(tlv_type)) { + last_in_rxq = i; + break; + } + } + } + wc--; + } + wc = &tcq->wc_array[0]; + for (i = 0; i < retval; i++) { + if (likely(wc->status == IB_WC_SUCCESS)) + xio_handle_wc(wc, + (last_in_rxq == i)); + else + xio_handle_wc_error(wc); + wc++; + } + if (time_is_before_eq_jiffies(start_time + timeout)) + break; + if (xio_context_is_loop_stopping(tcq->ctx)) + break; + } else if (retval == 0) { + if (time_is_before_eq_jiffies(start_time + timeout)) + break; + } else { + ERROR_LOG("ib_poll_cq failed. 
(ret=%d %m)\n", retval); + xio_set_error(-retval); + return; + } + } + + retval = ib_req_notify_cq(tcq->cq, IB_CQ_NEXT_COMP); + if (unlikely(retval)) { + /* didn't request IB_CQ_REPORT_MISSED_EVENTS so can't be > 0 */ + xio_set_error(-retval); + ERROR_LOG("ib_req_notify_cq failed. (ret=%d)\n", retval); + return; + } + tcq->num_delayed_arm = 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_cq_event_handler */ +/*---------------------------------------------------------------------------*/ +static int xio_cq_event_handler(struct xio_cq *tcq) +{ + struct xio_task *task; + unsigned long start_time; + u32 budget = MAX_POLL_WC; + int poll_nr, polled; + int retval, tlv_type; + int i, last_in_rxq = -1; + struct ib_wc *wc; + struct xio_rdma_task *rdma_task; + + start_time = jiffies; + +retry: + while (budget) { + poll_nr = min(budget, tcq->wc_array_len); + for (i = 0; i < poll_nr; i++) { + /* don't hold spinlock_irqsave for long */ + retval = ib_poll_cq(tcq->cq, 1, &tcq->wc_array[i]); + if (unlikely(retval <= 0)) + break; + } + polled = i; + budget -= i; + tcq->wqes += i; + + wc = &tcq->wc_array[polled - 1]; + for (i = polled - 1; i >= 0; i--) { + if ((wc->opcode == IB_WC_RECV || wc->opcode == IB_WC_RDMA_READ) && + wc->status == IB_WC_SUCCESS) { + task = (struct xio_task *) + ptr_from_int64(wc->wr_id); + rdma_task = (struct xio_rdma_task *)task->dd_data; + if (!rdma_task->phantom_idx) { + tlv_type = xio_mbuf_read_type(&task->mbuf); + if (IS_APPLICATION_MSG(tlv_type)) { + last_in_rxq = i; + break; + } + } + } + wc--; + } + /* process work completions */ + wc = &tcq->wc_array[0]; + for (i = 0; i < polled; i++) { + if (wc->status == IB_WC_SUCCESS) + xio_handle_wc(wc, + (last_in_rxq == i)); + else + xio_handle_wc_error(wc); + wc++; + } + /* an error or no more work completions */ + if (polled != poll_nr) + break; + + if (time_after(jiffies, start_time)) { + /* time slice exhausted, reschedule */ + xio_cq_data_callback_cont(tcq->cq, tcq); + return 0; + } + } + + /* If we got anything, return quickly, and come again later */ + if (likely(budget != MAX_POLL_WC)) { + /* budget was consumed, reschedule */ + xio_cq_data_callback_cont(tcq->cq, tcq); + return 0; + } + + if (unlikely(tcq->polling_started == 0 && tcq->ctx->polling_timeout)) { + getnstimeofday(&tcq->polling_end_time); + timespec_add_ns(&tcq->polling_end_time, + tcq->ctx->polling_timeout * NSECS_IN_USEC); + tcq->polling_started = 1; + } + + /* If loop was terminated before the budget was consumed + * need to re-arm the CQ + */ + tcq->num_delayed_arm++; + if (tcq->num_delayed_arm < MAX_NUM_DELAYED_ARM) { + /* Let other activities to do some work + * with the hope that events will arrive and + * no interrupt triggering will be required. + * Kind of busy wait + */ + xio_cq_data_callback_cont(tcq->cq, tcq); + return 0; + } + + if (likely(tcq->polling_started)) { + struct timespec ts; + + getnstimeofday(&ts); + if (tcq->polling_end_time.tv_sec > ts.tv_sec || + tcq->polling_end_time.tv_nsec > ts.tv_nsec) { + xio_cq_data_callback_cont(tcq->cq, tcq); + return 0; + } + tcq->polling_started = 0; + } + + tcq->num_delayed_arm = 0; + + /* retries limit reached */ + retval = ib_req_notify_cq(tcq->cq, + IB_CQ_NEXT_COMP | + IB_CQ_REPORT_MISSED_EVENTS); + if (likely(!retval)) + return 0; + + /* if driver supports IB_CQ_REPORT_MISSED_EVENTS + * note budget is not yet consumed + */ + if (retval > 0) + goto retry; + + ERROR_LOG("ib_req_notify_cq failed. 
(err=%d)\n", + retval); + + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_data_handler */ +/*---------------------------------------------------------------------------*/ +void xio_data_handler(void *user_context) +{ + struct xio_cq *tcq = (struct xio_cq *)user_context; + struct xio_rdma_transport *rdma_hndl; + + xio_cq_event_handler(tcq); + + list_for_each_entry(rdma_hndl, &tcq->trans_list, trans_list_entry) { + xio_rdma_idle_handler(rdma_hndl); + } +} + +/*---------------------------------------------------------------------------*/ +/* xio_cq_data_callback_cont (completion) */ +/*---------------------------------------------------------------------------*/ +void xio_cq_data_callback_cont(struct ib_cq *cq, void *cq_context) +{ + struct xio_cq *tcq = (struct xio_cq *)cq_context; + + tcq->scheds++; + /* do it in init time */ + tcq->event_data.handler = xio_data_handler; + tcq->event_data.data = cq_context; + /* tell "poller mechanism" */ + xio_context_add_event(tcq->ctx, &tcq->event_data); +} + +/*---------------------------------------------------------------------------*/ +/* xio_cq_data_callback (completion) */ +/*---------------------------------------------------------------------------*/ +void xio_cq_data_callback(struct ib_cq *cq, void *cq_context) +{ + struct xio_cq *tcq = (struct xio_cq *)cq_context; + + tcq->events++; + /* do it in init time */ + tcq->event_data.handler = xio_data_handler; + tcq->event_data.data = cq_context; + /* tell "poller mechanism" */ + xio_context_add_event(tcq->ctx, &tcq->event_data); +} + +/*---------------------------------------------------------------------------*/ +/* xio_prep_rdma_rd_send_req */ +/*---------------------------------------------------------------------------*/ +static void xio_prep_rdma_rd_send_req(struct xio_task *task, + struct xio_rdma_transport *rdma_hndl, + int signaled) +{ + XIO_TO_RDMA_TASK(task, rdma_task); + struct xio_work_req *rdmad = &rdma_task->rdmad; + + if (unlikely(!rdmad->nents)) { + ERROR_LOG("ZERO nents %s\n", __func__); + return; + } + + if (unlikely(xio_map_rxmad_work_req(rdma_hndl->dev, rdmad))) { + ERROR_LOG("DMA map from device failed\n"); + return; + } + + rdmad->send_wr.num_sge = rdmad->mapped; + rdmad->send_wr.wr_id = uint64_from_ptr(task); + rdmad->send_wr.next = NULL; + rdmad->send_wr.opcode = IB_WR_RDMA_READ; + rdmad->send_wr.send_flags = (signaled ? IB_SEND_SIGNALED : 0); + + /* remote_addr and rkey were set in xio_prep_rdma_op */ +} + +/*---------------------------------------------------------------------------*/ +/* xio_prep_rdma_wr_send_req */ +/*---------------------------------------------------------------------------*/ +static void xio_prep_rdma_wr_send_req(struct xio_task *task, + struct xio_rdma_transport *rdma_hndl, + struct xio_work_req *next_wr, + int signaled) +{ + XIO_TO_RDMA_TASK(task, rdma_task); + struct xio_device *dev = rdma_hndl->dev; + struct xio_work_req *rdmad = &rdma_task->rdmad; + + if (unlikely(!rdmad->nents)) { + ERROR_LOG("ZERO nents %s\n", __func__); + return; + } + + if (unlikely(xio_map_txmad_work_req(dev, rdmad))) { + ERROR_LOG("DMA map to device failed\n"); + return; + } + + rdmad->send_wr.num_sge = rdmad->mapped; + rdmad->send_wr.wr_id = uint64_from_ptr(task); + rdmad->send_wr.next = (next_wr ? &next_wr->send_wr : NULL); + rdmad->send_wr.opcode = IB_WR_RDMA_WRITE; + rdmad->send_wr.send_flags |= (signaled ? 
IB_SEND_SIGNALED : 0); + + /* remote_addr and rkey were set in xio_prep_rdma_op */ +} + +/*---------------------------------------------------------------------------*/ +/* xio_prep_rdma_op */ +/*---------------------------------------------------------------------------*/ +static int xio_prep_rdma_op(struct xio_task *task, + struct xio_rdma_transport *rdma_hndl, + enum xio_ib_op_code xio_out_ib_op, + enum ib_wr_opcode opcode, + struct xio_vmsg *vmsg, + struct xio_sge *rsg_list, size_t rsize, + size_t *out_rsize, + uint32_t op_size, + int max_sge, + int signaled, + struct list_head *target_list, + int tasks_number) +{ + XIO_TO_RDMA_TASK(task, rdma_task); + struct xio_task *tmp_task; + int task_idx; + + struct xio_rdma_task *tmp_rdma_task; + struct xio_work_req *rdmad = &rdma_task->rdmad; + struct xio_task *ptask, *next_ptask; + struct scatterlist *sg = NULL; + struct scatterlist *liov; + struct sg_table *sgtbl; + size_t lsize; + uint64_t laddr; + uint64_t raddr; + uint64_t raddr_base; + uint32_t llen; + uint32_t rlen; + uint32_t rkey; + uint32_t tot_len = 0; + uint32_t int_len = 0; + uint32_t rint_len = 0; + int l = 0, r = 0, k = 0; + LIST_HEAD(tmp_list); + + sgtbl = &vmsg->data_tbl; + + lsize = sgtbl->nents; + liov = sgtbl->sgl; + + r = 0; + rlen = rsg_list[r].length; + raddr = rsg_list[r].addr; + raddr_base = raddr; + rkey = rsg_list[r].stag; + + l = 0; + laddr = uint64_from_ptr(sg_virt(liov)); + llen = liov->length; + /* lkey will be set just after mapping when the ib_sge will be set */ + + k = 0; + + if (unlikely(lsize < 1 || rsize < 1)) { + ERROR_LOG("iovec size < 1 lsize:%zud, rsize:%zud\n", + lsize, rsize); + return -1; + } + + task_idx = tasks_number - 1; + + if (task_idx == 0) { + tmp_task = task; + } else { + /* take new task */ + tmp_task = + xio_tasks_pool_get( + rdma_hndl->phantom_tasks_pool, rdma_hndl); + if (unlikely(!tmp_task)) { + ERROR_LOG("phantom tasks pool is empty\n"); + return -1; + } + } + tmp_rdma_task = (struct xio_rdma_task *)tmp_task->dd_data; + rdmad = &tmp_rdma_task->rdmad; + sg = rdmad->sgt.sgl; + /*sg_init_table(sg, XIO_MAX_IOV);*/ + + while (1) { + if (rlen < llen) { + /* .num_sge will come from rdmad->mapped */ + rdmad->send_wr.wr_id = + uint64_from_ptr(tmp_task); + rdmad->send_wr.next = NULL; + rdmad->send_wr.opcode = opcode; + rdmad->send_wr.send_flags = + (signaled ? 
IB_SEND_SIGNALED : 0); + rdmad->send_wr.wr.rdma.remote_addr = raddr_base; + rdmad->send_wr.wr.rdma.rkey = rkey; + + /* Address is not yet mapped */ + sg_set_page(sg, virt_to_page(laddr), + rlen, offset_in_page(laddr)); + sg_mark_end(sg); + rdmad->last_sg = sg; + rdmad->sgt.nents = k + 1; + rdmad->nents = k + 1; + k = 0; + + tot_len += rlen; + int_len += rlen; + tmp_rdma_task->out_ib_op = xio_out_ib_op; + tmp_rdma_task->phantom_idx = task_idx; + + /* close the task */ + list_move_tail(&tmp_task->tasks_list_entry, &tmp_list); + /* advance the remote index */ + r++; + if (r == rsize) { + liov->length = int_len; + int_len = 0; + l++; + break; + } + task_idx--; + /* Is this the last task */ + if (task_idx) { + /* take new task */ + tmp_task = + xio_tasks_pool_get( + rdma_hndl->phantom_tasks_pool, + rdma_hndl); + if (unlikely(!tmp_task)) { + ERROR_LOG( + "phantom tasks pool is empty\n"); + goto cleanup; + } + } else { + tmp_task = task; + } + + tmp_rdma_task = + (struct xio_rdma_task *)tmp_task->dd_data; + rdmad = &tmp_rdma_task->rdmad; + sg = rdmad->sgt.sgl; + /* sg_init_table(sg, XIO_MAX_IOV); */ + + llen -= rlen; + laddr += rlen; + raddr = rsg_list[r].addr; + rlen = rsg_list[r].length; + rkey = rsg_list[r].stag; + raddr_base = raddr; + } else if (llen < rlen) { + /* Address is not yet mapped */ + sg_set_page(sg, virt_to_page(laddr), + llen, offset_in_page(laddr)); + tot_len += llen; + int_len += llen; + rint_len += llen; + + liov->length = int_len; + int_len = 0; + /* advance the local index */ + l++; + k++; + if (l == lsize || k == max_sge - 1) { + /* .num_sge will come from rdmad->mapped */ + rdmad->send_wr.wr_id = + uint64_from_ptr(tmp_task); + rdmad->send_wr.next = NULL; + rdmad->send_wr.opcode = opcode; + rdmad->send_wr.send_flags = + (signaled ? IB_SEND_SIGNALED : 0); + rdmad->send_wr.wr.rdma.remote_addr = raddr_base; + rdmad->send_wr.wr.rdma.rkey = rkey; + tmp_rdma_task->out_ib_op = xio_out_ib_op; + tmp_rdma_task->phantom_idx = task_idx; + + sg_mark_end(sg); + rdmad->last_sg = sg; + rdmad->sgt.nents = k; + rdmad->nents = k; + + /* close the task */ + list_move_tail(&tmp_task->tasks_list_entry, + &tmp_list); + + if (l == lsize) { + rsg_list[r].length = rint_len; + rint_len = 0; + r++; + break; + } + + /* if we are here then k == max_sge - 1 */ + + task_idx--; + /* Is this the last task */ + if (task_idx) { + /* take new task */ + tmp_task = + xio_tasks_pool_get( + rdma_hndl->phantom_tasks_pool, + rdma_hndl); + if (unlikely(!tmp_task)) { + ERROR_LOG( + "phantom tasks pool is empty\n"); + goto cleanup; + } + } else { + tmp_task = task; + } + + tmp_rdma_task = + (struct xio_rdma_task *)tmp_task->dd_data; + rdmad = &tmp_rdma_task->rdmad; + k = 0; + sg = rdmad->sgt.sgl; + /* sg_init_table(sg, XIO_MAX_IOV); */ + } else { + sg = sg_next(sg); + } + liov = sg_next(liov); + rlen -= llen; + raddr += llen; + laddr = uint64_from_ptr(sg_virt(liov)); + llen = liov->length; + } else { + /* .num_sge will come from rdmad->mapped */ + rdmad->send_wr.wr_id = uint64_from_ptr(tmp_task); + rdmad->send_wr.next = NULL; + rdmad->send_wr.opcode = opcode; + rdmad->send_wr.send_flags = + (signaled ? 
IB_SEND_SIGNALED : 0); + rdmad->send_wr.wr.rdma.remote_addr = raddr_base; + rdmad->send_wr.wr.rdma.rkey = rkey; + + /* Address is not yet mapped */ + sg_set_page(sg, virt_to_page(laddr), + llen, offset_in_page(laddr)); + sg_mark_end(sg); + rdmad->last_sg = sg; + rdmad->sgt.nents = k + 1; + rdmad->nents = k + 1; + k = 0; + + tot_len += llen; + int_len += llen; + rint_len += llen; + tmp_rdma_task->out_ib_op = xio_out_ib_op; + tmp_rdma_task->phantom_idx = task_idx; + + /* close the task */ + list_move_tail(&tmp_task->tasks_list_entry, + &tmp_list); + + liov->length = int_len; + int_len = 0; + rsg_list[r].length = rint_len; + rint_len = 0; + /* advance the remote and local indices */ + r++; + l++; + if ((l == lsize) || (r == rsize)) + break; + liov = sg_next(liov); + + task_idx--; + /* Is this the last task */ + if (task_idx) { + /* take new task */ + tmp_task = + xio_tasks_pool_get( + rdma_hndl->phantom_tasks_pool, + rdma_hndl); + if (unlikely(!tmp_task)) { + ERROR_LOG( + "phantom tasks pool is empty\n"); + goto cleanup; + } + } else { + tmp_task = task; + } + tmp_rdma_task = + (struct xio_rdma_task *)tmp_task->dd_data; + rdmad = &tmp_rdma_task->rdmad; + sg = rdmad->sgt.sgl; + /* sg_init_table(sg, XIO_MAX_IOV); */ + + laddr = uint64_from_ptr(sg_virt(liov)); + llen = liov->length; + + raddr = rsg_list[r].addr; + rlen = rsg_list[r].length; + rkey = rsg_list[r].stag; + raddr_base = raddr; + } + } + sgtbl->nents = l; + sg_mark_end(liov); + *out_rsize = r; + + if (tot_len < op_size) { + ERROR_LOG("iovec exhausted\n"); + goto cleanup; + } + + list_splice_tail(&tmp_list, target_list); + + return 0; +cleanup: + + /* list does not contain the original task */ + list_for_each_entry_safe(ptask, next_ptask, &tmp_list, + tasks_list_entry) { + /* the tmp tasks are returned back to pool */ + xio_tasks_pool_put(ptask); + } + + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* verify_req_send_limits */ +/*---------------------------------------------------------------------------*/ +static int verify_req_send_limits(const struct xio_rdma_transport *rdma_hndl) +{ + if (rdma_hndl->reqs_in_flight_nr + rdma_hndl->rsps_in_flight_nr > + rdma_hndl->max_tx_ready_tasks_num) { + DEBUG_LOG("over limits reqs_in_flight_nr=%u, "\ + "rsps_in_flight_nr=%u, max_tx_ready_tasks_num=%u\n", + rdma_hndl->reqs_in_flight_nr, + rdma_hndl->rsps_in_flight_nr, + rdma_hndl->max_tx_ready_tasks_num); + xio_set_error(EAGAIN); + return -1; + } + + if (rdma_hndl->reqs_in_flight_nr >= + rdma_hndl->max_tx_ready_tasks_num - 1) { + DEBUG_LOG("over limits reqs_in_flight_nr=%u, " \ + "max_tx_ready_tasks_num=%u\n", + rdma_hndl->reqs_in_flight_nr, + rdma_hndl->max_tx_ready_tasks_num); + + xio_set_error(EAGAIN); + return -1; + } + /* tx ready is full - refuse request */ + if (rdma_hndl->tx_ready_tasks_num >= + rdma_hndl->max_tx_ready_tasks_num) { + DEBUG_LOG("over limits tx_ready_tasks_num=%u, "\ + "max_tx_ready_tasks_num=%u\n", + rdma_hndl->tx_ready_tasks_num, + rdma_hndl->max_tx_ready_tasks_num); + xio_set_error(EAGAIN); + return -1; + } + if (rdma_hndl->sqe_avail < 2) { + DEBUG_LOG("rdma_hndl=%p, no sqe_avail=%d\n", + rdma_hndl, rdma_hndl->sqe_avail); + xio_set_error(EAGAIN); + return -1; + } + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* verify_rsp_send_limits */ +/*---------------------------------------------------------------------------*/ +static int verify_rsp_send_limits(const struct xio_rdma_transport *rdma_hndl) +{ + if 
(rdma_hndl->reqs_in_flight_nr + rdma_hndl->rsps_in_flight_nr > + rdma_hndl->max_tx_ready_tasks_num) { + DEBUG_LOG("over limits reqs_in_flight_nr=%u, "\ + "rsps_in_flight_nr=%u, max_tx_ready_tasks_num=%u\n", + rdma_hndl->reqs_in_flight_nr, + rdma_hndl->rsps_in_flight_nr, + rdma_hndl->max_tx_ready_tasks_num); + xio_set_error(EAGAIN); + return -1; + } + + if (rdma_hndl->rsps_in_flight_nr >= + rdma_hndl->max_tx_ready_tasks_num - 1) { + DEBUG_LOG("over limits rsps_in_flight_nr=%u, " \ + "max_tx_ready_tasks_num=%u\n", + rdma_hndl->rsps_in_flight_nr, + rdma_hndl->max_tx_ready_tasks_num); + + xio_set_error(EAGAIN); + return -1; + } + /* tx ready is full - refuse request */ + if (rdma_hndl->tx_ready_tasks_num >= + rdma_hndl->max_tx_ready_tasks_num) { + DEBUG_LOG("over limits tx_ready_tasks_num=%u, "\ + "max_tx_ready_tasks_num=%u\n", + rdma_hndl->tx_ready_tasks_num, + rdma_hndl->max_tx_ready_tasks_num); + xio_set_error(EAGAIN); + return -1; + } + if (rdma_hndl->sqe_avail < 2) { + DEBUG_LOG("rdma_hndl=%p, no sqe_avail=%d\n", + rdma_hndl, rdma_hndl->sqe_avail); + xio_set_error(EAGAIN); + return -1; + } + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* kick_send_and_read */ +/*---------------------------------------------------------------------------*/ +static int kick_send_and_read(struct xio_rdma_transport *rdma_hndl, + struct xio_task *task, + int must_send) +{ + int retval = 0; + + /* transmit only if available */ + if (test_bits(XIO_MSG_FLAG_LAST_IN_BATCH, &task->omsg->flags) || + task->is_control) { + must_send = 1; + } else { + if (tx_window_sz(rdma_hndl) >= SEND_THRESHOLD) + must_send = 1; + } + + /* resource are now available and rdma rd requests are pending kick + * them + */ + if (rdma_hndl->kick_rdma_rd_req) { + retval = xio_xmit_rdma_rd_req(rdma_hndl); + if (retval) { + retval = xio_errno(); + if (retval != EAGAIN) { + ERROR_LOG("xio_xmit_rdma_rd failed. %s\n", + xio_strerror(retval)); + return -1; + } + retval = 0; + } + } + if (rdma_hndl->kick_rdma_rd_rsp) { + retval = xio_xmit_rdma_rd_rsp(rdma_hndl); + if (retval) { + retval = xio_errno(); + if (retval != EAGAIN) { + ERROR_LOG("xio_xmit_rdma_rd_rsp failed. %s\n", + xio_strerror(retval)); + return -1; + } + retval = 0; + } + } + if (must_send) { + retval = xio_rdma_xmit(rdma_hndl); + if (retval) { + retval = xio_errno(); + if (retval != EAGAIN) { + ERROR_LOG("xio_xmit_rdma failed. 
%s\n", + xio_strerror(retval)); + return -1; + } + retval = 0; + } + } + return retval; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_perform_direct_rdma */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_perform_direct_rdma(struct xio_rdma_transport *rdma_hndl, + struct xio_task *task) +{ + enum xio_ib_op_code out_ib_opcode; + enum ib_wr_opcode wr_opcode; + size_t llen; + size_t rsg_out_list_len = 0; + int retval = 0; + int tasks_used = 0; + struct xio_sg_table_ops *sgtbl_ops; + void *sgtbl; + + if (unlikely(verify_req_send_limits(rdma_hndl))) + return -1; + + sgtbl = xio_sg_table_get(&task->omsg->out); + sgtbl_ops = xio_sg_table_ops_get(task->omsg->out.sgl_type); + + llen = tbl_length(sgtbl_ops, sgtbl); + + if (unlikely(task->omsg->rdma.length < llen)) { + ERROR_LOG("peer provided too small iovec\n"); + task->status = XIO_E_REM_USER_BUF_OVERFLOW; + return -1; + } + + retval = xio_validate_rdma_op( + &task->omsg->out, + task->omsg->rdma.rsg_list, + task->omsg->rdma.nents, + llen, + rdma_hndl->max_sge, + &tasks_used); + if (unlikely(retval)) { + ERROR_LOG("failed to validate input scatter lists\n"); + task->status = XIO_E_MSG_INVALID; + return -1; + } + out_ib_opcode = task->omsg->rdma.is_read ? XIO_IB_RDMA_READ_DIRECT : + XIO_IB_RDMA_WRITE_DIRECT; + wr_opcode = task->omsg->rdma.is_read ? IB_WR_RDMA_READ : + IB_WR_RDMA_WRITE; + + retval = xio_prep_rdma_op(task, rdma_hndl, + out_ib_opcode, + wr_opcode, + &task->omsg->out, + task->omsg->rdma.rsg_list, + task->omsg->rdma.nents, + &rsg_out_list_len, + llen, + rdma_hndl->max_sge, + 0, + &rdma_hndl->tx_ready_list, tasks_used); + if (unlikely(retval)) { + ERROR_LOG("failed to allocate tasks\n"); + task->status = XIO_E_NO_BUFS; + return -1; + } + + rdma_hndl->tx_ready_tasks_num += tasks_used; + + return kick_send_and_read(rdma_hndl, task, 0 /* must_send */); +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_write_req_header */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_write_req_header(struct xio_rdma_transport *rdma_hndl, + struct xio_task *task, + struct xio_rdma_req_hdr *req_hdr) +{ + XIO_TO_RDMA_TASK(task, rdma_task); + struct xio_rdma_req_hdr *tmp_req_hdr; + struct xio_sge *tmp_sge; + struct xio_sge sge; + size_t hdr_len; + uint32_t i; + struct ib_device *ib_dev = rdma_hndl->dev->ib_dev; + struct ib_mr *mr = rdma_hndl->dev->mr; /* Need fix + for + FMR/FRWR */ + uint16_t in_num_sge, out_num_sge; + struct xio_sg_table_ops *sgtbl_ops; + void *sgtbl; + void *sg; + + sgtbl = xio_sg_table_get(&task->omsg->in); + sgtbl_ops = xio_sg_table_ops_get(task->omsg->in.sgl_type); + + /* point to transport header */ + xio_mbuf_set_trans_hdr(&task->mbuf); + tmp_req_hdr = xio_mbuf_get_curr_ptr(&task->mbuf); + + /* pack relevant values */ + tmp_req_hdr->version = req_hdr->version; + tmp_req_hdr->flags = req_hdr->flags; + PACK_SVAL(req_hdr, tmp_req_hdr, req_hdr_len); + /* sn shall be coded later */ + /* ack_sn shall be coded later */ + /* credits shall be coded later */ + PACK_LVAL(req_hdr, tmp_req_hdr, ltid); + tmp_req_hdr->in_ib_op = req_hdr->in_ib_op; + tmp_req_hdr->out_ib_op = req_hdr->out_ib_op; + /* In case of FMR/FRWR the remote side will get one element */ + if (rdma_task->read_mem_desc.mem_reg.mem_h) + in_num_sge = 1; + else + in_num_sge = req_hdr->in_num_sge; + + if (rdma_task->write_mem_desc.mem_reg.mem_h) + out_num_sge = 1; + else + 
out_num_sge = req_hdr->out_num_sge; + + tmp_req_hdr->in_num_sge = htons(in_num_sge); + tmp_req_hdr->out_num_sge = htons(out_num_sge); + PACK_SVAL(req_hdr, tmp_req_hdr, ulp_hdr_len); + PACK_SVAL(req_hdr, tmp_req_hdr, ulp_pad_len); + /*remain_data_len is not used */ + PACK_LLVAL(req_hdr, tmp_req_hdr, ulp_imm_len); + + tmp_sge = (void *)((uint8_t *)tmp_req_hdr + + sizeof(struct xio_rdma_req_hdr)); + + /* IN: requester expect small input written via send */ + sg = sge_first(sgtbl_ops, sgtbl); + if (req_hdr->in_ib_op == XIO_IB_SEND) { + for (i = 0; i < req_hdr->in_num_sge; i++) { + sge.addr = 0; + sge.length = sge_length(sgtbl_ops, sg); + sge.stag = 0; + PACK_LLVAL(&sge, tmp_sge, addr); + PACK_LVAL(&sge, tmp_sge, length); + PACK_LVAL(&sge, tmp_sge, stag); + tmp_sge++; + } + } + /* IN: requester expect big input written rdma write */ + if (req_hdr->in_ib_op == XIO_IB_RDMA_WRITE) { + if (rdma_task->read_mem_desc.mem_reg.mem_h) { + /* FMR/FRWR case */ + sge.addr = rdma_task->read_mem_desc.mem_reg.va; + sge.length = rdma_task->read_mem_desc.mem_reg.len; + sge.stag = rdma_task->read_mem_desc.mem_reg.rkey; + PACK_LLVAL(&sge, tmp_sge, addr); + PACK_LVAL(&sge, tmp_sge, length); + PACK_LVAL(&sge, tmp_sge, stag); + tmp_sge++; + } else { + sg = rdma_task->read_mem_desc.sgt.sgl; + for (i = 0; i < req_hdr->in_num_sge; i++) { + sge.addr = ib_sg_dma_address(ib_dev, sg); + sge.length = ib_sg_dma_len(ib_dev, sg); + sge.stag = mr->rkey; + PACK_LLVAL(&sge, tmp_sge, addr); + PACK_LVAL(&sge, tmp_sge, length); + PACK_LVAL(&sge, tmp_sge, stag); + tmp_sge++; + sg = sg_next(sg); + } + } + } + /* OUT: requester want to write data via rdma read */ + if (req_hdr->out_ib_op == XIO_IB_RDMA_READ) { + if (rdma_task->write_mem_desc.mem_reg.mem_h) { + /* FMR/FRWR case */ + sge.addr = rdma_task->write_mem_desc.mem_reg.va; + sge.length = rdma_task->write_mem_desc.mem_reg.len; + sge.stag = rdma_task->write_mem_desc.mem_reg.rkey; + PACK_LLVAL(&sge, tmp_sge, addr); + PACK_LVAL(&sge, tmp_sge, length); + PACK_LVAL(&sge, tmp_sge, stag); + tmp_sge++; + } else { + sg = rdma_task->write_mem_desc.sgt.sgl; + for (i = 0; i < req_hdr->out_num_sge; i++) { + sge.addr = ib_sg_dma_address(ib_dev, sg); + sge.length = ib_sg_dma_len(ib_dev, sg); + sge.stag = mr->rkey; + PACK_LLVAL(&sge, tmp_sge, addr); + PACK_LVAL(&sge, tmp_sge, length); + PACK_LVAL(&sge, tmp_sge, stag); + tmp_sge++; + sg = sg_next(sg); + } + } + } + if (req_hdr->out_ib_op == XIO_IB_SEND) { + for (i = 0; i < req_hdr->out_num_sge; i++) { + sge.addr = 0; + sge.length = sge_length(sgtbl_ops, sg); + sge.stag = 0; + PACK_LLVAL(&sge, tmp_sge, addr); + PACK_LVAL(&sge, tmp_sge, length); + PACK_LVAL(&sge, tmp_sge, stag); + tmp_sge++; + sg = sge_next(sgtbl_ops, sgtbl, sg); + } + } + + hdr_len = sizeof(struct xio_rdma_req_hdr); + hdr_len += sizeof(struct xio_sge) * (in_num_sge + + out_num_sge); + +#ifdef EYAL_TODO + print_hex_dump_bytes("post_send: ", DUMP_PREFIX_ADDRESS, + task->mbuf.curr, + 64); +#endif + xio_mbuf_inc(&task->mbuf, hdr_len); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_read_req_header */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_read_req_header(struct xio_rdma_transport *rdma_hndl, + struct xio_task *task, + struct xio_rdma_req_hdr *req_hdr) +{ + XIO_TO_RDMA_TASK(task, rdma_task); + struct xio_rdma_req_hdr *tmp_req_hdr; + struct xio_sge *tmp_sge; + size_t hdr_len; + int i; + + /* point to transport header */ + xio_mbuf_set_trans_hdr(&task->mbuf); + 
tmp_req_hdr = xio_mbuf_get_curr_ptr(&task->mbuf); + + req_hdr->version = tmp_req_hdr->version; + req_hdr->flags = tmp_req_hdr->flags; + UNPACK_SVAL(tmp_req_hdr, req_hdr, req_hdr_len); + + if (unlikely(req_hdr->req_hdr_len != sizeof(struct xio_rdma_req_hdr))) { + ERROR_LOG( + "header length's read failed. arrived:%d expected:%zud\n", + req_hdr->req_hdr_len, sizeof(struct xio_rdma_req_hdr)); + return -1; + } + UNPACK_SVAL(tmp_req_hdr, req_hdr, sn); + UNPACK_SVAL(tmp_req_hdr, req_hdr, credits); + UNPACK_LVAL(tmp_req_hdr, req_hdr, ltid); + req_hdr->out_ib_op = tmp_req_hdr->out_ib_op; + UNPACK_SVAL(tmp_req_hdr, req_hdr, in_num_sge); + UNPACK_SVAL(tmp_req_hdr, req_hdr, out_num_sge); + UNPACK_SVAL(tmp_req_hdr, req_hdr, ulp_hdr_len); + UNPACK_SVAL(tmp_req_hdr, req_hdr, ulp_pad_len); + + /* remain_data_len not in use */ + UNPACK_LLVAL(tmp_req_hdr, req_hdr, ulp_imm_len); + + tmp_sge = (void *)((uint8_t *)tmp_req_hdr + + sizeof(struct xio_rdma_req_hdr)); + + rdma_task->sn = req_hdr->sn; + + /* params for SEND/RDMA_WRITE */ + for (i = 0; i < req_hdr->in_num_sge; i++) { + UNPACK_LLVAL(tmp_sge, &rdma_task->req_in_sge[i], addr); + UNPACK_LVAL(tmp_sge, &rdma_task->req_in_sge[i], length); + UNPACK_LVAL(tmp_sge, &rdma_task->req_in_sge[i], stag); + tmp_sge++; + } + rdma_task->req_in_num_sge = req_hdr->in_num_sge; + + /* params for SEND/RDMA_READ */ + for (i = 0; i < req_hdr->out_num_sge; i++) { + UNPACK_LLVAL(tmp_sge, &rdma_task->req_out_sge[i], addr); + UNPACK_LVAL(tmp_sge, &rdma_task->req_out_sge[i], length); + UNPACK_LVAL(tmp_sge, &rdma_task->req_out_sge[i], stag); + tmp_sge++; + } + rdma_task->req_out_num_sge = req_hdr->out_num_sge; + + hdr_len = sizeof(struct xio_rdma_req_hdr); + hdr_len += sizeof(struct xio_sge) * (req_hdr->in_num_sge + + req_hdr->out_num_sge); + + xio_mbuf_inc(&task->mbuf, hdr_len); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_write_rsp_header */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_write_rsp_header(struct xio_rdma_transport *rdma_hndl, + struct xio_task *task, + struct xio_rdma_rsp_hdr *rsp_hdr) +{ + XIO_TO_RDMA_TASK(task, rdma_task); + struct xio_sge sge; + struct xio_rdma_rsp_hdr *tmp_rsp_hdr; + struct xio_sge *tmp_sge; + struct ib_device *ib_dev = rdma_hndl->dev->ib_dev; + /* Need fix for FMR/FRWR */ + struct ib_mr *mr = rdma_hndl->dev->mr; + void *sg; + size_t hdr_len; + uint32_t *wr_len; + int i; + + /* point to transport header */ + xio_mbuf_set_trans_hdr(&task->mbuf); + tmp_rsp_hdr = xio_mbuf_get_curr_ptr(&task->mbuf); + + /* pack relevant values */ + tmp_rsp_hdr->version = rsp_hdr->version; + tmp_rsp_hdr->flags = rsp_hdr->flags; + PACK_SVAL(rsp_hdr, tmp_rsp_hdr, rsp_hdr_len); + /* sn shall be coded later */ + /* ack_sn shall be coded later */ + /* credits shall be coded later */ + PACK_LVAL(rsp_hdr, tmp_rsp_hdr, rtid); + tmp_rsp_hdr->out_ib_op = rsp_hdr->out_ib_op; + PACK_LVAL(rsp_hdr, tmp_rsp_hdr, status); + PACK_SVAL(rsp_hdr, tmp_rsp_hdr, out_num_sge); + PACK_LVAL(rsp_hdr, tmp_rsp_hdr, ltid); + PACK_SVAL(rsp_hdr, tmp_rsp_hdr, ulp_hdr_len); + PACK_SVAL(rsp_hdr, tmp_rsp_hdr, ulp_pad_len); + /* remain_data_len not in use */ + PACK_LLVAL(rsp_hdr, tmp_rsp_hdr, ulp_imm_len); + + hdr_len = sizeof(struct xio_rdma_rsp_hdr); + + /* OUT: responder want to write data via rdma write */ + if (rsp_hdr->out_ib_op == XIO_IB_RDMA_WRITE) { + wr_len = (uint32_t *)((uint8_t *)tmp_rsp_hdr + + sizeof(struct xio_rdma_rsp_hdr)); + + /* params for RDMA WRITE */ + for 
(i = 0; i < rsp_hdr->out_num_sge; i++) { + *wr_len = htonl(rdma_task->rsp_out_sge[i].length); + wr_len++; + } + hdr_len += sizeof(uint32_t) * rsp_hdr->out_num_sge; + } + if (rsp_hdr->out_ib_op == XIO_IB_RDMA_READ) { + tmp_sge = (struct xio_sge *)((uint8_t *)tmp_rsp_hdr + hdr_len); + + /* OUT: requester want to write data via rdma read */ + if (rdma_task->write_mem_desc.mem_reg.mem_h) { + /* FMR/FRWR case */ + sge.addr = rdma_task->write_mem_desc.mem_reg.va; + sge.length = rdma_task->write_mem_desc.mem_reg.len; + sge.stag = rdma_task->write_mem_desc.mem_reg.rkey; + PACK_LLVAL(&sge, tmp_sge, addr); + PACK_LVAL(&sge, tmp_sge, length); + PACK_LVAL(&sge, tmp_sge, stag); + tmp_sge++; + hdr_len += sizeof(struct xio_sge); + } else { + sg = rdma_task->write_mem_desc.sgt.sgl; + for (i = 0; i < rsp_hdr->out_num_sge; i++) { + sge.addr = ib_sg_dma_address(ib_dev, sg); + sge.length = ib_sg_dma_len(ib_dev, sg); + sge.stag = mr->rkey; + PACK_LLVAL(&sge, tmp_sge, addr); + PACK_LVAL(&sge, tmp_sge, length); + PACK_LVAL(&sge, tmp_sge, stag); + tmp_sge++; + sg = sg_next(sg); + } + hdr_len += + sizeof(struct xio_sge) * rsp_hdr->out_num_sge; + } + } + + xio_mbuf_inc(&task->mbuf, hdr_len); + +#ifdef EYAL_TODO + print_hex_dump_bytes("post_send: ", DUMP_PREFIX_ADDRESS, + task->mbuf.tlv.head, 64); +#endif + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_read_rsp_header */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_read_rsp_header(struct xio_rdma_transport *rdma_hndl, + struct xio_task *task, + struct xio_rdma_rsp_hdr *rsp_hdr) +{ + XIO_TO_RDMA_TASK(task, rdma_task); + struct xio_rdma_rsp_hdr *tmp_rsp_hdr; + struct xio_sge *tmp_sge; + size_t hdr_len; + uint32_t *wr_len; + int i; + + /* point to transport header */ + xio_mbuf_set_trans_hdr(&task->mbuf); + tmp_rsp_hdr = xio_mbuf_get_curr_ptr(&task->mbuf); + + rsp_hdr->version = tmp_rsp_hdr->version; + rsp_hdr->flags = tmp_rsp_hdr->flags; + UNPACK_SVAL(tmp_rsp_hdr, rsp_hdr, rsp_hdr_len); + + if (unlikely(rsp_hdr->rsp_hdr_len != sizeof(struct xio_rdma_rsp_hdr))) { + ERROR_LOG( + "header length's read failed. 
arrived:%d expected:%zu\n", + rsp_hdr->rsp_hdr_len, sizeof(struct xio_rdma_rsp_hdr)); + return -1; + } + UNPACK_SVAL(tmp_rsp_hdr, rsp_hdr, sn); + /* ack_sn not used */ + UNPACK_SVAL(tmp_rsp_hdr, rsp_hdr, credits); + UNPACK_LVAL(tmp_rsp_hdr, rsp_hdr, rtid); + rsp_hdr->out_ib_op = tmp_rsp_hdr->out_ib_op; + UNPACK_LVAL(tmp_rsp_hdr, rsp_hdr, status); + UNPACK_SVAL(tmp_rsp_hdr, rsp_hdr, out_num_sge); + UNPACK_LVAL(tmp_rsp_hdr, rsp_hdr, ltid); + UNPACK_SVAL(tmp_rsp_hdr, rsp_hdr, ulp_hdr_len); + UNPACK_SVAL(tmp_rsp_hdr, rsp_hdr, ulp_pad_len); + /* remain_data_len not in use */ + UNPACK_LLVAL(tmp_rsp_hdr, rsp_hdr, ulp_imm_len); + + hdr_len = sizeof(struct xio_rdma_rsp_hdr); + if (rsp_hdr->out_ib_op == XIO_IB_RDMA_WRITE) { + wr_len = (uint32_t *)((uint8_t *)tmp_rsp_hdr + + sizeof(struct xio_rdma_rsp_hdr)); + + /* params for RDMA WRITE */ + for (i = 0; i < rsp_hdr->out_num_sge; i++) { + rdma_task->rsp_out_sge[i].length = ntohl(*wr_len); + wr_len++; + } + rdma_task->rsp_out_num_sge = rsp_hdr->out_num_sge; + + hdr_len += sizeof(uint32_t) * rsp_hdr->out_num_sge; + } + if (rsp_hdr->out_ib_op == XIO_IB_RDMA_READ) { + tmp_sge = (struct xio_sge *)((uint8_t *)tmp_rsp_hdr + + sizeof(struct xio_rdma_rsp_hdr)); + + /* params for RDMA_READ */ + for (i = 0; i < rsp_hdr->out_num_sge; i++) { + UNPACK_LLVAL(tmp_sge, &rdma_task->req_out_sge[i], + addr); + UNPACK_LVAL(tmp_sge, &rdma_task->req_out_sge[i], + length); + UNPACK_LVAL(tmp_sge, &rdma_task->req_out_sge[i], + stag); + tmp_sge++; + } + rdma_task->req_out_num_sge = i; + hdr_len += sizeof(struct xio_sge) * rsp_hdr->out_num_sge; + } + + xio_mbuf_inc(&task->mbuf, hdr_len); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_prep_req_header */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_prep_req_header(struct xio_rdma_transport *rdma_hndl, + struct xio_task *task, + uint16_t ulp_hdr_len, + uint16_t ulp_pad_len, + uint64_t ulp_imm_len, + uint32_t status) +{ + XIO_TO_RDMA_TASK(task, rdma_task); + struct xio_rdma_req_hdr req_hdr; + + if (unlikely(!IS_REQUEST(task->tlv_type))) { + ERROR_LOG("unknown message type %u\n", task->tlv_type); + return -1; + } + + /* write the headers */ + + /* fill request header */ + req_hdr.version = XIO_REQ_HEADER_VERSION; + req_hdr.req_hdr_len = sizeof(req_hdr); + req_hdr.ltid = task->ltid; + req_hdr.in_ib_op = rdma_task->in_ib_op; + req_hdr.out_ib_op = rdma_task->out_ib_op; + req_hdr.flags = 0; + + if (test_bits(XIO_MSG_FLAG_PEER_WRITE_RSP, &task->omsg_flags)) + set_bits(XIO_MSG_FLAG_PEER_WRITE_RSP, &req_hdr.flags); + else if (test_bits(XIO_MSG_FLAG_LAST_IN_BATCH, &task->omsg_flags)) + set_bits(XIO_MSG_FLAG_LAST_IN_BATCH, &req_hdr.flags); + + req_hdr.ulp_hdr_len = ulp_hdr_len; + req_hdr.ulp_pad_len = ulp_pad_len; + req_hdr.ulp_imm_len = ulp_imm_len; + req_hdr.in_num_sge = rdma_task->req_in_num_sge; + req_hdr.out_num_sge = rdma_task->req_out_num_sge; + + if (rdma_task->in_ib_op != XIO_IB_SEND && + rdma_task->req_in_num_sge > 0) { + unsigned int sqe_used = 0; + + if (xio_map_desc(rdma_hndl, &rdma_task->read_mem_desc, + DMA_FROM_DEVICE, &sqe_used)) + goto cleanup0; + rdma_task->sqe_used += sqe_used; + } + if (rdma_task->out_ib_op != XIO_IB_SEND && + rdma_task->req_out_num_sge > 0) { + unsigned int sqe_used = 0; + + if (xio_map_desc(rdma_hndl, &rdma_task->write_mem_desc, + DMA_TO_DEVICE, &sqe_used)) + goto cleanup1; + rdma_task->sqe_used += sqe_used; + } + + if (xio_rdma_write_req_header(rdma_hndl, task, &req_hdr) != 0) + 
goto cleanup2; + + /* write the payload header */ + if (ulp_hdr_len) { + if (xio_mbuf_write_array( + &task->mbuf, + task->omsg->out.header.iov_base, + task->omsg->out.header.iov_len) != 0) + goto cleanup2; + } + + /* write the pad between header and data */ + if (ulp_pad_len) + xio_mbuf_inc(&task->mbuf, ulp_pad_len); + + return 0; + +cleanup2: + if (rdma_task->out_ib_op != XIO_IB_SEND && + rdma_task->req_out_num_sge > 0) { + xio_unmap_desc(rdma_hndl, + &rdma_task->write_mem_desc, + DMA_TO_DEVICE); + } + +cleanup1: + if (rdma_task->in_ib_op != XIO_IB_SEND && + rdma_task->req_in_num_sge > 0) { + xio_unmap_desc(rdma_hndl, + &rdma_task->read_mem_desc, + DMA_FROM_DEVICE); + } + +cleanup0: + xio_set_error(XIO_E_MSG_SIZE); + ERROR_LOG("xio_rdma_write_req_header failed\n"); + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_prep_rsp_header */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_prep_rsp_header(struct xio_rdma_transport *rdma_hndl, + struct xio_task *task, + uint16_t ulp_hdr_len, + uint16_t ulp_pad_len, + uint64_t ulp_imm_len, + uint32_t status) +{ + XIO_TO_RDMA_TASK(task, rdma_task); + struct xio_rdma_rsp_hdr rsp_hdr; + uint64_t xio_hdr_len; + + if (unlikely(!IS_RESPONSE(task->tlv_type))) { + ERROR_LOG("unknown message type\n"); + return -1; + } + + /* fill response header */ + rsp_hdr.version = XIO_RSP_HEADER_VERSION; + rsp_hdr.rsp_hdr_len = sizeof(rsp_hdr); + rsp_hdr.rtid = task->rtid; + rsp_hdr.ltid = task->ltid; + rsp_hdr.out_ib_op = rdma_task->out_ib_op; + rsp_hdr.flags = XIO_HEADER_FLAG_NONE; + if (rdma_task->out_ib_op == XIO_IB_RDMA_READ) + rsp_hdr.out_num_sge = rdma_task->req_out_num_sge; + else + rsp_hdr.out_num_sge = rdma_task->rsp_out_num_sge; + rsp_hdr.ulp_hdr_len = ulp_hdr_len; + rsp_hdr.ulp_pad_len = ulp_pad_len; + rsp_hdr.ulp_imm_len = ulp_imm_len; + rsp_hdr.status = status; + + if (xio_rdma_write_rsp_header(rdma_hndl, task, &rsp_hdr) != 0) + goto cleanup; + + /* write the payload header */ + if (ulp_hdr_len) { + if (xio_mbuf_write_array( + &task->mbuf, + task->omsg->out.header.iov_base, + task->omsg->out.header.iov_len) != 0) + goto cleanup; + } + + /* write the pad between header and data */ + if (ulp_pad_len) + xio_mbuf_inc(&task->mbuf, ulp_pad_len); + + /* reinit header sgl to proper size (size was updated )*/ + xio_hdr_len = xio_mbuf_get_curr_offset(&task->mbuf); + xio_reinit_header(rdma_task, xio_hdr_len); + + return 0; + +cleanup: + xio_set_error(XIO_E_MSG_SIZE); + ERROR_LOG("xio_rdma_write_rsp_header failed\n"); + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_write_send_data */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_write_send_data(struct xio_task *task) +{ + XIO_TO_RDMA_TASK(task, rdma_task); + + if (xio_vmsg_to_tx_sgt(&task->omsg->out, + &rdma_task->txd.sgt, + &rdma_task->txd.nents)) { + xio_set_error(XIO_E_MSG_SIZE); + ERROR_LOG("xio_vmsg_to_sgt failed\n"); + goto cleanup; + } + + /* No need to add one for the header (internal) */ + /* rdma_task->txd.nents++; */ + + return 0; + +cleanup: + xio_set_error(XIO_E_MSG_SIZE); + ERROR_LOG("xio_rdma_send_msg failed\n"); + return -1; +} + +/* up until testing the feature */ +#undef HAVE_RDMA_READ_RSP + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_prep_rsp_out_data */ 
+/*---------------------------------------------------------------------------*/ +static int xio_rdma_prep_rsp_out_data( + struct xio_rdma_transport *rdma_hndl, + struct xio_task *task) +{ + XIO_TO_RDMA_TASK(task, rdma_task); + struct xio_rdma_rsp_hdr rsp_hdr; + struct xio_sg_table_ops *sgtbl_ops; + struct xio_vmsg *vmsg = &task->omsg->out; + void *sgtbl; + size_t retval; + uint64_t xio_hdr_len; + uint64_t ulp_imm_len; + uint16_t ulp_hdr_len; + uint16_t ulp_pad_len = 0; + int enforce_write_rsp; + + sgtbl = xio_sg_table_get(vmsg); + sgtbl_ops = (struct xio_sg_table_ops *) + xio_sg_table_ops_get(vmsg->sgl_type); + + /* calculate headers */ + ulp_hdr_len = task->omsg->out.header.iov_len; + ulp_imm_len = tbl_length(sgtbl_ops, sgtbl); + + xio_hdr_len = xio_mbuf_get_curr_offset(&task->mbuf); + xio_hdr_len += sizeof(rsp_hdr); + xio_hdr_len += rdma_task->req_in_num_sge * sizeof(struct xio_sge); + + if (g_poptions->inline_xio_data_align && ulp_imm_len) { + uint16_t hdr_len = xio_hdr_len + ulp_hdr_len; + + ulp_pad_len = ALIGN(hdr_len, + g_poptions->inline_xio_data_align) - + hdr_len; + } + + enforce_write_rsp = (task->imsg_flags && + (task->imsg_flags & + XIO_HEADER_FLAG_PEER_WRITE_RSP)); + /* + if (rdma_hndl->max_inline_buf_sz < xio_hdr_len + ulp_hdr_len) { + ERROR_LOG("header size %u exceeds max header %zu\n", + ulp_hdr_len, + rdma_hndl->max_inline_buf_sz - (size_t)xio_hdr_len); + xio_set_error(XIO_E_MSG_SIZE); + goto cleanup; + } + */ + + /* initialize the txd */ + rdma_task->txd.send_wr.num_sge = 1; + + /* Small data is outgoing via SEND unless the requester explicitly + * insisted on RDMA operation and provided resources. + * One sge is reserved for the header + */ + if ((ulp_imm_len == 0) || + (!enforce_write_rsp && + (tbl_nents(sgtbl_ops, sgtbl) <= + (size_t)(rdma_hndl->max_sge - 1)) && + ((xio_hdr_len + ulp_hdr_len + ulp_pad_len + ulp_imm_len) < + (uint64_t)rdma_hndl->max_inline_buf_sz))) { + rdma_task->out_ib_op = XIO_IB_SEND; + /* write xio header to the buffer */ + retval = xio_rdma_prep_rsp_header( + rdma_hndl, task, + ulp_hdr_len, ulp_pad_len, ulp_imm_len, + XIO_E_SUCCESS); + if (retval) + goto cleanup; + + /* if there is data, set it to buffer or directly to the sge */ + if (ulp_imm_len) { + retval = xio_rdma_write_send_data(task); + if (retval) + goto cleanup; + } else { + /* no data at all */ + tbl_set_nents(sgtbl_ops, sgtbl, 0); + rdma_task->txd.nents = 1; + } + } else { + if (rdma_task->req_in_sge[0].addr && + rdma_task->req_in_sge[0].length && + rdma_task->req_in_sge[0].stag) { + /* the data is sent via RDMA_WRITE */ + + /* prepare rdma write */ + xio_sched_rdma_wr_req(rdma_hndl, task); + + /* and the header is sent via SEND */ + /* write xio header to the buffer */ + retval = xio_rdma_prep_rsp_header( + rdma_hndl, task, + ulp_hdr_len, 0, ulp_imm_len, + XIO_E_SUCCESS); + } else { + /* EYAL - the case were requester send request but + * does not provide buffer for response. responder + * tries to send via rdma_write but fail. it converts + * the response to rdma_read. 
responder handle + * rdma_read and finally send ack to release resources + */ +#ifndef HAVE_RDMA_READ_RSP + DEBUG_LOG("partial completion of request due " \ + "to missing, response buffer\n"); + + rdma_task->out_ib_op = XIO_IB_SEND; + + /* the client did not provide buffer for response */ + retval = xio_rdma_prep_rsp_header( + rdma_hndl, task, + ulp_hdr_len, 0, 0, + XIO_E_RSP_BUF_SIZE_MISMATCH); + + tbl_set_nents(sgtbl_ops, sgtbl, 0); +#else + /* the data is outgoing via SEND but the peer will do + * RDMA_READ */ + + /* Only header header in the SEND */ + rdma_task->txd.nents = 1; + + rdma_task->out_ib_op = XIO_IB_RDMA_READ; + + /* user must provided buffers with length for + * RDMA READ */ + if (xio_vmsg_to_sgt( + vmsg, + &rdma_task->write_mem_desc.sgt, + &rdma_task->write_mem_desc.nents) < 0) { + ERROR_LOG("xio_vmsg_to_sgt failed\n"); + goto cleanup1; + } + rdma_task->req_out_num_sge = + rdma_task->write_mem_desc.nents; + rdma_task->sqe_used = 0; + + /* write XIO header to the buffer */ + retval = xio_rdma_prep_req_header(rdma_hndl, task, + ulp_hdr_len, 0, 0, + XIO_E_SUCCESS); + + if (retval) { + ERROR_LOG("Failed to write header\n"); + goto cleanup1; + } + + /* reinit header sgl to proper size + * (size was updated )*/ + xio_hdr_len = xio_mbuf_get_curr_offset(&task->mbuf); + xio_reinit_header(rdma_task, xio_hdr_len); +#endif + } + } + + return 0; +#ifdef HAVE_RDMA_READ_RSP +cleanup1: + xio_mempool_free(&rdma_task->write_mem_desc); + rdma_task->req_out_num_sge = 0; + return -1; +#endif + +cleanup: + xio_set_error(XIO_E_MSG_SIZE); + ERROR_LOG("xio_rdma_send_msg failed\n"); + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_prep_req_out_data */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_prep_req_out_data(struct xio_rdma_transport *rdma_hndl, + struct xio_task *task) +{ + XIO_TO_RDMA_TASK(task, rdma_task); + struct xio_vmsg *vmsg = &task->omsg->out; + uint64_t xio_hdr_len; + uint64_t xio_max_hdr_len; + uint64_t ulp_hdr_len; + uint64_t ulp_pad_len = 0; + uint64_t ulp_imm_len; + size_t retval; + struct xio_sg_table_ops *sgtbl_ops; + void *sgtbl; + int tx_by_sr; + uint32_t nents; + + sgtbl = xio_sg_table_get(&task->omsg->out); + sgtbl_ops = xio_sg_table_ops_get(task->omsg->out.sgl_type); + nents = tbl_nents(sgtbl_ops, sgtbl); + + /* calculate headers */ + ulp_hdr_len = vmsg->header.iov_len; + ulp_imm_len = tbl_length(sgtbl_ops, sgtbl); + + xio_hdr_len = xio_mbuf_get_curr_offset(&task->mbuf); + xio_hdr_len += sizeof(struct xio_rdma_req_hdr); + xio_hdr_len += sizeof(struct xio_sge) * rdma_task->req_in_num_sge; + xio_max_hdr_len = xio_hdr_len + sizeof(struct xio_sge) * nents; + + /* + if (rdma_hndl->max_inline_buf_sz < (xio_hdr_len + ulp_hdr_len)) { + ERROR_LOG("header size %llu exceeds max header %llu\n", + ulp_imm_len, rdma_hndl->max_inline_buf_sz - + xio_hdr_len); + return -1; + } + */ + + if (g_poptions->inline_xio_data_align && ulp_imm_len) { + uint16_t hdr_len = xio_hdr_len + ulp_hdr_len; + + ulp_pad_len = ALIGN(hdr_len, + g_poptions->inline_xio_data_align) - + hdr_len; + } + + /* initialize the txd */ + rdma_task->txd.send_wr.num_sge = 1; + + if (test_bits(XIO_MSG_FLAG_PEER_READ_REQ, &task->omsg_flags) && nents) + tx_by_sr = 0; + else + /* test for using send/receive or rdma_read */ + tx_by_sr = (nents <= (rdma_hndl->max_sge - 1) && + ((ulp_hdr_len + ulp_pad_len + + ulp_imm_len + xio_max_hdr_len) <= + rdma_hndl->max_inline_buf_sz) && + (((int)(ulp_imm_len) <= + 
xio_get_options()->max_inline_xio_data) || + ulp_imm_len == 0)); + + /* the data is outgoing via SEND */ + if (tx_by_sr) { + rdma_task->out_ib_op = XIO_IB_SEND; + /* user has small request - no rdma operation expected */ + rdma_task->req_out_num_sge = 0; + rdma_task->sqe_used = 0; + + /* write xio header to the buffer */ + retval = xio_rdma_prep_req_header( + rdma_hndl, task, + ulp_hdr_len, ulp_pad_len, ulp_imm_len, + XIO_E_SUCCESS); + if (retval) + return -1; + + /* reinit header sgl to proper size (size was updated )*/ + xio_hdr_len = xio_mbuf_get_curr_offset(&task->mbuf); + xio_reinit_header(rdma_task, xio_hdr_len); + + /* if there is data, set it to buffer or directly to the sge */ + if (ulp_imm_len) { + retval = xio_rdma_write_send_data(task); + if (retval) + return -1; + } else { + /* Only header */ + rdma_task->txd.nents = 1; + } + } else { + /* the data is outgoing via SEND but the peer will do + * RDMA_READ + */ + /* Only header header in the SEND */ + rdma_task->txd.nents = 1; + + rdma_task->out_ib_op = XIO_IB_RDMA_READ; + + /* user must provided buffers with length for RDMA READ */ + if (xio_vmsg_to_sgt(vmsg, &rdma_task->write_mem_desc.sgt, + &rdma_task->write_mem_desc.nents) < 0) { + ERROR_LOG("xio_vmsg_to_sgt failed\n"); + goto cleanup; + } + rdma_task->req_out_num_sge = rdma_task->write_mem_desc.nents; + rdma_task->sqe_used = 0; + + /* write XIO header to the buffer */ + retval = xio_rdma_prep_req_header(rdma_hndl, task, + ulp_hdr_len, 0, 0, + XIO_E_SUCCESS); + + if (unlikely(retval)) { + ERROR_LOG("Failed to write header\n"); + goto cleanup; + } + + /* reinit header sgl to proper size (size was updated )*/ + xio_hdr_len = xio_mbuf_get_curr_offset(&task->mbuf); + xio_reinit_header(rdma_task, xio_hdr_len); + } + + return 0; + +cleanup: + xio_mempool_free(&rdma_task->write_mem_desc); + rdma_task->req_out_num_sge = 0; + + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_prep_req_in_data */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_prep_req_in_data(struct xio_rdma_transport *rdma_hndl, + struct xio_task *task) +{ + XIO_TO_RDMA_TASK(task, rdma_task); + size_t hdr_len; + size_t xio_hdr_len; + size_t data_len; + struct xio_vmsg *vmsg = &task->omsg->in; + struct xio_sg_table_ops *sgtbl_ops; + void *sgtbl; + void *sg; + int enforce_write_rsp; + int nents; + int retval; + unsigned int i; + + sgtbl = xio_sg_table_get(&task->omsg->in); + sgtbl_ops = xio_sg_table_ops_get(task->omsg->in.sgl_type); + nents = tbl_nents(sgtbl_ops, sgtbl); + + if (nents == 0) { + rdma_task->in_ib_op = XIO_IB_SEND; + rdma_task->req_in_num_sge = 0; + return 0; + } + data_len = tbl_length(sgtbl_ops, sgtbl); + hdr_len = vmsg->header.iov_len; + if (hdr_len && hdr_len >= rdma_hndl->peer_max_header) { + ERROR_LOG("hdr_len=%zd is bigger than peer_max_reader=%d\n", + hdr_len, rdma_hndl->peer_max_header); + return -1; + } + + /* before working on the out - current place after the session header */ + xio_hdr_len = xio_mbuf_get_curr_offset(&task->mbuf); + xio_hdr_len += sizeof(struct xio_rdma_rsp_hdr); + xio_hdr_len += sizeof(struct xio_sge) * nents; + + /* requester may insist on RDMA for small buffers to eliminate copy + * from receive buffers to user buffers + */ + enforce_write_rsp = task->omsg_flags & XIO_MSG_FLAG_PEER_WRITE_RSP; + if (!(enforce_write_rsp) && + data_len + hdr_len + xio_hdr_len < rdma_hndl->max_inline_buf_sz) { + /* user has small response - no rdma operation expected */ + 
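+		/* the response fits in the peer's inline SEND buffer: it will
+		 * arrive inline with the SEND and be copied or cloned into the
+		 * user buffers on receive */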
rdma_task->in_ib_op = XIO_IB_SEND; + rdma_task->req_in_num_sge = (data_len) ? nents : 0; + } else { + rdma_task->in_ib_op = XIO_IB_RDMA_WRITE; + /* user must provided buffers with length for RDMA WRITE */ + if (xio_vmsg_to_sgt(vmsg, &rdma_task->read_mem_desc.sgt, + &rdma_task->read_mem_desc.nents) < 0) { + ERROR_LOG("xio_vmsg_to_sgt failed\n"); + goto cleanup; + } + + sg = sge_first(sgtbl_ops, sgtbl); + if (!sge_addr(sgtbl_ops, sg)) { + if (unlikely(!rdma_hndl->rdma_mempool)) { + xio_set_error(XIO_E_NO_BUFS); + ERROR_LOG( + "message /read/write failed - " \ + "library's memory pool disabled\n"); + goto cleanup; + } + + /* user did not provide buffers */ + for_each_sge(sgtbl, sgtbl_ops, sg, i) { + retval = xio_mempool_alloc( + rdma_hndl->rdma_mempool, + sge_length(sgtbl_ops, sg), + &rdma_task->read_mem_desc.mp_sge[i]); + + if (unlikely(retval)) { + rdma_task->req_in_num_sge = i; + rdma_task->read_mem_desc.num_sge = i; + xio_set_error(ENOMEM); + ERROR_LOG( + "mempool is empty for %zd bytes\n", + sge_length(sgtbl_ops, sg)); + goto cleanup; + } + sge_set_addr( + sgtbl_ops, sg, + rdma_task->read_mem_desc.mp_sge[i].addr); + } + rdma_task->read_mem_desc.num_sge = nents; + } + rdma_task->req_in_num_sge = rdma_task->read_mem_desc.nents; + } + /* + if (rdma_task->req_in_num_sge > rdma_hndl->peer_max_out_iovsz) { + ERROR_LOG("request in iovlen %d is bigger then peer " \ + "max out iovlen %d\n", + rdma_task->req_in_num_sge, + rdma_hndl->peer_max_out_iovsz); + goto cleanup; + } + */ + return 0; + +cleanup: + xio_mempool_free(&rdma_task->read_mem_desc); + rdma_task->req_in_num_sge = 0; + xio_set_error(EMSGSIZE); + + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_send_req */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_send_req(struct xio_rdma_transport *rdma_hndl, + struct xio_task *task) +{ + XIO_TO_RDMA_TASK(task, rdma_task); + struct scatterlist *sg; + uint64_t payload; + size_t retval; + int i; + int must_send = 0; + size_t sge_len; + + if (verify_req_send_limits(rdma_hndl)) + return -1; + + /* prepare buffer for RDMA response */ + retval = xio_rdma_prep_req_in_data(rdma_hndl, task); + if (unlikely(retval != 0)) { + ERROR_LOG("rdma_prep_req_in_data failed\n"); + return -1; + } + + /* prepare the out message */ + retval = xio_rdma_prep_req_out_data(rdma_hndl, task); + if (unlikely(retval != 0)) { + ERROR_LOG("rdma_prep_req_out_data failed\n"); + return -1; + } + payload = xio_mbuf_tlv_payload_len(&task->mbuf); + + /* add tlv */ + if (unlikely(xio_mbuf_write_tlv(&task->mbuf, task->tlv_type, payload) != 0)) { + ERROR_LOG("write tlv failed\n"); + xio_set_error(EOVERFLOW); + return -1; + } + + /* xio_rdma_prep_req_out_data sets txd.nents */ + /* set the length */ + rdma_task->txd.sgt.sgl[0].length = + xio_mbuf_get_curr_offset(&task->mbuf); + + /* validate header */ + if (unlikely(XIO_TLV_LEN + payload != rdma_task->txd.sgt.sgl[0].length)) { + ERROR_LOG("header validation failed\n"); + return -1; + } + xio_task_addref(task); + + /* check for inline */ + rdma_task->txd.send_wr.send_flags = 0; + + sge_len = 0; + sg = rdma_task->txd.sgt.sgl; + for (i = 0; i < rdma_task->txd.send_wr.num_sge; i++) { + sge_len += sg->length; + sg = sg_next(sg); + } + + if (sge_len < rdma_hndl->max_inline_data) + rdma_task->txd.send_wr.send_flags |= IB_SEND_INLINE; + + if (IS_FIN(task->tlv_type)) { + rdma_task->txd.send_wr.send_flags |= IB_SEND_FENCE; + must_send = 1; + } + + if 
(unlikely(++rdma_hndl->req_sig_cnt >= HARD_CQ_MOD || + task->is_control || + task->omsg->flags & XIO_MSG_FLAG_IMM_SEND_COMP)) { + /* avoid race between send completion and response arrival */ + rdma_task->txd.send_wr.send_flags |= IB_SEND_SIGNALED; + rdma_hndl->req_sig_cnt = 0; + } + + rdma_task->out_ib_op = XIO_IB_SEND; + + list_move_tail(&task->tasks_list_entry, &rdma_hndl->tx_ready_list); + + rdma_hndl->tx_ready_tasks_num++; + + return kick_send_and_read(rdma_hndl, task, must_send); +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_send_rsp */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_send_rsp(struct xio_rdma_transport *rdma_hndl, + struct xio_task *task) +{ + XIO_TO_RDMA_TASK(task, rdma_task); + struct xio_work_req *txd; + struct scatterlist *sg; + size_t retval; + size_t sge_len; + uint64_t payload; + int i; + int must_send = 0; + + if (unlikely(verify_rsp_send_limits(rdma_hndl))) + return -1; + + /* prepare the out message */ + retval = xio_rdma_prep_rsp_out_data(rdma_hndl, task); + if (unlikely(retval != 0)) { + ERROR_LOG("rdma_prep_req_out_data failed\n"); + goto cleanup; + } + + payload = xio_mbuf_tlv_payload_len(&task->mbuf); + + /* add tlv */ + if (xio_mbuf_write_tlv(&task->mbuf, task->tlv_type, payload) != 0) + goto cleanup; + + txd = &rdma_task->txd; + + /* set the length of the header */ + txd->sgt.sgl[0].length = xio_mbuf_get_curr_offset(&task->mbuf); + + /* validate header */ + if (unlikely(XIO_TLV_LEN + payload != txd->sgt.sgl[0].length)) { + ERROR_LOG("header validation failed\n"); + goto cleanup; + } + + txd->send_wr.send_flags = 0; + + /* check for inline */ + if (rdma_task->out_ib_op == XIO_IB_SEND || + rdma_task->out_ib_op == XIO_IB_RDMA_READ) { + sge_len = 0; + sg = txd->sgt.sgl; + for (i = 0; i < txd->send_wr.num_sge; i++) { + sge_len += sg->length; + sg = sg_next(sg); + } + + if (sge_len < rdma_hndl->max_inline_data) + txd->send_wr.send_flags |= IB_SEND_INLINE; + + list_move_tail(&task->tasks_list_entry, + &rdma_hndl->tx_ready_list); + rdma_hndl->tx_ready_tasks_num++; + } + + if (IS_FIN(task->tlv_type)) { + txd->send_wr.send_flags |= IB_SEND_FENCE; + must_send = 1; + } + + if (++rdma_hndl->rsp_sig_cnt >= SOFT_CQ_MOD || + task->is_control || + task->omsg->flags & XIO_MSG_FLAG_IMM_SEND_COMP) { + txd->send_wr.send_flags |= IB_SEND_SIGNALED; + rdma_hndl->rsp_sig_cnt = 0; + } + if (rdma_task->out_ib_op == XIO_IB_RDMA_READ) + xio_task_addref(task); + + return kick_send_and_read(rdma_hndl, task, must_send); +cleanup: + xio_set_error(XIO_E_MSG_SIZE); + ERROR_LOG("xio_rdma_send_msg failed\n"); + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_on_rsp_send_comp */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_on_rsp_send_comp(struct xio_rdma_transport *rdma_hndl, + struct xio_task *task) +{ + XIO_TO_RDMA_TASK(task, rdma_task); + union xio_transport_event_data event_data; + + if (rdma_task->out_ib_op == XIO_IB_RDMA_READ) { + xio_tasks_pool_put(task); + return 0; + } + if (IS_CANCEL(task->tlv_type)) + return 0; + + event_data.msg.op = XIO_WC_OP_SEND; + event_data.msg.task = task; + + xio_transport_notify_observer(&rdma_hndl->base, + XIO_TRANSPORT_EVENT_SEND_COMPLETION, + &event_data); + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_on_req_send_comp */ 
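+/* request send completion: unless this is a cancel TLV, notify the session */
+/* layer with XIO_TRANSPORT_EVENT_SEND_COMPLETION                           */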
+/*---------------------------------------------------------------------------*/ +static int xio_rdma_on_req_send_comp(struct xio_rdma_transport *rdma_hndl, + struct xio_task *task) +{ + union xio_transport_event_data event_data; + + if (IS_CANCEL(task->tlv_type)) + return 0; + + event_data.msg.op = XIO_WC_OP_SEND; + event_data.msg.task = task; + + xio_transport_notify_observer(&rdma_hndl->base, + XIO_TRANSPORT_EVENT_SEND_COMPLETION, + &event_data); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_on_direct_rdma_comp */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_on_direct_rdma_comp(struct xio_rdma_transport *rdma_hndl, + struct xio_task *task, + enum xio_wc_op op) +{ + union xio_transport_event_data event_data; + + event_data.msg.op = op; + event_data.msg.task = task; + xio_transport_notify_observer( + &rdma_hndl->base, + XIO_TRANSPORT_EVENT_DIRECT_RDMA_COMPLETION, + &event_data); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_post_recv_rsp */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_post_recv_rsp(struct xio_task *task) +{ + struct xio_msg *imsg; + struct xio_msg *omsg; + struct xio_sg_table_ops *isgtbl_ops; + void *isgtbl; + struct xio_sg_table_ops *osgtbl_ops; + void *osgtbl; + + omsg = task->sender_task->omsg; + imsg = &task->imsg; + isgtbl = xio_sg_table_get(&imsg->in); + isgtbl_ops = (struct xio_sg_table_ops *) + xio_sg_table_ops_get(imsg->in.sgl_type); + osgtbl = xio_sg_table_get(&omsg->in); + osgtbl_ops = (struct xio_sg_table_ops *) + xio_sg_table_ops_get(omsg->in.sgl_type); + + tbl_clone(osgtbl_ops, osgtbl, isgtbl_ops, isgtbl); + + /* also set bits */ + if (test_bits(XIO_MSG_HINT_ASSIGNED_DATA_IN_BUF, &imsg->hints)) + set_bits(XIO_MSG_HINT_ASSIGNED_DATA_IN_BUF, &omsg->hints); + else + clr_bits(XIO_MSG_HINT_ASSIGNED_DATA_IN_BUF, &omsg->hints); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_on_recv_rsp */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_on_recv_rsp(struct xio_rdma_transport *rdma_hndl, + struct xio_task *task) +{ + XIO_TO_RDMA_TASK(task, rdma_task); + XIO_TO_RDMA_TASK(task, rdma_sender_task); + int retval = 0, i; + union xio_transport_event_data event_data; + struct xio_rdma_rsp_hdr rsp_hdr; + struct xio_msg *imsg; + struct xio_msg *omsg; + void *ulp_hdr; + struct xio_sg_table_ops *isgtbl_ops; + void *isgtbl; + struct xio_sg_table_ops *osgtbl_ops; + void *osgtbl; + void *sg; + struct scatterlist *sgl; + + /* read the response header */ + retval = xio_rdma_read_rsp_header(rdma_hndl, task, &rsp_hdr); + if (retval != 0) { + xio_set_error(XIO_E_MSG_INVALID); + goto cleanup; + } + /* update receive + send window */ + if (rdma_hndl->exp_sn == rsp_hdr.sn) { + rdma_hndl->exp_sn++; + rdma_hndl->ack_sn = rsp_hdr.sn; + rdma_hndl->peer_credits += rsp_hdr.credits; + } else { + ERROR_LOG("ERROR: expected sn:%d, arrived sn:%d\n", + rdma_hndl->exp_sn, rsp_hdr.sn); + } + /* read the sn */ + rdma_task->sn = rsp_hdr.sn; + + /* find the sender task */ + task->sender_task = xio_rdma_primary_task_lookup(rdma_hndl, + rsp_hdr.rtid); + + task->rtid = rsp_hdr.ltid; + rdma_sender_task = task->sender_task->dd_data; + /* mark the sender task as arrived */ + task->sender_task->state = XIO_TASK_STATE_RESPONSE_RECV; + + 
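+	/* the response has arrived for this sender task: its TX work request
+	 * and any still-mapped read/write memory descriptors can be unmapped */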
xio_unmap_tx_work_req(rdma_hndl->dev, &rdma_sender_task->txd); + + if (rdma_sender_task->read_mem_desc.nents && + rdma_sender_task->read_mem_desc.mapped) + xio_unmap_desc(rdma_hndl, &rdma_sender_task->read_mem_desc, + DMA_FROM_DEVICE); + if (rdma_sender_task->write_mem_desc.nents && + rdma_sender_task->write_mem_desc.mapped) + xio_unmap_desc(rdma_hndl, &rdma_sender_task->write_mem_desc, + DMA_TO_DEVICE); + + omsg = task->sender_task->omsg; + imsg = &task->imsg; + isgtbl = xio_sg_table_get(&imsg->in); + isgtbl_ops = xio_sg_table_ops_get(imsg->in.sgl_type); + osgtbl = xio_sg_table_get(&omsg->in); + osgtbl_ops = xio_sg_table_ops_get(omsg->in.sgl_type); + + clr_bits(XIO_MSG_HINT_ASSIGNED_DATA_IN_BUF, &imsg->hints); + + ulp_hdr = xio_mbuf_get_curr_ptr(&task->mbuf); + /* msg from received message */ + if (rsp_hdr.ulp_hdr_len) { + imsg->in.header.iov_base = ulp_hdr; + imsg->in.header.iov_len = rsp_hdr.ulp_hdr_len; + } else { + imsg->in.header.iov_base = NULL; + imsg->in.header.iov_len = 0; + } + + task->status = rsp_hdr.status; + + if (omsg->in.header.iov_base) { + /* copy header to user buffers */ + size_t hdr_len = 0; + + if (imsg->in.header.iov_len > omsg->in.header.iov_len) { + hdr_len = imsg->in.header.iov_len; + task->status = XIO_E_MSG_SIZE; + } else { + hdr_len = omsg->in.header.iov_len; + task->status = XIO_E_SUCCESS; + } + if (hdr_len && imsg->in.header.iov_base) + memcpy(omsg->in.header.iov_base, + imsg->in.header.iov_base, + hdr_len); + else + *((char *)omsg->in.header.iov_base) = 0; + + omsg->in.header.iov_len = hdr_len; + } else { + /* no copy - just pointers */ + memclonev(&omsg->in.header, 1, &imsg->in.header, 1); + } + + /* if data arrived, set the pointers */ + switch (rsp_hdr.out_ib_op) { + case XIO_IB_SEND: + /* This is a completion of RDMA READ can free + * DMA mapping of send buffer (future FMR/FRWR) + */ + xio_unmap_desc(rdma_hndl, + &rdma_sender_task->write_mem_desc, + DMA_TO_DEVICE); + if (rsp_hdr.ulp_imm_len) { + tbl_set_nents(isgtbl_ops, isgtbl, 1); + sg = sge_first(isgtbl_ops, isgtbl); + sge_set_addr(isgtbl_ops, sg, + (ulp_hdr + imsg->in.header.iov_len + + rsp_hdr.ulp_pad_len)); + sge_set_length(isgtbl_ops, sg, + rsp_hdr.ulp_imm_len); + } else { + tbl_set_nents(isgtbl_ops, isgtbl, 0); + } + if (tbl_nents(osgtbl_ops, osgtbl)) { + /* deep copy */ + if (tbl_nents(isgtbl_ops, isgtbl)) { + size_t idata_len = + tbl_length(isgtbl_ops, isgtbl); + size_t odata_len = + tbl_length(osgtbl_ops, osgtbl); + + if (idata_len > odata_len) { + task->status = XIO_E_MSG_SIZE; + xio_reset_desc( + &rdma_sender_task->write_mem_desc); + goto partial_msg; + } else { + task->status = XIO_E_SUCCESS; + } + sg = sge_first(osgtbl_ops, osgtbl); + if (sge_addr(osgtbl_ops, sg)) { + /* user provided buffer so do copy */ + tbl_copy(osgtbl_ops, osgtbl, + isgtbl_ops, isgtbl); + } else { + /* use provided only length - set user + * pointers */ + tbl_clone(osgtbl_ops, osgtbl, + isgtbl_ops, isgtbl); + } + } else { + tbl_set_nents(osgtbl_ops, osgtbl, + tbl_nents(isgtbl_ops, isgtbl)); + } + } else { + tbl_clone(osgtbl_ops, osgtbl, + isgtbl_ops, isgtbl); + } + xio_reset_desc(&rdma_sender_task->write_mem_desc); + break; + case XIO_IB_RDMA_WRITE: + /* This is a completion of RDMA WRITE can free + * DMA mapping of read buffer (future FMR/FRWR) + */ + xio_unmap_desc(rdma_hndl, + &rdma_sender_task->read_mem_desc, + DMA_FROM_DEVICE); + if (rdma_task->rsp_out_num_sge > + rdma_sender_task->req_in_num_sge) { + ERROR_LOG("local in data_iovec is too small %d < %d\n", + rdma_sender_task->req_in_num_sge, + 
rdma_task->rsp_out_num_sge); + xio_reset_desc(&rdma_sender_task->read_mem_desc); + goto partial_msg; + } + tbl_set_nents(isgtbl_ops, isgtbl, + rdma_task->rsp_out_num_sge); + + sg = sge_first(isgtbl_ops, isgtbl); + sgl = rdma_sender_task->read_mem_desc.sgt.sgl; + for (i = 0; i < rdma_task->rsp_out_num_sge; i++) { + sge_set_addr(isgtbl_ops, sg, sg_virt(sgl)); + sge_set_length(isgtbl_ops, sg, + rdma_task->rsp_out_sge[i].length); + sg = sge_next(isgtbl_ops, isgtbl, sg); + sgl = sg_next(sgl); + } + if (tbl_nents(osgtbl_ops, osgtbl)) { + sg = sge_first(osgtbl_ops, osgtbl); + if (sge_addr(osgtbl_ops, sg)) { + void *isg; + struct xio_mp_mem *mp_sge; + + mp_sge = + &rdma_sender_task->read_mem_desc.mp_sge[0]; + /* user provided buffer */ + if (!mp_sge->cache) { + /* user buffers were aligned no + * bounce buffer data was copied + * directly to user buffer need + * to update the buffer length + */ + for_each_sge(isgtbl, + isgtbl_ops, isg, i) { + sge_set_length( + osgtbl_ops, sg, + sge_length( + isgtbl_ops, + isg)); + sg = sge_next(osgtbl_ops, + osgtbl, sg); + } + tbl_set_nents(osgtbl_ops, osgtbl, + tbl_nents(isgtbl_ops, + isgtbl)); + /* also read_mem_desc.sgt must follow + * the same nents => but we are about + * to reset the desc + rdma_sender_task->read_mem_desc.sgt.nents = + tbl_nents(isgtbl_ops, isgtbl); + */ + } else { + /* Bounce buffer */ + tbl_copy(osgtbl_ops, osgtbl, + isgtbl_ops, isgtbl); + /* put bounce buffer back to pool */ + xio_mempool_free( + &rdma_sender_task->read_mem_desc); + rdma_sender_task->req_in_num_sge = 0; + } + } else { + /* use provided only length - set user + * pointers */ + tbl_clone(osgtbl_ops, osgtbl, + isgtbl_ops, isgtbl); + } + } else { + ERROR_LOG("empty out message\n"); + } + xio_reset_desc(&rdma_sender_task->read_mem_desc); + break; + case XIO_IB_RDMA_READ: + /* schedule request for RDMA READ. in case of error + * don't schedule the rdma read operation */ + /*TRACE_LOG("scheduling rdma read\n");*/ + retval = xio_sched_rdma_rd(rdma_hndl, task); + if (retval == 0) + return 0; + ERROR_LOG("scheduling rdma read failed\n"); + break; + default: + ERROR_LOG("%s unexpected op 0x%x\n", __func__, + rsp_hdr.out_ib_op); + break; + } + /* must delay the send due to pending rdma read responses + * if not user will get out of order messages - need fence + */ + if (!list_empty(&rdma_hndl->rdma_rd_rsp_list)) { + list_move_tail(&task->tasks_list_entry, + &rdma_hndl->rdma_rd_rsp_list); + rdma_hndl->kick_rdma_rd_rsp = 1; + return 0; + } + if (rdma_hndl->rdma_rd_rsp_in_flight) { + rdma_hndl->rdma_rd_rsp_in_flight++; + list_move_tail(&task->tasks_list_entry, + &rdma_hndl->rdma_rd_rsp_in_flight_list); + return 0; + } + +partial_msg: + /* fill notification event */ + event_data.msg.op = XIO_WC_OP_RECV; + event_data.msg.task = task; + + /* notify the upper layer of received message */ + xio_transport_notify_observer(&rdma_hndl->base, + XIO_TRANSPORT_EVENT_NEW_MESSAGE, + &event_data); + return 0; + +cleanup: + retval = xio_errno(); + ERROR_LOG("xio_rdma_on_recv_rsp failed. 
(errno=%d %s)\n", + retval, xio_strerror(retval)); + xio_transport_notify_observer_error(&rdma_hndl->base, retval); + + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_sched_rdma_rd */ +/*---------------------------------------------------------------------------*/ +static int xio_sched_rdma_rd(struct xio_rdma_transport *rdma_hndl, + struct xio_task *task) +{ + XIO_TO_RDMA_TASK(task, rdma_task); + int i, retval; + int user_assign_flag = 0; + size_t rlen = 0, llen = 0; + size_t rsg_out_list_len = 0; + int tasks_used = 0; + struct xio_sg_table_ops *sgtbl_ops; + void *sgtbl; + void *sg; + struct list_head *rdma_rd_list; + + /* peer get buffer from pool and do rdma read */ + + /* needed buffer to do rdma read. there are two options: */ + /* option 1: user provides call back that fills application memory */ + /* option 2: use internal buffer pool */ + + /* hint the upper layer of sizes */ + sgtbl = xio_sg_table_get(&task->imsg.in); + sgtbl_ops = xio_sg_table_ops_get(task->imsg.in.sgl_type); + tbl_set_nents(sgtbl_ops, sgtbl, rdma_task->req_out_num_sge); + for_each_sge(sgtbl, sgtbl_ops, sg, i) { + sge_set_addr(sgtbl_ops, sg, NULL); + sge_set_length(sgtbl_ops, sg, + rdma_task->req_out_sge[i].length); + rlen += rdma_task->req_out_sge[i].length; + rdma_task->read_mem_desc.mp_sge[i].cache = NULL; + } + + sgtbl = xio_sg_table_get(&task->imsg.out); + sgtbl_ops = xio_sg_table_ops_get(task->imsg.out.sgl_type); + if (rdma_task->req_in_num_sge) { + tbl_set_nents(sgtbl_ops, sgtbl, rdma_task->req_in_num_sge); + for_each_sge(sgtbl, sgtbl_ops, sg, i) { + sge_set_addr(sgtbl_ops, sg, NULL); + sge_set_length(sgtbl_ops, sg, + rdma_task->req_in_sge[i].length); + rdma_task->write_mem_desc.mp_sge[i].cache = NULL; + } + } else { + tbl_set_nents(sgtbl_ops, sgtbl, 0); + } + sgtbl = xio_sg_table_get(&task->imsg.in); + sgtbl_ops = xio_sg_table_ops_get(task->imsg.in.sgl_type); + + xio_transport_assign_in_buf(&rdma_hndl->base, task, &user_assign_flag); + + if (user_assign_flag) { + /* if user does not have buffers ignore */ + if (tbl_nents(sgtbl_ops, sgtbl) == 0) { + WARN_LOG("application has not provided buffers\n"); + WARN_LOG("rdma read is ignored\n"); + task->status = XIO_E_NO_USER_BUFS; + return -1; + } + for_each_sge(sgtbl, sgtbl_ops, sg, i) { + /* not required since the application can change + * number of SG entries as part of + * assign_data_in_buf() callback and this SG count + * might not match with rdma_task->req_out_num_sge. 
+ */ + /* + rdma_task->read_mem_desc.mp_sge[i].cache = NULL; + */ + if (!sge_addr(sgtbl_ops, sg)) { + ERROR_LOG("application has provided " \ + "null address\n"); + ERROR_LOG("rdma read is ignored\n"); + task->status = XIO_E_NO_USER_BUFS; + return -1; + } + llen += sge_length(sgtbl_ops, sg); + } + if (rlen > llen) { + ERROR_LOG("application provided too small iovec\n"); + ERROR_LOG("remote peer want to write %zd bytes while " \ + "local peer provided buffer size %zd bytes\n", + rlen, llen); + ERROR_LOG("rdma read is ignored\n"); + task->status = XIO_E_USER_BUF_OVERFLOW; + return -1; + } + set_bits(XIO_MSG_HINT_ASSIGNED_DATA_IN_BUF, &task->imsg.hints); + } else { + retval = xio_mp_sge_alloc(rdma_hndl->rdma_mempool, + rdma_task->req_out_sge, + rdma_task->req_out_num_sge, + &rdma_task->read_mem_desc); + if (unlikely(retval)) { + ERROR_LOG("mempool alloc failed\n"); + task->status = ENOMEM; + goto cleanup; + } + + tbl_set_nents(sgtbl_ops, sgtbl, rdma_task->req_out_num_sge); + for_each_sge(sgtbl, sgtbl_ops, sg, i) { + rdma_task->read_mem_desc.mp_sge[i].length = + rdma_task->req_out_sge[i].length; + + sge_set_addr(sgtbl_ops, sg, + rdma_task->read_mem_desc.mp_sge[i].addr); + sge_set_length( + sgtbl_ops, sg, + rdma_task->read_mem_desc.mp_sge[i].length); + llen += rdma_task->read_mem_desc.mp_sge[i].length; + } + rdma_task->req_in_num_sge = rdma_task->req_out_num_sge; + } + + retval = xio_validate_rdma_op(&task->imsg.in, + rdma_task->req_out_sge, + rdma_task->req_out_num_sge, + min(rlen, llen), + rdma_hndl->max_sge, + &tasks_used); + if (retval) { + ERROR_LOG("failed to validate input iovecs, " \ + "rlen=%zu, llen=%zu\n", rlen, llen); + ERROR_LOG("rdma read is ignored\n"); + task->status = XIO_E_MSG_INVALID; + return -1; + } + if (!task->sender_task) + rdma_rd_list = &rdma_hndl->rdma_rd_req_list; + else + rdma_rd_list = &rdma_hndl->rdma_rd_rsp_list; + + retval = xio_prep_rdma_op(task, rdma_hndl, + XIO_IB_RDMA_READ, + IB_WR_RDMA_READ, + &task->imsg.in, + rdma_task->req_out_sge, + rdma_task->req_out_num_sge, + &rsg_out_list_len, + min(rlen, llen), + rdma_hndl->max_sge, + 1, + rdma_rd_list, tasks_used); + if (unlikely(retval)) { + ERROR_LOG("failed to allocate tasks\n"); + ERROR_LOG("rdma read is ignored\n"); + task->status = XIO_E_WRITE_FAILED; + return -1; + } + + if (!task->sender_task) + xio_xmit_rdma_rd_req(rdma_hndl); + else + xio_xmit_rdma_rd_rsp(rdma_hndl); + + return 0; +cleanup: + rdma_task->req_in_num_sge = 0; + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_set_rsp_out_sge */ +/*---------------------------------------------------------------------------*/ +static inline void xio_set_rsp_out_sge(struct xio_task *task, + struct xio_sge *rsg_list, + size_t rsize) +{ + XIO_TO_RDMA_TASK(task, rdma_task); + unsigned int i; + + for (i = 0; i < rsize; i++) + rdma_task->rsp_out_sge[i].length = rsg_list[i].length; + + rdma_task->rsp_out_num_sge = rsize; +} + +/*---------------------------------------------------------------------------*/ +/* xio_sched_rdma_wr_req */ +/*---------------------------------------------------------------------------*/ +static int xio_sched_rdma_wr_req(struct xio_rdma_transport *rdma_hndl, + struct xio_task *task) +{ + XIO_TO_RDMA_TASK(task, rdma_task); + int i, retval = 0; + size_t rlen = 0, llen = 0; + size_t rsg_out_list_len = 0; + int tasks_used = 0; + struct xio_sg_table_ops *sgtbl_ops; + void *sgtbl; + + sgtbl = xio_sg_table_get(&task->omsg->out); + sgtbl_ops = xio_sg_table_ops_get(task->omsg->out.sgl_type); + 
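+	/* llen - bytes carried by the local response; rlen - space the remote
+	 * peer advertised in req_in_sge for the RDMA write */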
+ llen = tbl_length(sgtbl_ops, sgtbl); + + for (i = 0; i < rdma_task->req_in_num_sge; i++) + rlen += rdma_task->req_in_sge[i].length; + + if (unlikely(rlen < llen)) { + ERROR_LOG("peer provided too small iovec\n"); + ERROR_LOG("rdma write is ignored\n"); + task->status = XIO_E_REM_USER_BUF_OVERFLOW; + goto cleanup; + } + retval = xio_validate_rdma_op(&task->omsg->out, + rdma_task->req_in_sge, + rdma_task->req_in_num_sge, + min(rlen, llen), + rdma_hndl->max_sge, + &tasks_used); + if (unlikely(retval)) { + ERROR_LOG("failed to invalidate input iovecs\n"); + ERROR_LOG("rdma write is ignored\n"); + task->status = XIO_E_MSG_INVALID; + goto cleanup; + } + + retval = xio_prep_rdma_op(task, rdma_hndl, + XIO_IB_RDMA_WRITE, + IB_WR_RDMA_WRITE, + &task->omsg->out, + rdma_task->req_in_sge, + rdma_task->req_in_num_sge, + &rsg_out_list_len, + min(rlen, llen), + rdma_hndl->max_sge, + 0, + &rdma_hndl->tx_ready_list, tasks_used); + if (unlikely(retval)) { + ERROR_LOG("failed to allocate tasks\n"); + ERROR_LOG("rdma write is ignored\n"); + task->status = XIO_E_READ_FAILED; + goto cleanup; + } + /* prepare response to peer */ + xio_set_rsp_out_sge(task, rdma_task->req_in_sge, rsg_out_list_len); + + /* xio_prep_rdma_op used splice to transfer "tasks_used" to + * tx_ready_list + */ + rdma_hndl->tx_ready_tasks_num += tasks_used; + return 0; +cleanup: + rdma_task->req_out_num_sge = 0; + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_on_recv_req */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_on_recv_req(struct xio_rdma_transport *rdma_hndl, + struct xio_task *task) +{ + XIO_TO_RDMA_TASK(task, rdma_task); + int retval = 0; + union xio_transport_event_data event_data; + struct xio_rdma_req_hdr req_hdr; + struct xio_msg *imsg; + void *ulp_hdr; + int i; + struct xio_sg_table_ops *sgtbl_ops; + void *sgtbl; + void *sg; + + /* read header */ + retval = xio_rdma_read_req_header(rdma_hndl, task, &req_hdr); + if (unlikely(retval)) { + xio_set_error(XIO_E_MSG_INVALID); + goto cleanup; + } + + if (rdma_hndl->exp_sn == req_hdr.sn) { + rdma_hndl->exp_sn++; + rdma_hndl->ack_sn = req_hdr.sn; + rdma_hndl->peer_credits += req_hdr.credits; + } else { + ERROR_LOG("ERROR: sn expected:%d, sn arrived:%d" \ + " out_ib_op:%u %u %u\n", + rdma_hndl->exp_sn, req_hdr.sn, + req_hdr.out_ib_op, + req_hdr.in_num_sge, req_hdr.out_num_sge); + } + + /* save originator identifier */ + task->imsg_flags = req_hdr.flags; + task->rtid = req_hdr.ltid; + + imsg = &task->imsg; + sgtbl = xio_sg_table_get(&imsg->out); + sgtbl_ops = xio_sg_table_ops_get(imsg->out.sgl_type); + + ulp_hdr = xio_mbuf_get_curr_ptr(&task->mbuf); + + imsg->type = task->tlv_type; + imsg->in.header.iov_len = req_hdr.ulp_hdr_len; + + clr_bits(XIO_MSG_HINT_ASSIGNED_DATA_IN_BUF, &imsg->hints); + + if (req_hdr.ulp_hdr_len) + imsg->in.header.iov_base = ulp_hdr; + else + imsg->in.header.iov_base = NULL; + + /* hint upper layer about expected response */ + if (rdma_task->req_in_num_sge) { + tbl_set_nents(sgtbl_ops, sgtbl, rdma_task->req_in_num_sge); + for_each_sge(sgtbl, sgtbl_ops, sg, i) { + sge_set_addr(sgtbl_ops, sg, NULL); + sge_set_length(sgtbl_ops, sg, + rdma_task->req_in_sge[i].length); + } + } else { + tbl_set_nents(sgtbl_ops, sgtbl, 0); + } + + switch (req_hdr.out_ib_op) { + case XIO_IB_SEND: + sgtbl = xio_sg_table_get(&imsg->in); + sgtbl_ops = xio_sg_table_ops_get(imsg->in.sgl_type); + + if (req_hdr.ulp_imm_len) { + /* incoming data via SEND */ + /* if data 
arrived, set the pointers */ + tbl_set_nents(sgtbl_ops, sgtbl, 1); + sg = sge_first(sgtbl_ops, sgtbl); + sge_set_addr(sgtbl_ops, sg, + (ulp_hdr + imsg->in.header.iov_len + + req_hdr.ulp_pad_len)); + sge_set_length(sgtbl_ops, sg, req_hdr.ulp_imm_len); + } else { + /* no data at all */ + tbl_set_nents(sgtbl_ops, sgtbl, 0); + } + break; + case XIO_IB_RDMA_READ: + /* schedule request for RDMA READ. in case of error + * don't schedule the rdma read operation */ + /* TRACE_LOG("scheduling rdma read\n"); */ + retval = xio_sched_rdma_rd(rdma_hndl, task); + if (retval == 0) + return 0; + ERROR_LOG("scheduling rdma read failed\n"); + break; + default: + ERROR_LOG("unexpected out_ib_op\n"); + xio_set_error(XIO_E_MSG_INVALID); + task->status = XIO_E_MSG_INVALID; + break; + } + + /* must delay the send due to pending rdma read requests + * if not user will get out of order messages - need fence + */ + if (!list_empty(&rdma_hndl->rdma_rd_req_list)) { + list_move_tail(&task->tasks_list_entry, + &rdma_hndl->rdma_rd_req_list); + rdma_hndl->kick_rdma_rd_req = 1; + return 0; + } + if (rdma_hndl->rdma_rd_req_in_flight) { + rdma_hndl->rdma_rd_req_in_flight++; + list_move_tail(&task->tasks_list_entry, + &rdma_hndl->rdma_rd_req_in_flight_list); + return 0; + } + /* fill notification event */ + event_data.msg.op = XIO_WC_OP_RECV; + event_data.msg.task = task; + + xio_transport_notify_observer(&rdma_hndl->base, + XIO_TRANSPORT_EVENT_NEW_MESSAGE, + &event_data); + + return 0; + +cleanup: + retval = xio_errno(); + ERROR_LOG("xio_rdma_on_recv_req failed. (errno=%d %s)\n", retval, + xio_strerror(retval)); + xio_transport_notify_observer_error(&rdma_hndl->base, retval); + + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_write_setup_msg */ +/*---------------------------------------------------------------------------*/ +static void xio_rdma_write_setup_msg(struct xio_rdma_transport *rdma_hndl, + struct xio_task *task, + struct xio_rdma_setup_msg *msg) +{ + struct xio_rdma_setup_msg *tmp_msg; + struct xio_rkey_tbl_pack *ptbl; + struct xio_rkey_tbl *tbl; + int i; + + /* set the mbuf after tlv header */ + xio_mbuf_set_val_start(&task->mbuf); + + /* jump after connection setup header */ + if (rdma_hndl->base.is_client) + xio_mbuf_inc(&task->mbuf, + sizeof(struct xio_nexus_setup_req)); + else + xio_mbuf_inc(&task->mbuf, + sizeof(struct xio_nexus_setup_rsp)); + + tmp_msg = xio_mbuf_get_curr_ptr(&task->mbuf); + + /* pack relevant values */ + PACK_LLVAL(msg, tmp_msg, buffer_sz); + PACK_SVAL(msg, tmp_msg, sq_depth); + PACK_SVAL(msg, tmp_msg, rq_depth); + PACK_SVAL(msg, tmp_msg, credits); + PACK_LVAL(msg, tmp_msg, max_in_iovsz); + PACK_LVAL(msg, tmp_msg, max_out_iovsz); + PACK_SVAL(msg, tmp_msg, rkey_tbl_size); + PACK_LVAL(msg, tmp_msg, max_header_len); + +#ifdef EYAL_TODO + print_hex_dump_bytes("post_send: ", DUMP_PREFIX_ADDRESS, + task->mbuf.tlv.head, + 64); +#endif + xio_mbuf_inc(&task->mbuf, sizeof(struct xio_rdma_setup_msg)); + + if (!msg->rkey_tbl_size) + return; + + tbl = rdma_hndl->rkey_tbl; + ptbl = xio_mbuf_get_curr_ptr(&task->mbuf); + for (i = 0; i < rdma_hndl->rkey_tbl_size; i++) { + PACK_LVAL(tbl, ptbl, old_rkey); + PACK_LVAL(tbl, ptbl, new_rkey); + tbl++; + xio_mbuf_inc(&task->mbuf, sizeof(struct xio_rkey_tbl_pack)); + ptbl = xio_mbuf_get_curr_ptr(&task->mbuf); + } +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_read_setup_msg */ 
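+/* unpack the peer's setup message from the mbuf, including the optional    */
+/* rkey translation table                                                   */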
+/*---------------------------------------------------------------------------*/ +static void xio_rdma_read_setup_msg(struct xio_rdma_transport *rdma_hndl, + struct xio_task *task, + struct xio_rdma_setup_msg *msg) +{ + struct xio_rdma_setup_msg *tmp_msg; + struct xio_rkey_tbl_pack *ptbl; + struct xio_rkey_tbl *tbl; + int i; + + /* set the mbuf after tlv header */ + xio_mbuf_set_val_start(&task->mbuf); + + /* jump after connection setup header */ + if (rdma_hndl->base.is_client) + xio_mbuf_inc(&task->mbuf, + sizeof(struct xio_nexus_setup_rsp)); + else + xio_mbuf_inc(&task->mbuf, + sizeof(struct xio_nexus_setup_req)); + + tmp_msg = xio_mbuf_get_curr_ptr(&task->mbuf); + + /* pack relevant values */ + UNPACK_LLVAL(tmp_msg, msg, buffer_sz); + UNPACK_SVAL(tmp_msg, msg, sq_depth); + UNPACK_SVAL(tmp_msg, msg, rq_depth); + UNPACK_SVAL(tmp_msg, msg, credits); + UNPACK_LVAL(tmp_msg, msg, max_in_iovsz); + UNPACK_LVAL(tmp_msg, msg, max_out_iovsz); + UNPACK_SVAL(tmp_msg, msg, rkey_tbl_size); + UNPACK_LVAL(tmp_msg, msg, max_header_len); + +#ifdef EYAL_TODO + print_hex_dump_bytes("post_send: ", DUMP_PREFIX_ADDRESS, + task->mbuf.curr, + 64); +#endif + xio_mbuf_inc(&task->mbuf, sizeof(struct xio_rdma_setup_msg)); + + if (!msg->rkey_tbl_size) + return; + + rdma_hndl->peer_rkey_tbl = kcalloc(msg->rkey_tbl_size, sizeof(*tbl), + GFP_KERNEL); + if (!rdma_hndl->peer_rkey_tbl) { + ERROR_LOG("calloc failed. (errno=%m)\n"); + xio_strerror(ENOMEM); + msg->rkey_tbl_size = -1; + return; + } + + tbl = rdma_hndl->peer_rkey_tbl; + ptbl = xio_mbuf_get_curr_ptr(&task->mbuf); + for (i = 0; i < msg->rkey_tbl_size; i++) { + UNPACK_LVAL(ptbl, tbl, old_rkey); + UNPACK_LVAL(ptbl, tbl, new_rkey); + tbl++; + xio_mbuf_inc(&task->mbuf, sizeof(struct xio_rkey_tbl_pack)); + ptbl = xio_mbuf_get_curr_ptr(&task->mbuf); + } + rdma_hndl->peer_rkey_tbl_size = msg->rkey_tbl_size; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_send_setup_req */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_send_setup_req(struct xio_rdma_transport *rdma_hndl, + struct xio_task *task) +{ + XIO_TO_RDMA_TASK(task, rdma_task); + uint16_t payload; + struct xio_rdma_setup_msg req; + + req.buffer_sz = xio_rdma_get_inline_buffer_size(); + req.sq_depth = rdma_hndl->sq_depth; + req.rq_depth = rdma_hndl->rq_depth; + req.credits = 0; + req.max_in_iovsz = rdma_options.max_in_iovsz; + req.max_out_iovsz = rdma_options.max_out_iovsz; + req.rkey_tbl_size = rdma_hndl->rkey_tbl_size; + req.max_header_len = g_poptions->max_inline_xio_hdr; + + xio_rdma_write_setup_msg(rdma_hndl, task, &req); + + payload = xio_mbuf_tlv_payload_len(&task->mbuf); + + /* add tlv */ + if (xio_mbuf_write_tlv(&task->mbuf, task->tlv_type, payload) != 0) + return -1; + + /* Only header */ + rdma_task->txd.nents = 1; + rdma_task->txd.sgt.nents = 1; + /* set the length */ + rdma_task->txd.sgt.sgl[0].length = xio_mbuf_data_length(&task->mbuf); + + rdma_task->txd.send_wr.send_flags = IB_SEND_SIGNALED; + if (rdma_task->txd.sgt.sgl[0].length < rdma_hndl->max_inline_data) + rdma_task->txd.send_wr.send_flags |= IB_SEND_INLINE; + + rdma_task->txd.send_wr.next = NULL; + rdma_task->out_ib_op = XIO_IB_SEND; + + /* Map the send */ + if (xio_map_tx_work_req(rdma_hndl->dev, &rdma_task->txd)) { + ERROR_LOG("DMA map to device failed\n"); + return -1; + } + rdma_task->txd.send_wr.num_sge = rdma_task->txd.mapped; + + xio_task_addref(task); + rdma_hndl->reqs_in_flight_nr++; + + list_move_tail(&task->tasks_list_entry, 
&rdma_hndl->in_flight_list); + + rdma_hndl->peer_credits--; + + /* set the lkey prior to sending */ + rdma_task->txd.send_wr.sg_list[0].lkey = rdma_hndl->dev->mr->lkey; + + /* send the setup request */ + xio_post_send(rdma_hndl, &rdma_task->txd, 1); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_send_setup_rsp */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_send_setup_rsp(struct xio_rdma_transport *rdma_hndl, + struct xio_task *task) +{ + XIO_TO_RDMA_TASK(task, rdma_task); + uint16_t payload; + + rdma_hndl->sim_peer_credits += rdma_hndl->credits; + rdma_hndl->setup_rsp.credits = rdma_hndl->credits; + rdma_hndl->setup_rsp.buffer_sz = g_poptions->max_inline_xio_hdr + + g_poptions->max_inline_xio_data + + xio_mbuf_get_curr_offset(&task->mbuf); + rdma_hndl->setup_rsp.max_header_len = g_poptions->max_inline_xio_hdr; + + xio_rdma_write_setup_msg(rdma_hndl, task, &rdma_hndl->setup_rsp); + + rdma_hndl->credits = 0; + rdma_hndl->setup_rsp.max_in_iovsz = rdma_options.max_in_iovsz; + rdma_hndl->setup_rsp.max_out_iovsz = rdma_options.max_out_iovsz; + + payload = xio_mbuf_tlv_payload_len(&task->mbuf); + + /* add tlv */ + if (xio_mbuf_write_tlv(&task->mbuf, task->tlv_type, payload) != 0) + return -1; + + /* Only header */ + rdma_task->txd.nents = 1; + /* set the length */ + rdma_task->txd.sgt.sgl[0].length = xio_mbuf_data_length(&task->mbuf); + + rdma_task->txd.send_wr.send_flags = IB_SEND_SIGNALED; + if (rdma_task->txd.sgt.sgl[0].length < rdma_hndl->max_inline_data) + rdma_task->txd.send_wr.send_flags |= IB_SEND_INLINE; + + rdma_task->txd.send_wr.next = NULL; + rdma_task->out_ib_op = XIO_IB_SEND; + + /* Map the send */ + if (unlikely(xio_map_tx_work_req(rdma_hndl->dev, &rdma_task->txd))) { + ERROR_LOG("DMA map to device failed\n"); + return -1; + } + rdma_task->txd.send_wr.num_sge = rdma_task->txd.mapped; + + rdma_hndl->rsps_in_flight_nr++; + + list_move(&task->tasks_list_entry, &rdma_hndl->in_flight_list); + + rdma_hndl->peer_credits--; + + /* set the lkey prior to sending */ + rdma_task->txd.send_wr.sg_list[0].lkey = rdma_hndl->dev->mr->lkey; + + /* send the setup request */ + xio_post_send(rdma_hndl, &rdma_task->txd, 1); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_on_setup_msg */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_on_setup_msg(struct xio_rdma_transport *rdma_hndl, + struct xio_task *task) +{ + union xio_transport_event_data event_data; + struct xio_rdma_setup_msg *rsp = &rdma_hndl->setup_rsp; + u64 local_buf_size; + + if (rdma_hndl->base.is_client) { + struct xio_task *sender_task = NULL; + + if (!list_empty(&rdma_hndl->in_flight_list)) + sender_task = list_first_entry( + &rdma_hndl->in_flight_list, + struct xio_task, tasks_list_entry); + else if (!list_empty(&rdma_hndl->tx_comp_list)) + sender_task = list_first_entry( + &rdma_hndl->tx_comp_list, + struct xio_task, tasks_list_entry); + else + ERROR_LOG("could not find sender task\n"); + + task->sender_task = sender_task; + if (sender_task && sender_task->dd_data) { + struct xio_rdma_task *rdma_sender_task; + + rdma_sender_task = task->sender_task->dd_data; + xio_unmap_tx_work_req(rdma_hndl->dev, + &rdma_sender_task->txd); + } + xio_rdma_read_setup_msg(rdma_hndl, task, rsp); + /* get the initial credits */ + rdma_hndl->peer_credits += rsp->credits; + } else { + struct xio_rdma_setup_msg req; + + 
xio_rdma_read_setup_msg(rdma_hndl, task, &req); + + /* current implementation is symmetric */ + local_buf_size = xio_rdma_get_inline_buffer_size(); + rsp->buffer_sz = min(req.buffer_sz, local_buf_size); + rsp->sq_depth = max((int)req.sq_depth, rdma_hndl->rq_depth); + rsp->rq_depth = max((int)req.rq_depth, rdma_hndl->sq_depth); + rsp->max_in_iovsz = req.max_in_iovsz; + rsp->max_out_iovsz = req.max_out_iovsz; + rsp->max_header_len = req.max_header_len; + } + + /* save the values */ + rdma_hndl->rq_depth = rsp->rq_depth; + rdma_hndl->actual_rq_depth = rdma_hndl->rq_depth + EXTRA_RQE; + rdma_hndl->sq_depth = rsp->sq_depth; + rdma_hndl->membuf_sz = rsp->buffer_sz; + rdma_hndl->max_inline_buf_sz = rsp->buffer_sz; + rdma_hndl->peer_max_in_iovsz = rsp->max_in_iovsz; + rdma_hndl->peer_max_out_iovsz = rsp->max_out_iovsz; + rdma_hndl->peer_max_header = rsp->max_header_len; + + /* initialize send window */ + rdma_hndl->sn = 0; + rdma_hndl->ack_sn = ~0; + rdma_hndl->credits = 0; + rdma_hndl->max_sn = rdma_hndl->sq_depth; + + /* initialize receive window */ + rdma_hndl->exp_sn = 0; + rdma_hndl->max_exp_sn = 0; + + rdma_hndl->max_tx_ready_tasks_num = rdma_hndl->sq_depth; + rdma_hndl->num_tasks = rdma_hndl->base.ctx->max_conns_per_ctx * + (rdma_hndl->sq_depth + rdma_hndl->actual_rq_depth); + + /* fill notification event */ + event_data.msg.op = XIO_WC_OP_RECV; + event_data.msg.task = task; + + xio_transport_notify_observer(&rdma_hndl->base, + XIO_TRANSPORT_EVENT_NEW_MESSAGE, + &event_data); + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_write_rdma_read_ack_hdr */ +/*---------------------------------------------------------------------------*/ +static void xio_write_rdma_read_ack_hdr(struct xio_rdma_transport *rdma_hndl, + struct xio_task *task, + struct xio_rdma_read_ack_hdr *rra) +{ + struct xio_rdma_read_ack_hdr *tmp_rra; + + xio_mbuf_reset(&task->mbuf); + + /* set start of the tlv */ + if (xio_mbuf_tlv_start(&task->mbuf) != 0) + return; + + /* set the mbuf after tlv header */ + xio_mbuf_set_val_start(&task->mbuf); + + /* get the pointer */ + tmp_rra = (struct xio_rdma_read_ack_hdr *) + xio_mbuf_get_curr_ptr(&task->mbuf); + + /* pack relevant values */ + PACK_SVAL(rra, tmp_rra, hdr_len); + PACK_LVAL(rra, tmp_rra, rtid); + + xio_mbuf_inc(&task->mbuf, sizeof(*rra)); +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_send_rdma_read_ack */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_send_rdma_read_ack(struct xio_rdma_transport *rdma_hndl, + int rtid) +{ + uint64_t payload; + struct xio_task *task; + struct xio_rdma_task *rdma_task; + struct xio_rdma_read_ack_hdr rra = { + .hdr_len = sizeof(rra), + .rtid = rtid, + }; + + task = xio_rdma_primary_task_alloc(rdma_hndl); + if (unlikely(!task)) { + ERROR_LOG("primary tasks pool is empty\n"); + return -1; + } + task->omsg = NULL; + + task->tlv_type = XIO_RDMA_READ_ACK; + rdma_task = (struct xio_rdma_task *)task->dd_data; + + /* write the message */ + xio_write_rdma_read_ack_hdr(rdma_hndl, task, &rra); + + payload = xio_mbuf_tlv_payload_len(&task->mbuf); + + /* add tlv */ + if (xio_mbuf_write_tlv(&task->mbuf, task->tlv_type, payload) != 0) + return -1; + + /* set the length */ + rdma_task->txd.sge[0].length = xio_mbuf_data_length(&task->mbuf); + rdma_task->txd.send_wr.send_flags = 0; + if (rdma_task->txd.sge[0].length < (size_t)rdma_hndl->max_inline_data) + rdma_task->txd.send_wr.send_flags |= 
IB_SEND_INLINE; + + rdma_task->txd.send_wr.next = NULL; + rdma_task->out_ib_op = XIO_IB_SEND; + rdma_task->txd.send_wr.num_sge = 1; + + rdma_hndl->rsps_in_flight_nr++; + list_add_tail(&task->tasks_list_entry, &rdma_hndl->in_flight_list); + + rdma_hndl->peer_credits--; + xio_post_send(rdma_hndl, &rdma_task->txd, 1); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_read_rdma_read_ack_hdr */ +/*---------------------------------------------------------------------------*/ +static void xio_read_rdma_read_ack_hdr(struct xio_rdma_transport *rdma_hndl, + struct xio_task *task, + struct xio_rdma_read_ack_hdr *rra) +{ + struct xio_rdma_read_ack_hdr *tmp_rra; + + /* goto to the first tlv */ + xio_mbuf_reset(&task->mbuf); + + /* set the mbuf after tlv header */ + xio_mbuf_set_val_start(&task->mbuf); + + /* get the pointer */ + tmp_rra = (struct xio_rdma_read_ack_hdr *) + xio_mbuf_get_curr_ptr(&task->mbuf); + + /* pack relevant values */ + UNPACK_SVAL(tmp_rra, rra, hdr_len); + UNPACK_LVAL(tmp_rra, rra, rtid); + + xio_mbuf_inc(&task->mbuf, sizeof(*rra)); +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_on_recv_rdma_read_ack */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_on_recv_rdma_read_ack(struct xio_rdma_transport *rdma_hndl, + struct xio_task *task) +{ + struct xio_rdma_read_ack_hdr rra; + union xio_transport_event_data event_data; + struct xio_task *req_task; + + xio_read_rdma_read_ack_hdr(rdma_hndl, task, &rra); + + /* the rx task is returned back to pool */ + xio_tasks_pool_put(task); + + /* find the sender task */ + req_task = xio_rdma_primary_task_lookup(rdma_hndl, rra.rtid); + + event_data.msg.op = XIO_WC_OP_SEND; + event_data.msg.task = req_task; + + xio_transport_notify_observer(&rdma_hndl->base, + XIO_TRANSPORT_EVENT_SEND_COMPLETION, + &event_data); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_write_nop */ +/*---------------------------------------------------------------------------*/ +static void xio_rdma_write_nop(struct xio_rdma_transport *rdma_hndl, + struct xio_task *task, + struct xio_nop_hdr *nop) +{ + struct xio_nop_hdr *tmp_nop; + + xio_mbuf_reset(&task->mbuf); + + /* set start of the tlv */ + if (xio_mbuf_tlv_start(&task->mbuf) != 0) + return; + + /* set the mbuf after tlv header */ + xio_mbuf_set_val_start(&task->mbuf); + + /* get the pointer */ + tmp_nop = xio_mbuf_get_curr_ptr(&task->mbuf); + + /* pack relevant values */ + PACK_SVAL(nop, tmp_nop, hdr_len); + PACK_SVAL(nop, tmp_nop, sn); + PACK_SVAL(nop, tmp_nop, ack_sn); + PACK_SVAL(nop, tmp_nop, credits); + tmp_nop->opcode = nop->opcode; + tmp_nop->flags = nop->flags; + +#ifdef EYAL_TODO + print_hex_dump_bytes("write_nop: ", DUMP_PREFIX_ADDRESS, + task->mbuf.tlv.head, + 64); +#endif + xio_mbuf_inc(&task->mbuf, sizeof(*nop)); +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_send_nop */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_send_nop(struct xio_rdma_transport *rdma_hndl) +{ + uint64_t payload; + struct xio_task *task; + struct xio_rdma_task *rdma_task; + struct xio_nop_hdr nop = { + .hdr_len = sizeof(nop), + .sn = rdma_hndl->sn, + .ack_sn = rdma_hndl->ack_sn, + .credits = rdma_hndl->credits, + .opcode = 0, + .flags = 0, + }; + + TRACE_LOG("SEND_NOP\n"); + + task = 
xio_rdma_primary_task_alloc(rdma_hndl); + if (unlikely(!task)) { + ERROR_LOG("primary tasks pool is empty\n"); + return -1; + } + + task->omsg = NULL; + + task->tlv_type = XIO_CREDIT_NOP; + rdma_task = (struct xio_rdma_task *)task->dd_data; + + /* write the message */ + xio_rdma_write_nop(rdma_hndl, task, &nop); + rdma_hndl->sim_peer_credits += rdma_hndl->credits; + rdma_hndl->credits = 0; + + payload = xio_mbuf_tlv_payload_len(&task->mbuf); + + /* add tlv */ + if (xio_mbuf_write_tlv(&task->mbuf, task->tlv_type, payload) != 0) + return -1; + + rdma_task->txd.send_wr.next = NULL; + rdma_task->out_ib_op = XIO_IB_SEND; + + rdma_task->txd.nents = 1; + rdma_task->txd.sgt.nents = 1; + /* set the length */ + rdma_task->txd.sgt.sgl[0].length = xio_mbuf_data_length(&task->mbuf); + + rdma_task->txd.send_wr.send_flags = IB_SEND_SIGNALED; + if (rdma_task->txd.sgt.sgl[0].length < rdma_hndl->max_inline_data) + rdma_task->txd.send_wr.send_flags |= IB_SEND_INLINE; + + /* Map the send */ + if (unlikely(xio_map_tx_work_req(rdma_hndl->dev, &rdma_task->txd))) { + ERROR_LOG("DMA map to device failed\n"); + return -1; + } + + rdma_task->txd.send_wr.num_sge = rdma_task->txd.mapped; + + rdma_hndl->rsps_in_flight_nr++; + list_add_tail(&task->tasks_list_entry, &rdma_hndl->in_flight_list); + + rdma_hndl->peer_credits--; + xio_post_send(rdma_hndl, &rdma_task->txd, 1); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_read_nop */ +/*---------------------------------------------------------------------------*/ +static void xio_rdma_read_nop(struct xio_rdma_transport *rdma_hndl, + struct xio_task *task, + struct xio_nop_hdr *nop) +{ + struct xio_nop_hdr *tmp_nop; + + /* goto to the first tlv */ + xio_mbuf_reset(&task->mbuf); + + /* set the mbuf after tlv header */ + xio_mbuf_set_val_start(&task->mbuf); + + /* get the pointer */ + tmp_nop = xio_mbuf_get_curr_ptr(&task->mbuf); + + /* pack relevant values */ + UNPACK_SVAL(tmp_nop, nop, hdr_len); + UNPACK_SVAL(tmp_nop, nop, sn); + UNPACK_SVAL(tmp_nop, nop, ack_sn); + UNPACK_SVAL(tmp_nop, nop, credits); + nop->opcode = tmp_nop->opcode; + nop->flags = tmp_nop->flags; + +#ifdef EYAL_TODO + print_hex_dump_bytes("post_send: ", DUMP_PREFIX_ADDRESS, + task->mbuf.tlv.head, + 64); +#endif + xio_mbuf_inc(&task->mbuf, sizeof(*nop)); +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_on_recv_nop */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_on_recv_nop(struct xio_rdma_transport *rdma_hndl, + struct xio_task *task) +{ + struct xio_nop_hdr nop; + + TRACE_LOG("RECV_NOP\n"); + xio_rdma_read_nop(rdma_hndl, task, &nop); + + if (rdma_hndl->exp_sn == nop.sn) + rdma_hndl->peer_credits += nop.credits; + else + ERROR_LOG("ERROR: sn expected:%d, sn arrived:%d\n", + rdma_hndl->exp_sn, nop.sn); + + /* the rx task is returned back to pool */ + xio_tasks_pool_put(task); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_send_cancel */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_send_cancel(struct xio_rdma_transport *rdma_hndl, + uint32_t tlv_type, + struct xio_rdma_cancel_hdr *cancel_hdr, + void *ulp_msg, size_t ulp_msg_sz) +{ + uint64_t payload; + uint16_t ulp_hdr_len; + int retval; + struct xio_task *task; + struct xio_rdma_task *rdma_task; + void *buff; + + task = xio_rdma_primary_task_alloc(rdma_hndl); + if 
(unlikely(!task)) { + ERROR_LOG("primary tasks pool is empty\n"); + return -1; + } + xio_mbuf_reset(&task->mbuf); + + /* set start of the tlv */ + if (xio_mbuf_tlv_start(&task->mbuf) != 0) + return -1; + + task->tlv_type = tlv_type; + rdma_task = (struct xio_rdma_task *)task->dd_data; + rdma_task->out_ib_op = XIO_IB_SEND; + rdma_task->req_out_num_sge = 0; + rdma_task->req_in_num_sge = 0; + rdma_task->sqe_used = 0; + + ulp_hdr_len = sizeof(*cancel_hdr) + sizeof(uint16_t) + ulp_msg_sz; + rdma_hndl->dummy_msg.out.header.iov_base = + kzalloc(ulp_hdr_len, GFP_KERNEL); + rdma_hndl->dummy_msg.out.header.iov_len = ulp_hdr_len; + + /* write the message */ + /* get the pointer */ + buff = rdma_hndl->dummy_msg.out.header.iov_base; + + /* pack relevant values */ + buff += xio_write_uint16(cancel_hdr->hdr_len, 0, buff); + buff += xio_write_uint16(cancel_hdr->sn, 0, buff); + buff += xio_write_uint32(cancel_hdr->result, 0, buff); + buff += xio_write_uint16((uint16_t)(ulp_msg_sz), 0, buff); + buff += xio_write_array(ulp_msg, ulp_msg_sz, 0, buff); + + task->omsg = &rdma_hndl->dummy_msg; + + /* write xio header to the buffer */ + retval = xio_rdma_prep_req_header(rdma_hndl, task, + ulp_hdr_len, 0, 0, + XIO_E_SUCCESS); + if (unlikely(retval)) + return -1; + + payload = xio_mbuf_tlv_payload_len(&task->mbuf); + + /* add tlv */ + if (xio_mbuf_write_tlv(&task->mbuf, task->tlv_type, payload) != 0) + return -1; + + task->omsg = NULL; + kfree(rdma_hndl->dummy_msg.out.header.iov_base); + + rdma_task->txd.nents = 1; + + /* set the length */ + rdma_task->txd.sgt.nents = 1; + rdma_task->txd.sgt.sgl[0].length = xio_mbuf_data_length(&task->mbuf); + + rdma_task->txd.send_wr.send_flags = IB_SEND_SIGNALED; + if (rdma_task->txd.sgt.sgl[0].length < rdma_hndl->max_inline_data) + rdma_task->txd.send_wr.send_flags |= IB_SEND_INLINE; + + rdma_task->txd.send_wr.next = NULL; + + /* Map the send */ + if (unlikely(xio_map_tx_work_req(rdma_hndl->dev, &rdma_task->txd))) { + ERROR_LOG("DMA map to device failed\n"); + return -1; + } + rdma_task->txd.send_wr.num_sge = rdma_task->txd.mapped; + + rdma_hndl->tx_ready_tasks_num++; + list_move_tail(&task->tasks_list_entry, &rdma_hndl->tx_ready_list); + + xio_rdma_xmit(rdma_hndl); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_send */ +/*---------------------------------------------------------------------------*/ +int xio_rdma_send(struct xio_transport_base *transport, + struct xio_task *task) +{ + struct xio_rdma_transport *rdma_hndl = + (struct xio_rdma_transport *)transport; + int retval = -1; + + switch (task->tlv_type) { + case XIO_NEXUS_SETUP_REQ: + retval = xio_rdma_send_setup_req(rdma_hndl, task); + break; + case XIO_NEXUS_SETUP_RSP: + retval = xio_rdma_send_setup_rsp(rdma_hndl, task); + break; + case XIO_MSG_TYPE_RDMA: + retval = xio_rdma_perform_direct_rdma( + (struct xio_rdma_transport *)rdma_hndl, task); + break; + default: + if (IS_REQUEST(task->tlv_type)) + retval = xio_rdma_send_req(rdma_hndl, task); + else if (IS_RESPONSE(task->tlv_type)) + retval = xio_rdma_send_rsp(rdma_hndl, task); + else + ERROR_LOG("unknown message type:0x%x\n", + task->tlv_type); + break; + } + + return retval; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_cancel_req_handler */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_cancel_req_handler(struct xio_rdma_transport *rdma_hndl, + struct xio_rdma_cancel_hdr *cancel_hdr, + void 
*ulp_msg, size_t ulp_msg_sz) +{ + union xio_transport_event_data event_data; + struct xio_task *ptask, *next_ptask; + struct xio_rdma_task *rdma_task; + int found = 0; + + /* start by looking for the task rdma_rd */ + list_for_each_entry_safe(ptask, next_ptask, + &rdma_hndl->rdma_rd_req_list, + tasks_list_entry) { + rdma_task = ptask->dd_data; + if (rdma_task->phantom_idx == 0 && + rdma_task->sn == cancel_hdr->sn) { + TRACE_LOG("[%d] - message found on rdma_rd_req_list\n", + cancel_hdr->sn); + ptask->state = XIO_TASK_STATE_CANCEL_PENDING; + found = 1; + break; + } + } + + if (!found) { + list_for_each_entry_safe(ptask, next_ptask, + &rdma_hndl->rdma_rd_req_in_flight_list, + tasks_list_entry) { + rdma_task = ptask->dd_data; + if (rdma_task->phantom_idx == 0 && + rdma_task->sn == cancel_hdr->sn) { + TRACE_LOG("[%d] - message found on " \ + "rdma_rd_req_in_flight_list\n", + cancel_hdr->sn); + ptask->state = XIO_TASK_STATE_CANCEL_PENDING; + found = 1; + break; + } + } + } + + if (!found) { + TRACE_LOG("[%d] - was not found\n", cancel_hdr->sn); + /* fill notification event */ + event_data.cancel.ulp_msg = ulp_msg; + event_data.cancel.ulp_msg_sz = ulp_msg_sz; + event_data.cancel.task = NULL; + event_data.cancel.result = 0; + + xio_transport_notify_observer( + &rdma_hndl->base, + XIO_TRANSPORT_EVENT_CANCEL_REQUEST, + &event_data); + } + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_cancel_req_handler */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_cancel_rsp_handler(struct xio_rdma_transport *rdma_hndl, + struct xio_rdma_cancel_hdr *cancel_hdr, + void *ulp_msg, size_t ulp_msg_sz) +{ + union xio_transport_event_data event_data; + struct xio_task *ptask, *next_ptask; + struct xio_rdma_task *rdma_task; + struct xio_task *task_to_cancel = NULL; + + if ((cancel_hdr->result == XIO_E_MSG_CANCELED) || + (cancel_hdr->result == XIO_E_MSG_CANCEL_FAILED)) { + /* look in the in_flight */ + list_for_each_entry_safe(ptask, next_ptask, + &rdma_hndl->in_flight_list, + tasks_list_entry) { + rdma_task = ptask->dd_data; + if (rdma_task->sn == cancel_hdr->sn) { + task_to_cancel = ptask; + break; + } + } + if (!task_to_cancel) { + /* look in the tx_comp */ + list_for_each_entry_safe(ptask, next_ptask, + &rdma_hndl->tx_comp_list, + tasks_list_entry) { + rdma_task = ptask->dd_data; + if (rdma_task->sn == cancel_hdr->sn) { + task_to_cancel = ptask; + break; + } + } + } + + if (!task_to_cancel) { + ERROR_LOG("[%d] - Failed to found canceled message\n", + cancel_hdr->sn); + /* fill notification event */ + event_data.cancel.ulp_msg = ulp_msg; + event_data.cancel.ulp_msg_sz = ulp_msg_sz; + event_data.cancel.task = NULL; + event_data.cancel.result = XIO_E_MSG_NOT_FOUND; + + xio_transport_notify_observer( + &rdma_hndl->base, + XIO_TRANSPORT_EVENT_CANCEL_RESPONSE, + &event_data); + return 0; + } + } + + /* fill notification event */ + event_data.cancel.ulp_msg = ulp_msg; + event_data.cancel.ulp_msg_sz = ulp_msg_sz; + event_data.cancel.task = task_to_cancel; + event_data.cancel.result = cancel_hdr->result; + + xio_transport_notify_observer(&rdma_hndl->base, + XIO_TRANSPORT_EVENT_CANCEL_RESPONSE, + &event_data); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_on_recv_cancel_rsp */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_on_recv_cancel_rsp(struct xio_rdma_transport *rdma_hndl, + struct 
xio_task *task) +{ + int retval = 0; + struct xio_rdma_rsp_hdr rsp_hdr; + struct xio_msg *imsg; + void *ulp_hdr; + void *buff; + uint16_t ulp_msg_sz; + struct xio_rdma_task *rdma_task = task->dd_data; + struct xio_rdma_cancel_hdr cancel_hdr; + struct xio_sg_table_ops *sgtbl_ops; + void *sgtbl; + void *sg; + + /* read the response header */ + retval = xio_rdma_read_rsp_header(rdma_hndl, task, &rsp_hdr); + if (retval != 0) { + xio_set_error(XIO_E_MSG_INVALID); + goto cleanup; + } + /* update receive + send window */ + if (rdma_hndl->exp_sn == rsp_hdr.sn) { + rdma_hndl->exp_sn++; + rdma_hndl->ack_sn = rsp_hdr.sn; + rdma_hndl->peer_credits += rsp_hdr.credits; + } else { + ERROR_LOG("ERROR: expected sn:%d, arrived sn:%d\n", + rdma_hndl->exp_sn, rsp_hdr.sn); + } + /* read the sn */ + rdma_task->sn = rsp_hdr.sn; + + imsg = &task->imsg; + sgtbl = xio_sg_table_get(&imsg->in); + sgtbl_ops = xio_sg_table_ops_get(imsg->in.sgl_type); + sg = sge_first(sgtbl_ops, sgtbl); + + ulp_hdr = xio_mbuf_get_curr_ptr(&task->mbuf); + + imsg->type = task->tlv_type; + imsg->in.header.iov_len = rsp_hdr.ulp_hdr_len; + imsg->in.header.iov_base = ulp_hdr; + sge_set_addr(sgtbl_ops, sg, NULL); + tbl_set_nents(sgtbl_ops, sgtbl, 0); + + buff = imsg->in.header.iov_base; + buff += xio_read_uint16(&cancel_hdr.hdr_len, 0, buff); + buff += xio_read_uint16(&cancel_hdr.sn, 0, buff); + buff += xio_read_uint32(&cancel_hdr.result, 0, buff); + buff += xio_read_uint16(&ulp_msg_sz, 0, buff); + + xio_rdma_cancel_rsp_handler(rdma_hndl, &cancel_hdr, + buff, ulp_msg_sz); + /* return the the cancel response task to pool */ + xio_tasks_pool_put(task); + + return 0; +cleanup: + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_on_recv_cancel_req */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_on_recv_cancel_req(struct xio_rdma_transport *rdma_hndl, + struct xio_task *task) +{ + XIO_TO_RDMA_TASK(task, rdma_task); + int retval = 0; + struct xio_rdma_cancel_hdr cancel_hdr; + struct xio_rdma_req_hdr req_hdr; + struct xio_msg *imsg; + void *ulp_hdr; + void *buff; + uint16_t ulp_msg_sz; + struct xio_sg_table_ops *sgtbl_ops; + void *sgtbl; + void *sg; + + /* read header */ + retval = xio_rdma_read_req_header(rdma_hndl, task, &req_hdr); + if (retval != 0) { + xio_set_error(XIO_E_MSG_INVALID); + goto cleanup; + } + if (rdma_hndl->exp_sn == req_hdr.sn) { + rdma_hndl->exp_sn++; + rdma_hndl->ack_sn = req_hdr.sn; + rdma_hndl->peer_credits += req_hdr.credits; + } else { + ERROR_LOG("ERROR: sn expected:%d, sn arrived:%d\n", + rdma_hndl->exp_sn, req_hdr.sn); + } + + /* read the sn */ + rdma_task->sn = req_hdr.sn; + + imsg = &task->imsg; + sgtbl = xio_sg_table_get(&imsg->in); + sgtbl_ops = xio_sg_table_ops_get(imsg->in.sgl_type); + sg = sge_first(sgtbl_ops, sgtbl); + + ulp_hdr = xio_mbuf_get_curr_ptr(&task->mbuf); + + /* set header pointers */ + imsg->type = task->tlv_type; + imsg->in.header.iov_len = req_hdr.ulp_hdr_len; + imsg->in.header.iov_base = ulp_hdr; + sge_set_addr(sgtbl_ops, sg, NULL); + tbl_set_nents(sgtbl_ops, sgtbl, 0); + + buff = imsg->in.header.iov_base; + buff += xio_read_uint16(&cancel_hdr.hdr_len, 0, buff); + buff += xio_read_uint16(&cancel_hdr.sn, 0, buff); + buff += xio_read_uint32(&cancel_hdr.result, 0, buff); + buff += xio_read_uint16(&ulp_msg_sz, 0, buff); + + xio_rdma_cancel_req_handler(rdma_hndl, &cancel_hdr, + buff, ulp_msg_sz); + /* return the the cancel request task to pool */ + xio_tasks_pool_put(task); + + return 0; + 
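/*
 * For reference: the cancel header parsed above is a packed sequence of
 * { hdr_len, sn, result, ulp_msg_sz, ulp_msg } placed in the ULP header
 * area by the sending side. A minimal sketch of the matching serializer,
 * assuming the xio_write_uint16()/xio_write_uint32() helpers from
 * xio_protocol.h mirror the xio_read_* calls used here (illustrative
 * only, not part of this patch):
 *
 *	uint8_t *p = ulp_hdr_out;
 *	p += xio_write_uint16(cancel_hdr->hdr_len, 0, p);
 *	p += xio_write_uint16(cancel_hdr->sn, 0, p);
 *	p += xio_write_uint32(cancel_hdr->result, 0, p);
 *	p += xio_write_uint16((uint16_t)ulp_msg_sz, 0, p);
 *	memcpy(p, ulp_msg, ulp_msg_sz);
 *
 * xio_rdma_send_cancel() is expected to perform the equivalent packing
 * before posting a XIO_CANCEL_REQ/XIO_CANCEL_RSP message.
 */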
+cleanup: + retval = xio_errno(); + ERROR_LOG("xio_rdma_on_recv_req failed. (errno=%d %s)\n", retval, + xio_strerror(retval)); + xio_transport_notify_observer_error(&rdma_hndl->base, retval); + + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_cancel_req */ +/*---------------------------------------------------------------------------*/ +int xio_rdma_cancel_req(struct xio_transport_base *transport, + struct xio_msg *req, uint64_t stag, + void *ulp_msg, size_t ulp_msg_sz) +{ + struct xio_rdma_transport *rdma_hndl = + (struct xio_rdma_transport *)transport; + struct xio_task *ptask, *next_ptask; + union xio_transport_event_data event_data; + struct xio_rdma_task *rdma_task; + struct xio_rdma_cancel_hdr cancel_hdr = { + .hdr_len = sizeof(cancel_hdr), + .result = 0 + }; + + /* look in the tx_ready */ + list_for_each_entry_safe(ptask, next_ptask, &rdma_hndl->tx_ready_list, + tasks_list_entry) { + if (ptask->omsg && + (ptask->omsg->sn == req->sn) && + (ptask->stag == stag)) { + TRACE_LOG("[%llu] - message found on tx_ready_list\n", + req->sn); + + /* return decrease ref count from task */ + xio_tasks_pool_put(ptask); + rdma_hndl->tx_ready_tasks_num--; + list_move_tail(&ptask->tasks_list_entry, + &rdma_hndl->tx_comp_list); + + /* fill notification event */ + event_data.cancel.ulp_msg = ulp_msg; + event_data.cancel.ulp_msg_sz = ulp_msg_sz; + event_data.cancel.task = ptask; + event_data.cancel.result = XIO_E_MSG_CANCELED; + + xio_transport_notify_observer( + &rdma_hndl->base, + XIO_TRANSPORT_EVENT_CANCEL_RESPONSE, + &event_data); + return 0; + } + } + /* look in the in_flight */ + list_for_each_entry_safe(ptask, next_ptask, &rdma_hndl->in_flight_list, + tasks_list_entry) { + if (ptask->omsg && + (ptask->omsg->sn == req->sn) && + (ptask->stag == stag) && + (ptask->state != XIO_TASK_STATE_RESPONSE_RECV)) { + TRACE_LOG("[%llu] - message found on in_flight_list\n", + req->sn); + + rdma_task = ptask->dd_data; + cancel_hdr.sn = rdma_task->sn; + + xio_rdma_send_cancel(rdma_hndl, XIO_CANCEL_REQ, + &cancel_hdr, + ulp_msg, ulp_msg_sz); + return 0; + } + } + /* look in the tx_comp */ + list_for_each_entry_safe(ptask, next_ptask, &rdma_hndl->tx_comp_list, + tasks_list_entry) { + if (ptask->omsg && + (ptask->omsg->sn == req->sn) && + (ptask->stag == stag) && + (ptask->state != XIO_TASK_STATE_RESPONSE_RECV)) { + TRACE_LOG("[%llu] - message found on tx_comp_list\n", + req->sn); + rdma_task = ptask->dd_data; + cancel_hdr.sn = rdma_task->sn; + + xio_rdma_send_cancel(rdma_hndl, XIO_CANCEL_REQ, + &cancel_hdr, + ulp_msg, ulp_msg_sz); + return 0; + } + } + TRACE_LOG("[%llu] - message not found on tx path\n", req->sn); + + /* fill notification event */ + event_data.cancel.ulp_msg = ulp_msg; + event_data.cancel.ulp_msg_sz = ulp_msg_sz; + event_data.cancel.task = NULL; + event_data.cancel.result = XIO_E_MSG_NOT_FOUND; + + xio_transport_notify_observer(&rdma_hndl->base, + XIO_TRANSPORT_EVENT_CANCEL_RESPONSE, + &event_data); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_cancel_rsp */ +/*---------------------------------------------------------------------------*/ +int xio_rdma_cancel_rsp(struct xio_transport_base *transport, + struct xio_task *task, enum xio_status result, + void *ulp_msg, size_t ulp_msg_sz) +{ + struct xio_rdma_transport *rdma_hndl = + (struct xio_rdma_transport *)transport; + struct xio_rdma_task *rdma_task; + + struct xio_rdma_cancel_hdr cancel_hdr = { + .hdr_len = 
sizeof(cancel_hdr), + .result = result, + }; + + if (task) { + rdma_task = task->dd_data; + cancel_hdr.sn = rdma_task->sn; + } else { + cancel_hdr.sn = 0; + } + + /* fill dummy transport header since was handled by upper layer + */ + return xio_rdma_send_cancel(rdma_hndl, XIO_CANCEL_RSP, + &cancel_hdr, ulp_msg, ulp_msg_sz); +} diff --git a/open_src/xio/src/kernel/transport/rdma/xio_rdma_management.c b/open_src/xio/src/kernel/transport/rdma/xio_rdma_management.c new file mode 100644 index 0000000..11a5d0d --- /dev/null +++ b/open_src/xio/src/kernel/transport/rdma/xio_rdma_management.c @@ -0,0 +1,3266 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ +#include +#include +#include + +#include +#include + +#include "libxio.h" +#include +#include "xio_common.h" +#include "xio_log.h" +#include "xio_observer.h" +#include "xio_ktransport.h" +#include "xio_transport.h" +#include "xio_protocol.h" +#include "xio_mbuf.h" +#include "xio_task.h" +#include "xio_mem.h" +#include "xio_mempool.h" +#include "xio_rdma_utils.h" +#include "xio_rdma_transport.h" +#include "xio_sg_table.h" +#include "xio_ev_data.h" +#include "xio_objpool.h" +#include "xio_workqueue.h" +#include "xio_ev_loop.h" +#include "xio_context.h" +#include "xio_context_priv.h" + +MODULE_AUTHOR("Eyal Solomon, Shlomo Pongratz"); +MODULE_DESCRIPTION("XIO library " \ + "v" DRV_VERSION " (" DRV_RELDATE ")"); +MODULE_LICENSE("Dual BSD/GPL"); + +/* The root of xio_rdma debugfs tree */ +static struct dentry *xio_rdma_root; + +int xio_rdma_cq_completions; +module_param_named(cq_completions, xio_rdma_cq_completions, int, 0644); +MODULE_PARM_DESC(cq_completions, "moderate CQ to N completions if N > 0 (default:disabled)"); + +int xio_rdma_cq_timeout; +module_param_named(cq_timeout, xio_rdma_cq_timeout, int, 0644); +MODULE_PARM_DESC(cq_timeout, "moderate CQ to max T micro-sec if T > 0 (default:disabled)"); + +/* TODO: move to an include file like xio_usr_transport.h in user space */ +#define VALIDATE_SZ(sz) do { \ + if (optlen != (sz)) { \ + xio_set_error(EINVAL); \ + return -1; \ + } \ + } while (0) + +/* default option values */ +#define XIO_OPTVAL_DEF_ENABLE_MEM_POOL 1 +#define XIO_OPTVAL_DEF_ENABLE_DMA_LATENCY 0 +#define XIO_OPTVAL_DEF_MAX_IN_IOVSZ XIO_IOVLEN +#define XIO_OPTVAL_DEF_MAX_OUT_IOVSZ XIO_IOVLEN +#define XIO_OPTVAL_DEF_QP_CAP_MAX_INLINE_DATA (200) + +/*---------------------------------------------------------------------------*/ +/* globals */ +/*---------------------------------------------------------------------------*/ +struct xio_options *g_poptions; + +/* rdma options */ +struct xio_rdma_options rdma_options = { + .enable_mem_pool = XIO_OPTVAL_DEF_ENABLE_MEM_POOL, + .enable_dma_latency = XIO_OPTVAL_DEF_ENABLE_DMA_LATENCY, + .max_in_iovsz = XIO_OPTVAL_DEF_MAX_IN_IOVSZ, + .max_out_iovsz = XIO_OPTVAL_DEF_MAX_OUT_IOVSZ, + .qp_cap_max_inline_data = XIO_OPTVAL_DEF_QP_CAP_MAX_INLINE_DATA, +}; + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_get_max_header_size */ +/*---------------------------------------------------------------------------*/ +int xio_rdma_get_max_header_size(void) +{ + int req_hdr = XIO_TRANSPORT_OFFSET + sizeof(struct xio_rdma_req_hdr); + int rsp_hdr = XIO_TRANSPORT_OFFSET + sizeof(struct xio_rdma_rsp_hdr); + int iovsz = rdma_options.max_out_iovsz + rdma_options.max_in_iovsz; + + req_hdr += iovsz * sizeof(struct xio_sge); + rsp_hdr += rdma_options.max_out_iovsz * sizeof(struct xio_sge); + + return max(req_hdr, rsp_hdr); +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_get_inline_buffer_size */ +/*---------------------------------------------------------------------------*/ +int xio_rdma_get_inline_buffer_size(void) +{ + int inline_buf_sz = ALIGN(xio_rdma_get_max_header_size() + + g_poptions->max_inline_xio_hdr + + g_poptions->max_inline_xio_data, 1024); + return inline_buf_sz; +} + +/*---------------------------------------------------------------------------*/ +/* forward declaration */ +/*---------------------------------------------------------------------------*/ +static struct xio_transport_base *xio_rdma_open( + struct xio_transport *transport, + struct 
xio_context *ctx, + struct xio_observer *observer, + uint32_t trans_attr_mask, + struct xio_transport_init_attr *attr); + +static void xio_rdma_close(struct xio_transport_base *transport); +static int xio_rdma_reject(struct xio_transport_base *transport); +static void xio_rdma_post_close(struct xio_transport_base *transport); +static int xio_rdma_flush_all_tasks(struct xio_rdma_transport *rdma_hndl); + +static void xio_cq_event_callback(struct ib_event *cause, void *context) +{ + ERROR_LOG("got cq event %d ctx(%p)\n", cause->event, context); +} + +static void xio_add_one(struct ib_device *ib_dev); +static void xio_del_one(struct ib_device *ib_dev); + +static struct ib_client xio_client = { + .name = "xio", + .add = xio_add_one, + .remove = xio_del_one +}; + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_context_shutdown */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_context_shutdown(struct xio_transport_base *trans_hndl, + struct xio_context *ctx) +{ + xio_context_destroy_wait(trans_hndl->ctx); + + xio_rdma_close(trans_hndl); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_cq_down */ +/*---------------------------------------------------------------------------*/ +static void xio_cq_down(struct kref *kref) +{ + struct xio_cq *tcq = container_of(kref, struct xio_cq, kref); + int retval; + + write_lock_bh(&tcq->dev->cq_lock); + retval = list_empty(&tcq->cq_list_entry); + list_del_init(&tcq->cq_list_entry); + write_unlock_bh(&tcq->dev->cq_lock); + + if (retval) + ERROR_LOG("tcq double free\n"); + + if (!list_empty(&tcq->trans_list)) + ERROR_LOG("rdma_hndl memory leakage\n"); + + xio_context_unreg_observer(tcq->ctx, &tcq->observer); + + /* the event loop may be release by the time this function is called */ + retval = ib_destroy_cq(tcq->cq); + if (retval) + ERROR_LOG("ib_destroy_cq failed. 
(err=%d)\n", retval); + + XIO_OBSERVER_DESTROY(&tcq->observer); + + kfree(tcq->wc_array); + kfree(tcq); +} + +/*---------------------------------------------------------------------------*/ +/* xio_cq_release */ +/*---------------------------------------------------------------------------*/ +static inline void xio_cq_release(struct xio_cq *tcq) +{ + kref_put(&tcq->kref, xio_cq_down); +} + +/*---------------------------------------------------------------------------*/ +/* xio_on_context_event */ +/*---------------------------------------------------------------------------*/ +static int xio_on_context_event(void *observer, void *sender, + int event, void *event_data) +{ + if (event == XIO_CONTEXT_EVENT_POST_CLOSE) { + TRACE_LOG("context: [close] ctx:%p\n", sender); + xio_cq_release((struct xio_cq *)observer); + } + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_cq_get */ +/*---------------------------------------------------------------------------*/ +static struct xio_cq *xio_cq_get(struct xio_device *dev, + struct xio_context *ctx) +{ + struct xio_cq *tcq; + int num_cores = num_online_cpus(); + u32 alloc_sz; + int cpu; + + /* If two session were created with the same context and + * the address resolved on the same device than the same + * CQ is used + */ + read_lock_bh(&dev->cq_lock); + list_for_each_entry(tcq, &dev->cq_list, cq_list_entry) { + if (tcq->ctx == ctx) { + kref_get(&tcq->kref); + read_unlock_bh(&dev->cq_lock); + return tcq; + } + } + read_unlock_bh(&dev->cq_lock); + + if (ctx->cpuid < 0 || ctx->cpuid >= num_cores) { + ERROR_LOG("BUG, wrong cpuid(%d) check init\n", ctx->cpuid); + goto cleanup0; + } else { + cpu = ctx->cpuid; + } + cpu = cpu % dev->cqs_used; + + tcq = kzalloc(sizeof(*tcq), GFP_KERNEL); + if (!tcq) { + ERROR_LOG("xio_cq_init kzalloc failed\n"); + goto cleanup0; + } + + tcq->alloc_sz = min(dev->device_attr.max_cqe, CQE_ALLOC_SIZE); + alloc_sz = tcq->alloc_sz; + + /* allocate device wc array */ + tcq->wc_array = kcalloc(MAX_POLL_WC, sizeof(struct ib_wc), GFP_KERNEL); + if (!tcq->wc_array) { + xio_set_error(ENOMEM); + ERROR_LOG("wc array allocation failed\n"); + goto cleanup1; + } + + tcq->ctx = ctx; + tcq->dev = dev; + tcq->max_cqe = dev->device_attr.max_cqe; + tcq->wc_array_len = MAX_POLL_WC; + INIT_LIST_HEAD(&tcq->trans_list); + INIT_LIST_HEAD(&tcq->cq_list_entry); + + /* xio_rdma_poll doesn't support separate tx & rx poll + * so we use only one cq for RX and TX + */ + +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 2, 0) + tcq->cq = ib_create_cq(dev->ib_dev, + xio_cq_data_callback, + xio_cq_event_callback, + (void *)tcq, + alloc_sz, cpu); +#else + { + struct ib_cq_init_attr ia = { + .cqe = alloc_sz, + .comp_vector = cpu, + }; + + tcq->cq = ib_create_cq(dev->ib_dev, + xio_cq_data_callback, + xio_cq_event_callback, + (void *)tcq, + &ia); + } +#endif + if (IS_ERR(tcq->cq)) { + ERROR_LOG("ib_create_cq err(%ld)\n", PTR_ERR(tcq->cq)); + goto cleanup2; + } + tcq->cq_depth = tcq->cq->cqe; + tcq->cqe_avail = tcq->cq->cqe; + +/* due to ib_modify_cq API change, need to add backporting */ +#if 0 + if (xio_rdma_cq_completions && xio_rdma_cq_timeout) { + if (xio_rdma_cq_completions > 0xffff || + xio_rdma_cq_timeout > 0xffff) { + ERROR_LOG("invalid CQ moderation values\n"); + } else { + ret = ib_modify_cq(tcq->cq, + xio_rdma_cq_completions, + xio_rdma_cq_timeout); + if (ret && ret != -ENOSYS) { + ERROR_LOG("failed modifying CQ (%d)\n", ret); + goto cleanup3; + } + } + } +#endif + + /* we don't expect missed events (if 
supported) so it is an error */ + if (ib_req_notify_cq(tcq->cq, + IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS)) { + ERROR_LOG("ib_req_notify_cq\n"); + goto cleanup3; + } + + write_lock_bh(&dev->cq_lock); + list_add(&tcq->cq_list_entry, &dev->cq_list); + write_unlock_bh(&dev->cq_lock); + + /* One reference count for the context and one for the rdma handle */ + kref_init(&tcq->kref); + kref_get(&tcq->kref); + + /* set the tcq to be the observer for context events */ + XIO_OBSERVER_INIT(&tcq->observer, tcq, xio_on_context_event); + xio_context_reg_observer(ctx, &tcq->observer); + + /* regiter completion function to be called directly */ + xio_context_set_poll_completions_fn( + ctx, + (poll_completions_fn_t)xio_rdma_poll_completions, + tcq); + + return tcq; + +cleanup3: + ib_destroy_cq(tcq->cq); +cleanup2: + kfree(tcq->wc_array); +cleanup1: + kfree(tcq); +cleanup0: + ERROR_LOG("xio_cq_init failed\n"); + return NULL; +} + +/*---------------------------------------------------------------------------*/ +/* xio_dev_event_handler */ +/*---------------------------------------------------------------------------*/ +static void xio_dev_event_handler(struct ib_event_handler *handler, + struct ib_event *event) +{ + ERROR_LOG("async event %d on device %s port %d\n", event->event, + event->device->name, event->element.port_num); +} + +/*---------------------------------------------------------------------------*/ +/* xio_device_init */ +/*---------------------------------------------------------------------------*/ +static struct xio_device *xio_device_init(struct ib_device *ib_dev, int port) +{ + struct xio_device *dev; + int num_cores; + int retval; + + dev = kzalloc(sizeof(*dev), GFP_KERNEL); + if (!dev) { + xio_set_error(ENOMEM); + ERROR_LOG("kzalloc failed.\n"); + goto cleanup0; + } + + retval = ib_query_device(ib_dev, &dev->device_attr); + if (retval < 0) { + ERROR_LOG("ib_query_device failed. (ret=%d)\n", retval); + xio_set_error(-retval); + goto cleanup1; + } + + /* FMR not yet supported */ +#if 0 + /* Assign function handles - based on FMR support */ + if (ib_dev->alloc_fmr && ib_dev->dealloc_fmr && + ib_dev->map_phys_fmr && ib_dev->unmap_fmr) + ERROR_LOG("not supported"); +#endif + + if (dev->device_attr.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) { + if (xio_fast_reg_init(XIO_FAST_MEM_FRWR, &dev->fastreg)) + goto cleanup1; + } else { + if (xio_fast_reg_init(XIO_FAST_MEM_NONE, &dev->fastreg)) + goto cleanup1; + } + + dev->ib_dev = ib_dev; + dev->port_num = port; + + dev->pd = ib_alloc_pd(ib_dev); + if (!dev->pd) { + xio_set_error(ENOMEM); + ERROR_LOG("ibv_alloc_pd failed.\n"); + goto cleanup1; + } + + dev->mr = ib_get_dma_mr(dev->pd, + IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_WRITE | + IB_ACCESS_REMOTE_READ); + if (IS_ERR(dev->mr)) { + xio_set_error(PTR_ERR(dev->mr)); + ERROR_LOG("ib_get_dma_mr failed. 
(ret=%ld)\n", + PTR_ERR(dev->mr)); + goto cleanup2; + } + + kref_init(&dev->kref); + rwlock_init(&dev->cq_lock); + INIT_LIST_HEAD(&dev->cq_list); + num_cores = num_online_cpus(); + num_cores = roundup_pow_of_two(num_cores); + dev->cqs_used = min(num_cores, ib_dev->num_comp_vectors); + + TRACE_LOG("rdma device: [new] %p\n", dev); + + INIT_IB_EVENT_HANDLER(&dev->event_handler, dev->ib_dev, + xio_dev_event_handler); + + if (ib_register_event_handler(&dev->event_handler)) + goto cleanup3; + + return dev; + +cleanup3: + ib_dereg_mr(dev->mr); +cleanup2: + ib_dealloc_pd(dev->pd); +cleanup1: + kfree(dev); +cleanup0: + ERROR_LOG("rdma device: [new] failed\n"); + return NULL; +} + +/*---------------------------------------------------------------------------*/ +/* xio_device_down */ +/*---------------------------------------------------------------------------*/ +void xio_device_down(struct kref *kref) +{ + struct xio_device *dev = container_of(kref, struct xio_device, kref); + + ib_dereg_mr(dev->mr); + ib_dealloc_pd(dev->pd); + + kfree(dev); +} + +/*---------------------------------------------------------------------------*/ +/* xio_device_release */ +/*---------------------------------------------------------------------------*/ +static void xio_device_release(struct xio_device *dev) +{ + TRACE_LOG("rdma device: [close] dev:%p\n", dev); + + (void)ib_unregister_event_handler(&dev->event_handler); + + write_lock_bh(&dev->cq_lock); + + if (!list_empty(&dev->cq_list)) { + write_unlock_bh(&dev->cq_lock); + ERROR_LOG("cq memory leakage\n"); + } else { + write_unlock_bh(&dev->cq_lock); + } + + /* ib_dereg_mr & ib_dealloc_pd will be called from xio_device_down + * (kerf) + */ + xio_device_put(dev); +} + +/*---------------------------------------------------------------------------*/ +/* xio_cq_alloc_slots */ +/*---------------------------------------------------------------------------*/ +static int xio_cq_alloc_slots(struct xio_cq *tcq, int cqe_num) +{ + if (cqe_num < tcq->cqe_avail) { + tcq->cqe_avail -= cqe_num; + return 0; + } else if (tcq->cq_depth + tcq->alloc_sz < tcq->max_cqe) { + int cqe = tcq->cq->cqe; + int retval = ib_resize_cq(tcq->cq, + tcq->cq_depth + tcq->alloc_sz); + if (retval != 0 || (cqe == tcq->cq->cqe)) { + ERROR_LOG("ibv_resize_cq failed. 
ret=%d, cqe:%d\n", + retval, cqe); + return -1; + } + tcq->cq_depth += (tcq->cq->cqe - cqe); + tcq->cqe_avail += (tcq->cq->cqe - cqe); + DEBUG_LOG("cq_resize: expected:%d, actual:%d\n", + tcq->cq_depth, tcq->cq->cqe); + tcq->cqe_avail -= cqe_num; + return 0; + } + + ERROR_LOG("cq overflow reached\n"); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_cq_free_slots */ +/*---------------------------------------------------------------------------*/ +static int xio_cq_free_slots(struct xio_cq *tcq, int cqe_num) +{ + if (tcq->cqe_avail + cqe_num <= tcq->cq_depth) { + tcq->cqe_avail += cqe_num; + return 0; + } + ERROR_LOG("cq allocation error"); + + return 0; +} + +static void xio_qp_event_handler(struct ib_event *cause, void *context) +{ + ERROR_LOG("got qp event %d\n", cause->event); +} + +/*---------------------------------------------------------------------------*/ +/* xio_qp_create */ +/*---------------------------------------------------------------------------*/ +static int xio_qp_create(struct xio_rdma_transport *rdma_hndl) +{ + struct xio_device *dev; + struct ib_qp_init_attr qp_init_attr; + struct ib_qp_attr qp_attr; + struct xio_cq *tcq; + int retval = 0; + + /* Should be set by now */ + dev = rdma_hndl->dev; + if (!dev) { + ERROR_LOG("failed to find device\n"); + return -1; + } + + tcq = xio_cq_get(dev, rdma_hndl->base.ctx); + if (!tcq) { + ERROR_LOG("cq initialization failed\n"); + return -1; + } + + retval = xio_cq_alloc_slots(tcq, MAX_CQE_PER_QP); + if (retval != 0) { + ERROR_LOG("cq full capacity reached\n"); + return -1; + } + + memset(&qp_init_attr, 0, sizeof(qp_init_attr)); + + qp_init_attr.event_handler = xio_qp_event_handler; + qp_init_attr.qp_context = rdma_hndl; + qp_init_attr.qp_type = IB_QPT_RC; + qp_init_attr.send_cq = tcq->cq; + qp_init_attr.recv_cq = tcq->cq; + qp_init_attr.cap.max_send_wr = 5 * MAX_SEND_WR; + qp_init_attr.cap.max_recv_wr = MAX_RECV_WR + EXTRA_RQE; + qp_init_attr.cap.max_inline_data = rdma_options.qp_cap_max_inline_data; + qp_init_attr.cap.max_send_sge = min(rdma_options.max_out_iovsz + 1, + dev->device_attr.max_sge); + qp_init_attr.cap.max_recv_sge = 1; + + /* only generate completion queue entries if requested + * User space version sets sq_sig_all to 0, according to + * ib_uverbs_create_qp this translates to IB_SIGNAL_REQ_WR + */ + qp_init_attr.sq_sig_type = IB_SIGNAL_REQ_WR; + + retval = rdma_create_qp(rdma_hndl->cm_id, dev->pd, &qp_init_attr); + if (retval) { + xio_set_error(retval); + xio_cq_free_slots(tcq, MAX_CQE_PER_QP); + ERROR_LOG("rdma_create_qp failed. (err=%d)\n", retval); + return -1; + } + rdma_hndl->dev = dev; + rdma_hndl->tcq = tcq; + rdma_hndl->qp = rdma_hndl->cm_id->qp; + rdma_hndl->sqe_avail = 5 * MAX_SEND_WR; + + rdma_hndl->beacon_task.dd_data = ptr_from_int64(XIO_BEACON_WRID); + rdma_hndl->beacon.wr_id = uint64_from_ptr(&rdma_hndl->beacon_task); + rdma_hndl->beacon.opcode = IB_WR_SEND; + + memset(&qp_attr, 0, sizeof(qp_attr)); + retval = ib_query_qp(rdma_hndl->qp, &qp_attr, 0, &qp_init_attr); + if (retval) + ERROR_LOG("ib_query_qp failed. 
(err=%d)\n", retval); + + rdma_hndl->max_inline_data = qp_attr.cap.max_inline_data; + rdma_hndl->max_sge = min(rdma_options.max_out_iovsz + 1, + dev->device_attr.max_sge); + + list_add(&rdma_hndl->trans_list_entry, &tcq->trans_list); + + TRACE_LOG("rdma qp: [new] handle:%p, qp:0x%x\n", rdma_hndl, + rdma_hndl->qp->qp_num); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_qp_release */ +/*---------------------------------------------------------------------------*/ +static void xio_qp_release(struct xio_rdma_transport *rdma_hndl) +{ + if (rdma_hndl->qp) { + TRACE_LOG("rdma qp: [close] handle:%p, qp:0x%x\n", rdma_hndl, + rdma_hndl->qp->qp_num); + xio_cq_free_slots(rdma_hndl->tcq, MAX_CQE_PER_QP); + if (list_empty(&rdma_hndl->trans_list_entry)) + ERROR_LOG("rdma_hndl has qp but not cq\n"); + + list_del_init(&rdma_hndl->trans_list_entry); + rdma_destroy_qp(rdma_hndl->cm_id); + xio_cq_release(rdma_hndl->tcq); + rdma_hndl->qp = NULL; + } +} + +/*---------------------------------------------------------------------------*/ +/* xio_rxd_init */ +/*---------------------------------------------------------------------------*/ +static void xio_rxd_init(struct xio_work_req *rxd, + size_t rxd_nr, + struct xio_task *task, + struct scatterlist *sgl, + unsigned size, + struct ib_mr *srmr) +{ + int i; + /* This address need to be dma mapped */ + /* rxd->sge[0].addr = uint64_from_ptr(buf); */ + /* rxd->sge[0].length = size; */ + if (srmr) { + for (i = 0; i < rxd_nr; i++) + rxd->sge[i].lkey = srmr->lkey; + } + + if (size) { + rxd->sgt.sgl = sgl; + rxd->sgt.orig_nents = 1; + rxd->sgt.nents = 1; + rxd->nents = 1; + } else { + rxd->sgt.sgl = NULL; + rxd->sgt.orig_nents = 0; + rxd->sgt.nents = 0; + rxd->nents = 0; + } + + rxd->recv_wr.wr_id = uint64_from_ptr(task); + rxd->recv_wr.sg_list = rxd->sge; + rxd->recv_wr.num_sge = rxd->nents; + rxd->recv_wr.next = NULL; +} + +/*---------------------------------------------------------------------------*/ +/* xio_txd_init */ +/*---------------------------------------------------------------------------*/ +static void xio_txd_init(struct xio_work_req *txd, + size_t txd_nr, + struct xio_task *task, + struct scatterlist *sgl, + unsigned size, + struct ib_mr *srmr) +{ + int i; + /* This address need to be dma mapped */ + /* txd->sge[0].addr = uint64_from_ptr(buf); */ + /* txd->sge[0].length = size; */ + if (srmr) { + for (i = 0; i < txd_nr; i++) + txd->sge[i].lkey = srmr->lkey; + } + + if (size) { + txd->sgt.sgl = sgl; + txd->sgt.orig_nents = 1; + txd->sgt.nents = 1; + txd->nents = 1; + } else { + txd->sgt.sgl = NULL; + txd->sgt.orig_nents = 0; + txd->sgt.nents = 0; + txd->nents = 0; + } + + txd->send_wr.wr_id = uint64_from_ptr(task); + txd->send_wr.next = NULL; + txd->send_wr.sg_list = txd->sge; + txd->send_wr.num_sge = txd->sgt.nents; + txd->send_wr.opcode = IB_WR_SEND; + + txd->mapped = 0; + /* txd->send_wr.send_flags = IB_SEND_SIGNALED; */ +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdmad_init */ +/*---------------------------------------------------------------------------*/ +static int xio_rdmad_init(struct xio_work_req *rdmad, + size_t rdmad_nr, + struct xio_task *task) +{ + rdmad->send_wr.wr_id = uint64_from_ptr(task); + rdmad->send_wr.sg_list = rdmad->sge; + rdmad->send_wr.num_sge = 1; + rdmad->send_wr.next = NULL; + rdmad->send_wr.send_flags = IB_SEND_SIGNALED; + + /* rdmad has no sgl of it's own since it doesn't have a buffer */ + if (rdmad_nr) { + if 
(sg_alloc_table(&rdmad->sgt, rdmad_nr, GFP_KERNEL)) { + ERROR_LOG("sg_write_table(rdmad)\n"); + return -1; + } + } else { + rdmad->sgt.sgl = NULL; + rdmad->sgt.orig_nents = 0; + rdmad->sgt.nents = 0; + } + + rdmad->nents = 1; + rdmad->mapped = 0; + + /* to be set before posting: + rdmad->xio_ib_op, rdmad->send_wr.opcode + rdmad->sge.addr, rdmad->sge.length + rdmad->send_wr.wr.rdma.(remote_addr,rkey) */ + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_task_init */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_task_init(struct xio_task *task, + struct xio_rdma_transport *rdma_hndl, + void *buf, + unsigned long size, + struct ib_mr *srmr, + size_t txd_nr, + size_t rxd_nr, + size_t rdmad_nr) +{ + XIO_TO_RDMA_TASK(task, rdma_task); + + rdma_task->buf = buf; + + if (buf) { + sg_init_one(rdma_task->rx_sgl, buf, size); + /* txd's scatterlist has and extra entry for chaining + * with the application's scatterlist + */ + sg_init_table(rdma_task->tx_sgl, 2); + sg_set_buf(rdma_task->tx_sgl, buf, size); + sg_mark_end(rdma_task->tx_sgl); + /* The link entry shoulden't be marked end */ + sg_unmark_end(&rdma_task->tx_sgl[1]); + } + + if (rxd_nr) + xio_rxd_init(&rdma_task->rxd, rxd_nr, task, rdma_task->rx_sgl, + size, srmr); + if (txd_nr) + xio_txd_init(&rdma_task->txd, txd_nr, task, rdma_task->tx_sgl, + size, srmr); + if (rdmad_nr) + xio_rdmad_init(&rdma_task->rdmad, rdmad_nr, task); + + /* initialize the mbuf */ + xio_mbuf_init(&task->mbuf, buf, size, 0); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_txd_init */ +/*---------------------------------------------------------------------------*/ +static void xio_xd_reinit(struct xio_work_req *xd, + size_t xd_nr, + struct ib_mr *srmr) +{ + int i; + + if (!srmr || !xd || !xd->sge) + return; + + for (i = 0; i < xd_nr; i++) { + if (!xd->sge[i].lkey) + break; + xd->sge[i].lkey = srmr->lkey; + } +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_task_reinit */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_task_reinit(struct xio_task *task, + struct xio_rdma_transport *rdma_hndl, + struct ib_mr *srmr) +{ + XIO_TO_RDMA_TASK(task, rdma_task); + + xio_xd_reinit(&rdma_task->rxd, rdma_hndl->max_sge, srmr); + xio_xd_reinit(&rdma_task->txd, rdma_hndl->max_sge, srmr); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_flush_all_tasks */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_flush_all_tasks(struct xio_rdma_transport *rdma_hndl) +{ + if (!list_empty(&rdma_hndl->in_flight_list)) { + TRACE_LOG("in_flight_list not empty!\n"); + xio_transport_flush_task_list(&rdma_hndl->in_flight_list); + /* for task that attached to senders with ref count = 2 */ + xio_transport_flush_task_list(&rdma_hndl->in_flight_list); + } + + if (!list_empty(&rdma_hndl->rdma_rd_req_in_flight_list)) { + TRACE_LOG("rdma_rd_req_in_flight_list not empty!\n"); + xio_transport_flush_task_list( + &rdma_hndl->rdma_rd_req_in_flight_list); + } + if (!list_empty(&rdma_hndl->rdma_rd_req_list)) { + TRACE_LOG("rdma_rd_req_list not empty!\n"); + xio_transport_flush_task_list(&rdma_hndl->rdma_rd_req_list); + } + if (!list_empty(&rdma_hndl->rdma_rd_rsp_in_flight_list)) { + TRACE_LOG("rdma_rd_rsp_in_flight_list not 
empty!\n"); + xio_transport_flush_task_list( + &rdma_hndl->rdma_rd_rsp_in_flight_list); + } + if (!list_empty(&rdma_hndl->rdma_rd_rsp_list)) { + TRACE_LOG("rdma_rd_rsp_list not empty!\n"); + xio_transport_flush_task_list(&rdma_hndl->rdma_rd_rsp_list); + } + if (!list_empty(&rdma_hndl->tx_comp_list)) { + TRACE_LOG("tx_comp_list not empty!\n"); + xio_transport_flush_task_list(&rdma_hndl->tx_comp_list); + } + if (!list_empty(&rdma_hndl->io_list)) { + TRACE_LOG("io_list not empty!\n"); + xio_transport_flush_task_list(&rdma_hndl->io_list); + } + + if (!list_empty(&rdma_hndl->tx_ready_list)) { + TRACE_LOG("tx_ready_list not empty!\n"); + xio_transport_flush_task_list(&rdma_hndl->tx_ready_list); + /* for task that attached to senders with ref count = 2 */ + xio_transport_flush_task_list(&rdma_hndl->tx_ready_list); + } + + if (!list_empty(&rdma_hndl->rx_list)) { + TRACE_LOG("rx_list not empty!\n"); + xio_transport_flush_task_list(&rdma_hndl->rx_list); + } + + rdma_hndl->kick_rdma_rd_req = 0; + rdma_hndl->kick_rdma_rd_rsp = 0; + rdma_hndl->rdma_rd_req_in_flight = 0; + rdma_hndl->rdma_rd_rsp_in_flight = 0; + rdma_hndl->reqs_in_flight_nr = 0; + rdma_hndl->rsps_in_flight_nr = 0; + rdma_hndl->tx_ready_tasks_num = 0; + + return 0; +} + + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_initial_pool_slab_pre_create */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_initial_pool_slab_pre_create( + struct xio_transport_base *transport_hndl, + int alloc_nr, + void *pool_dd_data, void *slab_dd_data) +{ + struct xio_rdma_tasks_slab *rdma_slab = + (struct xio_rdma_tasks_slab *)slab_dd_data; + + rdma_slab->buf_size = CONN_SETUP_BUF_SIZE; + /* The name must be valid until the pool is destroyed + * Use the address of the pool structure to create a unique + * name for the pool + */ + sprintf(rdma_slab->name, "initial_pool-%p", rdma_slab); + rdma_slab->data_pool = kmem_cache_create(rdma_slab->name, + rdma_slab->buf_size, PAGE_SIZE, + SLAB_HWCACHE_ALIGN, NULL); + if (!rdma_slab->data_pool) { + xio_set_error(ENOMEM); + ERROR_LOG("kcache(initial_pool) creation failed\n"); + return -1; + } + DEBUG_LOG("kcache(%s) created(%p)\n", + rdma_slab->name, rdma_slab->data_pool); + rdma_slab->count = 0; + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_initial_task_alloc */ +/*---------------------------------------------------------------------------*/ +static inline struct xio_task *xio_rdma_initial_task_alloc( + struct xio_rdma_transport *rdma_hndl) +{ + return rdma_hndl->initial_pool_cls.task_get( + rdma_hndl->initial_pool_cls.pool, rdma_hndl); +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_primary_task_alloc */ +/*---------------------------------------------------------------------------*/ +struct xio_task *xio_rdma_primary_task_alloc( + struct xio_rdma_transport *rdma_hndl) +{ + return rdma_hndl->primary_pool_cls.task_get( + rdma_hndl->primary_pool_cls.pool, rdma_hndl); +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_primary_task_lookup */ +/*---------------------------------------------------------------------------*/ +struct xio_task *xio_rdma_primary_task_lookup( + struct xio_rdma_transport *rdma_hndl, + int tid) +{ + if (rdma_hndl->primary_pool_cls.task_lookup) + return rdma_hndl->primary_pool_cls.task_lookup( + rdma_hndl->primary_pool_cls.pool, tid); + 
return NULL; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_task_free */ +/*---------------------------------------------------------------------------*/ +inline void xio_rdma_task_free(struct xio_rdma_transport *rdma_hndl, + struct xio_task *task) +{ + if (rdma_hndl->primary_pool_cls.task_put) + return rdma_hndl->primary_pool_cls.task_put(task); +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_initial_pool_post_create */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_initial_pool_post_create( + struct xio_transport_base *transport_hndl, + void *pool, void *pool_dd_data) +{ + struct xio_task *task; + struct xio_rdma_transport *rdma_hndl = + (struct xio_rdma_transport *)transport_hndl; + struct xio_rdma_tasks_pool *rdma_pool = + (struct xio_rdma_tasks_pool *)pool_dd_data; + struct xio_rdma_task *rdma_task; + int retval; + + if (!rdma_hndl) + return 0; + + rdma_hndl->initial_pool_cls.pool = pool; + rdma_pool->dev = rdma_hndl->dev; + + task = xio_rdma_initial_task_alloc(rdma_hndl); + if (!task) { + ERROR_LOG("failed to get task\n"); + } else { + DEBUG_LOG("post_recv conn_setup rx task:%p\n", task); + rdma_task = (struct xio_rdma_task *)task->dd_data; + if (xio_map_rx_work_req(rdma_hndl->dev, &rdma_task->rxd)) { + ERROR_LOG("DMA map from device failed\n"); + return -1; + } + + /* set the lkey prior to receiving */ + rdma_task->rxd.recv_wr.sg_list[0].lkey = rdma_hndl->dev->mr->lkey; + + retval = xio_post_recv(rdma_hndl, task, 1); + if (retval) + ERROR_LOG("xio_post_recv failed\n"); + + /* assuming that both sides posted one recv wr for initial + * negotiation + */ + rdma_hndl->peer_credits = 1; + rdma_hndl->sim_peer_credits = 1; + + rdma_task->out_ib_op = XIO_IB_RECV; + list_add_tail(&task->tasks_list_entry, &rdma_hndl->rx_list); + } + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_task_pre_put */ +/*---------------------------------------------------------------------------*/ +int xio_rdma_task_pre_put(struct xio_transport_base *trans_hndl, + struct xio_task *task) +{ + XIO_TO_RDMA_TASK(task, rdma_task); + struct xio_rdma_transport *rdma_hndl = + (struct xio_rdma_transport *)trans_hndl; + struct xio_device *dev; + + dev = rdma_hndl->dev; + + /* Unmap before releasing */ + + if (rdma_task->rxd.mapped) + xio_unmap_rx_work_req(dev, &rdma_task->rxd); + + if (rdma_task->txd.mapped) + xio_unmap_tx_work_req(dev, &rdma_task->txd); + + if (rdma_task->rdmad.mapped) { + if (rdma_task->out_ib_op == XIO_IB_RDMA_WRITE) + xio_unmap_txmad_work_req(dev, &rdma_task->rdmad); + else + xio_unmap_rxmad_work_req(dev, &rdma_task->rdmad); + } + + if (rdma_task->in_ib_op != XIO_IB_SEND) { + if (rdma_task->read_mem_desc.nents && + rdma_task->read_mem_desc.mapped) + xio_unmap_desc(rdma_hndl, + &rdma_task->read_mem_desc, + DMA_FROM_DEVICE); + } + + if (rdma_task->out_ib_op != XIO_IB_SEND) { + if (rdma_task->write_mem_desc.nents && + rdma_task->write_mem_desc.mapped) + xio_unmap_desc(rdma_hndl, + &rdma_task->write_mem_desc, + DMA_TO_DEVICE); + } + + /* recycle RDMA buffers back to pool */ + + /* put buffers back to pool */ + xio_mempool_free(&rdma_task->read_mem_desc); + rdma_task->read_num_mem_desc = 0; + + xio_mempool_free(&rdma_task->write_mem_desc); + rdma_task->write_num_mem_desc = 0; + /* + rdma_task->req_write_num_mem_desc = 0; + rdma_task->rsp_write_num_mem_desc = 0; + 
rdma_task->req_read_num_mem_desc = 0; + rdma_task->req_recv_num_sge = 0; + + rdma_task->txd.send_wr.num_sge = 1; + rdma_task->out_ib_op = XIO_IB_NULL; + rdma_task->phantom_idx = 0; + rdma_task->sn = 0; + */ + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_initial_pool_slab_destroy */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_initial_pool_slab_destroy( + struct xio_transport_base *transport_hndl, + void *pool_dd_data, void *slab_dd_data) +{ + struct xio_rdma_tasks_slab *rdma_slab = + (struct xio_rdma_tasks_slab *)slab_dd_data; + + DEBUG_LOG("kcache(%s) freed\n", rdma_slab->name); + + if (rdma_slab->count) + ERROR_LOG("pool(%s) not-free(%d)\n", + rdma_slab->name, rdma_slab->count); + + kmem_cache_destroy(rdma_slab->data_pool); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_pool_slab_uninit_task */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_pool_slab_uninit_task(struct xio_transport_base *trans_hndl, + void *pool_dd_data, + void *slab_dd_data, + struct xio_task *task) +{ + XIO_TO_RDMA_TASK(task, rdma_task); + struct xio_rdma_transport *rdma_hndl = + (struct xio_rdma_transport *)trans_hndl; + struct xio_rdma_tasks_pool *rdma_pool = + (struct xio_rdma_tasks_pool *)pool_dd_data; + struct xio_rdma_tasks_slab *rdma_slab = + (struct xio_rdma_tasks_slab *)slab_dd_data; + struct xio_device *dev; + + dev = rdma_pool->dev; + if (!dev) + return 0; + + if (!dev->ib_dev) { + ERROR_LOG("ib_dev not set\n"); + return -1; + } + + if (rdma_task->rxd.mapped) + xio_unmap_rx_work_req(dev, &rdma_task->rxd); + + if (rdma_task->txd.mapped) + xio_unmap_tx_work_req(dev, &rdma_task->txd); + + if (rdma_task->rdmad.mapped) { + if (rdma_task->out_ib_op == XIO_IB_RDMA_WRITE) + xio_unmap_txmad_work_req(dev, &rdma_task->rdmad); + else + xio_unmap_rxmad_work_req(dev, &rdma_task->rdmad); + } + + if (rdma_task->read_mem_desc.nents && rdma_task->read_mem_desc.mapped) + xio_unmap_desc(rdma_hndl, &rdma_task->read_mem_desc, + DMA_FROM_DEVICE); + + if (rdma_task->write_mem_desc.nents && rdma_task->write_mem_desc.mapped) + xio_unmap_desc(rdma_hndl, &rdma_task->write_mem_desc, + DMA_TO_DEVICE); + + if (rdma_task->rdmad.sgt.sgl) + sg_free_table(&rdma_task->rdmad.sgt); +#if 0 + if (rdma_task->write_mem_desc.sgt.sgl) + sg_free_table(&rdma_task->write_mem_desc.sgt); + + if (rdma_task->read_mem_desc.sgt.sgl) + sg_free_table(&rdma_task->read_mem_desc.sgt); +#endif + /* Phantom tasks have no buffer */ + if (rdma_task->buf) { + if (rdma_slab->count) + rdma_slab->count--; + else + ERROR_LOG("pool(%s) double free?\n", rdma_slab->name); + + kmem_cache_free(rdma_slab->data_pool, rdma_task->buf); + rdma_task->buf = NULL; + } + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_initial_pool_slab_init_task */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_initial_pool_slab_init_task( + struct xio_transport_base *transport_hndl, + void *pool_dd_data, void *slab_dd_data, + int tid, struct xio_task *task) +{ + XIO_TO_RDMA_TASK(task, rdma_task); + struct xio_rdma_transport *rdma_hndl = + (struct xio_rdma_transport *)transport_hndl; + struct xio_rdma_tasks_slab *rdma_slab = + (struct xio_rdma_tasks_slab *)slab_dd_data; + void *buf; + char *ptr; + + if (!rdma_hndl || rdma_task->buf) + return 0; + + 
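/*
 * The per-task private area carved up below is sized by
 * xio_rdma_initial_pool_get_params() as
 * sizeof(struct xio_rdma_task) + 2 * sizeof(struct ib_sge), i.e.:
 *
 *	[ struct xio_rdma_task ][ ib_sge for txd ][ ib_sge for rxd ]
 *
 * The code walks a char pointer across this block to set txd.sge and
 * rxd.sge; the two places must stay in sync if the layout ever changes.
 */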
/* fill xio_rdma_task */ + ptr = (char *)rdma_task; + ptr += sizeof(struct xio_rdma_task); + + /* fill xio_work_req */ + rdma_task->txd.sge = (void *)ptr; + ptr += sizeof(struct ib_sge); + + rdma_task->rxd.sge = (void *)ptr; + ptr += sizeof(struct ib_sge); + /*****************************************/ + + buf = kmem_cache_zalloc(rdma_slab->data_pool, GFP_KERNEL); + if (!buf) { + xio_set_error(ENOMEM); + ERROR_LOG("kmem_cache_zalloc(initial_pool)\n"); + return -ENOMEM; + } + rdma_slab->count++; + + return xio_rdma_task_init(task, + rdma_hndl, + buf, + rdma_slab->buf_size, + rdma_hndl->dev->mr, + 1, /* txd_nr */ + 1, /* rxd_nr */ + 0); /* rdmad_nr */ +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_initial_pool_get_params */ +/*---------------------------------------------------------------------------*/ +static void xio_rdma_initial_pool_get_params( + struct xio_transport_base *transport_hndl, + int *start_nr, int *max_nr, int *alloc_nr, + int *pool_dd_sz, int *slab_dd_sz, int *task_dd_sz) +{ + *start_nr = 10 * NUM_CONN_SETUP_TASKS; + *alloc_nr = 10 * NUM_CONN_SETUP_TASKS; + *max_nr = 10 * NUM_CONN_SETUP_TASKS; + + *pool_dd_sz = sizeof(struct xio_rdma_tasks_pool); + *slab_dd_sz = sizeof(struct xio_rdma_tasks_slab); + *task_dd_sz = sizeof(struct xio_rdma_task) + + 2 * sizeof(struct ib_sge); +} + +static struct xio_tasks_pool_ops initial_tasks_pool_ops = { + .pool_get_params = xio_rdma_initial_pool_get_params, + .slab_pre_create = xio_rdma_initial_pool_slab_pre_create, + .slab_destroy = xio_rdma_initial_pool_slab_destroy, + .slab_init_task = xio_rdma_initial_pool_slab_init_task, + .slab_uninit_task = xio_rdma_pool_slab_uninit_task, + .pool_post_create = xio_rdma_initial_pool_post_create +}; + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_phantom_pool_slab_init_task */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_phantom_pool_slab_init_task( + struct xio_transport_base *transport_hndl, + void *pool_dd_data, void *slab_dd_data, + int tid, struct xio_task *task) +{ + struct xio_rdma_transport *rdma_hndl = + (struct xio_rdma_transport *)transport_hndl; + int max_iovsz = max(rdma_options.max_out_iovsz, + rdma_options.max_in_iovsz) + 1; + int max_sge = min(rdma_hndl->max_sge, max_iovsz); + char *ptr; + + XIO_TO_RDMA_TASK(task, rdma_task); + + /* fill xio_rdma_task */ + ptr = (char *)rdma_task; + ptr += sizeof(struct xio_rdma_task); + + /* fill xio_work_req */ + rdma_task->rdmad.sge = (void *)ptr; + ptr += rdma_hndl->max_sge * sizeof(struct ib_sge); + /*****************************************/ + + rdma_task->out_ib_op = 0x200; + xio_rdma_task_init( + task, + rdma_hndl, + NULL, + 0, + NULL, + 0, /* txd_nr */ + 0, /* rxd_nr */ + max_sge); /* rdmad_nr */ + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_phantom_pool_post_create */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_phantom_pool_post_create( + struct xio_transport_base *transport_hndl, + void *pool, void *pool_dd_data) +{ + struct xio_rdma_transport *rdma_hndl = + (struct xio_rdma_transport *)transport_hndl; + struct xio_rdma_tasks_pool *rdma_pool = + (struct xio_rdma_tasks_pool *)pool_dd_data; + + if (!rdma_hndl) + return 0; + + rdma_pool->dev = rdma_hndl->dev; + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* 
xio_rdma_phantom_pool_create */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_phantom_pool_create(struct xio_rdma_transport *rdma_hndl) +{ + struct xio_tasks_pool_params params; + + memset(¶ms, 0, sizeof(params)); + + params.start_nr = NUM_START_PHANTOM_POOL_TASKS; + params.max_nr = NUM_MAX_PHANTOM_POOL_TASKS; + params.alloc_nr = NUM_ALLOC_PHANTOM_POOL_TASKS; + params.pool_dd_data_sz = sizeof(struct xio_rdma_tasks_pool); + params.slab_dd_data_sz = sizeof(struct xio_rdma_tasks_slab); + params.task_dd_data_sz = sizeof(struct xio_rdma_task) + + rdma_hndl->max_sge * + sizeof(struct ib_sge); + + params.pool_hooks.context = rdma_hndl; + params.pool_hooks.slab_init_task = + (void *)xio_rdma_phantom_pool_slab_init_task; + params.pool_hooks.slab_uninit_task = + (void *)xio_rdma_pool_slab_uninit_task; + params.pool_hooks.task_pre_put = + (void *)xio_rdma_task_pre_put; + + params.pool_hooks.pool_post_create = + (void *)xio_rdma_phantom_pool_post_create; + + /* initialize the tasks pool */ + rdma_hndl->phantom_tasks_pool = xio_tasks_pool_create(¶ms); + if (!rdma_hndl->phantom_tasks_pool) { + ERROR_LOG("xio_tasks_pool_create failed\n"); + goto cleanup; + } + + return 0; + +cleanup: + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_phantom_pool_destroy */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_phantom_pool_destroy(struct xio_rdma_transport *rdma_hndl) +{ + if (!rdma_hndl->phantom_tasks_pool) + return -1; + + xio_tasks_pool_destroy(rdma_hndl->phantom_tasks_pool); + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_primary_pool_slab_pre_create */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_primary_pool_slab_pre_create( + struct xio_transport_base *transport_hndl, + int alloc_nr, void *pool_dd_data, void *slab_dd_data) +{ + struct xio_rdma_tasks_slab *rdma_slab = + (struct xio_rdma_tasks_slab *)slab_dd_data; + size_t inline_buf_sz = xio_rdma_get_inline_buffer_size(); + + rdma_slab->buf_size = inline_buf_sz; + /* The name must be valid until the pool is destroyed + * Use the address of the pool structure to create a unique + * name for the pool + */ + sprintf(rdma_slab->name, "primary_pool-%p", rdma_slab); + rdma_slab->data_pool = kmem_cache_create(rdma_slab->name, + rdma_slab->buf_size, PAGE_SIZE, + SLAB_HWCACHE_ALIGN, NULL); + if (!rdma_slab->data_pool) { + xio_set_error(ENOMEM); + ERROR_LOG("kcache(primary_pool) creation failed\n"); + return -1; + } + DEBUG_LOG("kcache(%s) created(%p)\n", + rdma_slab->name, rdma_slab->data_pool); + + DEBUG_LOG("pool buf:%p\n", rdma_slab->data_pool); + rdma_slab->count = 0; + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_primary_pool_post_create */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_primary_pool_post_create( + struct xio_transport_base *transport_hndl, + void *pool, void *pool_dd_data) +{ + struct xio_rdma_transport *rdma_hndl = + (struct xio_rdma_transport *)transport_hndl; + struct xio_rdma_tasks_pool *rdma_pool = + (struct xio_rdma_tasks_pool *)pool_dd_data; + + if (!rdma_hndl) + return 0; + + rdma_hndl->primary_pool_cls.pool = pool; + rdma_pool->dev = rdma_hndl->dev; + + /* tasks may require fast registration for RDMA read and write */ + if 
(rdma_hndl->dev->fastreg.alloc_rdma_reg_res(rdma_hndl)) { + xio_set_error(ENOMEM); + ERROR_LOG("fast reg init failed\n"); + return -1; + } + + xio_rdma_rearm_rq(rdma_hndl); + + /* late creation */ + xio_rdma_phantom_pool_create(rdma_hndl); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_primary_pool_slab_destroy */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_primary_pool_slab_destroy( + struct xio_transport_base *transport_hndl, + void *pool_dd_data, void *slab_dd_data) +{ + struct xio_rdma_tasks_slab *rdma_slab = + (struct xio_rdma_tasks_slab *)slab_dd_data; + + DEBUG_LOG("kcache(%s) freed\n", rdma_slab->name); + + + if (rdma_slab->count) + ERROR_LOG("pool(%s) not-free(%d)\n", + rdma_slab->name, rdma_slab->count); + + kmem_cache_destroy(rdma_slab->data_pool); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_primary_pool_slab_remap_task */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_primary_pool_slab_remap_task( + struct xio_transport_base *old_th, + struct xio_transport_base *new_th, + void *pool_dd_data, + void *slab_dd_data, + struct xio_task *task) +{ + XIO_TO_RDMA_TASK(task, rdma_task); + struct xio_rdma_transport *old_hndl = + (struct xio_rdma_transport *)old_th; + struct xio_rdma_transport *new_hndl = + (struct xio_rdma_transport *)new_th; + struct xio_device *old_dev = old_hndl->dev; + struct xio_device *new_dev = new_hndl->dev; + struct xio_rkey_tbl *te; + + task->context = new_th; + + /* if the same device is used then there is no need to remap */ + if (old_dev && old_dev == new_dev) + return 0; + + xio_rdma_task_reinit(task, new_hndl, new_dev->mr); + + if (!new_hndl->rkey_tbl) { + /* one for each possible desc and one for device mr */ + new_hndl->rkey_tbl = kcalloc(2 * old_hndl->num_tasks + 1, + sizeof(struct xio_rkey_tbl), + GFP_KERNEL); + if (!new_hndl->rkey_tbl) + return -ENOMEM; + } + + if (rdma_task->rxd.mapped) { + if (xio_remap_work_req(old_dev, new_dev, &rdma_task->rxd, + DMA_FROM_DEVICE)) { + ERROR_LOG("DMA re-map failed\n"); + return -1; + } + } + + if (rdma_task->txd.mapped) { + if (xio_remap_work_req(old_dev, new_dev, &rdma_task->txd, + DMA_TO_DEVICE)) { + ERROR_LOG("DMA re-map failed\n"); + return -1; + } + } + + if (rdma_task->rdmad.mapped) { + enum dma_data_direction direction = + (rdma_task->out_ib_op == XIO_IB_RDMA_WRITE) ? 
+ DMA_TO_DEVICE : DMA_FROM_DEVICE; + if (xio_remap_work_req(old_dev, new_dev, &rdma_task->rdmad, + direction)) { + ERROR_LOG("DMA re-map to/from device failed\n"); + return -1; + } + } + + if (rdma_task->read_mem_desc.nents && rdma_task->read_mem_desc.mapped) { + int used_fast; + unsigned int sqe_used = 0; + /* was FRWR/FMR in use */ + if (rdma_task->read_mem_desc.mem_reg.mem_h) { + te = &new_hndl->rkey_tbl[new_hndl->rkey_tbl_size]; + te->old_rkey = rdma_task->read_mem_desc.mem_reg.rkey; + used_fast = 1; + } else { + used_fast = 0; + } + xio_remap_desc(old_hndl, new_hndl, &rdma_task->read_mem_desc, + DMA_FROM_DEVICE, &sqe_used); + rdma_task->sqe_used += sqe_used; + if (used_fast) { + if (!rdma_task->read_mem_desc.mem_reg.mem_h) { + ERROR_LOG("Fast re-reg from device failed\n"); + return -1; + } + te->new_rkey = rdma_task->read_mem_desc.mem_reg.rkey; + new_hndl->rkey_tbl_size++; + } + } + + if (rdma_task->write_mem_desc.nents && + rdma_task->write_mem_desc.mapped) { + int used_fast; + unsigned int sqe_used = 0; + /* was FRWR/FMR in use */ + if (rdma_task->write_mem_desc.mem_reg.mem_h) { + te = &new_hndl->rkey_tbl[new_hndl->rkey_tbl_size]; + te->old_rkey = rdma_task->write_mem_desc.mem_reg.rkey; + used_fast = 1; + } else { + used_fast = 0; + } + xio_remap_desc(old_hndl, new_hndl, &rdma_task->write_mem_desc, + DMA_TO_DEVICE, &sqe_used); + rdma_task->sqe_used += sqe_used; + if (used_fast) { + if (!rdma_task->write_mem_desc.mem_reg.mem_h) { + ERROR_LOG("Fast re-reg tom device failed\n"); + return -1; + } + te->new_rkey = rdma_task->write_mem_desc.mem_reg.rkey; + new_hndl->rkey_tbl_size++; + } + } + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_primary_pool_slab_init_task */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_primary_pool_slab_init_task( + struct xio_transport_base *t_hndl, + void *pool_dd_data, void *slab_dd_data, + int tid, + struct xio_task *task) +{ + struct xio_rdma_transport *rdma_hndl = + (struct xio_rdma_transport *)t_hndl; + struct xio_rdma_tasks_slab *rdma_slab = + (struct xio_rdma_tasks_slab *)slab_dd_data; + XIO_TO_RDMA_TASK(task, rdma_task); + int max_iovsz = max(rdma_options.max_out_iovsz, + rdma_options.max_in_iovsz) + 1; + int max_sge = min(rdma_hndl->max_sge, max_iovsz); + void *buf; + char *ptr; + + if (rdma_task->buf) + return 0; + + /* fill xio_rdma_task */ + ptr = (char *)rdma_task; + ptr += sizeof(struct xio_rdma_task); + + /* fill xio_work_req */ + rdma_task->txd.sge = (void *)ptr; + ptr += max_sge * sizeof(struct ib_sge); + rdma_task->rxd.sge = (void *)ptr; + ptr += sizeof(struct ib_sge); + rdma_task->rdmad.sge = (void *)ptr; + ptr += max_sge * sizeof(struct ib_sge); + + rdma_task->read_mem_desc.mp_sge = (void *)ptr; + ptr += max_iovsz * sizeof(struct xio_mp_mem); + + rdma_task->write_mem_desc.mp_sge = (void *)ptr; + ptr += max_iovsz * sizeof(struct xio_mp_mem); + + rdma_task->req_in_sge = (void *)ptr; + ptr += max_iovsz * sizeof(struct xio_sge); + rdma_task->req_out_sge = (void *)ptr; + ptr += max_iovsz * sizeof(struct xio_sge); + rdma_task->rsp_out_sge = (void *)ptr; + ptr += max_iovsz * sizeof(struct xio_sge); + /*****************************************/ + +#if 0 + if (sg_alloc_table(&rdma_task->read_mem_desc.sgt, + max_iovsz, GFP_KERNEL)) { + ERROR_LOG("sg_alloc_table(read_mem_desc)\n"); + goto cleanup0; + } + + if (sg_alloc_table(&rdma_task->write_mem_desc.sgt, + max_iovsz, GFP_KERNEL)) { + ERROR_LOG("sg_alloc_table(write_mem_desc)\n"); 
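/*
 * Note: this sg_alloc_table() block is compiled out (#if 0); the read/write
 * descriptors appear to rely on the xio_mp_mem arrays (mp_sge) assigned
 * above instead, and the matching sg_free_table() calls in
 * xio_rdma_pool_slab_uninit_task() are under #if 0 as well. The pointer
 * carving above must also stay consistent with *task_dd_sz as computed in
 * xio_rdma_primary_pool_get_params():
 *
 *	sizeof(struct xio_rdma_task) +
 *	(max_sge + 1 + max_sge) * sizeof(struct ib_sge) +
 *	2 * max_iovsz * sizeof(struct xio_mp_mem) +
 *	3 * max_iovsz * sizeof(struct xio_sge)
 */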
+ goto cleanup1; + } +#endif + + rdma_task->out_ib_op = 0x200; + + buf = kmem_cache_zalloc(rdma_slab->data_pool, GFP_KERNEL); + if (!buf) { + ERROR_LOG("kmem_cache_zalloc(primary_pool)\n"); + goto cleanup2; + } + rdma_slab->count++; + + xio_rdma_task_init(task, + rdma_hndl, + buf, + rdma_slab->buf_size, + rdma_hndl->dev->mr, + max_sge, /* txd_nr */ + 1, /* rxd_nr */ + max_sge); /* rdmad_nr */ + + return 0; + +cleanup2: +#if 0 + sg_free_table(&rdma_task->write_mem_desc.sgt); +cleanup1: + sg_free_table(&rdma_task->read_mem_desc.sgt); +cleanup0: +#endif + xio_set_error(ENOMEM); + return -ENOMEM; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_primary_pool_get_params */ +/*---------------------------------------------------------------------------*/ +static void xio_rdma_primary_pool_get_params( + struct xio_transport_base *transport_hndl, + int *start_nr, int *max_nr, int *alloc_nr, + int *pool_dd_sz, int *slab_dd_sz, int *task_dd_sz) +{ + struct xio_rdma_transport *rdma_hndl = + (struct xio_rdma_transport *)transport_hndl; + int max_iovsz = max(rdma_options.max_out_iovsz, + rdma_options.max_in_iovsz) + 1; + int max_sge; + int queued_nr; + + + if (rdma_hndl) + max_sge = min(rdma_hndl->max_sge, max_iovsz); + else + max_sge = min(XIO_DEV_ATTR_MAX_SGE, max_iovsz); + + queued_nr = g_poptions->snd_queue_depth_msgs + + g_poptions->rcv_queue_depth_msgs + + MAX_CQE_PER_QP; /* for ibv_post_recv */ + + if (rdma_hndl) + *start_nr = rdma_hndl->rq_depth + EXTRA_RQE + SEND_QE; + else + *start_nr = NUM_START_PRIMARY_POOL_TASKS; + + *alloc_nr = NUM_ALLOC_PRIMARY_POOL_TASKS; + *max_nr = max(queued_nr, *start_nr); + + *pool_dd_sz = sizeof(struct xio_rdma_tasks_pool); + *slab_dd_sz = sizeof(struct xio_rdma_tasks_slab); + *task_dd_sz = sizeof(struct xio_rdma_task) + + (max_sge + 1 + max_sge) * sizeof(struct ib_sge) + + 2 * max_iovsz * sizeof(struct xio_mp_mem) + + 3 * max_iovsz * sizeof(struct xio_sge); +} + +static struct xio_tasks_pool_ops primary_tasks_pool_ops = { + .pool_get_params = xio_rdma_primary_pool_get_params, + .slab_pre_create = xio_rdma_primary_pool_slab_pre_create, + .slab_destroy = xio_rdma_primary_pool_slab_destroy, + .slab_init_task = xio_rdma_primary_pool_slab_init_task, + .slab_uninit_task = xio_rdma_pool_slab_uninit_task, + .slab_remap_task = xio_rdma_primary_pool_slab_remap_task, + .pool_post_create = xio_rdma_primary_pool_post_create, + .task_pre_put = xio_rdma_task_pre_put, +}; + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_post_close */ +/*---------------------------------------------------------------------------*/ +static void xio_rdma_post_close(struct xio_transport_base *trans_base) +{ + struct xio_rdma_transport *rdma_hndl = + (struct xio_rdma_transport *)trans_base; + + if (rdma_hndl->handler_nesting) { + rdma_hndl->state = XIO_TRANSPORT_STATE_DESTROYED; + return; + } + + TRACE_LOG("rdma transport: [post_close] handle:%p, qp:%p\n", + rdma_hndl, rdma_hndl->qp); + + xio_observable_unreg_all_observers(&trans_base->observable); + + if (rdma_hndl->dev) + rdma_hndl->dev->fastreg.free_rdma_reg_res(rdma_hndl); + + xio_rdma_phantom_pool_destroy(rdma_hndl); + + xio_qp_release(rdma_hndl); + /* Don't call rdma_destroy_id from event handler. 
see comment in + * xio_handle_cm_event + */ + if (rdma_hndl->cm_id) { + TRACE_LOG("call rdma_destroy_id\n"); + rdma_destroy_id(rdma_hndl->cm_id); + rdma_hndl->cm_id = NULL; + } + + xio_context_destroy_resume(rdma_hndl->base.ctx); + + kfree(rdma_hndl->rkey_tbl); + rdma_hndl->rkey_tbl = NULL; + + kfree(rdma_hndl->peer_rkey_tbl); + rdma_hndl->peer_rkey_tbl = NULL; + + kfree(trans_base->portal_uri); + trans_base->portal_uri = NULL; + + XIO_OBSERVABLE_DESTROY(&rdma_hndl->base.observable); + /* last chance to flush all tasks */ + xio_rdma_flush_all_tasks(rdma_hndl); + + kfree(rdma_hndl); +} + +/*---------------------------------------------------------------------------*/ +/* on_cm_addr_resolved */ +/*---------------------------------------------------------------------------*/ +static void on_cm_addr_resolved(struct rdma_cm_event *ev, + struct xio_rdma_transport *rdma_hndl) +{ + int retval = 0; + struct xio_device **xio_devs; + struct xio_device *dev; + + /* Find the device on which the connection was established */ + xio_devs = ib_get_client_data(rdma_hndl->cm_id->device, &xio_client); + if (!(xio_devs && xio_devs[rdma_hndl->cm_id->port_num])) { + ERROR_LOG("device(%s) port(%d) not registered\n", + rdma_hndl->cm_id->device->name, + rdma_hndl->cm_id->port_num); + xio_set_error(ENODEV); + goto notify_err0; + } + + dev = xio_devs[rdma_hndl->cm_id->port_num]; + /* increment device reference count */ + xio_device_get(dev); + rdma_hndl->dev = dev; + + if (test_bits(XIO_TRANSPORT_ATTR_TOS, &rdma_hndl->trans_attr_mask)) { + rdma_set_service_type(rdma_hndl->cm_id, + rdma_hndl->trans_attr.tos); + DEBUG_LOG("set TOS option success. mask:0x%x, tos:0x%x\n", + rdma_hndl->trans_attr_mask, + rdma_hndl->trans_attr.tos); + } + + retval = rdma_resolve_route(rdma_hndl->cm_id, ROUTE_RESOLVE_TIMEOUT); + if (retval) { + xio_set_error(retval); + ERROR_LOG("rdma_resolve_route failed. (err=%d)\n", retval); + goto notify_err1; + } + + return; + +notify_err1: + xio_device_put(dev); +notify_err0: + xio_transport_notify_observer_error(&rdma_hndl->base, xio_errno()); +} + +/*---------------------------------------------------------------------------*/ +/* on_cm_route_resolved (client) */ +/*---------------------------------------------------------------------------*/ +static void on_cm_route_resolved(struct rdma_cm_id *cm_id, + struct rdma_cm_event *ev, + struct xio_rdma_transport *rdma_hndl) +{ + struct rdma_conn_param cm_params = { + .initiator_depth = 1, + .responder_resources = 1, + .rnr_retry_count = 3, /* 7 - infinite retry */ + .retry_count = 3 + }; + int retval = 0; + + retval = xio_qp_create(rdma_hndl); + if (retval != 0) { + ERROR_LOG("internal logic error in create_endpoint\n"); + goto notify_err0; + } + + /* + * When choosing the responder resources for a ULP, it is usually + * best to use the maximum value of the HCA. If the other side is + * not going to use RDMA read, then it should zero out the + * initiator_depth in the REP, which will zero out the local + * responder_resources when we program the QP. Generally, the + * initiator_depth should be either set to 0 or + * min(max_qp_rd_atom, max_send_wr). Use 0 if RDMA read is + * never going to be sent from this side. 
+ */ + cm_params.responder_resources = + rdma_hndl->tcq->dev->device_attr.max_qp_rd_atom; + cm_params.initiator_depth = + rdma_hndl->tcq->dev->device_attr.max_qp_init_rd_atom; + + /* connect to peer */ + retval = rdma_connect(rdma_hndl->cm_id, &cm_params); + if (retval != 0) { + xio_set_error(ENOMEM); + ERROR_LOG("rdma_connect failed.\n"); + goto notify_err1; + } + rdma_hndl->client_responder_resources = cm_params.responder_resources; + rdma_hndl->client_initiator_depth = cm_params.initiator_depth; + rdma_hndl->state = XIO_TRANSPORT_STATE_CONNECTING; + + return; + +notify_err1: + xio_qp_release(rdma_hndl); +notify_err0: + xio_transport_notify_observer_error(&rdma_hndl->base, xio_errno()); +} + +/*---------------------------------------------------------------------------*/ +/* on_cm_connect_request (server) */ +/*---------------------------------------------------------------------------*/ +static void on_cm_connect_request(struct rdma_cm_id *cm_id, + struct rdma_cm_event *ev, + struct xio_rdma_transport *parent_hndl) +{ + struct xio_rdma_transport *child_hndl; + union xio_transport_event_data event_data; + struct xio_device **xio_devs; + struct xio_device *dev; + int retval = 0; + + /* Find the device on which the connection was established */ + xio_devs = ib_get_client_data(cm_id->device, &xio_client); + if (!(xio_devs && xio_devs[cm_id->port_num])) { + ERROR_LOG("device(%s) port(%d) not registered\n", + cm_id->device->name, + cm_id->port_num); + xio_set_error(ENODEV); + retval = rdma_reject(cm_id, NULL, 0); + if (retval) { + xio_set_error(retval); + ERROR_LOG("rdma_reject failed. (err=%d %m)\n", retval); + } + goto notify_err1; + } + + child_hndl = (struct xio_rdma_transport *)xio_rdma_open( + parent_hndl->transport, + parent_hndl->base.ctx, + NULL, 0, NULL); + if (!child_hndl) { + ERROR_LOG("failed to open rdma transport\n"); + retval = rdma_reject(cm_id, NULL, 0); + if (retval) { + xio_set_error(retval); + ERROR_LOG("rdma_reject failed. (err=%d %m)\n", + retval); + } + goto notify_err1; + } + child_hndl->state = XIO_TRANSPORT_STATE_CONNECTING; + + dev = xio_devs[cm_id->port_num]; + /* increment device reference count */ + xio_device_get(dev); + + child_hndl->dev = dev; + child_hndl->cm_id = cm_id; + child_hndl->state = XIO_TRANSPORT_STATE_CONNECTING; + + /* Parent handle i.e. listener doesn't have a CQ */ + child_hndl->tcq = NULL; + + /* Can we set it ? 
is it a new cm_id */ + cm_id->context = child_hndl; + child_hndl->client_initiator_depth = + ev->param.conn.initiator_depth; + child_hndl->client_responder_resources = + ev->param.conn.responder_resources; + + /* initiator is dst, target is src */ + memcpy(&child_hndl->base.peer_addr, + &child_hndl->cm_id->route.addr.dst_addr, + sizeof(child_hndl->base.peer_addr)); + memcpy(&child_hndl->base.local_addr, + &child_hndl->cm_id->route.addr.src_addr, + sizeof(child_hndl->base.local_addr)); + child_hndl->base.proto = XIO_PROTO_RDMA; + + retval = xio_qp_create(child_hndl); + if (retval != 0) { + ERROR_LOG("failed to setup qp\n"); + xio_rdma_reject((struct xio_transport_base *)child_hndl); + goto notify_err2; + } + + event_data.new_connection.child_trans_hndl = + (struct xio_transport_base *)child_hndl; + xio_transport_notify_observer(&parent_hndl->base, + XIO_TRANSPORT_EVENT_NEW_CONNECTION, + &event_data); + + return; + +notify_err2: + xio_rdma_close((struct xio_transport_base *)child_hndl); + xio_device_put(dev); + +notify_err1: + xio_transport_notify_observer_error(&parent_hndl->base, xio_errno()); +} + +/*---------------------------------------------------------------------------*/ +/* on_cm_refused */ +/*---------------------------------------------------------------------------*/ +static void on_cm_refused(struct rdma_cm_event *ev, + struct xio_rdma_transport *rdma_hndl) +{ + TRACE_LOG("on_cm_refused. rdma_hndl:%p, reason:%s\n", + rdma_hndl, xio_cm_rej_reason_str(ev->status)); + /* we get CM_ESTABLISHED and afterward we get cm_refused. It looks like + * cm state machine error. + */ + if (rdma_hndl->state == XIO_TRANSPORT_STATE_CONNECTED) { + /* one for beacon */ + kref_put(&rdma_hndl->base.kref, xio_rdma_close_cb); + /* one for timedwait_exit */ + kref_put(&rdma_hndl->base.kref, xio_rdma_close_cb); + rdma_hndl->state = XIO_TRANSPORT_STATE_ERROR; + } + xio_transport_notify_observer(&rdma_hndl->base, + XIO_TRANSPORT_EVENT_REFUSED, NULL); +} + +/*---------------------------------------------------------------------------*/ +/* on_cm_established */ +/*---------------------------------------------------------------------------*/ +static void on_cm_established(struct rdma_cm_event *ev, + struct xio_rdma_transport *rdma_hndl) +{ + /* initiator is dst, target is src */ + memcpy(&rdma_hndl->base.peer_addr, + &rdma_hndl->cm_id->route.addr.dst_addr, + sizeof(rdma_hndl->base.peer_addr)); + memcpy(&rdma_hndl->base.local_addr, + &rdma_hndl->cm_id->route.addr.src_addr, + sizeof(rdma_hndl->base.local_addr)); + + rdma_hndl->state = XIO_TRANSPORT_STATE_CONNECTED; + + /* one for beacon */ + kref_get(&rdma_hndl->base.kref); + /* one for timedwait_exit */ + kref_get(&rdma_hndl->base.kref); + + xio_transport_notify_observer(&rdma_hndl->base, + XIO_TRANSPORT_EVENT_ESTABLISHED, NULL); +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_disconnect */ +/*---------------------------------------------------------------------------*/ +int xio_rdma_disconnect(struct xio_rdma_transport *rdma_hndl, int send_beacon) +{ + struct ib_send_wr *bad_wr; + int retval; + + retval = rdma_disconnect(rdma_hndl->cm_id); + if (retval) { + ERROR_LOG("rdma_hndl:%p rdma_disconnect failed, %m\n", + rdma_hndl); + return -1; + } + + if (!send_beacon) + return 0; + + /* post an indication that all flush errors were consumed */ + retval = ib_post_send(rdma_hndl->qp, &rdma_hndl->beacon, &bad_wr); + if (retval == -ENOTCONN) { + /* softiwarp returns ENOTCONN right away if the QP is not + in RTS state. 
*/ + WARN_LOG("rdma_hndl %p failed to post beacon - " \ + "ignored because the QP is not in RTS state.\n", + rdma_hndl); + /* for beacon */ + kref_put(&rdma_hndl->base.kref, xio_rdma_close_cb); + } else if (retval) { + ERROR_LOG("rdma_hndl %p failed to post beacon (%d)\n", + rdma_hndl, retval); + return -1; + } else + rdma_hndl->beacon_sent = 1; + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* on_cm_disconnected */ +/*---------------------------------------------------------------------------*/ +static void on_cm_disconnected(struct rdma_cm_event *ev, + struct xio_rdma_transport *rdma_hndl) +{ + int retval; + + TRACE_LOG("on_cm_disconnected. rdma_hndl:%p, state:%d\n", + rdma_hndl, rdma_hndl->state); + switch (rdma_hndl->state) { + case XIO_TRANSPORT_STATE_CONNECTED: + TRACE_LOG("call to rdma_disconnect. rdma_hndl:%p\n", + rdma_hndl); + rdma_hndl->state = XIO_TRANSPORT_STATE_DISCONNECTED; + retval = xio_rdma_disconnect(rdma_hndl, 1); + if (retval) + ERROR_LOG("rdma_hndl:%p rdma_disconnect failed, %m\n", + rdma_hndl); + break; + case XIO_TRANSPORT_STATE_CONNECTING: + TRACE_LOG("call to rdma_disconnect. rdma_hndl:%p\n", + rdma_hndl); + rdma_hndl->state = XIO_TRANSPORT_STATE_DISCONNECTED; + retval = xio_rdma_disconnect(rdma_hndl, 0); + if (retval) + ERROR_LOG("rdma_hndl:%p rdma_disconnect failed, %m\n", + rdma_hndl); + /* for beacon */ + kref_put(&rdma_hndl->base.kref, xio_rdma_close_cb); + break; + case XIO_TRANSPORT_STATE_CLOSED: + /* coming here from + * context_shutdown/rdma_close, + * don't go to disconnect state + */ + retval = xio_rdma_disconnect(rdma_hndl, 1); + if (retval) + ERROR_LOG("rdma_hndl:%p rdma_disconnect failed, " \ + "err=%d\n", rdma_hndl, retval); + break; + case XIO_TRANSPORT_STATE_INIT: + case XIO_TRANSPORT_STATE_LISTEN: + case XIO_TRANSPORT_STATE_DISCONNECTED: + case XIO_TRANSPORT_STATE_RECONNECT: + case XIO_TRANSPORT_STATE_DESTROYED: + case XIO_TRANSPORT_STATE_ERROR: + break; + } +} + +/* + * Handle RDMA_CM_EVENT_TIMEWAIT_EXIT which is expected to be the last + * event during the lifecycle of a connection, when it had been shut down + * and the network has cleared from the remaining in-flight messages. 
+*/ +/*---------------------------------------------------------------------------*/ +/* on_cm_timedwait_exit */ +/*---------------------------------------------------------------------------*/ +static void on_cm_timewait_exit(void *hndl) +{ + struct xio_rdma_transport *rdma_hndl = hndl; + + TRACE_LOG("on_cm_timedwait_exit rdma_hndl:%p state:%s\n", + rdma_hndl, xio_transport_state_str(rdma_hndl->state)); + + xio_rdma_flush_all_tasks(rdma_hndl); + + if (rdma_hndl->state == XIO_TRANSPORT_STATE_DISCONNECTED) { + xio_transport_notify_observer(&rdma_hndl->base, + XIO_TRANSPORT_EVENT_DISCONNECTED, + NULL); + } + /* if beacon was sent but was never received as wc error then reduce + ref count */ + if (rdma_hndl->beacon_sent) { + rdma_hndl->beacon_sent = 0; + kref_put(&rdma_hndl->base.kref, xio_rdma_close_cb); + } + + kref_put(&rdma_hndl->base.kref, xio_rdma_close_cb); +} + +/*---------------------------------------------------------------------------*/ +/* on_cm_device_release */ +/*---------------------------------------------------------------------------*/ +static void on_cm_device_release(struct rdma_cm_event *ev, + struct xio_rdma_transport *rdma_hndl) +{ + struct xio_device **xio_devs; + struct xio_device *dev; + + dev = rdma_hndl->dev; + if (!dev) { + ERROR_LOG("device releases, device not found\n"); + return; + } + + xio_devs = ib_get_client_data(dev->ib_dev, &xio_client); + if (!xio_devs) { + ERROR_LOG("Couldn't find xio device on %s\n", + dev->ib_dev->name); + } else { + xio_devs[dev->port_num] = NULL; + } + + xio_device_release(dev); +} + +/*---------------------------------------------------------------------------*/ +/* on_cm_error */ +/*---------------------------------------------------------------------------*/ +static void on_cm_error(struct rdma_cm_event *ev, + struct xio_rdma_transport *rdma_hndl) +{ + int reason; + + ERROR_LOG("rdma transport [error] %s, rdma_hndl:%p\n", + xio_rdma_event_str(ev->event), rdma_hndl); + + switch (ev->event) { + case RDMA_CM_EVENT_CONNECT_ERROR: + reason = XIO_E_CONNECT_ERROR; + break; + case RDMA_CM_EVENT_ADDR_ERROR: + reason = XIO_E_ADDR_ERROR; + break; + case RDMA_CM_EVENT_ROUTE_ERROR: + reason = XIO_E_ROUTE_ERROR; + break; + case RDMA_CM_EVENT_UNREACHABLE: + reason = XIO_E_UNREACHABLE; + break; + default: + reason = XIO_E_NOT_SUPPORTED; + break; + } + + xio_transport_notify_observer_error(&rdma_hndl->base, reason); +} + +/*---------------------------------------------------------------------------*/ +/* xio_close_handler */ +/*---------------------------------------------------------------------------*/ +void xio_close_handler(void *hndl) +{ + xio_rdma_post_close((struct xio_transport_base *)hndl); +} + +/*---------------------------------------------------------------------------*/ +/* xio_handle_cm_event */ +/*---------------------------------------------------------------------------*/ +/** + * xio_cm_event_handler - Callback used to report user events. + * + * Notes: Users may not call rdma_destroy_id from this callback to destroy + * the passed in id, or a corresponding listen id. Returning a + * non-zero value from the callback will destroy the passed in id. + */ +static int xio_handle_cm_event(struct rdma_cm_id *cm_id, + struct rdma_cm_event *ev) +{ + struct xio_rdma_transport *rdma_hndl = cm_id->context; + + TRACE_LOG("cm event %s, hndl:%p\n", + xio_rdma_event_str(ev->event), rdma_hndl); + + /* TODO: Handling these events here from the cm handler context, + * might cause races with the poller thread context. + * 1. 
Need to handle each of these events using a dedicated + * event handler from the poller context. + * 2. Need to make sure the events are removed properly before + * rdma_handler shutdown. + */ + rdma_hndl->handler_nesting++; + switch (ev->event) { + case RDMA_CM_EVENT_ADDR_RESOLVED: + on_cm_addr_resolved(ev, rdma_hndl); + break; + case RDMA_CM_EVENT_ROUTE_RESOLVED: + on_cm_route_resolved(cm_id, ev, rdma_hndl); + break; + case RDMA_CM_EVENT_CONNECT_REQUEST: + on_cm_connect_request(cm_id, ev, rdma_hndl); + break; + case RDMA_CM_EVENT_ESTABLISHED: + on_cm_established(ev, rdma_hndl); + break; + case RDMA_CM_EVENT_REJECTED: + on_cm_refused(ev, rdma_hndl); + break; + case RDMA_CM_EVENT_ADDR_CHANGE: + case RDMA_CM_EVENT_DISCONNECTED: + on_cm_disconnected(ev, rdma_hndl); + break; + case RDMA_CM_EVENT_TIMEWAIT_EXIT: + /* The caller of this callback i.e. cma_ib_handler is holding + * cma_disable_callback, thus rdma_destroy_id should not + * be called in xio_rdma_close_complete! this is prevented as + * rdma_hndl->handler_nesting > 0. We return one to ensure that + * cma_ib_handler will call + */ + rdma_hndl->ev_data_timewait_exit.handler = on_cm_timewait_exit; + rdma_hndl->ev_data_timewait_exit.data = (void *)rdma_hndl; + xio_context_add_event(rdma_hndl->base.ctx, + &rdma_hndl->ev_data_timewait_exit); + break; + + case RDMA_CM_EVENT_MULTICAST_JOIN: + case RDMA_CM_EVENT_MULTICAST_ERROR: + ERROR_LOG("Unreleated event:%d, %s - ignored\n", ev->event, + xio_rdma_event_str(ev->event)); + break; + + case RDMA_CM_EVENT_DEVICE_REMOVAL: + on_cm_device_release(ev, rdma_hndl); + break; + + case RDMA_CM_EVENT_CONNECT_RESPONSE: + break; + + case RDMA_CM_EVENT_CONNECT_ERROR: + case RDMA_CM_EVENT_ADDR_ERROR: + case RDMA_CM_EVENT_ROUTE_ERROR: + case RDMA_CM_EVENT_UNREACHABLE: + default: + on_cm_error(ev, rdma_hndl); + break; + } + rdma_hndl->handler_nesting--; + + /* state can be modified to destroyed (side effect) */ + if (rdma_hndl->state == XIO_TRANSPORT_STATE_DESTROYED) { + /* user space code calls here, xio_rdma_post_close which may + * call rdma_destroy_id which is not allowed in an handler + */ + rdma_hndl->event_data_close.handler = xio_close_handler; + rdma_hndl->event_data_close.data = (void *)rdma_hndl; + /* tell "poller mechanism" */ + xio_context_add_event(rdma_hndl->base.ctx, + &rdma_hndl->event_data_close); + } + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_open */ +/*---------------------------------------------------------------------------*/ +static struct xio_transport_base *xio_rdma_open( + struct xio_transport *transport, + struct xio_context *ctx, + struct xio_observer *observer, + uint32_t trans_attr_mask, + struct xio_transport_init_attr *attr) +{ + struct xio_rdma_transport *rdma_hndl; + + /* allocate rdma handle */ + rdma_hndl = kzalloc(sizeof(*rdma_hndl), GFP_KERNEL); + if (!rdma_hndl) { + xio_set_error(ENOMEM); + ERROR_LOG("calloc failed.\n"); + return NULL; + } + if (attr && trans_attr_mask) { + memcpy(&rdma_hndl->trans_attr, attr, sizeof(*attr)); + rdma_hndl->trans_attr_mask = trans_attr_mask; + } + + rdma_hndl->rdma_mempool = xio_mempool_get(ctx); + if (!rdma_hndl->rdma_mempool) { + xio_set_error(ENOMEM); + ERROR_LOG("allocating rdma mempool failed.\n"); + goto cleanup; + } + rdma_hndl->base.portal_uri = NULL; + kref_init(&rdma_hndl->base.kref); + rdma_hndl->transport = transport; + rdma_hndl->cm_id = NULL; + rdma_hndl->qp = NULL; + rdma_hndl->tcq = NULL; + rdma_hndl->base.ctx = ctx; + rdma_hndl->peer_credits = 0; + 
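The cm-event handler above never tears resources down from inside the callback itself; it queues a deferred event (timewait-exit, post-close) onto the transport's context so that the poller thread performs the destructive work outside the CM callback. The following is a minimal user-space sketch of that hand-off pattern only; names such as struct ev_data, ctx_add_event and close_handler are invented for the illustration and are not part of the libxio kernel API.

    #include <stdio.h>
    #include <stdlib.h>

    struct ev_data {
            void (*handler)(void *data);
            void *data;
            struct ev_data *next;
    };

    struct ctx {
            struct ev_data *head, *tail;
    };

    /* queue a deferred event; the real code embeds the event in the handle */
    static void ctx_add_event(struct ctx *c, struct ev_data *ev)
    {
            ev->next = NULL;
            if (c->tail)
                    c->tail->next = ev;
            else
                    c->head = ev;
            c->tail = ev;
    }

    static void close_handler(void *data)
    {
            /* safe place for destructive cleanup (e.g. destroying the cm id) */
            printf("closing handle %p outside the CM callback\n", data);
    }

    /* stand-in for the CM callback: it only schedules, never destroys */
    static void on_cm_event(struct ctx *c, struct ev_data *ev, void *hndl)
    {
            ev->handler = close_handler;
            ev->data = hndl;
            ctx_add_event(c, ev);
    }

    int main(void)
    {
            struct ctx c = { NULL, NULL };
            struct ev_data ev;
            int dummy_handle;

            on_cm_event(&c, &ev, &dummy_handle);

            /* poller side: drain the queue and run the deferred handlers */
            for (struct ev_data *e = c.head; e; e = e->next)
                    e->handler(e->data);
            return 0;
    }

In the kernel code the equivalent event objects (ev_data_timewait_exit, event_data_close) live inside the rdma handle, so scheduling the deferred work allocates nothing on the event path.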
rdma_hndl->max_inline_buf_sz = xio_rdma_get_inline_buffer_size();
+
+	if (rdma_hndl->base.ctx->rq_depth) {
+		/* user chose to configure rq depth */
+		rdma_hndl->rq_depth = max(g_poptions->max_in_iovsz,
+					  rdma_hndl->base.ctx->rq_depth);
+	} else {
+		rdma_hndl->rq_depth = MAX_RECV_WR;
+	}
+	rdma_hndl->sq_depth = g_poptions->max_out_iovsz + 1;
+
+	rdma_hndl->frwr_task.dd_data = ptr_from_int64(XIO_FRWR_LI_WRID);
+
+	INIT_LIST_HEAD(&rdma_hndl->trans_list_entry);
+	INIT_LIST_HEAD(&rdma_hndl->in_flight_list);
+	INIT_LIST_HEAD(&rdma_hndl->rdma_rd_req_in_flight_list);
+	INIT_LIST_HEAD(&rdma_hndl->rdma_rd_rsp_in_flight_list);
+	INIT_LIST_HEAD(&rdma_hndl->tx_ready_list);
+	INIT_LIST_HEAD(&rdma_hndl->tx_comp_list);
+	INIT_LIST_HEAD(&rdma_hndl->rx_list);
+	INIT_LIST_HEAD(&rdma_hndl->io_list);
+	INIT_LIST_HEAD(&rdma_hndl->rdma_rd_req_list);
+	INIT_LIST_HEAD(&rdma_hndl->rdma_rd_rsp_list);
+
+	XIO_OBSERVABLE_INIT(&rdma_hndl->base.observable, rdma_hndl);
+	if (observer)
+		xio_observable_reg_observer(&rdma_hndl->base.observable,
+					    observer);
+
+	TRACE_LOG("xio_rdma_open: [new] handle:%p\n", rdma_hndl);
+
+	return (struct xio_transport_base *)rdma_hndl;
+
+cleanup:
+	kfree(rdma_hndl);
+
+	return NULL;
+}
+
+/*
+ * Start closing connection. Transfer IB QP to error state.
+ * This will be followed by WC error and buffers flush events.
+ * We also should expect DISCONNECTED and TIMEWAIT_EXIT events.
+ * Only after the draining is over are we sure to have reclaimed
+ * all buffers (and tasks). After the RDMA CM events are collected,
+ * the connection QP may be destroyed, and its number may be recycled.
+ */
+/*---------------------------------------------------------------------------*/
+/* xio_rdma_close_cb */
+/*---------------------------------------------------------------------------*/
+void xio_rdma_close_cb(struct kref *kref)
+{
+	struct xio_transport_base *transport = container_of(
+				kref, struct xio_transport_base, kref);
+	struct xio_rdma_transport *rdma_hndl =
+		(struct xio_rdma_transport *)transport;
+
+	xio_transport_notify_observer(
+			transport,
+			XIO_TRANSPORT_EVENT_CLOSED,
+			NULL);
+	xio_rdma_post_close((struct xio_transport_base *)rdma_hndl);
+}
+
+/*---------------------------------------------------------------------------*/
+/* xio_rdma_close */
+/*---------------------------------------------------------------------------*/
+static void xio_rdma_close(struct xio_transport_base *transport)
+{
+	struct xio_rdma_transport *rdma_hndl =
+		(struct xio_rdma_transport *)transport;
+	int retval;
+
+	/* now it is zero */
+	DEBUG_LOG("xio_rdma_close: [close] handle:%p, qp:%p\n",
+		  rdma_hndl, rdma_hndl->qp);
+
+	switch (rdma_hndl->state) {
+	case XIO_TRANSPORT_STATE_LISTEN:
+		rdma_hndl->state = XIO_TRANSPORT_STATE_CLOSED;
+		break;
+	case XIO_TRANSPORT_STATE_CONNECTED:
+		TRACE_LOG("call to rdma_disconnect. 
rdma_hndl:%p\n", + rdma_hndl); + + rdma_hndl->state = XIO_TRANSPORT_STATE_CLOSED; + retval = xio_rdma_disconnect(rdma_hndl, 0); + if (retval) + DEBUG_LOG("handle:%p rdma_disconnect failed, " \ + "%d\n", rdma_hndl, retval); + break; + case XIO_TRANSPORT_STATE_DISCONNECTED: + rdma_hndl->state = XIO_TRANSPORT_STATE_CLOSED; + break; + case XIO_TRANSPORT_STATE_CLOSED: + /* do not kref_put - already done */ + return; + default: + rdma_hndl->state = XIO_TRANSPORT_STATE_CLOSED; + break; + } + kref_put(&transport->kref, xio_rdma_close_cb); +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_dup2 */ +/* makes new_trans_hndl be the copy of old_trans_hndl, closes new_trans_hndl */ +/* Note old and new are in dup2 terminology opposite to reconnect terms */ +/* --------------------------------------------------------------------------*/ +static int xio_rdma_dup2(struct xio_transport_base *old_trans_hndl, + struct xio_transport_base **new_trans_hndl) +{ + struct xio_rdma_transport *old_hndl = + (struct xio_rdma_transport *)old_trans_hndl; + struct xio_rdma_transport *new_hndl = + (struct xio_rdma_transport *)*new_trans_hndl; + + /* if device is not the same an R_KEY replacement table is created */ + if (old_hndl->dev != new_hndl->dev) { + struct xio_rkey_tbl *te; + + te = &old_hndl->rkey_tbl[old_hndl->rkey_tbl_size]; + /* new is actually the old one we want to replace */ + te->old_rkey = new_hndl->dev->mr->rkey; + te->new_rkey = old_hndl->dev->mr->rkey; + old_hndl->rkey_tbl_size++; + } + + xio_rdma_close(*new_trans_hndl); + + /* nexus layer will call close which will only decrement */ + /*kref_get(&old_trans_hndl->kref);*/ + *new_trans_hndl = old_trans_hndl; + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_new_rkey */ +/*---------------------------------------------------------------------------*/ + +static int xio_new_rkey(struct xio_rdma_transport *rdma_hndl, uint32_t *key) +{ + int i; + + if (!*key) + return 0; + + for (i = 0; i < rdma_hndl->peer_rkey_tbl_size; i++) { + if (rdma_hndl->peer_rkey_tbl[i].old_rkey == *key) { + *key = rdma_hndl->peer_rkey_tbl[i].new_rkey; + return 0; + } + } + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_update_task */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_update_task(struct xio_transport_base *trans_hndl, + struct xio_task *task) +{ + struct xio_rdma_transport *rdma_hndl = + (struct xio_rdma_transport *)trans_hndl; + XIO_TO_RDMA_TASK(task, rdma_task); + int i; + + for (i = 0; i < rdma_task->req_in_num_sge; i++) { + if (xio_new_rkey(rdma_hndl, &rdma_task->req_in_sge[i].stag)) + return -1; + } + + for (i = 0; i < rdma_task->req_out_num_sge; i++) { + if (xio_new_rkey(rdma_hndl, &rdma_task->req_out_sge[i].stag)) + return -1; + } + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_accept */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_accept(struct xio_transport_base *transport) +{ + struct xio_rdma_transport *rdma_hndl = + (struct xio_rdma_transport *)transport; + int retval; + struct rdma_conn_param cm_params = { + .initiator_depth = 1, + .responder_resources = 1, + .rnr_retry_count = 0, /* 7 - infinite retry */ + .retry_count = 0 + }; + + /* + * Limit the responder resources requested by the remote + * to our capabilities. 
Note that the kernel swaps + * req->responder_resources and req->initiator_depth, so + * that req->responder_resources is actually the active + * side's initiator depth. + */ + if (rdma_hndl->client_responder_resources > + rdma_hndl->tcq->dev->device_attr.max_qp_rd_atom) + cm_params.responder_resources = + rdma_hndl->tcq->dev->device_attr.max_qp_rd_atom; + else + cm_params.responder_resources = + rdma_hndl->client_responder_resources; + + /* + * Note: if this side of the connection is never going to + * use RDMA read opreations, then initiator_depth can be set + * to 0 here. + */ + if (rdma_hndl->client_initiator_depth > + rdma_hndl->tcq->dev->device_attr.max_qp_init_rd_atom) + cm_params.initiator_depth = + rdma_hndl->tcq->dev->device_attr.max_qp_init_rd_atom; + else + cm_params.initiator_depth = rdma_hndl->client_initiator_depth; + + /* "accept" the connection */ + retval = rdma_accept(rdma_hndl->cm_id, &cm_params); + if (retval) { + xio_set_error(retval); + DEBUG_LOG("rdma_accept failed. (err=%d)\n", retval); + return -1; + } + rdma_hndl->client_responder_resources = cm_params.responder_resources; + rdma_hndl->client_initiator_depth = cm_params.initiator_depth; + + TRACE_LOG("rdma transport: [accept] handle:%p\n", rdma_hndl); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_reject */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_reject(struct xio_transport_base *transport) +{ + struct xio_rdma_transport *rdma_hndl = + (struct xio_rdma_transport *)transport; + int retval; + + /* "reject" the connection */ + retval = rdma_reject(rdma_hndl->cm_id, NULL, 0); + if (retval) { + xio_set_error(retval); + DEBUG_LOG("rdma_reject failed. (err=%d)\n", retval); + return -1; + } + TRACE_LOG("rdma transport: [reject] handle:%p\n", rdma_hndl); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_do_connect */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_do_connect(struct xio_transport_base *trans_hndl, + const char *out_if_addr) +{ + struct xio_rdma_transport *rdma_hndl = + (struct xio_rdma_transport *)trans_hndl; + union xio_sockaddr sa; + int retval = 0; + + /* resolve the portal_uri */ + if (xio_uri_to_ss(trans_hndl->portal_uri, &sa.sa_stor) == -1) { + xio_set_error(XIO_E_ADDR_ERROR); + ERROR_LOG("address [%s] resolving failed\n", + trans_hndl->portal_uri); + return -1; + } + + /* create cm id */ + rdma_hndl->cm_id = rdma_create_id(xio_handle_cm_event, + (void *)rdma_hndl, + RDMA_PS_TCP, IB_QPT_RC); + if (IS_ERR(rdma_hndl->cm_id)) { + retval = PTR_ERR(rdma_hndl->cm_id); + xio_set_error(retval); + ERROR_LOG("rdma_create id failed. (err=%d)\n", retval); + goto exit1; + } + + /* TODO: support out_if_addr */ + + if (out_if_addr) { + union xio_sockaddr if_sa; + + if (xio_host_port_to_ss(out_if_addr, + &if_sa.sa_stor) == -1) { + xio_set_error(XIO_E_ADDR_ERROR); + ERROR_LOG("outgoing interface [%s] resolving failed\n", + out_if_addr); + goto exit2; + } + retval = rdma_bind_addr(rdma_hndl->cm_id, &if_sa.sa); + if (retval) { + xio_set_error(retval); + ERROR_LOG("rdma_bind_addr failed. (err=%d)\n", + retval); + goto exit2; + } + } + + retval = rdma_resolve_addr(rdma_hndl->cm_id, NULL, &sa.sa, + ADDR_RESOLVE_TIMEOUT); + if (retval) { + xio_set_error(retval); + ERROR_LOG("rdma_resolve_addr failed. 
(err=%d)\n", retval); + goto exit2; + } + + return 0; + +exit2: + TRACE_LOG("call rdma_destroy_id\n"); + rdma_destroy_id(rdma_hndl->cm_id); +exit1: + rdma_hndl->cm_id = NULL; + + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_connect */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_connect(struct xio_transport_base *trans_hndl, + const char *portal_uri, const char *out_if_addr) +{ + struct xio_rdma_transport *rdma_hndl = + (struct xio_rdma_transport *)trans_hndl; + + trans_hndl->is_client = 1; + + if (!portal_uri) { + xio_set_error(XIO_E_ADDR_ERROR); + ERROR_LOG("address [%s] resolving failed\n", portal_uri); + goto exit1; + } + + /* allocate memory for portal_uri */ + trans_hndl->portal_uri = kstrdup(portal_uri, GFP_KERNEL); + if (!rdma_hndl->base.portal_uri) { + xio_set_error(ENOMEM); + ERROR_LOG("calloc failed. %m\n"); + goto exit1; + } + + if (xio_rdma_do_connect(trans_hndl, out_if_addr) < 0) + goto exit2; + + return 0; + +exit2: + rdma_destroy_id(rdma_hndl->cm_id); + rdma_hndl->cm_id = NULL; +exit1: + kfree(trans_hndl->portal_uri); + + return -1; +} + +static __be16 priv_get_src_port(struct rdma_cm_id *cm_id) +{ + struct rdma_route *route = &cm_id->route; + struct rdma_addr *addr = &route->addr; + struct sockaddr_storage *src_addr = &addr->src_addr; + __be16 sin_port; + + if (src_addr->ss_family == AF_INET6) { + struct sockaddr_in6 *s6 = (struct sockaddr_in6 *)src_addr; + + sin_port = s6->sin6_port; + } else { + struct sockaddr_in *s4 = (struct sockaddr_in *)src_addr; + + sin_port = s4->sin_port; + } + return sin_port; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_listen */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_listen(struct xio_transport_base *transport, + const char *portal_uri, + uint16_t *src_port, int backlog) +{ + struct xio_rdma_transport *rdma_hndl = + (struct xio_rdma_transport *)transport; + union xio_sockaddr sa; + int retval = 0; + uint16_t sport; + + /* resolve the portal_uri */ + if (xio_uri_to_ss(portal_uri, &sa.sa_stor) == -1) { + xio_set_error(XIO_E_ADDR_ERROR); + DEBUG_LOG("address [%s] resolving failed\n", portal_uri); + return -1; + } + rdma_hndl->base.is_client = 0; + + /* create cm id */ + rdma_hndl->cm_id = rdma_create_id(xio_handle_cm_event, + (void *)rdma_hndl, + RDMA_PS_TCP, IB_QPT_RC); + if (IS_ERR(rdma_hndl->cm_id)) { + retval = PTR_ERR(rdma_hndl->cm_id); + xio_set_error(retval); + DEBUG_LOG("rdma_create id failed. (err=%d)\n", retval); + goto exit1; + } + + retval = rdma_bind_addr(rdma_hndl->cm_id, &sa.sa); + if (retval) { + xio_set_error(retval); + DEBUG_LOG("rdma_bind_addr failed. (err=%d)\n", retval); + goto exit2; + } + + /* TODO (Alex): Why was backlog set to 0? */ + DEBUG_LOG("Calling rdma_listen() for CM with backlog %d\n", backlog); + retval = rdma_listen(rdma_hndl->cm_id, backlog); + if (retval) { + xio_set_error(retval); + DEBUG_LOG("rdma_listen failed. 
(err=%d)\n", retval); + goto exit2; + } + + sport = ntohs(priv_get_src_port(rdma_hndl->cm_id)); + if (src_port) + *src_port = sport; + + rdma_hndl->state = XIO_TRANSPORT_STATE_LISTEN; + DEBUG_LOG("listen on [%s] src_port:%d\n", portal_uri, sport); + + return 0; + +exit2: + rdma_destroy_id(rdma_hndl->cm_id); +exit1: + rdma_hndl->cm_id = NULL; + + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_set_opt */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_set_opt(void *xio_obj, + int optname, const void *optval, int optlen) +{ + switch (optname) { + case XIO_OPTNAME_ENABLE_MEM_POOL: + VALIDATE_SZ(sizeof(int)); + rdma_options.enable_mem_pool = *((int *)optval); + return 0; + case XIO_OPTNAME_ENABLE_DMA_LATENCY: + VALIDATE_SZ(sizeof(int)); + rdma_options.enable_dma_latency = *((int *)optval); + return 0; + case XIO_OPTNAME_MAX_IN_IOVLEN: + VALIDATE_SZ(sizeof(int)); + rdma_options.max_in_iovsz = *((int *)optval); + return 0; + case XIO_OPTNAME_MAX_OUT_IOVLEN: + VALIDATE_SZ(sizeof(int)); + rdma_options.max_out_iovsz = *((int *)optval); + return 0; + case XIO_OPTNAME_QP_CAP_MAX_INLINE_DATA: + VALIDATE_SZ(sizeof(int)); + rdma_options.qp_cap_max_inline_data = *((int *)optval); + return 0; + default: + break; + } + xio_set_error(XIO_E_NOT_SUPPORTED); + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_get_opt */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_get_opt(void *xio_obj, + int optname, void *optval, int *optlen) +{ + switch (optname) { + case XIO_OPTNAME_ENABLE_MEM_POOL: + *((int *)optval) = rdma_options.enable_mem_pool; + *optlen = sizeof(int); + return 0; + case XIO_OPTNAME_ENABLE_DMA_LATENCY: + *((int *)optval) = rdma_options.enable_dma_latency; + *optlen = sizeof(int); + return 0; + case XIO_OPTNAME_MAX_IN_IOVLEN: + *((int *)optval) = rdma_options.max_in_iovsz; + *optlen = sizeof(int); + return 0; + case XIO_OPTNAME_MAX_OUT_IOVLEN: + *((int *)optval) = rdma_options.max_out_iovsz; + *optlen = sizeof(int); + return 0; + case XIO_OPTNAME_QP_CAP_MAX_INLINE_DATA: + *((int *)optval) = rdma_options.qp_cap_max_inline_data; + *optlen = sizeof(int); + return 0; + default: + break; + } + xio_set_error(XIO_E_NOT_SUPPORTED); + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_is_valid_in_req */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_is_valid_in_req(struct xio_msg *msg) +{ + struct xio_vmsg *vmsg = &msg->in; + int i; + struct xio_sg_table_ops *sgtbl_ops; + void *sgtbl; + void *sge; + unsigned int nents, max_nents; + size_t length = 0; + + /* kernel works only with kernel's scatterlist */ + if (unlikely(vmsg->sgl_type != XIO_SGL_TYPE_SCATTERLIST)) { + /* src/common/xio_session_client.c uses XIO_SGL_TYPE_IOV but len + * should be zero. Note, other types are not supported! 
+ */ + if (vmsg->sgl_type != XIO_SGL_TYPE_IOV) { + ERROR_LOG("Incompatible sgl type %d\n", vmsg->sgl_type); + return 0; + } + if (vmsg->data_tbl.nents){ + ERROR_LOG("Bad data_tbl.nents %d\n", vmsg->data_tbl.nents); + return 0; + } + /* Just check header */ + if (vmsg->header.iov_base && + (vmsg->header.iov_len == 0)){ + ERROR_LOG("Bad header %p %zu\n", vmsg->header.iov_base, + vmsg->header.iov_len); + return 0; + } else { + return 1; + } + } + + sgtbl = xio_sg_table_get(vmsg); + sgtbl_ops = xio_sg_table_ops_get(vmsg->sgl_type); + nents = tbl_nents(sgtbl_ops, sgtbl); + max_nents = tbl_max_nents(sgtbl_ops, sgtbl); + + if ((nents > rdma_options.max_in_iovsz) || + (nents > max_nents) || + (max_nents > rdma_options.max_in_iovsz)) { + ERROR_LOG("Too many SG entries %u (%u, %u)\n", + nents, max_nents, rdma_options.max_in_iovsz); + return 0; + } + + if (vmsg->header.iov_base && + (vmsg->header.iov_len == 0)) { + ERROR_LOG("Bad header %p %zu\n", vmsg->header.iov_base, + vmsg->header.iov_len); + return 0; + } + + for_each_sge(sgtbl, sgtbl_ops, sge, i) { + length += sge_length(sgtbl_ops, sge); + if (sge_addr(sgtbl_ops, sge) && + (sge_length(sgtbl_ops, sge) == 0)){ + ERROR_LOG("Zero SGE length\n"); + return 0; + } + } + if (length >= (XIO_MAX_IOV + 1) * PAGE_SIZE) { + ERROR_LOG("Total length %zu > %zu\n", + length, (XIO_MAX_IOV + 1) * PAGE_SIZE); + return 0; + } + + return 1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_is_valid_out_msg */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_is_valid_out_msg(struct xio_msg *msg) +{ + struct xio_vmsg *vmsg = &msg->out; + int i; + struct xio_sg_table_ops *sgtbl_ops; + void *sgtbl; + void *sge; + unsigned int nents, max_nents; + + /* kernel works only with kernel's scatterlist */ + if (unlikely(vmsg->sgl_type != XIO_SGL_TYPE_SCATTERLIST)) { + /* src/common/xio_session_client.c uses XIO_SGL_TYPE_IOV but len + * should be zero. Note, other types are not supported! 
+ */ + if (vmsg->sgl_type != XIO_SGL_TYPE_IOV) { + ERROR_LOG("Invalid SGL type %d for msg %p\n", + vmsg->sgl_type, msg); + return 0; + } + if (vmsg->data_tbl.nents) { + ERROR_LOG("SGL type is XIO_SGL_TYPE_IOV and nents=%u\n", + vmsg->data_tbl.nents); + return 0; + } + + /* Just check header */ + if ((vmsg->header.iov_base && + (vmsg->header.iov_len == 0)) || + (!vmsg->header.iov_base && + (vmsg->header.iov_len != 0))) { + ERROR_LOG("Bad header for IOV SGL base=%p len=%zu\n", + vmsg->header.iov_base, + vmsg->header.iov_len); + return 0; + } else { + return 1; + } + } + + sgtbl = xio_sg_table_get(vmsg); + sgtbl_ops = xio_sg_table_ops_get(vmsg->sgl_type); + nents = tbl_nents(sgtbl_ops, sgtbl); + max_nents = tbl_max_nents(sgtbl_ops, sgtbl); + + if ((nents > rdma_options.max_out_iovsz) || + (nents > max_nents) || + (max_nents > rdma_options.max_out_iovsz)) { + ERROR_LOG("Bad nents=%u rdma_options.max_out_iovsz=%u " \ + "max_nents=%u\n", + nents, rdma_options.max_out_iovsz, max_nents); + return 0; + } + + if ((vmsg->header.iov_base && + (vmsg->header.iov_len == 0)) || + (!vmsg->header.iov_base && + (vmsg->header.iov_len != 0))) { + ERROR_LOG("Bad header base=%p len=%zu\n", + vmsg->header.iov_base, + vmsg->header.iov_len); + return 0; + } + + if (vmsg->header.iov_len > + (size_t)xio_get_options()->max_inline_xio_hdr) { + ERROR_LOG("Header is too large %zu>%zu\n", vmsg->header.iov_len, + (size_t)xio_get_options()->max_inline_xio_hdr); + return 0; + } + + for_each_sge(sgtbl, sgtbl_ops, sge, i) { + if ((!sge_addr(sgtbl_ops, sge)) || + (sge_length(sgtbl_ops, sge) == 0)) { + ERROR_LOG("Addr is NULL or length is zero " \ + "for an SGE\n"); + return 0; + } + } + + return 1; +} + +/* task pools management */ +/*---------------------------------------------------------------------------*/ +/* xio_rdma_get_pools_ops */ +/*---------------------------------------------------------------------------*/ +static void xio_rdma_get_pools_ops(struct xio_transport_base *trans_hndl, + struct xio_tasks_pool_ops **initial_pool_ops, + struct xio_tasks_pool_ops **primary_pool_ops) +{ + *initial_pool_ops = &initial_tasks_pool_ops; + *primary_pool_ops = &primary_tasks_pool_ops; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_set_pools_cls */ +/*---------------------------------------------------------------------------*/ +static void xio_rdma_set_pools_cls(struct xio_transport_base *trans_hndl, + struct xio_tasks_pool_cls *initial_pool_cls, + struct xio_tasks_pool_cls *primary_pool_cls) +{ + struct xio_rdma_transport *rdma_hndl = + (struct xio_rdma_transport *)trans_hndl; + + if (initial_pool_cls) + rdma_hndl->initial_pool_cls = *initial_pool_cls; + if (primary_pool_cls) + rdma_hndl->primary_pool_cls = *primary_pool_cls; +} + +static struct xio_transport xio_rdma_transport = { + .name = "rdma", + .ctor = NULL, + .dtor = NULL, + .init = NULL, + .release = NULL, + .context_shutdown = xio_rdma_context_shutdown, + .open = xio_rdma_open, + .connect = xio_rdma_connect, + .listen = xio_rdma_listen, + .accept = xio_rdma_accept, + .reject = xio_rdma_reject, + .close = xio_rdma_close, + .dup2 = xio_rdma_dup2, + .update_task = xio_rdma_update_task, + .send = xio_rdma_send, + .poll = NULL, + .set_opt = xio_rdma_set_opt, + .get_opt = xio_rdma_get_opt, + .cancel_req = xio_rdma_cancel_req, + .cancel_rsp = xio_rdma_cancel_rsp, + .get_pools_setup_ops = xio_rdma_get_pools_ops, + .set_pools_cls = xio_rdma_set_pools_cls, + + .validators_cls.is_valid_in_req = xio_rdma_is_valid_in_req, + 
.validators_cls.is_valid_out_msg = xio_rdma_is_valid_out_msg, +}; + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_transport_constructor */ +/*---------------------------------------------------------------------------*/ +static int __init xio_rdma_transport_constructor(void) +{ + struct xio_transport *transport = &xio_rdma_transport; + + /* set cpu latency until process is down */ + /* xio_set_cpu_latency(); */ + + /* register the transport */ + xio_reg_transport(transport); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_transport_destructor */ +/*---------------------------------------------------------------------------*/ +static void __exit xio_rdma_transport_destructor(void) +{ + struct xio_transport *transport = &xio_rdma_transport; + + /* Called after all devices were deleted */ + + xio_unreg_transport(transport); +} + +/*---------------------------------------------------------------------------*/ +/* xio_add_one */ +/*---------------------------------------------------------------------------*/ +static void xio_add_one(struct ib_device *ib_dev) +{ + struct xio_device **xio_devs; + int s, e, p; + enum rdma_transport_type transport_type = rdma_node_get_transport( + ib_dev->node_type); + + if (transport_type != RDMA_TRANSPORT_IB && + transport_type != RDMA_TRANSPORT_IWARP) + return; + + if (ib_dev->node_type == RDMA_NODE_IB_SWITCH) { + s = 0; + e = 0; + } else { + s = 1; + e = ib_dev->phys_port_cnt; + } + + xio_devs = kcalloc(e + 1, sizeof(struct xio_device *), GFP_KERNEL); + if (!xio_devs) { + ERROR_LOG("Couldn't allocate n(%d) pointers\n", e + 1); + return; + } + + for (p = s; p <= e; p++) { + struct xio_device *xio_dev; + + xio_dev = xio_device_init(ib_dev, p); + if (!xio_dev) { + ERROR_LOG("init xio_dev on dev(%s) port(%d) failed\n", + ib_dev->name, p); + goto cleanup; + } + xio_devs[p] = xio_dev; + } + + ib_set_client_data(ib_dev, &xio_client, xio_devs); + + return; + +cleanup: + for (p = s; p <= e; p++) { + if (xio_devs[p]) { + xio_device_release(xio_devs[p]); + xio_devs[p] = NULL; + } + } + kfree(xio_devs); +} + +/*---------------------------------------------------------------------------*/ +/* xio_del_one */ +/*---------------------------------------------------------------------------*/ + +static void xio_del_one(struct ib_device *ib_dev) +{ + struct xio_device **xio_devs; + int s, e, p; + enum rdma_transport_type transport_type = rdma_node_get_transport( + ib_dev->node_type); + + if (transport_type != RDMA_TRANSPORT_IB && + transport_type != RDMA_TRANSPORT_IWARP) + return; + + /* xio_del_one is called before the core clients' list is deleted + * so calling ib_get_client_data in xio_del_one is O.K. 
+ */ + + xio_devs = ib_get_client_data(ib_dev, &xio_client); + if (!xio_devs) { + ERROR_LOG("Couldn't find xio device on %s\n", + ib_dev->name); + return; + } + + if (ib_dev->node_type == RDMA_NODE_IB_SWITCH) { + s = 0; + e = 0; + } else { + s = 1; + e = ib_dev->phys_port_cnt; + } + + for (p = s; p <= e; p++) { + if (xio_devs[p]) { + xio_device_release(xio_devs[p]); + xio_devs[p] = NULL; + } + } + + kfree(xio_devs); +} + +static int __init xio_init_module(void) +{ + int ret; + + if (debugfs_initialized()) { + xio_rdma_root = debugfs_create_dir("xio_rdma", NULL); + if (!xio_rdma_root) { + pr_err("xio_rdma root debugfs creation failed\n"); + return -ENOMEM; + } + } else { + xio_rdma_root = NULL; + pr_err("debugfs not initialized\n"); + } + + xio_rdma_transport_constructor(); + + g_poptions = xio_get_options(); + + /* xio_add_one will be called for all existing devices + * add for all new devices + */ + + ret = ib_register_client(&xio_client); + if (ret) { + pr_err("couldn't register IB client ret%d\n", ret); + return ret; + } + return 0; +} + +static void __exit xio_cleanup_module(void) +{ + /* xio_del_one will called for all devices */ + + ib_unregister_client(&xio_client); + + xio_rdma_transport_destructor(); + + debugfs_remove_recursive(xio_rdma_root); +} + +struct dentry *xio_rdma_debugfs_root(void) +{ + return xio_rdma_root; +} + +module_init(xio_init_module); +module_exit(xio_cleanup_module); diff --git a/open_src/xio/src/kernel/transport/rdma/xio_rdma_memory.c b/open_src/xio/src/kernel/transport/rdma/xio_rdma_memory.c new file mode 100644 index 0000000..cff012d --- /dev/null +++ b/open_src/xio/src/kernel/transport/rdma/xio_rdma_memory.c @@ -0,0 +1,1067 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "libxio.h" +#include +#include "xio_log.h" +#include "xio_observer.h" +#include "xio_common.h" +#include "xio_protocol.h" +#include "xio_mbuf.h" +#include "xio_task.h" +#include "xio_transport.h" +#include "xio_ktransport.h" +#include "xio_protocol.h" +#include "xio_mem.h" +#include "xio_mempool.h" +#include "xio_rdma_transport.h" +#include "xio_rdma_utils.h" +#include "xio_sg_table.h" +#include "xio_ev_data.h" +#include "xio_objpool.h" +#include "xio_workqueue.h" +#include "xio_context.h" + +#define XIO_KMALLOC_THRESHOLD 0x20000 /* 128K - kmalloc limit */ + +#ifndef IS_PAGE_ALIGNED +#define IS_PAGE_ALIGNED(ptr) (((PAGE_SIZE-1) & (uintptr_t)(ptr)) == 0) +#endif + +struct fast_reg_descriptor { + struct llist_node llist_entry; + /* For fast registration - FRWR */ + struct ib_mr *data_mr; + struct ib_fast_reg_page_list *data_frpl; + /* Valid for fast registration flag */ + int valid; +}; + +/*---------------------------------------------------------------------------*/ +/* xio_unmap_rx_work_req */ +/*---------------------------------------------------------------------------*/ +void xio_unmap_rx_work_req(struct xio_device *dev, struct xio_work_req *xd) +{ + struct ib_device *ib_dev = dev->ib_dev; + + if (!xd->nents || !xd->mapped) + return; + + /* Assume scatterlist is terminated properly */ + + ib_dma_unmap_sg(ib_dev, xd->sgt.sgl, xd->sgt.nents, DMA_FROM_DEVICE); + + xd->mapped = 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_unmap_tx_work_req */ +/*---------------------------------------------------------------------------*/ + +void xio_unmap_tx_work_req(struct xio_device *dev, struct xio_work_req *xd) +{ + struct ib_device *ib_dev = dev->ib_dev; + + if (!xd->nents || !xd->mapped) + return; + + /* Assume scatterlist is terminated properly */ + + /* Inline were not mapped */ + if (!(xd->send_wr.send_flags & IB_SEND_INLINE)) + ib_dma_unmap_sg(ib_dev, xd->sgt.sgl, xd->sgt.nents, + DMA_TO_DEVICE); + + /* Disconnect header from data if any */ + sg_mark_end(&xd->sgt.sgl[1]); + sg_mark_end(xd->sgt.sgl); + xd->sgt.nents = xd->sgt.orig_nents; + + xd->mapped = 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_map_rx_work_req */ +/*---------------------------------------------------------------------------*/ +int xio_map_rx_work_req(struct xio_device *dev, struct xio_work_req *xd) +{ + struct ib_device *ib_dev = dev->ib_dev; + struct sg_table *sgt = &xd->sgt; + struct scatterlist *sg; + int nents; + int i; + + if (!xd->nents) + return -1; + + /* Assume scatterlist is terminated properly */ + + nents = ib_dma_map_sg(ib_dev, sgt->sgl, sgt->nents, + DMA_FROM_DEVICE); + if (!nents) { + xd->mapped = 0; + return -1; + } + + sg = sgt->sgl; + for (i = 0; i < nents; i++) { + xd->sge[i].addr = ib_sg_dma_address(ib_dev, sg); + xd->sge[i].length = ib_sg_dma_len(ib_dev, sg); + 
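The loop above copies each successfully mapped scatter-gather element into the work request's sge[] array as an (address, length, lkey) triple. The sketch below shows that fill step in isolation as plain user-space C; struct example_sge and struct example_buf are invented for the illustration, whereas the kernel code fills struct ib_sge from ib_sg_dma_address()/ib_sg_dma_len() after ib_dma_map_sg().

    #include <stdint.h>
    #include <stdio.h>

    struct example_sge {
            uint64_t addr;
            uint32_t length;
            uint32_t lkey;
    };

    struct example_buf {
            void     *base;
            uint32_t  len;
    };

    static int fill_sges(struct example_sge *sge, const struct example_buf *buf,
                         int nents, uint32_t lkey)
    {
            int i;

            for (i = 0; i < nents; i++) {
                    sge[i].addr   = (uint64_t)(uintptr_t)buf[i].base;
                    sge[i].length = buf[i].len;
                    sge[i].lkey   = lkey;   /* key of the registered region */
            }
            return nents;   /* the caller records this count as xd->mapped */
    }

    int main(void)
    {
            char a[64], b[128];
            struct example_buf bufs[2] = { { a, sizeof(a) }, { b, sizeof(b) } };
            struct example_sge sge[2];
            int n = fill_sges(sge, bufs, 2, 0x1234);

            printf("filled %d sges, first length %u\n", n, sge[0].length);
            return 0;
    }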
/* lkey is already initialized */ + sg = sg_next(sg); + } + + xd->mapped = nents; + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_map_tx_work_req */ +/*---------------------------------------------------------------------------*/ +int xio_map_tx_work_req(struct xio_device *dev, struct xio_work_req *xd) +{ + struct ib_device *ib_dev = dev->ib_dev; + struct sg_table *sgt = &xd->sgt; + struct scatterlist *sg; + int nents; + int i; + + if (!xd->nents) + return -1; + + /* Assume scatterlist is terminated properly */ + + sg = sgt->sgl; + + if (xd->send_wr.send_flags & IB_SEND_INLINE) { + /* Inline need not be mapped just return to virt addresses + * from sg's page + offset + */ + for (i = 0; i < xd->nents; i++) { + xd->sge[i].addr = uint64_from_ptr(sg_virt(sg)); + xd->sge[i].length = sg->length; + /* lkey is already initialized */ + sg = sg_next(sg); + } + xd->mapped = xd->nents; + return 0; + } + + nents = ib_dma_map_sg(ib_dev, sgt->sgl, sgt->nents, DMA_TO_DEVICE); + if (!nents) { + /* Disconnect header from data if any*/ + sg_mark_end(sg); + sgt->nents = sgt->orig_nents; + xd->mapped = 0; + return -1; + } + + for (i = 0; i < nents; i++) { + xd->sge[i].addr = ib_sg_dma_address(ib_dev, sg); + xd->sge[i].length = ib_sg_dma_len(ib_dev, sg); + /* lkey is already initialized */ + sg = sg_next(sg); + } + xd->mapped = nents; + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_unmap_rxmad_work_req */ +/*---------------------------------------------------------------------------*/ +void xio_unmap_rxmad_work_req(struct xio_device *dev, struct xio_work_req *xd) +{ + struct ib_device *ib_dev = dev->ib_dev; + + if (!xd->nents || !xd->mapped) + return; + + /* Assume scatterlist is terminated properly */ + + ib_dma_unmap_sg(ib_dev, xd->sgt.sgl, xd->sgt.nents, DMA_FROM_DEVICE); + + /* xio_prep_rdma_op calls sg_mark_end need to undo */ + if (xd->last_sg) { + sg_unmark_end(xd->last_sg); + xd->last_sg = NULL; + xd->sgt.nents = xd->sgt.orig_nents; + } + + xd->mapped = 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_unmap_txmad_work_req */ +/*---------------------------------------------------------------------------*/ + +void xio_unmap_txmad_work_req(struct xio_device *dev, struct xio_work_req *xd) +{ + struct ib_device *ib_dev = dev->ib_dev; + + if (!xd->nents || !xd->mapped) + return; + + /* Assume scatterlist is terminated properly */ + + ib_dma_unmap_sg(ib_dev, xd->sgt.sgl, xd->sgt.nents, DMA_TO_DEVICE); + + /* xio_prep_rdma_op calls sg_mark_end need to undo */ + if (xd->last_sg) { + sg_unmark_end(xd->last_sg); + xd->last_sg = NULL; + xd->sgt.nents = xd->sgt.orig_nents; + } + + xd->mapped = 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_map_rxmad_work_req */ +/*---------------------------------------------------------------------------*/ +int xio_map_rxmad_work_req(struct xio_device *dev, struct xio_work_req *xd) +{ + struct ib_device *ib_dev = dev->ib_dev; + u32 lkey = dev->mr->lkey; + struct sg_table *sgt = &xd->sgt; + struct scatterlist *sg; + int nents; + int i; + + if (!xd->nents) + return -1; + + /* Assume scatterlist is terminated properly */ + + nents = ib_dma_map_sg(ib_dev, sgt->sgl, sgt->nents, DMA_FROM_DEVICE); + if (!nents) { + if (xd->last_sg) { + sg_unmark_end(xd->last_sg); + xd->last_sg = NULL; + xd->sgt.nents = xd->sgt.orig_nents; + } + xd->mapped = 0; + return -1; + } + + sg = 
sgt->sgl; + for (i = 0; i < nents; i++) { + xd->sge[i].addr = ib_sg_dma_address(ib_dev, sg); + xd->sge[i].length = ib_sg_dma_len(ib_dev, sg); + xd->sge[i].lkey = lkey; + sg = sg_next(sg); + } + + xd->mapped = nents; + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_map_txmad_work_req */ +/*---------------------------------------------------------------------------*/ +int xio_map_txmad_work_req(struct xio_device *dev, struct xio_work_req *xd) +{ + struct ib_device *ib_dev = dev->ib_dev; + u32 lkey = dev->mr->lkey; + struct sg_table *sgt = &xd->sgt; + struct scatterlist *sg; + int nents; + int i; + + if (!xd->nents) + return -1; + + sg = sgt->sgl; + + nents = ib_dma_map_sg(ib_dev, sgt->sgl, sgt->nents, DMA_TO_DEVICE); + if (!nents) { + if (xd->last_sg) { + sg_unmark_end(xd->last_sg); + xd->last_sg = NULL; + sgt->nents = sgt->orig_nents; + } + xd->mapped = 0; + return -1; + } + + for (i = 0; i < nents; i++) { + xd->sge[i].addr = ib_sg_dma_address(ib_dev, sg); + xd->sge[i].length = ib_sg_dma_len(ib_dev, sg); + xd->sge[i].lkey = lkey; + sg = sg_next(sg); + } + xd->mapped = nents; + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_remap_work_req */ +/*---------------------------------------------------------------------------*/ +int xio_remap_work_req(struct xio_device *odev, struct xio_device *ndev, + struct xio_work_req *xd, + enum dma_data_direction direction) +{ + struct ib_device *ib_odev = odev->ib_dev; + struct ib_device *ib_ndev = ndev->ib_dev; + u32 lkey = ndev->mr->lkey; + struct sg_table *sgt = &xd->sgt; + struct scatterlist *sg; + int nents; + int i; + + if (!xd->nents || !xd->mapped) + return -1; + + /* Assume scatterlist is terminated properly */ + + if ((direction == DMA_TO_DEVICE) && + (xd->send_wr.send_flags & IB_SEND_INLINE)) { + /* Just update lkey */ + for (i = 0; i < xd->nents; i++) + xd->sge[i].lkey = lkey; + return 0; + } + + ib_dma_unmap_sg(ib_odev, sgt->sgl, sgt->nents, direction); + nents = ib_dma_map_sg(ib_ndev, sgt->sgl, sgt->nents, direction); + if (!nents) { + if (xd->last_sg) { + /* rdmad */ + sg_unmark_end(xd->last_sg); + xd->last_sg = NULL; + sgt->nents = sgt->orig_nents; + } else { + /* Disconnect header from data if any*/ + if (direction == DMA_TO_DEVICE && + sgt->orig_nents > sgt->nents) { + sg_mark_end(sgt->sgl); + sgt->nents = sgt->orig_nents; + } + } + xd->mapped = 0; + return -1; + } + + sg = sgt->sgl; + for (i = 0; i < nents; i++) { + xd->sge[i].addr = ib_sg_dma_address(ib_ndev, sg); + xd->sge[i].length = ib_sg_dma_len(ib_ndev, sg); + xd->sge[i].lkey = lkey; + sg = sg_next(sg); + } + xd->mapped = nents; + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_reset_desc */ +/*---------------------------------------------------------------------------*/ +void xio_reset_desc(struct xio_mem_desc *desc) +{ + memset(&desc->sgt, 0, sizeof(desc->sgt)); +} + +/*---------------------------------------------------------------------------*/ +/* xio_unmap_desc */ +/*---------------------------------------------------------------------------*/ +void xio_unmap_desc(struct xio_rdma_transport *rdma_hndl, + struct xio_mem_desc *desc, + enum dma_data_direction direction) +{ + struct xio_device *dev = rdma_hndl->dev; + struct ib_device *ib_dev = dev->ib_dev; + + if (!desc->nents || !desc->mapped) + return; + + /* fast unregistration routine may do nothing but it is always exists */ + 
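The comment above refers to a deliberate design choice: the device keeps a small ops table for fast memory registration, and devices without FRWR/FMR support install no-op handlers, so callers can invoke the hooks unconditionally (the dummy implementations appear later in this file as xio_reg_rdma_mem_dummy/xio_unreg_mem_dummy). A minimal stand-alone sketch of that pattern follows; the names struct fastreg_ops, reg_dummy and unreg_dummy are invented for the example.

    #include <stdio.h>

    struct mem_desc;        /* opaque for the example */

    struct fastreg_ops {
            int  (*reg)(struct mem_desc *desc);
            void (*unreg)(struct mem_desc *desc);
    };

    static int reg_dummy(struct mem_desc *desc)
    {
            (void)desc;
            return 0;               /* nothing to register */
    }

    static void unreg_dummy(struct mem_desc *desc)
    {
            (void)desc;             /* nothing to unregister */
    }

    /* a device without fast-registration support still gets valid hooks */
    static const struct fastreg_ops no_fastreg = { reg_dummy, unreg_dummy };

    int main(void)
    {
            const struct fastreg_ops *ops = &no_fastreg;

            /* call sites never need a NULL check */
            if (!ops->reg(NULL))
                    printf("registration path ran (no-op)\n");
            ops->unreg(NULL);
            return 0;
    }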
dev->fastreg.unreg_rdma_mem(rdma_hndl, desc, direction); + + /* Assume scatterlist is terminated properly */ + + ib_dma_unmap_sg(ib_dev, desc->sgt.sgl, desc->sgt.nents, direction); + + desc->mapped = 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_map_desc */ +/*---------------------------------------------------------------------------*/ +int xio_map_desc(struct xio_rdma_transport *rdma_hndl, + struct xio_mem_desc *desc, + enum dma_data_direction direction, + unsigned int *sqe_used) +{ + struct xio_device *dev = rdma_hndl->dev; + struct ib_device *ib_dev = dev->ib_dev; + int nents; + + if (!desc->nents) + return -1; + + /* Assume scatterlist is terminated properly */ + + nents = ib_dma_map_sg(ib_dev, desc->sgt.sgl, desc->sgt.nents, + direction); + if (!nents) { + memset(&desc->sgt, 0, sizeof(desc->sgt)); + desc->mapped = 0; + return -1; + } + desc->mapped = nents; + + /* fast registration routine may do nothing but it is always exists */ + if (dev->fastreg.reg_rdma_mem(rdma_hndl, desc, direction, sqe_used)) { + ib_dma_unmap_sg(ib_dev, desc->sgt.sgl, desc->sgt.nents, + direction); + memset(&desc->sgt, 0, sizeof(desc->sgt)); + desc->mapped = 0; + return -1; + } + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_remap_desc */ +/*---------------------------------------------------------------------------*/ +int xio_remap_desc(struct xio_rdma_transport *rdma_ohndl, + struct xio_rdma_transport *rdma_nhndl, + struct xio_mem_desc *desc, + enum dma_data_direction direction, + unsigned int *sqe_used) +{ + struct xio_device *dev; + struct ib_device *ib_dev; + int nents; + + if (!desc->nents || !desc->mapped) + return -1; + + dev = rdma_ohndl->dev; + ib_dev = dev->ib_dev; + /* fast unregistration routine may do nothing but it is + * always exists */ + dev->fastreg.unreg_rdma_mem(rdma_ohndl, desc, direction); + + /* Assume scatterlist is terminated properly */ + ib_dma_unmap_sg(ib_dev, desc->sgt.sgl, desc->sgt.nents, direction); + + dev = rdma_nhndl->dev; + ib_dev = dev->ib_dev; + nents = ib_dma_map_sg(ib_dev, desc->sgt.sgl, desc->sgt.nents, + direction); + if (!nents) { + memset(&desc->sgt, 0, sizeof(desc->sgt)); + desc->mapped = 0; + return -1; + } + + /* fast registration routine may do nothing but it is always exists */ + if (dev->fastreg.reg_rdma_mem(rdma_nhndl, desc, direction, sqe_used)) { + ib_dma_unmap_sg(ib_dev, desc->sgt.sgl, desc->sgt.nents, + direction); + memset(&desc->sgt, 0, sizeof(desc->sgt)); + desc->mapped = 0; + return -1; + } + + return 0; +} + +void xio_free_dummy_pool(struct xio_rdma_transport *rdma_hndl) +{ +} + +int xio_create_dummy_pool(struct xio_rdma_transport *rdma_hndl) +{ + return 0; +} + +void xio_unreg_mem_dummy(struct xio_rdma_transport *rdma_hndl, + struct xio_mem_desc *desc, + enum dma_data_direction cmd_dir) +{ +} + +int xio_reg_rdma_mem_dummy(struct xio_rdma_transport *rdma_hndl, + struct xio_mem_desc *desc, + enum dma_data_direction cmd_dir, + unsigned int *sqe_used) +{ + desc->mem_reg.mem_h = NULL; + + return 0; +} + +/** + * xio_sg_to_page_vec - Translates scatterlist entries to physical addresses + * and returns the length of resulting physical address array (may be less than + * the original due to possible compaction). + * + * we build a "page vec" under the assumption that the SG meets the RDMA + * alignment requirements. 
Other than the first and last SG elements, all + * the "internal" elements can be compacted into a list whose elements are + * dma addresses of physical pages. The code also supports the odd case + * where a few fragments of the same page are present in the SG as + * consecutive elements. It also handles a single-entry SG. + */ + +static int xio_sg_to_page_vec(struct xio_mem_desc *mdesc, + struct ib_device *ibdev, + struct ib_fast_reg_page_list *data_frpl, + int *offset, int *data_size) +{ + struct scatterlist *sg, *sgl = mdesc->sgt.sgl; + u64 start_addr, end_addr, page, chunk_start = 0; + unsigned long total_sz = 0; + unsigned int dma_len; + int i, new_chunk, cur_page, last_ent = mdesc->nents - 1; + u64 *pages = data_frpl->page_list; + + /* compute the offset of first element */ + *offset = (u64)sgl[0].offset & ~PAGE_MASK; + + new_chunk = 1; + cur_page = 0; + for_each_sg(sgl, sg, mdesc->nents, i) { + start_addr = ib_sg_dma_address(ibdev, sg); + if (new_chunk) + chunk_start = start_addr; + dma_len = ib_sg_dma_len(ibdev, sg); + end_addr = start_addr + dma_len; + total_sz += dma_len; + + /* collect page fragments until aligned or end of SG list */ + if (!IS_PAGE_ALIGNED(end_addr) && i < last_ent) { + new_chunk = 0; + continue; + } + new_chunk = 1; + + /* address of the first page in the contiguous chunk; + masking relevant for the very first SG entry, + which might be unaligned */ + page = chunk_start & PAGE_MASK; + do { + if (cur_page >= data_frpl->max_page_list_len) { + ERROR_LOG("Overflowing page list " \ + "array. cur_page = %d, " \ + "max = %u, tot sz=%lu\n", + cur_page, + data_frpl->max_page_list_len, + total_sz); + break; + } + pages[cur_page++] = page; + page += PAGE_SIZE; + } while (page < end_addr); + } + + *data_size = total_sz; + TRACE_LOG("page_vec->data_size:%d cur_page %d\n", + *data_size, cur_page); + return cur_page; +} + +/** + * xio_data_buf_aligned_len - Tries to determine the maximal sub-list of a + * scatter-gather list of memory buffers that is correctly aligned for RDMA, + * and returns the number of correctly aligned entries. Supports the case where + * consecutive SG elements are actually fragments of the same physical page. + */ +static int xio_data_buf_aligned_len(struct xio_mem_desc *mdesc, + struct ib_device *ibdev) +{ + struct scatterlist *sgl, *sg, *next_sg = NULL; + u64 start_addr, end_addr; + int i, ret_len, start_check = 0; + + if (mdesc->nents == 1) + return 1; + + sgl = mdesc->sgt.sgl; + start_addr = ib_sg_dma_address(ibdev, sgl); + + for_each_sg(sgl, sg, mdesc->nents, i) { + if (start_check && !IS_PAGE_ALIGNED(start_addr)) + break; + + next_sg = sg_next(sg); + if (!next_sg) + break; + + end_addr = start_addr + ib_sg_dma_len(ibdev, sg); + start_addr = ib_sg_dma_address(ibdev, next_sg); + + if (end_addr == start_addr) { + start_check = 0; + continue; + } else { + start_check = 1; + } + + if (!IS_PAGE_ALIGNED(end_addr)) + break; + } + ret_len = (next_sg) ? 
i : i+1; + TRACE_LOG("Found %d aligned entries out of %d in mdesc:%p\n", + ret_len, mdesc->nents, mdesc); + return ret_len; +} + +/** + * xio_free_frwr_pool - releases the pool of fast_reg descriptors + */ +void xio_free_frwr_pool(struct xio_rdma_transport *rdma_hndl) +{ + struct xio_frwr *frwr = &rdma_hndl->fastreg.frwr; + struct fast_reg_descriptor *fdesc; + struct llist_node *node; + int i = 0; + + DEBUG_LOG("freeing rdma_hndl %p FRWR pool\n", rdma_hndl); + + node = llist_del_all(&frwr->pool); + while (node) { + fdesc = llist_entry(node, struct fast_reg_descriptor, + llist_entry); + node = llist_next(node); + ib_free_fast_reg_page_list(fdesc->data_frpl); + ib_dereg_mr(fdesc->data_mr); + kfree(fdesc); + i++; + } + + node = llist_del_all(&frwr->pool_ret); + while (node) { + fdesc = llist_entry(node, struct fast_reg_descriptor, + llist_entry); + node = llist_next(node); + ib_free_fast_reg_page_list(fdesc->data_frpl); + ib_dereg_mr(fdesc->data_mr); + kfree(fdesc); + i++; + } + + if (i < frwr->pool_size) + WARN_LOG("pool still has %d regions registered\n", + frwr->pool_size - i); +} + +/** + * xio_create_frwr_pool - Creates pool of fast_reg descriptors + * for fast registration work requests. + * returns 0 on success, or errno code on failure + */ +int xio_create_frwr_pool(struct xio_rdma_transport *rdma_hndl) +{ + struct xio_device *dev = rdma_hndl->dev; + struct xio_frwr *frwr = &rdma_hndl->fastreg.frwr; + struct fast_reg_descriptor *desc; + int i, ret; + + init_llist_head(&frwr->pool); + frwr->pool_size = 0; + /* There can be only max_tx_ready_tasks_num simultaneously inflight + * request tasks at any given time, each of which may need both RDMA + * read and write (both data form server to client may be big) + */ + for (i = 0; i < rdma_hndl->max_tx_ready_tasks_num * 2; i++) { + desc = kzalloc(sizeof(*desc), GFP_KERNEL); + if (!desc) { + ERROR_LOG("Failed to allocate a new fast_reg " \ + "descriptor\n"); + ret = -ENOMEM; + goto err; + } + + desc->data_frpl = ib_alloc_fast_reg_page_list(dev->ib_dev, + XIO_MAX_IOV + 1); + if (IS_ERR(desc->data_frpl)) { + ret = PTR_ERR(desc->data_frpl); + ERROR_LOG("Failed to allocate ib_fast_reg_page_list " \ + "err=%d\n", ret); + kfree(desc); + goto err; + } + desc->data_frpl->max_page_list_len = XIO_MAX_IOV + 1; + + desc->data_mr = ib_alloc_fast_reg_mr(dev->pd, XIO_MAX_IOV + 1); + if (IS_ERR(desc->data_mr)) { + ret = PTR_ERR(desc->data_mr); + ERROR_LOG("Failed to allocate ib_fast_reg_mr err=%d\n", + ret); + ib_free_fast_reg_page_list(desc->data_frpl); + kfree(desc); + goto err; + } + desc->valid = true; + llist_add(&desc->llist_entry, &frwr->pool_ret); + frwr->pool_size++; + } + + return 0; +err: + xio_free_frwr_pool(rdma_hndl); + return ret; +} + +void xio_unreg_mem_frwr(struct xio_rdma_transport *rdma_hndl, + struct xio_mem_desc *mdesc, + enum dma_data_direction cmd_dir) +{ + struct xio_mem_reg *reg = &mdesc->mem_reg; + struct fast_reg_descriptor *fdesc = reg->mem_h; + + if (!reg->mem_h) + return; + + reg->mem_h = NULL; + llist_add(&fdesc->llist_entry, &rdma_hndl->fastreg.frwr.pool_ret); +} + +static int xio_fast_reg_mr(struct fast_reg_descriptor *fdesc, + struct xio_rdma_transport *rdma_hndl, + struct xio_mem_reg *reg, + u32 offset, unsigned int data_size, + unsigned int page_list_len, + unsigned int *sqe_used) +{ + struct ib_send_wr fastreg_wr, inv_wr; + struct ib_send_wr *bad_wr, *wr = NULL; + u8 key; + int ret; + + if (!fdesc->valid) { + /* don't send signaled */ + memset(&inv_wr, 0, sizeof(inv_wr)); + inv_wr.opcode = IB_WR_LOCAL_INV; + inv_wr.wr_id 
= uint64_from_ptr(&rdma_hndl->frwr_task); + inv_wr.ex.invalidate_rkey = fdesc->data_mr->rkey; + /* Bump the key */ + key = (u8)(fdesc->data_mr->rkey & 0x000000FF); + ib_update_fast_reg_key(fdesc->data_mr, ++key); + /* send two work requests */ + wr = &inv_wr; + wr->next = &fastreg_wr; + rdma_hndl->sqe_avail--; + (*sqe_used)++; + } else { + wr = &fastreg_wr; + } + rdma_hndl->sqe_avail--; + (*sqe_used)++; + /* Prepare FASTREG WR */ + memset(&fastreg_wr, 0, sizeof(fastreg_wr)); + fastreg_wr.opcode = IB_WR_FAST_REG_MR; + fastreg_wr.wr_id = uint64_from_ptr(&rdma_hndl->frwr_task); + fastreg_wr.wr.fast_reg.iova_start = + fdesc->data_frpl->page_list[0] + offset; + fastreg_wr.wr.fast_reg.page_list = fdesc->data_frpl; + fastreg_wr.wr.fast_reg.page_list_len = page_list_len; + fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT; + fastreg_wr.wr.fast_reg.length = data_size; + fastreg_wr.wr.fast_reg.rkey = fdesc->data_mr->rkey; + fastreg_wr.wr.fast_reg.access_flags = (IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_WRITE | + IB_ACCESS_REMOTE_READ); + + ret = ib_post_send(rdma_hndl->qp, wr, &bad_wr); + if (unlikely(ret)) { + ERROR_LOG("fast registration failed, ret:%d\n", ret); + return ret; + } + + fdesc->valid = false; + + reg->mem_h = (void *)fdesc; + reg->lkey = fdesc->data_mr->lkey; + reg->rkey = fdesc->data_mr->rkey; + reg->va = fdesc->data_frpl->page_list[0] + offset; + reg->len = data_size; + + return ret; +} + +static struct fast_reg_descriptor *get_fdesc( + struct xio_rdma_transport *rdma_hndl) +{ + struct llist_node *node, *nnode; + struct fast_reg_descriptor *fdesc; + + node = llist_del_first(&rdma_hndl->fastreg.frwr.pool); + if (node) + return llist_entry(node, struct fast_reg_descriptor, + llist_entry); + + node = llist_del_all(&rdma_hndl->fastreg.frwr.pool_ret); + if (!node) + return NULL; + + nnode = llist_reverse_order(node); + fdesc = llist_entry(nnode, struct fast_reg_descriptor, llist_entry); + nnode = llist_next(nnode); + fdesc->llist_entry.next = NULL; + + if (nnode) + llist_add_batch(nnode, node, &rdma_hndl->fastreg.frwr.pool); + + return fdesc; +} + +/** + * xio_reg_rdma_mem_frwr - Registers memory intended for RDMA, + * using Fast Registration WR (if possible) obtaining rkey and va + * + * returns 0 on success, errno code on failure + */ +static int xio_reg_rdma_mem_frwr(struct xio_rdma_transport *rdma_hndl, + struct xio_mem_desc *mdesc, + enum dma_data_direction cmd_dir, + unsigned int *sqe_used) +{ + struct xio_device *dev = rdma_hndl->dev; + struct ib_device *ibdev = dev->ib_dev; + struct fast_reg_descriptor *fdesc; + unsigned int data_size, page_list_len; + int err, aligned_len; + u32 offset; + + /* if there a single dma entry, fail to dummy */ + if (mdesc->nents == 1) + return xio_reg_rdma_mem_dummy(rdma_hndl, mdesc, + cmd_dir, sqe_used); + + /* if not enough sqe for post_send */ + if (rdma_hndl->sqe_avail < 2) { + ERROR_LOG("no rdma_hndl->sqe_avail=%d\n", rdma_hndl->sqe_avail); + return xio_reg_rdma_mem_dummy(rdma_hndl, mdesc, + cmd_dir, sqe_used); + } + + aligned_len = xio_data_buf_aligned_len(mdesc, ibdev); + if (aligned_len != mdesc->nents) + /* fail to dummy, i.e. will use multiple RDMA */ + return xio_reg_rdma_mem_dummy(rdma_hndl, mdesc, + cmd_dir, sqe_used); + + fdesc = get_fdesc(rdma_hndl); + if (!fdesc) { + /* We may have temporary pressure on pool */ + DEBUG_LOG("pool is empty!\n"); + /* fail to dummy, i.e. 
will use multiple RDMA */ + return xio_reg_rdma_mem_dummy(rdma_hndl, mdesc, + cmd_dir, sqe_used); + } + + page_list_len = xio_sg_to_page_vec(mdesc, dev->ib_dev, + fdesc->data_frpl, + &offset, &data_size); + + if (unlikely(page_list_len * PAGE_SIZE < data_size)) { + ERROR_LOG("fast reg page_list too short to hold this SG\n"); + err = -EINVAL; + goto err_reg; + } + + err = xio_fast_reg_mr(fdesc, rdma_hndl, &mdesc->mem_reg, + offset, data_size, page_list_len, sqe_used); + if (err) + goto err_reg; + + return 0; +err_reg: + llist_add(&fdesc->llist_entry, &rdma_hndl->fastreg.frwr.pool); + return err; +} + +int xio_fast_reg_init(enum xio_fast_reg reg, struct xio_fastreg_ops *ops) +{ + switch (reg) { + case XIO_FAST_MEM_NONE: + ops->alloc_rdma_reg_res = xio_create_dummy_pool; + ops->free_rdma_reg_res = xio_free_dummy_pool; + ops->reg_rdma_mem = xio_reg_rdma_mem_dummy; + ops->unreg_rdma_mem = xio_unreg_mem_dummy; + WARN_LOG("Fast registration not supported\n"); + return 0; + case XIO_FAST_MEM_FRWR: + ops->alloc_rdma_reg_res = xio_create_frwr_pool; + ops->free_rdma_reg_res = xio_free_frwr_pool; + ops->reg_rdma_mem = xio_reg_rdma_mem_frwr; + ops->unreg_rdma_mem = xio_unreg_mem_frwr; + DEBUG_LOG("FRWR supported, using FRWR for registration\n"); + return 0; + case XIO_FAST_MEM_FMR: + ERROR_LOG("FMRs not yet implemented\n"); + return -1; + default: + ERROR_LOG("Unknown registration type\n"); + return -1; + } +} + +/* drivers/block/nvme.c nvme_map_bio */ +#define XIO_VEC_NOT_VIRT_MERGEABLE(vec1, vec2) ((vec2)->bv_offset || \ + (((vec1)->bv_offset + (vec1)->bv_len) % PAGE_SIZE)) + +void xio_copy_vmsg_to_buffer(struct xio_vmsg *vmsg, + struct xio_mp_mem *mp) +{ + void *ptr = mp->addr; + int i; + struct xio_sg_table_ops *sgtbl_ops; + void *sgtbl; + void *sge; + + sgtbl = xio_sg_table_get(vmsg); + sgtbl_ops = xio_sg_table_ops_get(vmsg->sgl_type); + + sge = sge_first(sgtbl_ops, sgtbl); + for (i = 0; i < tbl_nents(sgtbl_ops, sgtbl) - 1; i++) { + memmove(ptr, sge_addr(sgtbl_ops, sge), + sge_length(sgtbl_ops, sge)); + ptr += sge_length(sgtbl_ops, sge); + sge = sge_next(sgtbl_ops, sgtbl, sge); + } +} + +void xio_reinit_header(struct xio_rdma_task *rdma_task, size_t len) +{ + sg_set_page(rdma_task->txd.sgt.sgl, virt_to_page(rdma_task->buf), + len, offset_in_page(rdma_task->buf)); +} + +int xio_vmsg_to_tx_sgt(struct xio_vmsg *vmsg, struct sg_table *sgt, int *nents) +{ + switch (vmsg->sgl_type) { + case XIO_SGL_TYPE_IOV: + case XIO_SGL_TYPE_IOV_PTR: + WARN_LOG("wrong vmsg type %d\n", vmsg->sgl_type); + if (unlikely(vmsg->data_tbl.nents)) { + *nents = 0; + return -EINVAL; + } + goto done; + case XIO_SGL_TYPE_SCATTERLIST: + break; + default: + WARN_LOG("wrong vmsg type %d\n", vmsg->sgl_type); + *nents = 0; + return -EINVAL; + } + + /* TODO: validate vmsg sgl */ + if (unlikely(vmsg->data_tbl.nents > XIO_MAX_IOV)) { + WARN_LOG("scatterlist too long %u\n", vmsg->data_tbl.nents); + *nents = 0; + return -EINVAL; + } + +#ifdef CONFIG_DEBUG_SG + BUG_ON(vmsg->data_tbl.sgl->sg_magic != SG_MAGIC); +#endif + + /* Only the header will be sent */ + if (vmsg->data_tbl.nents) { + /* txd has one more entry we need to chain */ + sg_unmark_end(sgt->sgl); + /* Assume scatterlist is terminated properly */ + sg_chain(sgt->sgl, 2, vmsg->data_tbl.sgl); + sgt->nents = 1 + vmsg->data_tbl.nents; + } + +done: + *nents = sgt->nents; + + return 0; +} + +int xio_vmsg_to_sgt(struct xio_vmsg *vmsg, struct sg_table *sgt, int *nents) +{ + switch (vmsg->sgl_type) { + case XIO_SGL_TYPE_IOV: + case XIO_SGL_TYPE_IOV_PTR: + WARN_LOG("wrong vmsg type %d\n", 
vmsg->sgl_type); + if (unlikely(vmsg->data_tbl.nents)) { + *nents = 0; + return -EINVAL; + } + memset(sgt, 0, sizeof(*sgt)); + goto done; + case XIO_SGL_TYPE_SCATTERLIST: + break; + default: + WARN_LOG("wrong vmsg type %d\n", vmsg->sgl_type); + *nents = 0; + return -EINVAL; + } + + /* TODO: validate vmsg sgl */ + if (unlikely(vmsg->data_tbl.nents > XIO_MAX_IOV)) { + WARN_LOG("scatterlist too long %u\n", vmsg->data_tbl.nents); + *nents = 0; + return -EINVAL; + } + + if (vmsg->data_tbl.nents) + memcpy(sgt, &vmsg->data_tbl, sizeof(*sgt)); + else + memset(sgt, 0, sizeof(*sgt)); + +done: + *nents = sgt->nents; + + return 0; +} diff --git a/open_src/xio/src/kernel/transport/rdma/xio_rdma_transport.h b/open_src/xio/src/kernel/transport/rdma/xio_rdma_transport.h new file mode 100644 index 0000000..24824f9 --- /dev/null +++ b/open_src/xio/src/kernel/transport/rdma/xio_rdma_transport.h @@ -0,0 +1,634 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ +#ifndef XIO_RDMA_TRANSPORT_H +#define XIO_RDMA_TRANSPORT_H + +/*---------------------------------------------------------------------------*/ +/* externals */ +/*---------------------------------------------------------------------------*/ +extern struct xio_rdma_options rdma_options; +extern struct xio_options *g_poptions; + +/* poll_cq definitions */ +#define MAX_RDMA_ADAPTERS 64 /* 64 adapters per unit */ +#define MAX_POLL_WC 128 + +#define ADDR_RESOLVE_TIMEOUT 1000 +#define ROUTE_RESOLVE_TIMEOUT 1000 + +#define MAX_SGE (XIO_MAX_IOV + 1) + +/* 256 rdma_write + 1 send */ +#define MAX_SEND_WR (XIO_MAX_IOV + 1) +#define MAX_RECV_WR (XIO_MAX_IOV) +#define EXTRA_RQE 32 +#define SEND_QE NUM_START_PRIMARY_POOL_TASKS - EXTRA_RQE - MAX_RECV_WR +#define XIO_DEV_ATTR_MAX_SGE 30 + +/* - one for send, (one for frwr, one for local invalidate) x (r1 + w1) + */ +#define MAX_CQE_PER_QP (5 * MAX_SEND_WR + MAX_RECV_WR + EXTRA_RQE) +#define CQE_ALLOC_SIZE (10 * MAX_CQE_PER_QP) + +#define MAX_HDR_SZ 512 +#define BUDGET_SIZE 1024 +#define MAX_NUM_DELAYED_ARM 16 + +#define NUM_CONN_SETUP_TASKS 2 /* one posted for req rx, + * one for reply tx + */ +#define CONN_SETUP_BUF_SIZE 4096 + +#define NUM_START_PRIMARY_POOL_TASKS 312 /* must be enough to send few + + fully post_recv buffers + */ +#define NUM_ALLOC_PRIMARY_POOL_TASKS 512 + +#define NUM_START_PHANTOM_POOL_TASKS 0 +#define NUM_ALLOC_PHANTOM_POOL_TASKS 256 +#define NUM_MAX_PHANTOM_POOL_TASKS 32768 + +#define SOFT_CQ_MOD 8 +#define HARD_CQ_MOD 64 +#define SEND_THRESHOLD 8 + +#ifndef PAGE_SHIFT +#define PAGE_SHIFT 12 +#endif +#ifndef PAGE_SIZE +#define PAGE_SIZE BIT(PAGE_SHIFT) +#endif +#ifndef PAGE_MASK +#define PAGE_MASK (~(PAGE_SIZE - 1)) +#endif + +#define USECS_IN_SEC 1000000 +#define NSECS_IN_USEC 1000 + +#define XIO_TO_RDMA_TASK(xt, rt) \ + struct xio_rdma_task *rt = (struct xio_rdma_task *)(xt)->dd_data +#define XIO_TO_RDMA_HNDL(xt, rh) \ + struct xio_rdma_transport *(rh) = \ + (struct xio_rdma_transport *)(xt)->context + +#define xio_prefetch(p) prefetch(p) + +#define XIO_FRWR_LI_WRID 0xffffffffffffffffULL +#define XIO_BEACON_WRID 0xfffffffffffffffeULL + +/*---------------------------------------------------------------------------*/ +/* enums */ +/*---------------------------------------------------------------------------*/ +enum xio_ib_op_code { + XIO_IB_NULL, + XIO_IB_RECV = 1, + XIO_IB_SEND, + XIO_IB_RDMA_WRITE, + XIO_IB_RDMA_READ, + XIO_IB_RDMA_WRITE_DIRECT, + XIO_IB_RDMA_READ_DIRECT +}; + +struct xio_transport_base; +struct xio_rdma_transport; + +/*---------------------------------------------------------------------------*/ +struct xio_rdma_options { + int enable_mem_pool; + int enable_dma_latency; + int max_in_iovsz; + int max_out_iovsz; + int qp_cap_max_inline_data; +}; + +#define XIO_REQ_HEADER_VERSION 1 + +struct __attribute__((__packed__)) xio_rdma_req_hdr { + uint8_t version; /* request version */ + uint8_t flags; + uint16_t req_hdr_len; /* req header length */ + uint16_t sn; /* serial number */ + uint16_t ack_sn; /* ack serial number */ + + uint16_t credits; /* peer send credits */ + uint32_t ltid; /* local task id */ + uint8_t in_ib_op; /* opcode for peers */ + uint8_t out_ib_op; + + uint16_t in_num_sge; + uint16_t out_num_sge; + uint32_t pad1; + + uint16_t ulp_hdr_len; /* ulp header length */ + uint16_t ulp_pad_len; /* pad_len length */ + uint32_t remain_data_len;/* remaining data length */ + uint64_t ulp_imm_len; /* ulp data length */ +}; + +#define XIO_RSP_HEADER_VERSION 1 + +struct __attribute__((__packed__)) 
xio_rdma_rsp_hdr { + uint8_t version; /* response version */ + uint8_t flags; + uint16_t rsp_hdr_len; /* rsp header length */ + uint16_t sn; /* serial number */ + uint16_t ack_sn; /* ack serial number */ + + uint16_t credits; /* peer send credits */ + uint32_t rtid; /* remote task id */ + uint8_t out_ib_op; /* opcode for peers */ + uint8_t pad; + + uint16_t pad1; + uint16_t out_num_sge; + uint32_t status; /* status */ + + uint32_t ltid; /* local task id */ + uint16_t ulp_hdr_len; /* ulp header length */ + uint16_t ulp_pad_len; /* pad_len length */ + + uint32_t remain_data_len;/* remaining data length */ + + uint64_t ulp_imm_len; /* ulp data length */ +}; + +struct __attribute__((__packed__)) xio_rdma_setup_msg { + u16 credits; /* peer send credits */ + u16 sq_depth; + u16 rq_depth; + u16 rkey_tbl_size; + u64 buffer_sz; + u32 max_in_iovsz; + u32 max_out_iovsz; + u32 max_header_len; + u32 pad; +}; + +struct __attribute__((__packed__)) xio_nop_hdr { + u16 hdr_len; /* req header length */ + u16 sn; /* serial number */ + u16 ack_sn; /* ack serial number */ + u16 credits; /* peer send credits */ + u8 opcode; /* opcode for peers */ + u8 flags; /* not used */ + u16 pad; +}; + +struct __attribute__((__packed__)) xio_rdma_read_ack_hdr { + uint16_t hdr_len; /* req header length */ + uint32_t rtid; /* remote task id */ +}; + +struct __attribute__((__packed__)) xio_rdma_cancel_hdr { + uint16_t hdr_len; /* req header length */ + uint16_t sn; /* serial number */ + uint32_t result; +}; + +struct xio_work_req { + union { + struct ib_send_wr send_wr; + struct ib_recv_wr recv_wr; + }; + struct ib_sge *sge; + struct sg_table sgt; /* same as sg_table with pointer to last*/ + struct scatterlist *last_sg; + int nents; /* number of sgl entries */ + int mapped; /* number of mapped entries */ +}; + +struct xio_rdma_task { + enum xio_ib_op_code out_ib_op; + enum xio_ib_op_code in_ib_op; + + /* The buffer mapped with the 3 xio_work_req + * used to transfer the headers + */ + void *buf; + /* for txd & rxd + * txd needs to chain the header sgl + * with task->omsg->out so sgl[1] is needed + */ + struct scatterlist tx_sgl[2]; + struct scatterlist rx_sgl[1]; + unsigned long size; + struct xio_work_req txd; + struct xio_work_req rxd; + struct xio_work_req rdmad; + + /* User (from vmsg) or pool buffer used for */ + u32 sqe_used; + u16 read_num_mem_desc; + u16 write_num_mem_desc; + struct xio_mem_desc read_mem_desc; + struct xio_mem_desc write_mem_desc; + + /* What this side got from the peer for RDMA R/W */ + + u16 req_out_num_sge; + u16 req_in_num_sge; + u16 rsp_out_num_sge; + u16 pad1; + + /* can serve send/rdma write */ + struct xio_sge *req_in_sge; + + /* can serve send/rdma read */ + struct xio_sge *req_out_sge; + + /* can serve send/rdma read response/rdma write */ + struct xio_sge *rsp_out_sge; + + unsigned int phantom_idx; + u16 sn; + u16 pad[3]; + +}; + +struct xio_cq { + struct xio_ev_data event_data; + struct ib_cq *cq; + struct xio_context *ctx; + struct xio_device *dev; + struct ib_wc *wc_array; + u32 wc_array_len; + u32 max_cqe; /* max snd elements */ + u32 cq_depth; /* current cq depth */ + u32 alloc_sz; /* allocation factor */ + u32 cqe_avail; /* free elements */ + struct kref kref; /* utilization counter */ + u32 num_delayed_arm; + u32 pad; + u32 polling_started; + struct timespec polling_end_time; + struct list_head trans_list; /* list of all transports + * attached to this cq + */ + struct list_head cq_list_entry; /* on device cq list */ + struct xio_observer observer; /* context observer */ + u64 events; 
+ u64 wqes; + u64 scheds; +}; + +struct xio_page_vec { + u64 *pages; + int length; + int offset; + int data_size; +}; + +enum xio_fast_reg { + XIO_FAST_MEM_NONE, + XIO_FAST_MEM_FRWR, + XIO_FAST_MEM_FMR +}; + +struct xio_fmr { + struct ib_fmr_pool *pool; /* pool of IB FMRs */ + struct xio_page_vec *page_vec; /* represents SG to fmr maps* + * maps serialized as tx is*/ +}; + +struct xio_frwr { + struct llist_head pool; + struct llist_head pool_ret; + int pool_size; +}; + +union xio_fastreg { + struct xio_fmr fmr; + struct xio_frwr frwr; +}; + +struct xio_fastreg_ops { + int (*alloc_rdma_reg_res)(struct xio_rdma_transport *rdma_hndl); + void (*free_rdma_reg_res)(struct xio_rdma_transport *rdma_hndl); + int (*reg_rdma_mem)(struct xio_rdma_transport *rdma_hndl, + struct xio_mem_desc *desc, + enum dma_data_direction cmd_dir, + unsigned int *sqe_used); + void (*unreg_rdma_mem)(struct xio_rdma_transport *rdma_hndl, + struct xio_mem_desc *desc, + enum dma_data_direction cmd_dir); +}; + +struct xio_device { + struct xio_fastreg_ops fastreg; + struct list_head cq_list; /* list of all cq per device */ + rwlock_t cq_lock; + struct ib_device *ib_dev; + struct ib_pd *pd; + struct ib_mr *mr; + struct ib_device_attr device_attr; + struct xio_cq *cqs; + int cqs_used; + int port_num; + struct ib_event_handler event_handler; + struct kref kref; /* 1 + #xio_rdma_transport */ +}; + +struct xio_rdma_tasks_slab { + /* memory for non-rdma send/recv */ + struct kmem_cache *data_pool; + char name[32]; /* kmem_cache_create keeps a pointer to the pool's name + * Therefore the name must be valid until the pool + * is destroyed + */ + int buf_size; + int count; +}; + +struct xio_rdma_tasks_pool { + struct xio_device *dev; +}; + +struct __attribute__((__packed__)) xio_rkey_tbl_pack { + uint32_t old_rkey; + uint32_t new_rkey; +}; + +struct xio_rkey_tbl { + uint32_t old_rkey; + uint32_t new_rkey; +}; + +struct xio_rdma_transport { + struct xio_transport_base base; + struct xio_cq *tcq; + struct xio_device *dev; + struct ib_qp *qp; + struct xio_mempool *rdma_mempool; + struct xio_tasks_pool *phantom_tasks_pool; + union xio_fastreg fastreg; + struct xio_ev_data event_data_close; + struct xio_ev_data ev_data_timewait_exit; + + struct list_head trans_list_entry; + + /* tasks queues */ + struct list_head tx_ready_list; + struct list_head tx_comp_list; + struct list_head in_flight_list; + struct list_head rx_list; + struct list_head io_list; + struct list_head rdma_rd_req_list; + struct list_head rdma_rd_req_in_flight_list; + struct list_head rdma_rd_rsp_list; + struct list_head rdma_rd_rsp_in_flight_list; + + /* rx parameters */ + int rq_depth; /* max rcv allowed */ + int actual_rq_depth; /* max rcv allowed */ + int rqe_avail; /* recv queue elements + avail */ + uint16_t sim_peer_credits; /* simulates the peer + * credits management + * to control nop + * sends + */ + uint16_t credits; /* the ack this + peer sends */ + uint16_t peer_credits; + + uint16_t pad; + + /* fast path params */ + int rdma_rd_req_in_flight; + int rdma_rd_rsp_in_flight; + int sqe_avail; + enum xio_transport_state state; + + /* tx parameters */ + int kick_rdma_rd_req; + int kick_rdma_rd_rsp; + int reqs_in_flight_nr; + int rsps_in_flight_nr; + int tx_ready_tasks_num; + int max_tx_ready_tasks_num; + int max_inline_data; + size_t max_inline_buf_sz; + int max_sge; + uint16_t req_sig_cnt; + uint16_t rsp_sig_cnt; + /* sender window parameters */ + uint16_t sn; /* serial number */ + uint16_t ack_sn; /* serial number */ + + uint16_t max_sn; /* upper edge of + 
sender's window + 1 */ + + /* receiver window parameters */ + uint16_t exp_sn; /* lower edge of + receiver's window */ + + uint16_t max_exp_sn; /* upper edge of + receiver's window + 1 */ + + uint16_t beacon_sent:1; /* flag */ + + /* control path params */ + int sq_depth; /* max snd allowed */ + int num_tasks; + uint16_t client_initiator_depth; + uint16_t client_responder_resources; + + uint32_t peer_max_in_iovsz; + uint32_t peer_max_out_iovsz; + + /* connection's flow control */ + size_t membuf_sz; + + struct xio_transport *transport; + struct rdma_event_channel *cm_channel; + struct rdma_cm_id *cm_id; + struct xio_tasks_pool_cls initial_pool_cls; + struct xio_tasks_pool_cls primary_pool_cls; + + struct xio_rdma_setup_msg setup_rsp; + + /* for reconnect */ + struct xio_rkey_tbl *rkey_tbl; + struct xio_rkey_tbl *peer_rkey_tbl; + + uint16_t handler_nesting; + + /* for reconnect */ + uint16_t rkey_tbl_size; + uint16_t peer_rkey_tbl_size; + uint32_t peer_max_header; + + /* too big to be on stack - use as temporaries */ + union { + struct xio_msg dummy_msg; + struct xio_work_req dummy_wr; + }; + struct ib_send_wr beacon; + struct xio_task beacon_task; + struct xio_task frwr_task; + uint32_t trans_attr_mask; + struct xio_transport_attr trans_attr; +}; + +/* + * The next routines deal with comparing 16 bit unsigned integers + * and worry about wrap-around (automatic with unsigned arithmetic). + */ + +static inline s16 before(u16 seq1, u16 seq2) +{ + return (s16)(seq1 - seq2) < 0; +} + +#define after(seq2, seq1) before(seq1, seq2) + +static inline s16 before_eq(u16 seq1, u16 seq2) +{ + return (s16)(seq1 - seq2) <= 0; +} + +#define after_eq(seq2, seq1) before_eq(seq1, seq2) + +/* is s2<=s1tv_sec * USECS_IN_SEC; + retval += time_spec->tv_nsec / NSECS_IN_USEC; + + return retval; +} + +/* xio_rdma_verbs.c */ +void xio_mr_list_init(void); +int xio_mr_list_free(void); +const char *xio_ib_wc_opcode_str(enum ib_wc_opcode opcode); +const char *xio_ib_wc_status_str(enum ib_wc_status status); +const char *xio_rdma_event_str(enum rdma_cm_event_type event); + +/* xio_rdma_datapath.c */ +void xio_data_ev_handler(int fd, int events, void *user_context); +int xio_post_recv(struct xio_rdma_transport *rdma_hndl, + struct xio_task *task, int num_recv_bufs); +int xio_rdma_rearm_rq(struct xio_rdma_transport *rdma_hndl); +int xio_rdma_send(struct xio_transport_base *transport, + struct xio_task *task); +int xio_rdma_poll(struct xio_transport_base *transport, + long min_nr, long max_nr, + struct timespec *ts_timeout); + +/* xio_rdma_management.c */ +int xio_rdma_get_max_header_size(void); + +int xio_rdma_get_inline_buffer_size(void); + +void xio_rdma_close_cb(struct kref *kref); + +/* Should create a xio_memory.h */ +void xio_unmap_rx_work_req(struct xio_device *dev, struct xio_work_req *xd); +void xio_unmap_tx_work_req(struct xio_device *dev, struct xio_work_req *xd); +int xio_map_rx_work_req(struct xio_device *dev, struct xio_work_req *xd); +int xio_map_tx_work_req(struct xio_device *dev, struct xio_work_req *xd); +void xio_unmap_rxmad_work_req(struct xio_device *dev, struct xio_work_req *xd); +void xio_unmap_txmad_work_req(struct xio_device *dev, struct xio_work_req *xd); +int xio_map_rxmad_work_req(struct xio_device *dev, struct xio_work_req *xd); +int xio_map_txmad_work_req(struct xio_device *dev, struct xio_work_req *xd); +int xio_remap_work_req(struct xio_device *odev, struct xio_device *ndev, + struct xio_work_req *xd, + enum dma_data_direction direction); + +void xio_reset_desc(struct xio_mem_desc *desc); + 
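+/* Typical use of the descriptor mapping helpers declared below (an
+ * illustrative sketch only, based on the xio_map_desc()/xio_unmap_desc()
+ * implementations earlier in this patch):
+ *
+ *	unsigned int sqe_used = 0;
+ *
+ *	if (xio_map_desc(rdma_hndl, desc, DMA_TO_DEVICE, &sqe_used))
+ *		return -1;
+ *	(post the RDMA work requests that reference desc->mem_reg; when fast
+ *	 registration was used, mem_reg holds the lkey/rkey/va/len)
+ *	xio_unmap_desc(rdma_hndl, desc, DMA_TO_DEVICE);
+ */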
+void xio_unmap_desc(struct xio_rdma_transport *rdma_hndl, + struct xio_mem_desc *desc, + enum dma_data_direction direction); + +int xio_map_desc(struct xio_rdma_transport *rdma_hndl, + struct xio_mem_desc *desc, + enum dma_data_direction direction, + unsigned int *sqe_used); + +int xio_remap_desc(struct xio_rdma_transport *rdma_ohndl, + struct xio_rdma_transport *rdma_nhndl, + struct xio_mem_desc *desc, + enum dma_data_direction direction, + unsigned int *sqe_used); + +void xio_reinit_header(struct xio_rdma_task *rdma_task, size_t len); + +int xio_vmsg_to_tx_sgt(struct xio_vmsg *vmsg, struct sg_table *sgt, int *nents); +int xio_vmsg_to_sgt(struct xio_vmsg *vmsg, struct sg_table *sgt, int *nents); + +int xio_fast_reg_init(enum xio_fast_reg reg, struct xio_fastreg_ops *ops); + +void xio_cq_data_callback(struct ib_cq *cq, void *cq_context); + +struct xio_task *xio_rdma_primary_task_alloc( + struct xio_rdma_transport *rdma_hndl); + +struct xio_task *xio_rdma_primary_task_lookup( + struct xio_rdma_transport *rdma_hndl, + int tid); + +void xio_rdma_task_free(struct xio_rdma_transport *rdma_hndl, + struct xio_task *task); + +static inline void xio_device_get(struct xio_device *dev) +{ + kref_get(&dev->kref); +} + +void xio_device_down(struct kref *kref); + +static inline void xio_device_put(struct xio_device *dev) +{ + kref_put(&dev->kref, xio_device_down); +} + +void xio_rdma_poll_completions(struct xio_cq *tcq, int timeout_us); + +#endif /* XIO_RDMA_TRANSPORT_H */ diff --git a/open_src/xio/src/kernel/transport/rdma/xio_rdma_utils.c b/open_src/xio/src/kernel/transport/rdma/xio_rdma_utils.c new file mode 100644 index 0000000..0e7d1d4 --- /dev/null +++ b/open_src/xio/src/kernel/transport/rdma/xio_rdma_utils.c @@ -0,0 +1,240 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#include +#include +#include +#include + +#include "libxio.h" +#include "xio_common.h" +#include "xio_log.h" +#include "xio_observer.h" +#include "xio_mempool.h" +#include "xio_protocol.h" +#include "xio_mbuf.h" +#include "xio_task.h" +#include "xio_ktransport.h" +#include "xio_transport.h" +#include "xio_rdma_transport.h" +#include "xio_rdma_utils.h" +#include "xio_sg_table.h" + +/*---------------------------------------------------------------------------*/ +/* xio_validate_rdma_op */ +/*---------------------------------------------------------------------------*/ +int xio_validate_rdma_op(struct xio_vmsg *vmsg, + struct xio_sge *rsg_list, size_t rsize, + int op_size, + int max_sge, + int *tasks_used) +{ + struct sg_table *sgtbl; + struct scatterlist *liov; + uint64_t raddr; + uint32_t rlen; + uint64_t laddr; + uint32_t llen; + uint32_t tot_len = 0; + size_t lsize, lnents; + int l, r; + int k = 0; + + if (rsize < 1) { + ERROR_LOG("rsize:%zu\n", rsize); + *tasks_used = 0; + return -1; + } + sgtbl = &vmsg->data_tbl; + lnents = sgtbl->nents; + + if (lnents > XIO_MAX_IOV || lnents == 0) { + WARN_LOG("IOV size %zu\n", lnents); + *tasks_used = 0; + return -EINVAL; + } + + lsize = lnents; + liov = sgtbl->sgl; + + r = 0; + rlen = rsg_list[r].length; + raddr = rsg_list[r].addr; + + l = 0; + laddr = uint64_from_ptr(sg_virt(liov)); + llen = liov->length; + + /* At least one task */ + *tasks_used = 1; + + while (1) { + if (rlen < llen) { + r++; + tot_len += rlen; + if (r == rsize) + break; + llen -= rlen; + laddr += rlen; + raddr = rsg_list[r].addr; + rlen = rsg_list[r].length; + (*tasks_used)++; + k = 0; + } else if (llen < rlen) { + /* check page alignment when source buff spans more + * then one destination buffer */ + l++; + tot_len += llen; + if (l == lsize) + break; + liov = sg_next(liov); + k++; + if (k == max_sge - 1) { + /* reached last index */ + (*tasks_used)++; + k = 0; + } + rlen -= llen; + raddr += llen; + laddr = uint64_from_ptr(sg_virt(liov)); + llen = liov->length; + } else { + l++; + liov = sg_next(liov); + r++; + tot_len += llen; + if ((l == lsize) || (r == rsize)) + break; + + laddr = uint64_from_ptr(sg_virt(liov)); + llen = liov->length; + raddr = rsg_list[r].addr; + rlen = rsg_list[r].length; + (*tasks_used)++; + k = 0; + } + } + + /* not enough buffers to complete */ + if (tot_len < op_size) { + *tasks_used = 0; + ERROR_LOG("iovec exhausted, tot=%d, op=%d, max_sge=%d\n", + tot_len, op_size, max_sge); + ERROR_LOG("rsize=%zu, lents=%zu\n", rsize, lnents); + return -1; + } + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_cm_rej_reason_str */ +/*---------------------------------------------------------------------------*/ +const char *xio_cm_rej_reason_str(int reason) +{ + switch (reason) { + case IB_CM_REJ_NO_QP: + return "No QP"; + case IB_CM_REJ_NO_EEC: + return "No EEC"; + case IB_CM_REJ_NO_RESOURCES: + return "No Resources"; + case IB_CM_REJ_TIMEOUT: + return "Timeout"; + case 
IB_CM_REJ_UNSUPPORTED: + return "Unsupported"; + case IB_CM_REJ_INVALID_COMM_ID: + return "Invalid COMM ID"; + case IB_CM_REJ_INVALID_COMM_INSTANCE: + return "Invalid COMM Instance"; + case IB_CM_REJ_INVALID_SERVICE_ID: + return "Invalid Service ID"; + case IB_CM_REJ_INVALID_TRANSPORT_TYPE: + return "Invalid Transport Type"; + case IB_CM_REJ_STALE_CONN: + return "Stale Connection"; + case IB_CM_REJ_RDC_NOT_EXIST: + return "RDC not exist"; + case IB_CM_REJ_INVALID_GID: + return "Invalid GID"; + case IB_CM_REJ_INVALID_LID: + return "Invalid LID"; + case IB_CM_REJ_INVALID_SL: + return "Invalid SL"; + case IB_CM_REJ_INVALID_TRAFFIC_CLASS: + return "Invalid Traffic Class"; + case IB_CM_REJ_INVALID_HOP_LIMIT: + return "Invalid HOP Limit"; + case IB_CM_REJ_INVALID_PACKET_RATE: + return "Invalid Packet Rate"; + case IB_CM_REJ_INVALID_ALT_GID: + return "Invalid Alt GID"; + case IB_CM_REJ_INVALID_ALT_LID: + return "Invalid Alt LID"; + case IB_CM_REJ_INVALID_ALT_SL: + return "Invalid Alt SL"; + case IB_CM_REJ_INVALID_ALT_TRAFFIC_CLASS: + return "Invalid Alt Traffic Class"; + case IB_CM_REJ_INVALID_ALT_HOP_LIMIT: + return "Invalid Alt HOP Limit"; + case IB_CM_REJ_INVALID_ALT_PACKET_RATE: + return "Invalid Alt Packet Rate"; + case IB_CM_REJ_PORT_CM_REDIRECT: + return "Invalid Alt Packet Rate"; + case IB_CM_REJ_PORT_REDIRECT: + return "Port Redirect"; + case IB_CM_REJ_INVALID_MTU: + return "Invalid MTU"; + case IB_CM_REJ_INSUFFICIENT_RESP_RESOURCES: + return "Invalid Response Resources"; + case IB_CM_REJ_CONSUMER_DEFINED: + return "Consumer Defined"; + case IB_CM_REJ_INVALID_RNR_RETRY: + return "Invalid RNR Retry"; + case IB_CM_REJ_DUPLICATE_LOCAL_COMM_ID: + return "Duplicate Local Comm ID"; + case IB_CM_REJ_INVALID_CLASS_VERSION: + return "Invalid Class Version"; + case IB_CM_REJ_INVALID_FLOW_LABEL: + return "Invalid Flow Label"; + case IB_CM_REJ_INVALID_ALT_FLOW_LABEL: + return "Invalid Alt Flow Label"; + default: + return "Unknown error"; + }; +} + diff --git a/open_src/xio/src/kernel/transport/rdma/xio_rdma_utils.h b/open_src/xio/src/kernel/transport/rdma/xio_rdma_utils.h new file mode 100644 index 0000000..71c5729 --- /dev/null +++ b/open_src/xio/src/kernel/transport/rdma/xio_rdma_utils.h @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef XIO_RDMA_UTILS_H +#define XIO_RDMA_UTILS_H + +int xio_validate_rdma_op(struct xio_vmsg *vmsg, + struct xio_sge *rsg_list, size_t rsize, + int op_size, + int max_sge, + int *tasks_used); + +const char *xio_cm_rej_reason_str(int reason); + +#endif /*XIO_RDMA_UTILS_H */ + diff --git a/open_src/xio/src/kernel/transport/rdma/xio_rdma_verbs.c b/open_src/xio/src/kernel/transport/rdma/xio_rdma_verbs.c new file mode 100644 index 0000000..2a5c7a9 --- /dev/null +++ b/open_src/xio/src/kernel/transport/rdma/xio_rdma_verbs.c @@ -0,0 +1,140 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2); available from the file COPYING in the main + * directory of this source tree); or the Mellanox Technologies® BSD license + * below); + * + * - Redistribution and use in source and binary forms); with or without + * modification); are permitted provided that the following conditions + * are met); + * + * - Redistributions of source code must retain the above copyright + * notice); this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice); this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES); INCLUDING); BUT NOT LIMITED TO); THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT); INDIRECT); INCIDENTAL); SPECIAL); EXEMPLARY); OR + * CONSEQUENTIAL DAMAGES (INCLUDING); BUT NOT LIMITED TO); PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE); DATA); OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY); WHETHER IN + * CONTRACT); STRICT LIABILITY); OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE); EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ +#include +#include +#include + +#include "libxio.h" +#include "xio_observer.h" +#include "xio_log.h" +#include "xio_common.h" +#include "xio_mem.h" +#include "xio_protocol.h" +#include "xio_mbuf.h" +#include "xio_task.h" +#include "xio_mempool.h" +#include "xio_ktransport.h" +#include "xio_transport.h" +#include "xio_rdma_transport.h" + +/*---------------------------------------------------------------------------*/ +/* globals */ +/*---------------------------------------------------------------------------*/ + +extern struct list_head dev_list; + +#define XX(a) case (a): return #a + +/*---------------------------------------------------------------------------*/ +/* ibv_wc_opcode_str */ +/*---------------------------------------------------------------------------*/ + +const char *xio_ib_wc_status_str(enum ib_wc_status status) +{ + switch (status) { + XX(IB_WC_SUCCESS); + XX(IB_WC_LOC_LEN_ERR); + XX(IB_WC_LOC_QP_OP_ERR); + XX(IB_WC_LOC_EEC_OP_ERR); + XX(IB_WC_LOC_PROT_ERR); + XX(IB_WC_WR_FLUSH_ERR); + XX(IB_WC_MW_BIND_ERR); + XX(IB_WC_BAD_RESP_ERR); + XX(IB_WC_LOC_ACCESS_ERR); + XX(IB_WC_REM_INV_REQ_ERR); + XX(IB_WC_REM_ACCESS_ERR); + XX(IB_WC_REM_OP_ERR); + XX(IB_WC_RETRY_EXC_ERR); + XX(IB_WC_RNR_RETRY_EXC_ERR); + XX(IB_WC_LOC_RDD_VIOL_ERR); + XX(IB_WC_REM_INV_RD_REQ_ERR); + XX(IB_WC_REM_ABORT_ERR); + XX(IB_WC_INV_EECN_ERR); + XX(IB_WC_INV_EEC_STATE_ERR); + XX(IB_WC_FATAL_ERR); + XX(IB_WC_RESP_TIMEOUT_ERR); + XX(IB_WC_GENERAL_ERR); + break; + default: return "IB_WC_STATUS_UNKNOWN"; + } +} + +/*---------------------------------------------------------------------------*/ +/* ibv_wc_opcode_str */ +/*---------------------------------------------------------------------------*/ + +const char *xio_ib_wc_opcode_str(enum ib_wc_opcode opcode) +{ + switch (opcode) { + XX(IB_WC_SEND); + XX(IB_WC_RDMA_WRITE); + XX(IB_WC_RDMA_READ); + XX(IB_WC_COMP_SWAP); + XX(IB_WC_FETCH_ADD); + XX(IB_WC_BIND_MW); + /* recv-side); inbound completion */ + XX(IB_WC_RECV); + XX(IB_WC_RECV_RDMA_WITH_IMM); + break; + default: return "IB_WC_OPCODE_UNKNOWN"; + } +} + +const char *xio_rdma_event_str(enum rdma_cm_event_type event) +{ + switch (event) { + XX(RDMA_CM_EVENT_ADDR_RESOLVED); + XX(RDMA_CM_EVENT_ADDR_ERROR); + XX(RDMA_CM_EVENT_ROUTE_RESOLVED); + XX(RDMA_CM_EVENT_ROUTE_ERROR); + XX(RDMA_CM_EVENT_CONNECT_REQUEST); + XX(RDMA_CM_EVENT_CONNECT_RESPONSE); + XX(RDMA_CM_EVENT_CONNECT_ERROR); + XX(RDMA_CM_EVENT_UNREACHABLE); + XX(RDMA_CM_EVENT_REJECTED); + XX(RDMA_CM_EVENT_ESTABLISHED); + XX(RDMA_CM_EVENT_DISCONNECTED); + XX(RDMA_CM_EVENT_DEVICE_REMOVAL); + XX(RDMA_CM_EVENT_MULTICAST_JOIN); + XX(RDMA_CM_EVENT_MULTICAST_ERROR); + XX(RDMA_CM_EVENT_ADDR_CHANGE); + XX(RDMA_CM_EVENT_TIMEWAIT_EXIT); + break; + default: return "RDMA_CM_UNKNOWN"; + } +} diff --git a/open_src/xio/src/kernel/transport/tcp/Makefile.in b/open_src/xio/src/kernel/transport/tcp/Makefile.in new file mode 100644 index 0000000..7d3597f --- /dev/null +++ b/open_src/xio/src/kernel/transport/tcp/Makefile.in @@ -0,0 +1,63 @@ +# Makefile.in for kernel module + +SHELL = /bin/sh +INSTALL = @INSTALL@ +mkdir_p = mkdir -p +VERSION = @PACKAGE_VERSION@ +OFED_CFLAGS = @OFED_CFLAGS@ +KSYMVERS = @TCP_SYMVERS@ + +NOSTDINC_FLAGS += @OFED_CFLAGS@ + +DISTFILES = Makefile.in configure.ac configure ../install-sh \ + xio_log.h xio_mem.h xio_os.h \ + xio_tcp_transport.h \ + xio_tcp_datapath.c xio_tcp_management.c + +xiomoduledir = @kmoduledir@/extra/net/xio + +xiomodule := xio_tcp.ko + +all: all-@ENABLE_XIO_MODULE@ +install: install-@ENABLE_XIO_MODULE@ +uninstall: 
uninstall-@ENABLE_XIO_MODULE@ + +all-n: +install-n: +uninstall-n: + +all-y: all-spec + +install-y: all + $(mkdir_p) $(DESTDIR)$(xiomoduledir) + $(INSTALL) -m 644 $(xiomodule) $(DESTDIR)$(xiomoduledir)/$(xiomodule) + -/sbin/depmod -a + +uninstall-y: + rm -f $(DESTDIR)$(xiomoduledir)/$(xiomodule) + -/sbin/depmod -a + +clean: + -rm -f $(xiomodule) *.o .*.cmd *.mod.c *.ko *.s */*.o *.order *.symvers *.unsigned + +distclean: clean + rm -f Makefile configure config.status + rm -f config.h config.log config.status config.cache + rm -rf .tmp_versions autom4te.cache + +maintainer-clean: distclean + +distdir: $(DISTFILES) + cp -p $(DISTFILES) $(distdir) + + +ccflags-y += $(OFED_CFLAGS) -I$(SUBDIRS) -I$(SUBDIRS)/.. -I$(SUBDIRS)/../.. -I$(SUBDIRS)/../../xio -I$(SUBDIRS)/../../../common -I$(SUBDIRS)/../../../../include -I$(SUBDIRS)/../../../libxio_os/linuxkernel + +obj-m := xio_tcp.o +xio_tcp-objs := \ + xio_tcp_datapath.o \ + xio_tcp_management.o + +all-spec: + export NOSTDINC_FLAGS + $(MAKE) -C @kernelsrc@ SUBDIRS=`pwd` KBUILD_EXTRA_SYMBOLS="$(KSYMVERS)" @KERNELMAKE_PARAMS@ modules diff --git a/open_src/xio/src/kernel/transport/tcp/autogen.sh b/open_src/xio/src/kernel/transport/tcp/autogen.sh new file mode 100644 index 0000000..28dd57d --- /dev/null +++ b/open_src/xio/src/kernel/transport/tcp/autogen.sh @@ -0,0 +1,3 @@ +#! /bin/sh + +autoconf diff --git a/open_src/xio/src/kernel/transport/tcp/configure.ac b/open_src/xio/src/kernel/transport/tcp/configure.ac new file mode 100644 index 0000000..fad3895 --- /dev/null +++ b/open_src/xio/src/kernel/transport/tcp/configure.ac @@ -0,0 +1,212 @@ +AC_INIT([xio-kernel],[2.0],[libxio@accellio.org]) + +AC_PROG_INSTALL + +runver=`uname -r` +bad_kernel_version=no +ENABLE_XIO_MODULE=y +# do not build against ofed until kernel module can be built out of kernel +# tree +ENABLE_OFED_BUILD=y +KERNELCFLAGS= + +kernelsrc= +kernelbuild= +AC_ARG_WITH(kernel, + [ --with-kernel=PATH Specify location of kernel source ], + [kernelsrc="$withval"; kernelbuild="$withval"]) +AC_ARG_WITH(kernel-build, + [ --with-kernel-build=PATH Specify location of kernel build ], + [kernelbuild="$withval"]) +AC_ARG_ENABLE(kernel-module, + [ --enable-kernel-module Compile kernel module ]) + + +TCP_SYMVERS=`pwd`/../../xio/Module.symvers + +if test "$ENABLE_OFED_BUILD" = "y"; then +AC_MSG_CHECKING([if ofed installed]) +MLNX_OFED=`if ofed_info 2>/dev/null | grep MLNX_OFED >/dev/null 2>/dev/null; then echo true; else echo false; fi` +OFED_CFLAGS= + +if test "$MLNX_OFED" = "true"; then + AC_MSG_RESULT(yes) + + # Whether MLNX_OFED for ubuntu has been installed + MLNX_OFED_IB_UBUNTU_INSTALLED=`if dpkg -s mlnx-ofed-kernel-dkms >/dev/null 2>/dev/null; then echo true; else echo false; fi` + + # Whether MLNX_OFED for RedHat has been installed + MLNX_OFED_IB_RH_INSTALLED=`if rpm -q mlnx-ofa_kernel-devel >&/dev/null; then echo true; else echo false; fi` + + # Check if we have custom compiled kernel modules + if test "$MLNX_OFED_IB_RH_INSTALLED" = "false"; then + MLNX_OFED_IB_RH_INSTALLED=`if rpm -q kernel-ib-devel >&/dev/null; then echo true; else echo false; fi` + fi + + if test "$MLNX_OFED_IB_UBUNTU_INSTALLED" = "true"; then + OFED_VERS=`dpkg -s mlnx-ofed-kernel-dkms | awk -F\- '/Version/ {print $1}' | awk '{print $2}'` + OFED_CFLAGS=`echo -I/var/lib/dkms/mlnx-ofed-kernel/$OFED_VERS/build/include -include /var/lib/dkms/mlnx-ofed-kernel/$OFED_VERS/build/include/linux/compat-2.6.h` + fi + + if test "$MLNX_OFED_IB_RH_INSTALLED" = "true"; then + OFED_CFLAGS=`echo -I/usr/src/ofa_kernel/default/include -include 
/usr/src/ofa_kernel/default/include/linux/compat-2.6.h` + fi +else + AC_MSG_RESULT(no) + TCP_SYMVERS=`echo $TCP_SYMVERS ../compat/Module.symvers` + + # Whether or not the OFED kernel-ib-devel RPM has been installed. + OFED_KERNEL_IB_DEVEL_RPM_INSTALLED=`if rpm -q kernel-ib-devel 2>/dev/null | grep -q $(uname -r | sed 's/-/_/g'); then echo true; else echo false; fi` + + # Whether or not the OFED compat-rdma-devel RPM has been installed. + OFED_COMPAT_RDMA_DEVEL_RPM_INSTALLED=`if rpm -q compat-rdma-devel 2>/dev/null | grep -q $(uname -r | sed 's/-/_/g'); then echo true; else echo false; fi` + + if test "$OFED_KERNEL_IB_DEVEL_RPM_INSTALLED" = "true"; then + # Read OFED's config.mk, which contains the definition of the variable + # BACKPORT_INCLUDES. + cfile="/usr/src/ofa_kernel/config.mk" + if test -r "${cfile}"; then + echo "loading build-specific script '${cfile}'" + . "${cfile}" + else + cfile="/usr/src/ofa_kernel/default/config.mk" + if test -r "${cfile}"; then + echo "loading build-specific script '${cfile}'" + . "${cfile}" + fi + fi + + OFED_CFLAGS=`echo $BACKPORT_INCLUDES -I/usr/src/ofa_kernel/include` + fi + + if test "$OFED_COMPAT_RDMA_DEVEL_RPM_INSTALLED" = "true"; then + OFED_CFLAGS=`echo -I/usr/src/compat-rdma/include -include /usr/src/compat-rdma/include/linux/compat-2.6.h` + fi +fi + +AC_MSG_NOTICE([ofed include files directory is ${OFED_CFLAGS}]) +AC_SUBST(OFED_CFLAGS) +AC_SUBST(TCP_SYMVERS) +fi + +if test "$ENABLE_XIO_MODULE" = y; then + AC_MSG_CHECKING([kernel source directory]) + if test -z "$kernelsrc"; then + kernelbuild= + sourcelink=/lib/modules/${runver}/source + buildlink=/lib/modules/${runver}/build + + if test -e $sourcelink; then + kernelsrc=`(cd $sourcelink; /bin/pwd)` + fi + if test -e $buildlink; then + kernelbuild=`(cd $buildlink; /bin/pwd)` + fi + if test -z "$kernelsrc"; then + kernelsrc=$kernelbuild + fi + if test -z "$kernelsrc" -o -z "$kernelbuild"; then + AC_MSG_RESULT([Not found]) + AC_MSG_ERROR([ + *** Please specify the location of the kernel source with + *** the '--with-kernel=SRCDIR' option]) + fi + fi + AC_MSG_RESULT([$kernelsrc]) + AC_MSG_CHECKING([kernel build directory]) + AC_MSG_RESULT([$kernelbuild]) + + AC_MSG_CHECKING([kernel source version]) + if test -r $kernelbuild/include/linux/version.h && fgrep -q UTS_RELEASE $kernelbuild/include/linux/version.h; then + kernsrcver=`(echo "#include "; echo "kernsrcver=UTS_RELEASE") | cpp -I $kernelbuild/include | grep "^kernsrcver=" | cut -d \" -f 2` + elif test -r $kernelbuild/include/linux/utsrelease.h && fgrep -q UTS_RELEASE $kernelbuild/include/linux/utsrelease.h; then + kernsrcver=`(echo "#include "; echo "kernsrcver=UTS_RELEASE") | cpp -I $kernelbuild/include | grep "^kernsrcver=" | cut -d \" -f 2` + elif test -r $kernelbuild/include/generated/utsrelease.h && fgrep -q UTS_RELEASE $kernelbuild/include/generated/utsrelease.h; then + kernsrcver=`(echo "#include "; echo "kernsrcver=UTS_RELEASE") | cpp -I $kernelbuild/include | grep "^kernsrcver=" | cut -d \" -f 2` + fi + if test -z "$kernsrcver"; then + AC_MSG_RESULT([Not found]) + AC_MSG_ERROR([ + *** Cannot determine the version of the linux kernel source. 
Please + *** prepare the kernel before running this script]) + fi + AC_MSG_RESULT([$kernsrcver]) + kmoduledir=${INSTALL_MOD_PATH}/lib/modules/$kernsrcver + AC_SUBST(kernelsrc) + AC_SUBST(kmoduledir) + + if echo "$kernsrcver" | egrep -q ["^(2.4|2.6.[0-8]([^0-9]|\$))"]; then + bad_kernel_version=yes + AC_MSG_NOTICE([ +NOTE: Disabled building the kernel module, because this release only +NOTE: supports Linux versions 2.6.9 or later. You can use the kernel +NOTE: module from an earlier XIO release with the library from this +NOTE: release.]) + else + xio_configured=no + kernel_autoconf=$kernelbuild/include/linux/autoconf.h + AC_MSG_CHECKING([if XIO is configured in the kernel]) + if test -f $kernel_autoconf; then + if grep -q "^#define CONFIG_XIO 1" $kernel_autoconf || grep -q "^#define CONFIG_XIO_MODULE 1" $kernel_autoconf; then + xio_configured=yes + fi + fi + AC_MSG_RESULT([$xio_configured]) + if test -z "$enable_kernel_module" -a "$xio_configured" = yes; then + ENABLE_XIO_MODULE=n + fi + fi +fi + +if test "$ENABLE_XIO_MODULE" = n; then + AC_MSG_NOTICE([ +NOTE: Detected that XIO is already present in the kernel, so +NOTE: building of kernel module is disabled. To force building +NOTE: of kernel module use the '--enable-kernel-module' option.]) +fi + +if test "$enable_kernel_module" = no; then + ENABLE_XIO_MODULE=n +fi +if test "$bad_kernel_version" = yes; then + ENABLE_XIO_MODULE=n +fi + +AC_MSG_CHECKING([is ENABLE_XIO_MODULE defined]) +if test "$ENABLE_XIO_MODULE" = y; then + AC_MSG_RESULT([yes]) +else + AC_MSG_RESULT([no]) +fi + +AC_SUBST(ENABLE_XIO_MODULE) + +if test "$ENABLE_XIO_MODULE" = y; then + AC_MSG_CHECKING([if kernel defines kzalloc function]) + if egrep -qw "kzalloc" $kernelsrc/include/linux/slab.h; then + AC_DEFINE(HAVE_KZALLOC, 1, [kzalloc() is defined]) + AC_MSG_RESULT([yes]) + else + AC_MSG_RESULT([no]) + fi + + isuml=no + KERNELMAKE_PARAMS= + KERNELCPPFLAGS= + AC_MSG_CHECKING([if this is user mode linux]) + if test -f $kernelbuild/include/linux/autoconf.h && egrep -q "^#define CONFIG_(USERMODE|UML) 1" $kernelbuild/include/linux/autoconf.h; then + isuml=yes + KERNELMAKE_PARAMS="ARCH=um" + KERNELCPPFLAGS="-D__arch_um__ -DSUBARCH=\\\"i386\\\" -D_LARGEFILE64_SOURCE -I${kernelsrc}/arch/um/include -Derrno=kernel_errno -I${kernelsrc}/arch/um/kernel/tt/include -I${kernelsrc}/arch/um/kernel/skas/include" + fi + AC_MSG_RESULT([$isuml]) + if test "$kernelbuild" != "$kernelsrc"; then + KERNELMAKE_PARAMS="$KERNELMAKE_PARAMS O=$kernelbuild" + fi + AC_SUBST(KERNELMAKE_PARAMS) + AC_SUBST(KERNELCPPFLAGS) + AC_SUBST(KERNELCFLAGS) +fi + +AC_CONFIG_FILES([Makefile]) +AC_OUTPUT diff --git a/open_src/xio/src/kernel/transport/tcp/install-sh b/open_src/xio/src/kernel/transport/tcp/install-sh new file mode 100644 index 0000000..6781b98 --- /dev/null +++ b/open_src/xio/src/kernel/transport/tcp/install-sh @@ -0,0 +1,520 @@ +#!/bin/sh +# install - install a program, script, or datafile + +scriptversion=2009-04-28.21; # UTC + +# This originates from X11R5 (mit/util/scripts/install.sh), which was +# later released in X11R6 (xc/config/util/install.sh) with the +# following copyright and license. 
+# +# Copyright (C) 1994 X Consortium +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to +# deal in the Software without restriction, including without limitation the +# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +# sell copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# X CONSORTIUM BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN +# AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNEC- +# TION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +# +# Except as contained in this notice, the name of the X Consortium shall not +# be used in advertising or otherwise to promote the sale, use or other deal- +# ings in this Software without prior written authorization from the X Consor- +# tium. +# +# +# FSF changes to this file are in the public domain. +# +# Calling this script install-sh is preferred over install.sh, to prevent +# `make' implicit rules from creating a file called install from it +# when there is no Makefile. +# +# This script is compatible with the BSD install script, but was written +# from scratch. + +nl=' +' +IFS=" "" $nl" + +# set DOITPROG to echo to test this script + +# Don't use :- since 4.3BSD and earlier shells don't like it. +doit=${DOITPROG-} +if test -z "$doit"; then + doit_exec=exec +else + doit_exec=$doit +fi + +# Put in absolute file names if you don't have them in your path; +# or use environment vars. + +chgrpprog=${CHGRPPROG-chgrp} +chmodprog=${CHMODPROG-chmod} +chownprog=${CHOWNPROG-chown} +cmpprog=${CMPPROG-cmp} +cpprog=${CPPROG-cp} +mkdirprog=${MKDIRPROG-mkdir} +mvprog=${MVPROG-mv} +rmprog=${RMPROG-rm} +stripprog=${STRIPPROG-strip} + +posix_glob='?' +initialize_posix_glob=' + test "$posix_glob" != "?" || { + if (set -f) 2>/dev/null; then + posix_glob= + else + posix_glob=: + fi + } +' + +posix_mkdir= + +# Desired mode of installed file. +mode=0755 + +chgrpcmd= +chmodcmd=$chmodprog +chowncmd= +mvcmd=$mvprog +rmcmd="$rmprog -f" +stripcmd= + +src= +dst= +dir_arg= +dst_arg= + +copy_on_change=false +no_target_directory= + +usage="\ +Usage: $0 [OPTION]... [-T] SRCFILE DSTFILE + or: $0 [OPTION]... SRCFILES... DIRECTORY + or: $0 [OPTION]... -t DIRECTORY SRCFILES... + or: $0 [OPTION]... -d DIRECTORIES... + +In the 1st form, copy SRCFILE to DSTFILE. +In the 2nd and 3rd, copy all SRCFILES to DIRECTORY. +In the 4th, create DIRECTORIES. + +Options: + --help display this help and exit. + --version display version info and exit. + + -c (ignored) + -C install only if different (preserve the last data modification time) + -d create directories instead of installing files. + -g GROUP $chgrpprog installed files to GROUP. + -m MODE $chmodprog installed files to MODE. + -o USER $chownprog installed files to USER. + -s $stripprog installed files. + -t DIRECTORY install into DIRECTORY. + -T report an error if DSTFILE is a directory. 
+ +Environment variables override the default commands: + CHGRPPROG CHMODPROG CHOWNPROG CMPPROG CPPROG MKDIRPROG MVPROG + RMPROG STRIPPROG +" + +while test $# -ne 0; do + case $1 in + -c) ;; + + -C) copy_on_change=true;; + + -d) dir_arg=true;; + + -g) chgrpcmd="$chgrpprog $2" + shift;; + + --help) echo "$usage"; exit $?;; + + -m) mode=$2 + case $mode in + *' '* | *' '* | *' +'* | *'*'* | *'?'* | *'['*) + echo "$0: invalid mode: $mode" >&2 + exit 1;; + esac + shift;; + + -o) chowncmd="$chownprog $2" + shift;; + + -s) stripcmd=$stripprog;; + + -t) dst_arg=$2 + shift;; + + -T) no_target_directory=true;; + + --version) echo "$0 $scriptversion"; exit $?;; + + --) shift + break;; + + -*) echo "$0: invalid option: $1" >&2 + exit 1;; + + *) break;; + esac + shift +done + +if test $# -ne 0 && test -z "$dir_arg$dst_arg"; then + # When -d is used, all remaining arguments are directories to create. + # When -t is used, the destination is already specified. + # Otherwise, the last argument is the destination. Remove it from $@. + for arg + do + if test -n "$dst_arg"; then + # $@ is not empty: it contains at least $arg. + set fnord "$@" "$dst_arg" + shift # fnord + fi + shift # arg + dst_arg=$arg + done +fi + +if test $# -eq 0; then + if test -z "$dir_arg"; then + echo "$0: no input file specified." >&2 + exit 1 + fi + # It's OK to call `install-sh -d' without argument. + # This can happen when creating conditional directories. + exit 0 +fi + +if test -z "$dir_arg"; then + trap '(exit $?); exit' 1 2 13 15 + + # Set umask so as not to create temps with too-generous modes. + # However, 'strip' requires both read and write access to temps. + case $mode in + # Optimize common cases. + *644) cp_umask=133;; + *755) cp_umask=22;; + + *[0-7]) + if test -z "$stripcmd"; then + u_plus_rw= + else + u_plus_rw='% 200' + fi + cp_umask=`expr '(' 777 - $mode % 1000 ')' $u_plus_rw`;; + *) + if test -z "$stripcmd"; then + u_plus_rw= + else + u_plus_rw=,u+rw + fi + cp_umask=$mode$u_plus_rw;; + esac +fi + +for src +do + # Protect names starting with `-'. + case $src in + -*) src=./$src;; + esac + + if test -n "$dir_arg"; then + dst=$src + dstdir=$dst + test -d "$dstdir" + dstdir_status=$? + else + + # Waiting for this to be detected by the "$cpprog $src $dsttmp" command + # might cause directories to be created, which would be especially bad + # if $src (and thus $dsttmp) contains '*'. + if test ! -f "$src" && test ! -d "$src"; then + echo "$0: $src does not exist." >&2 + exit 1 + fi + + if test -z "$dst_arg"; then + echo "$0: no destination specified." >&2 + exit 1 + fi + + dst=$dst_arg + # Protect names starting with `-'. + case $dst in + -*) dst=./$dst;; + esac + + # If destination is a directory, append the input filename; won't work + # if double slashes aren't ignored. + if test -d "$dst"; then + if test -n "$no_target_directory"; then + echo "$0: $dst_arg: Is a directory" >&2 + exit 1 + fi + dstdir=$dst + dst=$dstdir/`basename "$src"` + dstdir_status=0 + else + # Prefer dirname, but fall back on a substitute if dirname fails. + dstdir=` + (dirname "$dst") 2>/dev/null || + expr X"$dst" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \ + X"$dst" : 'X\(//\)[^/]' \| \ + X"$dst" : 'X\(//\)$' \| \ + X"$dst" : 'X\(/\)' \| . 2>/dev/null || + echo X"$dst" | + sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{ + s//\1/ + q + } + /^X\(\/\/\)[^/].*/{ + s//\1/ + q + } + /^X\(\/\/\)$/{ + s//\1/ + q + } + /^X\(\/\).*/{ + s//\1/ + q + } + s/.*/./; q' + ` + + test -d "$dstdir" + dstdir_status=$? 
+ fi + fi + + obsolete_mkdir_used=false + + if test $dstdir_status != 0; then + case $posix_mkdir in + '') + # Create intermediate dirs using mode 755 as modified by the umask. + # This is like FreeBSD 'install' as of 1997-10-28. + umask=`umask` + case $stripcmd.$umask in + # Optimize common cases. + *[2367][2367]) mkdir_umask=$umask;; + .*0[02][02] | .[02][02] | .[02]) mkdir_umask=22;; + + *[0-7]) + mkdir_umask=`expr $umask + 22 \ + - $umask % 100 % 40 + $umask % 20 \ + - $umask % 10 % 4 + $umask % 2 + `;; + *) mkdir_umask=$umask,go-w;; + esac + + # With -d, create the new directory with the user-specified mode. + # Otherwise, rely on $mkdir_umask. + if test -n "$dir_arg"; then + mkdir_mode=-m$mode + else + mkdir_mode= + fi + + posix_mkdir=false + case $umask in + *[123567][0-7][0-7]) + # POSIX mkdir -p sets u+wx bits regardless of umask, which + # is incompatible with FreeBSD 'install' when (umask & 300) != 0. + ;; + *) + tmpdir=${TMPDIR-/tmp}/ins$RANDOM-$$ + trap 'ret=$?; rmdir "$tmpdir/d" "$tmpdir" 2>/dev/null; exit $ret' 0 + + if (umask $mkdir_umask && + exec $mkdirprog $mkdir_mode -p -- "$tmpdir/d") >/dev/null 2>&1 + then + if test -z "$dir_arg" || { + # Check for POSIX incompatibilities with -m. + # HP-UX 11.23 and IRIX 6.5 mkdir -m -p sets group- or + # other-writeable bit of parent directory when it shouldn't. + # FreeBSD 6.1 mkdir -m -p sets mode of existing directory. + ls_ld_tmpdir=`ls -ld "$tmpdir"` + case $ls_ld_tmpdir in + d????-?r-*) different_mode=700;; + d????-?--*) different_mode=755;; + *) false;; + esac && + $mkdirprog -m$different_mode -p -- "$tmpdir" && { + ls_ld_tmpdir_1=`ls -ld "$tmpdir"` + test "$ls_ld_tmpdir" = "$ls_ld_tmpdir_1" + } + } + then posix_mkdir=: + fi + rmdir "$tmpdir/d" "$tmpdir" + else + # Remove any dirs left behind by ancient mkdir implementations. + rmdir ./$mkdir_mode ./-p ./-- 2>/dev/null + fi + trap '' 0;; + esac;; + esac + + if + $posix_mkdir && ( + umask $mkdir_umask && + $doit_exec $mkdirprog $mkdir_mode -p -- "$dstdir" + ) + then : + else + + # The umask is ridiculous, or mkdir does not conform to POSIX, + # or it failed possibly due to a race condition. Create the + # directory the slow way, step by step, checking for races as we go. + + case $dstdir in + /*) prefix='/';; + -*) prefix='./';; + *) prefix='';; + esac + + eval "$initialize_posix_glob" + + oIFS=$IFS + IFS=/ + $posix_glob set -f + set fnord $dstdir + shift + $posix_glob set +f + IFS=$oIFS + + prefixes= + + for d + do + test -z "$d" && continue + + prefix=$prefix$d + if test -d "$prefix"; then + prefixes= + else + if $posix_mkdir; then + (umask=$mkdir_umask && + $doit_exec $mkdirprog $mkdir_mode -p -- "$dstdir") && break + # Don't fail if two instances are running concurrently. + test -d "$prefix" || exit 1 + else + case $prefix in + *\'*) qprefix=`echo "$prefix" | sed "s/'/'\\\\\\\\''/g"`;; + *) qprefix=$prefix;; + esac + prefixes="$prefixes '$qprefix'" + fi + fi + prefix=$prefix/ + done + + if test -n "$prefixes"; then + # Don't fail if two instances are running concurrently. + (umask $mkdir_umask && + eval "\$doit_exec \$mkdirprog $prefixes") || + test -d "$dstdir" || exit 1 + obsolete_mkdir_used=true + fi + fi + fi + + if test -n "$dir_arg"; then + { test -z "$chowncmd" || $doit $chowncmd "$dst"; } && + { test -z "$chgrpcmd" || $doit $chgrpcmd "$dst"; } && + { test "$obsolete_mkdir_used$chowncmd$chgrpcmd" = false || + test -z "$chmodcmd" || $doit $chmodcmd $mode "$dst"; } || exit 1 + else + + # Make a couple of temp file names in the proper directory. 
+ dsttmp=$dstdir/_inst.$$_ + rmtmp=$dstdir/_rm.$$_ + + # Trap to clean up those temp files at exit. + trap 'ret=$?; rm -f "$dsttmp" "$rmtmp" && exit $ret' 0 + + # Copy the file name to the temp name. + (umask $cp_umask && $doit_exec $cpprog "$src" "$dsttmp") && + + # and set any options; do chmod last to preserve setuid bits. + # + # If any of these fail, we abort the whole thing. If we want to + # ignore errors from any of these, just make sure not to ignore + # errors from the above "$doit $cpprog $src $dsttmp" command. + # + { test -z "$chowncmd" || $doit $chowncmd "$dsttmp"; } && + { test -z "$chgrpcmd" || $doit $chgrpcmd "$dsttmp"; } && + { test -z "$stripcmd" || $doit $stripcmd "$dsttmp"; } && + { test -z "$chmodcmd" || $doit $chmodcmd $mode "$dsttmp"; } && + + # If -C, don't bother to copy if it wouldn't change the file. + if $copy_on_change && + old=`LC_ALL=C ls -dlL "$dst" 2>/dev/null` && + new=`LC_ALL=C ls -dlL "$dsttmp" 2>/dev/null` && + + eval "$initialize_posix_glob" && + $posix_glob set -f && + set X $old && old=:$2:$4:$5:$6 && + set X $new && new=:$2:$4:$5:$6 && + $posix_glob set +f && + + test "$old" = "$new" && + $cmpprog "$dst" "$dsttmp" >/dev/null 2>&1 + then + rm -f "$dsttmp" + else + # Rename the file to the real destination. + $doit $mvcmd -f "$dsttmp" "$dst" 2>/dev/null || + + # The rename failed, perhaps because mv can't rename something else + # to itself, or perhaps because mv is so ancient that it does not + # support -f. + { + # Now remove or move aside any old file at destination location. + # We try this two ways since rm can't unlink itself on some + # systems and the destination file might be busy for other + # reasons. In this case, the final cleanup might fail but the new + # file should still install successfully. + { + test ! -f "$dst" || + $doit $rmcmd -f "$dst" 2>/dev/null || + { $doit $mvcmd -f "$dst" "$rmtmp" 2>/dev/null && + { $doit $rmcmd -f "$rmtmp" 2>/dev/null; :; } + } || + { echo "$0: cannot unlink or rename $dst" >&2 + (exit 1); exit 1 + } + } && + + # Now rename the file to the real destination. + $doit $mvcmd "$dsttmp" "$dst" + } + fi || exit 1 + + trap '' 0 + fi +done + +# Local variables: +# eval: (add-hook 'write-file-hooks 'time-stamp) +# time-stamp-start: "scriptversion=" +# time-stamp-format: "%:y-%02m-%02d.%02H" +# time-stamp-time-zone: "UTC" +# time-stamp-end: "; # UTC" +# End: diff --git a/open_src/xio/src/kernel/transport/tcp/xio_tcp_datapath.c b/open_src/xio/src/kernel/transport/tcp/xio_tcp_datapath.c new file mode 100644 index 0000000..fa78f4c --- /dev/null +++ b/open_src/xio/src/kernel/transport/tcp/xio_tcp_datapath.c @@ -0,0 +1,3490 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#include + +#include "libxio.h" +#include +#include "xio_common.h" +#include "xio_observer.h" +#include "xio_log.h" +#include "xio_protocol.h" +#include "xio_mbuf.h" +#include "xio_task.h" +#include "xio_ktransport.h" +#include "xio_transport.h" +#include "xio_mempool.h" +#include "xio_ev_data.h" +#include "xio_objpool.h" +#include "xio_workqueue.h" +#include "xio_ev_loop.h" +#include "xio_context.h" +#include "xio_context_priv.h" +#include "xio_tcp_transport.h" +#include "xio_sg_table.h" + +extern struct xio_tcp_options tcp_options; + + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_send_work */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_send_work(struct socket *sock, void **buf, uint32_t *len, + int block) +{ + int retval; + struct msghdr msg; + struct kvec vec; + + memset(&msg, 0, sizeof(msg)); + msg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL; + + while (*len) { + vec.iov_base = *buf; + vec.iov_len = *len; + retval = kernel_sendmsg(sock, &msg, &vec, 1, *len); + if (retval < 0) { + if (retval != -EAGAIN) { + xio_set_error(-retval); + ERROR_LOG("sendmsg failed. (errno=%d)\n", + -retval); + return retval; + } else if (!block) { + xio_set_error(EAGAIN); + return retval; + } + } else { + *len -= retval; + *buf += retval; + } + } + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_sendmsg_work */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_sendmsg_work(struct socket *sock, + struct xio_tcp_work_req *xio_send, + int block) +{ + int retval = 0, tmp_bytes, sent_bytes = 0; + int eagain_count = TX_EAGAIN_RETRY; + unsigned int i; + + while (xio_send->tot_iov_byte_len) { + retval = kernel_sendmsg(sock, &xio_send->msg, + (struct kvec *)MSGHDR_IOV(&xio_send->msg), + MSGHDR_IOVLEN(&xio_send->msg), + xio_send->tot_iov_byte_len); + if (retval < 0) { + if (retval != -EAGAIN) { + xio_set_error(-retval); + DEBUG_LOG("sendmsg failed. 
(errno=%d)\n", + -retval); + return retval; + } else if (!block && (eagain_count-- == 0)) { + xio_set_error(EAGAIN); + return retval; + } + } else { + sent_bytes += retval; + xio_send->tot_iov_byte_len -= retval; + + if (xio_send->tot_iov_byte_len == 0) { + MSGHDR_IOVLEN(&xio_send->msg) = 0; + break; + } + + tmp_bytes = 0; + for (i = 0; i < MSGHDR_IOVLEN(&xio_send->msg); i++) { + if (MSGHDR_IOV(&xio_send->msg)[i].iov_len + + tmp_bytes < retval) { + tmp_bytes += + MSGHDR_IOV(&xio_send->msg)[i].iov_len; + } else { + ((struct iovec *)MSGHDR_IOV(&xio_send->msg))[i].iov_len -= + (retval - tmp_bytes); + ((struct iovec *)MSGHDR_IOV(&xio_send->msg))[i].iov_base += + (retval - tmp_bytes); + MSGHDR_IOV(&xio_send->msg) = + &MSGHDR_IOV(&xio_send->msg)[i]; + MSGHDR_IOVLEN(&xio_send->msg) -= i; + break; + } + } + + eagain_count = TX_EAGAIN_RETRY; + } + } + + return sent_bytes; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_write_setup_msg */ +/*---------------------------------------------------------------------------*/ +static void xio_tcp_write_setup_msg(struct xio_tcp_transport *tcp_hndl, + struct xio_task *task, + struct xio_tcp_setup_msg *msg) +{ + struct xio_tcp_setup_msg *tmp_msg; + + /* set the mbuf after tlv header */ + xio_mbuf_set_val_start(&task->mbuf); + + /* jump after connection setup header */ + if (tcp_hndl->base.is_client) + xio_mbuf_inc(&task->mbuf, + sizeof(struct xio_nexus_setup_req)); + else + xio_mbuf_inc(&task->mbuf, + sizeof(struct xio_nexus_setup_rsp)); + + tmp_msg = xio_mbuf_get_curr_ptr(&task->mbuf); + + /* pack relevant values */ + PACK_LLVAL(msg, tmp_msg, buffer_sz); + PACK_LVAL(msg, tmp_msg, max_in_iovsz); + PACK_LVAL(msg, tmp_msg, max_out_iovsz); + PACK_LVAL(msg, tmp_msg, max_header_len); + +#ifdef EYAL_TODO + print_hex_dump_bytes("post_send: ", DUMP_PREFIX_ADDRESS, + task->mbuf.tlv.head, + 64); +#endif + xio_mbuf_inc(&task->mbuf, sizeof(struct xio_tcp_setup_msg)); +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_read_setup_msg */ +/*---------------------------------------------------------------------------*/ +static void xio_tcp_read_setup_msg(struct xio_tcp_transport *tcp_hndl, + struct xio_task *task, + struct xio_tcp_setup_msg *msg) +{ + struct xio_tcp_setup_msg *tmp_msg; + + DEBUG_LOG("xio_tcp_read_setup_msg\n"); + /* set the mbuf after tlv header */ + xio_mbuf_set_val_start(&task->mbuf); + + /* jump after connection setup header */ + if (tcp_hndl->base.is_client) + xio_mbuf_inc(&task->mbuf, + sizeof(struct xio_nexus_setup_rsp)); + else + xio_mbuf_inc(&task->mbuf, + sizeof(struct xio_nexus_setup_req)); + + tmp_msg = xio_mbuf_get_curr_ptr(&task->mbuf); + + /* pack relevant values */ + UNPACK_LLVAL(tmp_msg, msg, buffer_sz); + UNPACK_LVAL(tmp_msg, msg, max_in_iovsz); + UNPACK_LVAL(tmp_msg, msg, max_out_iovsz); + UNPACK_LVAL(tmp_msg, msg, max_header_len); + +#ifdef EYAL_TODO + print_hex_dump_bytes("post_send: ", DUMP_PREFIX_ADDRESS, + task->mbuf.curr, + 64); +#endif + xio_mbuf_inc(&task->mbuf, sizeof(struct xio_tcp_setup_msg)); +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_send_setup_req */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_send_setup_req(struct xio_tcp_transport *tcp_hndl, + struct xio_task *task) +{ + XIO_TO_TCP_TASK(task, tcp_task); + uint16_t payload; + struct xio_tcp_setup_msg req; + + DEBUG_LOG("xio_tcp_send_setup_req\n"); + + req.buffer_sz = 
xio_tcp_get_inline_buffer_size(); + req.max_in_iovsz = tcp_options.max_in_iovsz; + req.max_out_iovsz = tcp_options.max_out_iovsz; + req.max_header_len = g_poptions->max_inline_xio_hdr; + + xio_tcp_write_setup_msg(tcp_hndl, task, &req); + + payload = xio_mbuf_tlv_payload_len(&task->mbuf); + + /* add tlv */ + if (xio_mbuf_write_tlv(&task->mbuf, task->tlv_type, payload) != 0) + return -1; + + TRACE_LOG("tcp send setup request\n"); + + /* set the length */ + tcp_task->txd.msg_iov[0].iov_len = xio_mbuf_data_length(&task->mbuf); + tcp_task->txd.msg_len = 1; + tcp_task->txd.tot_iov_byte_len = tcp_task->txd.msg_iov[0].iov_len; + MSGHDR_IOV(&tcp_task->txd.msg) = tcp_task->txd.msg_iov; + MSGHDR_IOVLEN(&tcp_task->txd.msg) = tcp_task->txd.msg_len; + + tcp_task->out_tcp_op = XIO_TCP_SEND; + + xio_task_addref(task); + + xio_tcp_sendmsg_work(tcp_hndl->socket.ctl.ksock, &tcp_task->txd, 1); + + list_move_tail(&task->tasks_list_entry, &tcp_hndl->in_flight_list); + + TRACE_LOG("done tcp send setup request\n"); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_send_setup_rsp */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_send_setup_rsp(struct xio_tcp_transport *tcp_hndl, + struct xio_task *task) +{ + XIO_TO_TCP_TASK(task, tcp_task); + uint16_t payload; + struct xio_tcp_setup_msg *rsp = &tcp_hndl->setup_rsp; + + DEBUG_LOG("xio_tcp_send_setup_rsp\n"); + + rsp->max_in_iovsz = tcp_options.max_in_iovsz; + rsp->max_out_iovsz = tcp_options.max_out_iovsz; + rsp->buffer_sz = g_poptions->max_inline_xio_hdr + + g_poptions->max_inline_xio_data + + xio_mbuf_get_curr_offset(&task->mbuf); + rsp->max_header_len = g_poptions->max_inline_xio_hdr; + + xio_tcp_write_setup_msg(tcp_hndl, task, rsp); + + payload = xio_mbuf_tlv_payload_len(&task->mbuf); + + /* add tlv */ + if (xio_mbuf_write_tlv(&task->mbuf, task->tlv_type, payload) != 0) + return -1; + + TRACE_LOG("tcp send setup response\n"); + + /* set the length */ + tcp_task->txd.msg_iov[0].iov_len = xio_mbuf_data_length(&task->mbuf); + tcp_task->txd.msg_len = 1; + tcp_task->txd.tot_iov_byte_len = tcp_task->txd.msg_iov[0].iov_len; + MSGHDR_IOV(&tcp_task->txd.msg) = tcp_task->txd.msg_iov; + MSGHDR_IOVLEN(&tcp_task->txd.msg) = tcp_task->txd.msg_len; + + tcp_task->out_tcp_op = XIO_TCP_SEND; + + xio_tcp_sendmsg_work(tcp_hndl->socket.ctl.ksock, &tcp_task->txd, 1); + + list_move(&task->tasks_list_entry, &tcp_hndl->in_flight_list); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_on_setup_msg */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_on_setup_msg(struct xio_tcp_transport *tcp_hndl, + struct xio_task *task) +{ + union xio_transport_event_data event_data; + struct xio_tcp_setup_msg *rsp = &tcp_hndl->setup_rsp; + u64 local_buf_size; + + DEBUG_LOG("xio_tcp_on_setup_msg\n"); + + if (tcp_hndl->base.is_client) { + struct xio_task *sender_task = NULL; + + if (!list_empty(&tcp_hndl->in_flight_list)) + sender_task = list_first_entry( + &tcp_hndl->in_flight_list, + struct xio_task, tasks_list_entry); + else if (!list_empty(&tcp_hndl->tx_comp_list)) + sender_task = list_first_entry( + &tcp_hndl->tx_comp_list, + struct xio_task, tasks_list_entry); + else + ERROR_LOG("could not find sender task\n"); + + task->sender_task = sender_task; + xio_tcp_read_setup_msg(tcp_hndl, task, rsp); + } else { + struct xio_tcp_setup_msg req; + + 
xio_tcp_read_setup_msg(tcp_hndl, task, &req); + + /* current implementation is symmetric */ + local_buf_size = xio_tcp_get_inline_buffer_size(); + rsp->buffer_sz = min(req.buffer_sz, local_buf_size); + rsp->max_in_iovsz = req.max_in_iovsz; + rsp->max_out_iovsz = req.max_out_iovsz; + rsp->max_header_len = req.max_header_len; + } + + tcp_hndl->max_inline_buf_sz = rsp->buffer_sz; + tcp_hndl->membuf_sz = rsp->buffer_sz; + tcp_hndl->peer_max_in_iovsz = rsp->max_in_iovsz; + tcp_hndl->peer_max_out_iovsz = rsp->max_out_iovsz; + tcp_hndl->peer_max_header = rsp->max_header_len; + + tcp_hndl->sn = 0; + + tcp_hndl->alloc_sz = NUM_TASKS * tcp_hndl->membuf_sz; + + tcp_hndl->state = XIO_TRANSPORT_STATE_CONNECTED; + + /* fill notification event */ + event_data.msg.op = XIO_WC_OP_RECV; + event_data.msg.task = task; + + list_move_tail(&task->tasks_list_entry, &tcp_hndl->io_list); + + xio_transport_notify_observer(&tcp_hndl->base, + XIO_TRANSPORT_EVENT_NEW_MESSAGE, + &event_data); + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_send_connect_msg */ +/*---------------------------------------------------------------------------*/ +int xio_tcp_send_connect_msg(struct socket *sock, + struct xio_tcp_connect_msg *msg) +{ + int retval; + struct xio_tcp_connect_msg smsg; + uint32_t size = sizeof(struct xio_tcp_connect_msg); + void *buf = &smsg; + + PACK_LVAL(msg, &smsg, sock_type); + PACK_SVAL(msg, &smsg, second_port); + PACK_SVAL(msg, &smsg, pad); + + retval = xio_tcp_send_work(sock, &buf, &size, 1); + if (retval < 0) { + if (xio_errno() == EAGAIN) { + /* ORK todo set event */ + } else { + ERROR_LOG("send return with %d. (errno=%d %m)\n", + retval, xio_errno()); + return retval; + } + } + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_write_req_header */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_write_req_header(struct xio_tcp_transport *tcp_hndl, + struct xio_task *task, + struct xio_tcp_req_hdr *req_hdr) +{ + XIO_TO_TCP_TASK(task, tcp_task); + struct xio_tcp_req_hdr *tmp_req_hdr; + struct xio_sge *tmp_sge; + struct xio_sge sge; + size_t hdr_len; + uint32_t i; + struct xio_sg_table_ops *sgtbl_ops; + void *sgtbl; + void *sg; + + sgtbl = xio_sg_table_get(&task->omsg->in); + sgtbl_ops = xio_sg_table_ops_get(task->omsg->in.sgl_type); + + /* point to transport header */ + xio_mbuf_set_trans_hdr(&task->mbuf); + tmp_req_hdr = xio_mbuf_get_curr_ptr(&task->mbuf); + + /* pack relevant values */ + tmp_req_hdr->version = req_hdr->version; + tmp_req_hdr->flags = req_hdr->flags; + PACK_SVAL(req_hdr, tmp_req_hdr, req_hdr_len); + PACK_LVAL(req_hdr, tmp_req_hdr, ltid); + tmp_req_hdr->in_tcp_op = req_hdr->in_tcp_op; + tmp_req_hdr->out_tcp_op = req_hdr->out_tcp_op; + + PACK_SVAL(req_hdr, tmp_req_hdr, in_num_sge); + PACK_SVAL(req_hdr, tmp_req_hdr, out_num_sge); + PACK_SVAL(req_hdr, tmp_req_hdr, ulp_hdr_len); + PACK_SVAL(req_hdr, tmp_req_hdr, ulp_pad_len); + /*remain_data_len is not used */ + PACK_LLVAL(req_hdr, tmp_req_hdr, ulp_imm_len); + + tmp_sge = (void *)((uint8_t *)tmp_req_hdr + + sizeof(struct xio_tcp_req_hdr)); + + /* IN: requester expect small input written via send */ + sg = sge_first(sgtbl_ops, sgtbl); + if (req_hdr->in_tcp_op == XIO_TCP_SEND) { + for (i = 0; i < req_hdr->in_num_sge; i++) { + sge.addr = 0; + sge.length = sge_length(sgtbl_ops, sg); + sge.stag = 0; + PACK_LLVAL(&sge, tmp_sge, addr); + PACK_LVAL(&sge, tmp_sge, length); + 
PACK_LVAL(&sge, tmp_sge, stag); + tmp_sge++; + sg = sge_next(sgtbl_ops, sgtbl, sg); + } + } + /* IN: requester expect big input written rdma write */ + if (req_hdr->in_tcp_op == XIO_TCP_WRITE) { + for (i = 0; i < req_hdr->in_num_sge; i++) { + sge.addr = uint64_from_ptr(tcp_task->read_mp_mem[i].addr); + sge.length = tcp_task->read_mp_mem[i].length; + sge.stag = 0; + PACK_LLVAL(&sge, tmp_sge, addr); + PACK_LVAL(&sge, tmp_sge, length); + PACK_LVAL(&sge, tmp_sge, stag); + tmp_sge++; + } + } + /* OUT: requester want to write data via rdma read */ + if (req_hdr->out_tcp_op == XIO_TCP_READ) { + for (i = 0; i < req_hdr->out_num_sge; i++) { + sge.addr = uint64_from_ptr(tcp_task->write_mp_mem[i].addr); + sge.length = tcp_task->write_mp_mem[i].length; + sge.stag = 0; + PACK_LLVAL(&sge, tmp_sge, addr); + PACK_LVAL(&sge, tmp_sge, length); + PACK_LVAL(&sge, tmp_sge, stag); + tmp_sge++; + } + } + if (req_hdr->out_tcp_op == XIO_TCP_SEND) { + for (i = 0; i < req_hdr->out_num_sge; i++) { + sge.addr = 0; + sge.length = sge_length(sgtbl_ops, sg); + sge.stag = 0; + PACK_LLVAL(&sge, tmp_sge, addr); + PACK_LVAL(&sge, tmp_sge, length); + PACK_LVAL(&sge, tmp_sge, stag); + tmp_sge++; + sg = sge_next(sgtbl_ops, sgtbl, sg); + } + } + + hdr_len = sizeof(struct xio_tcp_req_hdr); + hdr_len += sizeof(struct xio_sge) * (req_hdr->in_num_sge + + req_hdr->out_num_sge); +#ifdef EYAL_TODO + print_hex_dump_bytes("post_send: ", DUMP_PREFIX_ADDRESS, + task->mbuf.curr, + hdr_len + 16); +#endif + xio_mbuf_inc(&task->mbuf, hdr_len); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_prep_req_header */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_prep_req_header(struct xio_tcp_transport *tcp_hndl, + struct xio_task *task, + uint16_t ulp_hdr_len, + uint16_t ulp_pad_len, + uint64_t ulp_imm_len, + uint32_t status) +{ + XIO_TO_TCP_TASK(task, tcp_task); + struct xio_tcp_req_hdr req_hdr; + + if (!IS_REQUEST(task->tlv_type)) { + ERROR_LOG("unknown message type\n"); + return -1; + } + + /* write the headers */ + + /* fill request header */ + req_hdr.version = XIO_TCP_REQ_HEADER_VERSION; + req_hdr.req_hdr_len = sizeof(req_hdr); + req_hdr.ltid = task->ltid; + req_hdr.in_tcp_op = tcp_task->in_tcp_op; + req_hdr.out_tcp_op = tcp_task->out_tcp_op; + req_hdr.flags = 0; + + if (test_bits(XIO_MSG_FLAG_PEER_WRITE_RSP, &task->omsg_flags)) + set_bits(XIO_MSG_FLAG_PEER_WRITE_RSP, &req_hdr.flags); + else if (test_bits(XIO_MSG_FLAG_LAST_IN_BATCH, &task->omsg_flags)) + set_bits(XIO_MSG_FLAG_LAST_IN_BATCH, &req_hdr.flags); + + req_hdr.ulp_hdr_len = ulp_hdr_len; + req_hdr.ulp_pad_len = ulp_pad_len; + req_hdr.ulp_imm_len = ulp_imm_len; + req_hdr.in_num_sge = tcp_task->read_num_mp_mem; + req_hdr.out_num_sge = tcp_task->write_num_mp_mem; + + if (xio_tcp_write_req_header(tcp_hndl, task, &req_hdr) != 0) + goto cleanup; + + tcp_task->txd.ctl_msg_len = xio_mbuf_tlv_len(&task->mbuf); + + /* write the payload header */ + if (ulp_hdr_len) { + if (xio_mbuf_write_array( + &task->mbuf, + task->omsg->out.header.iov_base, + task->omsg->out.header.iov_len) != 0) + goto cleanup; + } + + /* write the pad between header and data */ + if (ulp_pad_len) + xio_mbuf_inc(&task->mbuf, ulp_pad_len); + + return 0; + +cleanup: + xio_set_error(XIO_E_MSG_SIZE); + ERROR_LOG("xio_tcp_write_req_header failed\n"); + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_write_send_data */ 
+/*---------------------------------------------------------------------------*/ +static int xio_tcp_write_send_data( + struct xio_tcp_transport *tcp_hndl, + struct xio_task *task) +{ + XIO_TO_TCP_TASK(task, tcp_task); + size_t i; + size_t byte_len = 0; + struct xio_sg_table_ops *sgtbl_ops; + void *sgtbl; + void *sg; + + sgtbl = xio_sg_table_get(&task->omsg->out); + sgtbl_ops = xio_sg_table_ops_get(task->omsg->out.sgl_type); + + for_each_sge(sgtbl, sgtbl_ops, sg, i) { + tcp_task->txd.msg_iov[i + 1].iov_base = + sge_addr(sgtbl_ops, sg); + tcp_task->txd.msg_iov[i + 1].iov_len = + sge_length(sgtbl_ops, sg); + byte_len += sge_length(sgtbl_ops, sg); + } + tcp_task->txd.msg_len = tbl_nents(sgtbl_ops, sgtbl) + 1; + tcp_task->txd.tot_iov_byte_len = byte_len; + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_prep_req_out_data */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_prep_req_out_data( + struct xio_tcp_transport *tcp_hndl, + struct xio_task *task) +{ + XIO_TO_TCP_TASK(task, tcp_task); + struct xio_vmsg *vmsg = &task->omsg->out; + uint64_t xio_hdr_len; + uint64_t xio_max_hdr_len; + uint64_t ulp_hdr_len; + uint64_t ulp_pad_len = 0; + uint64_t ulp_imm_len; + size_t retval; + unsigned int i; + struct xio_sg_table_ops *sgtbl_ops; + void *sgtbl; + void *sg; + int tx_by_sr; + int nents; + + sgtbl = xio_sg_table_get(&task->omsg->out); + sgtbl_ops = xio_sg_table_ops_get(task->omsg->out.sgl_type); + nents = tbl_nents(sgtbl_ops, sgtbl); + + /* calculate headers */ + ulp_hdr_len = vmsg->header.iov_len; + ulp_imm_len = tbl_length(sgtbl_ops, sgtbl); + + xio_hdr_len = xio_mbuf_get_curr_offset(&task->mbuf); + xio_hdr_len += sizeof(struct xio_tcp_req_hdr); + xio_hdr_len += sizeof(struct xio_sge) * tcp_task->read_num_mp_mem; + xio_max_hdr_len = xio_hdr_len + sizeof(struct xio_sge) * nents; + + if (test_bits(XIO_MSG_FLAG_PEER_READ_REQ, &task->omsg_flags) && nents) + tx_by_sr = 0; + else + /* test for using send/receive or rdma_read */ + tx_by_sr = (((ulp_hdr_len + ulp_imm_len + + ulp_pad_len + xio_max_hdr_len) <= + tcp_hndl->max_inline_buf_sz) && + (((int)(ulp_imm_len) <= + xio_get_options()->max_inline_xio_data) || + ulp_imm_len == 0)); + + /* the data is outgoing via SEND */ + if (tx_by_sr) { + tcp_task->out_tcp_op = XIO_TCP_SEND; + /* user has small request - no rdma operation expected */ + tcp_task->write_num_mp_mem = 0; + + /* write xio header to the buffer */ + retval = xio_tcp_prep_req_header( + tcp_hndl, task, + ulp_hdr_len, ulp_pad_len, ulp_imm_len, + XIO_E_SUCCESS); + if (retval) + return -1; + + /* if there is data, set it to buffer or directly to the sge */ + if (ulp_imm_len) { + retval = xio_tcp_write_send_data(tcp_hndl, task); + if (retval) + return -1; + } else { + tcp_task->txd.tot_iov_byte_len = 0; + tcp_task->txd.msg_len = 1; + } + } else { + tcp_task->out_tcp_op = XIO_TCP_READ; + + for_each_sge(sgtbl, sgtbl_ops, sg, i) { + tcp_task->write_mp_mem[i].addr = sge_addr(sgtbl_ops, sg); + tcp_task->write_mp_mem[i].cache = NULL; + tcp_task->write_mp_mem[i].length = + sge_length(sgtbl_ops, sg); + } + + tcp_task->write_num_mp_mem = tbl_nents(sgtbl_ops, sgtbl); + + if (ulp_imm_len) { + tcp_task->txd.tot_iov_byte_len = 0; + for (i = 0; i < tcp_task->write_num_mp_mem; i++) { + tcp_task->txd.msg_iov[i + 1].iov_base = + tcp_task->write_mp_mem[i].addr; + tcp_task->txd.msg_iov[i + 1].iov_len = + tcp_task->write_mp_mem[i].length; + tcp_task->txd.tot_iov_byte_len += + 
tcp_task->write_mp_mem[i].length; + } + tcp_task->txd.msg_len = tcp_task->write_num_mp_mem + 1; + } else { + tcp_task->txd.tot_iov_byte_len = 0; + tcp_task->txd.msg_len = 1; + } + + /* write xio header to the buffer */ + retval = xio_tcp_prep_req_header( + tcp_hndl, task, + ulp_hdr_len, 0, 0, XIO_E_SUCCESS); + + if (retval) { + ERROR_LOG("Failed to write header\n"); + goto cleanup; + } + } + + return 0; + +cleanup: + tcp_task->write_num_mp_mem = 0; + + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_on_rsp_send_comp */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_on_rsp_send_comp(struct xio_tcp_transport *tcp_hndl, + struct xio_task *task) +{ + union xio_transport_event_data event_data; + + if (IS_CANCEL(task->tlv_type)) { + xio_tasks_pool_put(task); + return 0; + } + + event_data.msg.op = XIO_WC_OP_SEND; + event_data.msg.task = task; + + xio_transport_notify_observer(&tcp_hndl->base, + XIO_TRANSPORT_EVENT_SEND_COMPLETION, + &event_data); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_on_req_send_comp */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_on_req_send_comp(struct xio_tcp_transport *tcp_hndl, + struct xio_task *task) +{ + union xio_transport_event_data event_data; + + if (IS_CANCEL(task->tlv_type)) + return 0; + + event_data.msg.op = XIO_WC_OP_SEND; + event_data.msg.task = task; + + xio_transport_notify_observer(&tcp_hndl->base, + XIO_TRANSPORT_EVENT_SEND_COMPLETION, + &event_data); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_tx_comp_handler */ +/*---------------------------------------------------------------------------*/ +void xio_tcp_tx_completion_handler(void *xio_task) +{ + struct xio_task *ptask, *next_ptask; + int found = 0; + int removed = 0; + struct xio_task *task = (struct xio_task *)xio_task; + + XIO_TO_TCP_TASK(task, tcp_task); + XIO_TO_TCP_HNDL(task, tcp_hndl); + + list_for_each_entry_safe(ptask, next_ptask, &tcp_hndl->in_flight_list, + tasks_list_entry) { + list_move_tail(&ptask->tasks_list_entry, + &tcp_hndl->tx_comp_list); + removed++; + tcp_task = ptask->dd_data; + + if (IS_REQUEST(ptask->tlv_type)) { + xio_tcp_on_req_send_comp(tcp_hndl, ptask); + xio_tasks_pool_put(ptask); + } else if (IS_RESPONSE(ptask->tlv_type)) { + xio_tcp_on_rsp_send_comp(tcp_hndl, ptask); + } else { + ERROR_LOG("unexpected task %p id:%d magic:0x%x\n", + ptask, + ptask->ltid, ptask->magic); + continue; + } + if (ptask == task) { + found = 1; + break; + } + } + if (!found && removed) + ERROR_LOG("not found but removed %d type:0x%x\n", + removed, task->tlv_type); + + tcp_hndl->tx_comp_cnt = 0; + + if (tcp_hndl->tx_ready_tasks_num) + xio_tcp_xmit(tcp_hndl); +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_write_sn */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_write_sn(struct xio_task *task, uint16_t sn) +{ + uint16_t *psn; + + /* save the current place */ + xio_mbuf_push(&task->mbuf); + /* goto to the first tlv */ + xio_mbuf_reset(&task->mbuf); + /* goto the first transport header*/ + xio_mbuf_set_trans_hdr(&task->mbuf); + + /* jump over the first uint32_t */ + xio_mbuf_inc(&task->mbuf, sizeof(uint32_t)); + + /* and set serial number */ + psn = xio_mbuf_get_curr_ptr(&task->mbuf); + *psn = 
htons(sn); + + /* pop to the original place */ + xio_mbuf_pop(&task->mbuf); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_xmit */ +/*---------------------------------------------------------------------------*/ +int xio_tcp_xmit(struct xio_tcp_transport *tcp_hndl) +{ + struct xio_task *task = NULL, *task_success = NULL, + *next_task = NULL; + struct xio_tcp_task *tcp_task = NULL, *next_tcp_task = NULL; + int retval = 0; + int imm_comp = 0; + int batch_nr = TX_BATCH, batch_count = 0, tmp_count; + unsigned int i; + unsigned int iov_len; + uint64_t bytes_sent; + + if (tcp_hndl->tx_ready_tasks_num == 0 || + tcp_hndl->tx_comp_cnt > COMPLETION_BATCH_MAX || + tcp_hndl->state != XIO_TRANSPORT_STATE_CONNECTED) { + xio_set_error(EAGAIN); + return -1; + } + + task = list_first_entry(&tcp_hndl->tx_ready_list, struct xio_task, + tasks_list_entry); + + /* if "ready to send queue" is not empty */ + while (likely(tcp_hndl->tx_ready_tasks_num && + tcp_hndl->tx_comp_cnt < COMPLETION_BATCH_MAX)) { + next_task = list_first_entry_or_null(&task->tasks_list_entry, + struct xio_task, + tasks_list_entry); + next_tcp_task = next_task ? next_task->dd_data : NULL; + + tcp_task = task->dd_data; + + switch (tcp_task->txd.stage) { + case XIO_TCP_TX_BEFORE: + xio_tcp_write_sn(task, tcp_hndl->sn); + tcp_task->sn = tcp_hndl->sn; + tcp_hndl->sn++; + tcp_task->txd.stage = XIO_TCP_TX_IN_SEND_CTL; + /*fallthrough*/ + case XIO_TCP_TX_IN_SEND_CTL: + /* for single socket, ctl_msg_len is zero */ + if (tcp_task->txd.ctl_msg_len == 0) { + tcp_task->txd.stage = XIO_TCP_TX_IN_SEND_DATA; + break; + } + + tcp_hndl->tmp_work.msg_iov[batch_count].iov_base = + tcp_task->txd.ctl_msg; + tcp_hndl->tmp_work.msg_iov[batch_count].iov_len = + tcp_task->txd.ctl_msg_len; + ++tcp_hndl->tmp_work.msg_len; + tcp_hndl->tmp_work.tot_iov_byte_len += + tcp_task->txd.ctl_msg_len; + + ++batch_count; + if (batch_count != batch_nr && + batch_count != tcp_hndl->tx_ready_tasks_num && + next_task && + next_tcp_task->txd.stage + <= XIO_TCP_TX_IN_SEND_CTL) { + task = next_task; + break; + } + + MSGHDR_IOV(&tcp_hndl->tmp_work.msg) = + tcp_hndl->tmp_work.msg_iov; + MSGHDR_IOVLEN(&tcp_hndl->tmp_work.msg) = + tcp_hndl->tmp_work.msg_len; + + retval = xio_tcp_sendmsg_work( + tcp_hndl->socket.ctl.ksock, + &tcp_hndl->tmp_work, 0); + + task = list_first_entry(&tcp_hndl->tx_ready_list, + struct xio_task, + tasks_list_entry); + iov_len = tcp_hndl->tmp_work.msg_len - + MSGHDR_IOVLEN(&tcp_hndl->tmp_work.msg); + for (i = 0; i < iov_len; i++) { + tcp_task = task->dd_data; + tcp_task->txd.stage = XIO_TCP_TX_IN_SEND_DATA; + tcp_task->txd.ctl_msg_len = 0; + task = list_first_entry_or_null( + &task->tasks_list_entry, + struct xio_task, + tasks_list_entry); + } + if (MSGHDR_IOVLEN(&tcp_hndl->tmp_work.msg)) { + tcp_task = task->dd_data; + tcp_task->txd.ctl_msg = + MSGHDR_IOV(&tcp_hndl->tmp_work.msg)[0].iov_base; + tcp_task->txd.ctl_msg_len = + MSGHDR_IOV(&tcp_hndl->tmp_work.msg)[0].iov_len; + } + tcp_hndl->tmp_work.msg_len = 0; + tcp_hndl->tmp_work.tot_iov_byte_len = 0; + batch_count = 0; + + if (retval < 0) { + xio_set_error(-retval); + if (retval == -ECONNRESET || retval == -EPIPE) { + DEBUG_LOG("tcp trans got reset "); + DEBUG_LOG("tcp_hndl=%p\n", tcp_hndl); + xio_tcp_disconnect_helper(tcp_hndl); + return 0; + } + + if (retval != -EAGAIN) + return -1; + + retval = -1; + goto handle_completions; + } + + task = list_first_entry( + &tcp_hndl->tx_ready_list, + struct xio_task, tasks_list_entry); + + break; + case 
XIO_TCP_TX_IN_SEND_DATA: + + for (i = 0; i < MSGHDR_IOVLEN(&tcp_task->txd.msg); i++) { + tcp_hndl->tmp_work.msg_iov + [tcp_hndl->tmp_work.msg_len].iov_base = + MSGHDR_IOV(&tcp_task->txd.msg)[i].iov_base; + tcp_hndl->tmp_work.msg_iov + [tcp_hndl->tmp_work.msg_len].iov_len = + MSGHDR_IOV(&tcp_task->txd.msg)[i].iov_len; + ++tcp_hndl->tmp_work.msg_len; + } + tcp_hndl->tmp_work.tot_iov_byte_len += + tcp_task->txd.tot_iov_byte_len; + + ++batch_count; + if (batch_count != batch_nr && + batch_count != tcp_hndl->tx_ready_tasks_num && + next_task && + (next_tcp_task->txd.stage == + XIO_TCP_TX_IN_SEND_DATA) && + (MSGHDR_IOVLEN(&next_tcp_task->txd.msg) + + tcp_hndl->tmp_work.msg_len) < UIO_MAXIOV) { + task = next_task; + break; + } + + MSGHDR_IOV(&tcp_hndl->tmp_work.msg) = + tcp_hndl->tmp_work.msg_iov; + MSGHDR_IOVLEN(&tcp_hndl->tmp_work.msg) = + tcp_hndl->tmp_work.msg_len; + + bytes_sent = tcp_hndl->tmp_work.tot_iov_byte_len; + retval = xio_tcp_sendmsg_work( + tcp_hndl->socket.data.ksock, + &tcp_hndl->tmp_work, 0); + bytes_sent -= tcp_hndl->tmp_work.tot_iov_byte_len; + + task = list_first_entry(&tcp_hndl->tx_ready_list, + struct xio_task, + tasks_list_entry); + iov_len = tcp_hndl->tmp_work.msg_len - + MSGHDR_IOVLEN(&tcp_hndl->tmp_work.msg); + tmp_count = batch_count; + while (tmp_count) { + tcp_task = task->dd_data; + + if (MSGHDR_IOVLEN(&tcp_task->txd.msg) > iov_len) + break; + + iov_len -= MSGHDR_IOVLEN(&tcp_task->txd.msg); + bytes_sent -= tcp_task->txd.tot_iov_byte_len; + + tcp_hndl->tx_ready_tasks_num--; + + list_move_tail(&task->tasks_list_entry, + &tcp_hndl->in_flight_list); + + task_success = task; + + ++tcp_hndl->tx_comp_cnt; + + imm_comp = imm_comp || task->is_control || + (task->omsg && + (task->omsg->flags & + XIO_MSG_FLAG_IMM_SEND_COMP)); + + --tmp_count; + + task = list_first_entry( + &tcp_hndl->tx_ready_list, + struct xio_task, tasks_list_entry); + } + if (MSGHDR_IOVLEN(&tcp_hndl->tmp_work.msg)) { + tcp_task = task->dd_data; + MSGHDR_IOV(&tcp_task->txd.msg) = + &MSGHDR_IOV(&tcp_task->txd.msg)[iov_len]; + ((struct iovec *)MSGHDR_IOV(&tcp_task->txd.msg))[0].iov_base = + MSGHDR_IOV(&tcp_hndl->tmp_work.msg)[0].iov_base; + ((struct iovec *)MSGHDR_IOV(&tcp_task->txd.msg))[0].iov_len = + MSGHDR_IOV(&tcp_hndl->tmp_work.msg)[0].iov_len; + MSGHDR_IOVLEN(&tcp_task->txd.msg) -= iov_len; + tcp_task->txd.tot_iov_byte_len -= bytes_sent; + } + + tcp_hndl->tmp_work.msg_len = 0; + tcp_hndl->tmp_work.tot_iov_byte_len = 0; + batch_count = 0; + + if (retval < 0) { + xio_set_error(-retval); + if (retval == -ECONNRESET || retval == -EPIPE) { + DEBUG_LOG("tcp trans got reset "); + DEBUG_LOG("tcp_hndl=%p\n", tcp_hndl); + xio_tcp_disconnect_helper(tcp_hndl); + return 0; + } + + if (retval != -EAGAIN) + return -1; + + retval = -1; + goto handle_completions; + } + + task = list_first_entry(&tcp_hndl->tx_ready_list, + struct xio_task, + tasks_list_entry); + + break; + default: + ERROR_LOG("unknown TX stage %d\n", tcp_task->txd.stage); + break; + } + } + + if (tcp_hndl->tx_ready_tasks_num == 0) + xio_context_disable_event(&tcp_hndl->flush_tx_event); + else + xio_context_add_event(tcp_hndl->base.ctx, + &tcp_hndl->flush_tx_event); + +handle_completions: + + if (task_success && + (tcp_hndl->tx_comp_cnt >= COMPLETION_BATCH_MAX || imm_comp)) { + tcp_task = task_success->dd_data; + xio_context_add_event(tcp_hndl->base.ctx, + &tcp_task->comp_event); + } + + return retval < 0 ? 
retval : 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_prep_req_in_data */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_prep_req_in_data(struct xio_tcp_transport *tcp_hndl, + struct xio_task *task) +{ + XIO_TO_TCP_TASK(task, tcp_task); + size_t hdr_len; + size_t data_len; + size_t xio_hdr_len; + struct xio_vmsg *vmsg = &task->omsg->in; + unsigned int i; + int retval; + struct xio_sg_table_ops *sgtbl_ops; + void *sgtbl; + void *sg; + int nents; + + sgtbl = xio_sg_table_get(&task->omsg->in); + sgtbl_ops = xio_sg_table_ops_get(task->omsg->in.sgl_type); + nents = tbl_nents(sgtbl_ops, sgtbl); + + if (nents == 0) { + tcp_task->in_tcp_op = XIO_TCP_SEND; + tcp_task->read_num_mp_mem = 0; + return 0; + } + + data_len = tbl_length(sgtbl_ops, sgtbl); + hdr_len = vmsg->header.iov_len; + if (hdr_len && hdr_len >= tcp_hndl->peer_max_header) { + ERROR_LOG("hdr_len=%zd is bigger than peer_max_reader=%d\n", + hdr_len, tcp_hndl->peer_max_header); + return -1; + } + + /* before working on the out - current place after the session header */ + xio_hdr_len = xio_mbuf_get_curr_offset(&task->mbuf); + xio_hdr_len += sizeof(struct xio_tcp_rsp_hdr); + xio_hdr_len += sizeof(struct xio_sge) * nents; + + /* requester may insist on RDMA for small buffers to eliminate copy + * from receive buffers to user buffers + */ + if (!(task->omsg_flags & XIO_MSG_FLAG_PEER_WRITE_RSP) && + data_len + hdr_len + xio_hdr_len < tcp_hndl->max_inline_buf_sz) { + /* user has small response - no rdma operation expected */ + tcp_task->in_tcp_op = XIO_TCP_SEND; + tcp_task->read_num_mp_mem = (data_len) ? + tbl_nents(sgtbl_ops, sgtbl) : 0; + } else { + /* user provided buffers with length for RDMA WRITE */ + tcp_task->in_tcp_op = XIO_TCP_WRITE; + sg = sge_first(sgtbl_ops, sgtbl); + if (sge_addr(sgtbl_ops, sg)) { + for_each_sge(sgtbl, sgtbl_ops, sg, i) { + tcp_task->read_mp_mem[i].addr = + sge_addr(sgtbl_ops, sg); + tcp_task->read_mp_mem[i].cache = NULL; + tcp_task->read_mp_mem[i].length = + sge_length(sgtbl_ops, sg); + } + } else { + if (!tcp_hndl->tcp_mempool) { + xio_set_error(XIO_E_NO_BUFS); + ERROR_LOG("message /read/write failed - " \ + "library's memory pool disabled\n"); + goto cleanup; + } + + /* user did not provide buffer */ + for_each_sge(sgtbl, sgtbl_ops, sg, i) { + retval = xio_mempool_alloc( + tcp_hndl->tcp_mempool, + sge_length(sgtbl_ops, sg), + &tcp_task->read_mp_mem[i]); + if (retval) { + tcp_task->read_num_mp_mem = i; + xio_set_error(ENOMEM); + ERROR_LOG( + "mempool is empty for %zd bytes\n", + sge_length(sgtbl_ops, sg)); + goto cleanup; + } + tcp_task->read_mp_mem[i].length = + sge_length(sgtbl_ops, sg); + } + } + tcp_task->read_num_mp_mem = nents; + } + if (tcp_task->read_num_mp_mem > tcp_hndl->peer_max_out_iovsz) { + ERROR_LOG("request in iovlen %d is bigger " \ + "than peer max out iovlen %d\n", + tcp_task->read_num_mp_mem, + tcp_hndl->peer_max_out_iovsz); + goto cleanup; + } + + return 0; + +cleanup: + for (i = 0; i < tcp_task->read_num_mp_mem; i++) + xio_mempool_free_mp(&tcp_task->read_mp_mem[i]); + + tcp_task->read_num_mp_mem = 0; + xio_set_error(EMSGSIZE); + + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_single_sock_set_txd */ +/*---------------------------------------------------------------------------*/ +size_t xio_tcp_single_sock_set_txd(struct xio_task *task) +{ + XIO_TO_TCP_TASK(task, tcp_task); + size_t iov_len; + size_t tlv_len; + + 
tcp_task->txd.ctl_msg_len = 0; + + iov_len = xio_mbuf_get_curr_offset(&task->mbuf); + tcp_task->txd.msg_iov[0].iov_len = iov_len; + + tlv_len = iov_len - XIO_TLV_LEN; + if (tcp_task->out_tcp_op == XIO_TCP_SEND) + tlv_len += tcp_task->txd.tot_iov_byte_len; + + tcp_task->txd.tot_iov_byte_len += iov_len; + + MSGHDR_IOV(&tcp_task->txd.msg) = tcp_task->txd.msg_iov; + MSGHDR_IOVLEN(&tcp_task->txd.msg) = tcp_task->txd.msg_len; + + return tlv_len; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_dual_sock_set_txd */ +/*---------------------------------------------------------------------------*/ +size_t xio_tcp_dual_sock_set_txd(struct xio_task *task) +{ + XIO_TO_TCP_TASK(task, tcp_task); + size_t iov_len; + if (IS_APPLICATION_MSG(task->tlv_type)) { + iov_len = xio_mbuf_get_curr_offset(&task->mbuf); + tcp_task->txd.ctl_msg_len = iov_len; + tcp_task->txd.msg_iov[0].iov_len = iov_len; + --tcp_task->txd.msg_len; + if (iov_len == 0) + MSGHDR_IOV(&tcp_task->txd.msg) = tcp_task->txd.msg_iov; + else + MSGHDR_IOV(&tcp_task->txd.msg) = &tcp_task->txd.msg_iov[1]; + } else { + iov_len = xio_mbuf_get_curr_offset(&task->mbuf) + - tcp_task->txd.ctl_msg_len; + tcp_task->txd.msg_iov[0].iov_len = iov_len; + tcp_task->txd.msg_iov[0].iov_base += tcp_task->txd.ctl_msg_len; + + tcp_task->txd.tot_iov_byte_len += iov_len; + + if (tcp_task->txd.msg_iov[0].iov_len == 0) { + MSGHDR_IOV(&tcp_task->txd.msg) = &tcp_task->txd.msg_iov[1]; + --tcp_task->txd.msg_len; + } else { + MSGHDR_IOV(&tcp_task->txd.msg) = tcp_task->txd.msg_iov; + } + } + MSGHDR_IOVLEN(&tcp_task->txd.msg) = tcp_task->txd.msg_len; + return tcp_task->txd.ctl_msg_len - XIO_TLV_LEN; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_send_req */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_send_req(struct xio_tcp_transport *tcp_hndl, + struct xio_task *task) +{ + XIO_TO_TCP_TASK(task, tcp_task); + size_t retval; + size_t tlv_len; + + /* prepare buffer for response */ + retval = xio_tcp_prep_req_in_data(tcp_hndl, task); + if (retval != 0) { + ERROR_LOG("tcp_prep_req_in_data failed\n"); + return -1; + } + + /* prepare the out message */ + retval = xio_tcp_prep_req_out_data(tcp_hndl, task); + if (retval != 0) { + ERROR_LOG("tcp_prep_req_out_data failed\n"); + return -1; + } + + /* set the length */ + tlv_len = tcp_hndl->socket.ops->set_txd(task); + + /* add tlv */ + if (xio_mbuf_write_tlv(&task->mbuf, task->tlv_type, tlv_len) != 0) { + ERROR_LOG("write tlv failed\n"); + xio_set_error(EOVERFLOW); + return -1; + } + + xio_task_addref(task); + + tcp_task->out_tcp_op = XIO_TCP_SEND; + + list_move_tail(&task->tasks_list_entry, &tcp_hndl->tx_ready_list); + + tcp_hndl->tx_ready_tasks_num++; + + retval = xio_tcp_xmit(tcp_hndl); + if (retval) { + if (xio_errno() != EAGAIN) { + DEBUG_LOG("xio_tcp_xmit failed\n"); + return -1; + } + xio_context_add_event(tcp_hndl->base.ctx, + &tcp_hndl->flush_tx_event); + retval = 0; + } + + return retval; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_write_rsp_header */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_write_rsp_header(struct xio_tcp_transport *tcp_hndl, + struct xio_task *task, + struct xio_tcp_rsp_hdr *rsp_hdr) +{ + XIO_TO_TCP_TASK(task, tcp_task); + struct xio_tcp_rsp_hdr *tmp_rsp_hdr; + uint32_t *wr_len; + int i; + size_t hdr_len; + + /* point to transport header */ + 
xio_mbuf_set_trans_hdr(&task->mbuf); + tmp_rsp_hdr = xio_mbuf_get_curr_ptr(&task->mbuf); + + /* pack relevant values */ + tmp_rsp_hdr->version = rsp_hdr->version; + tmp_rsp_hdr->flags = rsp_hdr->flags; + PACK_SVAL(rsp_hdr, tmp_rsp_hdr, rsp_hdr_len); + PACK_LVAL(rsp_hdr, tmp_rsp_hdr, ltid); + PACK_LVAL(rsp_hdr, tmp_rsp_hdr, rtid); + tmp_rsp_hdr->out_tcp_op = rsp_hdr->out_tcp_op; + PACK_LVAL(rsp_hdr, tmp_rsp_hdr, status); + PACK_SVAL(rsp_hdr, tmp_rsp_hdr, out_num_sge); + PACK_SVAL(rsp_hdr, tmp_rsp_hdr, ulp_hdr_len); + PACK_SVAL(rsp_hdr, tmp_rsp_hdr, ulp_pad_len); + /* remain_data_len not in use */ + PACK_LLVAL(rsp_hdr, tmp_rsp_hdr, ulp_imm_len); + + if (rsp_hdr->out_num_sge) { + wr_len = (uint32_t *)((uint8_t *)tmp_rsp_hdr + + sizeof(struct xio_tcp_rsp_hdr)); + + /* params for RDMA WRITE equivalent*/ + for (i = 0; i < rsp_hdr->out_num_sge; i++) { + *wr_len = htonl(tcp_task->rsp_out_sge[i].length); + wr_len++; + } + } + + hdr_len = sizeof(struct xio_tcp_rsp_hdr); + hdr_len += sizeof(uint32_t) * rsp_hdr->out_num_sge; + + xio_mbuf_inc(&task->mbuf, hdr_len); + +#ifdef EYAL_TODO + print_hex_dump_bytes("post_send: ", DUMP_PREFIX_ADDRESS, + task->mbuf.tlv.head, 64); +#endif + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_prep_rsp_header */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_prep_rsp_header(struct xio_tcp_transport *tcp_hndl, + struct xio_task *task, + uint16_t ulp_hdr_len, + uint16_t ulp_pad_len, + uint64_t ulp_imm_len, + uint32_t status) +{ + XIO_TO_TCP_TASK(task, tcp_task); + struct xio_tcp_rsp_hdr rsp_hdr; + + if (!IS_RESPONSE(task->tlv_type)) { + ERROR_LOG("unknown message type\n"); + return -1; + } + + /* fill response header */ + rsp_hdr.version = XIO_TCP_RSP_HEADER_VERSION; + rsp_hdr.rsp_hdr_len = sizeof(rsp_hdr); + rsp_hdr.rtid = task->rtid; + rsp_hdr.ltid = task->ltid; + rsp_hdr.out_tcp_op = tcp_task->out_tcp_op; + rsp_hdr.flags = XIO_HEADER_FLAG_NONE; + rsp_hdr.out_num_sge = tcp_task->rsp_out_num_sge; + rsp_hdr.ulp_hdr_len = ulp_hdr_len; + rsp_hdr.ulp_pad_len = ulp_pad_len; + rsp_hdr.ulp_imm_len = ulp_imm_len; + rsp_hdr.status = status; + if (xio_tcp_write_rsp_header(tcp_hndl, task, &rsp_hdr) != 0) + goto cleanup; + + tcp_task->txd.ctl_msg_len = xio_mbuf_tlv_len(&task->mbuf); + + /* write the payload header */ + if (ulp_hdr_len) { + if (xio_mbuf_write_array( + &task->mbuf, + task->omsg->out.header.iov_base, + task->omsg->out.header.iov_len) != 0) + goto cleanup; + } + + /* write the pad between header and data */ + if (ulp_pad_len) + xio_mbuf_inc(&task->mbuf, ulp_pad_len); + + return 0; + +cleanup: + xio_set_error(XIO_E_MSG_SIZE); + ERROR_LOG("xio_tcp_write_rsp_header failed\n"); + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_prep_rsp_wr_data */ +/*---------------------------------------------------------------------------*/ +int xio_tcp_prep_rsp_wr_data(struct xio_tcp_transport *tcp_hndl, + struct xio_task *task) +{ + XIO_TO_TCP_TASK(task, tcp_task); + unsigned int i, llen = 0, rlen = 0; + struct xio_sg_table_ops *sgtbl_ops; + void *sgtbl; + void *sg; + + sgtbl = xio_sg_table_get(&task->omsg->out); + sgtbl_ops = xio_sg_table_ops_get(task->omsg->out.sgl_type); + + for_each_sge(sgtbl, sgtbl_ops, sg, i) { + tcp_task->txd.msg_iov[i + 1].iov_base = + sge_addr(sgtbl_ops, sg); + tcp_task->txd.msg_iov[i + 1].iov_len = + sge_length(sgtbl_ops, sg); + llen += sge_length(sgtbl_ops, sg); + } + + 
tcp_task->txd.msg_len = tbl_nents(sgtbl_ops, sgtbl) + 1; + tcp_task->txd.tot_iov_byte_len = llen; + + for (i = 0; i < tcp_task->req_in_num_sge; i++) + rlen += tcp_task->req_in_sge[i].length; + + if (rlen < llen) { + ERROR_LOG("peer provided too small iovec\n"); + ERROR_LOG("tcp write is ignored\n"); + task->status = EINVAL; + goto cleanup; + } + + i = 0; + while (llen) { + if (tcp_task->req_in_sge[i].length < llen) { + tcp_task->rsp_out_sge[i].length = + tcp_task->req_in_sge[i].length; + } else { + tcp_task->rsp_out_sge[i].length = llen; + } + llen -= tcp_task->rsp_out_sge[i].length; + ++i; + } + tcp_task->rsp_out_num_sge = i; + + return 0; +cleanup: + tcp_task->write_num_mp_mem = 0; + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_send_rsp */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_send_rsp(struct xio_tcp_transport *tcp_hndl, + struct xio_task *task) +{ + XIO_TO_TCP_TASK(task, tcp_task); + struct xio_tcp_rsp_hdr rsp_hdr; + uint64_t xio_hdr_len; + uint64_t ulp_hdr_len; + uint64_t ulp_pad_len = 0; + uint64_t ulp_imm_len; + size_t retval; + int enforce_write_rsp; + int tlv_len = 0; + struct xio_sg_table_ops *sgtbl_ops; + void *sgtbl; + + sgtbl = xio_sg_table_get(&task->omsg->out); + sgtbl_ops = xio_sg_table_ops_get(task->omsg->out.sgl_type); + + /* calculate headers */ + ulp_hdr_len = task->omsg->out.header.iov_len; + ulp_imm_len = tbl_length(sgtbl_ops, sgtbl); + xio_hdr_len = xio_mbuf_get_curr_offset(&task->mbuf); + xio_hdr_len += sizeof(rsp_hdr); + xio_hdr_len += tcp_task->req_in_num_sge * sizeof(struct xio_sge); + enforce_write_rsp = task->imsg_flags & XIO_HEADER_FLAG_PEER_WRITE_RSP; + + if (g_poptions->inline_xio_data_align && ulp_imm_len) { + uint16_t hdr_len = xio_hdr_len + ulp_hdr_len; + + ulp_pad_len = ALIGN(hdr_len, g_poptions->inline_xio_data_align) - + hdr_len; + } + + /* + if (tcp_hndl->max_inline_buf_sz < xio_hdr_len + ulp_hdr_len) { + ERROR_LOG("header size %llu exceeds max header %llu\n", + ulp_hdr_len, + tcp_hndl->max_inline_buf_sz - xio_hdr_len); + xio_set_error(XIO_E_MSG_SIZE); + goto cleanup; + } + */ + + /* Small data is outgoing via SEND unless the requester explicitly + * insisted on RDMA operation and provided resources. 
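+	 * Reviewer note: "small" here means the xio header, user header, pad
+	 * and immediate data together fit within max_inline_buf_sz (checked
+	 * in the condition below); otherwise the RDMA-WRITE-equivalent path
+	 * is taken, provided the requester supplied response buffers.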
+ */ + if ((ulp_imm_len == 0) || (!enforce_write_rsp && + ((xio_hdr_len + ulp_hdr_len + + ulp_pad_len + ulp_imm_len) + < tcp_hndl->max_inline_buf_sz))) { + tcp_task->out_tcp_op = XIO_TCP_SEND; + /* write xio header to the buffer */ + retval = xio_tcp_prep_rsp_header( + tcp_hndl, task, + ulp_hdr_len, ulp_pad_len, ulp_imm_len, + XIO_E_SUCCESS); + if (retval) + goto cleanup; + + /* if there is data, set it to buffer or directly to the sge */ + if (ulp_imm_len) { + retval = xio_tcp_write_send_data(tcp_hndl, task); + if (retval) + goto cleanup; + } + } else { + if (tcp_task->req_in_sge[0].addr && + tcp_task->req_in_sge[0].length) { + /* the data is sent via RDMA_WRITE equivalent*/ + tcp_task->out_tcp_op = XIO_TCP_WRITE; + /* prepare rdma write equivalent */ + retval = xio_tcp_prep_rsp_wr_data(tcp_hndl, task); + if (retval) + goto cleanup; + + /* and the header is sent via SEND */ + /* write xio header to the buffer */ + retval = xio_tcp_prep_rsp_header( + tcp_hndl, task, + ulp_hdr_len, 0, ulp_imm_len, + XIO_E_SUCCESS); + } else { + ERROR_LOG("partial completion of request due " \ + "to missing, response buffer\n"); + + /* the client did not provide buffer for response */ + retval = xio_tcp_prep_rsp_header( + tcp_hndl, task, + ulp_hdr_len, 0, 0, + XIO_E_PARTIAL_MSG); + goto cleanup; + } + } + + if (ulp_imm_len == 0) { + /* no data at all */ + tbl_set_nents(sgtbl_ops, sgtbl, 0); + tcp_task->txd.tot_iov_byte_len = 0; + tcp_task->txd.msg_len = 1; + } + + /* set the length */ + tlv_len = tcp_hndl->socket.ops->set_txd(task); + + /* add tlv */ + if (xio_mbuf_write_tlv(&task->mbuf, task->tlv_type, tlv_len) != 0) + goto cleanup; + + list_move_tail(&task->tasks_list_entry, &tcp_hndl->tx_ready_list); + + tcp_hndl->tx_ready_tasks_num++; + + retval = xio_tcp_xmit(tcp_hndl); + if (retval) { + retval = xio_errno(); + if (retval != EAGAIN) { + ERROR_LOG("xio_xmit_tcp failed. %s\n", + xio_strerror(retval)); + return -1; + } + retval = 0; + xio_context_add_event(tcp_hndl->base.ctx, + &tcp_hndl->flush_tx_event); + } + + return retval; + +cleanup: + xio_set_error(XIO_E_MSG_SIZE); + ERROR_LOG("xio_tcp_send_msg failed\n"); + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_read_req_header */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_read_req_header(struct xio_tcp_transport *tcp_hndl, + struct xio_task *task, + struct xio_tcp_req_hdr *req_hdr) +{ + XIO_TO_TCP_TASK(task, tcp_task); + struct xio_tcp_req_hdr *tmp_req_hdr; + struct xio_sge *tmp_sge; + int i; + size_t hdr_len; + + /* point to transport header */ + xio_mbuf_set_trans_hdr(&task->mbuf); + tmp_req_hdr = xio_mbuf_get_curr_ptr(&task->mbuf); + + req_hdr->version = tmp_req_hdr->version; + req_hdr->flags = tmp_req_hdr->flags; + UNPACK_SVAL(tmp_req_hdr, req_hdr, req_hdr_len); + + if (req_hdr->req_hdr_len != sizeof(struct xio_tcp_req_hdr)) { + ERROR_LOG( + "header length's read failed. 
arrived:%d expected:%zd\n", + req_hdr->req_hdr_len, sizeof(struct xio_tcp_req_hdr)); + return -1; + } + + UNPACK_SVAL(tmp_req_hdr, req_hdr, sn); + UNPACK_LVAL(tmp_req_hdr, req_hdr, ltid); + req_hdr->in_tcp_op = tmp_req_hdr->in_tcp_op; + req_hdr->out_tcp_op = tmp_req_hdr->out_tcp_op; + + UNPACK_SVAL(tmp_req_hdr, req_hdr, in_num_sge); + UNPACK_SVAL(tmp_req_hdr, req_hdr, out_num_sge); + UNPACK_SVAL(tmp_req_hdr, req_hdr, ulp_hdr_len); + UNPACK_SVAL(tmp_req_hdr, req_hdr, ulp_pad_len); + + /* remain_data_len not in use */ + UNPACK_LLVAL(tmp_req_hdr, req_hdr, ulp_imm_len); + + tmp_sge = (void *)((uint8_t *)tmp_req_hdr + + sizeof(struct xio_tcp_req_hdr)); + + tcp_task->sn = req_hdr->sn; + + /* params for SEND */ + /* params for RDMA_WRITE */ + for (i = 0; i < req_hdr->in_num_sge; i++) { + UNPACK_LLVAL(tmp_sge, &tcp_task->req_in_sge[i], addr); + UNPACK_LVAL(tmp_sge, &tcp_task->req_in_sge[i], length); + UNPACK_LVAL(tmp_sge, &tcp_task->req_in_sge[i], stag); + tmp_sge++; + } + tcp_task->req_in_num_sge = i; + + /* params for SEND */ + /* params for RDMA_READ */ + for (i = 0; i < req_hdr->out_num_sge; i++) { + UNPACK_LLVAL(tmp_sge, &tcp_task->req_out_sge[i], addr); + UNPACK_LVAL(tmp_sge, &tcp_task->req_out_sge[i], length); + UNPACK_LVAL(tmp_sge, &tcp_task->req_out_sge[i], stag); + tmp_sge++; + } + tcp_task->req_out_num_sge = i; + + hdr_len = sizeof(struct xio_tcp_req_hdr); + hdr_len += sizeof(struct xio_sge) * (req_hdr->in_num_sge + + req_hdr->out_num_sge); + + xio_mbuf_inc(&task->mbuf, hdr_len); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_read_rsp_header */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_read_rsp_header(struct xio_tcp_transport *tcp_hndl, + struct xio_task *task, + struct xio_tcp_rsp_hdr *rsp_hdr) +{ + XIO_TO_TCP_TASK(task, tcp_task); + struct xio_tcp_rsp_hdr *tmp_rsp_hdr; + uint32_t *wr_len; + int i; + size_t hdr_len; + + /* point to transport header */ + xio_mbuf_set_trans_hdr(&task->mbuf); + tmp_rsp_hdr = xio_mbuf_get_curr_ptr(&task->mbuf); + + rsp_hdr->version = tmp_rsp_hdr->version; + rsp_hdr->flags = tmp_rsp_hdr->flags; + UNPACK_SVAL(tmp_rsp_hdr, rsp_hdr, rsp_hdr_len); + + if (rsp_hdr->rsp_hdr_len != sizeof(struct xio_tcp_rsp_hdr)) { + ERROR_LOG( + "header length's read failed. 
arrived:%d expected:%zd\n", + rsp_hdr->rsp_hdr_len, sizeof(struct xio_tcp_rsp_hdr)); + return -1; + } + + UNPACK_SVAL(tmp_rsp_hdr, rsp_hdr, sn); + UNPACK_LVAL(tmp_rsp_hdr, rsp_hdr, rtid); + UNPACK_LVAL(tmp_rsp_hdr, rsp_hdr, ltid); + rsp_hdr->out_tcp_op = tmp_rsp_hdr->out_tcp_op; + UNPACK_LVAL(tmp_rsp_hdr, rsp_hdr, status); + UNPACK_SVAL(tmp_rsp_hdr, rsp_hdr, out_num_sge); + UNPACK_SVAL(tmp_rsp_hdr, rsp_hdr, ulp_hdr_len); + UNPACK_SVAL(tmp_rsp_hdr, rsp_hdr, ulp_pad_len); + /* remain_data_len not in use */ + UNPACK_LLVAL(tmp_rsp_hdr, rsp_hdr, ulp_imm_len); + + if (rsp_hdr->out_num_sge) { + wr_len = (void *)((uint8_t *)tmp_rsp_hdr + + sizeof(struct xio_tcp_rsp_hdr)); + + /* params for RDMA WRITE */ + for (i = 0; i < rsp_hdr->out_num_sge; i++) { + tcp_task->rsp_out_sge[i].length = ntohl(*wr_len); + wr_len++; + } + tcp_task->rsp_out_num_sge = rsp_hdr->out_num_sge; + } + + hdr_len = sizeof(struct xio_tcp_rsp_hdr); + hdr_len += sizeof(uint32_t) * rsp_hdr->out_num_sge; + + xio_mbuf_inc(&task->mbuf, hdr_len); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_notify_assign_in_buf */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_assign_in_buf(struct xio_tcp_transport *tcp_hndl, + struct xio_task *task, int *is_assigned) +{ + union xio_transport_event_data event_data = { + .assign_in_buf.task = task, + .assign_in_buf.is_assigned = 0 + }; + + xio_transport_notify_observer(&tcp_hndl->base, + XIO_TRANSPORT_EVENT_ASSIGN_IN_BUF, + &event_data); + + *is_assigned = event_data.assign_in_buf.is_assigned; + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_recv_ctl_work */ +/*---------------------------------------------------------------------------*/ +int xio_tcp_recv_ctl_work(struct xio_tcp_transport *tcp_hndl, + struct socket *sock, + struct xio_tcp_work_req *xio_recv, int block) +{ + int retval; + int bytes_to_copy; + struct msghdr msg; + struct kvec vec; + + memset(&msg, 0, sizeof(msg)); + msg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL; + + if (xio_recv->tot_iov_byte_len == 0) + return 1; + + if (MSGHDR_IOVLEN(&xio_recv->msg) > 1 || + xio_recv->tot_iov_byte_len != MSGHDR_IOV(&xio_recv->msg)[0].iov_len) { + ERROR_LOG("expecting only 1 sized iovec\n"); + return 0; + } + + while (xio_recv->tot_iov_byte_len) { + while (tcp_hndl->tmp_rx_buf_len == 0) { + vec.iov_base = tcp_hndl->tmp_rx_buf; + vec.iov_len = TMP_RX_BUF_SIZE; + retval = kernel_recvmsg(sock, &msg, &vec, 1, + TMP_RX_BUF_SIZE, + msg.msg_flags); + if (retval > 0) { + tcp_hndl->tmp_rx_buf_len = retval; + tcp_hndl->tmp_rx_buf_cur = tcp_hndl->tmp_rx_buf; + } else if (retval == 0) { + /*so errno is not EAGAIN*/ + xio_set_error(ECONNABORTED); + DEBUG_LOG("tcp transport got EOF,tcp_hndl=%p\n", + tcp_hndl); + return 0; + } else { + if (retval == -EAGAIN) { + if (!block) { + xio_set_error(-retval); + return -1; + } + } else if (retval + == -ECONNRESET) { + DEBUG_LOG("recv failed.(errno=%d)\n", + -retval); + return 0; + } else { + xio_set_error(-retval); + ERROR_LOG("recv failed.(errno=%d)\n", + -retval); + return -1; + } + } + } + bytes_to_copy = xio_recv->tot_iov_byte_len > + tcp_hndl->tmp_rx_buf_len ? 
+ tcp_hndl->tmp_rx_buf_len : + xio_recv->tot_iov_byte_len; + memcpy(MSGHDR_IOV(&xio_recv->msg)[0].iov_base, + tcp_hndl->tmp_rx_buf_cur, bytes_to_copy); + tcp_hndl->tmp_rx_buf_cur += bytes_to_copy; + ((struct iovec *)MSGHDR_IOV(&xio_recv->msg))[0].iov_base += bytes_to_copy; + tcp_hndl->tmp_rx_buf_len -= bytes_to_copy; + ((struct iovec *)MSGHDR_IOV(&xio_recv->msg))[0].iov_len -= bytes_to_copy; + xio_recv->tot_iov_byte_len -= bytes_to_copy; + } + + MSGHDR_IOVLEN(&xio_recv->msg) = 0; + + return 1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_recvmsg_work */ +/*---------------------------------------------------------------------------*/ +int xio_tcp_recvmsg_work(struct xio_tcp_transport *tcp_hndl, + struct socket *sock, + struct xio_tcp_work_req *xio_recv, int block) +{ + unsigned int i; + int orig_iovlen; + int retval; + int recv_bytes = 0; + + if (xio_recv->tot_iov_byte_len == 0) + return 1; + + while (xio_recv->tot_iov_byte_len) { + retval = kernel_recvmsg(sock, &xio_recv->msg, + (struct kvec *)MSGHDR_IOV(&xio_recv->msg), + MSGHDR_IOVLEN(&xio_recv->msg), + (size_t)xio_recv->tot_iov_byte_len, + xio_recv->msg.msg_flags); + if (retval > 0) { + recv_bytes += retval; + xio_recv->tot_iov_byte_len -= retval; + if (xio_recv->tot_iov_byte_len == 0) { + MSGHDR_IOVLEN(&xio_recv->msg) = 0; + break; + } + } else if (retval == 0) { + xio_set_error(ECONNABORTED); /*so errno is not EAGAIN*/ + DEBUG_LOG("tcp transport got EOF, tcp_hndl=%p\n", + tcp_hndl); + goto err; + } else { + if (retval == -EAGAIN) { + if (!block) { + xio_set_error(-retval); + retval = -1; + goto err; + } + } else if (retval == -ECONNRESET) { + xio_set_error(-retval); + DEBUG_LOG("recvmsg failed. (errno=%d)\n", + -retval); + retval = 0; + goto err; + } else { + xio_set_error(-retval); + ERROR_LOG("recvmsg failed. 
(errno=%d)\n", + -retval); + retval = -1; + goto err; + } + } + } + + return recv_bytes; + +err: + orig_iovlen = MSGHDR_IOVLEN(&xio_recv->msg); + for (i = 0; i < orig_iovlen; i++) { + if (MSGHDR_IOV(&xio_recv->msg)[i].iov_len == 0) { + MSGHDR_IOVLEN(&xio_recv->msg)--; + } else { + MSGHDR_IOV(&xio_recv->msg) = &MSGHDR_IOV(&xio_recv->msg)[i]; + break; + } + } + return retval; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_single_sock_set_rxd */ +/*---------------------------------------------------------------------------*/ +void xio_tcp_single_sock_set_rxd(struct xio_task *task, + void *buf, uint32_t len) +{ + XIO_TO_TCP_TASK(task, tcp_task); + + tcp_task->rxd.tot_iov_byte_len = 0; + tcp_task->rxd.msg_len = 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_dual_sock_set_rxd */ +/*---------------------------------------------------------------------------*/ +void xio_tcp_dual_sock_set_rxd(struct xio_task *task, + void *buf, uint32_t len) +{ + XIO_TO_TCP_TASK(task, tcp_task); + if (IS_APPLICATION_MSG(task->tlv_type)) + buf += task->imsg.in.header.iov_len; + tcp_task->rxd.msg_iov[0].iov_base = buf; + tcp_task->rxd.msg_iov[0].iov_len = len; + tcp_task->rxd.tot_iov_byte_len = len; + if (len) { + tcp_task->rxd.msg_len = 1; + MSGHDR_IOVLEN(&tcp_task->rxd.msg) = 1; + MSGHDR_IOV(&tcp_task->rxd.msg) = tcp_task->rxd.msg_iov; + } else { + tcp_task->rxd.msg_len = 0; + MSGHDR_IOVLEN(&tcp_task->rxd.msg) = 0; + } +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_rd_req_header */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_rd_req_header(struct xio_tcp_transport *tcp_hndl, + struct xio_task *task) +{ + XIO_TO_TCP_TASK(task, tcp_task); + unsigned int i, vec_size = 0; + int retval; + int user_assign_flag = 0; + size_t rlen = 0, llen = 0; + struct xio_sg_table_ops *sgtbl_ops; + void *sgtbl; + void *sg; + + /* responder side got request for rdma read */ + + /* need for buffer to do rdma read. 
there are two options: */ + /* option 1: user provides call back that fills application memory */ + /* option 2: use internal buffer pool */ + + /* hint the upper layer of sizes */ + sgtbl = xio_sg_table_get(&task->imsg.in); + sgtbl_ops = xio_sg_table_ops_get(task->imsg.in.sgl_type); + tbl_set_nents(sgtbl_ops, sgtbl, tcp_task->req_out_num_sge); + for_each_sge(sgtbl, sgtbl_ops, sg, i) { + sge_set_addr(sgtbl_ops, sg, NULL); + sge_set_length(sgtbl_ops, sg, + tcp_task->req_out_sge[i].length); + rlen += tcp_task->req_out_sge[i].length; + tcp_task->read_mp_mem[i].cache = NULL; + } + sgtbl = xio_sg_table_get(&task->imsg.out); + sgtbl_ops = xio_sg_table_ops_get(task->imsg.out.sgl_type); + if (tcp_task->req_in_num_sge) { + tbl_set_nents(sgtbl_ops, sgtbl, tcp_task->req_in_num_sge); + for_each_sge(sgtbl, sgtbl_ops, sg, i) { + sge_set_addr(sgtbl_ops, sg, NULL); + sge_set_length(sgtbl_ops, sg, + tcp_task->req_in_sge[i].length); + tcp_task->write_mp_mem[i].cache = NULL; + } + } else { + tbl_set_nents(sgtbl_ops, sgtbl, 0); + } + sgtbl = xio_sg_table_get(&task->imsg.in); + sgtbl_ops = xio_sg_table_ops_get(task->imsg.in.sgl_type); + + xio_tcp_assign_in_buf(tcp_hndl, task, &user_assign_flag); + if (user_assign_flag) { + /* if user does not have buffers ignore */ + if (tbl_nents(sgtbl_ops, sgtbl) == 0) { + WARN_LOG("application has not provided buffers\n"); + WARN_LOG("tcp read is ignored\n"); + task->status = XIO_E_NO_USER_BUFS; + return -1; + } + + for_each_sge(sgtbl, sgtbl_ops, sg, i) { + if (!sge_addr(sgtbl_ops, sg)) { + ERROR_LOG("application has provided " \ + "null address\n"); + ERROR_LOG("tcp read is ignored\n"); + task->status = XIO_E_NO_USER_BUFS; + return -1; + } + llen += sge_length(sgtbl_ops, sg); + vec_size++; + if (llen > rlen) { + sge_set_length( + sgtbl_ops, sg, + rlen - (llen - sge_length(sgtbl_ops, sg))); + tcp_task->req_out_sge[i].length = + sge_length(sgtbl_ops, sg); + break; + } + tcp_task->req_out_sge[i].length = + sge_length(sgtbl_ops, sg); + } + if (rlen > llen) { + ERROR_LOG("application provided too small iovec\n"); + ERROR_LOG("remote peer want to write %zd bytes while" \ + "local peer provided buffer size %zd bytes\n", + rlen, llen); + ERROR_LOG("tcp read is ignored\n"); + task->status = XIO_E_USER_BUF_OVERFLOW; + return -1; + } + + tcp_task->req_out_num_sge = vec_size; + tbl_set_nents(sgtbl_ops, sgtbl, vec_size); + set_bits(XIO_MSG_HINT_ASSIGNED_DATA_IN_BUF, &task->imsg.hints); + } else { + if (!tcp_hndl->tcp_mempool) { + ERROR_LOG("message /read/write failed - " \ + "library's memory pool disabled\n"); + task->status = XIO_E_NO_BUFS; + goto cleanup; + } + + tbl_set_nents(sgtbl_ops, sgtbl, tcp_task->req_out_num_sge); + for_each_sge(sgtbl, sgtbl_ops, sg, i) { + retval = xio_mempool_alloc( + tcp_hndl->tcp_mempool, + tcp_task->req_out_sge[i].length, + &tcp_task->read_mp_mem[i]); + + if (retval) { + tcp_task->read_num_mp_mem = i; + ERROR_LOG("mempool is empty for %zd bytes\n", + tcp_task->read_mp_mem[i].length); + + task->status = ENOMEM; + goto cleanup; + } + sge_set_addr(sgtbl_ops, sg, + tcp_task->read_mp_mem[i].addr); + sge_set_length(sgtbl_ops, sg, + tcp_task->read_mp_mem[i].length); + } + tcp_task->read_num_mp_mem = tcp_task->req_out_num_sge; + } + + sg = sge_first(sgtbl_ops, sgtbl); + for (i = 0; i < tcp_task->req_out_num_sge; i++) { + tcp_task->rxd.msg_iov[i + 1].iov_base = + sge_addr(sgtbl_ops, sg); + tcp_task->rxd.msg_iov[i + 1].iov_len = + tcp_task->req_out_sge[i].length; + sge_set_length(sgtbl_ops, sg, + tcp_task->req_out_sge[i].length); + sg = sge_next(sgtbl_ops, sgtbl, 
sg); + } + tcp_task->rxd.msg_len += tcp_task->req_out_num_sge; + + /* prepare the in side of the message */ + tcp_task->rxd.tot_iov_byte_len += rlen; + if (MSGHDR_IOVLEN(&tcp_task->rxd.msg)) + MSGHDR_IOV(&tcp_task->rxd.msg) = tcp_task->rxd.msg_iov; + else + MSGHDR_IOV(&tcp_task->rxd.msg) = &tcp_task->rxd.msg_iov[1]; + MSGHDR_IOVLEN(&tcp_task->rxd.msg) = tcp_task->rxd.msg_len; + + return 0; +cleanup: + for (i = 0; i < tcp_task->read_num_mp_mem; i++) { + if (tcp_task->read_mp_mem[i].cache) + xio_mempool_free_mp(&tcp_task->read_mp_mem[i]); + } + + tcp_task->read_num_mp_mem = 0; + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_on_recv_req_header */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_on_recv_req_header(struct xio_tcp_transport *tcp_hndl, + struct xio_task *task) +{ + XIO_TO_TCP_TASK(task, tcp_task); + int retval = 0; + struct xio_tcp_req_hdr req_hdr; + struct xio_msg *imsg; + void *ulp_hdr; + unsigned int i; + struct xio_sg_table_ops *sgtbl_ops; + void *sgtbl; + void *sg; + + /* read header */ + retval = xio_tcp_read_req_header(tcp_hndl, task, &req_hdr); + if (retval != 0) { + xio_set_error(XIO_E_MSG_INVALID); + goto cleanup; + } + + /* save originator identifier */ + task->rtid = req_hdr.ltid; + task->imsg_flags = req_hdr.flags; + + imsg = &task->imsg; + sgtbl = xio_sg_table_get(&imsg->out); + sgtbl_ops = xio_sg_table_ops_get(imsg->out.sgl_type); + + ulp_hdr = xio_mbuf_get_curr_ptr(&task->mbuf); + + imsg->type = task->tlv_type; + imsg->in.header.iov_len = req_hdr.ulp_hdr_len; + clr_bits(XIO_MSG_HINT_ASSIGNED_DATA_IN_BUF, &imsg->hints); + + if (req_hdr.ulp_hdr_len) + imsg->in.header.iov_base = ulp_hdr; + else + imsg->in.header.iov_base = NULL; + + /* hint upper layer about expected response */ + if (tcp_task->req_in_num_sge) { + tbl_set_nents(sgtbl_ops, sgtbl, tcp_task->req_in_num_sge); + for_each_sge(sgtbl, sgtbl_ops, sg, i) { + sge_set_addr(sgtbl_ops, sg, NULL); + sge_set_length(sgtbl_ops, sg, + tcp_task->req_in_sge[i].length); + } + } else { + tbl_set_nents(sgtbl_ops, sgtbl, 0); + } + + tcp_task->out_tcp_op = req_hdr.out_tcp_op; + tcp_task->in_tcp_op = req_hdr.in_tcp_op; + + switch (req_hdr.out_tcp_op) { + case XIO_TCP_SEND: + if (IS_APPLICATION_MSG(task->tlv_type)) + tcp_hndl->socket.ops->set_rxd(task, ulp_hdr, + (uint32_t)req_hdr.ulp_imm_len); + else + tcp_hndl->socket.ops->set_rxd(task, ulp_hdr, + req_hdr.ulp_hdr_len + + req_hdr.ulp_pad_len + + (uint32_t)req_hdr.ulp_imm_len); + sgtbl = xio_sg_table_get(&imsg->in); + sgtbl_ops = xio_sg_table_ops_get(imsg->in.sgl_type); + if (req_hdr.ulp_imm_len) { + /* incoming data via SEND */ + /* if data arrived, set the pointers */ + tbl_set_nents(sgtbl_ops, sgtbl, 1); + sg = sge_first(sgtbl_ops, sgtbl); + sge_set_addr(sgtbl_ops, sg, + (ulp_hdr + imsg->in.header.iov_len + + req_hdr.ulp_pad_len)); + sge_set_length(sgtbl_ops, sg, req_hdr.ulp_imm_len); + } else { + /* no data at all */ + tbl_set_nents(sgtbl_ops, sgtbl, 0); + } + break; + case XIO_TCP_READ: + tcp_hndl->socket.ops->set_rxd(task, ulp_hdr, + (uint32_t)req_hdr.ulp_imm_len); + /* handle RDMA READ equivalent. 
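+		 * Reviewer note: XIO_TCP_READ emulates RDMA READ over TCP;
+		 * the payload is received separately from the header, so
+		 * xio_tcp_rd_req_header() below must stage receive buffers
+		 * (user-assigned or taken from the mempool) before the data
+		 * can be read in.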
*/ + TRACE_LOG("tcp read header\n"); + retval = xio_tcp_rd_req_header(tcp_hndl, task); + if (retval) { + ERROR_LOG("tcp read header failed\n"); + goto cleanup; + } + break; + default: + ERROR_LOG("unexpected out_tcp_op\n"); + xio_set_error(XIO_E_MSG_INVALID); + task->status = XIO_E_MSG_INVALID; + break; + }; + + return 0; + +cleanup: + retval = xio_errno(); + ERROR_LOG("xio_tcp_on_recv_req failed. (errno=%d %s)\n", retval, + xio_strerror(retval)); + xio_transport_notify_observer_error(&tcp_hndl->base, retval); + + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_on_recv_req_data */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_on_recv_req_data(struct xio_tcp_transport *tcp_hndl, + struct xio_task *task) +{ + XIO_TO_TCP_TASK(task, tcp_task); + union xio_transport_event_data event_data; + + switch (tcp_task->out_tcp_op) { + case XIO_TCP_SEND: + break; + case XIO_TCP_READ: + /* handle RDMA READ equivalent. */ + TRACE_LOG("tcp read data\n"); + break; + default: + ERROR_LOG("unexpected out_tcp_op\n"); + break; + }; + + /* fill notification event */ + event_data.msg.op = XIO_WC_OP_RECV; + event_data.msg.task = task; + + list_move_tail(&task->tasks_list_entry, &tcp_hndl->io_list); + + xio_transport_notify_observer(&tcp_hndl->base, + XIO_TRANSPORT_EVENT_NEW_MESSAGE, + &event_data); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_on_recv_rsp_header */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_on_recv_rsp_header(struct xio_tcp_transport *tcp_hndl, + struct xio_task *task) +{ + XIO_TO_TCP_TASK(task, tcp_task); + int retval = 0; + struct xio_tcp_rsp_hdr rsp_hdr; + struct xio_msg *imsg; + void *ulp_hdr; + struct xio_tcp_task *tcp_sender_task; + unsigned int i; + struct xio_sg_table_ops *isgtbl_ops; + void *isgtbl; + void *sg; + + /* read the response header */ + retval = xio_tcp_read_rsp_header(tcp_hndl, task, &rsp_hdr); + if (retval != 0) { + xio_set_error(XIO_E_MSG_INVALID); + goto cleanup; + } + /* read the sn */ + tcp_task->sn = rsp_hdr.sn; + + /* find the sender task */ + task->sender_task = + xio_tcp_primary_task_lookup(tcp_hndl, rsp_hdr.rtid); + task->rtid = rsp_hdr.ltid; + + tcp_sender_task = task->sender_task->dd_data; + + /* mark the sender task as arrived */ + task->sender_task->state = XIO_TASK_STATE_RESPONSE_RECV; + + imsg = &task->imsg; + isgtbl = xio_sg_table_get(&imsg->in); + isgtbl_ops = xio_sg_table_ops_get(imsg->in.sgl_type); + + clr_bits(XIO_MSG_HINT_ASSIGNED_DATA_IN_BUF, &imsg->hints); + + ulp_hdr = xio_mbuf_get_curr_ptr(&task->mbuf); + /* msg from received message */ + if (rsp_hdr.ulp_hdr_len) { + imsg->in.header.iov_base = ulp_hdr; + imsg->in.header.iov_len = rsp_hdr.ulp_hdr_len; + } else { + imsg->in.header.iov_base = NULL; + imsg->in.header.iov_len = 0; + } + task->status = rsp_hdr.status; + + tcp_task->out_tcp_op = rsp_hdr.out_tcp_op; + + switch (rsp_hdr.out_tcp_op) { + case XIO_TCP_SEND: + if (IS_APPLICATION_MSG(task->tlv_type)) + tcp_hndl->socket.ops->set_rxd(task, ulp_hdr, + (uint32_t)rsp_hdr.ulp_imm_len); + else + tcp_hndl->socket.ops->set_rxd(task, ulp_hdr, + rsp_hdr.ulp_hdr_len + + rsp_hdr.ulp_pad_len + + rsp_hdr.ulp_imm_len); + /* if data arrived, set the pointers */ + if (rsp_hdr.ulp_imm_len) { + tbl_set_nents(isgtbl_ops, isgtbl, 1); + sg = sge_first(isgtbl_ops, isgtbl); + sge_set_addr(isgtbl_ops, sg, + (ulp_hdr + 
imsg->in.header.iov_len + + rsp_hdr.ulp_pad_len)); + sge_set_length(isgtbl_ops, sg, + rsp_hdr.ulp_imm_len); + } else { + tbl_set_nents(isgtbl_ops, isgtbl, 0); + } + break; + case XIO_TCP_WRITE: + tcp_hndl->socket.ops->set_rxd(task->sender_task, ulp_hdr, 0); + if (tcp_task->rsp_out_num_sge > + tcp_sender_task->read_num_mp_mem) { + ERROR_LOG("local in data_iovec is too small %d < %d\n", + tcp_sender_task->read_num_mp_mem, + tcp_task->rsp_out_num_sge); + goto partial_msg; + } + + tbl_set_nents(isgtbl_ops, isgtbl, + tcp_task->rsp_out_num_sge); + sg = sge_first(isgtbl_ops, isgtbl); + for (i = 0; i < tcp_task->rsp_out_num_sge; i++) { + sge_set_addr(isgtbl_ops, sg, + tcp_sender_task->read_mp_mem[i].addr); + sge_set_length(isgtbl_ops, sg, + tcp_task->rsp_out_sge[i].length); + tcp_sender_task->rxd.msg_iov[i + 1].iov_base = + tcp_sender_task->read_mp_mem[i].addr; + tcp_sender_task->rxd.msg_iov[i + 1].iov_len = + tcp_task->rsp_out_sge[i].length; + sg = sge_next(isgtbl_ops, isgtbl, sg); + } + + tcp_sender_task->rxd.msg_len += + tcp_task->rsp_out_num_sge; + tcp_sender_task->rxd.tot_iov_byte_len += + rsp_hdr.ulp_imm_len; + if (MSGHDR_IOVLEN(&tcp_sender_task->rxd.msg)) + MSGHDR_IOV(&tcp_sender_task->rxd.msg) = + tcp_sender_task->rxd.msg_iov; + else + MSGHDR_IOV(&tcp_sender_task->rxd.msg) = + &tcp_sender_task->rxd.msg_iov[1]; + MSGHDR_IOVLEN(&tcp_sender_task->rxd.msg) = + tcp_sender_task->rxd.msg_len; + break; + default: + ERROR_LOG("unexpected opcode %d\n", rsp_hdr.out_tcp_op); + break; + } + +partial_msg: + return 0; + +cleanup: + retval = xio_errno(); + ERROR_LOG("xio_tcp_on_recv_rsp failed. (errno=%d %s)\n", + retval, xio_strerror(retval)); + xio_transport_notify_observer_error(&tcp_hndl->base, retval); + + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_on_recv_rsp_data */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_on_recv_rsp_data(struct xio_tcp_transport *tcp_hndl, + struct xio_task *task) +{ + XIO_TO_TCP_TASK(task, tcp_task); + union xio_transport_event_data event_data; + struct xio_msg *imsg; + struct xio_msg *omsg; + unsigned int i; + struct xio_tcp_task *tcp_sender_task; + struct xio_sg_table_ops *isgtbl_ops; + void *isgtbl; + struct xio_sg_table_ops *osgtbl_ops; + void *osgtbl; + void *sg; + + omsg = task->sender_task->omsg; + imsg = &task->imsg; + isgtbl = xio_sg_table_get(&imsg->in); + isgtbl_ops = xio_sg_table_ops_get(imsg->in.sgl_type); + osgtbl = xio_sg_table_get(&omsg->in); + osgtbl_ops = xio_sg_table_ops_get(omsg->in.sgl_type); + + /* handle the headers */ + if (omsg->in.header.iov_base) { + /* copy header to user buffers */ + size_t hdr_len = 0; + + if (imsg->in.header.iov_len > omsg->in.header.iov_len) { + hdr_len = omsg->in.header.iov_len; + task->status = XIO_E_MSG_SIZE; + } else { + hdr_len = imsg->in.header.iov_len; + task->status = XIO_E_SUCCESS; + } + if (hdr_len) + memcpy(omsg->in.header.iov_base, + imsg->in.header.iov_base, + hdr_len); + else + *((char *)omsg->in.header.iov_base) = 0; + + omsg->in.header.iov_len = hdr_len; + } else { + /* no copy - just pointers */ + memclonev(&omsg->in.header, 1, &imsg->in.header, 1); + } + + switch (tcp_task->out_tcp_op) { + case XIO_TCP_SEND: + if (tbl_nents(osgtbl_ops, osgtbl)) { + /* deep copy */ + if (tbl_nents(isgtbl_ops, isgtbl)) { + size_t idata_len = + tbl_length(isgtbl_ops, isgtbl); + size_t odata_len = + tbl_length(osgtbl_ops, osgtbl); + if (idata_len > odata_len) { + task->status = XIO_E_MSG_SIZE; + goto 
partial_msg; + } else { + task->status = XIO_E_SUCCESS; + } + sg = sge_first(osgtbl_ops, osgtbl); + if (sge_addr(osgtbl_ops, sg)) { + /* user provided buffer so do copy */ + tbl_copy(osgtbl_ops, osgtbl, + isgtbl_ops, isgtbl); + } else { + /* use provided only length - set user + * pointers */ + tbl_clone(osgtbl_ops, osgtbl, + isgtbl_ops, isgtbl); + } + } else { + tbl_set_nents(osgtbl_ops, osgtbl, + tbl_nents(isgtbl_ops, isgtbl)); + } + } else { + tbl_clone(osgtbl_ops, osgtbl, + isgtbl_ops, isgtbl); + } + break; + case XIO_TCP_WRITE: + tcp_sender_task = task->sender_task->dd_data; + /* user provided buffer */ + sg = sge_first(osgtbl_ops, osgtbl); + if (sge_addr(osgtbl_ops, sg) && + !tcp_sender_task->read_mp_mem->cache) { + /* user buffers were aligned no + * bounce buffer data was copied + * directly to user buffer need + * to update the buffer length + */ + void *isg; + /* data was copied directly to user buffer */ + /* need to update the buffer length */ + for_each_sge(isgtbl, isgtbl_ops, isg, i) { + sge_set_length(osgtbl_ops, sg, + sge_length(isgtbl_ops, isg)); + sg = sge_next(osgtbl_ops, osgtbl, sg); + } + tbl_set_nents(osgtbl_ops, osgtbl, + tbl_nents(isgtbl_ops, isgtbl)); + } else { + /* deep copy */ + if (sge_addr(osgtbl_ops, sg)) { + /* Bounce buffer */ + tbl_copy(osgtbl_ops, osgtbl, + isgtbl_ops, isgtbl); + /* put buffers back to pool */ + for (i = 0; i < tcp_sender_task->read_num_mp_mem; + i++) { + xio_mempool_free_mp( + &tcp_sender_task->read_mp_mem[i]); + } + tcp_sender_task->read_num_mp_mem = 0; + } else { + /* use provided only length - set user + * pointers */ + tbl_clone(osgtbl_ops, osgtbl, + isgtbl_ops, isgtbl); + } + } + break; + default: + ERROR_LOG("unexpected opcode %d\n", tcp_task->out_tcp_op); + break; + } + +partial_msg: + + /* fill notification event */ + event_data.msg.op = XIO_WC_OP_RECV; + event_data.msg.task = task; + + list_move_tail(&task->tasks_list_entry, &tcp_hndl->io_list); + + /* notify the upper layer of received message */ + xio_transport_notify_observer(&tcp_hndl->base, + XIO_TRANSPORT_EVENT_NEW_MESSAGE, + &event_data); + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_cancel_req_handler */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_cancel_req_handler(struct xio_tcp_transport *tcp_hndl, + void *ulp_msg, size_t ulp_msg_sz) +{ + union xio_transport_event_data event_data; + + /* fill notification event */ + event_data.cancel.ulp_msg = ulp_msg; + event_data.cancel.ulp_msg_sz = ulp_msg_sz; + event_data.cancel.task = NULL; + event_data.cancel.result = 0; + + xio_transport_notify_observer(&tcp_hndl->base, + XIO_TRANSPORT_EVENT_CANCEL_REQUEST, + &event_data); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_cancel_rsp_handler */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_cancel_rsp_handler(struct xio_tcp_transport *tcp_hndl, + struct xio_tcp_cancel_hdr *cancel_hdr, + void *ulp_msg, size_t ulp_msg_sz) +{ + union xio_transport_event_data event_data; + struct xio_task *ptask, *next_ptask; + struct xio_tcp_task *tcp_task; + struct xio_task *task_to_cancel = NULL; + + if ((cancel_hdr->result == XIO_E_MSG_CANCELED) || + (cancel_hdr->result == XIO_E_MSG_CANCEL_FAILED)) { + /* look in the in_flight */ + list_for_each_entry_safe(ptask, next_ptask, + &tcp_hndl->in_flight_list, + tasks_list_entry) { + tcp_task = ptask->dd_data; + if 
(tcp_task->sn == cancel_hdr->sn) { + task_to_cancel = ptask; + break; + } + } + if (!task_to_cancel) { + /* look in the tx_comp */ + list_for_each_entry_safe(ptask, next_ptask, + &tcp_hndl->tx_comp_list, + tasks_list_entry) { + tcp_task = ptask->dd_data; + if (tcp_task->sn == cancel_hdr->sn) { + task_to_cancel = ptask; + break; + } + } + } + + if (!task_to_cancel) { + ERROR_LOG("[%u] - Failed to found canceled message\n", + cancel_hdr->sn); + /* fill notification event */ + event_data.cancel.ulp_msg = ulp_msg; + event_data.cancel.ulp_msg_sz = ulp_msg_sz; + event_data.cancel.task = NULL; + event_data.cancel.result = XIO_E_MSG_NOT_FOUND; + + xio_transport_notify_observer( + &tcp_hndl->base, + XIO_TRANSPORT_EVENT_CANCEL_RESPONSE, + &event_data); + return 0; + } + } + + /* fill notification event */ + event_data.cancel.ulp_msg = ulp_msg; + event_data.cancel.ulp_msg_sz = ulp_msg_sz; + event_data.cancel.task = task_to_cancel; + event_data.cancel.result = cancel_hdr->result; + + xio_transport_notify_observer(&tcp_hndl->base, + XIO_TRANSPORT_EVENT_CANCEL_RESPONSE, + &event_data); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_on_recv_cancel_rsp_data */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_on_recv_cancel_rsp_data(struct xio_tcp_transport *tcp_hndl, + struct xio_task *task) +{ + struct xio_msg *imsg; + void *buff; + uint16_t ulp_msg_sz; + struct xio_tcp_cancel_hdr cancel_hdr; + struct xio_sg_table_ops *sgtbl_ops; + void *sgtbl; + void *sg; + + imsg = &task->imsg; + sgtbl = xio_sg_table_get(&task->imsg.in); + sgtbl_ops = xio_sg_table_ops_get(task->imsg.in.sgl_type); + sg = sge_first(sgtbl_ops, sgtbl); + sge_set_addr(sgtbl_ops, sg, NULL); + tbl_set_nents(sgtbl_ops, sgtbl, 0); + + buff = imsg->in.header.iov_base; + buff += xio_read_uint16(&cancel_hdr.hdr_len, 0, buff); + buff += xio_read_uint16(&cancel_hdr.sn, 0, buff); + buff += xio_read_uint32(&cancel_hdr.result, 0, buff); + buff += xio_read_uint16(&ulp_msg_sz, 0, buff); + + list_move_tail(&task->tasks_list_entry, &tcp_hndl->io_list); + + xio_tcp_cancel_rsp_handler(tcp_hndl, &cancel_hdr, + buff, ulp_msg_sz); + /* return the the cancel response task to pool */ + xio_tasks_pool_put(task); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_on_recv_cancel_rsp_header */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_on_recv_cancel_rsp_header( + struct xio_tcp_transport *tcp_hndl, + struct xio_task *task) +{ + XIO_TO_TCP_TASK(task, tcp_task); + int retval = 0; + struct xio_tcp_rsp_hdr rsp_hdr; + struct xio_msg *imsg; + void *ulp_hdr; + + /* read the response header */ + retval = xio_tcp_read_rsp_header(tcp_hndl, task, &rsp_hdr); + if (retval != 0) { + xio_set_error(XIO_E_MSG_INVALID); + return -1; + } + + /* read the sn */ + tcp_task->sn = rsp_hdr.sn; + + imsg = &task->imsg; + ulp_hdr = xio_mbuf_get_curr_ptr(&task->mbuf); + + imsg->type = task->tlv_type; + imsg->in.header.iov_len = rsp_hdr.ulp_hdr_len; + imsg->in.header.iov_base = ulp_hdr; + + tcp_hndl->socket.ops->set_rxd(task, ulp_hdr, rsp_hdr.ulp_hdr_len); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_on_recv_cancel_req_data */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_on_recv_cancel_req_data(struct xio_tcp_transport *tcp_hndl, + struct 
xio_task *task) +{ + struct xio_tcp_cancel_hdr cancel_hdr; + struct xio_msg *imsg; + void *buff; + uint16_t ulp_msg_sz; + struct xio_sg_table_ops *sgtbl_ops; + void *sgtbl; + void *sg; + + imsg = &task->imsg; + sgtbl = xio_sg_table_get(&task->imsg.in); + sgtbl_ops = xio_sg_table_ops_get(task->imsg.in.sgl_type); + sg = sge_first(sgtbl_ops, sgtbl); + sge_set_addr(sgtbl_ops, sg, NULL); + tbl_set_nents(sgtbl_ops, sgtbl, 0); + + buff = imsg->in.header.iov_base; + buff += xio_read_uint16(&cancel_hdr.hdr_len, 0, buff); + buff += xio_read_uint16(&cancel_hdr.sn, 0, buff); + buff += xio_read_uint32(&cancel_hdr.result, 0, buff); + buff += xio_read_uint16(&ulp_msg_sz, 0, buff); + + list_move_tail(&task->tasks_list_entry, &tcp_hndl->io_list); + + xio_tcp_cancel_req_handler(tcp_hndl, buff, ulp_msg_sz); + /* return the the cancel request task to pool */ + xio_tasks_pool_put(task); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_on_recv_cancel_req_header */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_on_recv_cancel_req_header( + struct xio_tcp_transport *tcp_hndl, + struct xio_task *task) +{ + XIO_TO_TCP_TASK(task, tcp_task); + int retval = 0; + struct xio_tcp_req_hdr req_hdr; + struct xio_msg *imsg; + void *ulp_hdr; + + /* read header */ + retval = xio_tcp_read_req_header(tcp_hndl, task, &req_hdr); + if (retval != 0) { + xio_set_error(XIO_E_MSG_INVALID); + goto cleanup; + } + + /* read the sn */ + tcp_task->sn = req_hdr.sn; + + imsg = &task->imsg; + ulp_hdr = xio_mbuf_get_curr_ptr(&task->mbuf); + + /* set header pointers */ + imsg->type = task->tlv_type; + imsg->in.header.iov_len = req_hdr.ulp_hdr_len; + imsg->in.header.iov_base = ulp_hdr; + + tcp_hndl->socket.ops->set_rxd(task, ulp_hdr, req_hdr.ulp_hdr_len); + + return 0; + +cleanup: + retval = xio_errno(); + ERROR_LOG("recv_cancel_req_header failed. 
(errno=%d %s)\n", retval, + xio_strerror(retval)); + xio_transport_notify_observer_error(&tcp_hndl->base, retval); + + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_send_cancel */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_send_cancel(struct xio_tcp_transport *tcp_hndl, + uint32_t tlv_type, + struct xio_tcp_cancel_hdr *cancel_hdr, + void *ulp_msg, size_t ulp_msg_sz) +{ + uint64_t tlv_len; + uint16_t ulp_hdr_len; + int retval; + struct xio_task *task; + struct xio_tcp_task *tcp_task; + void *buff; + + task = xio_tcp_primary_task_alloc(tcp_hndl); + if (!task) { + ERROR_LOG("primary tasks pool is empty\n"); + return -1; + } + xio_mbuf_reset(&task->mbuf); + + /* set start of the tlv */ + if (xio_mbuf_tlv_start(&task->mbuf) != 0) + return -1; + + task->tlv_type = tlv_type; + tcp_task = (struct xio_tcp_task *)task->dd_data; + tcp_task->out_tcp_op = XIO_TCP_SEND; + tcp_task->write_num_mp_mem = 0; + tcp_task->read_num_mp_mem = 0; + + ulp_hdr_len = sizeof(*cancel_hdr) + sizeof(uint16_t) + ulp_msg_sz; + tcp_hndl->dummy_msg.out.header.iov_base = + kzalloc(ulp_hdr_len, GFP_KERNEL); + tcp_hndl->dummy_msg.out.header.iov_len = ulp_hdr_len; + + /* write the message */ + /* get the pointer */ + buff = tcp_hndl->dummy_msg.out.header.iov_base; + + /* pack relevant values */ + buff += xio_write_uint16(cancel_hdr->hdr_len, 0, buff); + buff += xio_write_uint16(cancel_hdr->sn, 0, buff); + buff += xio_write_uint32(cancel_hdr->result, 0, buff); + buff += xio_write_uint16((uint16_t)(ulp_msg_sz), 0, buff); + buff += xio_write_array(ulp_msg, ulp_msg_sz, 0, buff); + + task->omsg = &tcp_hndl->dummy_msg; + + /* write xio header to the buffer */ + if (IS_REQUEST(task->tlv_type)) { + retval = xio_tcp_prep_req_header( + tcp_hndl, task, + ulp_hdr_len, 0, 0, + XIO_E_SUCCESS); + } else { + retval = xio_tcp_prep_rsp_header( + tcp_hndl, task, + ulp_hdr_len, 0, 0, + XIO_E_SUCCESS); + } + + if (retval) + return -1; + + /* set the length */ + tcp_task->txd.msg_len = 1; + tcp_task->txd.tot_iov_byte_len = 0; + + tlv_len = tcp_hndl->socket.ops->set_txd(task); + + /* add tlv */ + if (xio_mbuf_write_tlv(&task->mbuf, task->tlv_type, tlv_len) != 0) + return -1; + + task->omsg = NULL; + kfree(tcp_hndl->dummy_msg.out.header.iov_base); + + tcp_hndl->tx_ready_tasks_num++; + list_move_tail(&task->tasks_list_entry, &tcp_hndl->tx_ready_list); + + xio_tcp_xmit(tcp_hndl); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_send */ +/*---------------------------------------------------------------------------*/ +int xio_tcp_send(struct xio_transport_base *transport, + struct xio_task *task) +{ + struct xio_tcp_transport *tcp_hndl = + (struct xio_tcp_transport *)transport; + int retval = -1; + + switch (task->tlv_type) { + case XIO_NEXUS_SETUP_REQ: + retval = xio_tcp_send_setup_req(tcp_hndl, task); + break; + case XIO_NEXUS_SETUP_RSP: + retval = xio_tcp_send_setup_rsp(tcp_hndl, task); + break; + default: + if (IS_REQUEST(task->tlv_type)) + retval = xio_tcp_send_req(tcp_hndl, task); + else if (IS_RESPONSE(task->tlv_type)) + retval = xio_tcp_send_rsp(tcp_hndl, task); + else + ERROR_LOG("unknown message type:0x%x\n", + task->tlv_type); + break; + } + + return retval; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_get_data_rxd */ +/*---------------------------------------------------------------------------*/ +struct 
xio_tcp_work_req *xio_tcp_get_data_rxd(struct xio_task *task) +{ + XIO_TO_TCP_TASK(task, tcp_task); + struct xio_tcp_task *tcp_sender_task; + + switch (tcp_task->out_tcp_op) { + case XIO_TCP_SEND: + case XIO_TCP_READ: + return &tcp_task->rxd; + case XIO_TCP_WRITE: + tcp_sender_task = task->sender_task->dd_data; + return &tcp_sender_task->rxd; + default: + ERROR_LOG("unexpected opcode %d\n", tcp_task->out_tcp_op); + break; + } + + return NULL; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_rx_data_handler */ +/*---------------------------------------------------------------------------*/ +int xio_tcp_rx_data_handler(struct xio_tcp_transport *tcp_hndl, int batch_nr, + int *resched) +{ + int retval = 0, recvmsg_retval = 0; + struct xio_tcp_task *tcp_task, *next_tcp_task; + struct xio_task *task, *next_task; + unsigned int i, last_in_rxq; + int batch_count = 0, tmp_count = 0, ret_count = 0; + unsigned int iov_len; + uint64_t bytes_recv; + struct xio_tcp_work_req *rxd_work, *next_rxd_work; + + task = list_first_entry_or_null(&tcp_hndl->rx_list, + struct xio_task, + tasks_list_entry); + + while (task && batch_count < batch_nr) { + tcp_task = task->dd_data; + + if (tcp_task->rxd.stage != XIO_TCP_RX_IO_DATA) + break; + + next_task = list_first_entry_or_null( + &task->tasks_list_entry, + struct xio_task, tasks_list_entry); + next_tcp_task = next_task ? next_task->dd_data : NULL; + next_rxd_work = (next_tcp_task && + next_tcp_task->rxd.stage == XIO_TCP_RX_IO_DATA) + ? xio_tcp_get_data_rxd(next_task) : NULL; + + /* An Accelio application runs on Side A would crush, + * when it connects Side B by a port binded by an + * application (not accelio) run on Side B. + */ + rxd_work = xio_tcp_get_data_rxd(task); + if (!rxd_work) { + ERROR_LOG("rxd_work is NULL! 
Disconnect!\n"); + xio_tcp_disconnect_helper(tcp_hndl); + return -1; + } + + for (i = 0; i < MSGHDR_IOVLEN(&rxd_work->msg); i++) { + tcp_hndl->tmp_work.msg_iov + [tcp_hndl->tmp_work.msg_len].iov_base = + MSGHDR_IOV(&rxd_work->msg)[i].iov_base; + tcp_hndl->tmp_work.msg_iov + [tcp_hndl->tmp_work.msg_len].iov_len = + MSGHDR_IOV(&rxd_work->msg)[i].iov_len; + ++tcp_hndl->tmp_work.msg_len; + } + tcp_hndl->tmp_work.tot_iov_byte_len += + rxd_work->tot_iov_byte_len; + + ++batch_count; + ++tmp_count; + + if (batch_count != batch_nr && next_rxd_work && + (MSGHDR_IOVLEN(&next_rxd_work->msg) + tcp_hndl->tmp_work.msg_len) + < UIO_MAXIOV) { + task = next_task; + continue; + } + + MSGHDR_IOV(&tcp_hndl->tmp_work.msg) = tcp_hndl->tmp_work.msg_iov; + MSGHDR_IOVLEN(&tcp_hndl->tmp_work.msg) = tcp_hndl->tmp_work.msg_len; + + bytes_recv = tcp_hndl->tmp_work.tot_iov_byte_len; + recvmsg_retval = xio_tcp_recvmsg_work( + tcp_hndl, + tcp_hndl->socket.data.ksock, + &tcp_hndl->tmp_work, 0); + bytes_recv -= tcp_hndl->tmp_work.tot_iov_byte_len; + + task = list_first_entry(&tcp_hndl->rx_list, + struct xio_task, tasks_list_entry); + iov_len = tcp_hndl->tmp_work.msg_len - + MSGHDR_IOVLEN(&tcp_hndl->tmp_work.msg); + for (i = 0; i < (unsigned int)tmp_count; i++) { + tcp_task = task->dd_data; + rxd_work = xio_tcp_get_data_rxd(task); + + if (MSGHDR_IOVLEN(&rxd_work->msg) > iov_len) + break; + + iov_len -= MSGHDR_IOVLEN(&rxd_work->msg); + bytes_recv -= rxd_work->tot_iov_byte_len; + + task = list_first_entry(&task->tasks_list_entry, + struct xio_task, + tasks_list_entry); + } + tmp_count = 0; + + if (MSGHDR_IOVLEN(&tcp_hndl->tmp_work.msg)) { + tcp_task = task->dd_data; + rxd_work = xio_tcp_get_data_rxd(task); + MSGHDR_IOV(&rxd_work->msg) = &MSGHDR_IOV(&rxd_work->msg)[iov_len]; + ((struct iovec *)MSGHDR_IOV(&rxd_work->msg))[0].iov_base = + MSGHDR_IOV(&tcp_hndl->tmp_work.msg)[0].iov_base; + ((struct iovec *)MSGHDR_IOV(&rxd_work->msg))[0].iov_len = + MSGHDR_IOV(&tcp_hndl->tmp_work.msg)[0].iov_len; + MSGHDR_IOVLEN(&rxd_work->msg) -= iov_len; + rxd_work->tot_iov_byte_len -= bytes_recv; + } + + tcp_hndl->tmp_work.msg_len = 0; + tcp_hndl->tmp_work.tot_iov_byte_len = 0; + + /* look for the maximum last in rxq index */ + tmp_count = 0; + last_in_rxq = 0; + list_for_each_entry(task, &tcp_hndl->rx_list, tasks_list_entry) { + if (IS_APPLICATION_MSG(task->tlv_type)) + last_in_rxq = (int)tmp_count; + if (++tmp_count == (int)i) + break; + } + tmp_count = 0; + + task = list_first_entry(&tcp_hndl->rx_list, struct xio_task, + tasks_list_entry); + while (i--) { + task->last_in_rxq = (ret_count == (int)last_in_rxq); + ++ret_count; + tcp_task = task->dd_data; + switch (task->tlv_type) { + case XIO_CANCEL_REQ: + xio_tcp_on_recv_cancel_req_data(tcp_hndl, task); + break; + case XIO_CANCEL_RSP: + xio_tcp_on_recv_cancel_rsp_data(tcp_hndl, task); + break; + default: + if (IS_REQUEST(task->tlv_type)) { + retval = + xio_tcp_on_recv_req_data(tcp_hndl, + task); + } else if (IS_RESPONSE(task->tlv_type)) { + retval = + xio_tcp_on_recv_rsp_data(tcp_hndl, + task); + } else { + ERROR_LOG("unknown message type:0x%x\n", + task->tlv_type); + } + if (retval < 0) { + if (recvmsg_retval > 0) + *resched = 1; + return retval; + } + } + + task = list_first_entry(&tcp_hndl->rx_list, + struct xio_task, + tasks_list_entry); + } + + if (recvmsg_retval == 0) { + DEBUG_LOG("tcp transport got EOF, tcp_hndl=%p\n", + tcp_hndl); + if (tcp_task->out_tcp_op == XIO_TCP_READ) { /*TODO needed?*/ + for (i = 0; i < tcp_task->read_num_mp_mem; i++) { + if (tcp_task->read_mp_mem[i].cache) + 
xio_mempool_free_mp( + &tcp_task->read_mp_mem[i]); + } + tcp_task->read_num_mp_mem = 0; + } + xio_tcp_disconnect_helper(tcp_hndl); + return -1; + } else if (recvmsg_retval < 0) { + break; + } + + task = list_first_entry_or_null(&tcp_hndl->rx_list, + struct xio_task, + tasks_list_entry); + } + + if (recvmsg_retval > 0) + *resched = 1; + + if (tcp_hndl->tx_ready_tasks_num) { + retval = xio_tcp_xmit(tcp_hndl); + if (retval < 0) { + if (xio_errno() != XIO_EAGAIN) { + ERROR_LOG("xio_tcp_xmit failed\n"); + return -1; + } + return ret_count; + } + } + + return ret_count; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_rx_ctl_handler */ +/*---------------------------------------------------------------------------*/ +int xio_tcp_rx_ctl_handler(struct xio_tcp_transport *tcp_hndl, int batch_nr, + int *resched) +{ + int retval = 0, recvmsg_retval = 0; + struct xio_tcp_task *tcp_task; + struct xio_task *task, *task_next; + int exit; + int count; + int tmp_resched = 0; + + task = list_first_entry_or_null(&tcp_hndl->rx_list, + struct xio_task, + tasks_list_entry); + + count = 0; + exit = 0; + while (task && (&task->tasks_list_entry != &tcp_hndl->rx_list) && + (count < batch_nr) && !exit) { + tcp_task = task->dd_data; + + switch (tcp_task->rxd.stage) { + case XIO_TCP_RX_START: + /* ORK todo find a better place to rearm rx_list?*/ + if (tcp_hndl->state == + XIO_TRANSPORT_STATE_CONNECTED || + tcp_hndl->state == + XIO_TRANSPORT_STATE_DISCONNECTED) { + task_next = + xio_tcp_primary_task_alloc(tcp_hndl); + if (!task_next) { + ERROR_LOG( + "primary task pool is empty\n"); + exit = 1; + continue; + } else { + list_add_tail( + &task_next->tasks_list_entry, + &tcp_hndl->rx_list); + } + } + tcp_task->rxd.tot_iov_byte_len = sizeof(struct xio_tlv); + MSGHDR_IOV(&tcp_task->rxd.msg) = tcp_task->rxd.msg_iov; + MSGHDR_IOVLEN(&tcp_task->rxd.msg) = 1; + tcp_task->rxd.stage = XIO_TCP_RX_TLV; + /*fallthrough*/ + case XIO_TCP_RX_TLV: + recvmsg_retval = tcp_hndl->socket.ops->rx_ctl_work( + tcp_hndl, + tcp_hndl->socket.ctl.ksock, + &tcp_task->rxd, 0); + if (recvmsg_retval == 0) { + DEBUG_LOG("tcp transport got EOF,tcp_hndl=%p\n", + tcp_hndl); + if (count) { + exit = 1; + *resched = 1; + tmp_resched = 1; + break; + } + xio_tcp_disconnect_helper(tcp_hndl); + return -1; + } else if (recvmsg_retval < 0) { + exit = 1; + break; + } + retval = xio_mbuf_read_first_tlv(&task->mbuf); + ((struct iovec *)MSGHDR_IOV(&tcp_task->rxd.msg))[0].iov_base = + tcp_task->rxd.msg_iov[1].iov_base; + ((struct iovec *)MSGHDR_IOV(&tcp_task->rxd.msg))[0].iov_len = + task->mbuf.tlv.len; + MSGHDR_IOVLEN(&tcp_task->rxd.msg) = 1; + tcp_task->rxd.tot_iov_byte_len = task->mbuf.tlv.len; + tcp_task->rxd.stage = XIO_TCP_RX_HEADER; + /*fallthrough*/ + case XIO_TCP_RX_HEADER: + recvmsg_retval = tcp_hndl->socket.ops->rx_ctl_work( + tcp_hndl, + tcp_hndl->socket.ctl.ksock, + &tcp_task->rxd, 0); + if (recvmsg_retval == 0) { + DEBUG_LOG("tcp transport got EOF,tcp_hndl=%p\n", + tcp_hndl); + if (count) { + exit = 1; + *resched = 1; + tmp_resched = 1; + break; + } + xio_tcp_disconnect_helper(tcp_hndl); + return -1; + } else if (recvmsg_retval < 0) { + exit = 1; + break; + } + task->tlv_type = xio_mbuf_tlv_type(&task->mbuf); + /* call recv completion */ + switch (task->tlv_type) { + case XIO_NEXUS_SETUP_REQ: + case XIO_NEXUS_SETUP_RSP: + xio_tcp_on_setup_msg(tcp_hndl, task); + return 1; + case XIO_CANCEL_REQ: + xio_tcp_on_recv_cancel_req_header(tcp_hndl, + task); + break; + case XIO_CANCEL_RSP: + 
xio_tcp_on_recv_cancel_rsp_header(tcp_hndl, + task); + break; + default: + if (IS_REQUEST(task->tlv_type)) + retval = + xio_tcp_on_recv_req_header(tcp_hndl, + task); + else if (IS_RESPONSE(task->tlv_type)) + retval = + xio_tcp_on_recv_rsp_header(tcp_hndl, + task); + else + ERROR_LOG("unknown message type:0x%x\n", + task->tlv_type); + if (retval < 0) { + ERROR_LOG("error reading header\n"); + if (recvmsg_retval > 0) + *resched = 1; + return retval; + } + } + tcp_task->rxd.stage = XIO_TCP_RX_IO_DATA; + /*fallthrough*/ + case XIO_TCP_RX_IO_DATA: + ++count; + break; + default: + ERROR_LOG("unknown stage type:%d\n", + tcp_task->rxd.stage); + break; + } + task = list_first_entry(&task->tasks_list_entry, + struct xio_task, tasks_list_entry); + } + + if (recvmsg_retval > 0) { + tmp_resched = 1; + *resched = 1; + } + + if (count == 0) + return 0; + + *resched = tmp_resched; + retval = tcp_hndl->socket.ops->rx_data_handler(tcp_hndl, batch_nr, + resched); + if (unlikely(retval < 0)) + return retval; + count = retval; + + if (tcp_hndl->tx_ready_tasks_num) { + retval = xio_tcp_xmit(tcp_hndl); + if (retval < 0) { + if (xio_errno() != XIO_EAGAIN) { + ERROR_LOG("xio_tcp_xmit failed\n"); + return -1; + } + return count; + } + } + + return count; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_poll */ +/*---------------------------------------------------------------------------*/ +int xio_tcp_poll(struct xio_transport_base *transport, + long min_nr, long max_nr, + struct timespec *ts_timeout) +{ + struct xio_tcp_transport *tcp_hndl; + int nr_comp = 0, recv_counter; + cycles_t timeout = -1; + cycles_t start_time = jiffies; + int resched = 0; + + if (min_nr > max_nr) + return -1; + + timeout = timespec_to_jiffies(ts_timeout); + if (timeout == 0) + return 0; + + tcp_hndl = (struct xio_tcp_transport *)transport; + + if (tcp_hndl->state != XIO_TRANSPORT_STATE_CONNECTED) { + ERROR_LOG("tcp transport is not connected, state=%d\n", + tcp_hndl->state); + return -1; + } + + while (1) { + /* ORK todo blocking recv with timeout?*/ + recv_counter = tcp_hndl->socket.ops->rx_ctl_handler(tcp_hndl, + &resched); + if (recv_counter < 0 && xio_errno() != EAGAIN) + break; + + nr_comp += recv_counter; + max_nr -= recv_counter; + if (nr_comp >= min_nr || max_nr <= 0) + break; + if ((jiffies - start_time) >= timeout) + break; + } + + return nr_comp; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_cancel_req */ +/*---------------------------------------------------------------------------*/ +int xio_tcp_cancel_req(struct xio_transport_base *transport, + struct xio_msg *req, uint64_t stag, + void *ulp_msg, size_t ulp_msg_sz) +{ + struct xio_tcp_transport *tcp_hndl = + (struct xio_tcp_transport *)transport; + struct xio_task *ptask, *next_ptask; + union xio_transport_event_data event_data; + struct xio_tcp_task *tcp_task; + struct xio_tcp_cancel_hdr cancel_hdr = { + .hdr_len = sizeof(cancel_hdr), + .result = 0 + }; + + /* look in the tx_ready */ + list_for_each_entry_safe(ptask, next_ptask, &tcp_hndl->tx_ready_list, + tasks_list_entry) { + if (ptask->omsg && + (ptask->omsg->sn == req->sn) && + (ptask->stag == stag)) { + TRACE_LOG("[%llu] - message found on tx_ready_list\n", + req->sn); + + tcp_task = ptask->dd_data; + + if (tcp_task->txd.stage != XIO_TCP_TX_BEFORE) + goto send_cancel; + + /* return decrease ref count from task */ + xio_tasks_pool_put(ptask); + tcp_hndl->tx_ready_tasks_num--; + list_move_tail(&ptask->tasks_list_entry, 
+ &tcp_hndl->tx_comp_list); + + /* fill notification event */ + event_data.cancel.ulp_msg = ulp_msg; + event_data.cancel.ulp_msg_sz = ulp_msg_sz; + event_data.cancel.task = ptask; + event_data.cancel.result = XIO_E_MSG_CANCELED; + + xio_transport_notify_observer( + &tcp_hndl->base, + XIO_TRANSPORT_EVENT_CANCEL_RESPONSE, + &event_data); + return 0; + } + } + /* look in the in_flight */ + list_for_each_entry_safe(ptask, next_ptask, &tcp_hndl->in_flight_list, + tasks_list_entry) { + if (ptask->omsg && + (ptask->omsg->sn == req->sn) && + (ptask->stag == stag) && + (ptask->state != XIO_TASK_STATE_RESPONSE_RECV)) { + TRACE_LOG("[%llu] - message found on in_flight_list\n", + req->sn); + goto send_cancel; + } + } + /* look in the tx_comp */ + list_for_each_entry_safe(ptask, next_ptask, &tcp_hndl->tx_comp_list, + tasks_list_entry) { + if (ptask->omsg && + (ptask->omsg->sn == req->sn) && + (ptask->stag == stag) && + (ptask->state != XIO_TASK_STATE_RESPONSE_RECV)) { + TRACE_LOG("[%llu] - message found on tx_comp_list\n", + req->sn); + goto send_cancel; + } + } + TRACE_LOG("[%llu] - message not found on tx path\n", req->sn); + + /* fill notification event */ + event_data.cancel.ulp_msg = ulp_msg; + event_data.cancel.ulp_msg_sz = ulp_msg_sz; + event_data.cancel.task = NULL; + event_data.cancel.result = XIO_E_MSG_NOT_FOUND; + + xio_transport_notify_observer(&tcp_hndl->base, + XIO_TRANSPORT_EVENT_CANCEL_RESPONSE, + &event_data); + + return 0; + +send_cancel: + + TRACE_LOG("[%llu] - send cancel request\n", req->sn); + + tcp_task = ptask->dd_data; + cancel_hdr.sn = tcp_task->sn; + + xio_tcp_send_cancel(tcp_hndl, XIO_CANCEL_REQ, &cancel_hdr, + ulp_msg, ulp_msg_sz); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_cancel_rsp */ +/*---------------------------------------------------------------------------*/ +int xio_tcp_cancel_rsp(struct xio_transport_base *transport, + struct xio_task *task, enum xio_status result, + void *ulp_msg, size_t ulp_msg_sz) +{ + struct xio_tcp_transport *tcp_hndl = + (struct xio_tcp_transport *)transport; + struct xio_tcp_task *tcp_task; + + struct xio_tcp_cancel_hdr cancel_hdr = { + .hdr_len = sizeof(cancel_hdr), + .result = result, + }; + + if (task) { + tcp_task = task->dd_data; + cancel_hdr.sn = tcp_task->sn; + } else { + /* 0 might be a valid sn for another task */ + if ((cancel_hdr.result == XIO_E_MSG_CANCELED) || + (cancel_hdr.result == XIO_E_MSG_CANCEL_FAILED)) { + ERROR_LOG("task cannot be null if result is " \ + "MSG_CANCELED or MSG_CANCEL_FAILED\n"); + return -1; + } + cancel_hdr.sn = 0; + } + + /* fill dummy transport header since was handled by upper layer + */ + return xio_tcp_send_cancel(tcp_hndl, XIO_CANCEL_RSP, + &cancel_hdr, ulp_msg, ulp_msg_sz); +} diff --git a/open_src/xio/src/kernel/transport/tcp/xio_tcp_management.c b/open_src/xio/src/kernel/transport/tcp/xio_tcp_management.c new file mode 100644 index 0000000..6afe781 --- /dev/null +++ b/open_src/xio/src/kernel/transport/tcp/xio_tcp_management.c @@ -0,0 +1,2762 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. 
+ * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include +#include + +#include "libxio.h" +#include +#include "xio_common.h" +#include "xio_observer.h" +#include "xio_log.h" +#include "xio_protocol.h" +#include "xio_mbuf.h" +#include "xio_task.h" +#include "xio_ktransport.h" +#include "xio_transport.h" +#include "xio_mempool.h" +#include "xio_ev_data.h" +#include "xio_objpool.h" +#include "xio_workqueue.h" +#include "xio_ev_loop.h" +#include "xio_context.h" +#include "xio_context_priv.h" +#include "xio_tcp_transport.h" +#include "xio_sg_table.h" + +MODULE_AUTHOR("Or Kehati, Eyal Solomon, Shlomo Pongratz"); +MODULE_DESCRIPTION("XIO library v" DRV_VERSION " (" DRV_RELDATE ")"); +MODULE_LICENSE("Dual BSD/GPL"); + +/* The root of xio_tcp debugfs tree */ +static struct dentry *xio_tcp_root; + +#define VALIDATE_SZ(sz) do { \ + if (optlen != (sz)) { \ + xio_set_error(EINVAL); \ + return -1; \ + } \ + } while (0) + +/* default option values */ +#define XIO_OPTVAL_DEF_ENABLE_MEM_POOL 1 +#define XIO_OPTVAL_DEF_ENABLE_MR_CHECK 0 +#define XIO_OPTVAL_DEF_TCP_ENABLE_DMA_LATENCY 0 +#define XIO_OPTVAL_DEF_TCP_MAX_IN_IOVSZ XIO_IOVLEN +#define XIO_OPTVAL_DEF_TCP_MAX_OUT_IOVSZ XIO_IOVLEN +#define XIO_OPTVAL_DEF_TCP_NO_DELAY 0 +#define XIO_OPTVAL_DEF_TCP_SO_SNDBUF 4194304 +#define XIO_OPTVAL_DEF_TCP_SO_RCVBUF 4194304 +#define XIO_OPTVAL_DEF_TCP_DUAL_SOCK 1 + +/*---------------------------------------------------------------------------*/ +/* globals */ +/*---------------------------------------------------------------------------*/ +struct xio_transport xio_tcp_transport; +static struct xio_tcp_socket_ops single_sock_ops; +static struct xio_tcp_socket_ops dual_sock_ops; +struct xio_options *g_poptions; + +/* tcp options */ +struct xio_tcp_options tcp_options = { + .enable_mem_pool = 
XIO_OPTVAL_DEF_ENABLE_MEM_POOL, + .enable_dma_latency = XIO_OPTVAL_DEF_TCP_ENABLE_DMA_LATENCY, + .enable_mr_check = XIO_OPTVAL_DEF_ENABLE_MR_CHECK, + .max_in_iovsz = XIO_OPTVAL_DEF_TCP_MAX_IN_IOVSZ, + .max_out_iovsz = XIO_OPTVAL_DEF_TCP_MAX_OUT_IOVSZ, + .tcp_no_delay = XIO_OPTVAL_DEF_TCP_NO_DELAY, + .tcp_so_sndbuf = XIO_OPTVAL_DEF_TCP_SO_SNDBUF, + .tcp_so_rcvbuf = XIO_OPTVAL_DEF_TCP_SO_RCVBUF, + .tcp_dual_sock = XIO_OPTVAL_DEF_TCP_DUAL_SOCK, +}; + +static int xio_tcp_post_close(struct xio_tcp_transport *tcp_hndl, + int force_free); + +void xio_tcp_save_orig_callbacks(struct xio_socket *socket) +{ + write_lock_bh(&socket->ksock->sk->sk_callback_lock); + socket->orig_sk_data_ready = socket->ksock->sk->sk_data_ready; + socket->orig_sk_state_change = socket->ksock->sk->sk_state_change; + socket->orig_sk_write_space = socket->ksock->sk->sk_write_space; + write_unlock_bh(&socket->ksock->sk->sk_callback_lock); +} + +void xio_tcp_save_orig_callbacks_from(struct xio_socket *to, + struct xio_socket *from) +{ + to->orig_sk_data_ready = from->orig_sk_data_ready; + to->orig_sk_state_change = from->orig_sk_state_change; + to->orig_sk_write_space = from->orig_sk_write_space; +} + +void xio_tcp_restore_callbacks_from(struct socket *to, + struct xio_socket *from) +{ + write_lock_bh(&to->sk->sk_callback_lock); + if (from->orig_sk_data_ready) + to->sk->sk_data_ready = from->orig_sk_data_ready; + if (from->orig_sk_state_change) + to->sk->sk_state_change = from->orig_sk_state_change; + if (from->orig_sk_write_space) + to->sk->sk_write_space = from->orig_sk_write_space; + to->sk->sk_user_data = NULL; + write_unlock_bh(&to->sk->sk_callback_lock); +} + +void xio_tcp_restore_callbacks(struct xio_socket *socket) +{ + xio_tcp_restore_callbacks_from(socket->ksock, socket); +} + +void xio_tcp_set_callbacks(struct socket *sock, +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 15, 0) + void (*sk_data_ready)(struct sock *sk), +#else + void (*sk_data_ready)(struct sock *sk, int bytes), +#endif + void (*sk_state_change)(struct sock *sk), + void (*sk_write_space)(struct sock *sk), + void *user_data) +{ + write_lock_bh(&sock->sk->sk_callback_lock); + + if (sk_data_ready) + sock->sk->sk_data_ready = sk_data_ready; + if (sk_state_change) + sock->sk->sk_state_change = sk_state_change; + if (sk_write_space) + sock->sk->sk_write_space = sk_write_space; + + sock->sk->sk_user_data = user_data; + + write_unlock_bh(&sock->sk->sk_callback_lock); +} + +void xio_tcp_state_change_cb(struct sock *sk) +{ + void (*state_change)(struct sock *sk); + struct xio_tcp_transport *tcp_hndl; + struct xio_socket *socket; + + read_lock(&sk->sk_callback_lock); + + DEBUG_LOG("sock %p state_change to %d\n", sk, sk->sk_state); + + tcp_hndl = sk->sk_user_data; + if (!tcp_hndl) { + state_change = sk->sk_state_change; + goto out; + } + + socket = (tcp_hndl->socket.ctl.ksock->sk == sk) ? 
+ &tcp_hndl->socket.ctl : &tcp_hndl->socket.data; + + state_change = socket->orig_sk_state_change; + + switch (sk->sk_state) { + case TCP_ESTABLISHED: + xio_context_add_event(tcp_hndl->base.ctx, + &socket->conn_establish_event_data); + DEBUG_LOG("establish ksock=%p\n", socket->ksock); + break; + case TCP_CLOSE: + if (tcp_hndl->state != XIO_TRANSPORT_STATE_LISTEN) + xio_tcp_disconnect_helper(tcp_hndl); + break; + default: + break; + } +out: + read_unlock(&sk->sk_callback_lock); + state_change(sk); +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_get_max_header_size */ +/*---------------------------------------------------------------------------*/ +int xio_tcp_get_max_header_size(void) +{ + int req_hdr = XIO_TRANSPORT_OFFSET + sizeof(struct xio_tcp_req_hdr); + int rsp_hdr = XIO_TRANSPORT_OFFSET + sizeof(struct xio_tcp_rsp_hdr); + int iovsz = tcp_options.max_out_iovsz + tcp_options.max_in_iovsz; + + req_hdr += iovsz * sizeof(struct xio_sge); + rsp_hdr += tcp_options.max_out_iovsz * sizeof(struct xio_sge); + + return max(req_hdr, rsp_hdr); +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_get_inline_buffer_size */ +/*---------------------------------------------------------------------------*/ +int xio_tcp_get_inline_buffer_size(void) +{ + int inline_buf_sz = ALIGN(xio_tcp_get_max_header_size() + + g_poptions->max_inline_xio_hdr + + g_poptions->max_inline_xio_data, 1024); + return inline_buf_sz; +} +/*---------------------------------------------------------------------------*/ +/* xio_tcp_flush_all_tasks */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_flush_all_tasks(struct xio_tcp_transport *tcp_hndl) +{ + if (!list_empty(&tcp_hndl->in_flight_list)) { + TRACE_LOG("in_flight_list not empty!\n"); + xio_transport_flush_task_list(&tcp_hndl->in_flight_list); + /* for task that attached to senders with ref count = 2 */ + xio_transport_flush_task_list(&tcp_hndl->in_flight_list); + } + + if (!list_empty(&tcp_hndl->tx_comp_list)) { + TRACE_LOG("tx_comp_list not empty!\n"); + xio_transport_flush_task_list(&tcp_hndl->tx_comp_list); + } + if (!list_empty(&tcp_hndl->io_list)) { + TRACE_LOG("io_list not empty!\n"); + xio_transport_flush_task_list(&tcp_hndl->io_list); + } + + if (!list_empty(&tcp_hndl->tx_ready_list)) { + TRACE_LOG("tx_ready_list not empty!\n"); + xio_transport_flush_task_list(&tcp_hndl->tx_ready_list); + /* for task that attached to senders with ref count = 2 */ + xio_transport_flush_task_list(&tcp_hndl->tx_ready_list); + } + + if (!list_empty(&tcp_hndl->rx_list)) { + TRACE_LOG("rx_list not empty!\n"); + xio_transport_flush_task_list(&tcp_hndl->rx_list); + } + + tcp_hndl->tx_ready_tasks_num = 0; + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* on_sock_close */ +/*---------------------------------------------------------------------------*/ +static void on_sock_close(struct xio_tcp_transport *tcp_hndl) +{ + TRACE_LOG("on_sock_close tcp_hndl:%p, state:%d\n\n", + tcp_hndl, tcp_hndl->state); + + xio_tcp_flush_all_tasks(tcp_hndl); + + xio_transport_notify_observer(&tcp_hndl->base, + XIO_TRANSPORT_EVENT_CLOSED, + NULL); + + tcp_hndl->state = XIO_TRANSPORT_STATE_DESTROYED; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_single_sock_del_ev_handlers */ +/*---------------------------------------------------------------------------*/ +int 
xio_tcp_single_sock_del_ev_handlers(struct xio_tcp_transport *tcp_hndl) +{ + xio_tcp_restore_callbacks(&tcp_hndl->socket.ctl); + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_dual_sock_del_ev_handlers */ +/*---------------------------------------------------------------------------*/ +int xio_tcp_dual_sock_del_ev_handlers(struct xio_tcp_transport *tcp_hndl) +{ + xio_tcp_restore_callbacks(&tcp_hndl->socket.ctl); + xio_tcp_restore_callbacks(&tcp_hndl->socket.data); + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* on_sock_disconnected */ +/*---------------------------------------------------------------------------*/ +void on_sock_disconnected(struct xio_tcp_transport *tcp_hndl, + int passive_close) +{ + struct xio_tcp_pending_conn *pconn, *next_pconn; + + TRACE_LOG("on_sock_disconnected. tcp_hndl:%p, state:%d\n", + tcp_hndl, tcp_hndl->state); + if (tcp_hndl->state == XIO_TRANSPORT_STATE_DISCONNECTED) { + TRACE_LOG("call to close. tcp_hndl:%p\n", + tcp_hndl); + tcp_hndl->state = XIO_TRANSPORT_STATE_CLOSED; + + if (tcp_hndl->socket.ops->del_ev_handlers) + tcp_hndl->socket.ops->del_ev_handlers(tcp_hndl); + + xio_context_disable_event( + &tcp_hndl->socket.ctl.conn_establish_event_data); + xio_context_disable_event( + &tcp_hndl->socket.data.conn_establish_event_data); + xio_context_disable_event(&tcp_hndl->socket.accept_event_data); + xio_context_disable_event(&tcp_hndl->ctl_rx_event); + xio_context_disable_event(&tcp_hndl->data_rx_event); + xio_context_disable_event(&tcp_hndl->flush_tx_event); + xio_context_disable_event(&tcp_hndl->disconnect_event); + + if (!passive_close && !tcp_hndl->is_listen) { /*active close*/ + tcp_hndl->socket.ops->shutdown(&tcp_hndl->socket); + } + tcp_hndl->socket.ops->close(&tcp_hndl->socket); + + list_for_each_entry_safe(pconn, next_pconn, + &tcp_hndl->pending_conns, + conns_list_entry) { + xio_tcp_restore_callbacks_from(pconn->sock, + &tcp_hndl->socket.ctl); + sock_release(pconn->sock); + pconn->sock = NULL; + xio_tcp_pending_conn_remove_handler(pconn); + } + + if (passive_close) { + xio_transport_notify_observer( + &tcp_hndl->base, + XIO_TRANSPORT_EVENT_DISCONNECTED, + NULL); + } + } +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_post_close_handler */ +/*---------------------------------------------------------------------------*/ +static void xio_tcp_post_close_handler(void *xio_tcp_hndl) +{ + struct xio_tcp_transport *tcp_hndl = xio_tcp_hndl; + + xio_context_destroy_resume(tcp_hndl->base.ctx); + xio_tcp_post_close(tcp_hndl, 1); +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_post_close */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_post_close(struct xio_tcp_transport *tcp_hndl, + int force_free) +{ + int event_pending = 0; + struct xio_tcp_pending_conn *pconn, *next_pconn; + + TRACE_LOG("tcp transport: [post close] handle:%p, force_free=%d\n", + tcp_hndl, force_free); + + if (force_free) + goto free; + + event_pending |= xio_context_is_pending_event( + &tcp_hndl->socket.ctl.conn_establish_event_data); + event_pending |= xio_context_is_pending_event( + &tcp_hndl->socket.data.conn_establish_event_data); + event_pending |= xio_context_is_pending_event( + &tcp_hndl->socket.accept_event_data); + event_pending |= xio_context_is_pending_event(&tcp_hndl->ctl_rx_event); + event_pending |= 
xio_context_is_pending_event(&tcp_hndl->data_rx_event); + event_pending |= xio_context_is_pending_event( + &tcp_hndl->flush_tx_event); + event_pending |= xio_context_is_pending_event( + &tcp_hndl->disconnect_event); + + event_pending |= !list_empty(&tcp_hndl->pending_conns); + + if (event_pending) { + tcp_hndl->disconnect_event.data = tcp_hndl; + tcp_hndl->disconnect_event.handler = xio_tcp_post_close_handler; + xio_context_add_event(tcp_hndl->base.ctx, + &tcp_hndl->disconnect_event); + return 1; + } + +free: + TRACE_LOG("tcp transport: [post close - free] handle:%p\n", + tcp_hndl); + + xio_observable_unreg_all_observers(&tcp_hndl->base.observable); + XIO_OBSERVABLE_DESTROY(&tcp_hndl->base.observable); + + list_for_each_entry_safe(pconn, next_pconn, + &tcp_hndl->pending_conns, + conns_list_entry) { + kfree(pconn); + } + + kfree(tcp_hndl->tmp_rx_buf); + tcp_hndl->tmp_rx_buf = NULL; + + kfree(tcp_hndl->base.portal_uri); + tcp_hndl->base.portal_uri = NULL; + + kfree(tcp_hndl); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_close_cb */ +/*---------------------------------------------------------------------------*/ +static void xio_tcp_close_cb(struct kref *kref) +{ + struct xio_transport_base *transport = container_of( + kref, struct xio_transport_base, kref); + struct xio_tcp_transport *tcp_hndl = + (struct xio_tcp_transport *)transport; + + /* now it is zero */ + TRACE_LOG("xio_tcp_close: [close] handle:%p, socket:%p\n", + tcp_hndl, tcp_hndl->socket.ctl.ksock); + + switch (tcp_hndl->state) { + case XIO_TRANSPORT_STATE_LISTEN: + case XIO_TRANSPORT_STATE_CONNECTED: + tcp_hndl->state = XIO_TRANSPORT_STATE_DISCONNECTED; + /*fallthrough*/ + case XIO_TRANSPORT_STATE_DISCONNECTED: + on_sock_disconnected(tcp_hndl, 0); + /*fallthrough*/ + case XIO_TRANSPORT_STATE_CLOSED: + on_sock_close(tcp_hndl); + break; + default: + xio_transport_notify_observer( + &tcp_hndl->base, + XIO_TRANSPORT_EVENT_CLOSED, + NULL); + tcp_hndl->state = XIO_TRANSPORT_STATE_DESTROYED; + break; + } + + if (tcp_hndl->state == XIO_TRANSPORT_STATE_DESTROYED) + xio_tcp_post_close(tcp_hndl, 0); +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_close */ +/*---------------------------------------------------------------------------*/ +static void xio_tcp_close(struct xio_transport_base *transport) +{ + int was = atomic_read(&transport->kref.refcount); + + /* this is only for debugging - please note that the combination of + * atomic_read and kref_put is not atomic - please remove if this + * error does not pop up. Otherwise contact me and report bug. + */ + + /* was already 0 */ + if (!was) { + ERROR_LOG("xio_tcp_close double close. handle:%p\n", + transport); + return; + } + + kref_put(&transport->kref, xio_tcp_close_cb); +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_single_sock_shutdown */ +/*---------------------------------------------------------------------------*/ +int xio_tcp_single_sock_shutdown(struct xio_tcp_socket *socket) +{ + int retval; + + retval = kernel_sock_shutdown(socket->ctl.ksock, SHUT_RDWR); + if (retval) { + xio_set_error(-retval); + DEBUG_LOG("tcp shutdown failed. 
(errno=%d)\n", -retval); + } + + return retval; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_single_sock_close */ +/*---------------------------------------------------------------------------*/ +int xio_tcp_single_sock_close(struct xio_tcp_socket *socket) +{ + DEBUG_LOG("release socket\n"); + xio_tcp_restore_callbacks(&socket->ctl); + sock_release(socket->ctl.ksock); + socket->ctl.ksock = NULL; + socket->data.ksock = NULL; + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_dual_sock_shutdown */ +/*---------------------------------------------------------------------------*/ +int xio_tcp_dual_sock_shutdown(struct xio_tcp_socket *socket) +{ + int retval1, retval2; + + retval1 = kernel_sock_shutdown(socket->ctl.ksock, SHUT_RDWR); + if (retval1) { + xio_set_error(-retval1); + DEBUG_LOG("tcp shutdown failed. (errno=%d)\n", -retval1); + } + + retval2 = kernel_sock_shutdown(socket->data.ksock, SHUT_RDWR); + if (retval2) { + xio_set_error(-retval2); + DEBUG_LOG("tcp shutdown failed. (errno=%d)\n", -retval2); + } + + return (retval1 | retval2); +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_dual_sock_close */ +/*---------------------------------------------------------------------------*/ +int xio_tcp_dual_sock_close(struct xio_tcp_socket *socket) +{ + DEBUG_LOG("release sockets\n"); + xio_tcp_restore_callbacks(&socket->ctl); + sock_release(socket->ctl.ksock); + socket->ctl.ksock = NULL; + xio_tcp_restore_callbacks(&socket->data); + sock_release(socket->data.ksock); + socket->data.ksock = NULL; + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_reject */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_reject(struct xio_transport_base *transport) +{ + struct xio_tcp_transport *tcp_hndl = + (struct xio_tcp_transport *)transport; + int retval; + + ERROR_LOG("tcp transport reject - not fully implemented yet!"); + + tcp_hndl->socket.ops->shutdown(&tcp_hndl->socket); + + retval = tcp_hndl->socket.ops->close(&tcp_hndl->socket); + if (retval) + return -1; + + TRACE_LOG("tcp transport: [reject] handle:%p\n", tcp_hndl); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_context_shutdown */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_context_shutdown(struct xio_transport_base *trans_hndl, + struct xio_context *ctx) +{ + struct xio_tcp_transport *tcp_hndl = + (struct xio_tcp_transport *)trans_hndl; + + TRACE_LOG("tcp transport context_shutdown handle:%p\n", tcp_hndl); + + switch (tcp_hndl->state) { + case XIO_TRANSPORT_STATE_INIT: + ERROR_LOG("shutting context while tcp_hndl=%p state is INIT?\n", + tcp_hndl); + case XIO_TRANSPORT_STATE_LISTEN: + case XIO_TRANSPORT_STATE_CONNECTING: + case XIO_TRANSPORT_STATE_CONNECTED: + tcp_hndl->state = XIO_TRANSPORT_STATE_DISCONNECTED; + /*fallthrough*/ + case XIO_TRANSPORT_STATE_DISCONNECTED: + on_sock_disconnected(tcp_hndl, 0); + break; + default: + break; + } + + tcp_hndl->state = XIO_TRANSPORT_STATE_DESTROYED; + xio_tcp_flush_all_tasks(tcp_hndl); + + xio_transport_notify_observer(&tcp_hndl->base, + XIO_TRANSPORT_EVENT_CLOSED, + NULL); + + if (xio_tcp_post_close(tcp_hndl, 0)) + xio_context_destroy_wait(tcp_hndl->base.ctx); + + return 0; +} + 
+/*---------------------------------------------------------------------------*/ +/* xio_tcp_disconnect_handler */ +/*---------------------------------------------------------------------------*/ +static void xio_tcp_disconnect_handler(void *xio_tcp_hndl) +{ + struct xio_tcp_transport *tcp_hndl = xio_tcp_hndl; + + on_sock_disconnected(tcp_hndl, 1); +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_disconnect_helper */ +/*---------------------------------------------------------------------------*/ +void xio_tcp_disconnect_helper(void *xio_tcp_hndl) +{ + struct xio_tcp_transport *tcp_hndl = xio_tcp_hndl; + + if (tcp_hndl->state >= XIO_TRANSPORT_STATE_DISCONNECTED) + return; + + tcp_hndl->state = XIO_TRANSPORT_STATE_DISCONNECTED; + + /* flush all tasks in completion */ + if (!list_empty(&tcp_hndl->in_flight_list)) { + struct xio_task *task = NULL; + + task = list_last_entry(&tcp_hndl->in_flight_list, + struct xio_task, + tasks_list_entry); + if (task) { + XIO_TO_TCP_TASK(task, tcp_task); + + xio_context_add_event(tcp_hndl->base.ctx, + &tcp_task->comp_event); + } + } + xio_context_add_event(tcp_hndl->base.ctx, &tcp_hndl->disconnect_event); +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_flush_tx_handler */ +/*---------------------------------------------------------------------------*/ +void xio_tcp_flush_tx_handler(void *xio_tcp_hndl) +{ + struct xio_tcp_transport *tcp_hndl = xio_tcp_hndl; + + xio_tcp_xmit(tcp_hndl); +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_single_sock_rx_ctl_handler */ +/*---------------------------------------------------------------------------*/ +int xio_tcp_single_sock_rx_ctl_handler(struct xio_tcp_transport *tcp_hndl, + int *resched) +{ + return xio_tcp_rx_ctl_handler(tcp_hndl, 1, resched); +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_dual_sock_rx_ctl_handler */ +/*---------------------------------------------------------------------------*/ +int xio_tcp_dual_sock_rx_ctl_handler(struct xio_tcp_transport *tcp_hndl, + int *resched) +{ + return xio_tcp_rx_ctl_handler(tcp_hndl, RX_BATCH, resched); +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_consume_ctl_rx */ +/*---------------------------------------------------------------------------*/ +void xio_tcp_consume_ctl_rx(void *xio_tcp_hndl) +{ + struct xio_tcp_transport *tcp_hndl = xio_tcp_hndl; + int retval = 0, count = 0; + int resched = 0; + + do { + retval = tcp_hndl->socket.ops->rx_ctl_handler(tcp_hndl, + &resched); + ++count; + } while (retval > 0 && count < RX_POLL_NR_MAX); + + if (resched && tcp_hndl->state == XIO_TRANSPORT_STATE_CONNECTED) { + xio_context_add_event(tcp_hndl->base.ctx, + &tcp_hndl->ctl_rx_event); + } +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_consume_data_rx */ +/*---------------------------------------------------------------------------*/ +void xio_tcp_consume_data_rx(void *xio_tcp_hndl) +{ + struct xio_tcp_transport *tcp_hndl = xio_tcp_hndl; + int retval = 0, count = 0; + int resched = 0; + + do { + retval = tcp_hndl->socket.ops->rx_data_handler(tcp_hndl, + RX_BATCH, + &resched); + ++count; + } while (retval > 0 && count < RX_POLL_NR_MAX); + + if (resched && tcp_hndl->state == XIO_TRANSPORT_STATE_CONNECTED) { + xio_context_add_event(tcp_hndl->base.ctx, + &tcp_hndl->data_rx_event); + } +} + 
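+/*
+ * Note on the two consume loops above: each invocation drains at most
+ * RX_POLL_NR_MAX batches so a busy socket cannot starve the rest of the
+ * context event loop; whenever the socket-level handler reports leftover
+ * data through "resched" (and the transport is still connected), the
+ * corresponding ctl/data rx event is simply re-queued on the context
+ * instead of looping further.
+ */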
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 15, 0) +void xio_tcp_data_ready_cb(struct sock *sk) +{ + void (*ready)(struct sock *sk); +#else +void xio_tcp_data_ready_cb(struct sock *sk, int bytes) +{ + void (*ready)(struct sock *sk, int bytes); +#endif + struct xio_tcp_transport *tcp_hndl; + int is_ctl = 0; + + read_lock(&sk->sk_callback_lock); + tcp_hndl = sk->sk_user_data; + if (!tcp_hndl) { /* check for teardown race */ + ready = sk->sk_data_ready; + goto out; + } + + is_ctl = (tcp_hndl->socket.ctl.ksock->sk == sk); + if (is_ctl) { + ready = tcp_hndl->socket.ctl.orig_sk_data_ready; + xio_context_add_event(tcp_hndl->base.ctx, + &tcp_hndl->ctl_rx_event); + } else { + ready = tcp_hndl->socket.data.orig_sk_data_ready; + xio_context_add_event(tcp_hndl->base.ctx, + &tcp_hndl->data_rx_event); + } + +out: + read_unlock(&sk->sk_callback_lock); +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 15, 0) + ready(sk); +#else + ready(sk, bytes); +#endif +} + +void xio_tcp_write_space_cb(struct sock *sk) +{ + void (*write_space)(struct sock *sk); + struct xio_tcp_transport *tcp_hndl; + int is_ctl = 0; + + TRACE_LOG("write space sk %p\n", sk); + + read_lock(&sk->sk_callback_lock); + tcp_hndl = sk->sk_user_data; + if (!tcp_hndl) { /* check for teardown race */ + write_space = sk->sk_write_space; + goto out; + } + is_ctl = (tcp_hndl->socket.ctl.ksock->sk == sk); + if (is_ctl) + write_space = tcp_hndl->socket.ctl.orig_sk_write_space; + else + write_space = tcp_hndl->socket.data.orig_sk_write_space; + + xio_context_add_event(tcp_hndl->base.ctx, &tcp_hndl->flush_tx_event); + +out: + read_unlock(&sk->sk_callback_lock); + write_space(sk); +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_single_sock_add_ev_handlers */ +/*---------------------------------------------------------------------------*/ +int xio_tcp_single_sock_add_ev_handlers(struct xio_tcp_transport *tcp_hndl) +{ + xio_tcp_set_callbacks(tcp_hndl->socket.ctl.ksock, + xio_tcp_data_ready_cb, + xio_tcp_state_change_cb, + xio_tcp_write_space_cb, + tcp_hndl); + + xio_context_add_event(tcp_hndl->base.ctx, &tcp_hndl->ctl_rx_event); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_dual_sock_add_ev_handlers */ +/*---------------------------------------------------------------------------*/ +int xio_tcp_dual_sock_add_ev_handlers(struct xio_tcp_transport *tcp_hndl) +{ + xio_tcp_set_callbacks(tcp_hndl->socket.ctl.ksock, + xio_tcp_data_ready_cb, + xio_tcp_state_change_cb, + xio_tcp_write_space_cb, + tcp_hndl); + + xio_tcp_set_callbacks(tcp_hndl->socket.data.ksock, + xio_tcp_data_ready_cb, + xio_tcp_state_change_cb, + xio_tcp_write_space_cb, + tcp_hndl); + + xio_context_add_event(tcp_hndl->base.ctx, &tcp_hndl->ctl_rx_event); + xio_context_add_event(tcp_hndl->base.ctx, &tcp_hndl->data_rx_event); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_accept */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_accept(struct xio_transport_base *transport) +{ + struct xio_tcp_transport *tcp_hndl = + (struct xio_tcp_transport *)transport; + + if (tcp_hndl->socket.ops->add_ev_handlers(tcp_hndl)) { + xio_transport_notify_observer_error(&tcp_hndl->base, + XIO_E_UNSUCCESSFUL); + } + + TRACE_LOG("tcp transport: [accept] handle:%p\n", tcp_hndl); + + xio_transport_notify_observer( + &tcp_hndl->base, + XIO_TRANSPORT_EVENT_ESTABLISHED, + NULL); + + return 0; +} + 
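+/*
+ * The sk_* callbacks above follow the usual in-kernel pattern for taking
+ * over a socket: the original sk_data_ready/sk_state_change/sk_write_space
+ * pointers are saved under sk_callback_lock (xio_tcp_save_orig_callbacks()),
+ * sk_user_data is pointed at the transport handle, and each replacement
+ * callback only queues the matching xio context event before chaining to
+ * the saved original, so the real work runs on the context event loop
+ * rather than directly inside the socket callback.
+ */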
+/*---------------------------------------------------------------------------*/ +/* xio_tcp_socket_create */ +/*---------------------------------------------------------------------------*/ +struct socket *xio_tcp_socket_create(void) +{ + int retval, optval = 1; + struct socket *sock = NULL; + + retval = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock); + if (retval < 0) { + xio_set_error(-retval); + ERROR_LOG("create socket failed. (errno=%d)\n", -retval); + goto cleanup; + } + + retval = kernel_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, + (char *)&optval, sizeof(optval)); + if (retval) { + xio_set_error(-retval); + ERROR_LOG("setsockopt failed. (errno=%d)\n", -retval); + goto cleanup; + } + + if (tcp_options.tcp_no_delay) { + retval = kernel_setsockopt(sock, IPPROTO_TCP, TCP_NODELAY, + (char *)&optval, sizeof(int)); + if (retval) { + xio_set_error(-retval); + ERROR_LOG("setsockopt failed. (errno=%d)\n", -retval); + goto cleanup; + } + } + + optval = tcp_options.tcp_so_sndbuf; + retval = kernel_setsockopt(sock, SOL_SOCKET, SO_SNDBUF, + (char *)&optval, sizeof(optval)); + if (retval) { + xio_set_error(-retval); + ERROR_LOG("setsockopt failed. (errno=%d)\n", -retval); + goto cleanup; + } + optval = tcp_options.tcp_so_rcvbuf; + retval = kernel_setsockopt(sock, SOL_SOCKET, SO_RCVBUF, + (char *)&optval, sizeof(optval)); + if (retval) { + xio_set_error(-retval); + ERROR_LOG("setsockopt failed. (errno=%d)\n", -retval); + goto cleanup; + } + + return sock; + +cleanup: + if (sock) + sock_release(sock); + return NULL; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_single_sock_create */ +/*---------------------------------------------------------------------------*/ +int xio_tcp_single_sock_create(struct xio_tcp_socket *socket) +{ + socket->ctl.ksock = xio_tcp_socket_create(); + if (!socket->ctl.ksock) + return -1; + + socket->data.ksock = socket->ctl.ksock; + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_dual_sock_create */ +/*---------------------------------------------------------------------------*/ +int xio_tcp_dual_sock_create(struct xio_tcp_socket *socket) +{ + socket->ctl.ksock = xio_tcp_socket_create(); + if (!socket->ctl.ksock) + return -1; + + socket->data.ksock = xio_tcp_socket_create(); + if (!socket->data.ksock) { + sock_release(socket->ctl.ksock); + socket->ctl.ksock = NULL; + return -1; + } + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_transport_create */ +/*---------------------------------------------------------------------------*/ +struct xio_tcp_transport *xio_tcp_transport_create( + struct xio_transport *transport, + struct xio_context *ctx, + struct xio_observer *observer, + int create_socket) +{ + struct xio_tcp_transport *tcp_hndl; + + /*allocate tcp handl */ + tcp_hndl = kzalloc(sizeof(*tcp_hndl), GFP_KERNEL); + if (!tcp_hndl) { + xio_set_error(ENOMEM); + ERROR_LOG("kzalloc failed. %m\n"); + return NULL; + } + + XIO_OBSERVABLE_INIT(&tcp_hndl->base.observable, tcp_hndl); + + if (tcp_options.enable_mem_pool) { + tcp_hndl->tcp_mempool = xio_mempool_get(ctx); + if (!tcp_hndl->tcp_mempool) { + xio_set_error(ENOMEM); + ERROR_LOG("allocating tcp mempool failed. 
%m\n"); + goto cleanup; + } + } + + tcp_hndl->base.portal_uri = NULL; + tcp_hndl->base.proto = XIO_PROTO_TCP; + kref_init(&tcp_hndl->base.kref); + tcp_hndl->transport = transport; + tcp_hndl->base.ctx = ctx; + tcp_hndl->is_listen = 0; + + tcp_hndl->tmp_rx_buf = NULL; + tcp_hndl->tmp_rx_buf_cur = NULL; + tcp_hndl->tmp_rx_buf_len = 0; + + tcp_hndl->tx_ready_tasks_num = 0; + tcp_hndl->tx_comp_cnt = 0; + + memset(&tcp_hndl->tmp_work, 0, sizeof(struct xio_tcp_work_req)); + tcp_hndl->tmp_work.msg.msg_flags = MSG_NOSIGNAL | MSG_DONTWAIT; + tcp_hndl->tmp_work.msg_iov = tcp_hndl->tmp_iovec; + + /* create tcp socket */ + if (create_socket) { + memcpy(tcp_hndl->socket.ops, (tcp_options.tcp_dual_sock ? + &dual_sock_ops : &single_sock_ops), sizeof(*tcp_hndl->socket.ops)); + if (tcp_hndl->socket.ops->open(&tcp_hndl->socket)) + goto cleanup; + } + + /* from now on don't allow changes */ + tcp_hndl->max_inline_buf_sz = xio_tcp_get_inline_buffer_size(); + tcp_hndl->membuf_sz = tcp_hndl->max_inline_buf_sz; + + if (observer) + xio_observable_reg_observer(&tcp_hndl->base.observable, + observer); + + INIT_LIST_HEAD(&tcp_hndl->in_flight_list); + INIT_LIST_HEAD(&tcp_hndl->tx_ready_list); + INIT_LIST_HEAD(&tcp_hndl->tx_comp_list); + INIT_LIST_HEAD(&tcp_hndl->rx_list); + INIT_LIST_HEAD(&tcp_hndl->io_list); + + INIT_LIST_HEAD(&tcp_hndl->pending_conns); + + tcp_hndl->socket.accept_event_data.handler = xio_tcp_accept_connections; + tcp_hndl->socket.accept_event_data.data = tcp_hndl; + tcp_hndl->ctl_rx_event.handler = xio_tcp_consume_ctl_rx; + tcp_hndl->ctl_rx_event.data = tcp_hndl; + tcp_hndl->data_rx_event.handler = xio_tcp_consume_data_rx; + tcp_hndl->data_rx_event.data = tcp_hndl; + tcp_hndl->flush_tx_event.handler = xio_tcp_flush_tx_handler; + tcp_hndl->flush_tx_event.data = tcp_hndl; + tcp_hndl->disconnect_event.handler = xio_tcp_disconnect_handler; + tcp_hndl->disconnect_event.data = tcp_hndl; + + TRACE_LOG("xio_tcp_open: [new] handle:%p\n", tcp_hndl); + + return tcp_hndl; + +cleanup: + kfree(tcp_hndl); + + return NULL; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_pending_conn_remove_handler */ +/*---------------------------------------------------------------------------*/ +void xio_tcp_pending_conn_remove_handler(void *user_data) +{ + struct xio_tcp_pending_conn *pending_conn = user_data; + + if (xio_context_is_pending_event( + &pending_conn->pending_event_data)) { + pending_conn->pending_event_data.data = pending_conn; + pending_conn->pending_event_data.handler = + xio_tcp_pending_conn_remove_handler; + xio_context_add_event(pending_conn->parent->base.ctx, + &pending_conn->pending_event_data); + } else { + list_del(&pending_conn->conns_list_entry); + kfree(pending_conn); + } +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_handle_pending_conn */ +/*---------------------------------------------------------------------------*/ +void xio_tcp_handle_pending_conn(void *user_data) +{ + int retval = 0; + struct xio_tcp_pending_conn *pending_conn = user_data; + struct xio_tcp_pending_conn *matching_conn = NULL; + struct xio_tcp_transport *parent_hndl = pending_conn->parent; + struct xio_tcp_pending_conn *pconn = NULL, *next_pconn = NULL; + struct xio_tcp_pending_conn *ctl_conn = NULL, *data_conn = NULL; + void *buf; + int is_single = 1; + struct socket *ctl_sock = NULL, *data_sock = NULL; + socklen_t len = 0; + struct xio_tcp_transport *child_hndl = NULL; + union xio_transport_event_data ev_data; + + 
DEBUG_LOG("parent_hndl=%p\n", parent_hndl); + + buf = &pending_conn->msg; + buf += sizeof(struct xio_tcp_connect_msg) - + pending_conn->waiting_for_bytes; + while (pending_conn->waiting_for_bytes) { + struct msghdr msg; + struct kvec vec; + + memset(&msg, 0, sizeof(msg)); + msg.msg_flags = MSG_NOSIGNAL | MSG_DONTWAIT; + vec.iov_base = buf; + vec.iov_len = pending_conn->waiting_for_bytes; + retval = kernel_recvmsg(pending_conn->sock, &msg, &vec, 1, + pending_conn->waiting_for_bytes, + msg.msg_flags); + if (retval > 0) { + pending_conn->waiting_for_bytes -= retval; + buf += retval; + } else if (retval == 0) { + ERROR_LOG("got EOF while establishing connection\n"); + goto cleanup; + } else { + if (retval != -EAGAIN) { + ERROR_LOG("recv return with errno=%d\n", + -retval); + goto cleanup; + } + return; + } + } + + DEBUG_LOG("got init msg\n"); + + UNPACK_LVAL(&pending_conn->msg, &pending_conn->msg, sock_type); + UNPACK_SVAL(&pending_conn->msg, &pending_conn->msg, second_port); + UNPACK_SVAL(&pending_conn->msg, &pending_conn->msg, pad); + + if (pending_conn->msg.sock_type == XIO_TCP_SINGLE_SOCK) { + ctl_conn = pending_conn; + ctl_sock = pending_conn->sock; + goto single_sock; + } + + is_single = 0; + + list_for_each_entry_safe(pconn, next_pconn, + &parent_hndl->pending_conns, + conns_list_entry) { + if (pconn->waiting_for_bytes) + continue; + + if (pconn->sa.sa.sa_family == AF_INET) { + if ((pconn->msg.second_port == + ntohs(pending_conn->sa.sa_in.sin_port)) && + (pconn->sa.sa_in.sin_addr.s_addr == + pending_conn->sa.sa_in.sin_addr.s_addr)) { + matching_conn = pconn; + if (ntohs(matching_conn->sa.sa_in.sin_port) != + pending_conn->msg.second_port) { + ERROR_LOG("ports mismatch\n"); + return; + } + break; + } + } else if (pconn->sa.sa.sa_family == AF_INET6) { + if ((pconn->msg.second_port == + ntohs(pending_conn->sa.sa_in6.sin6_port)) && + !memcmp(&pconn->sa.sa_in6.sin6_addr, + &pending_conn->sa.sa_in6.sin6_addr, + sizeof(pconn->sa.sa_in6.sin6_addr))) { + matching_conn = pconn; + if (ntohs(matching_conn->sa.sa_in6.sin6_port) + != pending_conn->msg.second_port) { + ERROR_LOG("ports mismatch\n"); + return; + } + break; + } + } else { + ERROR_LOG("unknown family %d\n", + pconn->sa.sa.sa_family); + } + } + + if (!matching_conn) + return; + + if (pending_conn->msg.sock_type == XIO_TCP_CTL_SOCK) { + ctl_conn = pending_conn; + data_conn = matching_conn; + } else if (pending_conn->msg.sock_type == XIO_TCP_DATA_SOCK) { + ctl_conn = matching_conn; + data_conn = pending_conn; + } + ctl_sock = ctl_conn->sock; + data_sock = data_conn->sock; + +single_sock: + child_hndl = xio_tcp_transport_create(parent_hndl->transport, + parent_hndl->base.ctx, + NULL, + 0); + if (!child_hndl) { + ERROR_LOG("failed to create tcp child\n"); + xio_transport_notify_observer_error(&parent_hndl->base, + xio_errno()); + goto cleanup; + } + + memcpy(&child_hndl->base.peer_addr, + &ctl_conn->sa.sa_stor, + sizeof(child_hndl->base.peer_addr)); + + if (is_single) { + child_hndl->socket.ctl.ksock = ctl_sock; + child_hndl->socket.data.ksock = ctl_sock; + memcpy(child_hndl->socket.ops, &single_sock_ops, + sizeof(*child_hndl->socket.ops)); + } else { + child_hndl->socket.ctl.ksock = ctl_sock; + child_hndl->socket.data.ksock = data_sock; + memcpy(child_hndl->socket.ops, &dual_sock_ops, + sizeof(*child_hndl->socket.ops)); + + child_hndl->tmp_rx_buf = kzalloc(TMP_RX_BUF_SIZE, GFP_KERNEL); + if (!child_hndl->tmp_rx_buf) { + xio_set_error(ENOMEM); + ERROR_LOG("kzalloc failed.\n"); + goto cleanup; + } + child_hndl->tmp_rx_buf_cur = 
child_hndl->tmp_rx_buf; + } + + xio_tcp_save_orig_callbacks_from(&child_hndl->socket.ctl, + &parent_hndl->socket.ctl); + xio_tcp_restore_callbacks(&child_hndl->socket.ctl); + xio_tcp_pending_conn_remove_handler(ctl_conn); + + if (!is_single) { + xio_tcp_save_orig_callbacks_from(&child_hndl->socket.data, + &parent_hndl->socket.ctl); + xio_tcp_restore_callbacks(&child_hndl->socket.data); + xio_tcp_pending_conn_remove_handler(data_conn); + } + + len = sizeof(child_hndl->base.local_addr); + retval = kernel_getsockname( + child_hndl->socket.ctl.ksock, + (struct sockaddr *)&child_hndl->base.local_addr, + &len); + if (retval) { + xio_set_error(-retval); + ERROR_LOG("tcp getsockname failed. (errno=%d)\n", -retval); + } + + child_hndl->state = XIO_TRANSPORT_STATE_CONNECTING; + + ev_data.new_connection.child_trans_hndl = + (struct xio_transport_base *)child_hndl; + xio_transport_notify_observer((struct xio_transport_base *)parent_hndl, + XIO_TRANSPORT_EVENT_NEW_CONNECTION, + &ev_data); + + return; + +cleanup: + if (is_single) + ctl_conn = pending_conn; + if (ctl_sock) + xio_tcp_restore_callbacks_from(ctl_sock, + &parent_hndl->socket.ctl); + xio_tcp_pending_conn_remove_handler(ctl_conn); + sock_release(ctl_sock); + + if (!is_single) { + xio_tcp_restore_callbacks_from(data_sock, + &parent_hndl->socket.ctl); + xio_tcp_pending_conn_remove_handler(data_conn); + sock_release(data_sock); + } + + if (child_hndl) + xio_tcp_post_close(child_hndl, 1); +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_pending_conn_ev_handler */ +/*---------------------------------------------------------------------------*/ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 15, 0) +void xio_tcp_pending_conn_ev_handler(struct sock *sk) +{ + void (*ready)(struct sock *sk); +#else +void xio_tcp_pending_conn_ev_handler(struct sock *sk, int bytes) +{ + void (*ready)(struct sock *sk, int bytes); +#endif + struct xio_tcp_pending_conn *pending_conn; + + read_lock(&sk->sk_callback_lock); + + pending_conn = sk->sk_user_data; + + DEBUG_LOG("pending conn %p ready, sk=%p\n", pending_conn, sk); + + if (!pending_conn) { + ready = sk->sk_data_ready; + goto out; + } + + ready = pending_conn->parent->socket.ctl.orig_sk_data_ready; + + pending_conn->pending_event_data.data = pending_conn; + pending_conn->pending_event_data.handler = xio_tcp_handle_pending_conn; + xio_context_add_event(pending_conn->parent->base.ctx, + &pending_conn->pending_event_data); + +out: + read_unlock(&sk->sk_callback_lock); +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 15, 0) + ready(sk); +#else + ready(sk, bytes); +#endif +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_new_connection */ +/*---------------------------------------------------------------------------*/ +int xio_tcp_new_connection(struct xio_tcp_transport *parent_hndl) +{ + int retval; + struct socket *new_sock = NULL; + socklen_t len = sizeof(struct sockaddr_storage); + struct xio_tcp_pending_conn *pending_conn; + + DEBUG_LOG("parent_hndl=%p\n", parent_hndl); + + /* "accept" the connection */ + retval = kernel_accept(parent_hndl->socket.ctl.ksock, + &new_sock, O_NONBLOCK); + if (retval < 0 || !new_sock) { + if (new_sock) + sock_release(new_sock); + xio_set_error(-retval); + if (retval == -EWOULDBLOCK || retval == -EAGAIN) + return retval; + ERROR_LOG("tcp accept failed. 
(errno=%d)\n", -retval); + return retval; + } + + /*allocate pending fd struct */ + pending_conn = kzalloc(sizeof(*pending_conn), GFP_KERNEL); + if (!pending_conn) { + xio_set_error(ENOMEM); + ERROR_LOG("kzalloc failed.\n"); + xio_transport_notify_observer_error(&parent_hndl->base, + xio_errno()); + sock_release(new_sock); + return -ENOMEM; + } + + pending_conn->parent = parent_hndl; + pending_conn->waiting_for_bytes = sizeof(struct xio_tcp_connect_msg); + + retval = kernel_getpeername( + new_sock, + (struct sockaddr *)&pending_conn->sa.sa_stor, &len); + if (retval < 0) { + xio_set_error(-retval); + ERROR_LOG("tcp getpeername failed. (errno=%d)\n", -retval); + kfree(pending_conn); + sock_release(new_sock); + return retval; + } + + pending_conn->sock = new_sock; + + list_add_tail(&pending_conn->conns_list_entry, + &parent_hndl->pending_conns); + + xio_tcp_set_callbacks(new_sock, xio_tcp_pending_conn_ev_handler, + NULL, NULL, pending_conn); + + xio_tcp_handle_pending_conn(pending_conn); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_accept_connections */ +/*---------------------------------------------------------------------------*/ +void xio_tcp_accept_connections(void *user_data) +{ + struct xio_tcp_transport *parent_hndl = user_data; + + DEBUG_LOG("try to accept connections\n"); + + xio_tcp_new_connection(parent_hndl); + + /* + * if accept was successful, try to accept another one later. + */ + if (!xio_tcp_new_connection(parent_hndl)) { + xio_context_add_event( + parent_hndl->base.ctx, + &parent_hndl->socket.accept_event_data); + } + + /*todo while ????*/ + /*while (!xio_tcp_new_connection(parent_hndl)) { + cond_resched(); + if (++count > MAX_ACCEPT_BATCH) { + xio_context_add_event( + parent_hndl->base.ctx, + &parent_hndl->sock.cfd_accept_event_data); + break; + } + }*/ +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_listener_ev_handler */ +/*---------------------------------------------------------------------------*/ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 15, 0) +void xio_tcp_listener_ev_handler(struct sock *sk) +{ + void (*ready)(struct sock *sk); +#else +void xio_tcp_listener_ev_handler(struct sock *sk, int bytes) +{ + void (*ready)(struct sock *sk, int bytes); +#endif + struct xio_tcp_transport *tcp_hndl; + + DEBUG_LOG("listen data ready sk %p\n", sk); + + read_lock(&sk->sk_callback_lock); + if (!sk->sk_user_data) { /* check for teardown race */ + ready = sk->sk_data_ready; + goto out; + } + + tcp_hndl = sk->sk_user_data; + ready = tcp_hndl->socket.ctl.orig_sk_data_ready; + + /* + * ->sk_data_ready is also called for a newly established child socket + * before it has been accepted and the accepter has set up their + * data_ready.. 
we only want to queue listen work for our listening + * socket + */ + if (sk->sk_state == TCP_LISTEN) { + xio_context_add_event(tcp_hndl->base.ctx, + &tcp_hndl->socket.accept_event_data); + } +out: + read_unlock(&sk->sk_callback_lock); +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 15, 0) + ready(sk); +#else + ready(sk, bytes); +#endif +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_listen */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_listen(struct xio_transport_base *transport, + const char *portal_uri, uint16_t *src_port, + int backlog) +{ + struct xio_tcp_transport *tcp_hndl = + (struct xio_tcp_transport *)transport; + union xio_sockaddr sa; + int sa_len; + int retval = 0; + uint16_t sport; + + /* resolve the portal_uri */ + sa_len = xio_uri_to_ss(portal_uri, &sa.sa_stor); + if (sa_len == -1) { + xio_set_error(XIO_E_ADDR_ERROR); + ERROR_LOG("address [%s] resolving failed\n", portal_uri); + return -1; + } + tcp_hndl->base.is_client = 0; + + xio_tcp_save_orig_callbacks(&tcp_hndl->socket.ctl); + xio_tcp_set_callbacks(tcp_hndl->socket.ctl.ksock, + xio_tcp_listener_ev_handler, + NULL, + NULL, + tcp_hndl); + + /* bind */ + retval = kernel_bind(tcp_hndl->socket.ctl.ksock, + (struct sockaddr *)&sa.sa_stor, sa_len); + if (retval) { + xio_set_error(-retval); + ERROR_LOG("tcp bind failed. (errno=%d)\n", -retval); + goto exit; + } + + tcp_hndl->is_listen = 1; + + retval = kernel_listen(tcp_hndl->socket.ctl.ksock, + backlog > 0 ? backlog : MAX_BACKLOG); + if (retval) { + xio_set_error(-retval); + ERROR_LOG("tcp listen failed. (errno=%d)\n", -retval); + goto exit; + } + + retval = kernel_getsockname(tcp_hndl->socket.ctl.ksock, + (struct sockaddr *)&sa.sa_stor, + (socklen_t *)&sa_len); + if (retval) { + xio_set_error(-retval); + ERROR_LOG("getsockname failed. (errno=%d)\n", -retval); + goto exit; + } + + switch (sa.sa_stor.ss_family) { + case AF_INET: + sport = ntohs(sa.sa_in.sin_port); + break; + case AF_INET6: + sport = ntohs(sa.sa_in6.sin6_port); + break; + default: + xio_set_error(XIO_E_ADDR_ERROR); + ERROR_LOG("invalid family type %d.\n", sa.sa_stor.ss_family); + goto exit; + } + + if (src_port) + *src_port = sport; + + tcp_hndl->state = XIO_TRANSPORT_STATE_LISTEN; + DEBUG_LOG("listen on [%s] src_port:%d\n", portal_uri, sport); + + return 0; + +exit: + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_conn_established_helper */ +/*---------------------------------------------------------------------------*/ +void xio_tcp_conn_established_helper(struct xio_tcp_transport *tcp_hndl) +{ + int retval = 0; + socklen_t len = 0; + + len = sizeof(tcp_hndl->base.peer_addr); + retval = kernel_getpeername( + tcp_hndl->socket.ctl.ksock, + (struct sockaddr *)&tcp_hndl->base.peer_addr, + &len); + if (retval) { + xio_set_error(-retval); + ERROR_LOG("tcp getpeername failed. (errno=%d)\n", -retval); + goto cleanup; + } + tcp_hndl->state = XIO_TRANSPORT_STATE_CONNECTING; + + retval = tcp_hndl->socket.ops->add_ev_handlers(tcp_hndl); + if (retval) { + ERROR_LOG("setting connection handler failed. 
(errno=%d)\n", + -retval); + goto cleanup; + } + + xio_transport_notify_observer(&tcp_hndl->base, + XIO_TRANSPORT_EVENT_ESTABLISHED, + NULL); + + return; + +cleanup: + if (retval == -ECONNREFUSED) + xio_transport_notify_observer(&tcp_hndl->base, + XIO_TRANSPORT_EVENT_REFUSED, + NULL); + else + xio_transport_notify_observer_error(&tcp_hndl->base, + XIO_E_CONNECT_ERROR); +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_single_conn_established_ev_handler */ +/*---------------------------------------------------------------------------*/ +void xio_tcp_single_conn_established_ev_handler(void *user_context) +{ + struct xio_tcp_transport *tcp_hndl = user_context; + int retval = 0; + struct xio_tcp_connect_msg msg; + + if (test_bits(XIO_SOCK_ESTABLISH_CTL, + &tcp_hndl->socket.establish_states)) { + return; + } + set_bits(XIO_SOCK_ESTABLISH_CTL, &tcp_hndl->socket.establish_states); + + msg.sock_type = XIO_TCP_SINGLE_SOCK; + msg.second_port = 0; + msg.pad = 0; + retval = xio_tcp_send_connect_msg(tcp_hndl->socket.ctl.ksock, &msg); + if (retval) + goto cleanup; + + xio_tcp_conn_established_helper(tcp_hndl); + + return; + +cleanup: + if (retval == -ECONNREFUSED) + xio_transport_notify_observer(&tcp_hndl->base, + XIO_TRANSPORT_EVENT_REFUSED, + NULL); + else + xio_transport_notify_observer_error(&tcp_hndl->base, + XIO_E_CONNECT_ERROR); +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_connect_helper */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_connect_helper(struct socket *sock, struct sockaddr *sa, + socklen_t sa_len, uint16_t *bound_port, + struct sockaddr_storage *lss) +{ + int retval; + union xio_sockaddr *lsa = (union xio_sockaddr *)lss; + struct sockaddr_storage sa_stor; + socklen_t lsa_len = sizeof(struct sockaddr_storage); + + DEBUG_LOG("connect sock=%p\n", sock); + retval = kernel_connect(sock, sa, sa_len, O_NONBLOCK); + if (retval) { + if (retval == -EINPROGRESS) { + /*set iomux for write event*/ + } else { + xio_set_error(-retval); + ERROR_LOG("tcp connect failed. (errno=%d)\n", -retval); + return retval; + } + } else { + /*handle in ev_handler*/ + } + + if (!lss) + lsa = (union xio_sockaddr *)&sa_stor; + + retval = kernel_getsockname(sock, &lsa->sa, &lsa_len); + if (retval) { + xio_set_error(-retval); + ERROR_LOG("tcp getsockname failed. 
(errno=%d %m)\n", -retval); + return retval; + } + + if (lsa->sa.sa_family == AF_INET) { + *bound_port = ntohs(lsa->sa_in.sin_port); + } else if (lsa->sa.sa_family == AF_INET6) { + *bound_port = ntohs(lsa->sa_in6.sin6_port); + } else { + ERROR_LOG("getsockname unknown family = %d\n", + lsa->sa.sa_family); + return -1; + } + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_ctl_conn_established_ev_handler */ +/*---------------------------------------------------------------------------*/ +void xio_tcp_ctl_conn_established_ev_handler(void *user_context) +{ + struct xio_tcp_transport *tcp_hndl = user_context; + int retval = 0; + struct xio_tcp_connect_msg msg; + + if (test_bits(XIO_SOCK_ESTABLISH_CTL, + &tcp_hndl->socket.establish_states)) { + return; + } + set_bits(XIO_SOCK_ESTABLISH_CTL, &tcp_hndl->socket.establish_states); + + DEBUG_LOG("tcp_hndl=%p\n", tcp_hndl); + msg.sock_type = XIO_TCP_CTL_SOCK; + msg.second_port = tcp_hndl->socket.data.port; + msg.pad = 0; + retval = xio_tcp_send_connect_msg(tcp_hndl->socket.ctl.ksock, &msg); + if (retval) + goto cleanup; + + if (test_bits(XIO_SOCK_ESTABLISH_DATA, + &tcp_hndl->socket.establish_states)) + xio_tcp_conn_established_helper(tcp_hndl); + + return; + +cleanup: + xio_transport_notify_observer_error(&tcp_hndl->base, + XIO_E_CONNECT_ERROR); +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_data_conn_established_ev_handler */ +/*---------------------------------------------------------------------------*/ +void xio_tcp_data_conn_established_ev_handler(void *user_context) +{ + struct xio_tcp_transport *tcp_hndl = user_context; + int retval = 0; + struct xio_tcp_connect_msg msg; + + if (test_bits(XIO_SOCK_ESTABLISH_DATA, + &tcp_hndl->socket.establish_states)) { + return; + } + set_bits(XIO_SOCK_ESTABLISH_DATA, &tcp_hndl->socket.establish_states); + + DEBUG_LOG("tcp_hndl=%p\n", tcp_hndl); + + msg.sock_type = XIO_TCP_DATA_SOCK; + msg.second_port = tcp_hndl->socket.ctl.port; + msg.pad = 0; + retval = xio_tcp_send_connect_msg(tcp_hndl->socket.data.ksock, &msg); + if (retval) + goto cleanup; + + if (test_bits(XIO_SOCK_ESTABLISH_CTL, + &tcp_hndl->socket.establish_states)) + xio_tcp_conn_established_helper(tcp_hndl); + + return; + +cleanup: + xio_transport_notify_observer_error(&tcp_hndl->base, + XIO_E_CONNECT_ERROR); +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_single_sock_connect */ +/*---------------------------------------------------------------------------*/ +int xio_tcp_single_sock_connect(struct xio_tcp_transport *tcp_hndl, + struct sockaddr *sa, + socklen_t sa_len) +{ + int retval; + + tcp_hndl->socket.ctl.conn_establish_event_data.data = tcp_hndl; + tcp_hndl->socket.ctl.conn_establish_event_data.handler = + xio_tcp_single_conn_established_ev_handler; + + xio_tcp_save_orig_callbacks(&tcp_hndl->socket.ctl); + xio_tcp_set_callbacks(tcp_hndl->socket.ctl.ksock, + NULL, xio_tcp_state_change_cb, NULL, tcp_hndl); + retval = xio_tcp_connect_helper(tcp_hndl->socket.ctl.ksock, sa, sa_len, + &tcp_hndl->socket.ctl.port, + &tcp_hndl->base.local_addr); + if (retval) + return retval; + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_dual_sock_connect */ +/*---------------------------------------------------------------------------*/ +int xio_tcp_dual_sock_connect(struct xio_tcp_transport *tcp_hndl, + struct sockaddr *sa, + socklen_t sa_len) 
+{ + int retval; + + tcp_hndl->tmp_rx_buf = kzalloc(TMP_RX_BUF_SIZE, GFP_KERNEL); + if (!tcp_hndl->tmp_rx_buf) { + xio_set_error(ENOMEM); + ERROR_LOG("ucalloc failed. %m\n"); + return -1; + } + tcp_hndl->tmp_rx_buf_cur = tcp_hndl->tmp_rx_buf; + + tcp_hndl->socket.data.conn_establish_event_data.data = tcp_hndl; + tcp_hndl->socket.data.conn_establish_event_data.handler = + xio_tcp_data_conn_established_ev_handler; + + xio_tcp_save_orig_callbacks(&tcp_hndl->socket.data); + xio_tcp_set_callbacks(tcp_hndl->socket.data.ksock, + NULL, xio_tcp_state_change_cb, NULL, tcp_hndl); + retval = xio_tcp_connect_helper(tcp_hndl->socket.data.ksock, sa, sa_len, + &tcp_hndl->socket.data.port, + NULL); + if (retval) + return retval; + + tcp_hndl->socket.ctl.conn_establish_event_data.data = tcp_hndl; + tcp_hndl->socket.ctl.conn_establish_event_data.handler = + xio_tcp_ctl_conn_established_ev_handler; + + xio_tcp_save_orig_callbacks(&tcp_hndl->socket.ctl); + xio_tcp_set_callbacks(tcp_hndl->socket.ctl.ksock, + NULL, xio_tcp_state_change_cb, NULL, tcp_hndl); + + retval = xio_tcp_connect_helper(tcp_hndl->socket.ctl.ksock, sa, sa_len, + &tcp_hndl->socket.ctl.port, + &tcp_hndl->base.local_addr); + if (retval) + return retval; + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_connect */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_connect(struct xio_transport_base *transport, + const char *portal_uri, const char *out_if_addr) +{ + struct xio_tcp_transport *tcp_hndl = + (struct xio_tcp_transport *)transport; + union xio_sockaddr rsa; + socklen_t rsa_len = 0; + int retval = 0; + + /* resolve the portal_uri */ + rsa_len = xio_uri_to_ss(portal_uri, &rsa.sa_stor); + if (rsa_len == -1) { + xio_set_error(XIO_E_ADDR_ERROR); + ERROR_LOG("address [%s] resolving failed\n", portal_uri); + return -1; + } + /* allocate memory for portal_uri */ + tcp_hndl->base.portal_uri = kstrdup(portal_uri, GFP_KERNEL); + if (!tcp_hndl->base.portal_uri) { + xio_set_error(ENOMEM); + ERROR_LOG("strdup failed. %m\n"); + return -1; + } + tcp_hndl->base.is_client = 1; + + if (out_if_addr) { + union xio_sockaddr if_sa; + int sa_len; + + sa_len = xio_host_port_to_ss(out_if_addr, &if_sa.sa_stor); + if (sa_len == -1) { + xio_set_error(XIO_E_ADDR_ERROR); + ERROR_LOG("outgoing interface [%s] resolving failed\n", + out_if_addr); + goto exit; + } + retval = tcp_hndl->socket.ctl.ksock->ops->bind( + tcp_hndl->socket.ctl.ksock, + (struct sockaddr *)&if_sa.sa_stor, sa_len); + if (retval) { + xio_set_error(-retval); + ERROR_LOG("tcp bind failed. (errno=%d %m)\n", + -retval); + goto exit; + } + } + + /* connect */ + retval = tcp_hndl->socket.ops->connect(tcp_hndl, + (struct sockaddr *)&rsa.sa_stor, + rsa_len); + if (retval) + goto exit; + + return 0; + +exit: + kfree(tcp_hndl->base.portal_uri); + tcp_hndl->base.portal_uri = NULL; + + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_open */ +/*---------------------------------------------------------------------------*/ +static struct xio_transport_base *xio_tcp_open( + struct xio_transport *transport, + struct xio_context *ctx, + struct xio_observer *observer, + uint32_t trans_attr_mask, + struct xio_transport_init_attr *attr) +{ + struct xio_tcp_transport *tcp_hndl; + + tcp_hndl = xio_tcp_transport_create(transport, ctx, observer, 1); + if (!tcp_hndl) { + ERROR_LOG("failed. 
to create tcp transport%m\n"); + return NULL; + } + return (struct xio_transport_base *)tcp_hndl; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_rxd_init */ +/*---------------------------------------------------------------------------*/ +static void xio_tcp_rxd_init(struct xio_tcp_work_req *rxd, + void *buf, unsigned size) +{ + rxd->msg_iov[0].iov_base = buf; + rxd->msg_iov[0].iov_len = sizeof(struct xio_tlv); + rxd->msg_iov[1].iov_base = rxd->msg_iov[0].iov_base + + rxd->msg_iov[0].iov_len; + rxd->msg_iov[1].iov_len = size - sizeof(struct xio_tlv); + rxd->msg_len = 2; + + rxd->tot_iov_byte_len = 0; + + rxd->stage = XIO_TCP_RX_START; + rxd->msg.msg_control = NULL; + rxd->msg.msg_controllen = 0; + rxd->msg.msg_flags = MSG_DONTWAIT; + rxd->msg.msg_name = NULL; + rxd->msg.msg_namelen = 0; + MSGHDR_IOV(&rxd->msg) = NULL; + MSGHDR_IOVLEN(&rxd->msg) = 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_txd_init */ +/*---------------------------------------------------------------------------*/ +static void xio_tcp_txd_init(struct xio_tcp_work_req *txd, + void *buf, unsigned size) +{ + txd->ctl_msg = buf; + txd->ctl_msg_len = 0; + txd->msg_iov[0].iov_base = buf; + txd->msg_iov[0].iov_len = size; + txd->msg_len = 1; + txd->tot_iov_byte_len = 0; + + txd->stage = XIO_TCP_TX_BEFORE; + txd->msg.msg_control = NULL; + txd->msg.msg_controllen = 0; + txd->msg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL; + txd->msg.msg_name = NULL; + txd->msg.msg_namelen = 0; + MSGHDR_IOV(&txd->msg) = NULL; + MSGHDR_IOVLEN(&txd->msg) = 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_task_init */ +/*---------------------------------------------------------------------------*/ +static void xio_tcp_task_init(struct xio_task *task, + struct xio_tcp_transport *tcp_hndl, + void *buf, + unsigned long size) +{ + XIO_TO_TCP_TASK(task, tcp_task); + + tcp_task->buf = buf; + + xio_tcp_rxd_init(&tcp_task->rxd, buf, size); + xio_tcp_txd_init(&tcp_task->txd, buf, size); + + /* initialize the mbuf */ + xio_mbuf_init(&task->mbuf, buf, size, 0); + + memset(&tcp_task->comp_event, 0, sizeof(tcp_task->comp_event)); + tcp_task->comp_event.handler = xio_tcp_tx_completion_handler; + tcp_task->comp_event.data = task; +} + +/* task pools management */ +/*---------------------------------------------------------------------------*/ +/* xio_tcp_initial_pool_slab_pre_create */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_initial_pool_slab_pre_create( + struct xio_transport_base *transport_hndl, + int alloc_nr, + void *pool_dd_data, void *slab_dd_data) +{ + struct xio_tcp_tasks_slab *tcp_slab = + (struct xio_tcp_tasks_slab *)slab_dd_data; + + tcp_slab->buf_size = CONN_SETUP_BUF_SIZE; + + /* The name must be valid until the pool is destroyed + * Use the address of the pool structure to create a unique + * name for the pool + */ + sprintf(tcp_slab->name, "initial_pool-%p", tcp_slab); + tcp_slab->data_pool = kmem_cache_create( + tcp_slab->name, + tcp_slab->buf_size/*pool_size * alloc_nr*/, + PAGE_SIZE, + SLAB_HWCACHE_ALIGN, NULL); + if (!tcp_slab->data_pool) { + xio_set_error(ENOMEM); + ERROR_LOG("kcache(initial_pool) creation failed\n"); + return -1; + } + INFO_LOG("kcache(%s) created(%p)\n", + tcp_slab->name, tcp_slab->data_pool); + tcp_slab->count = 0; + + return 0; +} + +/*---------------------------------------------------------------------------*/ 
+/* xio_tcp_initial_task_alloc */ +/*---------------------------------------------------------------------------*/ +static inline struct xio_task *xio_tcp_initial_task_alloc( + struct xio_tcp_transport *tcp_hndl) +{ + return tcp_hndl->initial_pool_cls.task_get( + tcp_hndl->initial_pool_cls.pool, tcp_hndl); +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_primary_task_alloc */ +/*---------------------------------------------------------------------------*/ +struct xio_task *xio_tcp_primary_task_alloc( + struct xio_tcp_transport *tcp_hndl) +{ + return tcp_hndl->primary_pool_cls.task_get( + tcp_hndl->primary_pool_cls.pool, + tcp_hndl); +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_primary_task_lookup */ +/*---------------------------------------------------------------------------*/ +struct xio_task *xio_tcp_primary_task_lookup( + struct xio_tcp_transport *tcp_hndl, + int tid) +{ + if (tcp_hndl->primary_pool_cls.task_lookup) + return tcp_hndl->primary_pool_cls.task_lookup( + tcp_hndl->primary_pool_cls.pool, tid); + return NULL; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_task_free */ +/*---------------------------------------------------------------------------*/ +inline void xio_tcp_task_free(struct xio_tcp_transport *tcp_hndl, + struct xio_task *task) +{ + if (tcp_hndl->primary_pool_cls.task_put) + return tcp_hndl->primary_pool_cls.task_put(task); +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_initial_pool_post_create */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_initial_pool_post_create( + struct xio_transport_base *transport_hndl, + void *pool, void *pool_dd_data) +{ + struct xio_task *task; + struct xio_tcp_task *tcp_task; + struct xio_tcp_transport *tcp_hndl = + (struct xio_tcp_transport *)transport_hndl; + + if (!tcp_hndl) + return 0; + + tcp_hndl->initial_pool_cls.pool = pool; + + task = xio_tcp_initial_task_alloc(tcp_hndl); + if (!task) { + ERROR_LOG("failed to get task\n"); + } else { + list_add_tail(&task->tasks_list_entry, &tcp_hndl->rx_list); + tcp_task = (struct xio_tcp_task *)task->dd_data; + tcp_task->out_tcp_op = XIO_TCP_RECV; + } + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_initial_pool_slab_destroy */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_initial_pool_slab_destroy( + struct xio_transport_base *transport_hndl, + void *pool_dd_data, void *slab_dd_data) +{ + struct xio_tcp_tasks_slab *tcp_slab = + (struct xio_tcp_tasks_slab *)slab_dd_data; + + INFO_LOG("kcache(%s) freed\n", tcp_slab->name); + + if (tcp_slab->count) + ERROR_LOG("pool(%s) not-free(%d)\n", tcp_slab->name, + tcp_slab->count); + kmem_cache_destroy(tcp_slab->data_pool); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_pool_slab_uninit_task */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_pool_slab_uninit_task(struct xio_transport_base *trans_hndl, + void *pool_dd_data, void *slab_dd_data, + struct xio_task *task) +{ + struct xio_tcp_tasks_slab *tcp_slab = + (struct xio_tcp_tasks_slab *)slab_dd_data; + + XIO_TO_TCP_TASK(task, tcp_task); + + /* Phantom tasks have no buffer */ + if (tcp_task->buf) { + if 
(tcp_slab->count) + tcp_slab->count--; + else + ERROR_LOG("pool(%s) double free?\n", tcp_slab->name); + + kmem_cache_free(tcp_slab->data_pool, tcp_task->buf); + tcp_task->buf = NULL; + } + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_initial_pool_slab_init_task */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_initial_pool_slab_init_task( + struct xio_transport_base *transport_hndl, + void *pool_dd_data, void *slab_dd_data, + int tid, struct xio_task *task) +{ + struct xio_tcp_transport *tcp_hndl = + (struct xio_tcp_transport *)transport_hndl; + struct xio_tcp_tasks_slab *tcp_slab = + (struct xio_tcp_tasks_slab *)slab_dd_data; + void *buf; + char *ptr; + + XIO_TO_TCP_TASK(task, tcp_task); + + if (!tcp_hndl || tcp_task->buf) + return 0; + + + /* fill xio_tcp_task */ + ptr = (char *)tcp_task; + ptr += sizeof(struct xio_tcp_task); + + /* fill xio_tcp_work_req */ + tcp_task->txd.msg_iov = (void *)ptr; + ptr += sizeof(struct iovec); + + tcp_task->rxd.msg_iov = (void *)ptr; + ptr += 2 * sizeof(struct iovec); + /*****************************************/ + + buf = kmem_cache_zalloc(tcp_slab->data_pool, GFP_KERNEL); + if (!buf) { + xio_set_error(ENOMEM); + ERROR_LOG("kmem_cache_zalloc(initial_pool)\n"); + return -ENOMEM; + } + tcp_slab->count++; + + xio_tcp_task_init( + task, + tcp_hndl, + buf, + tcp_slab->buf_size); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_initial_pool_get_params */ +/*---------------------------------------------------------------------------*/ +static void xio_tcp_initial_pool_get_params( + struct xio_transport_base *transport_hndl, + int *start_nr, int *max_nr, int *alloc_nr, + int *pool_dd_sz, int *slab_dd_sz, int *task_dd_sz) +{ + *start_nr = 10 * NUM_CONN_SETUP_TASKS; + *alloc_nr = 10 * NUM_CONN_SETUP_TASKS; + *max_nr = 10 * NUM_CONN_SETUP_TASKS; + + *pool_dd_sz = 0; + *slab_dd_sz = sizeof(struct xio_tcp_tasks_slab); + *task_dd_sz = sizeof(struct xio_tcp_task) + 3 * sizeof(struct iovec); +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_task_pre_put */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_task_pre_put(struct xio_transport_base *trans_hndl, + struct xio_task *task) +{ + XIO_TO_TCP_TASK(task, tcp_task); + unsigned int i; + + /* recycle TCP buffers back to pool */ + + /* put buffers back to pool */ + + for (i = 0; i < tcp_task->read_num_mp_mem; i++) { + if (tcp_task->read_mp_mem[i].cache) { + xio_mempool_free_mp(&tcp_task->read_mp_mem[i]); + tcp_task->read_mp_mem[i].cache = NULL; + } + } + + tcp_task->read_num_mp_mem = 0; + + for (i = 0; i < tcp_task->write_num_mp_mem; i++) { + if (tcp_task->write_mp_mem[i].cache) { + xio_mempool_free_mp(&tcp_task->write_mp_mem[i]); + tcp_task->write_mp_mem[i].cache = NULL; + } + } + + tcp_task->write_num_mp_mem = 0; + tcp_task->req_in_num_sge = 0; + tcp_task->req_out_num_sge = 0; + tcp_task->rsp_out_num_sge = 0; + tcp_task->sn = 0; + + tcp_task->in_tcp_op = XIO_TCP_NULL; + tcp_task->out_tcp_op = XIO_TCP_NULL; + + xio_tcp_rxd_init(&tcp_task->rxd, + task->mbuf.buf.head, + task->mbuf.buf.buflen); + xio_tcp_txd_init(&tcp_task->txd, + task->mbuf.buf.head, + task->mbuf.buf.buflen); + + /* todo how to remove? 
*/ + xio_context_disable_event(&tcp_task->comp_event); + + return 0; +} + +static struct xio_tasks_pool_ops initial_tasks_pool_ops = { + .pool_get_params = xio_tcp_initial_pool_get_params, + .slab_pre_create = xio_tcp_initial_pool_slab_pre_create, + .slab_destroy = xio_tcp_initial_pool_slab_destroy, + .slab_init_task = xio_tcp_initial_pool_slab_init_task, + .slab_uninit_task = xio_tcp_pool_slab_uninit_task, + .pool_post_create = xio_tcp_initial_pool_post_create, + .task_pre_put = xio_tcp_task_pre_put +}; + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_primary_pool_slab_pre_create */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_primary_pool_slab_pre_create( + struct xio_transport_base *transport_hndl, + int alloc_nr, void *pool_dd_data, void *slab_dd_data) +{ + struct xio_tcp_tasks_slab *tcp_slab = + (struct xio_tcp_tasks_slab *)slab_dd_data; + size_t inline_buf_sz = xio_tcp_get_inline_buffer_size(); + + tcp_slab->buf_size = inline_buf_sz; + /* The name must be valid until the pool is destroyed + * Use the address of the pool structure to create a unique + * name for the pool + */ + sprintf(tcp_slab->name, "primary_pool-%p", tcp_slab); + tcp_slab->data_pool = kmem_cache_create(tcp_slab->name, + tcp_slab->buf_size, PAGE_SIZE, + SLAB_HWCACHE_ALIGN, NULL); + if (!tcp_slab->data_pool) { + xio_set_error(ENOMEM); + ERROR_LOG("kcache(primary_pool) creation failed\n"); + return -1; + } + INFO_LOG("kcache(%s) created(%p)\n", + tcp_slab->name, tcp_slab->data_pool); + + DEBUG_LOG("pool buf:%p\n", tcp_slab->data_pool); + tcp_slab->count = 0; + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_primary_pool_post_create */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_primary_pool_post_create( + struct xio_transport_base *transport_hndl, + void *pool, void *pool_dd_data) +{ + struct xio_task *task = NULL; + struct xio_tcp_task *tcp_task = NULL; + int i; + struct xio_tcp_transport *tcp_hndl = + (struct xio_tcp_transport *)transport_hndl; + + if (!tcp_hndl) + return 0; + + tcp_hndl->primary_pool_cls.pool = pool; + + for (i = 0; i < RX_LIST_POST_NR; i++) { + /* get ready to receive message */ + task = xio_tcp_primary_task_alloc(tcp_hndl); + if (task == 0) { + ERROR_LOG("primary task pool is empty\n"); + return -1; + } + tcp_task = task->dd_data; + tcp_task->out_tcp_op = XIO_TCP_RECV; + list_add_tail(&task->tasks_list_entry, &tcp_hndl->rx_list); + } + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_primary_pool_slab_destroy */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_primary_pool_slab_destroy( + struct xio_transport_base *transport_hndl, + void *pool_dd_data, void *slab_dd_data) +{ + struct xio_tcp_tasks_slab *tcp_slab = + (struct xio_tcp_tasks_slab *)slab_dd_data; + + INFO_LOG("kcache(%s) freed cnt:%d\n", tcp_slab->name, tcp_slab->count); + + if (tcp_slab->count) + ERROR_LOG("pool(%s) not-free(%d)\n", + tcp_slab->name, tcp_slab->count); + + kmem_cache_destroy(tcp_slab->data_pool); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_primary_pool_slab_init_task */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_primary_pool_slab_init_task( + struct 
xio_transport_base *transport_hndl, + void *pool_dd_data, + void *slab_dd_data, int tid, struct xio_task *task) +{ + struct xio_tcp_transport *tcp_hndl = + (struct xio_tcp_transport *)transport_hndl; + struct xio_tcp_tasks_slab *tcp_slab = + (struct xio_tcp_tasks_slab *)slab_dd_data; + void *buf; + int max_iovsz = max(tcp_options.max_out_iovsz, + tcp_options.max_in_iovsz) + 1; + char *ptr; + + XIO_TO_TCP_TASK(task, tcp_task); + + if (!tcp_hndl || tcp_task->buf) + return 0; + + /* fill xio_tco_task */ + ptr = (char *)tcp_task; + ptr += sizeof(struct xio_tcp_task); + + /* fill xio_tcp_work_req */ + tcp_task->txd.msg_iov = (void *)ptr; + ptr += (max_iovsz + 1) * sizeof(struct iovec); + tcp_task->rxd.msg_iov = (void *)ptr; + ptr += (max_iovsz + 1) * sizeof(struct iovec); + + tcp_task->read_mp_mem = (void *)ptr; + ptr += max_iovsz * sizeof(struct xio_mp_mem); + tcp_task->write_mp_mem = (void *)ptr; + ptr += max_iovsz * sizeof(struct xio_mp_mem); + + tcp_task->req_in_sge = (void *)ptr; + ptr += max_iovsz * sizeof(struct xio_sge); + tcp_task->req_out_sge = (void *)ptr; + ptr += max_iovsz * sizeof(struct xio_sge); + tcp_task->rsp_out_sge = (void *)ptr; + ptr += max_iovsz * sizeof(struct xio_sge); + /*****************************************/ + + buf = kmem_cache_zalloc(tcp_slab->data_pool, GFP_KERNEL); + if (!buf) { + ERROR_LOG("kmem_cache_zalloc(primary_pool)\n"); + xio_set_error(ENOMEM); + return -ENOMEM; + } + tcp_slab->count++; + + tcp_task->out_tcp_op = 0x200; + xio_tcp_task_init( + task, + tcp_hndl, + buf, + tcp_slab->buf_size); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_primary_pool_get_params */ +/*---------------------------------------------------------------------------*/ +static void xio_tcp_primary_pool_get_params( + struct xio_transport_base *transport_hndl, + int *start_nr, int *max_nr, int *alloc_nr, + int *pool_dd_sz, int *slab_dd_sz, int *task_dd_sz) +{ + int max_iovsz = max(tcp_options.max_out_iovsz, + tcp_options.max_in_iovsz) + 1; + + *start_nr = NUM_START_PRIMARY_POOL_TASKS; + *alloc_nr = NUM_ALLOC_PRIMARY_POOL_TASKS; + *max_nr = max((g_poptions->snd_queue_depth_msgs + + g_poptions->rcv_queue_depth_msgs), *start_nr); + + *pool_dd_sz = 0; + *slab_dd_sz = sizeof(struct xio_tcp_tasks_slab); + *task_dd_sz = sizeof(struct xio_tcp_task) + + (2 * (max_iovsz + 1)) * sizeof(struct iovec) + + 2 * max_iovsz * sizeof(struct xio_mp_mem) + + 3 * max_iovsz * sizeof(struct xio_sge); +} + +static struct xio_tasks_pool_ops primary_tasks_pool_ops = { + .pool_get_params = xio_tcp_primary_pool_get_params, + .slab_pre_create = xio_tcp_primary_pool_slab_pre_create, + .slab_destroy = xio_tcp_primary_pool_slab_destroy, + .slab_init_task = xio_tcp_primary_pool_slab_init_task, + .slab_uninit_task = xio_tcp_pool_slab_uninit_task, + .pool_post_create = xio_tcp_primary_pool_post_create, + .task_pre_put = xio_tcp_task_pre_put, +}; + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_get_pools_ops */ +/*---------------------------------------------------------------------------*/ +static void xio_tcp_get_pools_ops(struct xio_transport_base *trans_hndl, + struct xio_tasks_pool_ops **initial_pool_ops, + struct xio_tasks_pool_ops **primary_pool_ops) +{ + *initial_pool_ops = &initial_tasks_pool_ops; + *primary_pool_ops = &primary_tasks_pool_ops; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_set_pools_cls */ 
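/* The primary pool sizes each task's private area once, in
 * xio_tcp_primary_pool_get_params() (task_dd_sz covers the xio_tcp_task
 * header plus the iovec, xio_mp_mem and xio_sge arrays), and
 * xio_tcp_primary_pool_slab_init_task() then carves that single contiguous
 * block by bumping a byte pointer past each array; the initial pool does the
 * same with a fixed three-iovec area. A minimal sketch of the carve pattern
 * follows; 'hdr' and 'elem' are hypothetical types, not the real structures.
 */
#include <linux/types.h>

struct elem { int v; };

struct hdr {
	struct elem	*a;	/* n elements     */
	struct elem	*b;	/* 2 * n elements */
};

static inline size_t hdr_area_size(int n)
{
	return sizeof(struct hdr) + 3 * n * sizeof(struct elem);
}

static void hdr_carve(struct hdr *h, int n)
{
	char *p = (char *)h + sizeof(*h);	/* arrays follow the header */

	h->a = (struct elem *)p;
	p += n * sizeof(struct elem);
	h->b = (struct elem *)p;
	p += 2 * n * sizeof(struct elem);	/* further arrays would continue here */
}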
+/*---------------------------------------------------------------------------*/ +static void xio_tcp_set_pools_cls(struct xio_transport_base *trans_hndl, + struct xio_tasks_pool_cls *initial_pool_cls, + struct xio_tasks_pool_cls *primary_pool_cls) +{ + struct xio_tcp_transport *tcp_hndl = + (struct xio_tcp_transport *)trans_hndl; + + if (initial_pool_cls) + tcp_hndl->initial_pool_cls = *initial_pool_cls; + if (primary_pool_cls) + tcp_hndl->primary_pool_cls = *primary_pool_cls; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_set_opt */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_set_opt(void *xio_obj, + int optname, const void *optval, int optlen) +{ + switch (optname) { + case XIO_OPTNAME_ENABLE_MEM_POOL: + VALIDATE_SZ(sizeof(int)); + tcp_options.enable_mem_pool = *((int *)optval); + return 0; + case XIO_OPTNAME_ENABLE_DMA_LATENCY: + VALIDATE_SZ(sizeof(int)); + tcp_options.enable_dma_latency = *((int *)optval); + return 0; + case XIO_OPTNAME_MAX_IN_IOVLEN: + VALIDATE_SZ(sizeof(int)); + tcp_options.max_in_iovsz = *((int *)optval); + return 0; + case XIO_OPTNAME_MAX_OUT_IOVLEN: + VALIDATE_SZ(sizeof(int)); + tcp_options.max_out_iovsz = *((int *)optval); + return 0; + case XIO_OPTNAME_TCP_ENABLE_MR_CHECK: + VALIDATE_SZ(sizeof(int)); + tcp_options.enable_mr_check = *((int *)optval); + return 0; + case XIO_OPTNAME_TCP_NO_DELAY: + VALIDATE_SZ(sizeof(int)); + tcp_options.tcp_no_delay = *((int *)optval); + return 0; + case XIO_OPTNAME_TCP_SO_SNDBUF: + VALIDATE_SZ(sizeof(int)); + tcp_options.tcp_so_sndbuf = *((int *)optval); + return 0; + case XIO_OPTNAME_TCP_SO_RCVBUF: + VALIDATE_SZ(sizeof(int)); + tcp_options.tcp_so_rcvbuf = *((int *)optval); + return 0; + case XIO_OPTNAME_TCP_DUAL_STREAM: + VALIDATE_SZ(sizeof(int)); + tcp_options.tcp_dual_sock = *((int *)optval); + return 0; + default: + break; + } + xio_set_error(XIO_E_NOT_SUPPORTED); + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_get_opt */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_get_opt(void *xio_obj, + int optname, void *optval, int *optlen) +{ + switch (optname) { + case XIO_OPTNAME_ENABLE_MEM_POOL: + *((int *)optval) = tcp_options.enable_mem_pool; + *optlen = sizeof(int); + return 0; + case XIO_OPTNAME_ENABLE_DMA_LATENCY: + *((int *)optval) = tcp_options.enable_dma_latency; + *optlen = sizeof(int); + return 0; + case XIO_OPTNAME_MAX_IN_IOVLEN: + *((int *)optval) = tcp_options.max_in_iovsz; + *optlen = sizeof(int); + return 0; + case XIO_OPTNAME_MAX_OUT_IOVLEN: + *((int *)optval) = tcp_options.max_out_iovsz; + *optlen = sizeof(int); + return 0; + case XIO_OPTNAME_TCP_ENABLE_MR_CHECK: + *((int *)optval) = tcp_options.enable_mr_check; + *optlen = sizeof(int); + return 0; + case XIO_OPTNAME_TCP_NO_DELAY: + *((int *)optval) = tcp_options.tcp_no_delay; + *optlen = sizeof(int); + return 0; + case XIO_OPTNAME_TCP_SO_SNDBUF: + *((int *)optval) = tcp_options.tcp_so_sndbuf; + *optlen = sizeof(int); + return 0; + case XIO_OPTNAME_TCP_SO_RCVBUF: + *((int *)optval) = tcp_options.tcp_so_rcvbuf; + *optlen = sizeof(int); + return 0; + case XIO_OPTNAME_TCP_DUAL_STREAM: + *((int *)optval) = tcp_options.tcp_dual_sock; + *optlen = sizeof(int); + return 0; + default: + break; + } + xio_set_error(XIO_E_NOT_SUPPORTED); + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* 
xio_is_valid_in_req */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_is_valid_in_req(struct xio_msg *msg) +{ + unsigned int i; + struct xio_vmsg *vmsg = &msg->in; + struct xio_sg_table_ops *sgtbl_ops; + void *sgtbl; + void *sge; + unsigned long nents, max_nents; + + sgtbl = xio_sg_table_get(&msg->in); + sgtbl_ops = xio_sg_table_ops_get(msg->in.sgl_type); + nents = tbl_nents(sgtbl_ops, sgtbl); + max_nents = tbl_max_nents(sgtbl_ops, sgtbl); + + if ((nents > (unsigned long)tcp_options.max_in_iovsz) || + (nents > max_nents) || + (max_nents > (unsigned long)tcp_options.max_in_iovsz)) { + return 0; + } + + if (vmsg->sgl_type == XIO_SGL_TYPE_IOV && nents > XIO_IOVLEN) + return 0; + + if (vmsg->header.iov_base && + (vmsg->header.iov_len == 0)) + return 0; + + for_each_sge(sgtbl, sgtbl_ops, sge, i) { + if (sge_addr(sgtbl_ops, sge) && + (sge_length(sgtbl_ops, sge) == 0)) + return 0; + } + + return 1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_is_valid_out_msg */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_is_valid_out_msg(struct xio_msg *msg) +{ + unsigned int i; + struct xio_vmsg *vmsg = &msg->out; + struct xio_sg_table_ops *sgtbl_ops; + void *sgtbl; + void *sge; + unsigned long nents, max_nents; + + sgtbl = xio_sg_table_get(&msg->out); + sgtbl_ops = xio_sg_table_ops_get(msg->out.sgl_type); + nents = tbl_nents(sgtbl_ops, sgtbl); + max_nents = tbl_max_nents(sgtbl_ops, sgtbl); + + if ((nents > (unsigned long)tcp_options.max_out_iovsz) || + (nents > max_nents) || + (max_nents > (unsigned long)tcp_options.max_out_iovsz)) + return 0; + + if (vmsg->sgl_type == XIO_SGL_TYPE_IOV && nents > XIO_IOVLEN) + return 0; + + if ((vmsg->header.iov_base && + (vmsg->header.iov_len == 0)) || + (!vmsg->header.iov_base && + (vmsg->header.iov_len != 0))) + return 0; + + if (vmsg->header.iov_len > (size_t)xio_get_options()->max_inline_xio_hdr) + return 0; + + for_each_sge(sgtbl, sgtbl_ops, sge, i) { + if (!sge_addr(sgtbl_ops, sge) || + (sge_length(sgtbl_ops, sge) == 0)) + return 0; + } + + return 1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_dup2 */ +/* makes new_trans_hndl be the copy of old_trans_hndl, closes new_trans_hndl */ +/* Note old and new are in dup2 terminology opposite to reconnect terms */ +/* --------------------------------------------------------------------------*/ +static int xio_tcp_dup2(struct xio_transport_base *old_trans_hndl, + struct xio_transport_base **new_trans_hndl) +{ + xio_tcp_close(*new_trans_hndl); + + /* conn layer will call close which will only decrement */ + /*kref_get(&old_trans_hndl->kref);*/ + + *new_trans_hndl = old_trans_hndl; + + return 0; +} + +static struct xio_tcp_socket_ops single_sock_ops = { + .open = xio_tcp_single_sock_create, + .add_ev_handlers = xio_tcp_single_sock_add_ev_handlers, + .del_ev_handlers = xio_tcp_single_sock_del_ev_handlers, + .connect = xio_tcp_single_sock_connect, + .set_txd = xio_tcp_single_sock_set_txd, + .set_rxd = xio_tcp_single_sock_set_rxd, + .rx_ctl_work = xio_tcp_recvmsg_work, + .rx_ctl_handler = xio_tcp_single_sock_rx_ctl_handler, + .rx_data_handler = xio_tcp_rx_data_handler, + .shutdown = xio_tcp_single_sock_shutdown, + .close = xio_tcp_single_sock_close, +}; + +static struct xio_tcp_socket_ops dual_sock_ops = { + .open = xio_tcp_dual_sock_create, + .add_ev_handlers = xio_tcp_dual_sock_add_ev_handlers, + .del_ev_handlers = 
xio_tcp_dual_sock_del_ev_handlers, + .connect = xio_tcp_dual_sock_connect, + .set_txd = xio_tcp_dual_sock_set_txd, + .set_rxd = xio_tcp_dual_sock_set_rxd, + .rx_ctl_work = xio_tcp_recv_ctl_work, + .rx_ctl_handler = xio_tcp_dual_sock_rx_ctl_handler, + .rx_data_handler = xio_tcp_rx_data_handler, + .shutdown = xio_tcp_dual_sock_shutdown, + .close = xio_tcp_dual_sock_close, +}; + +struct xio_transport xio_tcp_transport = { + .name = "tcp", + .ctor = NULL, + .dtor = NULL, + .init = NULL, + .release = NULL, + .context_shutdown = xio_tcp_context_shutdown, + .open = xio_tcp_open, + .connect = xio_tcp_connect, + .listen = xio_tcp_listen, + .accept = xio_tcp_accept, + .reject = xio_tcp_reject, + .close = xio_tcp_close, + .dup2 = xio_tcp_dup2, +/* .update_task = xio_tcp_update_task,*/ + .send = xio_tcp_send, + .poll = xio_tcp_poll, + .set_opt = xio_tcp_set_opt, + .get_opt = xio_tcp_get_opt, + .cancel_req = xio_tcp_cancel_req, + .cancel_rsp = xio_tcp_cancel_rsp, + .get_pools_setup_ops = xio_tcp_get_pools_ops, + .set_pools_cls = xio_tcp_set_pools_cls, + + .validators_cls.is_valid_in_req = xio_tcp_is_valid_in_req, + .validators_cls.is_valid_out_msg = xio_tcp_is_valid_out_msg, +}; + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_transport_constructor */ +/*---------------------------------------------------------------------------*/ +static int __init xio_tcp_transport_constructor(void) +{ + struct xio_transport *transport = &xio_tcp_transport; + + /* register the transport */ + xio_reg_transport(transport); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_transport_destructor */ +/*---------------------------------------------------------------------------*/ +static void __exit xio_tcp_transport_destructor(void) +{ + struct xio_transport *transport = &xio_tcp_transport; + + /* Called after all devices were deleted */ + + xio_unreg_transport(transport); +} + +static int __init xio_init_module(void) +{ + if (debugfs_initialized()) { + xio_tcp_root = debugfs_create_dir("xio_tcp", NULL); + if (!xio_tcp_root) { + pr_err("xio_tcp root debugfs creation failed\n"); + return -ENOMEM; + } + } else { + xio_tcp_root = NULL; + pr_err("debugfs not initialized\n"); + } + + xio_tcp_transport_constructor(); + g_poptions = xio_get_options(); + return 0; +} + +static void __exit xio_cleanup_module(void) +{ + xio_tcp_transport_destructor(); + + debugfs_remove_recursive(xio_tcp_root); +} + +module_init(xio_init_module); +module_exit(xio_cleanup_module); diff --git a/open_src/xio/src/kernel/transport/tcp/xio_tcp_transport.h b/open_src/xio/src/kernel/transport/tcp/xio_tcp_transport.h new file mode 100644 index 0000000..60c0880 --- /dev/null +++ b/open_src/xio/src/kernel/transport/tcp/xio_tcp_transport.h @@ -0,0 +1,511 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef XIO_TCP_TRANSPORT_H_ +#define XIO_TCP_TRANSPORT_H_ + +#include + +struct xio_tcp_socket; + +/*---------------------------------------------------------------------------*/ +/* externals */ +/*---------------------------------------------------------------------------*/ +extern struct xio_tcp_options tcp_options; +extern struct xio_options *g_poptions; + +/* definitions */ +#define MAX_SGE (XIO_IOVLEN + 1) + +#define MAX_HDR_SZ 512 + +#define NUM_CONN_SETUP_TASKS 2 /* one posted for req rx, + * one for reply tx + */ +#define CONN_SETUP_BUF_SIZE 4096 + +#define NUM_START_PRIMARY_POOL_TASKS 32 +#define NUM_ALLOC_PRIMARY_POOL_TASKS 512 + +#define USECS_IN_SEC 1000000 +#define NSECS_IN_USEC 1000 + +#define xio_prefetch(p) prefetch(p) + +#ifndef PAGE_SHIFT +#define PAGE_SHIFT 12 +#endif +#ifndef PAGE_SIZE +#define PAGE_SIZE BIT(PAGE_SHIFT) +#endif +#ifndef PAGE_MASK +#define PAGE_MASK (~(PAGE_SIZE-1)) +#endif + +/* TCP transport */ + +#define NUM_TASKS 54400 /* 100 * (MAX_SEND_WR + + * MAX_RECV_WR + EXTRA_RQE) + */ + +#define RX_LIST_POST_NR 31 /* Initial number of buffers + * to put in the rx_list + */ + +#define COMPLETION_BATCH_MAX 64 /* Trigger TX completion every + * COMPLETION_BATCH_MAX + * packets + */ + +#define TX_BATCH 32 /* Number of TX tasks to batch */ + +#define TX_EAGAIN_RETRY 2 /* Number of retries when send + * fail with EAGAIN before return. 
+ */ + +#define RX_POLL_NR_MAX 4 /* Max num of RX messages + * to receive in one poll + */ + +#define RX_BATCH 32 /* Number of RX tasks to batch */ + +#define MAX_ACCEPT_BATCH 4 /* Max sockets to accept at once*/ + +#define MAX_BACKLOG 1024 /* listen socket max backlog */ + +#define TMP_RX_BUF_SIZE (RX_BATCH * MAX_HDR_SZ) + +#define XIO_TO_TCP_TASK(xt, tt) \ + struct xio_tcp_task *(tt) = \ + (struct xio_tcp_task *)(xt)->dd_data +#define XIO_TO_TCP_HNDL(xt, th) \ + struct xio_tcp_transport *(th) = \ + (struct xio_tcp_transport *)(xt)->context + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 19, 0) + #define MSGHDR_IOV(mh) ((mh)->msg_iter.iov) + #define MSGHDR_IOVLEN(mh) (mh)->msg_iter.nr_segs +#else + #define MSGHDR_IOV(mh) (mh)->msg_iov + #define MSGHDR_IOVLEN(mh) (mh)->msg_iovlen +#endif + +/*---------------------------------------------------------------------------*/ +/* enums */ +/*---------------------------------------------------------------------------*/ +enum xio_tcp_op_code { + XIO_TCP_NULL, + XIO_TCP_RECV = 1, + XIO_TCP_SEND, + XIO_TCP_WRITE, + XIO_TCP_READ +}; + +enum xio_tcp_rx_stage { + XIO_TCP_RX_START, + XIO_TCP_RX_TLV, + XIO_TCP_RX_HEADER, + XIO_TCP_RX_IO_DATA, + XIO_TCP_RX_DONE +}; + +enum xio_tcp_tx_stage { + XIO_TCP_TX_BEFORE, + XIO_TCP_TX_IN_SEND_CTL, + XIO_TCP_TX_IN_SEND_DATA, + XIO_TCP_TX_DONE +}; + +enum xio_tcp_sock_type { + XIO_TCP_SINGLE_SOCK = 1, + XIO_TCP_CTL_SOCK, + XIO_TCP_DATA_SOCK +}; + +/*---------------------------------------------------------------------------*/ +struct xio_tcp_options { + int enable_mem_pool; + int enable_dma_latency; + int enable_mr_check; + int max_in_iovsz; + int max_out_iovsz; + int tcp_no_delay; + int tcp_so_sndbuf; + int tcp_so_rcvbuf; + int tcp_dual_sock; + int pad; +}; + +#define XIO_TCP_REQ_HEADER_VERSION 1 + +struct __attribute__((__packed__)) xio_tcp_req_hdr { + uint8_t version; /* request version */ + uint8_t flags; + uint16_t req_hdr_len; /* req header length */ + uint16_t sn; /* serial number */ + uint16_t pad0; + + uint32_t ltid; /* local task id */ + uint16_t pad; + uint8_t in_tcp_op; /* opcode for peers */ + uint8_t out_tcp_op; /* opcode for peers */ + + uint16_t in_num_sge; + uint16_t out_num_sge; + uint32_t pad1; + + uint16_t ulp_hdr_len; /* ulp header length */ + uint16_t ulp_pad_len; /* pad_len length */ + uint32_t remain_data_len;/* remaining data length */ + + uint64_t ulp_imm_len; /* ulp data length */ +}; + +#define XIO_TCP_RSP_HEADER_VERSION 1 + +struct __attribute__((__packed__)) xio_tcp_rsp_hdr { + uint8_t version; /* response version */ + uint8_t flags; + uint16_t rsp_hdr_len; /* rsp header length */ + uint16_t sn; /* serial number */ + uint16_t pad; + + uint32_t ltid; /* local task id */ + uint32_t rtid; /* remote task id */ + + uint8_t out_tcp_op; /* opcode for peers */ + uint8_t pad1; + uint16_t out_num_sge; + uint32_t status; /* status */ + + uint16_t ulp_hdr_len; /* ulp header length */ + uint16_t ulp_pad_len; /* pad_len length */ + uint32_t remain_data_len;/* remaining data length */ + + uint64_t ulp_imm_len; /* ulp data length */ +}; + +struct __attribute__((__packed__)) xio_tcp_connect_msg { + enum xio_tcp_sock_type sock_type; + uint16_t second_port; + uint16_t pad; +}; + +struct __attribute__((__packed__)) xio_tcp_setup_msg { + uint64_t buffer_sz; + uint32_t max_in_iovsz; + uint32_t max_out_iovsz; + uint32_t max_header_len; + uint32_t pad; +}; + +struct __attribute__((__packed__)) xio_tcp_cancel_hdr { + uint16_t hdr_len; /* req header length */ + uint16_t sn; /* msg serial number */ + uint32_t 
result; +}; + +struct xio_tcp_work_req { + struct iovec *msg_iov; + uint32_t msg_len; + uint32_t pad; + uint64_t tot_iov_byte_len; + void *ctl_msg; + uint32_t ctl_msg_len; + int stage; + struct msghdr msg; +}; + +struct xio_tcp_task { + enum xio_tcp_op_code in_tcp_op; + enum xio_tcp_op_code out_tcp_op; + + void *buf; + struct xio_tcp_work_req txd; + struct xio_tcp_work_req rxd; + + uint16_t read_num_mp_mem; + uint16_t write_num_mp_mem; + uint32_t pad0; + + /* User (from vmsg) or pool buffer used for */ + struct xio_mp_mem *read_mp_mem; + struct xio_mp_mem *write_mp_mem; + + uint16_t req_in_num_sge; + uint16_t req_out_num_sge; + uint16_t rsp_out_num_sge; + uint16_t sn; + + /* What this side got from the peer for SEND */ + /* What this side got from the peer for RDMA equivalent R/W + */ + struct xio_sge *req_in_sge; + struct xio_sge *req_out_sge; + + /* What this side writes to the peer on RDMA equivalent W + */ + struct xio_sge *rsp_out_sge; + + struct xio_ev_data comp_event; +}; + +struct xio_tcp_tasks_slab { + struct kmem_cache *data_pool; + char name[32]; /* kmem_cache_create keeps a pointer to the pool's name + * Therefore the name must be valid until the pool + * is destroyed + */ + int buf_size; + int count; +}; + +struct xio_tcp_pending_conn { + struct socket *sock; + struct xio_tcp_transport *parent; + struct xio_ev_data pending_event_data; + int waiting_for_bytes; + struct xio_tcp_connect_msg msg; + union xio_sockaddr sa; + struct list_head conns_list_entry; +}; + +struct xio_socket { + struct socket *ksock; + uint16_t port; + void (*orig_sk_state_change)(struct sock *sk); +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 15, 0) + void (*orig_sk_data_ready)(struct sock *sk); +#else + void (*orig_sk_data_ready)(struct sock *sk, int bytes); +#endif + void (*orig_sk_write_space)(struct sock *sk); + struct xio_ev_data conn_establish_event_data; +}; + +#define XIO_SOCK_ESTABLISH_CTL 1 +#define XIO_SOCK_ESTABLISH_DATA BIT(1) + +struct xio_tcp_socket_ops { + int (*open)(struct xio_tcp_socket *sock); + int (*add_ev_handlers)(struct xio_tcp_transport *tcp_hndl); + int (*del_ev_handlers)(struct xio_tcp_transport *tcp_hndl); + int (*connect)(struct xio_tcp_transport *tcp_hndl, + struct sockaddr *sa, socklen_t sa_len); + size_t (*set_txd)(struct xio_task *task); + void (*set_rxd)(struct xio_task *task, void *buf, uint32_t len); + int (*rx_ctl_work)(struct xio_tcp_transport *tcp_hndl, struct socket *, + struct xio_tcp_work_req *xio_recv, + int block); + int (*rx_ctl_handler)(struct xio_tcp_transport *tcp_hndl, int *resched); + int (*rx_data_handler)(struct xio_tcp_transport *tcp_hndl, + int batch_nr, int *resched); + int (*shutdown)(struct xio_tcp_socket *sock); + int (*close)(struct xio_tcp_socket *sock); +}; + +struct xio_tcp_socket { + struct xio_socket ctl; + struct xio_socket data; + uint64_t establish_states; + struct xio_tcp_socket_ops ops[1]; + struct xio_ev_data accept_event_data; +}; + + +struct xio_tcp_transport { + struct xio_transport_base base; + struct xio_mempool *tcp_mempool; + struct list_head trans_list_entry; + + /* tasks queues */ + struct list_head tx_ready_list; + struct list_head tx_comp_list; + struct list_head in_flight_list; + struct list_head rx_list; + struct list_head io_list; + + struct xio_tcp_socket socket; + int is_listen; + + /* fast path params */ + enum xio_transport_state state; + + /* tx parameters */ + size_t max_inline_buf_sz; + + int tx_ready_tasks_num; + + uint16_t tx_comp_cnt; + + uint16_t sn; /* serial number */ + + /* control path params */ + uint32_t 
peer_max_in_iovsz; + uint32_t peer_max_out_iovsz; + + /* connection's flow control */ + size_t alloc_sz; + size_t membuf_sz; + + struct xio_transport *transport; + struct xio_tasks_pool_cls initial_pool_cls; + struct xio_tasks_pool_cls primary_pool_cls; + + struct xio_tcp_setup_msg setup_rsp; + + /* too big to be on stack - use as temporaries */ + union { + struct xio_msg dummy_msg; + }; + + struct list_head pending_conns; + + void *tmp_rx_buf; + void *tmp_rx_buf_cur; + uint32_t tmp_rx_buf_len; + uint32_t peer_max_header; + + struct xio_tcp_work_req tmp_work; + struct iovec tmp_iovec[UIO_MAXIOV]; + + struct xio_ev_data flush_tx_event; + struct xio_ev_data ctl_rx_event; + struct xio_ev_data data_rx_event; + struct xio_ev_data disconnect_event; +}; + +/* + * The next routines deal with comparing 16 bit unsigned integers + * and worry about wrap-around (automatic with unsigned arithmetic). + */ + +static inline s16 before(u16 seq1, u16 seq2) +{ + return (s16)(seq1 - seq2) < 0; +} +#define after(seq2, seq1) before(seq1, seq2) + +static inline s16 before_eq(u16 seq1, u16 seq2) +{ + return (s16)(seq1 - seq2) <= 0; +} +#define after_eq(seq2, seq1) before_eq(seq1, seq2) + +/* is s2<=s1tv_sec * USECS_IN_SEC; + retval += time_spec->tv_nsec / NSECS_IN_USEC; + + return retval; +} + +int xio_tcp_get_max_header_size(void); + +int xio_tcp_get_inline_buffer_size(void); + +int xio_tcp_send(struct xio_transport_base *transport, + struct xio_task *task); + +int xio_tcp_rx_handler(struct xio_tcp_transport *tcp_hndl); + +int xio_tcp_poll(struct xio_transport_base *transport, + long min_nr, long max_nr, + struct timespec *ts_timeout); + +struct xio_task *xio_tcp_primary_task_lookup( + struct xio_tcp_transport *tcp_hndl, + int tid); + +struct xio_task *xio_tcp_primary_task_alloc( + struct xio_tcp_transport *tcp_hndl); + +void on_sock_disconnected(struct xio_tcp_transport *tcp_hndl, + int notify_observer); + +int xio_tcp_cancel_req(struct xio_transport_base *transport, + struct xio_msg *req, uint64_t stag, + void *ulp_msg, size_t ulp_msg_sz); + +int xio_tcp_cancel_rsp(struct xio_transport_base *transport, + struct xio_task *task, enum xio_status result, + void *ulp_msg, size_t ulp_msg_sz); + +int xio_tcp_send_connect_msg(struct socket *sock, + struct xio_tcp_connect_msg *msg); + +size_t xio_tcp_single_sock_set_txd(struct xio_task *task); +size_t xio_tcp_dual_sock_set_txd(struct xio_task *task); +void xio_tcp_single_sock_set_rxd(struct xio_task *task, void *buf, + uint32_t len); +void xio_tcp_dual_sock_set_rxd(struct xio_task *task, void *buf, uint32_t len); + +int xio_tcp_rx_ctl_handler(struct xio_tcp_transport *tcp_hndl, int batch_nr, + int *resched); +int xio_tcp_rx_data_handler(struct xio_tcp_transport *tcp_hndl, int batch_nr, + int *resched); +int xio_tcp_recv_ctl_work(struct xio_tcp_transport *tcp_hndl, + struct socket *sock, + struct xio_tcp_work_req *xio_recv, int block); +int xio_tcp_recvmsg_work(struct xio_tcp_transport *tcp_hndl, + struct socket *sock, + struct xio_tcp_work_req *xio_recv, int block); + +void xio_tcp_disconnect_helper(void *xio_tcp_hndl); + +int xio_tcp_xmit(struct xio_tcp_transport *tcp_hndl); + +void xio_tcp_tx_completion_handler(void *xio_task); +void xio_tcp_consume_ctl_rx(void *xio_tcp_hndl); +void xio_tcp_accept_connections(void *user_data); + +void xio_tcp_ctl_conn_established_ev_handler(void *user_context); +void xio_tcp_data_conn_established_ev_handler(void *user_context); +void xio_tcp_pending_conn_remove_handler(void *user_data); + +#endif /* XIO_TCP_TRANSPORT_H_ */ diff --git 
a/open_src/xio/src/kernel/transport/xio_ktransport.c b/open_src/xio/src/kernel/transport/xio_ktransport.c new file mode 100644 index 0000000..68ea203 --- /dev/null +++ b/open_src/xio/src/kernel/transport/xio_ktransport.c @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include "libxio.h" +#include "xio_ktransport.h" + +/*---------------------------------------------------------------------------*/ +/* xio_transport_state_str */ +/*---------------------------------------------------------------------------*/ +char *xio_transport_state_str(enum xio_transport_state state) +{ + switch (state) { + case XIO_TRANSPORT_STATE_INIT: + return "INIT"; + case XIO_TRANSPORT_STATE_LISTEN: + return "LISTEN"; + case XIO_TRANSPORT_STATE_CONNECTING: + return "CONNECTING"; + case XIO_TRANSPORT_STATE_CONNECTED: + return "CONNECTED"; + case XIO_TRANSPORT_STATE_DISCONNECTED: + return "DISCONNECTED"; + case XIO_TRANSPORT_STATE_RECONNECT: + return "RECONNECT"; + case XIO_TRANSPORT_STATE_CLOSED: + return "CLOSED"; + case XIO_TRANSPORT_STATE_DESTROYED: + return "DESTROYED"; + case XIO_TRANSPORT_STATE_ERROR: + return "ERROR"; + default: + return "UNKNOWN"; + } + + return NULL; +} +EXPORT_SYMBOL(xio_transport_state_str); + diff --git a/open_src/xio/src/kernel/transport/xio_ktransport.h b/open_src/xio/src/kernel/transport/xio_ktransport.h new file mode 100644 index 0000000..1df446c --- /dev/null +++ b/open_src/xio/src/kernel/transport/xio_ktransport.h @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. 
+ * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef XIO_KTRANSPORT_H +#define XIO_KTRANSPORT_H + +/*---------------------------------------------------------------------------*/ +/* enums */ +/*---------------------------------------------------------------------------*/ +enum xio_transport_state { + XIO_TRANSPORT_STATE_INIT, + XIO_TRANSPORT_STATE_LISTEN, + XIO_TRANSPORT_STATE_CONNECTING, + XIO_TRANSPORT_STATE_CONNECTED, + XIO_TRANSPORT_STATE_DISCONNECTED, + XIO_TRANSPORT_STATE_RECONNECT, + XIO_TRANSPORT_STATE_CLOSED, + XIO_TRANSPORT_STATE_DESTROYED, + XIO_TRANSPORT_STATE_ERROR +}; + + +char *xio_transport_state_str(enum xio_transport_state state); + +#endif /* XIO_KTRANSPORT_H */ diff --git a/open_src/xio/src/kernel/xio/Makefile.in b/open_src/xio/src/kernel/xio/Makefile.in new file mode 100644 index 0000000..04e9cff --- /dev/null +++ b/open_src/xio/src/kernel/xio/Makefile.in @@ -0,0 +1,128 @@ +# Makefile.in for kernel module + +SHELL = /bin/sh +INSTALL = @INSTALL@ +mkdir_p = mkdir -p +VERSION = @PACKAGE_VERSION@ +OFED_CFLAGS = @OFED_CFLAGS@ +XIO_SYMVERS = @XIO_SYMVERS@ +PRIVATE_COMMON = .common + +NOSTDINC_FLAGS += @OFED_CFLAGS@ + +DISTFILES = Makefile.in configure.ac configure ../install-sh \ + xio_log.h xio_mem.h xio_os.h xio_mempool.h \ + ../../common/common/xio_observer.h \ + xio_context.c xio_ev_loop.c xio_mempool.c \ + xio_init.c xio_mem.c xio_task.c xio_kernel_utils.c \ + xio_workqueue.c xio_sg_iov.c xio_sg_iovptr.c xio_sg_scatter.c \ + xio_sg_table.c \ + ../../../version.c \ + ../../common/xio_objpool.c \ + ../../common/xio_nexus.c \ + ../../common/xio_nexus_cache.c \ + ../../common/xio_options.c \ + ../../common/xio_session.c \ + ../../common/xio_session_server.c \ + ../../common/xio_session_client.c \ + ../../common/xio_transport.c \ + ../../common/xio_connection.c \ + 
../../common/xio_error.c \ + ../../common/xio_server.c \ + ../../common/xio_sessions_cache.c \ + ../../common/xio_observer.c \ + ../../common/xio_idr.c \ + ../../common/xio_utils.c \ + ../transport/xio_ktransport.c + +#TODO: add prefix here to manage directory, where to place headers +includedir = /opt/xio/include +xiomoduledir = @kmoduledir@/extra/net/xio + +xiomodule := xio_core.ko + +all: all-@ENABLE_XIO_MODULE@ +install: install-@ENABLE_XIO_MODULE@ +uninstall: uninstall-@ENABLE_XIO_MODULE@ + +all-n: +install-n: +uninstall-n: + +all-y: all-spec + +SUBDIRS ?=`pwd` + +install-y: all + $(mkdir_p) $(DESTDIR)$(xiomoduledir) + $(mkdir_p) $(DESTDIR)$(includedir) + $(INSTALL) -m 644 $(xiomodule) $(DESTDIR)$(xiomoduledir)/$(xiomodule) + $(INSTALL) -m 644 $(SUBDIRS)/../../../include/libxio.h $(DESTDIR)$(includedir) + $(INSTALL) -m 644 $(SUBDIRS)/../../../include/xio_base.h $(DESTDIR)$(includedir) + $(INSTALL) -m 644 $(SUBDIRS)/../../../include/xio_kernel.h $(DESTDIR)$(includedir) + $(INSTALL) -m 644 Module.symvers $(DESTDIR)$(includedir) + -/sbin/depmod -a + +uninstall-y: + rm -f $(DESTDIR)$(xiomoduledir)/$(xiomodule) + -/sbin/depmod -a + +clean: + -rm -f $(xiomodule) *.o .*.cmd *.mod.c *.ko *.s */*.o *.order *.symvers *.unsigned + -rm -rf $(PRIVATE_COMMON) + +distclean: clean + rm -f Makefile configure config.status + rm -f config.h config.log config.status config.cache + rm -rf .tmp_versions autom4te.cache + +maintainer-clean: distclean + +distdir: $(DISTFILES) + cp -p $(DISTFILES) $(distdir) + + +ccflags-y += $(OFED_CFLAGS) -I$(SUBDIRS) -I$(SUBDIRS)/.. -I$(SUBDIRS)/../../common -I$(SUBDIRS)/../../common -I$(SUBDIRS)/../../../include -I$(SUBDIRS)/../../libxio_os/linuxkernel + +obj-m := xio_core.o +xio_core-objs := \ + xio_init.o \ + xio_context.o \ + xio_ev_loop.o \ + xio_mempool.o \ + xio_init.o \ + xio_mem.o \ + xio_task.o \ + xio_kernel_utils.o \ + xio_workqueue.o \ + xio_sg_iov.o \ + xio_sg_iovptr.o \ + xio_sg_scatter.o \ + xio_sg_table.o \ + ../transport/xio_ktransport.o \ + $(PRIVATE_COMMON)/version.o \ + $(PRIVATE_COMMON)/xio_objpool.o \ + $(PRIVATE_COMMON)/xio_nexus.o \ + $(PRIVATE_COMMON)/xio_nexus_cache.o \ + $(PRIVATE_COMMON)/xio_options.o \ + $(PRIVATE_COMMON)/xio_session.o \ + $(PRIVATE_COMMON)/xio_session_server.o \ + $(PRIVATE_COMMON)/xio_session_client.o \ + $(PRIVATE_COMMON)/xio_transport.o \ + $(PRIVATE_COMMON)/xio_connection.o \ + $(PRIVATE_COMMON)/xio_error.o \ + $(PRIVATE_COMMON)/xio_server.o \ + $(PRIVATE_COMMON)/xio_sessions_cache.o \ + $(PRIVATE_COMMON)/xio_observer.o \ + $(PRIVATE_COMMON)/xio_idr.o \ + $(PRIVATE_COMMON)/xio_utils.o + +$(PRIVATE_COMMON): + $(mkdir_p) $@ + cp -ps `pwd`/../../../version.c $@/ + cp -ps `pwd`/../../common/*.c $@/ + cp -ps `pwd`/../../common/*.h $@/ + +all-spec: $(PRIVATE_COMMON) + export NOSTDINC_FLAGS + $(MAKE) -C @kernelsrc@ SUBDIRS=`pwd` @KERNELMAKE_PARAMS@ KBUILD_EXTRA_SYMBOLS=$(XIO_SYMVERS) modules diff --git a/open_src/xio/src/kernel/xio/autogen.sh b/open_src/xio/src/kernel/xio/autogen.sh new file mode 100644 index 0000000..28dd57d --- /dev/null +++ b/open_src/xio/src/kernel/xio/autogen.sh @@ -0,0 +1,3 @@ +#! 
/bin/sh + +autoconf diff --git a/open_src/xio/src/kernel/xio/configure.ac b/open_src/xio/src/kernel/xio/configure.ac new file mode 100644 index 0000000..9aaac63 --- /dev/null +++ b/open_src/xio/src/kernel/xio/configure.ac @@ -0,0 +1,216 @@ +AC_INIT([xio-kernel],[2.0],[libxio@accellio.org]) + +AC_PROG_INSTALL + +runver=`uname -r` +bad_kernel_version=no +ENABLE_XIO_MODULE=y +# do not build against ofed until kernel module can be built out of kernel +# tree +ENABLE_OFED_BUILD=y +KERNELCFLAGS= + +kernelsrc= +kernelbuild= +AC_ARG_WITH(kernel, + [ --with-kernel=PATH Specify location of kernel source ], + [kernelsrc="$withval"; kernelbuild="$withval"]) +AC_ARG_WITH(kernel-build, + [ --with-kernel-build=PATH Specify location of kernel build ], + [kernelbuild="$withval"]) +AC_ARG_ENABLE(kernel-module, + [ --enable-kernel-module Compile kernel module ]) + +#build against installed OFED +if test "$ENABLE_OFED_BUILD" = "y"; then +AC_MSG_CHECKING([if ofed installed]) +MLNX_OFED=`if ofed_info 2>/dev/null | grep MLNX_OFED >/dev/null 2>/dev/null; then echo true; else echo false; fi` +OFED_CFLAGS= +XIO_SYMVERS= + +if test "$MLNX_OFED" = "true"; then + AC_MSG_RESULT(yes) + + # Whether MLNX_OFED for ubuntu has been installed + MLNX_OFED_IB_UBUNTU_INSTALLED=`if dpkg -s mlnx-ofed-kernel-dkms >/dev/null 2>/dev/null; then echo true; else echo false; fi` + + # Whether MLNX_OFED for RedHat has been installed + MLNX_OFED_IB_RH_INSTALLED=`if rpm -q mlnx-ofa_kernel-devel >&/dev/null; then echo true; else echo false; fi` + + # Check if we have custom compiled kernel modules + if test "$MLNX_OFED_IB_RH_INSTALLED" = "false"; then + MLNX_OFED_IB_RH_INSTALLED=`if rpm -q kernel-ib-devel >&/dev/null; then echo true; else echo false; fi` + fi + + if test "$MLNX_OFED_IB_UBUNTU_INSTALLED" = "true"; then + OFED_VERS=`dpkg -s mlnx-ofed-kernel-dkms | awk -F\- '/Version/ {print $1}' | awk '{print $2}'` + OFED_CFLAGS=`echo -I/var/lib/dkms/mlnx-ofed-kernel/$OFED_VERS/build/include -include /var/lib/dkms/mlnx-ofed-kernel/$OFED_VERS/build/include/linux/compat-2.6.h` + XIO_SYMVERS=`echo /var/lib/dkms/mlnx-ofed-kernel/$OFED_VERS/build/Module.symvers` + fi + + if test "$MLNX_OFED_IB_RH_INSTALLED" = "true"; then + OFED_CFLAGS=`echo -I/usr/src/ofa_kernel/default/include -include /usr/src/ofa_kernel/default/include/linux/compat-2.6.h` + XIO_SYMVERS=`echo /usr/src/ofa_kernel/default/Module.symvers` + fi +else + AC_MSG_RESULT(no) + XIO_SYMVERS=`echo ../compat/Module.symvers` + + + # Whether or not the OFED kernel-ib-devel RPM has been installed. + OFED_KERNEL_IB_DEVEL_RPM_INSTALLED=`if rpm -q kernel-ib-devel 2>/dev/null | grep -q $(uname -r | sed 's/-/_/g'); then echo true; else echo false; fi` + + # Whether or not the OFED compat-rdma-devel RPM has been installed. + OFED_COMPAT_RDMA_DEVEL_RPM_INSTALLED=`if rpm -q compat-rdma-devel 2>/dev/null | grep -q $(uname -r | sed 's/-/_/g'); then echo true; else echo false; fi` + + if test "$OFED_KERNEL_IB_DEVEL_RPM_INSTALLED" = "true"; then + # Read OFED's config.mk, which contains the definition of the variable + # BACKPORT_INCLUDES. + cfile="/usr/src/ofa_kernel/config.mk" + if test -r "${cfile}"; then + echo "loading build-specific script '${cfile}'" + . "${cfile}" + else + cfile="/usr/src/ofa_kernel/default/config.mk" + if test -r "${cfile}"; then + echo "loading build-specific script '${cfile}'" + . 
"${cfile}" + fi + fi + + OFED_CFLAGS=`echo $BACKPORT_INCLUDES -I/usr/src/ofa_kernel/include` + XIO_SYMVERS=`echo /usr/src/ofa_kernel/Module.symvers` + fi + + if test "$OFED_COMPAT_RDMA_DEVEL_RPM_INSTALLED" = "true"; then + OFED_CFLAGS=`echo -I/usr/src/compat-rdma/include -include /usr/src/compat-rdma/include/linux/compat-2.6.h` + XIO_SYMVERS=`echo /usr/src/compat-rdma/Module.symvers` + fi +fi + +AC_MSG_NOTICE([ofed include files directory is ${OFED_CFLAGS}]) +AC_SUBST(OFED_CFLAGS) +AC_SUBST(XIO_SYMVERS) +fi + +if test "$ENABLE_XIO_MODULE" = y; then + AC_MSG_CHECKING([kernel source directory]) + if test -z "$kernelsrc"; then + kernelbuild= + sourcelink=/lib/modules/${runver}/source + buildlink=/lib/modules/${runver}/build + + if test -e $sourcelink; then + kernelsrc=`(cd $sourcelink; /bin/pwd)` + fi + if test -e $buildlink; then + kernelbuild=`(cd $buildlink; /bin/pwd)` + fi + if test -z "$kernelsrc"; then + kernelsrc=$kernelbuild + fi + if test -z "$kernelsrc" -o -z "$kernelbuild"; then + AC_MSG_RESULT([Not found]) + AC_MSG_ERROR([ + *** Please specify the location of the kernel source with + *** the '--with-kernel=SRCDIR' option]) + fi + fi + AC_MSG_RESULT([$kernelsrc]) + AC_MSG_CHECKING([kernel build directory]) + AC_MSG_RESULT([$kernelbuild]) + + AC_MSG_CHECKING([kernel source version]) + if test -r $kernelbuild/include/linux/version.h && fgrep -q UTS_RELEASE $kernelbuild/include/linux/version.h; then + kernsrcver=`(echo "#include "; echo "kernsrcver=UTS_RELEASE") | cpp -I $kernelbuild/include | grep "^kernsrcver=" | cut -d \" -f 2` + elif test -r $kernelbuild/include/linux/utsrelease.h && fgrep -q UTS_RELEASE $kernelbuild/include/linux/utsrelease.h; then + kernsrcver=`(echo "#include "; echo "kernsrcver=UTS_RELEASE") | cpp -I $kernelbuild/include | grep "^kernsrcver=" | cut -d \" -f 2` + elif test -r $kernelbuild/include/generated/utsrelease.h && fgrep -q UTS_RELEASE $kernelbuild/include/generated/utsrelease.h; then + kernsrcver=`(echo "#include "; echo "kernsrcver=UTS_RELEASE") | cpp -I $kernelbuild/include | grep "^kernsrcver=" | cut -d \" -f 2` + fi + if test -z "$kernsrcver"; then + AC_MSG_RESULT([Not found]) + AC_MSG_ERROR([ + *** Cannot determine the version of the linux kernel source. Please + *** prepare the kernel before running this script]) + fi + AC_MSG_RESULT([$kernsrcver]) + kmoduledir=${INSTALL_MOD_PATH}/lib/modules/$kernsrcver + AC_SUBST(kernelsrc) + AC_SUBST(kmoduledir) + + if echo "$kernsrcver" | egrep -q ["^(2.4|2.6.[0-8]([^0-9]|\$))"]; then + bad_kernel_version=yes + AC_MSG_NOTICE([ +NOTE: Disabled building the kernel module, because this release only +NOTE: supports Linux versions 2.6.9 or later. You can use the kernel +NOTE: module from an earlier XIO release with the library from this +NOTE: release.]) + else + xio_configured=no + kernel_autoconf=$kernelbuild/include/linux/autoconf.h + AC_MSG_CHECKING([if XIO is configured in the kernel]) + if test -f $kernel_autoconf; then + if grep -q "^#define CONFIG_XIO 1" $kernel_autoconf || grep -q "^#define CONFIG_XIO_MODULE 1" $kernel_autoconf; then + xio_configured=yes + fi + fi + AC_MSG_RESULT([$xio_configured]) + if test -z "$enable_kernel_module" -a "$xio_configured" = yes; then + ENABLE_XIO_MODULE=n + fi + fi +fi + +if test "$ENABLE_XIO_MODULE" = n; then + AC_MSG_NOTICE([ +NOTE: Detected that XIO is already present in the kernel, so +NOTE: building of kernel module is disabled. 
To force building +NOTE: of kernel module use the '--enable-kernel-module' option.]) +fi + +if test "$enable_kernel_module" = no; then + ENABLE_XIO_MODULE=n +fi +if test "$bad_kernel_version" = yes; then + ENABLE_XIO_MODULE=n +fi + +AC_MSG_CHECKING([is ENABLE_XIO_MODULE defined]) +if test "$ENABLE_XIO_MODULE" = y; then + AC_MSG_RESULT([yes]) +else + AC_MSG_RESULT([no]) +fi + +AC_SUBST(ENABLE_XIO_MODULE) + +if test "$ENABLE_XIO_MODULE" = y; then + AC_MSG_CHECKING([if kernel defines kzalloc function]) + if egrep -qw "kzalloc" $kernelsrc/include/linux/slab.h; then + AC_DEFINE(HAVE_KZALLOC, 1, [kzalloc() is defined]) + AC_MSG_RESULT([yes]) + else + AC_MSG_RESULT([no]) + fi + + isuml=no + KERNELMAKE_PARAMS= + KERNELCPPFLAGS= + AC_MSG_CHECKING([if this is user mode linux]) + if test -f $kernelbuild/include/linux/autoconf.h && egrep -q "^#define CONFIG_(USERMODE|UML) 1" $kernelbuild/include/linux/autoconf.h; then + isuml=yes + KERNELMAKE_PARAMS="ARCH=um" + KERNELCPPFLAGS="-D__arch_um__ -DSUBARCH=\\\"i386\\\" -D_LARGEFILE64_SOURCE -I${kernelsrc}/arch/um/include -Derrno=kernel_errno -I${kernelsrc}/arch/um/kernel/tt/include -I${kernelsrc}/arch/um/kernel/skas/include" + fi + AC_MSG_RESULT([$isuml]) + if test "$kernelbuild" != "$kernelsrc"; then + KERNELMAKE_PARAMS="$KERNELMAKE_PARAMS O=$kernelbuild" + fi + AC_SUBST(KERNELMAKE_PARAMS) + AC_SUBST(KERNELCPPFLAGS) + AC_SUBST(KERNELCFLAGS) +fi + +AC_CONFIG_FILES([Makefile]) +AC_OUTPUT diff --git a/open_src/xio/src/kernel/xio/install-sh b/open_src/xio/src/kernel/xio/install-sh new file mode 100644 index 0000000..6781b98 --- /dev/null +++ b/open_src/xio/src/kernel/xio/install-sh @@ -0,0 +1,520 @@ +#!/bin/sh +# install - install a program, script, or datafile + +scriptversion=2009-04-28.21; # UTC + +# This originates from X11R5 (mit/util/scripts/install.sh), which was +# later released in X11R6 (xc/config/util/install.sh) with the +# following copyright and license. +# +# Copyright (C) 1994 X Consortium +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to +# deal in the Software without restriction, including without limitation the +# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +# sell copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# X CONSORTIUM BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN +# AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNEC- +# TION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +# +# Except as contained in this notice, the name of the X Consortium shall not +# be used in advertising or otherwise to promote the sale, use or other deal- +# ings in this Software without prior written authorization from the X Consor- +# tium. +# +# +# FSF changes to this file are in the public domain. +# +# Calling this script install-sh is preferred over install.sh, to prevent +# `make' implicit rules from creating a file called install from it +# when there is no Makefile. 
+# +# This script is compatible with the BSD install script, but was written +# from scratch. + +nl=' +' +IFS=" "" $nl" + +# set DOITPROG to echo to test this script + +# Don't use :- since 4.3BSD and earlier shells don't like it. +doit=${DOITPROG-} +if test -z "$doit"; then + doit_exec=exec +else + doit_exec=$doit +fi + +# Put in absolute file names if you don't have them in your path; +# or use environment vars. + +chgrpprog=${CHGRPPROG-chgrp} +chmodprog=${CHMODPROG-chmod} +chownprog=${CHOWNPROG-chown} +cmpprog=${CMPPROG-cmp} +cpprog=${CPPROG-cp} +mkdirprog=${MKDIRPROG-mkdir} +mvprog=${MVPROG-mv} +rmprog=${RMPROG-rm} +stripprog=${STRIPPROG-strip} + +posix_glob='?' +initialize_posix_glob=' + test "$posix_glob" != "?" || { + if (set -f) 2>/dev/null; then + posix_glob= + else + posix_glob=: + fi + } +' + +posix_mkdir= + +# Desired mode of installed file. +mode=0755 + +chgrpcmd= +chmodcmd=$chmodprog +chowncmd= +mvcmd=$mvprog +rmcmd="$rmprog -f" +stripcmd= + +src= +dst= +dir_arg= +dst_arg= + +copy_on_change=false +no_target_directory= + +usage="\ +Usage: $0 [OPTION]... [-T] SRCFILE DSTFILE + or: $0 [OPTION]... SRCFILES... DIRECTORY + or: $0 [OPTION]... -t DIRECTORY SRCFILES... + or: $0 [OPTION]... -d DIRECTORIES... + +In the 1st form, copy SRCFILE to DSTFILE. +In the 2nd and 3rd, copy all SRCFILES to DIRECTORY. +In the 4th, create DIRECTORIES. + +Options: + --help display this help and exit. + --version display version info and exit. + + -c (ignored) + -C install only if different (preserve the last data modification time) + -d create directories instead of installing files. + -g GROUP $chgrpprog installed files to GROUP. + -m MODE $chmodprog installed files to MODE. + -o USER $chownprog installed files to USER. + -s $stripprog installed files. + -t DIRECTORY install into DIRECTORY. + -T report an error if DSTFILE is a directory. + +Environment variables override the default commands: + CHGRPPROG CHMODPROG CHOWNPROG CMPPROG CPPROG MKDIRPROG MVPROG + RMPROG STRIPPROG +" + +while test $# -ne 0; do + case $1 in + -c) ;; + + -C) copy_on_change=true;; + + -d) dir_arg=true;; + + -g) chgrpcmd="$chgrpprog $2" + shift;; + + --help) echo "$usage"; exit $?;; + + -m) mode=$2 + case $mode in + *' '* | *' '* | *' +'* | *'*'* | *'?'* | *'['*) + echo "$0: invalid mode: $mode" >&2 + exit 1;; + esac + shift;; + + -o) chowncmd="$chownprog $2" + shift;; + + -s) stripcmd=$stripprog;; + + -t) dst_arg=$2 + shift;; + + -T) no_target_directory=true;; + + --version) echo "$0 $scriptversion"; exit $?;; + + --) shift + break;; + + -*) echo "$0: invalid option: $1" >&2 + exit 1;; + + *) break;; + esac + shift +done + +if test $# -ne 0 && test -z "$dir_arg$dst_arg"; then + # When -d is used, all remaining arguments are directories to create. + # When -t is used, the destination is already specified. + # Otherwise, the last argument is the destination. Remove it from $@. + for arg + do + if test -n "$dst_arg"; then + # $@ is not empty: it contains at least $arg. + set fnord "$@" "$dst_arg" + shift # fnord + fi + shift # arg + dst_arg=$arg + done +fi + +if test $# -eq 0; then + if test -z "$dir_arg"; then + echo "$0: no input file specified." >&2 + exit 1 + fi + # It's OK to call `install-sh -d' without argument. + # This can happen when creating conditional directories. + exit 0 +fi + +if test -z "$dir_arg"; then + trap '(exit $?); exit' 1 2 13 15 + + # Set umask so as not to create temps with too-generous modes. + # However, 'strip' requires both read and write access to temps. + case $mode in + # Optimize common cases. 
+ *644) cp_umask=133;; + *755) cp_umask=22;; + + *[0-7]) + if test -z "$stripcmd"; then + u_plus_rw= + else + u_plus_rw='% 200' + fi + cp_umask=`expr '(' 777 - $mode % 1000 ')' $u_plus_rw`;; + *) + if test -z "$stripcmd"; then + u_plus_rw= + else + u_plus_rw=,u+rw + fi + cp_umask=$mode$u_plus_rw;; + esac +fi + +for src +do + # Protect names starting with `-'. + case $src in + -*) src=./$src;; + esac + + if test -n "$dir_arg"; then + dst=$src + dstdir=$dst + test -d "$dstdir" + dstdir_status=$? + else + + # Waiting for this to be detected by the "$cpprog $src $dsttmp" command + # might cause directories to be created, which would be especially bad + # if $src (and thus $dsttmp) contains '*'. + if test ! -f "$src" && test ! -d "$src"; then + echo "$0: $src does not exist." >&2 + exit 1 + fi + + if test -z "$dst_arg"; then + echo "$0: no destination specified." >&2 + exit 1 + fi + + dst=$dst_arg + # Protect names starting with `-'. + case $dst in + -*) dst=./$dst;; + esac + + # If destination is a directory, append the input filename; won't work + # if double slashes aren't ignored. + if test -d "$dst"; then + if test -n "$no_target_directory"; then + echo "$0: $dst_arg: Is a directory" >&2 + exit 1 + fi + dstdir=$dst + dst=$dstdir/`basename "$src"` + dstdir_status=0 + else + # Prefer dirname, but fall back on a substitute if dirname fails. + dstdir=` + (dirname "$dst") 2>/dev/null || + expr X"$dst" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \ + X"$dst" : 'X\(//\)[^/]' \| \ + X"$dst" : 'X\(//\)$' \| \ + X"$dst" : 'X\(/\)' \| . 2>/dev/null || + echo X"$dst" | + sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{ + s//\1/ + q + } + /^X\(\/\/\)[^/].*/{ + s//\1/ + q + } + /^X\(\/\/\)$/{ + s//\1/ + q + } + /^X\(\/\).*/{ + s//\1/ + q + } + s/.*/./; q' + ` + + test -d "$dstdir" + dstdir_status=$? + fi + fi + + obsolete_mkdir_used=false + + if test $dstdir_status != 0; then + case $posix_mkdir in + '') + # Create intermediate dirs using mode 755 as modified by the umask. + # This is like FreeBSD 'install' as of 1997-10-28. + umask=`umask` + case $stripcmd.$umask in + # Optimize common cases. + *[2367][2367]) mkdir_umask=$umask;; + .*0[02][02] | .[02][02] | .[02]) mkdir_umask=22;; + + *[0-7]) + mkdir_umask=`expr $umask + 22 \ + - $umask % 100 % 40 + $umask % 20 \ + - $umask % 10 % 4 + $umask % 2 + `;; + *) mkdir_umask=$umask,go-w;; + esac + + # With -d, create the new directory with the user-specified mode. + # Otherwise, rely on $mkdir_umask. + if test -n "$dir_arg"; then + mkdir_mode=-m$mode + else + mkdir_mode= + fi + + posix_mkdir=false + case $umask in + *[123567][0-7][0-7]) + # POSIX mkdir -p sets u+wx bits regardless of umask, which + # is incompatible with FreeBSD 'install' when (umask & 300) != 0. + ;; + *) + tmpdir=${TMPDIR-/tmp}/ins$RANDOM-$$ + trap 'ret=$?; rmdir "$tmpdir/d" "$tmpdir" 2>/dev/null; exit $ret' 0 + + if (umask $mkdir_umask && + exec $mkdirprog $mkdir_mode -p -- "$tmpdir/d") >/dev/null 2>&1 + then + if test -z "$dir_arg" || { + # Check for POSIX incompatibilities with -m. + # HP-UX 11.23 and IRIX 6.5 mkdir -m -p sets group- or + # other-writeable bit of parent directory when it shouldn't. + # FreeBSD 6.1 mkdir -m -p sets mode of existing directory. 
+ ls_ld_tmpdir=`ls -ld "$tmpdir"` + case $ls_ld_tmpdir in + d????-?r-*) different_mode=700;; + d????-?--*) different_mode=755;; + *) false;; + esac && + $mkdirprog -m$different_mode -p -- "$tmpdir" && { + ls_ld_tmpdir_1=`ls -ld "$tmpdir"` + test "$ls_ld_tmpdir" = "$ls_ld_tmpdir_1" + } + } + then posix_mkdir=: + fi + rmdir "$tmpdir/d" "$tmpdir" + else + # Remove any dirs left behind by ancient mkdir implementations. + rmdir ./$mkdir_mode ./-p ./-- 2>/dev/null + fi + trap '' 0;; + esac;; + esac + + if + $posix_mkdir && ( + umask $mkdir_umask && + $doit_exec $mkdirprog $mkdir_mode -p -- "$dstdir" + ) + then : + else + + # The umask is ridiculous, or mkdir does not conform to POSIX, + # or it failed possibly due to a race condition. Create the + # directory the slow way, step by step, checking for races as we go. + + case $dstdir in + /*) prefix='/';; + -*) prefix='./';; + *) prefix='';; + esac + + eval "$initialize_posix_glob" + + oIFS=$IFS + IFS=/ + $posix_glob set -f + set fnord $dstdir + shift + $posix_glob set +f + IFS=$oIFS + + prefixes= + + for d + do + test -z "$d" && continue + + prefix=$prefix$d + if test -d "$prefix"; then + prefixes= + else + if $posix_mkdir; then + (umask=$mkdir_umask && + $doit_exec $mkdirprog $mkdir_mode -p -- "$dstdir") && break + # Don't fail if two instances are running concurrently. + test -d "$prefix" || exit 1 + else + case $prefix in + *\'*) qprefix=`echo "$prefix" | sed "s/'/'\\\\\\\\''/g"`;; + *) qprefix=$prefix;; + esac + prefixes="$prefixes '$qprefix'" + fi + fi + prefix=$prefix/ + done + + if test -n "$prefixes"; then + # Don't fail if two instances are running concurrently. + (umask $mkdir_umask && + eval "\$doit_exec \$mkdirprog $prefixes") || + test -d "$dstdir" || exit 1 + obsolete_mkdir_used=true + fi + fi + fi + + if test -n "$dir_arg"; then + { test -z "$chowncmd" || $doit $chowncmd "$dst"; } && + { test -z "$chgrpcmd" || $doit $chgrpcmd "$dst"; } && + { test "$obsolete_mkdir_used$chowncmd$chgrpcmd" = false || + test -z "$chmodcmd" || $doit $chmodcmd $mode "$dst"; } || exit 1 + else + + # Make a couple of temp file names in the proper directory. + dsttmp=$dstdir/_inst.$$_ + rmtmp=$dstdir/_rm.$$_ + + # Trap to clean up those temp files at exit. + trap 'ret=$?; rm -f "$dsttmp" "$rmtmp" && exit $ret' 0 + + # Copy the file name to the temp name. + (umask $cp_umask && $doit_exec $cpprog "$src" "$dsttmp") && + + # and set any options; do chmod last to preserve setuid bits. + # + # If any of these fail, we abort the whole thing. If we want to + # ignore errors from any of these, just make sure not to ignore + # errors from the above "$doit $cpprog $src $dsttmp" command. + # + { test -z "$chowncmd" || $doit $chowncmd "$dsttmp"; } && + { test -z "$chgrpcmd" || $doit $chgrpcmd "$dsttmp"; } && + { test -z "$stripcmd" || $doit $stripcmd "$dsttmp"; } && + { test -z "$chmodcmd" || $doit $chmodcmd $mode "$dsttmp"; } && + + # If -C, don't bother to copy if it wouldn't change the file. + if $copy_on_change && + old=`LC_ALL=C ls -dlL "$dst" 2>/dev/null` && + new=`LC_ALL=C ls -dlL "$dsttmp" 2>/dev/null` && + + eval "$initialize_posix_glob" && + $posix_glob set -f && + set X $old && old=:$2:$4:$5:$6 && + set X $new && new=:$2:$4:$5:$6 && + $posix_glob set +f && + + test "$old" = "$new" && + $cmpprog "$dst" "$dsttmp" >/dev/null 2>&1 + then + rm -f "$dsttmp" + else + # Rename the file to the real destination. 
+ $doit $mvcmd -f "$dsttmp" "$dst" 2>/dev/null || + + # The rename failed, perhaps because mv can't rename something else + # to itself, or perhaps because mv is so ancient that it does not + # support -f. + { + # Now remove or move aside any old file at destination location. + # We try this two ways since rm can't unlink itself on some + # systems and the destination file might be busy for other + # reasons. In this case, the final cleanup might fail but the new + # file should still install successfully. + { + test ! -f "$dst" || + $doit $rmcmd -f "$dst" 2>/dev/null || + { $doit $mvcmd -f "$dst" "$rmtmp" 2>/dev/null && + { $doit $rmcmd -f "$rmtmp" 2>/dev/null; :; } + } || + { echo "$0: cannot unlink or rename $dst" >&2 + (exit 1); exit 1 + } + } && + + # Now rename the file to the real destination. + $doit $mvcmd "$dsttmp" "$dst" + } + fi || exit 1 + + trap '' 0 + fi +done + +# Local variables: +# eval: (add-hook 'write-file-hooks 'time-stamp) +# time-stamp-start: "scriptversion=" +# time-stamp-format: "%:y-%02m-%02d.%02H" +# time-stamp-time-zone: "UTC" +# time-stamp-end: "; # UTC" +# End: diff --git a/open_src/xio/src/kernel/xio/xio_context.c b/open_src/xio/src/kernel/xio/xio_context.c new file mode 100644 index 0000000..dac30b6 --- /dev/null +++ b/open_src/xio/src/kernel/xio/xio_context.c @@ -0,0 +1,767 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "libxio.h" +#include "xio_log.h" +#include +#include "xio_common.h" +#include "xio_observer.h" +#include "xio_idr.h" +#include "xio_ev_data.h" +#include "xio_ev_loop.h" +#include "xio_objpool.h" +#include "xio_workqueue.h" +#include "xio_context.h" +#include "xio_mempool.h" +#include "xio_protocol.h" +#include "xio_mbuf.h" +#include "xio_task.h" +#include "xio_transport.h" + +#define MSGPOOL_INIT_NR 8 +#define MSGPOOL_GROW_NR 64 + +/*---------------------------------------------------------------------------*/ +/* xio_context_reg_observer */ +/*---------------------------------------------------------------------------*/ +int xio_context_reg_observer(struct xio_context *ctx, + struct xio_observer *observer) +{ + xio_observable_reg_observer(&ctx->observable, observer); + + return 0; +} +EXPORT_SYMBOL(xio_context_reg_observer); + +/*---------------------------------------------------------------------------*/ +/* xio_context_unreg_observer */ +/*---------------------------------------------------------------------------*/ +void xio_context_unreg_observer(struct xio_context *ctx, + struct xio_observer *observer) +{ + xio_observable_unreg_observer(&ctx->observable, observer); +} +EXPORT_SYMBOL(xio_context_unreg_observer); + +/*---------------------------------------------------------------------------*/ +/* xio_ctx_create */ +/*---------------------------------------------------------------------------*/ +struct xio_context *xio_context_create(struct xio_context_params *ctx_params, + int polling_timeout, + int cpu_hint) +{ + struct xio_context *ctx; + struct xio_loop_ops *loop_ops; + struct task_struct *worker; + struct xio_transport *transport; + int flags, cpu; + + if (!ctx_params) { + xio_set_error(EINVAL); + ERROR_LOG("ctx_params is NULL\n"); + goto cleanup0; + + } + + loop_ops = ctx_params->loop_ops; + worker = ctx_params->worker; + flags = ctx_params->flags; + + if (cpu_hint > 0 && cpu_hint >= num_online_cpus()) { + xio_set_error(EINVAL); + ERROR_LOG("cpu_hint(%d) >= num_online_cpus(%d)\n", + cpu_hint, num_online_cpus()); + goto cleanup0; + } + + if ((flags == XIO_LOOP_USER_LOOP) && + (!(loop_ops && loop_ops->add_event && loop_ops->ev_loop))) { + xio_set_error(EINVAL); + ERROR_LOG("loop_ops and ev_loop and ev_loop_add_event are " \ + "mandatory with loop_ops\n"); + goto cleanup0; + } + + xio_read_logging_level(); + + /* no need to disable preemption */ + cpu = raw_smp_processor_id(); + + if (cpu == -1) + goto cleanup0; + + /* allocate new context */ + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) { + xio_set_error(ENOMEM); + ERROR_LOG("kzalloc failed\n"); + goto cleanup0; + } + + if (cpu_hint < 0) + cpu_hint = cpu; + + ctx->run_private = 0; + ctx->user_context = ctx_params->user_context; + ctx->flags = flags; + ctx->cpuid = cpu_hint; + ctx->nodeid = cpu_to_node(cpu_hint); + ctx->polling_timeout = polling_timeout; + ctx->prealloc_xio_inline_bufs = + !!ctx_params->prealloc_xio_inline_bufs; + ctx->rq_depth = ctx_params->rq_depth; + + if (!ctx_params->max_conns_per_ctx) + ctx->max_conns_per_ctx = 100; + else + ctx->max_conns_per_ctx = + max(ctx_params->max_conns_per_ctx , 2); + + ctx->workqueue = xio_workqueue_create(ctx); + if (!ctx->workqueue) { + xio_set_error(ENOMEM); + ERROR_LOG("xio_workqueue_init failed.\n"); + goto cleanup1; + } + ctx->msg_pool = xio_objpool_create(sizeof(struct xio_msg), + MSGPOOL_INIT_NR, MSGPOOL_GROW_NR); + if (!ctx->msg_pool) { + xio_set_error(ENOMEM); + 
ERROR_LOG("context's msg_pool create failed. %m\n"); + goto cleanup2; + } + + XIO_OBSERVABLE_INIT(&ctx->observable, ctx); + INIT_LIST_HEAD(&ctx->ctx_list); + + switch (flags) { + case XIO_LOOP_USER_LOOP: + break; + case XIO_LOOP_GIVEN_THREAD: + set_cpus_allowed_ptr(worker, cpumask_of(cpu_hint)); + ctx->worker = (uint64_t)worker; + break; + case XIO_LOOP_TASKLET: + break; + case XIO_LOOP_WORKQUEUE: + break; + default: + ERROR_LOG("wrong type. %u\n", flags); + goto cleanup3; + } + + ctx->ev_loop = xio_ev_loop_init(flags, ctx, loop_ops); + if (!ctx->ev_loop) + goto cleanup3; + + ctx->stats.hertz = HZ; + /* Initialize default counters' name */ + ctx->stats.name[XIO_STAT_TX_MSG] = kstrdup("TX_MSG", GFP_KERNEL); + ctx->stats.name[XIO_STAT_RX_MSG] = kstrdup("RX_MSG", GFP_KERNEL); + ctx->stats.name[XIO_STAT_TX_BYTES] = kstrdup("TX_BYTES", GFP_KERNEL); + ctx->stats.name[XIO_STAT_RX_BYTES] = kstrdup("RX_BYTES", GFP_KERNEL); + ctx->stats.name[XIO_STAT_DELAY] = kstrdup("DELAY", GFP_KERNEL); + ctx->stats.name[XIO_STAT_APPDELAY] = kstrdup("APPDELAY", GFP_KERNEL); + + /* initialize rdma pools only */ + transport = xio_get_transport("rdma"); + if (transport && ctx->prealloc_xio_inline_bufs) { + int retval = xio_ctx_pool_create(ctx, XIO_PROTO_RDMA, + XIO_CONTEXT_POOL_CLASS_INITIAL); + if (retval) { + ERROR_LOG("Failed to create initial pool. ctx:%p\n", ctx); + goto cleanup2; + } + retval = xio_ctx_pool_create(ctx, XIO_PROTO_RDMA, + XIO_CONTEXT_POOL_CLASS_PRIMARY); + if (retval) { + ERROR_LOG("Failed to create primary pool. ctx:%p\n", ctx); + goto cleanup2; + } + } + spin_lock_init(&ctx->ctx_list_lock); + + xio_idr_add_uobj(usr_idr, ctx, "xio_context"); + return ctx; + +cleanup3: + xio_objpool_destroy(ctx->msg_pool); + +cleanup2: + xio_workqueue_destroy(ctx->workqueue); + +cleanup1: + kfree(ctx); + +cleanup0: + ERROR_LOG("xio_ctx_open failed\n"); + + return NULL; +} +EXPORT_SYMBOL(xio_context_create); + +/*---------------------------------------------------------------------------*/ +/* xio_modify_context */ +/*---------------------------------------------------------------------------*/ +int xio_modify_context(struct xio_context *ctx, + struct xio_context_attr *attr, + int attr_mask) +{ + if (!ctx || !attr) { + xio_set_error(EINVAL); + ERROR_LOG("invalid parameters\n"); + return -1; + } + + if (attr_mask & XIO_CONTEXT_ATTR_USER_CTX) + ctx->user_context = attr->user_context; + + return 0; +} +EXPORT_SYMBOL(xio_modify_context); + +/*---------------------------------------------------------------------------*/ +/* xio_query_context */ +/*---------------------------------------------------------------------------*/ +int xio_query_context(struct xio_context *ctx, + struct xio_context_attr *attr, + int attr_mask) +{ + if (!ctx || !attr) { + xio_set_error(EINVAL); + ERROR_LOG("invalid parameters\n"); + return -1; + } + + if (attr_mask & XIO_CONTEXT_ATTR_USER_CTX) + attr->user_context = ctx->user_context; + + return 0; +} +EXPORT_SYMBOL(xio_query_context); + +/*---------------------------------------------------------------------------*/ +/* xio_ctx_tasks_pools_destroy */ +/*---------------------------------------------------------------------------*/ +static void xio_ctx_task_pools_destroy(struct xio_context *ctx) +{ + int i; + + for (i = 0; i < XIO_PROTO_LAST; i++) { + if (ctx->initial_tasks_pool[i]) { + xio_tasks_pool_free_tasks(ctx->initial_tasks_pool[i]); + xio_tasks_pool_destroy(ctx->initial_tasks_pool[i]); + ctx->initial_tasks_pool[i] = NULL; + } + if (ctx->primary_tasks_pool[i]) { + 
xio_tasks_pool_free_tasks(ctx->primary_tasks_pool[i]); + xio_tasks_pool_destroy(ctx->primary_tasks_pool[i]); + ctx->primary_tasks_pool[i] = NULL; + } + } +} + +/*---------------------------------------------------------------------------*/ +/* xio_context_destroy */ +/*---------------------------------------------------------------------------*/ +void xio_destroy_context_continue(struct work_struct *work) +{ + xio_work_handle_t *xio_work; + struct xio_context *ctx; + int i; + + xio_work = container_of(work, xio_work_handle_t, work); + ctx = container_of(xio_work, struct xio_context, destroy_ctx_work); + if (ctx->run_private) + ERROR_LOG("not all observers finished! run_private=%d\n", + ctx->run_private); + + xio_observable_notify_all_observers(&ctx->observable, + XIO_CONTEXT_EVENT_POST_CLOSE, NULL); + + if (!xio_observable_is_empty(&ctx->observable)) + ERROR_LOG("context destroy: observers leak - %p\n", ctx); + + xio_observable_unreg_all_observers(&ctx->observable); + + for (i = 0; i < XIO_STAT_LAST; i++) + kfree(ctx->stats.name[i]); + + xio_workqueue_destroy(ctx->workqueue); + xio_objpool_destroy(ctx->msg_pool); + + /* can free only xio created loop */ + if (ctx->flags != XIO_LOOP_USER_LOOP) + xio_ev_loop_destroy(ctx->ev_loop); + + ctx->ev_loop = NULL; + + XIO_OBSERVABLE_DESTROY(&ctx->observable); + + xio_ctx_task_pools_destroy(ctx); + + if (ctx->mempool) { + xio_mempool_destroy(ctx->mempool); + ctx->mempool = NULL; + } + + kfree(ctx); +} +EXPORT_SYMBOL(xio_destroy_context_continue); + +void xio_context_destroy(struct xio_context *ctx) +{ + int found; + + found = xio_idr_lookup_uobj(usr_idr, ctx); + if (found) { + xio_idr_remove_uobj(usr_idr, ctx); + } else { + ERROR_LOG("context not found:%p\n", ctx); + xio_set_error(XIO_E_USER_OBJ_NOT_FOUND); + return; + } + + ctx->run_private = 0; + xio_observable_notify_all_observers(&ctx->observable, + XIO_CONTEXT_EVENT_CLOSE, NULL); + /* allow internally to run the loop for final cleanup */ + if (ctx->run_private) + xio_context_run_loop(ctx); + if (ctx->run_private == 0) + xio_destroy_context_continue(&ctx->destroy_ctx_work.work); +} +EXPORT_SYMBOL(xio_context_destroy); + +/*---------------------------------------------------------------------------*/ +/* xio_ctx_add_delayed_work */ +/*---------------------------------------------------------------------------*/ +int xio_ctx_add_delayed_work(struct xio_context *ctx, + int msec_duration, void *data, + void (*timer_fn)(void *data), + xio_ctx_delayed_work_t *work) +{ + int retval; + + /* test if delayed work is pending */ + if (xio_is_delayed_work_pending(work)) + return 0; + + retval = xio_workqueue_add_delayed_work(ctx->workqueue, + msec_duration, data, + timer_fn, work); + if (retval) { + xio_set_error(retval); + ERROR_LOG("xio_workqueue_add_delayed_work failed. err=%d\n", + retval); + } + + return retval; +} + +/*---------------------------------------------------------------------------*/ +/* xio_ctx_del_delayed_work */ +/*---------------------------------------------------------------------------*/ +int xio_ctx_del_delayed_work(struct xio_context *ctx, + xio_ctx_delayed_work_t *work) +{ + int retval; + + /* test if delayed work is pending */ + if (!xio_is_delayed_work_pending(work)) + return 0; + + retval = xio_workqueue_del_delayed_work(ctx->workqueue, work); + if (retval) { + xio_set_error(retval); + ERROR_LOG("workqueue_del_delayed_work failed. 
err=%d\n", + retval); + } + + return retval; +} + +int xio_context_run_loop(struct xio_context *ctx) +{ + struct xio_ev_loop *ev_loop = (struct xio_ev_loop *)ctx->ev_loop; + + return ev_loop->run(ev_loop->loop_object); +} +EXPORT_SYMBOL(xio_context_run_loop); + +void xio_context_stop_loop(struct xio_context *ctx) +{ + struct xio_ev_loop *ev_loop = (struct xio_ev_loop *)ctx->ev_loop; + + ev_loop->stop(ev_loop->loop_object); +} +EXPORT_SYMBOL(xio_context_stop_loop); + +int xio_context_add_event(struct xio_context *ctx, struct xio_ev_data *data) +{ + struct xio_ev_loop *ev_loop = (struct xio_ev_loop *)ctx->ev_loop; + + return ev_loop->add_event(ev_loop->loop_object, data); +} +EXPORT_SYMBOL(xio_context_add_event); + +/* + * Suspend the current handler run. + * Note: Not protected against a race. Another thread may reactivate the event. + */ +/*---------------------------------------------------------------------------*/ +/* xio_context_disable_event */ +/*---------------------------------------------------------------------------*/ +void xio_context_disable_event(struct xio_ev_data *data) +{ + clear_bit(XIO_EV_HANDLER_ENABLED, &data->states); +} +EXPORT_SYMBOL(xio_context_disable_event); + +/* + * Check if the event is pending. + * Return true if the event is pending in any list. + * Return false once the event is removed from the list in order to be executed. + * (When inside the event handler, the event is no longer pending) + * Note: Not protected against a race. Another thread may reactivate the event. + */ +/*---------------------------------------------------------------------------*/ +/* xio_context_is_pending_event */ +/*---------------------------------------------------------------------------*/ +int xio_context_is_pending_event(struct xio_ev_data *data) +{ + return test_bit(XIO_EV_HANDLER_PENDING, &data->states); +} +EXPORT_SYMBOL(xio_context_is_pending_event); + +int xio_context_is_loop_stopping(struct xio_context *ctx) +{ + struct xio_ev_loop *ev_loop = (struct xio_ev_loop *)ctx->ev_loop; + + return ev_loop->is_stopping(ev_loop->loop_object); +} +EXPORT_SYMBOL(xio_context_is_loop_stopping); + +/*---------------------------------------------------------------------------*/ +/* xio_ctx_add_work */ +/*---------------------------------------------------------------------------*/ +int xio_ctx_add_work(struct xio_context *ctx, + void *data, + void (*function)(void *data), + xio_ctx_work_t *work) +{ + int retval; + + /* test if work is pending */ + if (xio_is_work_pending(work)) + return 0; + + retval = xio_workqueue_add_work(ctx->workqueue, + data, function, work); + if (retval) { + xio_set_error(retval); + ERROR_LOG("xio_workqueue_add_work failed. err=%d\n", retval); + } + + return retval; +} + +/*---------------------------------------------------------------------------*/ +/* xio_ctx_set_work_destructor */ +/*---------------------------------------------------------------------------*/ +int xio_ctx_set_work_destructor( + struct xio_context *ctx, void *data, + void (*destructor)(void *data), + xio_ctx_work_t *work) +{ + int retval; + + /* test if work is pending */ + if (xio_is_work_pending(work)) + return 0; + + retval = xio_workqueue_set_work_destructor( + ctx->workqueue, + data, destructor, work); + if (retval) { + xio_set_error(retval); + ERROR_LOG("xio_workqueue_set_work_destructor failed. 
%m\n"); + } + + return retval; +} + +/*---------------------------------------------------------------------------*/ +/* xio_ctx_is_work_in_handler */ +/*---------------------------------------------------------------------------*/ +int xio_ctx_is_work_in_handler(struct xio_context *ctx, xio_ctx_work_t *work) +{ + /* test if work is pending */ + if (xio_is_work_pending(work)) + return 0; + + return xio_workqueue_is_work_in_handler(ctx->workqueue, work); +} + +/*---------------------------------------------------------------------------*/ +/* xio_ctx_del_work */ +/*---------------------------------------------------------------------------*/ +int xio_ctx_del_work(struct xio_context *ctx, + xio_ctx_work_t *work) + +{ + int retval; + + /* test if work is pending */ + if (!xio_is_work_pending(work)) + return 0; + + retval = xio_workqueue_del_work(ctx->workqueue, work); + if (retval) { + xio_set_error(retval); + ERROR_LOG("xio_workqueue_del_work failed. err=%d\n", retval); + } + + return retval; +} + +/*---------------------------------------------------------------------------*/ +/* xio_mempool_get */ +/*---------------------------------------------------------------------------*/ +struct xio_mempool *xio_mempool_get(struct xio_context *ctx) +{ + if (ctx->mempool) + return ctx->mempool; + + ctx->mempool = xio_mempool_create(); + + if (!ctx->mempool) { + ERROR_LOG("xio_mempool_create failed\n"); + return NULL; + } + return ctx->mempool; +} +EXPORT_SYMBOL(xio_mempool_get); + +/* + * should be called only from loop context + */ +/*---------------------------------------------------------------------------*/ +/* xio_context_destroy_resume */ +/*---------------------------------------------------------------------------*/ +void xio_context_destroy_resume(struct xio_context *ctx) +{ + if (ctx->run_private) { + if (!--ctx->run_private) { + switch (ctx->flags) { + case XIO_LOOP_GIVEN_THREAD: + xio_context_stop_loop(ctx); + break; + case XIO_LOOP_WORKQUEUE: + INIT_WORK(&ctx->destroy_ctx_work.work, + xio_destroy_context_continue); + schedule_work(&ctx->destroy_ctx_work.work); + break; + default: + ERROR_LOG("Not supported type. 
%d\n", ctx->flags); + break; + } + } + } +} +EXPORT_SYMBOL(xio_context_destroy_resume); + +/*---------------------------------------------------------------------------*/ +/* xio_context_set_poll_completions_fn */ +/*---------------------------------------------------------------------------*/ +void xio_context_set_poll_completions_fn( + struct xio_context *ctx, + poll_completions_fn_t poll_completions_fn, + void *poll_completions_ctx) +{ + ctx->poll_completions_ctx = poll_completions_ctx; + ctx->poll_completions_fn = poll_completions_fn; +} +EXPORT_SYMBOL(xio_context_set_poll_completions_fn); + +/*---------------------------------------------------------------------------*/ +/* xio_context_poll_completions */ +/*---------------------------------------------------------------------------*/ +int xio_context_poll_completions(struct xio_context *ctx, int timeout_us) +{ + if (ctx->poll_completions_fn) + return ctx->poll_completions_fn(ctx->poll_completions_ctx, + timeout_us); + + return 0; +} +EXPORT_SYMBOL(xio_context_poll_completions); + +/*---------------------------------------------------------------------------*/ +/* xio_ctx_pool_create */ +/*---------------------------------------------------------------------------*/ +int xio_ctx_pool_create(struct xio_context *ctx, enum xio_proto proto, + enum xio_context_pool_class pool_cls) +{ + struct xio_tasks_pool_ops *pool_ops; + struct xio_tasks_pool **tasks_pool; + struct xio_transport *transport; + struct xio_tasks_pool_params params; + char pool_name[64]; + const char *proto_str = xio_proto_str(proto); + + /* get the transport's proto */ + transport = xio_get_transport(proto_str); + if (!transport) { + ERROR_LOG("failed to load %s transport layer.\n", proto_str); + ERROR_LOG("validate that your system support %s " \ + "and the accelio's %s module is loaded\n", + proto_str, proto_str); + xio_set_error(ENOPROTOOPT); + return -1; + } + + if (transport->get_pools_setup_ops) { + if (!ctx->primary_pool_ops[proto] || + !ctx->initial_pool_ops[proto]) + transport->get_pools_setup_ops( + NULL, + &ctx->initial_pool_ops[proto], + &ctx->primary_pool_ops[proto]); + } else { + ERROR_LOG("transport does not implement " \ + "\"get_pools_setup_ops\"\n"); + return -1; + } + + switch(pool_cls) { + case XIO_CONTEXT_POOL_CLASS_INITIAL: + tasks_pool = &ctx->initial_tasks_pool[proto]; + pool_ops = ctx->initial_pool_ops[proto]; + sprintf(pool_name, "ctx:%p - initial_pool_%s", ctx, proto_str); + + break; + case XIO_CONTEXT_POOL_CLASS_PRIMARY: + tasks_pool = &ctx->primary_tasks_pool[proto]; + pool_ops = ctx->primary_pool_ops[proto]; + sprintf(pool_name, "ctx:%p - primary_pool_%s", ctx, proto_str); + break; + default: + xio_set_error(EINVAL); + ERROR_LOG("unknown pool class\n"); + return -1; + }; + + /* if already exist */ + if (*tasks_pool) + return 0; + + if (!pool_ops) + return -1; + + if (!pool_ops->pool_get_params || + !pool_ops->slab_pre_create || + !pool_ops->slab_init_task || + !pool_ops->pool_post_create || + !pool_ops->slab_destroy) + return -1; + + /* get pool properties from the transport */ + memset(¶ms, 0, sizeof(params)); + + pool_ops->pool_get_params(NULL, + (int *)¶ms.start_nr, + (int *)¶ms.max_nr, + (int *)¶ms.alloc_nr, + (int *)¶ms.pool_dd_data_sz, + (int *)¶ms.slab_dd_data_sz, + (int *)¶ms.task_dd_data_sz); + if (ctx->prealloc_xio_inline_bufs) { + params.start_nr = params.max_nr; + params.alloc_nr = 0; + } + + params.pool_hooks.slab_pre_create = + (int (*)(void *, int, void *, void *)) + pool_ops->slab_pre_create; + 
params.pool_hooks.slab_post_create = (int (*)(void *, void *, void *)) + pool_ops->slab_post_create; + params.pool_hooks.slab_destroy = (int (*)(void *, void *, void *)) + pool_ops->slab_destroy; + params.pool_hooks.slab_init_task = + (int (*)(void *, void *, void *, int, struct xio_task *)) + pool_ops->slab_init_task; + params.pool_hooks.slab_uninit_task = + (int (*)(void *, void *, void *, struct xio_task *)) + pool_ops->slab_uninit_task; + params.pool_hooks.slab_remap_task = + (int (*)(void *, void *, void *, void *, struct xio_task *)) + pool_ops->slab_remap_task; + params.pool_hooks.pool_pre_create = (int (*)(void *, void *, void *)) + pool_ops->pool_pre_create; + params.pool_hooks.pool_post_create = (int (*)(void *, void *, void *)) + pool_ops->pool_post_create; + params.pool_hooks.pool_destroy = (int (*)(void *, void *, void *)) + pool_ops->pool_destroy; + params.pool_hooks.task_pre_put = (int (*)(void *, struct xio_task *)) + pool_ops->task_pre_put; + params.pool_hooks.task_post_get = (int (*)(void *, struct xio_task *)) + pool_ops->task_post_get; + + params.pool_name = kstrdup(pool_name, GFP_KERNEL); + + /* initialize the tasks pool */ + *tasks_pool = xio_tasks_pool_create(¶ms); + if (!*tasks_pool) { + ERROR_LOG("xio_tasks_pool_create failed\n"); + return -1; + } + + return 0; +} + + diff --git a/open_src/xio/src/kernel/xio/xio_context_priv.h b/open_src/xio/src/kernel/xio/xio_context_priv.h new file mode 100644 index 0000000..0d5e6f9 --- /dev/null +++ b/open_src/xio/src/kernel/xio/xio_context_priv.h @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ +#ifndef XIO_CONTEXT_PRIV_H_ +#define XIO_CONTEXT_PRIV_H_ + +struct xio_mempool *xio_mempool_get(struct xio_context *ctx); + +#endif /* XIO_CONTEXT_PRIV_H_ */ diff --git a/open_src/xio/src/kernel/xio/xio_ev_data.h b/open_src/xio/src/kernel/xio/xio_ev_data.h new file mode 100644 index 0000000..6e1c98c --- /dev/null +++ b/open_src/xio/src/kernel/xio/xio_ev_data.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef XIO_EV_DATA_H +#define XIO_EV_DATA_H + +typedef struct xio_ev_data xio_ev_data_t; + +#endif + diff --git a/open_src/xio/src/kernel/xio/xio_ev_loop.c b/open_src/xio/src/kernel/xio/xio_ev_loop.c new file mode 100644 index 0000000..7c3365f --- /dev/null +++ b/open_src/xio/src/kernel/xio/xio_ev_loop.c @@ -0,0 +1,574 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#include +#include +#include +#include +#include +#include +#include + +#include +#include "libxio.h" +#include "xio_log.h" +#include "xio_observer.h" +#include "xio_common.h" +#include "xio_ev_data.h" +#include "xio_ev_loop.h" +#include "xio_objpool.h" +#include "xio_workqueue.h" +#include "xio_context.h" + +static void xio_append_ordered(struct llist_node *first, + struct llist_node *last, + struct xio_ev_loop *loop) +{ + if (loop->first) + loop->last->next = first; + else + loop->first = first; + loop->last = last; +} + +/*---------------------------------------------------------------------------*/ +/* forward declarations of private API */ +/*---------------------------------------------------------------------------*/ + +static int priv_ev_loop_run(void *loop_hndl); +static void priv_ev_loop_stop(void *loop_hndl); +static int priv_ev_is_loop_stopping(void *loop_hndl); + +static void priv_ev_loop_run_tasklet(unsigned long data); +static void priv_ev_loop_run_work(struct work_struct *work); + +static void priv_ev_loop_stop_thread(void *loop_hndl); + +static int priv_ev_add_thread(void *loop_hndl, struct xio_ev_data *event); +static int priv_ev_add_tasklet(void *loop_hndl, struct xio_ev_data *event); +static int priv_ev_add_workqueue(void *loop_hndl, struct xio_ev_data *event); + +/*---------------------------------------------------------------------------*/ +/* xio_ev_loop_init */ +/*---------------------------------------------------------------------------*/ +void *xio_ev_loop_init(unsigned long flags, struct xio_context *ctx, + struct xio_loop_ops *loop_ops) +{ + struct xio_ev_loop *loop; + char queue_name[64]; + + loop = kzalloc(sizeof(*loop), GFP_KERNEL); + if (!loop) { + xio_set_error(ENOMEM); + ERROR_LOG("kmalloc failed. 
%m\n"); + goto cleanup0; + } + + set_bit(XIO_EV_LOOP_STOP, &loop->states); + init_completion(&loop->complete); + + init_llist_head(&loop->ev_llist); + loop->first = NULL; + loop->last = NULL; + + /* use default implementation */ + loop->run = priv_ev_loop_run; + loop->stop = priv_ev_loop_stop; + loop->is_stopping = priv_ev_is_loop_stopping; + loop->loop_object = loop; + + switch (flags) { + case XIO_LOOP_USER_LOOP: + /* override with user provided routines and object */ + loop->run = loop_ops->run; + loop->stop = loop_ops->stop; + loop->add_event = loop_ops->add_event; + loop->loop_object = loop_ops->ev_loop; + break; + case XIO_LOOP_GIVEN_THREAD: + loop->stop = priv_ev_loop_stop_thread; + loop->add_event = priv_ev_add_thread; + init_waitqueue_head(&loop->wait); + break; + case XIO_LOOP_TASKLET: + loop->add_event = priv_ev_add_tasklet; + tasklet_init(&loop->tasklet, priv_ev_loop_run_tasklet, + (unsigned long)loop); + break; + case XIO_LOOP_WORKQUEUE: + /* temporary (also change to single thread) */ + sprintf(queue_name, "xio-%p", loop); + /* check flags and backward compatibility */ +#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 36) + loop->workqueue = create_workqueue(queue_name); +#else + loop->workqueue = alloc_workqueue(queue_name, + WQ_MEM_RECLAIM | WQ_HIGHPRI, + 0); +#endif + if (!loop->workqueue) { + ERROR_LOG("workqueue create failed.\n"); + goto cleanup1; + } + loop->add_event = priv_ev_add_workqueue; + break; + default: + ERROR_LOG("wrong type. %lu\n", flags); + goto cleanup1; + } + + loop->flags = flags; + loop->ctx = ctx; + + return loop; + +cleanup1: + clear_bit(XIO_EV_LOOP_STOP, &loop->states); + kfree(loop); +cleanup0: + ERROR_LOG("event loop creation failed.\n"); + return NULL; +} + +/*---------------------------------------------------------------------------*/ +/* xio_ev_loop_destroy */ +/*---------------------------------------------------------------------------*/ +void xio_ev_loop_destroy(void *loop_hndl) +{ + struct xio_ev_loop *loop = (struct xio_ev_loop *)loop_hndl; + + if (!loop) + return; + + if (test_bit(XIO_EV_LOOP_IN_HANDLER, &loop->states)) { + ERROR_LOG("Can't destroy the loop from within handlers.\n"); + return; + } + + if (test_and_set_bit(XIO_EV_LOOP_DOWN, &loop->states)) { + ERROR_LOG("Down already in progress.\n"); + return; + } + + set_bit(XIO_EV_LOOP_STOP, &loop->states); + + /* TODO: Clean all unhandled events !!!! 
*/ + + switch (loop->flags) { + case XIO_LOOP_GIVEN_THREAD: + if (!test_and_set_bit(XIO_EV_LOOP_WAKE, &loop->states)) + wake_up_interruptible(&loop->wait); + if (test_bit(XIO_EV_LOOP_ACTIVE, &loop->states)) { + TRACE_LOG("loop: wait_for_completion"); + wait_for_completion(&loop->complete); + } + break; + case XIO_LOOP_TASKLET: + tasklet_kill(&loop->tasklet); + break; + case XIO_LOOP_WORKQUEUE: + flush_workqueue(loop->workqueue); + destroy_workqueue(loop->workqueue); + break; + default: + break; + } + + kfree(loop); +} + +/*---------------------------------------------------------------------------*/ +/* priv_ev_add_thread */ +/*---------------------------------------------------------------------------*/ +static int priv_ev_add_thread(void *loop_hndl, struct xio_ev_data *event) +{ + struct xio_ev_loop *loop = (struct xio_ev_loop *)loop_hndl; + + /* don't add events */ + if (test_bit(XIO_EV_LOOP_DOWN, &loop->states)) + return 0; + + set_bit(XIO_EV_HANDLER_ENABLED, &event->states); + if (!test_and_set_bit(XIO_EV_HANDLER_PENDING, &event->states)) + llist_add(&event->ev_llist, &loop->ev_llist); + + /* don't wake up */ + if (test_bit(XIO_EV_LOOP_STOP, &loop->states)) + return 0; + + if (!test_and_set_bit(XIO_EV_LOOP_WAKE, &loop->states)) + wake_up_interruptible(&loop->wait); + + return 0; +} + +static void priv_ipi(void *loop_hndl) +{ + struct xio_ev_loop *loop = (struct xio_ev_loop *)loop_hndl; + + /* CSD can be reused */ + clear_bit(XIO_EV_LOOP_SCHED, &loop->states); + + /* don't wake up */ + if (test_bit(XIO_EV_LOOP_STOP, &loop->states)) + return; + + tasklet_schedule(&loop->tasklet); +} + +/*---------------------------------------------------------------------------*/ +/* priv_kick_tasklet */ +/*---------------------------------------------------------------------------*/ +static void priv_kick_tasklet(void *loop_hndl) +{ + struct xio_ev_loop *loop = (struct xio_ev_loop *)loop_hndl; + int cpu; + + /* If EQ related interrupt was not assigned to the requested core, + * or if a event from another context is sent (e.g. 
module down event)
+ * and since the tasklet runs on the core that schedules it, an IPI must be used
+ */
+ cpu = get_cpu();
+ if (likely(loop->ctx->cpuid == cpu)) {
+ tasklet_schedule(&loop->tasklet);
+ put_cpu();
+ return;
+ }
+ put_cpu();
+
+ /* check if CSD in use */
+ if (test_and_set_bit(XIO_EV_LOOP_SCHED, &loop->states))
+ return;
+
+ /* can't use __smp_call_function_single - it is GPL-only exported */
+ smp_call_function_single(loop->ctx->cpuid, priv_ipi, loop_hndl, 0);
+}
+
+/*---------------------------------------------------------------------------*/
+/* priv_ev_add_tasklet */
+/*---------------------------------------------------------------------------*/
+static int priv_ev_add_tasklet(void *loop_hndl, struct xio_ev_data *event)
+{
+ struct xio_ev_loop *loop = (struct xio_ev_loop *)loop_hndl;
+
+ /* don't add events */
+ if (test_bit(XIO_EV_LOOP_DOWN, &loop->states))
+ return 0;
+
+ set_bit(XIO_EV_HANDLER_ENABLED, &event->states);
+ if (!test_and_set_bit(XIO_EV_HANDLER_PENDING, &event->states))
+ llist_add(&event->ev_llist, &loop->ev_llist);
+
+ /* don't wake up */
+ if (test_bit(XIO_EV_LOOP_STOP, &loop->states))
+ return 0;
+
+ priv_kick_tasklet(loop_hndl);
+
+ return 0;
+}
+
+/*---------------------------------------------------------------------------*/
+/* priv_ev_add_workqueue */
+/*---------------------------------------------------------------------------*/
+static int priv_ev_add_workqueue(void *loop_hndl, struct xio_ev_data *event)
+{
+ struct xio_ev_loop *loop = (struct xio_ev_loop *)loop_hndl;
+
+ /* don't add events */
+ if (test_bit(XIO_EV_LOOP_DOWN, &loop->states))
+ return 0;
+
+ set_bit(XIO_EV_HANDLER_ENABLED, &event->states);
+ if (test_and_set_bit(XIO_EV_HANDLER_PENDING, &event->states))
+ return 0;
+
+ if (test_bit(XIO_EV_LOOP_STOP, &loop->states)) {
+ /* delayed: put in the link list until resume */
+ llist_add(&event->ev_llist, &loop->ev_llist);
+ return 0;
+ }
+
+ INIT_WORK(&event->work, priv_ev_loop_run_work);
+ queue_work_on(loop->ctx->cpuid, loop->workqueue, &event->work);
+
+ return 0;
+}
+
+/*---------------------------------------------------------------------------*/
+/* priv_ev_loop_run_thread */
+/*---------------------------------------------------------------------------*/
+static void priv_ev_loop_run_thread(struct xio_ev_loop *loop)
+{
+ struct xio_ev_data *tev;
+ struct llist_node *last, *first;
+ struct llist_node *node;
+ unsigned long start_time = jiffies;
+
+ if (test_bit(XIO_EV_LOOP_IN_HANDLER, &loop->states)) {
+ /* If a callback i.e. "tev->handler" stopped the loop
+ * and then restarted it by calling run, we must exit
+ */
+ TRACE_LOG("call loop run while in handler\n");
+ return;
+ }
+
+ set_bit(XIO_EV_LOOP_ACTIVE, &loop->states);
+ if (test_bit(XIO_EV_LOOP_DOWN, &loop->states)) {
+ complete(&loop->complete);
+ clear_bit(XIO_EV_LOOP_ACTIVE, &loop->states);
+ return;
+ }
+
+ /* the loop can be stopped and restarted, thus old events can be pending in
+ * order in the (first - last) list while new events (in reverse order)
+ * are queued in ev_llist
+ */
+ if (loop->first || !llist_empty(&loop->ev_llist)) {
+ if (test_and_set_bit(XIO_EV_LOOP_WAKE, &loop->states))
+ goto retry_wait; /* race detected */
+ else
+ goto retry_dont_wait; /* a wake-up was already issued */
+ }
+
+retry_wait:
+
+ wait_event_interruptible(loop->wait,
+ test_bit(XIO_EV_LOOP_WAKE, &loop->states));
+
+ if (unlikely(test_bit(XIO_EV_LOOP_STOP, &loop->states)))
+ goto stopped;
+
+retry_dont_wait:
+
+ while ((last = llist_del_all(&loop->ev_llist)) != NULL) {
+ first = llist_reverse_order(last);
+ xio_append_ordered(first, last, loop);
+ node = loop->first;
+ while (node) {
+ tev = llist_entry(node, struct xio_ev_data, ev_llist);
+ node = llist_next(node);
+ loop->first = node;
+ set_bit(XIO_EV_LOOP_IN_HANDLER, &loop->states);
+ clear_bit(XIO_EV_HANDLER_PENDING, &tev->states);
+ if (time_after(jiffies, start_time)) {
+ /* schedule(); todo need to understand better */
+ start_time = jiffies;
+ }
+ if (test_bit(XIO_EV_HANDLER_ENABLED, &tev->states))
+ tev->handler(tev->data);
+ clear_bit(XIO_EV_LOOP_IN_HANDLER, &loop->states);
+ }
+ loop->last = NULL;
+ if (unlikely(test_bit(XIO_EV_LOOP_STOP, &loop->states)))
+ goto stopped;
+ }
+
+ /* All events were processed; prepare to wait */
+
+ if (unlikely(test_bit(XIO_EV_LOOP_STOP, &loop->states)))
+ goto stopped;
+
+ /* "race point" */
+ clear_bit(XIO_EV_LOOP_WAKE, &loop->states);
+
+ /* if a new entry was added while we were at the "race point",
+ * an event was added and the loop was resumed,
+ * then wait_event might block forever as the condition is false
+ */
+ if (llist_empty(&loop->ev_llist))
+ goto retry_wait;
+
+ if (test_and_set_bit(XIO_EV_LOOP_WAKE, &loop->states))
+ goto retry_wait; /* bit is set; add_event set it */
+ else
+ goto retry_dont_wait; /* add_event will not call wake up */
+
+stopped:
+ clear_bit(XIO_EV_LOOP_WAKE, &loop->states);
+ if (test_bit(XIO_EV_LOOP_DOWN, &loop->states))
+ complete(&loop->complete);
+ clear_bit(XIO_EV_LOOP_ACTIVE, &loop->states);
+}
+
+/*---------------------------------------------------------------------------*/
+/* priv_ev_loop_run_tasklet */
+/*---------------------------------------------------------------------------*/
+static void priv_ev_loop_run_tasklet(unsigned long data)
+{
+ struct xio_ev_loop *loop = (struct xio_ev_loop *)data;
+ struct xio_ev_data *tev;
+ struct llist_node *last, *first;
+ struct llist_node *node;
+
+ while ((last = llist_del_all(&loop->ev_llist)) != NULL) {
+ first = llist_reverse_order(last);
+ xio_append_ordered(first, last, loop);
+ node = loop->first;
+ while (node) {
+ if (unlikely(test_bit(XIO_EV_LOOP_STOP, &loop->states)))
+ return;
+ tev = llist_entry(node, struct xio_ev_data, ev_llist);
+ node = llist_next(node);
+ loop->first = node;
+ set_bit(XIO_EV_LOOP_IN_HANDLER, &loop->states);
+ clear_bit(XIO_EV_HANDLER_PENDING, &tev->states);
+ if (test_bit(XIO_EV_HANDLER_ENABLED, &tev->states))
+ tev->handler(tev->data);
+ clear_bit(XIO_EV_LOOP_IN_HANDLER, &loop->states);
+ }
+ loop->last = NULL;
+ }
+}
+
+/*---------------------------------------------------------------------------*/ +/* priv_ev_loop_run_work */ +/*---------------------------------------------------------------------------*/ +static void priv_ev_loop_run_work(struct work_struct *work) +{ + struct xio_ev_data *tev = container_of(work, struct xio_ev_data, work); + + /* CURRENTLY CAN'T MARK IN LOOP */ + clear_bit(XIO_EV_HANDLER_PENDING, &tev->states); + if (test_bit(XIO_EV_HANDLER_ENABLED, &tev->states)) + tev->handler(tev->data); +} + +/*---------------------------------------------------------------------------*/ +/* priv_ev_loop_run */ +/*---------------------------------------------------------------------------*/ +int priv_ev_loop_run(void *loop_hndl) +{ + struct xio_ev_loop *loop = loop_hndl; + struct xio_ev_data *tev; + struct llist_node *last, *first; + struct llist_node *node; + int cpu; + + clear_bit(XIO_EV_LOOP_STOP, &loop->states); + + switch (loop->flags) { + case XIO_LOOP_GIVEN_THREAD: + if (unlikely(loop->ctx->worker != (uint64_t)get_current())) { + ERROR_LOG("worker kthread(%p) is not current(%p).\n", + (void *)loop->ctx->worker, get_current()); + goto cleanup0; + } + /* no need to disable preemption */ + cpu = raw_smp_processor_id(); + if (loop->ctx->cpuid != cpu) { + TRACE_LOG("worker on core(%d) scheduled to(%d).\n", + cpu, loop->ctx->cpuid); + set_cpus_allowed_ptr(get_current(), + cpumask_of(loop->ctx->cpuid)); + } + priv_ev_loop_run_thread(loop); + return 0; + case XIO_LOOP_TASKLET: + /* were events added to list while in STOP state ? */ + if (!llist_empty(&loop->ev_llist)) + priv_kick_tasklet(loop_hndl); + return 0; + case XIO_LOOP_WORKQUEUE: + /* were events added to list while in STOP state ? */ + while ((last = llist_del_all(&loop->ev_llist)) != NULL) { + first = llist_reverse_order(last); + xio_append_ordered(first, last, loop); + node = loop->first; + while (node) { + tev = llist_entry(node, struct xio_ev_data, + ev_llist); + node = llist_next(node); + loop->first = node; + INIT_WORK(&tev->work, priv_ev_loop_run_work); + queue_work_on(loop->ctx->cpuid, loop->workqueue, + &tev->work); + } + loop->last = NULL; + } + return 0; + default: + /* undo */ + set_bit(XIO_EV_LOOP_STOP, &loop->states); + return -1; + } + +cleanup0: + set_bit(XIO_EV_LOOP_STOP, &loop->states); + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* priv_ev_loop_stop */ +/*---------------------------------------------------------------------------*/ +void priv_ev_loop_stop(void *loop_hndl) +{ + struct xio_ev_loop *loop = loop_hndl; + + if (!loop) + return; + + set_bit(XIO_EV_LOOP_STOP, &loop->states); +} + +/*---------------------------------------------------------------------------*/ +/* priv_ev_loop_stop */ +/*---------------------------------------------------------------------------*/ +void priv_ev_loop_stop_thread(void *loop_hndl) +{ + struct xio_ev_loop *loop = loop_hndl; + + if (!loop) + return; + + set_bit(XIO_EV_LOOP_STOP, &loop->states); + if (!test_and_set_bit(XIO_EV_LOOP_WAKE, &loop->states)) + wake_up_interruptible(&loop->wait); +} + +/*---------------------------------------------------------------------------*/ +/* priv_ev_is_loop_stopping */ +/*---------------------------------------------------------------------------*/ +int priv_ev_is_loop_stopping(void *loop_hndl) +{ + struct xio_ev_loop *loop = loop_hndl; + + if (!loop) + return 0; + + return test_bit(XIO_EV_LOOP_STOP, &loop->states); +} diff --git a/open_src/xio/src/kernel/xio/xio_ev_loop.h 
b/open_src/xio/src/kernel/xio/xio_ev_loop.h new file mode 100644 index 0000000..957a7a3 --- /dev/null +++ b/open_src/xio/src/kernel/xio/xio_ev_loop.h @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ +#ifndef XIO_EV_LOOP_H +#define XIO_EV_LOOP_H + +/*---------------------------------------------------------------------------*/ +/* defines */ +/*---------------------------------------------------------------------------*/ + +#define XIO_EV_LOOP_WAKE BIT(0) +#define XIO_EV_LOOP_STOP BIT(1) +#define XIO_EV_LOOP_DOWN BIT(2) +#define XIO_EV_LOOP_SCHED BIT(3) +#define XIO_EV_LOOP_IN_HANDLER BIT(4) +#define XIO_EV_LOOP_ACTIVE BIT(5) + +#define XIO_EV_HANDLER_PENDING BIT(0) +#define XIO_EV_HANDLER_ENABLED BIT(1) + +/*---------------------------------------------------------------------------*/ +/* structures */ +/*---------------------------------------------------------------------------*/ + +struct xio_ev_loop { + struct xio_context *ctx; + void *loop_object; + int (*run)(void *loop_hndl); + void (*stop)(void *loop_hndl); + int (*is_stopping)(void *loop_hndl); + int (*add_event)(void *loop_hndl, struct xio_ev_data *data); + unsigned long flags; + + volatile unsigned long states; + union { + wait_queue_head_t wait; + struct tasklet_struct tasklet; + struct workqueue_struct *workqueue; + }; + /* for thread, tasklet and for stopped workqueue */ + struct llist_head ev_llist; + struct llist_node *first; + struct llist_node *last; + struct completion complete; +}; + +/*---------------------------------------------------------------------------*/ +/* XIO default event loop API */ +/* */ +/* NoTE: xio provides default muxer implementation around epoll. */ +/* users are encouraged to utilize their own implementations and provides */ +/* appropriate services to xio via the xio's context open interface */ +/*---------------------------------------------------------------------------*/ +/** + * initializes event loop handle + * + * @returns event loop handle or NULL upon error + */ +void *xio_ev_loop_init(unsigned long flags, struct xio_context *ctx, + struct xio_loop_ops *loop); + +/** + * destroy the event loop + * + * @param[in] loop_hndl Handle to event loop + */ +void xio_ev_loop_destroy(void *loop); + +#endif + diff --git a/open_src/xio/src/kernel/xio/xio_init.c b/open_src/xio/src/kernel/xio/xio_init.c new file mode 100644 index 0000000..4798c7e --- /dev/null +++ b/open_src/xio/src/kernel/xio/xio_init.c @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
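
A minimal, self-contained sketch of the lock-free llist idiom used by the XIO_LOOP_WORKQUEUE branch of priv_ev_loop_run() above and by the ev_llist field of struct xio_ev_loop (struct demo_item, demo_post and demo_drain are hypothetical names, not part of libxio):

#include <linux/llist.h>
#include <linux/printk.h>

/* hypothetical item; only the llist_node member mirrors xio_ev_data */
struct demo_item {
        struct llist_node node;
        int payload;
};

static LLIST_HEAD(demo_list);

/* producer side: may run from any context, lock-free */
static void demo_post(struct demo_item *it)
{
        llist_add(&it->node, &demo_list);
}

/* consumer side: drain everything, restore FIFO order, then handle */
static void demo_drain(void)
{
        struct llist_node *first =
                llist_reverse_order(llist_del_all(&demo_list));
        struct demo_item *it;

        llist_for_each_entry(it, first, node)
                pr_info("handled payload %d\n", it->payload);
}
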
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#include +#include + +#include +#include "libxio.h" +#include "xio_sessions_cache.h" +#include "xio_nexus_cache.h" +#include "xio_idr.h" + +MODULE_AUTHOR("Eyal Solomon, Shlomo Pongratz"); +MODULE_DESCRIPTION("XIO generic part " \ + "v" DRV_VERSION " (" DRV_RELDATE ")"); +MODULE_LICENSE("Dual BSD/GPL"); + +/* The root of XIO debugfs tree */ +static struct dentry *xio_root; +struct xio_idr *usr_idr = NULL; + +/*---------------------------------------------------------------------------*/ +/* xio_constructor */ +/*---------------------------------------------------------------------------*/ + +static int __init xio_init_module(void) +{ + if (debugfs_initialized()) { + xio_root = debugfs_create_dir("xio", NULL); + if (!xio_root) { + pr_err("xio_root debugfs creation failed\n"); + return -ENOMEM; + } + } else { + xio_root = NULL; + pr_err("debugfs not initialized\n"); + } + + sessions_cache_construct(); + nexus_cache_construct(); + usr_idr = xio_idr_create(); + if (!usr_idr) { + pr_err("usr_idr creation failed\n"); + return -ENOMEM; + } + + return 0; +} + +static void __exit xio_cleanup_module(void) +{ + xio_idr_destroy(usr_idr); + debugfs_remove_recursive(xio_root); +} + +struct dentry *xio_debugfs_root(void) +{ + return xio_root; +} + +module_init(xio_init_module); +module_exit(xio_cleanup_module); + diff --git a/open_src/xio/src/kernel/xio/xio_kernel_utils.c b/open_src/xio/src/kernel/xio/xio_kernel_utils.c new file mode 100644 index 0000000..01ebf36 --- /dev/null +++ b/open_src/xio/src/kernel/xio/xio_kernel_utils.c @@ -0,0 +1,310 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#include +#include +#include + +#include +#include "libxio.h" +#include "xio_log.h" +#include "xio_common.h" +#include "xio_protocol.h" +#include "xio_sg_table.h" + +#ifndef IN6ADDR_ANY_INIT +#define IN6ADDR_ANY_INIT \ + { { { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } +#endif + +/*---------------------------------------------------------------------------*/ +/* defines */ +/*---------------------------------------------------------------------------*/ +static int _xio_errno; + +/*---------------------------------------------------------------------------*/ +/* debuging facilities */ +/*---------------------------------------------------------------------------*/ +void xio_set_error(int errnum) { _xio_errno = errnum; } +EXPORT_SYMBOL(xio_set_error); + +/*---------------------------------------------------------------------------*/ +/* xio_errno */ +/*---------------------------------------------------------------------------*/ +int xio_errno(void) { return _xio_errno; } +EXPORT_SYMBOL(xio_errno); + +static int priv_parse_ip_addr(const char *str, size_t len, __be16 port, + struct sockaddr_storage *ss) +{ + const char *end; + + if (strnchr(str, len, '.')) { + /* Try IPv4 */ + struct sockaddr_in *s4 = (struct sockaddr_in *)ss; + + if (in4_pton(str, len, (void *)&s4->sin_addr, -1, &end) > 0) { + if (!*end) { + /* reached the '\0' */ + s4->sin_family = AF_INET; + s4->sin_port = port; + return 0; + } + } + } else if (strnchr(str, len, ':')) { + /* Try IPv6 */ + struct sockaddr_in6 *s6 = (struct sockaddr_in6 *)ss; + + if (in6_pton(str, -1, (void *)&s6->sin6_addr, -1, &end) > 0) { + if (!*end) { + /* reached the '\0' */ + /* what about scope and flow */ + s6->sin6_family = AF_INET6; + s6->sin6_port = port; + return 1; + } + } + } + return -1; +} + +#define NI_MAXSERV 32 + +/*---------------------------------------------------------------------------*/ +/* xio_uri_to_ss */ +/*---------------------------------------------------------------------------*/ +int xio_uri_to_ss(const char *uri, struct sockaddr_storage *ss) +{ + char *start; + char *host = NULL; + char port[NI_MAXSERV]; + unsigned long portul; + unsigned short port16; + __be16 port_be16; + const char *p1, *p2; + size_t len; + int ipv6_hint = 0; + int ss_len = -1; + int retval; + + /* only supported protocol is rdma */ + start = strstr(uri, "://"); + if (!start) + return -1; + + if (*(start+3) == '[') { /* IPv6 */ + ipv6_hint = 1; + p1 = strstr(start + 3, "]:"); + if (!p1) + return -1; + + len = p1-(start+4); + host = kstrndup((char *)(start + 4), len, GFP_KERNEL); + if (host) + host[len] = 0; + + p2 = strchr(p1 + 2, '/'); + if (!p2) { + strcpy(port, p1 + 2); + } else { + len = (p2-1)-(p1+2); + strncpy(port, (p1 + 2), len); + port[len] = 
0; + } + } else { + /* extract the resource */ + p1 = uri + strlen(uri); + p2 = NULL; + while (p1 != (start + 3)) { + if (*p1 == '/') + p2 = p1; + p1--; + if (p1 == uri) + goto cleanup; + } + + if (!p2) { /* no resource */ + p1 = strrchr(uri, ':'); + if (!p1 || p1 == start) + goto cleanup; + strcpy(port, (p1 + 1)); + } else { + if (*p2 != '/') + goto cleanup; + p1 = p2; + while (*p1 != ':') { + p1--; + if (p1 == uri) + goto cleanup; + } + + len = p2 - (p1 + 1); + + strncpy(port, p1 + 1, len); + port[len] = 0; + } + len = p1 - (start + 3); + + /* extract the address */ + host = kstrndup((char *)(start + 3), len, GFP_KERNEL); + if (host) + host[len] = 0; + } + + /* debug */ + DEBUG_LOG("host:%s port:%s\n", host, port); + + if (kstrtoul(port, 10, &portul)) { + ERROR_LOG("Invalid port specification(%s)\n", port); + goto cleanup; + } + if (portul > 0xFFFF) { + ERROR_LOG("Invalid port specification(%s)\n", port); + goto cleanup; + } + port16 = portul; + port_be16 = htons(port16); + + if (!host || (host && (host[0] == '*' || host[0] == 0))) { + if (ipv6_hint) { + struct sockaddr_in6 *s6 = (struct sockaddr_in6 *)ss; + + /* what about scope and flow */ + s6->sin6_family = AF_INET6; + /* s6->sin6_addr = IN6ADDR_ANY_INIT; */ + memset((void *)&s6->sin6_addr, + 0, sizeof(s6->sin6_addr)); + s6->sin6_port = port_be16; + ss_len = sizeof(struct sockaddr_in6); + } else { + struct sockaddr_in *s4 = (struct sockaddr_in *)ss; + + s4->sin_family = AF_INET; + s4->sin_addr.s_addr = INADDR_ANY; + s4->sin_port = port_be16; + ss_len = sizeof(struct sockaddr_in); + } + } else { + retval = priv_parse_ip_addr(host, len, port_be16, ss); + if (retval < 0) { + ERROR_LOG("unresolved address\n"); + goto cleanup; + } else if (retval == 0) { + ss_len = sizeof(struct sockaddr_in); + } else if (retval == 1) { + ss_len = sizeof(struct sockaddr_in6); + } + } + + kfree(host); + return ss_len; + +cleanup: + kfree(host); + return -1; +} +EXPORT_SYMBOL(xio_uri_to_ss); + +int xio_host_port_to_ss(const char *buf, struct sockaddr_storage *ss) +{ + ERROR_LOG("unsupported\n"); + return -1; +} +EXPORT_SYMBOL(xio_host_port_to_ss); + +/* + * xio_get_nodeid(cpuid) - This will return the node to which selected cpu + * belongs + */ +unsigned int xio_get_nodeid(unsigned int cpu_id) +{ + return cpu_to_node(cpu_id); +} + +void xio_msg_dump(struct xio_msg *xio_msg) +{ + int i; + struct xio_sg_table_ops *sgtbl_ops; + void *sgtbl; + void *sge; + + ERROR_LOG("*********************************************\n"); + ERROR_LOG("type:0x%x\n", xio_msg->type); + if (xio_msg->type == XIO_MSG_TYPE_REQ || + xio_msg->type == XIO_ONE_WAY_REQ) + ERROR_LOG("serial number:%lld\n", xio_msg->sn); + else if (xio_msg->type == XIO_MSG_TYPE_RSP) + ERROR_LOG("response:%p, serial number:%lld\n", + xio_msg->request, + ((xio_msg->request) ? 
xio_msg->request->sn : -1)); + + sgtbl = xio_sg_table_get(&xio_msg->in); + sgtbl_ops = xio_sg_table_ops_get(xio_msg->in.sgl_type); + + ERROR_LOG("in header: length:%zd, address:%p\n", + xio_msg->in.header.iov_len, xio_msg->in.header.iov_base); + ERROR_LOG("in sgl type:%d max_nents:%d\n", xio_msg->in.sgl_type, + tbl_max_nents(sgtbl_ops, sgtbl)); + ERROR_LOG("in data size:%d\n", + tbl_nents(sgtbl_ops, sgtbl)); + + for_each_sge(sgtbl, sgtbl_ops, sge, i) { + ERROR_LOG("in data[%d]: length:%zd, address:%p\n", i, + sge_length(sgtbl_ops, sge), + sge_addr(sgtbl_ops, sge)); + } + + sgtbl = xio_sg_table_get(&xio_msg->out); + sgtbl_ops = xio_sg_table_ops_get(xio_msg->out.sgl_type); + + ERROR_LOG("out header: length:%zd, address:%p\n", + xio_msg->out.header.iov_len, + xio_msg->out.header.iov_base); + ERROR_LOG("out sgl type:%d max_nents:%d\n", + xio_msg->out.sgl_type, + tbl_max_nents(sgtbl_ops, sgtbl)); + ERROR_LOG("out data size:%d\n", tbl_nents(sgtbl_ops, sgtbl)); + + for_each_sge(sgtbl, sgtbl_ops, sge, i) { + ERROR_LOG("out data[%d]: length:%zd, address:%p\n", i, + sge_length(sgtbl_ops, sge), + sge_addr(sgtbl_ops, sge)); + } + ERROR_LOG("*********************************************\n"); +} +EXPORT_SYMBOL(xio_msg_dump); + diff --git a/open_src/xio/src/kernel/xio/xio_mem.c b/open_src/xio/src/kernel/xio/xio_mem.c new file mode 100644 index 0000000..bca819f --- /dev/null +++ b/open_src/xio/src/kernel/xio/xio_mem.c @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
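
Returning to xio_uri_to_ss() above, a usage sketch for a listener setup path; the URI literal and the demo_ function name are illustrative only:

#include <linux/socket.h>
#include <linux/errno.h>

static int demo_parse_bind_uri(struct sockaddr_storage *ss)
{
        /* returns sizeof(struct sockaddr_in) or sizeof(struct sockaddr_in6),
         * or -1 on a malformed URI */
        int ss_len = xio_uri_to_ss("rdma://192.168.1.10:1234", ss);

        if (ss_len < 0)
                return -EINVAL;

        /* ss is now ready for the transport's bind/connect path */
        return ss_len;
}
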
+ */ +#include +#include +#include "xio_common.h" +#include "xio_kernel.h" + +#define HUGE_PAGE_SZ (2*1024*1024) + +int disable_huge_pages = 0; +int allocator_assigned = 0; +struct xio_mem_allocator g_mem_allocator; +struct xio_mem_allocator *mem_allocator = &g_mem_allocator; + diff --git a/open_src/xio/src/kernel/xio/xio_mem.h b/open_src/xio/src/kernel/xio/xio_mem.h new file mode 100644 index 0000000..4e1f0bb --- /dev/null +++ b/open_src/xio/src/kernel/xio/xio_mem.h @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef XIO_MEM_H +#define XIO_MEM_H + +extern int disable_huge_pages; +extern int allocator_assigned; +extern struct xio_mem_allocator *mem_allocator; + +extern void *malloc_huge_pages(size_t size); +extern void free_huge_pages(void *ptr); + +static inline void xio_disable_huge_pages(int disable) +{ + if (disable_huge_pages) + return; + disable_huge_pages = disable; +} + +static inline int xio_set_mem_allocator(struct xio_mem_allocator *allocator) +{ + if (allocator_assigned) + return -1; + memcpy(mem_allocator, allocator, sizeof(*allocator)); + allocator_assigned = 1; + + return 0; +} + +#endif diff --git a/open_src/xio/src/kernel/xio/xio_mempool.c b/open_src/xio/src/kernel/xio/xio_mempool.c new file mode 100644 index 0000000..6013f30 --- /dev/null +++ b/open_src/xio/src/kernel/xio/xio_mempool.c @@ -0,0 +1,272 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. 
+ * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "libxio.h" +#include +#include "xio_log.h" +#include "xio_common.h" +#include "xio_mempool.h" + +/*---------------------------------------------------------------------------*/ +/* structures */ +/*---------------------------------------------------------------------------*/ +#ifndef SIZE_MAX +#define SIZE_MAX (~(size_t)0) +#endif + +static size_t sizes[] = { + XIO_16K_BLOCK_SZ, + XIO_64K_BLOCK_SZ, + XIO_256K_BLOCK_SZ, + XIO_1M_BLOCK_SZ, + SIZE_MAX + }; + +struct xio_chunks_list { + struct kmem_cache *kcache; + size_t block_sz; + char name[64]; /* kmem_cache_create keeps a pointer to the pool's name + * Therefore the name must be valid until the pool + * is destroyed + */ +}; + +struct xio_mempool { + struct xio_chunks_list pool[ARRAY_SIZE(sizes)]; +}; + +/* currently not in use - to be supported */ +struct xio_mempool_config g_mempool_config = { 0 }; + +/*---------------------------------------------------------------------------*/ +/* xio_mempool_destroy */ +/*---------------------------------------------------------------------------*/ +void xio_mempool_destroy(struct xio_mempool *p) +{ + struct xio_chunks_list *ch; + int real_ones, i; + + if (!p) + return; + + real_ones = ARRAY_SIZE(sizes) - 1; + ch = p->pool; + for (i = 0; i < real_ones; i++) { + if (!ch->kcache) + break; + DEBUG_LOG("kcache(%s) freed\n", ch->name); + kmem_cache_destroy(ch->kcache); + ch->kcache = NULL; + ch++; + } + + kfree(p); +} +EXPORT_SYMBOL(xio_mempool_destroy); + +/*---------------------------------------------------------------------------*/ +/* xio_mempol_create */ +/*---------------------------------------------------------------------------*/ +struct xio_mempool 
*xio_mempool_create(void) +{ + struct xio_mempool *p; + struct xio_chunks_list *ch; + int real_ones, i; + + p = kzalloc(sizeof(*p), GFP_KERNEL); + if (!p) { + DEBUG_LOG("%s kzalloc failed\n", __func__); + goto cleanup0; + } + + real_ones = ARRAY_SIZE(sizes) - 1; + ch = p->pool; + for (i = 0; i < real_ones; i++) { + ch->block_sz = sizes[i]; + /* The name must be valid until the pool is destroyed + * Use the address of the pool structure to create a unique + * name for the pool + */ + sprintf(ch->name, "xio_mempool-%zuK-%p", + ch->block_sz/1024, p); + ch->kcache = kmem_cache_create(ch->name, + ch->block_sz, PAGE_SIZE, + SLAB_HWCACHE_ALIGN, NULL); + if (!ch->kcache) { + ERROR_LOG("kcache(%s) creation failed\n", ch->name); + goto cleanup; + } + DEBUG_LOG("kcache(%s) created(%p)\n", ch->name, ch->kcache); + ch++; + } + + DEBUG_LOG("mempool created(%p)\n", p); + return p; + +cleanup: + xio_mempool_destroy(p); + +cleanup0: + ERROR_LOG("%s failed\n", __func__); + + return NULL; +} +EXPORT_SYMBOL(xio_mempool_create); + +/*---------------------------------------------------------------------------*/ +/* size2index */ +/*---------------------------------------------------------------------------*/ +static inline int size2index(struct xio_mempool *p, size_t sz) +{ + int i; + + for (i = 0; i <= XIO_CHUNKS_SIZE_NR; i++) + if (sz <= p->pool[i].block_sz) + break; + + return (i == XIO_CHUNKS_SIZE_NR) ? -1 : i; +} + +/*---------------------------------------------------------------------------*/ +/* xio__mempool_alloc */ +/*---------------------------------------------------------------------------*/ +int xio_mempool_alloc(struct xio_mempool *p, size_t length, + struct xio_mp_mem *mp_mem) +{ + int index; + + mp_mem->addr = NULL; + mp_mem->cache = NULL; + mp_mem->length = 0; + + index = size2index(p, length); + if (index == -1) { + xio_set_error(EINVAL); + return -EINVAL; + } + + mp_mem->addr = kmem_cache_alloc(p->pool[index].kcache, GFP_KERNEL); + if (!mp_mem->addr) { + xio_set_error(ENOMEM); + return -ENOMEM; + } + + mp_mem->cache = (void *)p->pool[index].kcache; + mp_mem->length = p->pool[index].block_sz; + + return 0; +} +EXPORT_SYMBOL(xio_mempool_alloc); + +int xio_mp_sge_alloc(struct xio_mempool *pool, struct xio_sge *sge, + u32 num_sge, struct xio_mem_desc *desc) +{ + struct xio_mp_mem *mp_sge; + int i; + + desc->num_sge = 0; + mp_sge = desc->mp_sge; + + for (i = 0; i < num_sge; i++) { + if (xio_mempool_alloc(pool, sge->length, mp_sge)) + goto cleanup0; + mp_sge++; + } + + desc->num_sge = num_sge; + return 0; + +cleanup0: + return -1; +} +EXPORT_SYMBOL(xio_mp_sge_alloc); + +/*---------------------------------------------------------------------------*/ +/* xio_mempool_free */ +/*---------------------------------------------------------------------------*/ +void xio_mempool_free_mp(struct xio_mp_mem *mp_mem) +{ + if (!mp_mem) { + ERROR_LOG("%s mp_mem\n", __func__); + goto cleanup0; + } + + if (!mp_mem->cache) { + ERROR_LOG("%s mp_mem(%p)->cache(0)\n", __func__, mp_mem); + goto cleanup0; + } + + if (!mp_mem->addr) { + ERROR_LOG("%s mp_mem(%p)->addr(0)\n", __func__, mp_mem); + goto cleanup1; + } + + kmem_cache_free((struct kmem_cache *)mp_mem->cache, mp_mem->addr); + + mp_mem->cache = NULL; + mp_mem->addr = NULL; + + return; + +cleanup1: + mp_mem->cache = NULL; +cleanup0: + ERROR_LOG("%s failed\n", __func__); +} +EXPORT_SYMBOL(xio_mempool_free_mp); + +void xio_mempool_free(struct xio_mem_desc *desc) +{ + int i; + + for (i = 0; i < desc->num_sge; i++) { + if (desc->mp_sge[i].cache) + 
xio_mempool_free_mp(&desc->mp_sge[i]); + } + + desc->num_sge = 0; +} +EXPORT_SYMBOL(xio_mempool_free); diff --git a/open_src/xio/src/kernel/xio/xio_mempool.h b/open_src/xio/src/kernel/xio/xio_mempool.h new file mode 100644 index 0000000..ac4ae65 --- /dev/null +++ b/open_src/xio/src/kernel/xio/xio_mempool.h @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
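
A round-trip sketch of the mempool API implemented above (the demo_ name is hypothetical): a 10 KB request is served from the 16 KB kmem_cache, and mp_mem.length reports the real block size handed out:

#include <linux/errno.h>

static int demo_mempool_roundtrip(void)
{
        struct xio_mempool *pool;
        struct xio_mp_mem mp_mem;
        int ret;

        pool = xio_mempool_create();
        if (!pool)
                return -ENOMEM;

        ret = xio_mempool_alloc(pool, 10 * 1024, &mp_mem);
        if (ret == 0) {
                /* mp_mem.addr is valid, mp_mem.length == XIO_16K_BLOCK_SZ */
                xio_mempool_free_mp(&mp_mem);
        }

        xio_mempool_destroy(pool);
        return ret;
}
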
+ */ +#ifndef XIO_MEMPOOL_H +#define XIO_MEMPOOL_H + +#include +#include + +struct xio_mempool; +struct xio_sge; + +struct xio_mem_reg { + u32 lkey; + u32 rkey; + u64 va; + u64 len; + void *mem_h; /* it is void as it might be FMR or FRWR */ +}; + +struct xio_mp_mem { + void *addr; + size_t length; + void *cache; +}; + +struct xio_mem_desc { + /* sg table for dma mapping */ + struct sg_table sgt; + struct xio_mp_mem *mp_sge; + u32 num_sge; + unsigned int nents; + unsigned int mapped; + struct xio_mem_reg mem_reg; +}; + +#define XIO_CHUNKS_SIZE_NR 4 + +#define XIO_16K_BLOCK_SZ (16*1024) +#define XIO_16K_MIN_NR 128 +#define XIO_16K_MAX_NR 1024 +#define XIO_16K_ALLOC_NR 128 + +#define XIO_64K_BLOCK_SZ (64*1024) +#define XIO_64K_MIN_NR 128 +#define XIO_64K_MAX_NR 1024 +#define XIO_64K_ALLOC_NR 128 + +#define XIO_256K_BLOCK_SZ (256*1024) +#define XIO_256K_MIN_NR 128 +#define XIO_256K_MAX_NR 1024 +#define XIO_256K_ALLOC_NR 128 + +#define XIO_1M_BLOCK_SZ (1024*1024) +#define XIO_1M_MIN_NR 128 +#define XIO_1M_MAX_NR 1024 +#define XIO_1M_ALLOC_NR 128 + +struct xio_mempool *xio_mempool_create(void); +void xio_mempool_destroy(struct xio_mempool *mpool); + +int xio_mempool_alloc(struct xio_mempool *mpool, + size_t length, struct xio_mp_mem *mp_mem); + +int xio_mp_sge_alloc(struct xio_mempool *mpool, struct xio_sge *sge, + u32 num_sge, struct xio_mem_desc *desc); + +void xio_mempool_free(struct xio_mem_desc *desc); + +void xio_mempool_free_mp(struct xio_mp_mem *mp_me); + +#endif diff --git a/open_src/xio/src/kernel/xio/xio_os.h b/open_src/xio/src/kernel/xio/xio_os.h new file mode 100644 index 0000000..aacf18b --- /dev/null +++ b/open_src/xio/src/kernel/xio/xio_os.h @@ -0,0 +1,150 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef XIO_OS_H +#define XIO_OS_H + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 37) +#include +#else +#include +#endif +#include +#include + +#include + +/* /usr/include/bits/types.h: *__STD_TYPE __U32_TYPE __socklen_t; */ +typedef u32 __socklen_t; +/* +/usr/include/arpa/inet.h:typedef __socklen_t socklen_t; +/usr/include/unistd.h:typedef __socklen_t socklen_t; +*/ +typedef __socklen_t socklen_t; + +#define assert(expr) BUG_ON(!(expr)) + +#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 1, 0) +static inline int __atomic_add_unless(atomic_t *v, int a, int u) +{ + int c, old; + + c = atomic_read(v); + while (c != u && (old = atomic_cmpxchg(v, c, c + a)) != c) + c = old; + return c; +} +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 10, 0) +/** + * sg_unmark_end - Undo setting the end of the scatterlist + * @sg: SG entryScatterlist + * + * Description: + * Removes the termination marker from the given entry of the scatterlist. + * +**/ +static inline void sg_unmark_end(struct scatterlist *sg) +{ +#ifdef CONFIG_DEBUG_SG + BUG_ON(sg->sg_magic != SG_MAGIC); +#endif + sg->page_link &= ~0x02; +} +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 13, 0) && \ + !(defined RHEL_MAJOR && RHEL_MAJOR >= 7) +/** + * llist_reverse_order - reverse order of a llist chain + * @head: first item of the list to be reversed + * + * Reverse the order of a chain of llist entries and return the + * new first entry. + */ +static inline struct llist_node *llist_reverse_order(struct llist_node *head) +{ + struct llist_node *new_head = NULL; + + while (head) { + struct llist_node *tmp = head; + + head = head->next; + tmp->next = new_head; + new_head = tmp; + } + + return new_head; +} +#endif + +/** + * list_first_entry_or_null - get the first element from a list + * @ptr: the list head to take the element from. + * @type: the type of the struct this is embedded in. + * @member: the name of the list_struct within the struct. + * + * Note that if the list is empty, it returns NULL. + */ +#ifndef list_first_entry_or_null /* defined from 3.10 */ +#define list_first_entry_or_null(ptr, type, member) \ + (!list_empty(ptr) ? list_first_entry(ptr, type, member) : NULL) +#endif + +static inline char *strerror(int errnum) +{ + static char buf[64]; + + sprintf(buf, "errno(%d)", errnum); + return buf; +}; + +#endif /* XIO_OS_H */ diff --git a/open_src/xio/src/kernel/xio/xio_sg_iov.c b/open_src/xio/src/kernel/xio/xio_sg_iov.c new file mode 100644 index 0000000..0ba1aa1 --- /dev/null +++ b/open_src/xio/src/kernel/xio/xio_sg_iov.c @@ -0,0 +1,202 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. 
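
A short sketch of the list_first_entry_or_null() fallback defined in xio_os.h above, popping the head of a hypothetical request queue (struct demo_req and demo_pop are illustrative names):

#include <linux/list.h>

struct demo_req {
        struct list_head link;
        int id;
};

static struct demo_req *demo_pop(struct list_head *queue)
{
        struct demo_req *req =
                list_first_entry_or_null(queue, struct demo_req, link);

        if (req)
                list_del(&req->link);   /* detach before handing it out */
        return req;
}
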
+ * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* sg represents xio_sg_iov; */ +#include "libxio.h" +#include "xio_sg_table.h" + +/*---------------------------------------------------------------------------*/ +/* xio_sgve_set_buf */ +/*---------------------------------------------------------------------------*/ +static inline void xio_sgve_set_buf(struct xio_iovec_ex *sg, const void *buf, + uint32_t buflen, void *mr) +{ + sg->iov_base = (void *)buf; + sg->iov_len = buflen; +} + +/*---------------------------------------------------------------------------*/ +/* xio_sgve_addr */ +/*---------------------------------------------------------------------------*/ +static inline void *xio_sgve_addr(struct xio_iovec_ex *sg) +{ + return sg->iov_base; +} + +/*---------------------------------------------------------------------------*/ +/* xio_sgve_set_addr */ +/*---------------------------------------------------------------------------*/ +static inline void xio_sgve_set_addr(struct xio_iovec_ex *sg, void *addr) +{ + sg->iov_base = addr; +} + +/*---------------------------------------------------------------------------*/ +/* xio_sgve_length */ +/*---------------------------------------------------------------------------*/ +static inline size_t xio_sgve_length(struct xio_iovec_ex *sg) +{ + return sg->iov_len; +} + +/*---------------------------------------------------------------------------*/ +/* xio_sgve_set_length */ +/*---------------------------------------------------------------------------*/ +static inline void xio_sgve_set_length(struct xio_iovec_ex *sg, + uint32_t length) +{ + sg->iov_len = length; +} + +/*---------------------------------------------------------------------------*/ +/* xio_sgve_first */ 
+/*---------------------------------------------------------------------------*/ +static struct xio_iovec_ex *xio_sgve_first(struct xio_sg_iov *sgv) +{ + return ((!sgv || sgv->nents == 0) ? + NULL : &sgv->sglist[0]); +} + +/*---------------------------------------------------------------------------*/ +/* xio_sgve_last */ +/*---------------------------------------------------------------------------*/ +static struct xio_iovec_ex *xio_sgve_last(struct xio_sg_iov *sgv) +{ + return ((!sgv || sgv->nents == 0) ? + NULL : &sgv->sglist[sgv->nents - 1]); +} + +/*---------------------------------------------------------------------------*/ +/* xio_sgve_next */ +/*---------------------------------------------------------------------------*/ +static struct xio_iovec_ex *xio_sgve_next(struct xio_sg_iov *sgv, + struct xio_iovec_ex *sgve) +{ + return (!sgv || sgv->nents == 0 || + (sgve == &sgv->sglist[sgv->nents - 1])) + ? NULL : ++sgve; +} + +/*---------------------------------------------------------------------------*/ +/* xio_sgv_sglist */ +/*---------------------------------------------------------------------------*/ +static inline struct xio_iovec_ex *xio_sgv_sglist(struct xio_sg_iov *sgv) +{ + return sgv->sglist; +} + +/*---------------------------------------------------------------------------*/ +/* xio_sgv_nents */ +/*---------------------------------------------------------------------------*/ +static inline int xio_sgv_nents(struct xio_sg_iov *sgv) +{ + return sgv->nents; +} + +/*---------------------------------------------------------------------------*/ +/* xio_sgv_max_nents */ +/*---------------------------------------------------------------------------*/ +static inline int xio_sgv_max_nents(struct xio_sg_iov *sgv) +{ + return XIO_IOVLEN; +} + +/*---------------------------------------------------------------------------*/ +/* xio_sgv_set_nents */ +/*---------------------------------------------------------------------------*/ +static inline void xio_sgv_set_nents(struct xio_sg_iov *sgv, uint32_t nents) +{ + if (!sgv || XIO_IOVLEN < nents) + return; + sgv->nents = nents; +} + +/*---------------------------------------------------------------------------*/ +/* xio_sgv_set_max_nents */ +/*---------------------------------------------------------------------------*/ +static inline void xio_sgv_set_max_nents(struct xio_sg_iov *sgv, + uint32_t max_nents) +{ + sgv->max_nents = XIO_IOVLEN; +} + +/*---------------------------------------------------------------------------*/ +/* xio_sgv_empty */ +/*---------------------------------------------------------------------------*/ +static int xio_sgv_empty(struct xio_sg_iov *sgv) +{ + return (!sgv || sgv->nents == 0); +} + +/*---------------------------------------------------------------------------*/ +/* xio_sgv_length */ +/*---------------------------------------------------------------------------*/ +static size_t xio_sgv_length(struct xio_sg_iov *sgv) +{ + size_t sz = 0; + uint32_t i; + + for (i = 0; i < sgv->nents; i++) + sz += sgv->sglist[i].iov_len; + + return sz; +} + +/*---------------------------------------------------------------------------*/ +/* sgtbl_ops_iov */ +/*---------------------------------------------------------------------------*/ +struct xio_sg_table_ops sgtbl_ops_iov = { + .sge_set_buf = (void *)xio_sgve_set_buf, + .sge_addr = (void *)xio_sgve_addr, + .sge_set_addr = (void *)xio_sgve_set_addr, + .sge_mr = NULL, + .sge_set_mr = NULL, + .sge_length = (void *)xio_sgve_length, + .sge_set_length = (void *)xio_sgve_set_length, 
+ .sge_first = (void *)xio_sgve_first, + .sge_last = (void *)xio_sgve_last, + .sge_next = (void *)xio_sgve_next, + .tbl_empty = (void *)xio_sgv_empty, + .tbl_nents = (void *)xio_sgv_nents, + .tbl_sglist = (void *)xio_sgv_sglist, + .tbl_set_nents = (void *)xio_sgv_set_nents, + .tbl_max_nents = (void *)xio_sgv_max_nents, + .tbl_set_max_nents = (void *)xio_sgv_set_max_nents, + .tbl_length = (void *)xio_sgv_length, +}; + diff --git a/open_src/xio/src/kernel/xio/xio_sg_iovptr.c b/open_src/xio/src/kernel/xio/xio_sg_iovptr.c new file mode 100644 index 0000000..91ee6e1 --- /dev/null +++ b/open_src/xio/src/kernel/xio/xio_sg_iovptr.c @@ -0,0 +1,201 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
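
The per-type ops tables (sgtbl_ops_iov above, sgtbl_ops_iovptr and sgtbl_ops_sg in the files that follow) are normally reached through the accessor macros already used by xio_msg_dump(); a sketch summing the bytes of a message's in-side scatter list (the demo_ name is hypothetical):

static size_t demo_in_bytes(struct xio_msg *msg)
{
        struct xio_sg_table_ops *ops;
        void *sgtbl, *sge;
        size_t total = 0;
        int i;

        sgtbl = xio_sg_table_get(&msg->in);
        ops = xio_sg_table_ops_get(msg->in.sgl_type);

        /* walk every scatter entry of the in side and add up its length */
        for_each_sge(sgtbl, ops, sge, i)
                total += sge_length(ops, sge);

        return total;
}
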
+ */ + +/* sg represents xio_sg_iovptr; */ +#include "libxio.h" +#include "xio_sg_table.h" + +/*---------------------------------------------------------------------------*/ +/* xio_sgve_set_buf */ +/*---------------------------------------------------------------------------*/ +static inline void xio_sgve_set_buf(struct xio_iovec_ex *sg, const void *buf, + uint32_t buflen, void *mr) +{ + sg->iov_base = (void *)buf; + sg->iov_len = buflen; +} + +/*---------------------------------------------------------------------------*/ +/* xio_sgve_addr */ +/*---------------------------------------------------------------------------*/ +static inline void *xio_sgve_addr(struct xio_iovec_ex *sg) +{ + return sg->iov_base; +} + +/*---------------------------------------------------------------------------*/ +/* xio_sgve_set_addr */ +/*---------------------------------------------------------------------------*/ +static inline void xio_sgve_set_addr(struct xio_iovec_ex *sg, void *addr) +{ + sg->iov_base = addr; +} + +/*---------------------------------------------------------------------------*/ +/* xio_sgve_length */ +/*---------------------------------------------------------------------------*/ +static inline size_t xio_sgve_length(struct xio_iovec_ex *sg) +{ + return sg->iov_len; +} + +/*---------------------------------------------------------------------------*/ +/* xio_sgve_set_length */ +/*---------------------------------------------------------------------------*/ +static inline void xio_sgve_set_length(struct xio_iovec_ex *sg, + uint32_t length) +{ + sg->iov_len = length; +} + +/*---------------------------------------------------------------------------*/ +/* xio_sgve_first */ +/*---------------------------------------------------------------------------*/ +static struct xio_iovec_ex *xio_sgve_first(struct xio_sg_iovptr *sgv) +{ + return ((!sgv || sgv->nents == 0) ? NULL : &sgv->sglist[0]); +} + +/*---------------------------------------------------------------------------*/ +/* xio_sgve_last */ +/*---------------------------------------------------------------------------*/ +static struct xio_iovec_ex *xio_sgve_last(struct xio_sg_iovptr *sgv) +{ + return ((!sgv || sgv->nents == 0) ? + NULL : &sgv->sglist[sgv->nents - 1]); +} + +/*---------------------------------------------------------------------------*/ +/* xio_sgve_next */ +/*---------------------------------------------------------------------------*/ +static struct xio_iovec_ex *xio_sgve_next(struct xio_sg_iovptr *sgv, + struct xio_iovec_ex *sgve) +{ + return (!sgv || sgv->nents == 0 || + (sgve == &sgv->sglist[sgv->nents - 1])) + ? 
NULL : ++sgve; +} + +/*---------------------------------------------------------------------------*/ +/* xio_sgv_sglist */ +/*---------------------------------------------------------------------------*/ +static inline struct xio_iovec_ex *xio_sgv_sglist(struct xio_sg_iovptr *sgv) +{ + return sgv->sglist; +} + +/*---------------------------------------------------------------------------*/ +/* xio_sgv_nents */ +/*---------------------------------------------------------------------------*/ +static inline int xio_sgv_nents(struct xio_sg_iovptr *sgv) +{ + return sgv->nents; +} + +/*---------------------------------------------------------------------------*/ +/* xio_sgv_max_nents */ +/*---------------------------------------------------------------------------*/ +static inline int xio_sgv_max_nents(struct xio_sg_iovptr *sgv) +{ + return sgv->max_nents; +} + +/*---------------------------------------------------------------------------*/ +/* xio_sgv_set_nents */ +/*---------------------------------------------------------------------------*/ +static inline void xio_sgv_set_nents(struct xio_sg_iovptr *sgv, uint32_t nents) +{ + if (!sgv || sgv->max_nents < nents) + return; + sgv->nents = nents; +} + +/*---------------------------------------------------------------------------*/ +/* xio_sgv_set_max_nents */ +/*---------------------------------------------------------------------------*/ +static inline void xio_sgv_set_max_nents(struct xio_sg_iovptr *sgv, + uint32_t max_nents) +{ + sgv->max_nents = max_nents; +} + +/*---------------------------------------------------------------------------*/ +/* xio_sgv_empty */ +/*---------------------------------------------------------------------------*/ +static int xio_sgv_empty(struct xio_sg_iovptr *sgv) +{ + return (!sgv || sgv->nents == 0); +} + +/*---------------------------------------------------------------------------*/ +/* xio_sgv_set_length */ +/*---------------------------------------------------------------------------*/ +static size_t xio_sgv_length(struct xio_sg_iovptr *sgv) +{ + size_t sz = 0; + uint32_t i; + + for (i = 0; i < sgv->nents; i++) + sz += sgv->sglist[i].iov_len; + + return sz; +} + +/*---------------------------------------------------------------------------*/ +/* sgtbl_ops_iovptr */ +/*---------------------------------------------------------------------------*/ +struct xio_sg_table_ops sgtbl_ops_iovptr = { + .sge_set_buf = (void *)xio_sgve_set_buf, + .sge_addr = (void *)xio_sgve_addr, + .sge_set_addr = (void *)xio_sgve_set_addr, + .sge_mr = NULL, + .sge_set_mr = NULL, + .sge_length = (void *)xio_sgve_length, + .sge_set_length = (void *)xio_sgve_set_length, + .sge_first = (void *)xio_sgve_first, + .sge_last = (void *)xio_sgve_last, + .sge_next = (void *)xio_sgve_next, + .tbl_empty = (void *)xio_sgv_empty, + .tbl_nents = (void *)xio_sgv_nents, + .tbl_sglist = (void *)xio_sgv_sglist, + .tbl_set_nents = (void *)xio_sgv_set_nents, + .tbl_max_nents = (void *)xio_sgv_max_nents, + .tbl_set_max_nents = (void *)xio_sgv_set_max_nents, + .tbl_length = (void *)xio_sgv_length, +}; + diff --git a/open_src/xio/src/kernel/xio/xio_sg_scatter.c b/open_src/xio/src/kernel/xio/xio_sg_scatter.c new file mode 100644 index 0000000..4eb32dc --- /dev/null +++ b/open_src/xio/src/kernel/xio/xio_sg_scatter.c @@ -0,0 +1,291 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. 
+ * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include +/* sg represents xio_sg_iovptr; */ +#include "libxio.h" +#include +#include "xio_sg_table.h" + +#ifdef CONFIG_DEBUG_SG +/* not defined by default */ +#undef XIO_DEBUG_SG +#ifdef XIO_DEBUG_SG +static inline void verify_tbl(struct sg_table *tbl) +{ + if (tbl && tbl->sgl) { + struct scatterlist *sg; + int i; + + sg = tbl->sgl; + for (i = 0; i < tbl->nents; i++) { + if (!sg) + break; + BUG_ON(sg->sg_magic != SG_MAGIC); + /* if is last is marked the next is NULL */ + sg = sg_next(sg); + } + BUG_ON(i != tbl->nents); + } +} +#endif +#else +/* if CONFIG_DEBUG_SG is not defined we can't define XIO_DEBUG_SG */ +#undef XIO_DEBUG_SG +#endif + +/*---------------------------------------------------------------------------*/ +/* xio_tble_set_buf */ +/*---------------------------------------------------------------------------*/ +static inline void xio_sg_set_buf(struct scatterlist *sg, const void *buf, + uint32_t buflen, void *mr) +{ +#ifdef XIO_DEBUG_SG + BUG_ON(sg->sg_magic != SG_MAGIC); +#endif + sg_set_page(sg, virt_to_page(buf), buflen, offset_in_page(buf)); +} + +/*---------------------------------------------------------------------------*/ +/* xio_tble_addr */ +/*---------------------------------------------------------------------------*/ +static inline void *xio_sg_addr(struct scatterlist *sg) +{ +#ifdef XIO_DEBUG_SG + BUG_ON(sg->sg_magic != SG_MAGIC); +#endif + return sg_virt(sg); +} + +/*---------------------------------------------------------------------------*/ +/* xio_tble_set_addr */ +/*---------------------------------------------------------------------------*/ +static inline void xio_sg_set_addr(struct scatterlist *sg, void *addr) +{ + /* keep the length */ +#ifdef XIO_DEBUG_SG + BUG_ON(sg->sg_magic != SG_MAGIC); +#endif + 
sg_set_page(sg, virt_to_page(addr), sg->length, offset_in_page(addr)); +} + +/*---------------------------------------------------------------------------*/ +/* xio_tble_length */ +/*---------------------------------------------------------------------------*/ +static inline size_t xio_sg_length(struct scatterlist *sg) +{ +#ifdef XIO_DEBUG_SG + BUG_ON(sg->sg_magic != SG_MAGIC); +#endif + return sg->length; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tble_set_length */ +/*---------------------------------------------------------------------------*/ +static inline void xio_sg_set_length(struct scatterlist *sg, uint32_t length) +{ +#ifdef XIO_DEBUG_SG + BUG_ON(sg->sg_magic != SG_MAGIC); +#endif + sg->length = length; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tble_first */ +/*---------------------------------------------------------------------------*/ +static struct scatterlist *xio_sg_first(struct sg_table *tbl) +{ +#ifdef XIO_DEBUG_SG + verify_tbl(tbl); +#endif + return ((!tbl || tbl->nents == 0) ? NULL : tbl->sgl); +} + +/*---------------------------------------------------------------------------*/ +/* xio_tble_last */ +/*---------------------------------------------------------------------------*/ +static struct scatterlist *xio_sg_last(struct sg_table *tbl) +{ +#ifdef XIO_DEBUG_SG + verify_tbl(tbl); +#endif + return (!tbl || tbl->nents == 0) ? + NULL : sg_last(tbl->sgl, tbl->nents); +} + +/*---------------------------------------------------------------------------*/ +/* xio_tble_next */ +/*---------------------------------------------------------------------------*/ +static struct scatterlist *xio_sg_next(struct sg_table *tbl, + struct scatterlist *tble) +{ +#ifdef XIO_DEBUG_SG + verify_tbl(tbl); +#endif + /* Note sg_next is checking for last and returns NULL for end */ + return (!tbl || tbl->nents == 0) + ? NULL : sg_next(tble); +} + +/*---------------------------------------------------------------------------*/ +/* xio_tbl_sglist */ +/*---------------------------------------------------------------------------*/ +static inline struct scatterlist *xio_tbl_sglist(struct sg_table *tbl) +{ +#ifdef XIO_DEBUG_SG + verify_tbl(tbl); +#endif + return tbl->sgl; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tbl_nents */ +/*---------------------------------------------------------------------------*/ +static inline int xio_tbl_nents(struct sg_table *tbl) +{ +#ifdef XIO_DEBUG_SG + verify_tbl(tbl); +#endif + return tbl->nents; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tbl_max_nents */ +/*---------------------------------------------------------------------------*/ +static inline int xio_tbl_max_nents(struct sg_table *tbl) +{ +#ifdef XIO_DEBUG_SG + verify_tbl(tbl); +#endif + return tbl->orig_nents; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tbl_set_nents */ +/*---------------------------------------------------------------------------*/ +static inline void xio_tbl_set_nents(struct sg_table *tbl, uint32_t nents) +{ + struct scatterlist *sg; + int i; + +#ifdef XIO_DEBUG_SG + verify_tbl(tbl); +#endif + if (!tbl || tbl->orig_nents < nents) + return; + + sg = tbl->sgl; + /* tbl->nents is unsigned so if tbl->nents is ZERO then tbl->nents - 1 + * is a huge number, so check this. 
+ */ + if (tbl->nents && (tbl->nents < tbl->orig_nents)) { + for (i = 0; i < tbl->nents - 1; i++) + sg = sg_next(sg); + sg_unmark_end(sg); + } + + if (!nents) { + tbl->nents = nents; + return; + } + + sg = tbl->sgl; + for (i = 0; i < nents - 1; i++) + sg = sg_next(sg); + + sg_mark_end(sg); + + tbl->nents = nents; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tbl_empty */ +/*---------------------------------------------------------------------------*/ +static int xio_tbl_empty(struct sg_table *tbl) +{ +#ifdef XIO_DEBUG_SG + verify_tbl(tbl); +#endif + return (!tbl || tbl->nents == 0); +} + +/*---------------------------------------------------------------------------*/ +/* xio_tbl_set_length */ +/*---------------------------------------------------------------------------*/ +static size_t xio_tbl_length(struct sg_table *tbl) +{ + struct scatterlist *sg; + size_t sz = 0; + uint32_t i; + +#ifdef XIO_DEBUG_SG + verify_tbl(tbl); +#endif + sg = tbl->sgl; + for (i = 0; i < tbl->nents; i++) { + sz += sg->length; + sg = sg_next(sg); + } + + return sz; +} + +/*---------------------------------------------------------------------------*/ +/* sgtbl_ops_iovptr */ +/*---------------------------------------------------------------------------*/ +struct xio_sg_table_ops sgtbl_ops_sg = { + .sge_set_buf = (void *)xio_sg_set_buf, + .sge_addr = (void *)xio_sg_addr, + .sge_set_addr = (void *)xio_sg_set_addr, + .sge_mr = NULL, + .sge_set_mr = NULL, + .sge_length = (void *)xio_sg_length, + .sge_set_length = (void *)xio_sg_set_length, + .sge_first = (void *)xio_sg_first, + .sge_last = (void *)xio_sg_last, + .sge_next = (void *)xio_sg_next, + .tbl_empty = (void *)xio_tbl_empty, + .tbl_nents = (void *)xio_tbl_nents, + .tbl_sglist = (void *)xio_tbl_sglist, + .tbl_set_nents = (void *)xio_tbl_set_nents, + .tbl_max_nents = (void *)xio_tbl_max_nents, + .tbl_set_max_nents = NULL, + .tbl_length = (void *)xio_tbl_length, +}; + diff --git a/open_src/xio/src/kernel/xio/xio_sg_table.c b/open_src/xio/src/kernel/xio/xio_sg_table.c new file mode 100644 index 0000000..3e6766b --- /dev/null +++ b/open_src/xio/src/kernel/xio/xio_sg_table.c @@ -0,0 +1,235 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include "libxio.h" +#include "xio_log.h" +#include "xio_sg_table.h" + +extern struct xio_sg_table_ops sgtbl_ops_iov; +extern struct xio_sg_table_ops sgtbl_ops_iovptr; +extern struct xio_sg_table_ops sgtbl_ops_sg; + +void *xio_sg_table_ops_get(enum xio_sgl_type sgl_type) +{ + static void *vec[XIO_SGL_TYPE_LAST] = { + [XIO_SGL_TYPE_IOV] = (void *)&sgtbl_ops_iov, + [XIO_SGL_TYPE_IOV_PTR] = (void *)&sgtbl_ops_iovptr, + [XIO_SGL_TYPE_SCATTERLIST] = (void *)&sgtbl_ops_sg + }; + + return vec[sgl_type]; +} +EXPORT_SYMBOL(xio_sg_table_ops_get); + +/*---------------------------------------------------------------------------*/ +/* tbl_clone */ +/*---------------------------------------------------------------------------*/ +int tbl_clone(struct xio_sg_table_ops *dtbl_ops, void *dtbl, + struct xio_sg_table_ops *stbl_ops, void *stbl) +{ + void *dsge; + void *ssge; + int i; + + if (tbl_max_nents(dtbl_ops, dtbl) < tbl_nents(stbl_ops, stbl)) { + ERROR_LOG("dest max nents is %d while src nents is %d\n", + tbl_max_nents(dtbl_ops, dtbl), + tbl_nents(stbl_ops, stbl)); + return -1; + } + + tbl_set_nents(dtbl_ops, dtbl, + tbl_nents(stbl_ops, stbl)); + ssge = sge_first(stbl_ops, stbl); + for_each_sge(dtbl, dtbl_ops, dsge, i) { + sge_set_addr(dtbl_ops, dsge, + sge_addr(stbl_ops, ssge)); + sge_set_length(dtbl_ops, dsge, + sge_length(stbl_ops, ssge)); + + ssge = sge_next(stbl_ops, stbl, ssge); + } + + return 0; +} +EXPORT_SYMBOL(tbl_clone); + +/*---------------------------------------------------------------------------*/ +/* tbl_copy */ +/*---------------------------------------------------------------------------*/ +int tbl_copy(struct xio_sg_table_ops *dtbl_ops, void *dtbl, + struct xio_sg_table_ops *stbl_ops, void *stbl) +{ + void *dsge = sge_first(dtbl_ops, dtbl); + void *ssge = sge_first(stbl_ops, stbl); + void *daddr = sge_addr(dtbl_ops, dsge); + void *saddr = sge_addr(stbl_ops, ssge); + size_t dlen = sge_length(dtbl_ops, dsge); + size_t slen = sge_length(stbl_ops, ssge); + size_t dnents = tbl_nents(dtbl_ops, dtbl); + size_t snents = tbl_nents(stbl_ops, stbl); + + size_t d = 0, + s = 0, + dst_len = 0; + + if (dnents < 1 || snents < 1) { + ERROR_LOG("nents < 1 dnents:%zd, snents:%zd\n", + dnents, snents); + return 0; + } + + while (1) { + if (slen < dlen) { + memcpy(daddr, saddr, slen); + dst_len += slen; + + s++; + ssge = sge_next(stbl_ops, stbl, ssge); + if (s == snents) { + sge_set_length(dtbl_ops, dsge, dst_len); + d++; + /*dsge = sge_next(dtbl_ops, dtbl, dsge);*/ + break; + } + dlen -= slen; + daddr += slen; + saddr = sge_addr(stbl_ops, ssge); + slen = sge_length(stbl_ops, ssge); + } else if (dlen < slen) { + memcpy(daddr, saddr, dlen); + sge_set_length(dtbl_ops, dsge, (dst_len + dlen)); + dst_len 
= 0; + d++; + dsge = sge_next(dtbl_ops, dtbl, dsge); + if (d == dnents) + break; + slen -= dlen; + saddr += dlen; + daddr = sge_addr(dtbl_ops, dsge); + dlen = sge_length(dtbl_ops, dsge); + } else { + memcpy(daddr, saddr, dlen); + sge_set_length(dtbl_ops, dsge, (dst_len + dlen)); + dst_len = 0; + + d++; + s++; + dsge = sge_next(dtbl_ops, dtbl, dsge); + ssge = sge_next(stbl_ops, stbl, ssge); + if ((d == dnents) || (s == snents)) + break; + + daddr = sge_addr(dtbl_ops, dsge); + dlen = sge_length(dtbl_ops, dsge); + saddr = sge_addr(stbl_ops, ssge); + slen = sge_length(stbl_ops, ssge); + } + } + + /* not enough buffers to complete */ + if (s < snents) { + ERROR_LOG("dest iovec exhausted\n"); + return 0; + } + tbl_set_nents(dtbl_ops, dtbl, d); + + return 0; +} +EXPORT_SYMBOL(tbl_copy); + +/*---------------------------------------------------------------------------*/ +/* tbl_copy_sg */ +/*---------------------------------------------------------------------------*/ +int tbl_copy_sg(struct xio_sg_table_ops *dtbl_ops, void *dtbl, + struct xio_sg_table_ops *stbl_ops, void *stbl) +{ + void *dsge = sge_first(dtbl_ops, dtbl); + void *ssge = sge_first(stbl_ops, stbl); + void *daddr = sge_addr(dtbl_ops, dsge); + void *saddr = sge_addr(stbl_ops, ssge); + size_t dlen = sge_length(dtbl_ops, dsge); + size_t slen = sge_length(stbl_ops, ssge); + size_t dnents = tbl_nents(dtbl_ops, dtbl); + size_t snents = tbl_nents(stbl_ops, stbl); + + size_t d = 0, + s = 0; + + if (dnents < 1 || snents < 1) { + ERROR_LOG("nents < 1 dnents:%zd, snents:%zd\n", + dnents, snents); + return 0; + } + if (dnents < snents) { + ERROR_LOG("dnents < snents dnents:%zd, snents:%zd\n", + dnents, snents); + return 0; + } + + dnents = snents; + while (1) { + if (slen <= dlen) { + dlen = slen; + memcpy(daddr, saddr, dlen); + sge_set_length(dtbl_ops, dsge, dlen); + + d++; + s++; + dsge = sge_next(dtbl_ops, dtbl, dsge); + ssge = sge_next(stbl_ops, stbl, ssge); + if ((d == dnents) || (s == snents)) + break; + + daddr = sge_addr(dtbl_ops, dsge); + dlen = sge_length(dtbl_ops, dsge); + saddr = sge_addr(stbl_ops, ssge); + slen = sge_length(stbl_ops, ssge); + } else { + ERROR_LOG("not enough buffer to complete " \ + "slen:%zd dlen:%zd\n", slen, dlen); + break; + } + } + tbl_set_nents(dtbl_ops, dtbl, d); + + return 0; +} +EXPORT_SYMBOL(tbl_copy_sg); + diff --git a/open_src/xio/src/kernel/xio/xio_task.c b/open_src/xio/src/kernel/xio/xio_task.c new file mode 100644 index 0000000..33541a0 --- /dev/null +++ b/open_src/xio/src/kernel/xio/xio_task.c @@ -0,0 +1,364 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include "libxio.h" +#include "xio_log.h" +#include "xio_common.h" +#include "xio_protocol.h" +#include "xio_mbuf.h" +#include "xio_task.h" +#include "xio_observer.h" +#include "xio_transport.h" + +#include +#include + +#define XIO_TASK_MAGIC 0x58494f54 /* Hex of 'XIOT' */ + +/*---------------------------------------------------------------------------*/ +/* xio_tasks_pool_alloc_slab */ +/*---------------------------------------------------------------------------*/ +int xio_tasks_pool_alloc_slab(struct xio_tasks_pool *q, void *context) +{ + int alloc_nr; + size_t slab_alloc_sz; + size_t tasks_alloc_sz; + void *buf; + void *data; + void *ptr; + struct xio_tasks_slab *s; + int retval = 0, i, tot_len, initialized = 0; + struct xio_task *task; + struct xio_msg *msg; + struct xio_vmsg *vmsg; + LIST_HEAD(tmp_list); + + if ((int)q->params.start_nr < 0 || (int)q->params.max_nr < 0 || + (int)q->params.alloc_nr < 0) { + xio_set_error(EINVAL); + return -1; + } + + if (q->params.start_nr && q->curr_alloced < q->params.start_nr) + alloc_nr = min(q->params.start_nr, q->params.max_nr); + else + alloc_nr = min(q->params.alloc_nr, + q->params.max_nr - q->curr_alloced); + + if (alloc_nr == 0) + return 0; + + /* slab + private data */ + slab_alloc_sz = sizeof(struct xio_tasks_slab) + + q->params.slab_dd_data_sz + + alloc_nr * sizeof(struct xio_task *); + + /* slab data */ + tasks_alloc_sz = alloc_nr * (sizeof(struct xio_task) + + q->params.task_dd_data_sz); + + tot_len = PAGE_ALIGN(slab_alloc_sz + tasks_alloc_sz); + buf = vmalloc(tot_len); + if (!buf) { + xio_set_error(ENOMEM); + return -1; + } + memset(buf, 0, tot_len); + + data = buf; + ptr = buf; + + /* slab */ + s = (void *)((char *)buf + tasks_alloc_sz); + s->dd_data = (void *)((char *)s + sizeof(struct xio_tasks_slab)); + + /* array */ + s->array = (void *)((char *)(s->dd_data) + q->params.slab_dd_data_sz); + + /* fix indexes */ + s->start_idx = q->curr_idx; + s->end_idx = s->start_idx + alloc_nr - 1; + q->curr_idx = s->end_idx + 1; + s->nr = alloc_nr; + + INIT_LIST_HEAD(&s->slabs_list_entry); + + if (q->params.pool_hooks.slab_pre_create) { + retval = q->params.pool_hooks.slab_pre_create( + context, + alloc_nr, + q->dd_data, + s->dd_data); + if (retval) + goto cleanup; + } + + for (i = 0; i < alloc_nr; i++) { + s->array[i] = data; + task = s->array[i]; + task->ltid = s->start_idx + i; + task->magic = XIO_TASK_MAGIC; + task->pool = (void *)q; + task->slab = (void *)s; + task->dd_data = ((char *)data) + sizeof(struct xio_task); + + data = 
((char *)data) + sizeof(struct xio_task) + + q->params.task_dd_data_sz; + + msg = &task->imsg; + + vmsg = &msg->in; + vmsg->sgl_type = XIO_SGL_TYPE_SCATTERLIST; + + if (sg_alloc_table(&vmsg->data_tbl, g_options.max_in_iovsz, + GFP_KERNEL)) { + ERROR_LOG("sg_alloc_table(read_sge)\n"); + goto cleanup; + } + + vmsg = &msg->out; + vmsg->sgl_type = XIO_SGL_TYPE_SCATTERLIST; + + if (sg_alloc_table(&vmsg->data_tbl, g_options.max_out_iovsz, + GFP_KERNEL)) { + ERROR_LOG("sg_weite_table(read_sge)\n"); + sg_free_table(&msg->in.data_tbl); + goto cleanup; + } + + if (q->params.pool_hooks.slab_init_task && context) { + retval = q->params.pool_hooks.slab_init_task( + context, + q->dd_data, + s->dd_data, + i, + s->array[i]); + if (retval) { + sg_free_table(&msg->out.data_tbl); + sg_free_table(&msg->in.data_tbl); + goto cleanup; + } + } + list_add_tail(&task->tasks_list_entry, &tmp_list); + initialized++; + } + q->curr_alloced += alloc_nr; + + list_add_tail(&s->slabs_list_entry, &q->slabs_list); + preempt_disable(); + list_splice_tail(&tmp_list, &q->stack); + preempt_enable(); + + if (q->params.pool_hooks.slab_post_create && context) { + retval = q->params.pool_hooks.slab_post_create( + context, + q->dd_data, + s->dd_data); + if (retval) + goto cleanup; + } + + return retval; + +cleanup: + list_del_init(&s->slabs_list_entry); + + for (i = 0; i < initialized; i++) { + task = s->array[i]; + msg = &task->imsg; + list_del_init(&task->tasks_list_entry); + sg_free_table(&msg->out.data_tbl); + sg_free_table(&msg->in.data_tbl); + } + vfree(ptr); + + return -1; +} +EXPORT_SYMBOL(xio_tasks_pool_alloc_slab); + +/*---------------------------------------------------------------------------*/ +/* xio_tasks_pool_create */ +/*---------------------------------------------------------------------------*/ +struct xio_tasks_pool *xio_tasks_pool_create( + struct xio_tasks_pool_params *params) +{ + struct xio_tasks_pool *q; + char *buf; + + /* pool */ + buf = kzalloc(sizeof(*q) + params->pool_dd_data_sz, GFP_KERNEL); + if (!buf) { + xio_set_error(ENOMEM); + return NULL; + } + q = (void *)buf; + if (params->pool_dd_data_sz) + q->dd_data = (void *)(q + 1); + else + q->dd_data = NULL; + + INIT_LIST_HEAD(&q->stack); + INIT_LIST_HEAD(&q->slabs_list); + + memcpy(&q->params, params, sizeof(*params)); + + if (q->params.pool_hooks.pool_pre_create) + q->params.pool_hooks.pool_pre_create( + q->params.pool_hooks.context, q, q->dd_data); + + if (q->params.start_nr) { + xio_tasks_pool_alloc_slab(q, q->params.pool_hooks.context); + if (list_empty(&q->stack)) { + kfree(q); + return NULL; + } + } + if (q->params.pool_hooks.pool_post_create) + q->params.pool_hooks.pool_post_create( + q->params.pool_hooks.context, q, q->dd_data); + + return q; +} +EXPORT_SYMBOL(xio_tasks_pool_create); + +/*---------------------------------------------------------------------------*/ +/* xio_tasks_pool_destroy */ +/*---------------------------------------------------------------------------*/ +void xio_tasks_pool_destroy(struct xio_tasks_pool *q) +{ + struct xio_tasks_slab *pslab, *next_pslab; + struct xio_task *task; + struct xio_msg *msg; + unsigned int i; + + list_for_each_entry_safe(pslab, next_pslab, &q->slabs_list, + slabs_list_entry) { + list_del_init(&pslab->slabs_list_entry); + + if (q->params.pool_hooks.slab_uninit_task) { + for (i = 0; i < pslab->nr; i++) { + task = pslab->array[i]; + msg = &task->imsg; + list_del_init(&task->tasks_list_entry); + sg_free_table(&msg->out.data_tbl); + sg_free_table(&msg->in.data_tbl); + + 
q->params.pool_hooks.slab_uninit_task( + pslab->array[i]->context, + q->dd_data, + pslab->dd_data, + pslab->array[i]); + } + } + + if (q->params.pool_hooks.slab_destroy) + q->params.pool_hooks.slab_destroy( + q->params.pool_hooks.context, + q->dd_data, + pslab->dd_data); + + /* the tmp tasks are returned back to pool */ + vfree(pslab->array[0]); + } + if (q->params.pool_hooks.pool_destroy) + q->params.pool_hooks.pool_destroy( + q->params.pool_hooks.context, + q, q->dd_data); + + kfree(q); +} +EXPORT_SYMBOL(xio_tasks_pool_destroy); + +/*---------------------------------------------------------------------------*/ +/* xio_tasks_pool_remap */ +/*---------------------------------------------------------------------------*/ +void xio_tasks_pool_remap(struct xio_tasks_pool *q, void *new_context) +{ + struct xio_tasks_slab *pslab, *next_pslab; + unsigned int i; + int retval; + + if (!q) + return; + + list_for_each_entry_safe(pslab, next_pslab, &q->slabs_list, + slabs_list_entry) { + if (q->params.pool_hooks.slab_post_create) + retval = q->params.pool_hooks.slab_post_create( + new_context, + q->dd_data, + pslab->dd_data); + + if (q->params.pool_hooks.slab_remap_task) { + for (i = 0; i < pslab->nr; i++) + q->params.pool_hooks.slab_remap_task( + q->params.pool_hooks.context, + new_context, + q->dd_data, + pslab->dd_data, + pslab->array[i]); + } + } + + q->params.pool_hooks.context = new_context; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tasks_pool_dump_used */ +/*---------------------------------------------------------------------------*/ +void xio_tasks_pool_dump_used(struct xio_tasks_pool *q) +{ + struct xio_tasks_slab *pslab; + unsigned int i; + char *pool_name; + + list_for_each_entry(pslab, &q->slabs_list, slabs_list_entry) { + for (i = 0; i < pslab->nr; i++) + if (pslab->array[i]->tlv_type != 0xdead) { + pool_name = q->params.pool_name ? + q->params.pool_name : "unknown"; + ERROR_LOG("pool_name:%s: in use: task:%p, " \ + "type:0x%x\n", + pool_name, + pslab->array[i], + pslab->array[i]->tlv_type); + } + } +} + diff --git a/open_src/xio/src/kernel/xio/xio_workqueue.c b/open_src/xio/src/kernel/xio/xio_workqueue.c new file mode 100644 index 0000000..87e815d --- /dev/null +++ b/open_src/xio/src/kernel/xio/xio_workqueue.c @@ -0,0 +1,452 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#include +#include +#include +#include +#include +#include +#include + +#include "libxio.h" +#include + +#include "xio_log.h" +#include "xio_common.h" +#include "xio_ev_data.h" +#include "xio_objpool.h" +#include "xio_workqueue_priv.h" +#include "xio_observer.h" +#include "xio_ev_loop.h" +#include "xio_context.h" +#include "xio_context_priv.h" + +struct xio_workqueue { + struct xio_context *ctx; + struct workqueue_struct *workqueue; + spinlock_t lock; /* workqueue lock */ +}; + +/*---------------------------------------------------------------------------*/ +/* xio_workqueue_create */ +/*---------------------------------------------------------------------------*/ +struct xio_workqueue *xio_workqueue_create(struct xio_context *ctx) +{ + struct xio_workqueue *workqueue; + char queue_name[64]; + + workqueue = kmalloc(sizeof(*workqueue), GFP_KERNEL); + if (!workqueue) { + ERROR_LOG("kmalloc failed.\n"); + return NULL; + } + + /* temporary (also change to single thread) */ + sprintf(queue_name, "xio-scheduler-%p", ctx); + /* check flags and backward compatibility */ +#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 36) + workqueue->workqueue = create_workqueue(queue_name); +#else + workqueue->workqueue = alloc_workqueue(queue_name, + WQ_MEM_RECLAIM | WQ_HIGHPRI, + 0); +#endif + if (!workqueue->workqueue) { + ERROR_LOG("workqueue create failed.\n"); + goto cleanup1; + } + + workqueue->ctx = ctx; + spin_lock_init(&workqueue->lock); + + return workqueue; + +cleanup1: + kfree(workqueue); + return NULL; +} + +/*---------------------------------------------------------------------------*/ +/* xio_workqueue_destroy */ +/*---------------------------------------------------------------------------*/ +int xio_workqueue_destroy(struct xio_workqueue *work_queue) +{ + flush_workqueue(work_queue->workqueue); + destroy_workqueue(work_queue->workqueue); + + kfree(work_queue); + + return 0; +} + +static void xio_ev_callback(void *user_context) +{ + int deleted = 0; + struct xio_uwork *uwork = user_context; + int try_destroy = 0; + + set_bit(XIO_WORK_RUNNING, &uwork->flags); + if (test_bit(XIO_WORK_CANCELED, &uwork->flags)) { + clear_bit(XIO_WORK_PENDING, &uwork->flags); + } else { + void (*function)(void *data); + void *data; + /* Must clear pending before calling the function + * in case the function deletes the work or the + * enclosing structure. 
Note that since the function + * can reuse the work structure after clearing the + * pending flag then we must use temporary variables + */ + function = uwork->function; + data = uwork->data; + /* Set running before clearing pending */ + clear_bit(XIO_WORK_PENDING, &uwork->flags); + uwork->deleted = &deleted; + set_bit(XIO_WORK_IN_HANDLER, &uwork->flags); + function(data); + if (deleted) + return; + clear_bit(XIO_WORK_IN_HANDLER, &uwork->flags); + try_destroy = !!uwork->destructor; + } + clear_bit(XIO_WORK_RUNNING, &uwork->flags); + complete(&uwork->complete); + if (try_destroy) + uwork->destructor(uwork->destructor_data); +} + +static void xio_uwork_add_event(struct xio_uwork *uwork) +{ + struct xio_ev_data *ev_data; + + if (test_bit(XIO_WORK_CANCELED, &uwork->flags)) { + clear_bit(XIO_WORK_PENDING, &uwork->flags); + return; + } + + /* This routine is called on context core */ + + ev_data = &uwork->ev_data; + ev_data->handler = xio_ev_callback; + ev_data->data = uwork; + + xio_context_add_event(uwork->ctx, ev_data); +} + +static void xio_dwork_callback(struct work_struct *workp) +{ + struct xio_delayed_work *dwork; + struct xio_uwork *uwork; + + dwork = container_of(workp, struct xio_delayed_work, dwork.work); + uwork = &dwork->uwork; + + xio_uwork_add_event(uwork); +} + +static void xio_work_callback(struct work_struct *workp) +{ + struct xio_work *work; + struct xio_uwork *uwork; + + work = container_of(workp, struct xio_work, work); + uwork = &work->uwork; + + xio_uwork_add_event(uwork); +} + +/*---------------------------------------------------------------------------*/ +/* xio_workqueue_del_uwork2 */ +/*---------------------------------------------------------------------------*/ +static int xio_workqueue_del_uwork2(struct xio_workqueue *workqueue, + struct xio_uwork *uwork) +{ + /* Work is in event loop queue or running, can wait for its completion + * only if on other workers context + */ + + if (workqueue->ctx == uwork->ctx) { + if (test_bit(XIO_WORK_IN_HANDLER, &uwork->flags)) { + /* simple self cancellation detected + * it doesn't detect loop cancellation + */ + TRACE_LOG("self cancellation.\n"); + clear_bit(XIO_WORK_IN_HANDLER, &uwork->flags); + clear_bit(XIO_WORK_RUNNING, &uwork->flags); + *uwork->deleted = 1; + } else { + /* It is O.K. to arm a work and then to cancel it but + * waiting for it will create a lockout situation. + * that is this context needs to block until completion + * is signaled from this context. + * since the work was marked canceled in phase 1 it + * is guaranteed not to run in the future. + */ + /* + * TODO We might have an issue in case the + * xio_ev_callback event was already added to the loop, + * and meanwhile the work was/will be freed from another + * event in this context. + * In this case, we need to remove xio_ev_callback event + * from the loop here, but we do not support this right + * now... + * The best solution for now is to add event for freeing + * the work, after canceling the work. + * Need to make sure to do this in each of the objects + * containing a work, e.g. nexus, connection, ... + */ + xio_context_disable_event(&uwork->ev_data); + } + return 0; + } + + /* work may be on event handler */ + /* TODO: tasklet version? 
*/ + if (in_atomic()) { + ERROR_LOG("Can't wait for cancellation in atomic context.\n"); + return -1; + } + + wait_for_completion(&uwork->complete); + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_workqueue_del_uwork1 */ +/*---------------------------------------------------------------------------*/ +static int xio_workqueue_del_uwork1(struct xio_workqueue *workqueue, + struct xio_uwork *uwork) +{ + int ret; + + if (!test_bit(XIO_WORK_INITIALIZED, &uwork->flags)) { + ERROR_LOG("work not initialized.\n"); + return -1; + } + + if (!workqueue->workqueue) { + ERROR_LOG("No work-queue\n"); + return -1; + } + + if (test_and_set_bit(XIO_WORK_CANCELED, &uwork->flags)) { + /* Already canceled */ + return 0; + } + + if (test_bit(XIO_WORK_RUNNING, &uwork->flags)) { + /* In xio_ev_callback go directly to phase 2 */ + TRACE_LOG("phase1 -> phase2.\n"); + ret = xio_workqueue_del_uwork2(workqueue, uwork); + return ret; + } + + if (!test_bit(XIO_WORK_PENDING, &uwork->flags)) { + /* work not pending (run done) */ + TRACE_LOG("work not pending.\n"); + return 0; + } + + /* need to cancel the work */ + return 1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_workqueue_del_delayed_work */ +/*---------------------------------------------------------------------------*/ +int xio_workqueue_del_delayed_work(struct xio_workqueue *workqueue, + xio_delayed_work_handle_t *dwork) +{ + struct xio_uwork *uwork = &dwork->uwork; + int ret; + + ret = xio_workqueue_del_uwork1(workqueue, uwork); + if (ret <= 0) + return ret; + + /* need to cancel the work */ + if (cancel_delayed_work_sync(&dwork->dwork)) { + clear_bit(XIO_WORK_PENDING, &uwork->flags); + return 0; + } + + ret = xio_workqueue_del_uwork2(workqueue, uwork); + + return ret; +} + +/*---------------------------------------------------------------------------*/ +/* xio_workqueue_del_work */ +/*---------------------------------------------------------------------------*/ +int xio_workqueue_del_work(struct xio_workqueue *workqueue, + xio_work_handle_t *work) +{ + struct xio_uwork *uwork = &work->uwork; + int ret; + + ret = xio_workqueue_del_uwork1(workqueue, uwork); + if (ret <= 0) + return ret; + + /* need to cancel the work */ + if (cancel_work_sync(&work->work)) { + clear_bit(XIO_WORK_PENDING, &uwork->flags); + return 0; + } + + ret = xio_workqueue_del_uwork2(workqueue, uwork); + + return ret; +} + +static int xio_init_uwork(struct xio_context *ctx, + struct xio_uwork *uwork, + void *data, + void (*function)(void *data)) +{ + if (test_and_set_bit(XIO_WORK_PENDING, &uwork->flags)) { + /* work already pending */ + TRACE_LOG("work already pending.\n"); + return -1; + } + clear_bit(XIO_WORK_CANCELED, &uwork->flags); + + if (test_and_set_bit(XIO_WORK_INITIALIZED, &uwork->flags)) { + /* re-arm completion */ +#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 13, 0) + INIT_COMPLETION(uwork->complete); +#else + reinit_completion(&uwork->complete); +#endif + } else { + init_completion(&uwork->complete); + } + + uwork->data = data; + uwork->function = function; + uwork->ctx = ctx; + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_workqueue_add_delayed_work */ +/*---------------------------------------------------------------------------*/ +int xio_workqueue_add_delayed_work(struct xio_workqueue *workqueue, + int msec_duration, void *data, + void (*function)(void *data), + xio_delayed_work_handle_t *dwork) + +{ + struct 
xio_uwork *uwork = &dwork->uwork; + struct xio_context *ctx = workqueue->ctx; + unsigned long delay_jiffies; + + if (xio_init_uwork(ctx, uwork, data, function) < 0) { + ERROR_LOG("initialization of work failed.\n"); + return -1; + } + + INIT_DELAYED_WORK(&dwork->dwork, xio_dwork_callback); + + delay_jiffies = msecs_to_jiffies(msec_duration); + + /* queue the work */ + if (!queue_delayed_work_on(ctx->cpuid, workqueue->workqueue, + &dwork->dwork, delay_jiffies)) { + ERROR_LOG("work already queued?.\n"); + return -1; + } + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_workqueue_add_work */ +/*---------------------------------------------------------------------------*/ +int xio_workqueue_add_work(struct xio_workqueue *workqueue, + void *data, + void (*function)(void *data), + xio_work_handle_t *work) +{ + struct xio_uwork *uwork = &work->uwork; + struct xio_context *ctx = workqueue->ctx; + + if (xio_init_uwork(ctx, uwork, data, function) < 0) { + ERROR_LOG("initialization of work failed.\n"); + return -1; + } + + INIT_WORK(&work->work, xio_work_callback); + + /* queue the work */ + if (!queue_work_on(ctx->cpuid, workqueue->workqueue, &work->work)) { + ERROR_LOG("work already queued?.\n"); + return -1; + } + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_workqueue_set_work_destructor */ +/*---------------------------------------------------------------------------*/ +int xio_workqueue_set_work_destructor(struct xio_workqueue *work_queue, + void *data, + void (*destructor)(void *data), + xio_work_handle_t *work) +{ + struct xio_uwork *uwork = &work->uwork; + + uwork->destructor = destructor; + uwork->destructor_data = data; + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_workqueue_is_work_in_hanlder */ +/*---------------------------------------------------------------------------*/ +int xio_workqueue_is_work_in_handler(struct xio_workqueue *work_queue, + xio_work_handle_t *work) +{ + struct xio_uwork *uwork = &work->uwork; + + return test_bit(XIO_WORK_IN_HANDLER, &uwork->flags); +} + diff --git a/open_src/xio/src/kernel/xio/xio_workqueue_priv.h b/open_src/xio/src/kernel/xio/xio_workqueue_priv.h new file mode 100644 index 0000000..e5dced5 --- /dev/null +++ b/open_src/xio/src/kernel/xio/xio_workqueue_priv.h @@ -0,0 +1,103 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef XIO_WORKQUEUE_PRIV_H +#define XIO_WORKQUEUE_PRIV_H + +enum xio_work_flags { + XIO_WORK_PENDING = 1 << 0, + XIO_WORK_CANCELED = 1 << 1, + XIO_WORK_RUNNING = 1 << 2, + XIO_WORK_INITIALIZED = 1 << 3, + XIO_WORK_IN_HANDLER = 1 << 4 +}; + +struct xio_uwork { + struct xio_ev_data ev_data; + struct xio_context *ctx; + + void (*function)(void *data); + void *data; + + void (*destructor)(void *data); + void *destructor_data; + + volatile unsigned long flags; + int *deleted; + struct completion complete; +}; + +typedef struct xio_work { + struct work_struct work; + struct xio_uwork uwork; +} xio_work_handle_t; + +typedef struct xio_delayed_work { + struct delayed_work dwork; + struct xio_uwork uwork; +} xio_delayed_work_handle_t; + +static inline int xio_is_uwork_pending(struct xio_uwork *uwork) +{ + /* xio_ev_callback sets RUNNING before clearing PENDING */ + if (test_bit(XIO_WORK_PENDING, &uwork->flags)) + return 1; + + if (test_bit(XIO_WORK_RUNNING, &uwork->flags)) + return 1; + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_is_work_pending */ +/*---------------------------------------------------------------------------*/ +static inline int xio_is_work_pending(xio_work_handle_t *work) +{ + return xio_is_uwork_pending(&work->uwork); +} + +/*---------------------------------------------------------------------------*/ +/* xio_is_delayed_work_pending */ +/*---------------------------------------------------------------------------*/ +static inline int xio_is_delayed_work_pending(xio_delayed_work_handle_t *dwork) +{ + return xio_is_uwork_pending(&dwork->uwork); +} + +#endif /* XIO_WORKQUEUE_PRIV_H */ + diff --git a/open_src/xio/src/kernel/xio_log.h b/open_src/xio/src/kernel/xio_log.h new file mode 100644 index 0000000..04b14e0 --- /dev/null +++ b/open_src/xio/src/kernel/xio_log.h @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. 
+ * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef XIO_LOG_H +#define XIO_LOG_H + +#include + +#define FATAL_LOG(fmt, ...) \ + pr_crit("FATAL: %s:%d::%s(): " pr_fmt(fmt), \ + __FILE__, __LINE__, __func__,\ + ## __VA_ARGS__) + +#define ERROR_LOG(fmt, ...) \ + pr_err("ERROR: %s:%d::%s(): " pr_fmt(fmt), \ + __FILE__, __LINE__, __func__,\ + ## __VA_ARGS__) + +#define WARN_LOG(fmt, ...) \ + pr_warn("WARN: %s:%d::%s(): " pr_fmt(fmt), \ + __FILE__, __LINE__, __func__,\ + ## __VA_ARGS__) + +#define INFO_LOG(fmt, ...) \ + pr_info("INFO: %s:%d::%s(): " pr_fmt(fmt), \ + __FILE__, __LINE__, __func__,\ + ## __VA_ARGS__) + +#define DEBUG_LOG(fmt, ...) \ + pr_debug("DEBUG: %s:%d::%s(): " pr_fmt(fmt), \ + __FILE__, __LINE__, __func__,\ + ## __VA_ARGS__) + +/* pr_devel() should produce zero code unless DEBUG is defined */ +#define TRACE_LOG(fmt, ...) \ + pr_devel("TRACE: %s:%d::%s(): " pr_fmt(fmt), \ + __FILE__, __LINE__, __func__,\ + ## __VA_ARGS__) + +/* Not yet implemented, parameter or sysfs */ +static inline void xio_read_logging_level(void) +{ + pr_devel("xio_read_logging_level\n"); +} + +static inline int xio_set_log_level(int /*enum xio_log_level*/ level) +{ + return -1; +} + +static inline int /*enum xio_log_level*/ xio_get_log_level(void) +{ + return 0; +} + +static inline int xio_set_log_fn(void * /*xio_log_fn*/ fn) +{ + return -1; +} + +#endif /* XIO_LOG_H */ diff --git a/open_src/xio/src/libxio_os/linuxapp/xio_env.h b/open_src/xio/src/libxio_os/linuxapp/xio_env.h new file mode 100644 index 0000000..a30ce66 --- /dev/null +++ b/open_src/xio/src/libxio_os/linuxapp/xio_env.h @@ -0,0 +1,523 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. 
+ * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef XIO_ENV_H +#define XIO_ENV_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/*---------------------------------------------------------------------------*/ +/*-------------------- Memory related things --------------------------------*/ +/*---------------------------------------------------------------------------*/ +#define PACKED_MEMORY(__declaration__) \ + __declaration__ __attribute__((__packed__)) + +/*---------------------------------------------------------------------------*/ +static inline int xio_memalign(void **memptr, size_t alignment, size_t size) +{ + return posix_memalign(memptr, alignment, size); +} + +/*---------------------------------------------------------------------------*/ +static inline void xio_memfree(void *memptr) +{ + free(memptr); +} + +/*---------------------------------------------------------------------------*/ +static inline long xio_get_page_size(void) +{ + static long page_size; + + if (!page_size) + page_size = sysconf(_SC_PAGESIZE); + + return page_size; +} + +/*---------------------------------------------------------------------------*/ +static inline void *xio_mmap(size_t length) +{ + return mmap(NULL, length, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | + MAP_POPULATE | MAP_HUGETLB, -1, 0); +} + +/*---------------------------------------------------------------------------*/ +static inline int 
xio_munmap(void *addr, size_t length) +{ + return munmap(addr, length); +} + +/*---------------------------------------------------------------------------*/ +static inline void *xio_numa_alloc_onnode(size_t size, int node) +{ + return numa_alloc_onnode(size, node); +} + +/*---------------------------------------------------------------------------*/ +static inline void xio_numa_free(void *start, size_t size) +{ + numa_free(start, size); +} + +/*---------------------------------------------------------------------------*/ +/*------------------- CPU and Clock related things --------------------------*/ +/*---------------------------------------------------------------------------*/ +static inline long xio_get_num_processors(void) +{ + static long num_processors; + + if (!num_processors) + num_processors = sysconf(_SC_NPROCESSORS_CONF); + + return num_processors; +} + +/*---------------------------------------------------------------------------*/ +static inline int xio_clock_gettime(struct timespec *ts) +{ + return clock_gettime(CLOCK_MONOTONIC, ts); +} + +struct getcpu_cache { + unsigned long blob[128 / sizeof(long)]; +}; + +typedef long (*vgetcpu_fn)(unsigned *cpu, + unsigned *node, struct getcpu_cache *tcache); +static vgetcpu_fn vgetcpu; + +static inline int init_vgetcpu(void) +{ + void *vdso; + + dlerror(); + vdso = dlopen("linux-vdso.so.1", RTLD_LAZY); + if (!vdso) + return -1; + vgetcpu = (vgetcpu_fn)dlsym(vdso, "__vdso_getcpu"); + dlclose(vdso); + return !vgetcpu ? -1 : 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_get_cpu */ +/*---------------------------------------------------------------------------*/ +static inline unsigned xio_get_cpu(void) +{ + static int first = 1; + unsigned cpu; + + if (!first && vgetcpu) { + vgetcpu(&cpu, NULL, NULL); + return cpu; + } + if (!first) + return sched_getcpu(); + + first = 0; + if (init_vgetcpu() < 0) { + vgetcpu = NULL; + return sched_getcpu(); + } + vgetcpu(&cpu, NULL, NULL); + return cpu; +} + +/* +#define CACHE_LINE_FILE \ + "/sys/devices/system/cpu/cpu0/cache/index0/coherency_line_size" + +static inline int arch_cache_line_size(void) +{ + char size[32]; + int fd, ret; + + fd = open(CACHE_LINE_FILE, O_RDONLY); + if (fd < 0) + return -1; + + ret = read(fd, size, sizeof(size)); + + close(fd); + + if (ret <= 0) + return -1; + else + return atoi(size); +} + +*/ +/*---------------------------------------------------------------------------*/ +/* xio_pause */ +/*---------------------------------------------------------------------------*/ +static inline void xio_pause(void) +{ + _mm_pause(); +} + +/*---------------------------------------------------------------------------*/ +static inline int xio_numa_node_of_cpu(int cpu) +{ + return numa_node_of_cpu(cpu); +} + +/*---------------------------------------------------------------------------*/ +static inline int xio_numa_run_on_node(int node) +{ + return numa_run_on_node(node); +} + +#define XIO_HZ_DIR "/var/tmp/accelio.d" +#define XIO_HZ_FILE XIO_HZ_DIR "/hz" + +/*---------------------------------------------------------------------------* + * xio_get_cpu_mhz * + * * + * since this operation may take time cache it on a cookie, * + * and use the cookie if exist * + * * + *---------------------------------------------------------------------------*/ +static inline double xio_get_cpu_mhz(void) +{ + char size[32] = {0}; + double hz = 0; + int fd; + ssize_t ret; + + fd = open(XIO_HZ_FILE, O_RDONLY); + if (fd < 0) + goto try_create; + + ret = 
read(fd, size, sizeof(size)); + + close(fd); + + if (ret > 0) + return atof(size); + +try_create: + hz = get_cpu_mhz(0); + + ret = mkdir(XIO_HZ_DIR, 0777); + if (ret < 0) + goto exit; + + fd = open(XIO_HZ_FILE, O_CREAT | O_TRUNC | O_WRONLY | O_SYNC, 0644); + if (fd < 0) + goto exit; + + sprintf(size, "%f", hz); + ret = write(fd, size, sizeof(size)); + if (ret < 0) + goto close_and_exit; + +close_and_exit: + close(fd); +exit: + return hz; +} + +/*---------------------------------------------------------------------------*/ +/* xio_pin_to_cpu - pin to specific cpu */ +/*---------------------------------------------------------------------------*/ +static inline int xio_pin_to_cpu(int cpu) +{ + int ncpus = numa_num_task_cpus(); + int ret; + cpu_set_t cs; + + if (ncpus > CPU_SETSIZE) + return -1; + + CPU_ZERO(&cs); + CPU_SET(cpu, &cs); + if (CPU_COUNT(&cs) == 1) + return 0; + + ret = sched_setaffinity(0, sizeof(cs), &cs); + if (ret) + return -1; + + /* guaranteed to take effect immediately */ + sched_yield(); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_pin_to_node - pin to the numa node of the cpu */ +/*---------------------------------------------------------------------------*/ +static inline int xio_pin_to_node(int cpu) +{ + int node = numa_node_of_cpu(cpu); + /* pin to node */ + int ret = numa_run_on_node(node); + + if (ret) + return -1; + + /* is numa_run_on_node() guaranteed to take effect immediately? */ + sched_yield(); + + return -1; +} + +/*---------------------------------------------------------------------------*/ +/*-------------------- Thread related things --------------------------------*/ +/*---------------------------------------------------------------------------*/ +#define xio_tls __thread +typedef pthread_once_t thread_once_t; +#define THREAD_ONCE_INIT PTHREAD_ONCE_INIT +#define CALLBACK +/*---------------------------------------------------------------------------*/ +#define thread_once(once_control, init_routine) \ + pthread_once(once_control, init_routine) +/*---------------------------------------------------------------------------*/ +#define reset_thread_once_t(once_control) \ + ((*(once_control)) = THREAD_ONCE_INIT) +/*---------------------------------------------------------------------------*/ +#define is_reset_thread_once_t(once_control) \ + ((*(once_control)) == THREAD_ONCE_INIT) + +/*---------------------------------------------------------------------------*/ +#define xio_sync_bool_compare_and_swap(ptr, oldval, newval) \ + __sync_bool_compare_and_swap(ptr, oldval, newval) +#define xio_sync_fetch_and_add32(ptr, value) \ + __sync_fetch_and_add((ptr), (value)) +#define xio_sync_fetch_and_add64(ptr, value) \ + __sync_fetch_and_add((ptr), (value)) + +/*---------------------------------------------------------------------------*/ +#define XIO_F_ALWAYS_INLINE inline __attribute__((always_inline)) + +/*---------------------------------------------------------------------------*/ +#define LIBRARY_INITIALIZER(f) \ + static void f(void)__attribute__((constructor)); \ + static void f(void) + +/*---------------------------------------------------------------------------*/ +#define LIBRARY_FINALIZER(f) \ + static void f(void)__attribute__((destructor)); \ + static void f(void) + +/*---------------------------------------------------------------------------*/ +#define inc_ptr(_ptr, inc) ((_ptr) += (inc)) +#define sum_to_ptr(_ptr, a) ((_ptr) + (a)) + +static inline uint64_t xio_get_current_thread_id(void) +{ + 
return (uint64_t)pthread_self(); +} + +/*---------------------------------------------------------------------------*/ +/*-------------------- Socket related things --------------------------------*/ +/*---------------------------------------------------------------------------*/ +#define INVALID_SOCKET (-1) +#define XIO_ESHUTDOWN ESHUTDOWN +#define XIO_EINPROGRESS EINPROGRESS /* connect on non-blocking socket */ +#define XIO_EAGAIN EAGAIN /* recv on non-blocking socket */ +#define XIO_WOULDBLOCK EWOULDBLOCK /* recv on non-blocking socket */ +#define XIO_ECONNABORTED ECONNABORTED +#define XIO_ECONNRESET ECONNRESET +#define XIO_ECONNREFUSED ECONNREFUSED + +typedef int socket_t; +/*---------------------------------------------------------------------------*/ +static inline int xio_closesocket(socket_t sock) +{ + return close(sock); +} + +/*---------------------------------------------------------------------------*/ +static inline int xio_write(socket_t sock, const void *buf, size_t len) +{ + return write(sock, buf, len); +} + +/*---------------------------------------------------------------------------*/ +static inline ssize_t xio_read(socket_t sock, void *buf, size_t count) +{ + return read(sock, buf, count); +} + +/*---------------------------------------------------------------------------*/ +static inline int xio_get_last_socket_error(void) +{ + return errno; +} + +/*---------------------------------------------------------------------------*/ +/* enables or disables the blocking mode for the socket + If mode != 0, blocking is enabled; + If mode = 0, non-blocking mode is enabled. */ +static inline int xio_set_blocking(socket_t sock, unsigned long mode) +{ + long arg = fcntl(sock, F_GETFL, NULL); + + if (arg < 0) + return -1; + + if (mode) /* blocking */ + arg &= (~O_NONBLOCK); + + else /* non blocking */ + arg |= O_NONBLOCK; + + if (fcntl(sock, F_SETFL, arg) < 0) + return -1; + + return 0; +} + +/*---------------------------------------------------------------------------*/ +static inline int xio_pipe(int socks[2], int is_blocking) +{ + return pipe2(socks, is_blocking ? 
0 : O_NONBLOCK); +} + +/*---------------------------------------------------------------------------*/ +static inline socket_t xio_socket_non_blocking(int domain, int type, + int protocol) +{ + return socket(domain, type | SOCK_NONBLOCK, protocol); +} + +/*---------------------------------------------------------------------------*/ +/* NOTE: we aren't using static inline function here; because accept4 requires + * defining _GNU_SOURCE and we don't want users to be forced to define it in + * their application */ +#define xio_accept_non_blocking(sockfd, addr, addrlen) \ + accept4(sockfd, addr, addrlen, SOCK_NONBLOCK) + +/*---------------------------------------------------------------------------*/ +static inline void xio_env_cleanup(void) +{ + /* nothing to do */ +} + +/*---------------------------------------------------------------------------*/ +static inline void xio_env_startup(void) +{ + /* nothing to do */ +} + +/*---------------------------------------------------------------------------*/ +static inline int xio_timerfd_create(void) +{ + return timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK); +} + +/*---------------------------------------------------------------------------*/ +static inline int xio_timerfd_settime(int fd, int flags, + const struct itimerspec *new_value, + struct itimerspec *old_value) +{ + return timerfd_settime(fd, flags, new_value, old_value); +} + +/* + * Determine whether some value is a power of two, where zero is + * *not* considered a power of two. + */ + +static inline __attribute__((const)) +bool is_power_of_2(unsigned long n) +{ + return (n != 0 && ((n & (n - 1)) == 0)); +} + +#ifdef __cplusplus +} +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#endif /* XIO_ENV_H */ diff --git a/open_src/xio/src/libxio_os/linuxapp/xio_env_adv.h b/open_src/xio/src/libxio_os/linuxapp/xio_env_adv.h new file mode 100644 index 0000000..16f91a9 --- /dev/null +++ b/open_src/xio/src/libxio_os/linuxapp/xio_env_adv.h @@ -0,0 +1,3 @@ +#ifndef __ADV_ENV_H_ +#define __ADV_ENV_H_ +#endif /* __ADV_ENV_H_ */ diff --git a/open_src/xio/src/libxio_os/linuxapp/xio_env_basic.h b/open_src/xio/src/libxio_os/linuxapp/xio_env_basic.h new file mode 100644 index 0000000..a20f80f --- /dev/null +++ b/open_src/xio/src/libxio_os/linuxapp/xio_env_basic.h @@ -0,0 +1,3 @@ +#ifndef BASIC_ENV_H +#define BASIC_ENV_H +#endif /* BASIC_ENV_H */ diff --git a/open_src/xio/src/libxio_os/linuxkernel/xio_env.h b/open_src/xio/src/libxio_os/linuxkernel/xio_env.h new file mode 100644 index 0000000..b200c42 --- /dev/null +++ b/open_src/xio/src/libxio_os/linuxkernel/xio_env.h @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef XIO_ENV_H +#define XIO_ENV_H + +/*---------------------------------------------------------------------------*/ +/*-------------------- Memory related things --------------------------------*/ +/*---------------------------------------------------------------------------*/ +#define PACKED_MEMORY(__declaration__) \ + __declaration__ __attribute__((__packed__)) + +/*---------------------------------------------------------------------------*/ +#define inc_ptr(_ptr, inc) ((_ptr) += (inc)) +#define sum_to_ptr(_ptr, a) ((_ptr) + (a)) + +/*---------------------------------------------------------------------------*/ +/*-------------------- Threads related things --------------------------------*/ +/*---------------------------------------------------------------------------*/ +#define xio_sync_bool_compare_and_swap(ptr, oldval, newval) \ + __sync_bool_compare_and_swap(ptr, oldval, newval) +#define xio_sync_fetch_and_add32(ptr, value) \ + __sync_fetch_and_add((ptr), (value)) +#define xio_sync_fetch_and_add64(ptr, value) \ + __sync_fetch_and_add((ptr), (value)) + +/*---------------------------------------------------------------------------*/ +#define XIO_F_ALWAYS_INLINE inline __attribute__ ((always_inline)) + +/*---------------------------------------------------------------------------*/ +/*-------------------- Socket related things --------------------------------*/ +/*---------------------------------------------------------------------------*/ +#define INVALID_SOCKET (-1) +#define XIO_ESHUTDOWN ESHUTDOWN +#define XIO_EINPROGRESS EINPROGRESS /* connect on non-blocking socket */ +#define XIO_EAGAIN EAGAIN /* recv on non-blocking socket */ +#define XIO_WOULDBLOCK EWOULDBLOCK /* recv on non-blocking socket */ +#define XIO_ECONNABORTED ECONNABORTED +#define XIO_ECONNRESET ECONNRESET +#define XIO_ECONNREFUSED ECONNREFUSED + +#endif /* XIO_ENV_H */ diff --git a/open_src/xio/src/libxio_os/linuxkernel/xio_env_adv.h b/open_src/xio/src/libxio_os/linuxkernel/xio_env_adv.h new file mode 100644 index 0000000..16f91a9 --- /dev/null +++ b/open_src/xio/src/libxio_os/linuxkernel/xio_env_adv.h @@ -0,0 +1,3 @@ +#ifndef __ADV_ENV_H_ +#define __ADV_ENV_H_ +#endif /* __ADV_ENV_H_ */ diff --git a/open_src/xio/src/libxio_os/linuxkernel/xio_env_basic.h 
b/open_src/xio/src/libxio_os/linuxkernel/xio_env_basic.h new file mode 100644 index 0000000..a20f80f --- /dev/null +++ b/open_src/xio/src/libxio_os/linuxkernel/xio_env_basic.h @@ -0,0 +1,3 @@ +#ifndef BASIC_ENV_H +#define BASIC_ENV_H +#endif /* BASIC_ENV_H */ diff --git a/open_src/xio/src/libxio_os/winapp/list.h b/open_src/xio/src/libxio_os/winapp/list.h new file mode 100644 index 0000000..cccdf23 --- /dev/null +++ b/open_src/xio/src/libxio_os/winapp/list.h @@ -0,0 +1,312 @@ +#ifndef _LINUX_LIST_H +#define _LINUX_LIST_H + +/* +based on https ://raw.githubusercontent.com/sgminer-dev/sgminer/master/elist.h +with several enhancements by Avner BenHanoch +*/ + +#ifdef _MSC_VER +#define typeof(x) decltype(x) +#endif + +/* + * Simple doubly linked list implementation. + * + * Some of the internal functions ("__xxx") are useful when + * manipulating whole lists rather than single entries, as + * sometimes we already know the next/prev entries and we can + * generate better code by using them directly rather than + * using the generic single-entry routines. + */ + +struct list_head { + struct list_head *next, *prev; +}; + +#define LIST_HEAD_INIT(name) { &(name), &(name) } + +#define LIST_HEAD(name) \ + struct list_head name = LIST_HEAD_INIT(name) + +#define INIT_LIST_HEAD(ptr) do { \ + (ptr)->next = (ptr); (ptr)->prev = (ptr); \ +} while (0) + +/* + * Insert a new entry between two known consecutive entries. + * + * This is only for internal list manipulation where we know + * the prev/next entries already! + */ +static __inline void __list_add(struct list_head *newhead, + struct list_head *prev, + struct list_head *next) +{ + next->prev = newhead; + newhead->next = next; + newhead->prev = prev; + prev->next = newhead; +} + +/** + * list_add - add a new entry + * @new: new entry to be added + * @head: list head to add it after + * + * Insert a new entry after the specified head. + * This is good for implementing stacks. + */ +static __inline void list_add(struct list_head *newhead, struct list_head *head) +{ + __list_add(newhead, head, head->next); +} + +/** + * list_add_tail - add a new entry + * @new: new entry to be added + * @head: list head to add it before + * + * Insert a new entry before the specified head. + * This is useful for implementing queues. + */ +static __inline void list_add_tail(struct list_head *newhead, + struct list_head *head) +{ + __list_add(newhead, head->prev, head); +} + +/* + * Delete a list entry by making the prev/next entries + * point to each other. + * + * This is only for internal list manipulation where we know + * the prev/next entries already! + */ +static __inline void __list_del(struct list_head *prev, struct list_head *next) +{ + next->prev = prev; + prev->next = next; +} + +/** + * list_del - deletes entry from list. + * @entry: the element to delete from the list. + * Note: list_empty on entry does not return true after this, the entry is in + * an undefined state. + */ +static __inline void list_del(struct list_head *entry) +{ + __list_del(entry->prev, entry->next); + entry->next = NULL; + entry->prev = NULL; +} + +/** + * list_del_init - deletes entry from list and reinitialize it. + * @entry: the element to delete from the list. 
+ */ +static __inline void list_del_init(struct list_head *entry) +{ + __list_del(entry->prev, entry->next); + INIT_LIST_HEAD(entry); +} + +/** + * list_move - delete from one list and add as another's head + * @list: the entry to move + * @head: the head that will precede our entry + */ +static __inline void list_move(struct list_head *list, struct list_head *head) +{ + __list_del(list->prev, list->next); + list_add(list, head); +} + +/** + * list_move_tail - delete from one list and add as another's tail + * @list: the entry to move + * @head: the head that will follow our entry + */ +static __inline void list_move_tail(struct list_head *list, + struct list_head *head) +{ + __list_del(list->prev, list->next); + list_add_tail(list, head); +} + +/** + * list_empty - tests whether a list is empty + * @head: the list to test. + */ +static __inline int list_empty(struct list_head *head) +{ + return head->next == head; +} + +static __inline void __list_splice(struct list_head *list, + struct list_head *head) +{ + struct list_head *first = list->next; + struct list_head *last = list->prev; + struct list_head *at = head->next; + + first->prev = head; + head->next = first; + + last->next = at; + at->prev = last; +} + +/** + * list_splice - join two lists + * @list: the new list to add. + * @head: the place to add it in the first list. + */ +static __inline void list_splice(struct list_head *list, struct list_head *head) +{ + if (!list_empty(list)) + __list_splice(list, head); +} + +/** + * list_splice_init - join two lists and reinitialise the emptied list. + * @list: the new list to add. + * @head: the place to add it in the first list. + * + * The list at @list is reinitialised + */ +static __inline void list_splice_init(struct list_head *list, + struct list_head *head) +{ + if (!list_empty(list)) { + __list_splice(list, head); + INIT_LIST_HEAD(list); + } +} + +/** + * list_entry - get the struct for this entry + * @ptr: the &struct list_head pointer. + * @type: the type of the struct this is embedded in. + * @member: the name of the list_struct within the struct. + */ +#ifndef _MSC_VER +#define list_entry(ptr, type, member) \ + ((type)((char *)(ptr)-(unsigned long)(&((type)0)->member))) +#else +#define list_entry(ptr, type, member) \ + container_of(ptr, type, member) +#define list_entry2(ptr, ptrtype, member) \ + (reinterpret_cast((char *)(ptr)-\ + (char *)(&(reinterpret_cast(1)->member)) + 1)) +#endif + + + +/** +* list_first_entry - get the first element from a list +* @ptr: the list head to take the element from. +* @type: the type of the struct this is embedded in. +* @member: the name of the list_struct within the struct. +* +* Note, that list is expected to be not empty. +*/ +#define list_first_entry(ptr, type, member) \ + list_entry((ptr)->next, type, member) + +/** +* list_first_entry_or_null - get the first element from a list +* @ptr: the list head to take the element from. +* @type: the type of the struct this is embedded in. +* @member: the name of the list_struct within the struct. +* +* Note that if the list is empty, it returns NULL. +*/ +#define list_first_entry_or_null(ptr, type, member) \ + (!list_empty(ptr) ? list_first_entry(ptr, type, member) : NULL) + +/** + * list_for_each - iterate over a list + * @pos: the &struct list_head to use as a loop counter. + * @head: the head for your list. 
+ */ +#define list_for_each(pos, head) \ + for (pos = (head)->next; pos != (head); \ + pos = pos->next) +/** + * list_for_each_prev - iterate over a list backwards + * @pos: the &struct list_head to use as a loop counter. + * @head: the head for your list. + */ +#define list_for_each_prev(pos, head) \ + for (pos = (head)->prev; pos != (head); \ + pos = pos->prev) + +/** + * list_for_each_safe - iterate over a list safe against removal of list entry + * @pos: the &struct list_head to use as a loop counter. + * @n: another &struct list_head to use as temporary storage + * @head: the head for your list. + */ +#define list_for_each_safe(pos, n, head) \ + for (pos = (head)->next, n = pos->next; pos != (head); \ + pos = n, n = pos->next) + +/** + * list_for_each_entry - iterate over list of given type + * @pos: the type * to use as a loop counter. + * @head: the head for your list. + * @member: the name of the list_struct within the struct. + */ +#ifndef _MSC_VER +#define list_for_each_entry(pos, head, member) \ + for (pos = list_entry((head)->next, typeof(*pos), member); \ + &pos->member != (head); \ + pos = list_entry(pos->member.next, typeof(*pos), member)) +#else +#define list_for_each_entry(pos, head, member) \ + for (pos = list_entry2((head)->next, typeof(pos), member); \ + &pos->member != (head); \ + pos = list_entry2(pos->member.next, typeof(pos), member)) +#endif + +/** + * list_for_each_entry_safe - iterate over list of given type safe against + removal of list entry + * @pos: the type * to use as a loop counter. + * @n: another type * to use as temporary storage + * @head: the head for your list. + * @member: the name of the list_struct within the struct. + */ +#ifndef _MSC_VER +#define list_for_each_entry_safe(pos, n, head, member) \ + for (pos = list_entry((head)->next, typeof(pos), member), \ + n = list_entry(pos->member.next, typeof(pos), member); \ + &pos->member != (head); \ + pos = n, n = list_entry(n->member.next, typeof(n), member)) + +#else + +#define list_for_each_entry_safe(pos, n, head, member) \ + for (pos = list_entry2((head)->next, decltype(pos), member), \ + n = list_entry2(pos->member.next, decltype(pos), member); \ + &(pos->member) != (head); \ + pos = n, n = list_entry2(n->member.next, decltype(n), member)) +#endif + +/** + * list_for_each_entry_continue - iterate over list of given type + * continuing after existing point + * @pos: the type * to use as a loop counter. + * @head: the head for your list. + * @member: the name of the list_struct within the struct. 
+ */ +#define list_for_each_entry_continue(pos, head, member) \ + for (pos = list_entry2(pos->member.next, typeof(*pos), member), \ + prefetch(pos->member.next); \ + &pos->member != (head); \ + pos = list_entry2(pos->member.next, typeof(*pos), member), \ + prefetch(pos->member.next)) + +#endif diff --git a/open_src/xio/src/libxio_os/winapp/spinlock.h b/open_src/xio/src/libxio_os/winapp/spinlock.h new file mode 100644 index 0000000..7c1e7bb --- /dev/null +++ b/open_src/xio/src/libxio_os/winapp/spinlock.h @@ -0,0 +1,127 @@ +#ifndef __SPINLOCK_WIN_H___ +#define __SPINLOCK_WIN_H___ + +#ifdef __cplusplus +extern "C" { +#endif + +#define YIELD_ITERATION 30 /* yield after 30 iterations */ +#define MAX_SLEEP_ITERATION 40 +#define SEED_VAL 100 + + +struct spinlock { + volatile long dest; + long exchange; + long compare; + +}; +typedef struct spinlock spinlock_t; + +#define PTHREAD_PROCESS_PRIVATE 0 + +static __inline int spin_lock_init(spinlock_t *spinlock) +{ + spinlock->dest = 0; + spinlock->exchange = SEED_VAL; + spinlock->compare = 0; + return 0; +} + +static __inline int spin_lock_init2(spinlock_t *lock, int pshared) +{ + (void *)pshared; + return spin_lock_init(lock); +} + +static __inline int spin_lock_destroy(spinlock_t *spinlock) +{ + // nothing to do + return 0; +} + +static __inline void spin_lock(spinlock_t *spinlock) +{ + int iterations = 0; + + while (1) { + /* A thread already owning the lock shouldn't be + * allowed to wait to acquire the lock - reentrant safe + */ + if (spinlock->dest == GetCurrentThreadId()) + break; + + /* Spinning in a loop of interlockedxxx calls can reduce + * the available memory bandwidth and slow down the + * rest of the system. Interlocked calls are expensive in + * their use of the system memory bus. It is better to + * see if the 'dest' value is what it is expected and then + * retry interlockedxx. + */ + if (InterlockedCompareExchangeAcquire(&spinlock->dest, + spinlock->exchange, + spinlock->compare) == 0) { + /* assign CurrentThreadId to dest to make it + * re-entrant safe + */ + spinlock->dest = GetCurrentThreadId(); + /* lock acquired */ + break; + } + + /* spin wait to acquire */ + while (spinlock->dest != spinlock->compare) { + if (iterations >= YIELD_ITERATION) { + if (iterations + YIELD_ITERATION >= + MAX_SLEEP_ITERATION) + Sleep(0); + + if (iterations < MAX_SLEEP_ITERATION) { + iterations = 0; + SwitchToThread(); + } + } + /* Yield processor on multi-processor but if + * on single processor then give other thread + * the CPU + */ + iterations++; + if (xio_get_num_processors() > 1) + YieldProcessor(); + else + SwitchToThread(); + } + } +} + +static __inline int spin_try_lock(spinlock_t *spinlock) +{ + if (spinlock->dest == GetCurrentThreadId()) + return 0; + + if (InterlockedCompareExchangeAcquire(&spinlock->dest, spinlock->exchange, + spinlock->compare) == 0) { + spinlock->dest = GetCurrentThreadId(); + return 1; + } + return 0; +} + +static __inline int spin_locked(spinlock_t *spinlock) +{ + return InterlockedAddAcquire(&spinlock->dest, 0); +} + +static __inline void spin_unlock(spinlock_t *spinlock) +{ + if (spinlock->dest != GetCurrentThreadId()) + return; + + InterlockedCompareExchangeRelease(&spinlock->dest, spinlock->compare, + GetCurrentThreadId()); +} +#ifdef __cplusplus +} +#endif + +#endif // ! 
__SPINLOCK_WIN_H___ \ No newline at end of file diff --git a/open_src/xio/src/libxio_os/winapp/xio_env.h b/open_src/xio/src/libxio_os/winapp/xio_env.h new file mode 100644 index 0000000..00ee5e9 --- /dev/null +++ b/open_src/xio/src/libxio_os/winapp/xio_env.h @@ -0,0 +1,758 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef XIO_ENV_H +#define XIO_ENV_H + +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include "list.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef SSIZE_T ssize_t; +typedef __int32 int32_t; +typedef unsigned __int32 uint32_t; +typedef int64_t __s64; + + +#define __func__ __FUNCTION__ +#define __builtin_expect(x,y) (x) /* kickoff likely/unlikely in MSVC */ +#define likely(x) __builtin_expect(!!(x), 1) +#define unlikely(x) __builtin_expect(!!(x), 0) + +#define XIO_F_ALWAYS_INLINE __forceinline + +/*---------------------------------------------------------------------------*/ +/*-------------------- Memory related things --------------------------------*/ +/*---------------------------------------------------------------------------*/ + +#define PACKED_MEMORY( __Declaration__ ) __pragma( pack(push, 1) ) \ + __Declaration__ __pragma( pack(pop) ) + +/*---------------------------------------------------------------------------*/ +static inline int xio_memalign(void **memptr, size_t alignment, size_t size){ + *memptr = _aligned_malloc(size, alignment); + if (*memptr) return 0; /* success */ + return errno ? 
errno : -1; /* error */ +} + +/*---------------------------------------------------------------------------*/ +static inline void xio_memfree(void *memptr){ + _aligned_free(memptr); +} + +/*---------------------------------------------------------------------------*/ +static inline long xio_get_page_size(void) +{ + static long page_size = 0; + + if (!page_size) { + SYSTEM_INFO sysinfo; + GetSystemInfo(&sysinfo); + page_size = sysinfo.dwPageSize; + } + return page_size; +} + +/*---------------------------------------------------------------------------*/ +#define MAP_FAILED ((void *) -1) + +/*---------------------------------------------------------------------------*/ +static inline void *xio_mmap(size_t length){ + assert(0 && "not yet supported"); + return MAP_FAILED; +} + +/*---------------------------------------------------------------------------*/ +static inline int xio_munmap(void *addr, size_t length){ + assert(0 && "not yet supported"); + return -1; +} + +/*---------------------------------------------------------------------------*/ +static inline void *xio_numa_alloc_onnode(size_t size, int node) +{ + assert(0 && "not yet supported"); + return NULL; +} + +/*---------------------------------------------------------------------------*/ +static inline void xio_numa_free(void *start, size_t size) { + assert(0 && "not yet supported"); +} + + +/*---------------------------------------------------------------------------*/ +/*-------------------- Threads related things -------------------------------*/ +/*---------------------------------------------------------------------------*/ + +#define xio_tls __declspec(thread) + +typedef INIT_ONCE thread_once_t; +static const INIT_ONCE INIT_ONCE_RESET_VALUE = INIT_ONCE_STATIC_INIT; +#define THREAD_ONCE_INIT INIT_ONCE_STATIC_INIT +#define thread_once(once_control, init_routine) \ + InitOnceExecuteOnce(once_control, init_routine ## _msvc, NULL, NULL); +#define reset_thread_once_t(once_control) \ + memcpy(once_control, &INIT_ONCE_RESET_VALUE, sizeof(INIT_ONCE)) +#define is_reset_thread_once_t(once_control) \ + (0 == memcmp(once_control, &INIT_ONCE_RESET_VALUE, sizeof(INIT_ONCE))) +#define xio_sync_fetch_and_add32(ptr, value) \ + (InterlockedAddAcquire((volatile LONG *)(ptr), (value)) - (value)) +#define xio_sync_fetch_and_add64(ptr, value) \ + (InterlockedAddAcquire64((volatile LONG64 *)(ptr), (value)) - (value)) + +/* TODO: perhaps protect the type cast */ +#define xio_sync_bool_compare_and_swap(ptr, oldval, newval) \ + ((long)(oldval) == InterlockedCompareExchangeAcquire(\ + (volatile long*)(ptr), (long)(newval), (long)(oldval))) + + +/* TODO: consider removing (since user already must call xio_init()? 
+NOTICE: if you'll use DllMain here - DO NOT call WSAStartup from DllMain */ +#define LIBRARY_INITIALIZER(f) \ + static void f(void) + +#define LIBRARY_FINALIZER(f) \ + static void f(void) + + +#ifdef __cplusplus +#define inc_ptr(_ptr, _inc) do {char *temp = (char*)(_ptr); \ + temp += (_inc); (_ptr) = temp; } while (0) +#else +#define inc_ptr(_ptr, _inc) ( ((char*)(_ptr)) += (_inc) ) +#endif + +#define sum_to_ptr(_ptr, a) ( ((char*)(_ptr)) + (a) ) + +static inline uint64_t xio_get_current_thread_id() { + return GetCurrentThreadId(); +} + +/*---------------------------------------------------------------------------*/ +/*------------------- CPU and Clock related things --------------------------*/ +/*---------------------------------------------------------------------------*/ +static inline long xio_get_num_processors(void) +{ + static long num_processors = 0; + + if (!num_processors) { + SYSTEM_INFO sysinfo; + GetSystemInfo(&sysinfo); + num_processors = sysinfo.dwNumberOfProcessors; + } + return num_processors; +} + +/*---------------------------------------------------------------------------*/ +static inline long xio_get_cpu(void) +{ + /*TODO: consider GetCurrentProcessorNumberEx */ + return GetCurrentProcessorNumber(); +} + +/*---------------------------------------------------------------------------*/ +static inline int xio_numa_node_of_cpu(int cpu) +{ +// assert(0 && "not yet supported"); + return -1; /* error */ +} + +/*---------------------------------------------------------------------------*/ +static inline int xio_numa_run_on_node(int node) +{ +// assert(0 && "not yet supported"); + return -1; /* error */ +} + +/*---------------------------------------------------------------------------*/ +/* xio_pin_to_cpu - pin to specific cpu */ +/*---------------------------------------------------------------------------*/ +static int inline xio_pin_to_cpu(int cpu) { + /* not supported yet in Windows */ + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_pin_to_node - pin to the numa node of the cpu */ +/*---------------------------------------------------------------------------*/ +static inline int xio_pin_to_node(int cpu) { + /* not supported yet in Windows */ + return 0; +} + + + +struct timespec { + time_t tv_sec; /* seconds */ + long tv_nsec; /* nanoseconds */ +}; + +static const __int64 DELTA_EPOCH_IN_MICROSECS = 11644473600000000; + +struct timezone2 +{ + __int32 tz_minuteswest; /* minutes W of Greenwich */ + int tz_dsttime; /* type of dst correction */ +}; + +struct itimerspec { + struct timespec it_interval; /* Interval for periodic timer */ + struct timespec it_value; /* Initial expiration */ +}; + +/*---------------------------------------------------------------------------*/ +/* temp code here */ +int static inline gettimeofday(struct timeval *tv, struct timezone2 *tz) +{ + if (tv != NULL) { + FILETIME ft; + __int64 tmpres = 0; + + ZeroMemory(&ft, sizeof(ft)); + + GetSystemTimeAsFileTime(&ft); + + tmpres = ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (__int32)(tmpres*0.000001); + tv->tv_usec = (tmpres % 1000000); + } + + /*_tzset(),don't work properly, so we use GetTimeZoneInformation */ + if (tz != NULL) { + int rez = 0; + TIME_ZONE_INFORMATION tz_winapi; + ZeroMemory(&tz_winapi, sizeof(tz_winapi)); + rez = GetTimeZoneInformation(&tz_winapi); + tz->tz_dsttime 
= (rez == 2); + tz->tz_minuteswest = tz_winapi.Bias + + ((rez == 2) ? tz_winapi.DaylightBias : 0); + } + return 0; +} + +/*---------------------------------------------------------------------------*/ +#define localtime_r( _clock, _result ) \ + (*(_result) = *localtime((const time_t *)(_clock)), \ + (_result)) + +/*---------------------------------------------------------------------------* + * xio_get_cpu_mhz * + * * + * since this operation may take time cache it on a cookie, * + * and use the cookie if exist * + * * + *---------------------------------------------------------------------------*/ +static inline double xio_get_cpu_mhz(void) +{ + static double cpu_mhz; + + if (!cpu_mhz) { + LARGE_INTEGER performanceFrequency; + QueryPerformanceFrequency(&performanceFrequency); + cpu_mhz = (double)performanceFrequency.QuadPart; + } + + return cpu_mhz; +} + +/*---------------------------------------------------------------------------*/ +static inline int xio_clock_gettime(struct timespec *ts) +{ + LARGE_INTEGER t; + static LARGE_INTEGER offset; + static int initialized = 0; + static const long NANOSECONDS_IN_SECOND = 1000 * 1000 * 1000; + static LARGE_INTEGER performanceFrequency; + + if (!initialized) { + initialized = 1; + QueryPerformanceFrequency(&performanceFrequency); + QueryPerformanceCounter(&offset); + } + QueryPerformanceCounter(&t); + + t.QuadPart -= offset.QuadPart; + t.QuadPart *= NANOSECONDS_IN_SECOND; + t.QuadPart /= performanceFrequency.QuadPart; + + ts->tv_sec = (long)(t.QuadPart / NANOSECONDS_IN_SECOND); + ts->tv_nsec = (long)(t.QuadPart % NANOSECONDS_IN_SECOND); + return (0); +} + +/*---------------------------------------------------------------------------*/ +/*-------------------- Network related things -------------------------------*/ +/*---------------------------------------------------------------------------*/ + +#define XIO_ESHUTDOWN WSAESHUTDOWN +#define XIO_EINPROGRESS WSAEWOULDBLOCK /* connect on non-blocking */ +#define XIO_EAGAIN WSAEWOULDBLOCK /* recv on non-blocking */ +#define XIO_WOULDBLOCK WSAEWOULDBLOCK /* recv on non-blocking */ +#define XIO_ECONNABORTED WSAECONNABORTED +#define XIO_ECONNRESET WSAECONNRESET +#define XIO_ECONNREFUSED WSAECONNREFUSED + + +#define SHUT_RDWR SD_BOTH +#define MSG_NOSIGNAL 0 + +typedef SOCKET socket_t; + + +/*---------------------------------------------------------------------------*/ +static inline int xio_get_last_socket_error() { return WSAGetLastError(); } + +/*---------------------------------------------------------------------------*/ +static inline int xio_closesocket(socket_t sock) {return closesocket(sock);} + +/*---------------------------------------------------------------------------*/ +static inline int xio_write(socket_t sock, const void *buf, size_t len) { + return send(sock, (const char *)buf, len, 0); +} + +/*---------------------------------------------------------------------------*/ +static inline ssize_t xio_read(socket_t sock, void *buf, size_t count) { + return recv(sock, (char *)buf, count, 0); +} + +/*---------------------------------------------------------------------------*/ +/* +* based on: http://cantrip.org/socketpair.c +* +* dumb_socketpair: +* If make_overlapped is nonzero, both sockets created will be usable for +* "overlapped" operations via WSASend etc. If make_overlapped is zero, +* socks[0] (only) will be usable with regular ReadFile etc., and thus +* suitable for use as stdin or stdout of a child process. Note that the +* sockets must be closed with closesocket() regardless. 
+* +* int dumb_socketpair(socket_t socks[2], int make_overlapped) +*/ +static inline int socketpair(int domain, int type, int protocol, + socket_t socks[2]) +{ + union { + struct sockaddr_in inaddr; + struct sockaddr addr; + } a; + socket_t listener; + int e; + socklen_t addrlen = sizeof(a.inaddr); +// DWORD flags = 0; /* was: (make_overlapped ? WSA_FLAG_OVERLAPPED : 0); */ + DWORD flags = WSA_FLAG_OVERLAPPED; + int reuse = 1; + + if (socks == 0) { + WSASetLastError(WSAEINVAL); + return SOCKET_ERROR; + } + + /* was: listener = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); */ + listener = socket(domain, type, protocol); + if (listener == INVALID_SOCKET) + return SOCKET_ERROR; + + memset(&a, 0, sizeof(a)); + a.inaddr.sin_family = domain; + a.inaddr.sin_addr.s_addr = htonl(INADDR_LOOPBACK); + a.inaddr.sin_port = 0; + + socks[0] = socks[1] = INVALID_SOCKET; + do { + if (setsockopt(listener, SOL_SOCKET, SO_REUSEADDR, + (char*)&reuse, (socklen_t) sizeof(reuse)) == -1) + break; + if (bind(listener, &a.addr, sizeof(a.inaddr)) == SOCKET_ERROR) + break; + if (getsockname(listener, &a.addr, &addrlen) == SOCKET_ERROR) + break; + if (listen(listener, 1) == SOCKET_ERROR) + break; + /* was: socks[0] = WSASocket(domain, type, 0, NULL, 0, flags);*/ + socks[0] = WSASocket(domain, type, protocol, NULL, 0, flags); + if (socks[0] == INVALID_SOCKET) + break; + if (connect(socks[0], &a.addr, sizeof(a.inaddr)) == SOCKET_ERROR) + break; + socks[1] = accept(listener, NULL, NULL); + if (socks[1] == INVALID_SOCKET) + break; + + xio_closesocket(listener); + return 0; + + } while (0); + + e = WSAGetLastError(); + xio_closesocket(listener); + xio_closesocket(socks[0]); + xio_closesocket(socks[1]); + WSASetLastError(e); + return SOCKET_ERROR; +} + +/*---------------------------------------------------------------------------*/ +/* enables or disables the blocking mode for the socket + If mode != 0, blocking is enabled; + If mode = 0, non-blocking mode is enabled. +-----------------------------------------------------------------------------*/ +static inline int xio_set_blocking(socket_t sock, unsigned long mode) +{ + int result; + mode = !mode; + result = ioctlsocket(sock, FIONBIO, &mode); + return result == NO_ERROR ? 
0 : -1; +} + +/*---------------------------------------------------------------------------*/ +static inline int xio_pipe(socket_t socks[2], int is_blocking) +{ + int ret = socketpair(AF_INET, SOCK_STREAM, IPPROTO_TCP, socks); + if (ret) return -1; + if (!is_blocking) + if (xio_set_blocking(socks[0],0)||xio_set_blocking(socks[1],0)){ + xio_closesocket(socks[0]); + xio_closesocket(socks[1]); + return -1; + } + return 0; +} + +/*---------------------------------------------------------------------------*/ +static inline socket_t xio_socket_non_blocking(int domain, int type, + int protocol) +{ + socket_t sock_fd; + sock_fd = socket(domain, type, protocol); + if (sock_fd < 0) { + return sock_fd; + } + + if (xio_set_blocking(sock_fd, 0) < 0) { + xio_closesocket(sock_fd); + return -1; + } + return sock_fd; +} + +/*---------------------------------------------------------------------------*/ +static inline socket_t xio_accept_non_blocking(int sockfd, + struct sockaddr *addr, + socklen_t *addrlen) { + socket_t new_sock_fd; + new_sock_fd = accept(sockfd, addr, addrlen); + if (new_sock_fd < 0) { + return new_sock_fd; + } + + if (xio_set_blocking(new_sock_fd, 0) < 0) { + xio_closesocket(new_sock_fd); + return -1; + } + return new_sock_fd; + +} + + +struct iovec { /* Scatter/gather array items */ + void *iov_base; /* Starting address */ + size_t iov_len; /* Number of bytes to transfer */ +}; + +struct msghdr { + void *msg_name; /* optional address */ + socklen_t msg_namelen; /* size of address */ + struct iovec *msg_iov; /* scatter/gather array */ + size_t msg_iovlen; /* # elements in msg_iov */ + void *msg_control; /* ancillary data, see below */ + size_t msg_controllen; /* ancillary data buffer len */ + int msg_flags; /* flags on received message */ +}; + +/*---------------------------------------------------------------------------*/ +#define IOV_MAX 50 /* avner temp - TODO: consider the value */ + +/*---------------------------------------------------------------------------*/ +static inline ssize_t MIN(ssize_t x, ssize_t y) { return x < y ? 
x : y; } + +/*---------------------------------------------------------------------------*/ +ssize_t inline recvmsg(int sd, struct msghdr *msg, int flags) +{ + ssize_t bytes_read; + size_t expected_recv_size; + ssize_t left2move; + char *tmp_buf; + char *tmp; + unsigned int i; + + assert(msg->msg_iov); + + expected_recv_size = 0; + for (i = 0; i < msg->msg_iovlen; i++) + expected_recv_size += msg->msg_iov[i].iov_len; + tmp_buf = (char*)malloc(expected_recv_size); + if (!tmp_buf) + return -1; + + left2move = bytes_read = recvfrom(sd, + tmp_buf, + expected_recv_size, + flags, + (struct sockaddr *)msg->msg_name, + &msg->msg_namelen + ); + + for (tmp = tmp_buf, i = 0; i < msg->msg_iovlen; i++) + { + if (left2move <= 0) break; + assert(msg->msg_iov[i].iov_base); + memcpy( + msg->msg_iov[i].iov_base, + tmp, + MIN(msg->msg_iov[i].iov_len, left2move) + ); + left2move -= msg->msg_iov[i].iov_len; + tmp += msg->msg_iov[i].iov_len; + } + + free(tmp_buf); + + return bytes_read; +} + +/*---------------------------------------------------------------------------*/ +ssize_t inline sendmsg(int sd, struct msghdr *msg, int flags) +{ + ssize_t bytes_send; + size_t expected_send_size; + size_t left2move; + char *tmp_buf; + char *tmp; + unsigned int i; + + assert(msg->msg_iov); + + expected_send_size = 0; + for (i = 0; i < msg->msg_iovlen; i++) + expected_send_size += msg->msg_iov[i].iov_len; + tmp_buf = (char*)malloc(expected_send_size); + if (!tmp_buf) + return -1; + + for (tmp = tmp_buf, left2move = expected_send_size, i = 0; i < + msg->msg_iovlen; i++) + { + if (left2move <= 0) break; + assert(msg->msg_iov[i].iov_base); + memcpy( + tmp, + msg->msg_iov[i].iov_base, + MIN(msg->msg_iov[i].iov_len, left2move)); + left2move -= msg->msg_iov[i].iov_len; + tmp += msg->msg_iov[i].iov_len; + } + + bytes_send = sendto(sd, + tmp_buf, + expected_send_size, + flags, + (struct sockaddr *)msg->msg_name, + msg->msg_namelen + ); + + free(tmp_buf); + + return bytes_send; +} + +/*---------------------------------------------------------------------------*/ +/*-------------------- IO & miscelenious things -----------------------------*/ +/*---------------------------------------------------------------------------*/ + +/*---------------------------------------------------------------------------*/ +static inline void xio_env_cleanup() { + WSACleanup(); +} + +/*---------------------------------------------------------------------------*/ +static inline void xio_env_startup() { + WSADATA wsaData; + /* IMPORTANT: Don't call WSAStartup from DllMain because according to + documentation it can lead to deadlock */ + if (WSAStartup(MAKEWORD(2, 2), &wsaData)) + { + fprintf(stderr, "FATAL ERROR: WSAStartup has failed\n"); + abort(); + } +} + +/*---------------------------------------------------------------------------*/ +static inline char * +strndup(char const *s, size_t n) +{ + size_t len = strnlen(s, n); + char *new1 = (char*)malloc(len + 1); + + if (new1 == NULL) + return NULL; + + new1[len] = '\0'; + return (char*)memcpy(new1, s, len); +} + +/*---------------------------------------------------------------------------*/ +/* based on: +http://stackoverflow.com/questions/2915672/snprintf-and-visual-studio-2010 */ + +#define snprintf c99_snprintf + +/*---------------------------------------------------------------------------*/ +inline int c99_vsnprintf(char* str, size_t size, const char* format, va_list ap) +{ + int count = -1; + + if (size != 0) + count = _vsnprintf_s(str, size, _TRUNCATE, format, ap); + if (count == -1) + count = 
_vscprintf(format, ap); + + return count; +} + +/*---------------------------------------------------------------------------*/ +inline int c99_snprintf(char* str, size_t size, const char* format, ...){ + int count; + va_list ap; + + va_start(ap, format); + count = c99_vsnprintf(str, size, format, ap); + va_end(ap); + + return count; +} + +/*---------------------------------------------------------------------------*/ +static inline int close(int fd) +{ + return _close(fd); +} + +#define ___GFP_WAIT 0x10u +#define ___GFP_IO 0x40u +#define ___GFP_FS 0x80u + +#define GFP_KERNEL (___GFP_WAIT | ___GFP_IO | ___GFP_FS) + +/* should be __bitwise__ but it is dummy */ +typedef unsigned gfp_t; + + static inline char *kstrdup(const char *s, gfp_t gfp) +{ + /* Make sure code transferred to kernel will work as expected */ + assert(gfp == GFP_KERNEL); + return strdup(s); +} + +static inline char *kstrndup(const char *s, size_t len, gfp_t gfp) +{ + /* Make sure code transferred to kernel will work as expected */ + assert(gfp == GFP_KERNEL); + return strndup(s, len); +} + + +/*---------------------------------------------------------------------------*/ +/* ****** this section is devoted to features not yet supported in Windows ****** */ +/*---------------------------------------------------------------------------*/ + +static inline int xio_timerfd_create() +{ + return (int)xio_socket_non_blocking(AF_INET, SOCK_STREAM, IPPROTO_TCP); +} + +static inline int xio_timerfd_settime(int fd, int flags, + const struct itimerspec *new_value, + struct itimerspec *old_value) +{ + return 0; +} + +static inline int xio_netlink(struct xio_context *ctx) +{ + /* not supported in Windows */ + return 0; +} + +/* + * Determine whether some value is a power of two, where zero is + * *not* considered a power of two. + */ + +static inline const bool is_power_of_2(unsigned long n) +{ + return (n != 0 && ((n & (n - 1)) == 0)); +} + +#ifdef __cplusplus +} +#endif + +#endif /* XIO_ENV_H */ diff --git a/open_src/xio/src/libxio_os/winapp/xio_env_adv.h b/open_src/xio/src/libxio_os/winapp/xio_env_adv.h new file mode 100644 index 0000000..519f416 --- /dev/null +++ b/open_src/xio/src/libxio_os/winapp/xio_env_adv.h @@ -0,0 +1,95 @@ +#ifndef __ADV_ENV_H_ +#define __ADV_ENV_H_ +#include +static inline void kfree(const void *ptr) +{ + ufree((void *) ptr); +} + +static inline void *kmalloc(size_t size, gfp_t flags) +{ + /* Make sure code transferred to kernel will work as expected */ + assert(flags == GFP_KERNEL); + return umalloc(size); +} + +/** + * kcalloc - allocate memory for an array. The memory is set to zero. + * @n: number of elements. + * @size: element size. + * @flags: the type of memory to allocate (see kmalloc). + */ +static inline void *kcalloc(size_t n, size_t size, gfp_t flags) +{ + /* Make sure code transferred to kernel will work as expected */ + assert(flags == GFP_KERNEL); + return ucalloc(n, size); +} + +static inline void *vmalloc(unsigned long size) +{ + return umalloc(size); +} + +static inline void *vzalloc(unsigned long size) +{ + return ucalloc(1, size); +} + +static inline void vfree(const void *addr) +{ + ufree((void *) addr); +} + +/* + * These inlines deal with timer wrapping correctly. You are + * strongly encouraged to use them + * 1. Because people otherwise forget + * 2. Because if the timer wrap changes in future you won't have to + * alter your driver code. + * + * time_after(a,b) returns true if the time a is after time b. + * + * Do this with "<0" and ">=0" to only test the sign of the result.
A + * good compiler would generate better code (and a really good compiler + * wouldn't care). Gcc is currently neither. + */ +#define time_after(a,b) \ + ((long)((b) - (a)) < 0) +#define time_before(a,b) time_after(b,a) + +#define time_after_eq(a,b) \ + ((long)((a) - (b)) >= 0) +#define time_before_eq(a,b) time_after_eq(b,a) + +/* + * Calculate whether a is in the range of [b, c]. + */ +#define time_in_range(a,b,c) \ + (time_after_eq(a,b) && \ + time_before_eq(a,c)) + +/* + * Calculate whether a is in the range of [b, c). + */ +#define time_in_range_open(a,b,c) \ + (time_after_eq(a,b) && \ + time_before(a,c)) + +/* Same as above, but does so with platform independent 64bit types. + * These must be used when utilizing jiffies_64 (i.e. return value of + * get_jiffies_64() */ +#define time_after64(a,b) \ + ((__s64)((b) - (a)) < 0) +#define time_before64(a,b) time_after64(b,a) + +#define time_after_eq64(a,b) \ + ((__s64)((a) - (b)) >= 0) +#define time_before_eq64(a,b) time_after_eq64(b,a) + +#define time_in_range64(a, b, c) \ + (time_after_eq64(a, b) && \ + time_before_eq64(a, c)) + + +#endif /* __ADV_ENV_H_ */ diff --git a/open_src/xio/src/libxio_os/winapp/xio_env_basic.h b/open_src/xio/src/libxio_os/winapp/xio_env_basic.h new file mode 100644 index 0000000..4102d4f --- /dev/null +++ b/open_src/xio/src/libxio_os/winapp/xio_env_basic.h @@ -0,0 +1,124 @@ +#ifndef BASIC_ENV_H +#define BASIC_ENV_H + +#include /* will bring offsetof */ + +#define inline __inline + +#define PTHREAD_MUTEX_INITIALIZER {(PRTL_CRITICAL_SECTION_DEBUG)-1,-1,0,0,0,0} +typedef struct pthread_mutexattr{ int a; } pthread_mutexattr_t; +typedef CRITICAL_SECTION pthread_mutex_t; +static int pthread_mutex_lock(pthread_mutex_t *m) +{ + EnterCriticalSection(m); + return 0; +} + +static int pthread_mutex_unlock(pthread_mutex_t *m) +{ + LeaveCriticalSection(m); + return 0; +} + +static int pthread_mutex_trylock(pthread_mutex_t *m) +{ + return TryEnterCriticalSection(m) ? 0 : EBUSY; +} + +static int pthread_mutex_init(pthread_mutex_t *m, pthread_mutexattr_t *a) +{ + (void)a; + InitializeCriticalSection(m); + + return 0; +} + +static int pthread_mutex_destroy(pthread_mutex_t *m) +{ + DeleteCriticalSection(m); + return 0; +} + + +/*---------------------------------------------------------------------------*/ +/* defines */ +/*---------------------------------------------------------------------------*/ +#ifndef min +#define min(a, b) (((a) < (b)) ? (a) : (b)) +#endif + +#ifndef max +#define max(a, b) (((a) < (b)) ? (b) : (a)) +#endif + +#define likely(x) __builtin_expect(!!(x), 1) +#define unlikely(x) __builtin_expect(!!(x), 0) +/* +#define __ALIGN_XIO_MASK(x, mask) (((x) + (mask)) & ~(mask)) +#define __ALIGN_XIO(x, a) __ALIGN_XIO_MASK(x, (typeof(x))(a)-1) +#define ALIGN(x, a) __ALIGN_XIO((x), (a)) +//*/ +//AVNER - TODO: check! +#define ALIGN(_n, _alignment) (((_n)+(_alignment)-1) & ~((_alignment)-1)) + +#ifndef roundup +# define roundup(x, y) ((((x) + ((y) - 1)) / (y)) * (y)) +#endif /* !defined(roundup) */ + + +extern const char hex_asc[]; +#define hex_asc_lo(x) hex_asc[((x) & 0x0f)] +#define hex_asc_hi(x) hex_asc[((x) & 0xf0) >> 4] + + +/** + * container_of - cast a member of a structure out to the containing structure + * @ptr: the pointer to the member. + * @type: the type of the container struct this is embedded in. + * @member: the name of the member within the struct. 
+ * + */ +#ifndef container_of + #ifndef _MSC_VER + #define container_of(ptr, type, member) ({ \ + const typeof(((type *)0)->member) * __mptr = (ptr); \ + (type *)((char *)__mptr - offsetof(type, member)); }) + #else + #define container_of(ptr, type, member) \ + ((type *)((char *)(ptr)-(unsigned long)(&((type *)0L)->member))) + #endif +#endif + +#define __MUTEX_INITIALIZER(lockname) \ + { \ + PTHREAD_MUTEX_INITIALIZER \ + } \ + +#define DEFINE_MUTEX(mutexname) \ + struct mutex mutexname = __MUTEX_INITIALIZER(mutexname) + +struct mutex { + pthread_mutex_t lock; +}; + +static inline void mutex_init(struct mutex *mtx) +{ + pthread_mutex_init(&mtx->lock, NULL); +} + +static inline void mutex_destroy(struct mutex *mtx) +{ + pthread_mutex_destroy(&mtx->lock); +} + +static inline void mutex_lock(struct mutex *mtx) +{ + pthread_mutex_lock(&mtx->lock); +} + +static inline void mutex_unlock(struct mutex *mtx) +{ + pthread_mutex_unlock(&mtx->lock); +} + +#endif /* BASIC_ENV_H */ diff --git a/open_src/xio/src/libxio_os/winapp/xio_os.h b/open_src/xio/src/libxio_os/winapp/xio_os.h new file mode 100644 index 0000000..3d1d108 --- /dev/null +++ b/open_src/xio/src/libxio_os/winapp/xio_os.h @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ +#ifndef WIN_XIO_OS_H +#define WIN_XIO_OS_H + + +#include + +//* + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include + + + +#include +#include +#include + + +#include +#include +#include +#include +#include +#include +//#include + +#include "get_clock.h" + +#include "spinlock.h" + + +//*/ + +//*/ +#endif /* WIN_XIO_OS_H */ diff --git a/open_src/xio/src/tools/usr/Makefile.am b/open_src/xio/src/tools/usr/Makefile.am new file mode 100644 index 0000000..51c2b9e --- /dev/null +++ b/open_src/xio/src/tools/usr/Makefile.am @@ -0,0 +1,28 @@ + +# additional include pathes necessary to compile the C programs +AM_CFLAGS = -I$(top_srcdir)/src/libxio_os/linuxapp \ + -I$(top_srcdir)/include @AM_CFLAGS@ \ + -I$(top_srcdir)/src/common \ + -I$(top_srcdir)/src/usr \ + -I$(top_srcdir)/src/usr/transport \ + -I$(top_srcdir)/src/usr/transport/rdma \ + -I$(top_srcdir)/src/usr/transport/tcp \ + -I$(top_srcdir)/src/usr/xio + +AM_LDFLAGS = -L$(top_builddir)/src/usr/ + +############################################################################### +# THE PROGRAMS TO BUILD +############################################################################### + +# the program to build (the names of the final binaries) +bin_PROGRAMS = xio_mem_usage \ + xio_if_numa_cpus + +# list of sources for the 'xio_mem_usage' binary +xio_mem_usage_SOURCES = xio_mem_usage.c + +xio_if_numa_cpus_SOURCES = xio_if_numa_cpus.c +xio_if_numa_cpus_LDFLAGS = -lnuma + +############################################################################### diff --git a/open_src/xio/src/tools/usr/xio_if_numa_cpus.c b/open_src/xio/src/tools/usr/xio_if_numa_cpus.c new file mode 100644 index 0000000..da8f1f0 --- /dev/null +++ b/open_src/xio/src/tools/usr/xio_if_numa_cpus.c @@ -0,0 +1,258 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#include +#include +#include +#include +#include +#include +#include +#include + +#define cpusmask_test_bit(nr, addr) (*(addr) & (1ULL << (nr))) +#define cpusmask_set_bit(nr, addr) (*(addr) |= (1ULL << (nr))) + +/*---------------------------------------------------------------------------*/ +/* intf_master_name */ +/*---------------------------------------------------------------------------*/ +static int intf_master_name(const char *iface, char *master) +{ + int fd, len; + char path[256]; + char buf[256]; + char *ptr; + + snprintf(path, 256, "/sys/class/net/%s/master", iface); + fd = open(path, O_RDONLY); + if (fd == -1) + return -1; + + len = read(fd, buf, sizeof(buf) - 1); + if (len < 0) { + len = readlink(path, buf, sizeof(buf) - 1); + if (len < 0) + goto cleanup; + } + buf[len] = '\0'; + ptr = strrchr(buf, '/'); + if (ptr) { + ptr++; + strcpy(buf, ptr); + } + strcpy(master, buf); +cleanup: + close(fd); + + return (len > 0) ? 0 : -1; +} + +/*---------------------------------------------------------------------------*/ +/* intf_numa_node */ +/*---------------------------------------------------------------------------*/ +static int intf_numa_node(const char *iface) +{ + int fd, numa_node = -1, len; + char buf[256]; + + snprintf(buf, 256, "/sys/class/net/%s/device/numa_node", iface); + fd = open(buf, O_RDONLY); + if (fd == -1) + return -1; + + len = read(fd, buf, sizeof(buf)); + if (len < 0) + goto cleanup; + + numa_node = strtol(buf, NULL, 0); + +cleanup: + close(fd); + + return numa_node; +} + +/*---------------------------------------------------------------------------*/ +/* numa_node_to_cpusmask */ +/*---------------------------------------------------------------------------*/ +static int numa_node_to_cpusmask(int node, uint64_t *cpusmask, int *nr) +{ + struct bitmask *mask; + uint64_t bmask = 0; + int retval = -1; + unsigned int i; + + mask = numa_allocate_cpumask(); + retval = numa_node_to_cpus(node, mask); + if (retval < 0) + goto cleanup; + + *nr = 0; + for (i = 0; i < mask->size && i < 64; i++) { + if (numa_bitmask_isbitset(mask, i)) { + cpusmask_set_bit(i, &bmask); + (*nr)++; + } + } + + retval = 0; +cleanup: + *cpusmask = bmask; + + numa_free_cpumask(mask); + return retval; +} + +/*---------------------------------------------------------------------------*/ +/* intf_name_best_cpus */ +/*---------------------------------------------------------------------------*/ +static int intf_name_best_cpus(const char *if_name, uint64_t *cpusmask, int *nr) +{ + int numa_node, retval; + + *cpusmask = 0; + numa_node = intf_numa_node(if_name); + if (numa_node < 0) + return -1; + + retval = numa_node_to_cpusmask(numa_node, cpusmask, nr); + + return retval; +} + +/*---------------------------------------------------------------------------*/ +/* intf_name_best_cpus */ +/*---------------------------------------------------------------------------*/ +static char *intf_cpusmask_str(uint64_t cpusmask, int nr, char *str) +{ + int len = 0, i, cpus; + + for (i 
= 0, cpus = 0; i < 64 && cpus < nr; i++) { + if (cpusmask_test_bit(i, &cpusmask)) { + len += sprintf(&str[len], "%d ", i); + cpus++; + } + } + return str; +} + +/*---------------------------------------------------------------------------*/ +/* main */ +/*---------------------------------------------------------------------------*/ +int main(int argc, char *argv[]) +{ + struct ifaddrs *ifaddr, *ifa; + char host[NI_MAXHOST] = {0}; + char cpus_str[256]; + char flags[1024]; + uint64_t cpusmask = 0; + int cpusnum; + int retval = -1; + int ec = EXIT_FAILURE; + int numa_node; + + if (getifaddrs(&ifaddr) == -1) { + perror("getifaddrs"); + goto cleanup; + } + printf("%-10s %-16s %-30s %-5s %-10s %-40s\n", + "interface", "host", "flags", "numa", "cpus mask", "cpus"); + printf("---------------------------------------------------"); + printf("-------------------------------------------------------\n"); + + for (ifa = ifaddr; ifa; ifa = ifa->ifa_next) { + switch (ifa->ifa_addr->sa_family) { + case AF_INET: + if (!(ifa->ifa_flags & IFF_UP)) + continue; + getnameinfo(ifa->ifa_addr, sizeof(struct sockaddr_in), + host, NI_MAXHOST, NULL, 0, NI_NUMERICHOST); + break; + case AF_PACKET: + if (ifa->ifa_flags & IFF_MASTER) + continue; + if (ifa->ifa_flags & IFF_SLAVE) + break; + if (!(ifa->ifa_flags & IFF_UP)) + break; + continue; + break; + default: + continue; + break; + } + flags[0] = 0; + if (ifa->ifa_flags & IFF_UP) + sprintf(flags, "%s %s", flags, "UP"); + else + sprintf(flags, "%s %s", flags, "DOWN"); + if (ifa->ifa_flags & IFF_LOOPBACK) + sprintf(flags, "%s %s", flags, "LOOPBACK"); + if (ifa->ifa_flags & IFF_RUNNING) + sprintf(flags, "%s %s", flags, "RUNNING"); + if (ifa->ifa_flags & IFF_SLAVE) { + char master[256]; + + intf_master_name(ifa->ifa_name, master); + sprintf(flags, "%s %s - [%s]", flags, "SLAVE", master); + } + if (ifa->ifa_flags & IFF_MASTER) + sprintf(flags, "%s %s", flags, "MASTER"); + + numa_node = intf_numa_node(ifa->ifa_name); + retval = intf_name_best_cpus(ifa->ifa_name, + &cpusmask, &cpusnum); + if (retval != 0) { + /*perror("intf_name_best_cpus"); */ + printf("%-10s %-16s %-30s %-5c 0x%-8lx %-4s[0]\n", + ifa->ifa_name, host, flags, 0x20, 0UL, "cpus"); + continue; + } + intf_cpusmask_str(cpusmask, cpusnum, cpus_str); + + printf("%-10s %-16s %-30s %-5d 0x%-8lx %-4s[%d] - %s\n", + ifa->ifa_name, host, flags, numa_node, cpusmask, + "cpus", cpusnum, cpus_str); + } + ec = EXIT_SUCCESS; + + freeifaddrs(ifaddr); + +cleanup: + exit(ec); +} + diff --git a/open_src/xio/src/tools/usr/xio_mem_usage.c b/open_src/xio/src/tools/usr/xio_mem_usage.c new file mode 100644 index 0000000..14b33ff --- /dev/null +++ b/open_src/xio/src/tools/usr/xio_mem_usage.c @@ -0,0 +1,127 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#include +#include +#include +#include "xio_log.h" +#include "xio_common.h" +#include "xio_protocol.h" +#include "xio_mbuf.h" +#include "xio_task.h" +#include "xio_hash.h" +#include "xio_observer.h" +#include "xio_usr_transport.h" +#include "xio_transport.h" +#include "xio_msg_list.h" +#include "xio_ev_data.h" +#include "xio_objpool.h" +#include "xio_workqueue.h" +#include "xio_context.h" +#include "xio_nexus.h" +#include "xio_connection.h" +#include "xio_session.h" + +#ifdef HAVE_INFINIBAND_VERBS_H +#include +#include +#include "xio_rdma_transport.h" +#endif +#include "xio_usr_transport.h" +#include "xio_mempool.h" +#include "xio_tcp_transport.h" + +#define PRINT_SIZE(type) \ +{ \ + int i; \ + \ + printf(" sizeof(%s)%n = ", #type, &i); \ + while (i++ < 48) { \ + printf("."); \ + } \ + printf(" %6lu\n", sizeof(type)); \ +} + +int main(int argc, char **argv) +{ + printf("\nAPI and Core:\n"); + PRINT_SIZE(struct xio_context); + PRINT_SIZE(struct xio_connection); + PRINT_SIZE(struct xio_session); + PRINT_SIZE(struct xio_msg); + PRINT_SIZE(struct xio_mr); + PRINT_SIZE(struct xio_task); + PRINT_SIZE(struct xio_nexus); + + printf("\nProtocol layer:\n"); + PRINT_SIZE(struct xio_sge); + PRINT_SIZE(struct xio_tlv); + PRINT_SIZE(struct xio_session_hdr); + PRINT_SIZE(struct xio_session_cancel_hdr); + PRINT_SIZE(struct xio_nexus_setup_req); + PRINT_SIZE(struct xio_nexus_setup_rsp); + +#ifdef HAVE_INFINIBAND_VERBS_H + printf("\nRDMA Transport:\n"); + PRINT_SIZE(struct xio_rdma_setup_msg); + PRINT_SIZE(struct xio_rdma_cancel_hdr); + PRINT_SIZE(struct xio_rdma_req_hdr); + PRINT_SIZE(struct xio_rdma_rsp_hdr); + PRINT_SIZE(struct xio_nop_hdr); + PRINT_SIZE(struct xio_rdma_task); + PRINT_SIZE(struct xio_cq); + PRINT_SIZE(struct xio_device); + PRINT_SIZE(struct xio_rdma_transport); + PRINT_SIZE(struct xio_cm_channel); + PRINT_SIZE(struct xio_work_req); +#endif + + printf("\nTCP Transport:\n"); + PRINT_SIZE(struct xio_tcp_connect_msg); + PRINT_SIZE(struct xio_tcp_pending_conn); + PRINT_SIZE(struct xio_tcp_setup_msg); + PRINT_SIZE(struct xio_tcp_cancel_hdr); + PRINT_SIZE(struct xio_tcp_req_hdr); + PRINT_SIZE(struct xio_tcp_rsp_hdr); + PRINT_SIZE(struct xio_tcp_task); + PRINT_SIZE(struct xio_tcp_transport); + PRINT_SIZE(struct xio_tcp_work_req); + + printf("\n"); + return 0; +} + diff 
--git a/open_src/xio/src/usr/Makefile.am b/open_src/xio/src/usr/Makefile.am new file mode 100644 index 0000000..96620a6 --- /dev/null +++ b/open_src/xio/src/usr/Makefile.am @@ -0,0 +1,163 @@ +# this is example-file: src/Makefile.am + +# additional include paths necessary to compile the C library + +if HAVE_INFINIBAND_VERBS + libxio_rdma_srcdir = -I$(top_srcdir)/src/usr/transport/rdma + libxio_rdma_headers = ./transport/rdma/xio_rdma_transport.h \ + ./transport/rdma/xio_rdma_utils.h + libxio_rdma_sources = ./transport/rdma/xio_rdma_utils.c \ + ./transport/rdma/xio_rdma_verbs.c \ + ./transport/rdma/xio_rdma_management.c \ + ./transport/rdma/xio_rdma_datapath.c + libxio_rdma_ldflags = -lrdmacm -libverbs +else + libxio_rdma_srcdir = + libxio_rdma_headers = + libxio_rdma_sources = + libxio_rdma_ldflags = +endif + +AM_CFLAGS = -fPIC -DPIC \ + -I$(top_srcdir)/src/libxio_os/linuxapp \ + -I$(top_srcdir)/src/usr \ + -I$(top_srcdir)/src/usr/xio \ + -I$(top_srcdir)/src/usr/transport \ + $(libxio_rdma_srcdir) \ + -I$(top_srcdir)/src/usr/transport/tcp \ + -I$(top_srcdir)/src/common \ + -I$(top_srcdir)/include \ + @AM_CFLAGS@ + +if HAVE_LD_VERSION_SCRIPT + libxio_version_script = -Wl,--version-script=$(top_srcdir)/src/usr/libxio.map +else + libxio_version_script = +endif + +install-exec-hook: + @runner=`whoami` ; \ + if test $$runner != "root" ; \ + then \ + echo "You are not root. run the following manualy:"; \ + echo "echo \"$(libdir)\" > /etc/ld.so.conf.d/libxio.conf"; \ + echo "/sbin/ldconfig"; \ + else \ + echo "You are root" ; \ + echo "$(libdir)" > $(DESTDIR)/etc/ld.so.conf.d/libxio.conf ; \ + /sbin/ldconfig; \ + fi + +############################################################################### +# THE LIBRARIES TO BUILD +############################################################################### + +# the library names to build (note we are building static libs only) +lib_LTLIBRARIES = libxio.la + +# where to install the headers on the system +libxio_includedir = $(includedir)/ + +# the list of header files that belong to the library (to be installed later) +libxio_include_HEADERS = $(top_srcdir)/include/libxio.h \ + $(top_srcdir)/include/xio_base.h \ + $(top_srcdir)/include/xio_user.h \ + $(top_srcdir)/include/xio_predefs.h + + +libxio_headers = ./xio/get_clock.h \ + ./xio/xio_log.h \ + ./xio/xio_mem.h \ + ./xio/xio_os.h \ + ./xio/xio_tls.h \ + ./xio/xio_timers_list.h \ + ./xio/xio_ev_loop.h \ + ./transport/xio_mempool.h \ + ./transport/xio_usr_transport.h \ + $(libxio_rdma_headers) \ + ./transport/tcp/xio_tcp_transport.h \ + ../common/xio_workqueue.h \ + ../common/xio_workqueue_priv.h \ + ../common/xio_common.h \ + ../common/xio_connection.h \ + ../common/xio_nexus.h \ + ../common/xio_nexus_cache.h \ + ../common/xio_context.h \ + ../common/xio_hash.h \ + ../common/xio_mbuf.h \ + ../common/xio_msg_list.h \ + ../common/xio_protocol.h \ + ../common/xio_server.h \ + ../common/xio_session.h \ + ../common/xio_session_priv.h \ + ../common/xio_sessions_cache.h \ + ../common/xio_idr.h \ + ../common/xio_observer.h \ + ../common/xio_task.h \ + ../common/xio_sg_table.h \ + ../common/xio_objpool.h \ + ../common/xio_transport.h \ + ../common/sys/hashtable.h \ + ./linux/atomic.h \ + ./linux/kernel.h \ + ./linux/kref.h \ + ./linux/list.h \ + ./linux/printk.h \ + ./linux/slab.h \ + ./linux/usr.h + + + + +# the sources to add to the library and to add to the source distribution +libxio_la_SOURCES = $(libxio_headers) \ + ../../version.c \ + ./xio/xio_init.c \ + ./xio/get_clock.c \ + ./xio/xio_ev_loop.c \ + 
./xio/xio_log.c \ + ./xio/xio_mem.c \ + ./xio/xio_task.c \ + ./xio/xio_usr_utils.c \ + ./xio/xio_tls.c \ + ./xio/xio_context.c \ + ./xio/xio_netlink.c \ + ./xio/xio_workqueue.c \ + ./xio/xio_sg_iov.c \ + ./xio/xio_sg_iovptr.c \ + ./xio/xio_sg_table.c \ + $(libxio_rdma_sources) \ + ./transport/tcp/xio_tcp_management.c \ + ./transport/tcp/xio_tcp_datapath.c \ + ./transport/xio_mempool.c \ + ./transport/xio_usr_transport.c \ + ../common/xio_objpool.c \ + ../common/xio_options.c \ + ../common/xio_error.c \ + ../common/xio_utils.c \ + ../common/xio_server.c \ + ../common/xio_session.c \ + ../common/xio_session_server.c \ + ../common/xio_session_client.c \ + ../common/xio_sessions_cache.c \ + ../common/xio_observer.c \ + ../common/xio_nexus.c \ + ../common/xio_nexus_cache.c \ + ../common/xio_idr.c \ + ../common/xio_transport.c \ + ../common/xio_connection.c + +#libxio_la_LDFLAGS = -shared -rdynamic \ +# -lrdmacm -libverbs -lrt -ldl + +libxio_la_LDFLAGS = -lnuma $(libxio_rdma_ldflags) -ldl -lrt -lpthread \ + $(libxio_version_script) + +libxio_la_DEPENDENCIES = $(top_srcdir)/src/usr/libxio.map + +#libxio_la_LIBADD = $(AM_LDFLAGS) + +#EXTRA_DIST = libxio.map \ +# libxio.spec.in + +############################################################################### diff --git a/open_src/xio/src/usr/libxio.map b/open_src/xio/src/usr/libxio.map new file mode 100644 index 0000000..3288338 --- /dev/null +++ b/open_src/xio/src/usr/libxio.map @@ -0,0 +1,63 @@ +XIO_1.0 { + global: + xio_release_response; + xio_send_response; + xio_send_request; + xio_send_msg; + xio_send_rdma; + xio_cancel_request; + xio_cancel; + xio_release_msg; + xio_set_opt; + xio_get_opt; + xio_errno; + xio_strerror; + xio_version; + xio_mem_alloc; + xio_mem_free; + xio_mem_register; + xio_mem_dereg; + xio_lookup_rkey_by_request; + xio_lookup_rkey_by_response; + xio_register_remote_rkey; + xio_unregister_remote_key; + xio_managed_rkey_unwrap; + xio_init; + xio_shutdown; + xio_context_create; + xio_context_destroy; + xio_context_add_ev_handler; + xio_context_modify_ev_handler; + xio_context_del_ev_handler; + xio_context_run_loop; + xio_context_stop_loop; + xio_context_poll_wait; + xio_context_poll_completions; + xio_modify_context; + xio_query_context; + xio_context_get_poll_fd; + xio_session_event_str; + xio_session_create; + xio_session_destroy; + xio_query_session; + xio_modify_session; + xio_connect; + xio_disconnect; + xio_connection_destroy; + xio_modify_connection; + xio_query_connection; + xio_accept; + xio_redirect; + xio_reject; + xio_bind; + xio_unbind; + xio_mempool_create; + xio_mempool_create_ex; + xio_mempool_add_slab; + xio_mempool_destroy; + xio_mempool_alloc; + xio_mempool_free; + xio_connection_ioctl; + + local: *; +}; diff --git a/open_src/xio/src/usr/linux/atomic.h b/open_src/xio/src/usr/linux/atomic.h new file mode 100644 index 0000000..afc78b2 --- /dev/null +++ b/open_src/xio/src/usr/linux/atomic.h @@ -0,0 +1,197 @@ +/* + * Dummy C implementation of atomic counter operations. Usable on + * UP systems only. Do not include in machine independent code. + * + * Originally implemented for MN10300. + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. 
+ */ +#ifndef __DUMMY_ATOMIC_H +#define __DUMMY_ATOMIC_H + +/* should be in */ +typedef struct { + int counter; +} atomic_t; + +/* + * Atomic operations that C can't guarantee us. Useful for + * resource counting etc.. + */ + +#define ATOMIC_INIT(i) { (i) } + +/** + * atomic_read - read atomic variable + * @v: pointer of type atomic_t + * + * Atomically reads the value of @v. + */ +#ifndef atomic_read +#define atomic_read(v) (*(volatile int *)&(v)->counter) +#endif + +/** + * atomic_set - set atomic variable + * @v: pointer of type atomic_t + * @i: required value + * + * Atomically sets the value of @v to @i. + */ +#define atomic_set(v, i) (((v)->counter) = (i)) + +/** + * atomic_add_return - add integer to atomic variable + * @i: integer value to add + * @v: pointer of type atomic_t + * + * Atomically adds @i to @v and returns the result + */ +#ifndef atomic_add_return +static inline int atomic_add_return(int i, atomic_t *v) +{ + int temp; + + temp = v->counter; + temp += i; + v->counter = temp; + + return temp; +} +#endif + +/** + * atomic_sub_return - subtract integer from atomic variable + * @i: integer value to subtract + * @v: pointer of type atomic_t + * + * Atomically subtracts @i from @v and returns the result + */ +#ifndef atomic_sub_return +static inline int atomic_sub_return(int i, atomic_t *v) +{ + int temp; + + temp = v->counter; + temp -= i; + v->counter = temp; + + return temp; +} +#endif + +static inline int atomic_add_negative(int i, atomic_t *v) +{ + return atomic_add_return(i, v) < 0; +} + +static inline void atomic_add(int i, atomic_t *v) +{ + atomic_add_return(i, v); +} + +static inline void atomic_sub(int i, atomic_t *v) +{ + atomic_sub_return(i, v); +} + +static inline void atomic_inc(atomic_t *v) +{ + atomic_add_return(1, v); +} + +static inline void atomic_dec(atomic_t *v) +{ + atomic_sub_return(1, v); +} + +#define atomic_dec_return(v) atomic_sub_return(1, (v)) +#define atomic_inc_return(v) atomic_add_return(1, (v)) + +#define atomic_sub_and_test(i, v) (atomic_sub_return((i), (v)) == 0) +#define atomic_dec_and_test(v) (atomic_dec_return(v) == 0) +#define atomic_inc_and_test(v) (atomic_inc_return(v) == 0) + +/** + * atomic_clear_mask - Atomically clear bits in atomic variable + * @mask: Mask of the bits to be cleared + * @v: pointer of type atomic_t + * + * Atomically clears the bits set in @mask from @v + */ +#ifndef atomic_clear_mask +static inline void atomic_clear_mask(unsigned long mask, atomic_t *v) +{ + mask = ~mask; + v->counter &= mask; +} +#endif + +/** + * atomic_set_mask - Atomically set bits in atomic variable + * @mask: Mask of the bits to be set + * @v: pointer of type atomic_t + * + * Atomically sets the bits set in @mask in @v + */ +#ifndef atomic_set_mask +static inline void atomic_set_mask(unsigned int mask, atomic_t *v) +{ + v->counter |= mask; +} +#endif + +/** + * Dummy version of cmpxchg. + */ + +static inline int atomic_cmpxchg(atomic_t *v, int old, int _new) +{ + int prev = v->counter; + + if (prev == old) + v->counter = _new; + + return prev; +} + +/** + * atomic_add_unless - add unless the number is already a given value + * @v: pointer of type atomic_t + * @a: the amount to add to v... + * @u: ...unless v is equal to u. + * + * Atomically adds @a to @v, so long as @v was not already @u. + * Returns non-zero if @v was not @u, and zero otherwise. 
+ */ + +static inline int __atomic_add_unless(atomic_t *v, int a, int u) +{ + int c, old; + c = atomic_read(v); + while (c != u && (old = atomic_cmpxchg(v, c, c + a)) != c) + c = old; + return c; +} + +/** + * atomic_add_unless - add unless the number is already a given value + * @v: pointer of type atomic_t + * @a: the amount to add to v... + * @u: ...unless v is equal to u. + * + * Atomically adds @a to @v, so long as @v was not already @u. + * Returns non-zero if @v was not @u, and zero otherwise. + */ +static inline int atomic_add_unless(atomic_t *v, int a, int u) +{ + return __atomic_add_unless(v, a, u) != u; +} + +#endif /* __DUMMY_ATOMIC_H */ diff --git a/open_src/xio/src/usr/linux/bitops.h b/open_src/xio/src/usr/linux/bitops.h new file mode 100644 index 0000000..6612f52 --- /dev/null +++ b/open_src/xio/src/usr/linux/bitops.h @@ -0,0 +1,13 @@ +#ifndef _LINUX_BITOPS_H +#define _LINUX_BITOPS_H + +#define BIT(nr) (1UL << (nr)) +#define BIT_ULL(nr) (1ULL << (nr)) +#define BIT_MASK(nr) (1UL << ((nr) % BITS_PER_LONG)) +#define BIT_WORD(nr) ((nr) / BITS_PER_LONG) +#define BIT_ULL_MASK(nr) (1ULL << ((nr) % BITS_PER_LONG_LONG)) +#define BIT_ULL_WORD(nr) ((nr) / BITS_PER_LONG_LONG) +#define BITS_PER_BYTE 8 +#define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_PER_BYTE * sizeof(long)) + +#endif diff --git a/open_src/xio/src/usr/linux/debugfs.h b/open_src/xio/src/usr/linux/debugfs.h new file mode 100644 index 0000000..cb6cfb1 --- /dev/null +++ b/open_src/xio/src/usr/linux/debugfs.h @@ -0,0 +1,6 @@ +#ifndef __KERNEL_DEBUGFS__ +#define __KERNEL_DEBUGFS__ + +struct dentry; + +#endif diff --git a/open_src/xio/src/usr/linux/jiffies.h b/open_src/xio/src/usr/linux/jiffies.h new file mode 100644 index 0000000..327f6e0 --- /dev/null +++ b/open_src/xio/src/usr/linux/jiffies.h @@ -0,0 +1,63 @@ +#ifndef _LINUX_JIFFIES_H +#define _LINUX_JIFFIES_H + +#include +#include +/* +#include +#include +#include +*/ + +/* + * These inlines deal with timer wrapping correctly. You are + * strongly encouraged to use them + * 1. Because people otherwise forget + * 2. Because if the timer wrap changes in future you won't have to + * alter your driver code. + * + * time_after(a,b) returns true if the time a is after time b. + * + * Do this with "<0" and ">=0" to only test the sign of the result. A + * good compiler would generate better code (and a really good compiler + * wouldn't care). Gcc is currently neither. + */ +#define time_after(a,b) \ + ((long)((b) - (a)) < 0) +#define time_before(a,b) time_after(b,a) + +#define time_after_eq(a,b) \ + ((long)((a) - (b)) >= 0) +#define time_before_eq(a,b) time_after_eq(b,a) + +/* + * Calculate whether a is in the range of [b, c]. + */ +#define time_in_range(a,b,c) \ + (time_after_eq(a,b) && \ + time_before_eq(a,c)) + +/* + * Calculate whether a is in the range of [b, c). + */ +#define time_in_range_open(a,b,c) \ + (time_after_eq(a,b) && \ + time_before(a,c)) + +/* Same as above, but does so with platform independent 64bit types. + * These must be used when utilizing jiffies_64 (i.e. 
return value of + * get_jiffies_64() */ +#define time_after64(a,b) \ + ((__s64)((b) - (a)) < 0) +#define time_before64(a,b) time_after64(b,a) + +#define time_after_eq64(a,b) \ + ((__s64)((a) - (b)) >= 0) +#define time_before_eq64(a,b) time_after_eq64(b,a) + +#define time_in_range64(a, b, c) \ + (time_after_eq64(a, b) && \ + time_before_eq64(a, c)) + + +#endif diff --git a/open_src/xio/src/usr/linux/kernel.h b/open_src/xio/src/usr/linux/kernel.h new file mode 100644 index 0000000..4eccaa9 --- /dev/null +++ b/open_src/xio/src/usr/linux/kernel.h @@ -0,0 +1,183 @@ +#ifndef _LINUX_KERNEL_H +#define _LINUX_KERNEL_H + +#include + +/*---------------------------------------------------------------------------*/ +/* defines */ +/*---------------------------------------------------------------------------*/ +#ifndef min +#define min(a, b) (((a) < (b)) ? (a) : (b)) +#endif + +#ifndef max +#define max(a, b) (((a) < (b)) ? (b) : (a)) +#endif + +#define likely(x) __builtin_expect(!!(x), 1) +#define unlikely(x) __builtin_expect(!!(x), 0) + +#define __ALIGN_XIO_MASK(x, mask) (((x) + (mask)) & ~(mask)) +#define __ALIGN_XIO(x, a) __ALIGN_XIO_MASK(x, (typeof(x))(a)-1) +#define ALIGN(x, a) __ALIGN_XIO((x), (a)) + +#ifndef roundup +# define roundup(x, y) ((((x) + ((y) - 1)) / (y)) * (y)) +#endif /* !defined(roundup) */ + +#ifndef offsetof +#ifdef __compiler_offsetof +#define offsetof(TYPE, MEMBER) __compiler_offsetof(TYPE, MEMBER) +#else +#define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER) +#endif +#endif + +extern const char hex_asc[]; +#define hex_asc_lo(x) hex_asc[((x) & 0x0f)] +#define hex_asc_hi(x) hex_asc[((x) & 0xf0) >> 4] + +#define preempt_enable() +#define preempt_disable() + +/** + * container_of - cast a member of a structure out to the containing structure + * @ptr: the pointer to the member. + * @type: the type of the container struct this is embedded in. + * @member: the name of the member within the struct. + * + */ +#ifndef container_of +#define container_of(ptr, type, member) ({ \ + const typeof(((type *)0)->member) * __mptr = (ptr); \ + (type *)((char *)__mptr - offsetof(type, member)); }) + +#endif + +#define __MUTEX_INITIALIZER(lockname) \ + { \ + .lock = PTHREAD_MUTEX_INITIALIZER \ + } \ + +#define DEFINE_MUTEX(mutexname) \ + struct mutex mutexname = __MUTEX_INITIALIZER(mutexname) + +struct mutex { + pthread_mutex_t lock; +}; + +static inline void mutex_init(struct mutex *mtx) +{ + pthread_mutex_init(&mtx->lock, NULL); +} + +static inline void mutex_destroy(struct mutex *mtx) +{ + pthread_mutex_destroy(&mtx->lock); +} + +static inline void mutex_lock(struct mutex *mtx) +{ + pthread_mutex_lock(&mtx->lock); +} + +static inline void mutex_unlock(struct mutex *mtx) +{ + pthread_mutex_unlock(&mtx->lock); +} + + +typedef volatile int spinlock_t; + +#define SPINLOCK_NG + +#ifndef SPINLOCK_NG + +/* + * https://idea.popcount.org/2012-09-12-reinventing-spinlocks/ + * + */ + +static inline void spin_lock_init(spinlock_t* spinlock) +{ + __sync_lock_release(spinlock); +} + +static inline void spin_lock(spinlock_t* spinlock) +{ + int i; + while (1) { + for (i = 0; i < 10000; i++) { + if (__sync_bool_compare_and_swap(spinlock, 0, 1)) { + return; + } + } + /* yield the cpu */ + sched_yield(); + } +} + +static inline int spin_try_lock(spinlock_t* spinlock) +{ + return __sync_bool_compare_and_swap(spinlock, 0, 1) ? 
1 : 0; +} + +static inline int spin_locked(spinlock_t* spinlock) +{ + __sync_synchronize(); + return *spinlock; +} + +static inline void spin_unlock(spinlock_t* spinlock) +{ + __sync_lock_release(spinlock); +} +#else +/*DPDK spin lock */ + +#include + +static inline void spin_lock_init(spinlock_t* spinlock) +{ + spinlock = 0; +} + +static inline void spin_lock(spinlock_t* spinlock) +{ + while (__sync_lock_test_and_set(spinlock, 1)) + while (*spinlock) + _mm_pause(); +} + +static inline int spin_try_lock(spinlock_t* spinlock) +{ + return (__sync_lock_test_and_set(spinlock, 1) == 0); +} + +static inline int spin_locked(spinlock_t* spinlock) +{ + return *spinlock; +} + +static inline void spin_unlock(spinlock_t* spinlock) +{ + __sync_lock_release(spinlock); +} + +#endif + +static inline char *kstrdup(const char *s, gfp_t gfp) +{ + /* Make sure code transfered to kernel will work as expected */ + assert(gfp == GFP_KERNEL); + return strdup(s); +} + +static inline char *kstrndup(const char *s, size_t len, gfp_t gfp) +{ + /* Make sure code transfered to kernel will work as expected */ + assert(gfp == GFP_KERNEL); + return strndup(s, len); +} + +#endif /* _LINUX_KERNEL_H */ diff --git a/open_src/xio/src/usr/linux/kref.h b/open_src/xio/src/usr/linux/kref.h new file mode 100644 index 0000000..cb11f60 --- /dev/null +++ b/open_src/xio/src/usr/linux/kref.h @@ -0,0 +1,124 @@ +/* + * kref.h - library routines for handling generic reference counted objects + * + * Copyright (C) 2004 Greg Kroah-Hartman + * Copyright (C) 2004 IBM Corp. + * + * based on kobject.h which was: + * Copyright (C) 2002-2003 Patrick Mochel + * Copyright (C) 2002-2003 Open Source Development Labs + * + * This file is released under the GPLv2. + * + */ + +#ifndef _KREF_H_ +#define _KREF_H_ + +#include + +#ifndef WARN_ON +#define WARN_ON(a) \ + if (a) \ + fprintf(stderr, "[WARN] - %s:%d %m\n", __func__, __LINE__); +#endif + +struct kref { + atomic_t refcount; +}; + +/** + * kref_init - initialize object. + * @kref: object in question. + */ +static inline void kref_init(struct kref *kref) +{ + atomic_set(&kref->refcount, 1); +} + +/** + * kref_get - increment refcount for object. + * @kref: object. + */ +static inline void kref_get(struct kref *kref) +{ + WARN_ON(!atomic_read(&kref->refcount)); + atomic_inc(&kref->refcount); +} + +/** + * kref_sub - subtract a number of refcounts for object. + * @kref: object. + * @count: Number of recounts to subtract. + * @release: pointer to the function that will clean up the object when the + * last reference to the object is released. + * This pointer is required, and it is not acceptable to pass kfree + * in as this function. If the caller does pass kfree to this + * function, you will be publicly mocked mercilessly by the kref + * maintainer, and anyone else who happens to notice it. You have + * been warned. + * + * Subtract @count from the refcount, and if 0, call release(). + * Return 1 if the object was removed, otherwise return 0. Beware, if this + * function returns 0, you still can not count on the kref from remaining in + * memory. Only use the return value if you want to see if the kref is now + * gone, not present. + */ +static inline int kref_sub(struct kref *kref, unsigned int count, + void (*release)(struct kref *kref)) +{ + WARN_ON(release == NULL); + + if (atomic_sub_and_test((int) count, &kref->refcount)) { + release(kref); + return 1; + } + return 0; +} + +/** + * kref_put - decrement refcount for object. + * @kref: object. 
+ * @release: pointer to the function that will clean up the object when the + * last reference to the object is released. + * This pointer is required, and it is not acceptable to pass kfree + * in as this function. If the caller does pass kfree to this + * function, you will be publicly mocked mercilessly by the kref + * maintainer, and anyone else who happens to notice it. You have + * been warned. + * + * Decrement the refcount, and if 0, call release(). + * Return 1 if the object was removed, otherwise return 0. Beware, if this + * function returns 0, you still can not count on the kref from remaining in + * memory. Only use the return value if you want to see if the kref is now + * gone, not present. + */ +static inline int kref_put(struct kref *kref, void (*release)(struct kref *kref)) +{ + return kref_sub(kref, 1, release); +} + + +/** + * kref_get_unless_zero - Increment refcount for object unless it is + * zero. + * @kref: object. + * + * Return non-zero if the increment succeeded. Otherwise return 0. + * + * This function is intended to simplify locking around refcounting for + * objects that can be looked up from a lookup structure, and which are + * removed from that lookup structure in the object destructor. + * Operations on such objects require at least a read lock around + * lookup + kref_get, and a write lock around kref_put + remove from lookup + * structure. Furthermore, RCU implementations become extremely tricky. + * With a lookup followed by a kref_get_unless_zero *with return value check* + * locking in the kref_put path can be deferred to the actual removal from + * the lookup structure and RCU lookups become trivial. + */ +static inline int kref_get_unless_zero(struct kref *kref) +{ + return __atomic_add_unless(&kref->refcount, 1, 0); +} + +#endif /* _KREF_H_ */ diff --git a/open_src/xio/src/usr/linux/list.h b/open_src/xio/src/usr/linux/list.h new file mode 100644 index 0000000..436c838 --- /dev/null +++ b/open_src/xio/src/usr/linux/list.h @@ -0,0 +1,775 @@ +#ifndef _LINUX_LIST_H +#define _LINUX_LIST_H + +/** + * @name from other kernel headers + */ +/*@{*/ + +/** + * Get offset of a member + */ +#ifndef offsetof +#define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER) +#endif + +/** + * Casts a member of a structure out to the containing structure + * @param ptr the pointer to the member. + * @param type the type of the container struct this is embedded in. + * @param member the name of the member within the struct. + * + */ +#ifndef container_of +#define container_of(ptr, type, member) ({ \ + const typeof(((type *)0)->member) * __mptr = (ptr); \ + (type *)((char *)__mptr - offsetof(type, member)); }) +#endif +/*@}*/ + +/* + * These are non-NULL pointers that will result in page faults + * under normal circumstances, used to verify that nobody uses + * non-initialized list entries. + */ +#define LIST_POISON1 ((void *)0x00100100) +#define LIST_POISON2 ((void *)0x00200200) + +/** + * Simple doubly linked list implementation. + * + * Some of the internal functions ("__xxx") are useful when + * manipulating whole lists rather than single entries, as + * sometimes we already know the next/prev entries and we can + * generate better code by using them directly rather than + * using the generic single-entry routines. 
+ */ +struct list_head { + struct list_head *next, *prev; +}; + +#define LIST_HEAD_INIT(name) { &(name), &(name) } + +#define LIST_HEAD(name) \ + struct list_head name = LIST_HEAD_INIT(name) + +static inline void INIT_LIST_HEAD(struct list_head *list) +{ + list->next = list; + list->prev = list; +} + +/* + * Insert a new entry between two known consecutive entries. + * + * This is only for internal list manipulation where we know + * the prev/next entries already! + */ +static inline void __list_add(struct list_head *_new, + struct list_head *prev, + struct list_head *next) +{ + next->prev = _new; + _new->next = next; + _new->prev = prev; + prev->next = _new; +} + +/** + * list_add - add a new entry + * @_new: new entry to be added + * @head: list head to add it after + * + * Insert a new entry after the specified head. + * This is good for implementing stacks. + */ +static inline void list_add(struct list_head *_new, struct list_head *head) +{ + __list_add(_new, head, head->next); +} + + +/** + * list_add_tail - add a new entry + * @_new: new entry to be added + * @head: list head to add it before + * + * Insert a new entry before the specified head. + * This is useful for implementing queues. + */ +static inline void list_add_tail(struct list_head *_new, struct list_head *head) +{ + __list_add(_new, head->prev, head); +} + +/* + * Delete a list entry by making the prev/next entries + * point to each other. + * + * This is only for internal list manipulation where we know + * the prev/next entries already! + */ +static inline void __list_del(struct list_head *prev, struct list_head *next) +{ + next->prev = prev; + prev->next = next; +} + +/** + * list_del - deletes entry from list. + * @entry: the element to delete from the list. + * Note: list_empty() on entry does not return true after this, the entry is + * in an undefined state. + */ +static inline void __list_del_entry(struct list_head *entry) +{ + __list_del(entry->prev, entry->next); +} + +static inline void list_del(struct list_head *entry) +{ + __list_del(entry->prev, entry->next); + entry->next = (struct list_head *)LIST_POISON1; + entry->prev = (struct list_head *)LIST_POISON2; +} + +/** + * list_replace - replace old entry by new one + * @old : the element to be replaced + * @_new : the new element to insert + * + * If @old was empty, it will be overwritten. + */ +static inline void list_replace(struct list_head *old, + struct list_head *_new) +{ + _new->next = old->next; + _new->next->prev = _new; + _new->prev = old->prev; + _new->prev->next = _new; +} + +static inline void list_replace_init(struct list_head *old, + struct list_head *_new) +{ + list_replace(old, _new); + INIT_LIST_HEAD(old); +} + +/** + * list_del_init - deletes entry from list and reinitialize it. + * @entry: the element to delete from the list. 
+ */ +static inline void list_del_init(struct list_head *entry) +{ + __list_del_entry(entry); + INIT_LIST_HEAD(entry); +} + +/** + * list_move - delete from one list and add as another's head + * @list: the entry to move + * @head: the head that will precede our entry + */ +static inline void list_move(struct list_head *list, struct list_head *head) +{ + __list_del_entry(list); + list_add(list, head); +} + +/** + * list_move_tail - delete from one list and add as another's tail + * @list: the entry to move + * @head: the head that will follow our entry + */ +static inline void list_move_tail(struct list_head *list, + struct list_head *head) +{ + __list_del_entry(list); + list_add_tail(list, head); +} + +/** + * list_is_last - tests whether @list is the last entry in list @head + * @list: the entry to test + * @head: the head of the list + */ +static inline int list_is_last(const struct list_head *list, + const struct list_head *head) +{ + return list->next == head; +} + +/** + * list_empty - tests whether a list is empty + * @head: the list to test. + */ +static inline int list_empty(const struct list_head *head) +{ + return head->next == head; +} + +/** + * list_empty_careful - tests whether a list is empty and not being modified + * @head: the list to test + * + * Description: + * tests whether a list is empty _and_ checks that no other CPU might be + * in the process of modifying either member (next or prev) + * + * NOTE: using list_empty_careful() without synchronization + * can only be safe if the only activity that can happen + * to the list entry is list_del_init(). Eg. it cannot be used + * if another CPU could re-list_add() it. + */ +static inline int list_empty_careful(const struct list_head *head) +{ + struct list_head *next = head->next; + return (next == head) && (next == head->prev); +} + +/** + * list_rotate_left - rotate the list to the left + * @head: the head of the list + */ +static inline void list_rotate_left(struct list_head *head) +{ + struct list_head *first; + + if (!list_empty(head)) { + first = head->next; + list_move_tail(first, head); + } +} + +/** + * list_is_singular - tests whether a list has just one entry. + * @head: the list to test. + */ +static inline int list_is_singular(const struct list_head *head) +{ + return !list_empty(head) && (head->next == head->prev); +} + +static inline void __list_cut_position(struct list_head *list, + struct list_head *head, struct list_head *entry) +{ + struct list_head *new_first = entry->next; + list->next = head->next; + list->next->prev = list; + list->prev = entry; + entry->next = list; + head->next = new_first; + new_first->prev = head; +} + +/** + * list_cut_position - cut a list into two + * @list: a new list to add all removed entries + * @head: a list with entries + * @entry: an entry within head, could be the head itself + * and if so we won't cut the list + * + * This helper moves the initial part of @head, up to and + * including @entry, from @head to @list. You should + * pass on @entry an element you know is on @head. @list + * should be an empty list or a list you do not care about + * losing its data. 
+ * + */ +static inline void list_cut_position(struct list_head *list, + struct list_head *head, struct list_head *entry) +{ + if (list_empty(head)) + return; + if (list_is_singular(head) && + (head->next != entry && head != entry)) + return; + if (entry == head) + INIT_LIST_HEAD(list); + else + __list_cut_position(list, head, entry); +} + +static inline void __list_splice(const struct list_head *list, + struct list_head *prev, + struct list_head *next) +{ + struct list_head *first = list->next; + struct list_head *last = list->prev; + + first->prev = prev; + prev->next = first; + + last->next = next; + next->prev = last; +} + +/** + * list_splice - join two lists, this is designed for stacks + * @list: the new list to add. + * @head: the place to add it in the first list. + */ +static inline void list_splice(const struct list_head *list, + struct list_head *head) +{ + if (!list_empty(list)) + __list_splice(list, head, head->next); +} + +/** + * list_splice_tail - join two lists, each list being a queue + * @list: the new list to add. + * @head: the place to add it in the first list. + */ +static inline void list_splice_tail(struct list_head *list, + struct list_head *head) +{ + if (!list_empty(list)) + __list_splice(list, head->prev, head); +} + +/** + * list_splice_init - join two lists and reinitialise the emptied list. + * @list: the new list to add. + * @head: the place to add it in the first list. + * + * The list at @list is reinitialised + */ +static inline void list_splice_init(struct list_head *list, + struct list_head *head) +{ + if (!list_empty(list)) { + __list_splice(list, head, head->next); + INIT_LIST_HEAD(list); + } +} + +/** + * list_splice_tail_init - join two lists and reinitialise the emptied list + * @list: the new list to add. + * @head: the place to add it in the first list. + * + * Each of the lists is a queue. + * The list at @list is reinitialised + */ +static inline void list_splice_tail_init(struct list_head *list, + struct list_head *head) +{ + if (!list_empty(list)) { + __list_splice(list, head->prev, head); + INIT_LIST_HEAD(list); + } +} + +/** + * list_entry - get the struct for this entry + * @ptr: the &struct list_head pointer. + * @type: the type of the struct this is embedded in. + * @member: the name of the list_struct within the struct. + */ +#define list_entry(ptr, type, member) \ + container_of(ptr, type, member) + +/** + * list_first_entry - get the first element from a list + * @ptr: the list head to take the element from. + * @type: the type of the struct this is embedded in. + * @member: the name of the list_struct within the struct. + * + * Note, that list is expected to be not empty. + */ +#define list_first_entry(ptr, type, member) \ + list_entry((ptr)->next, type, member) + +/** + * list_last_entry - get the last element from a list + * @ptr: the list head to take the element from. + * @type: the type of the struct this is embedded in. + * @member: the name of the list_struct within the struct. + * + * Note, that list is expected to be not empty. + */ +#define list_last_entry(ptr, type, member) \ + list_entry((ptr)->prev, type, member) + +/** + * list_first_entry_or_null - get the first element from a list + * @ptr: the list head to take the element from. + * @type: the type of the struct this is embedded in. + * @member: the name of the list_struct within the struct. + * + * Note that if the list is empty, it returns NULL. + */ +#define list_first_entry_or_null(ptr, type, member) \ + (!list_empty(ptr) ? 
list_first_entry(ptr, type, member) : NULL) + +/** + * list_next_entry - get the next element in list + * @pos: the type * to cursor + * @member: the name of the list_struct within the struct. + */ +#define list_next_entry(pos, member) \ + list_entry((pos)->member.next, typeof(*(pos)), member) + +/** + * list_prev_entry - get the prev element in list + * @pos: the type * to cursor + * @member: the name of the list_struct within the struct. + */ +#define list_prev_entry(pos, member) \ + list_entry((pos)->member.prev, typeof(*(pos)), member) + +/** + * list_for_each - iterate over a list + * @pos: the &struct list_head to use as a loop cursor. + * @head: the head for your list. + */ +#define list_for_each(pos, head) \ + for (pos = (head)->next; pos != (head); pos = pos->next) + +/** + * list_for_each_prev - iterate over a list backwards + * @pos: the &struct list_head to use as a loop cursor. + * @head: the head for your list. + */ +#define list_for_each_prev(pos, head) \ + for (pos = (head)->prev; pos != (head); pos = pos->prev) + +/** + * list_for_each_safe - iterate over a list safe against removal of list entry + * @pos: the &struct list_head to use as a loop cursor. + * @n: another &struct list_head to use as temporary storage + * @head: the head for your list. + */ +#define list_for_each_safe(pos, n, head) \ + for (pos = (head)->next, n = pos->next; pos != (head); \ + pos = n, n = pos->next) + +/** + * list_for_each_prev_safe - iterate over a list backwards safe against removal of list entry + * @pos: the &struct list_head to use as a loop cursor. + * @n: another &struct list_head to use as temporary storage + * @head: the head for your list. + */ +#define list_for_each_prev_safe(pos, n, head) \ + for (pos = (head)->prev, n = pos->prev; \ + pos != (head); \ + pos = n, n = pos->prev) + +/** + * list_for_each_entry - iterate over list of given type + * @pos: the type * to use as a loop cursor. + * @head: the head for your list. + * @member: the name of the list_struct within the struct. + */ +#define list_for_each_entry(pos, head, member) \ + for (pos = list_first_entry(head, typeof(*pos), member); \ + &pos->member != (head); \ + pos = list_next_entry(pos, member)) + +/** + * list_for_each_entry_reverse - iterate backwards over list of given type. + * @pos: the type * to use as a loop cursor. + * @head: the head for your list. + * @member: the name of the list_struct within the struct. + */ +#define list_for_each_entry_reverse(pos, head, member) \ + for (pos = list_last_entry(head, typeof(*pos), member); \ + &pos->member != (head); \ + pos = list_prev_entry(pos, member)) + +/** + * list_prepare_entry - prepare a pos entry for use in list_for_each_entry_continue() + * @pos: the type * to use as a start point + * @head: the head of the list + * @member: the name of the list_struct within the struct. + * + * Prepares a pos entry for use as a start point in + * list_for_each_entry_continue(). + */ +#define list_prepare_entry(pos, head, member) \ + ((pos) ? : list_entry(head, typeof(*pos), member)) + +/** + * list_for_each_entry_continue - continue iteration over list of given type + * @pos: the type * to use as a loop cursor. + * @head: the head for your list. + * @member: the name of the list_struct within the struct. + * + * Continue to iterate over list of given type, continuing after + * the current position. 
+ */ +#define list_for_each_entry_continue(pos, head, member) \ + for (pos = list_next_entry(pos, member); \ + &pos->member != (head); \ + pos = list_next_entry(pos, member)) + +/** + * list_for_each_entry_continue_reverse - iterate backwards from the given point + * @pos: the type * to use as a loop cursor. + * @head: the head for your list. + * @member: the name of the list_struct within the struct. + * + * Start to iterate over list of given type backwards, continuing after + * the current position. + */ +#define list_for_each_entry_continue_reverse(pos, head, member) \ + for (pos = list_prev_entry(pos, member); \ + &pos->member != (head); \ + pos = list_prev_entry(pos, member)) + +/** + * list_for_each_entry_from - iterate over list of given type from the current point + * @pos: the type * to use as a loop cursor. + * @head: the head for your list. + * @member: the name of the list_struct within the struct. + * + * Iterate over list of given type, continuing from current position. + */ +#define list_for_each_entry_from(pos, head, member) \ + for (; &pos->member != (head); \ + pos = list_next_entry(pos, member)) + +/** + * list_for_each_entry_safe - iterate over list of given type safe against removal of list entry + * @pos: the type * to use as a loop cursor. + * @n: another type * to use as temporary storage + * @head: the head for your list. + * @member: the name of the list_struct within the struct. + */ +#define list_for_each_entry_safe(pos, n, head, member) \ + for (pos = list_first_entry(head, typeof(*pos), member), \ + n = list_next_entry(pos, member); \ + &pos->member != (head); \ + pos = n, n = list_next_entry(n, member)) +/** + * list_for_each_entry_safe_continue - continue list iteration safe against removal + * @pos: the type * to use as a loop cursor. + * @n: another type * to use as temporary storage + * @head: the head for your list. + * @member: the name of the list_struct within the struct. + * + * Iterate over list of given type, continuing after current point, + * safe against removal of list entry. + */ +#define list_for_each_entry_safe_continue(pos, n, head, member) \ + for (pos = list_next_entry(pos, member), \ + n = list_next_entry(pos, member); \ + &pos->member != (head); \ + pos = n, n = list_next_entry(n, member)) + +/** + * list_for_each_entry_safe_from - iterate over list from current point safe against removal + * @pos: the type * to use as a loop cursor. + * @n: another type * to use as temporary storage + * @head: the head for your list. + * @member: the name of the list_struct within the struct. + * + * Iterate over list of given type from current point, safe against + * removal of list entry. + */ +#define list_for_each_entry_safe_from(pos, n, head, member) \ + for (n = list_next_entry(pos, member); \ + &pos->member != (head); \ + pos = n, n = list_next_entry(n, member)) + +/** + * list_for_each_entry_safe_reverse - iterate backwards over list safe against removal + * @pos: the type * to use as a loop cursor. + * @n: another type * to use as temporary storage + * @head: the head for your list. + * @member: the name of the list_struct within the struct. + * + * Iterate backwards over list of given type, safe against removal + * of list entry. 
+ */ +#define list_for_each_entry_safe_reverse(pos, n, head, member) \ + for (pos = list_last_entry(head, typeof(*pos), member), \ + n = list_prev_entry(pos, member); \ + &pos->member != (head); \ + pos = n, n = list_prev_entry(n, member)) + +/** + * list_safe_reset_next - reset a stale list_for_each_entry_safe loop + * @pos: the loop cursor used in the list_for_each_entry_safe loop + * @n: temporary storage used in list_for_each_entry_safe + * @member: the name of the list_struct within the struct. + * + * list_safe_reset_next is not safe to use in general if the list may be + * modified concurrently (eg. the lock is dropped in the loop body). An + * exception to this is if the cursor element (pos) is pinned in the list, + * and list_safe_reset_next is called after re-taking the lock and before + * completing the current iteration of the loop body. + */ +#define list_safe_reset_next(pos, n, member) \ + n = list_next_entry(pos, member) + +/* + * Double linked lists with a single pointer list head. + * Mostly useful for hash tables where the two pointer list head is + * too wasteful. + * You lose the ability to access the tail in O(1). + */ + +struct hlist_head { + struct hlist_node *first; +}; + +struct hlist_node { + struct hlist_node *next, **pprev; +}; + + +#define HLIST_HEAD_INIT { .first = NULL } +#define HLIST_HEAD(name) struct hlist_head name = { .first = NULL } +#define INIT_HLIST_HEAD(ptr) ((ptr)->first = NULL) +static inline void INIT_HLIST_NODE(struct hlist_node *h) +{ + h->next = NULL; + h->pprev = NULL; +} + +static inline int hlist_unhashed(const struct hlist_node *h) +{ + return !h->pprev; +} + +static inline int hlist_empty(const struct hlist_head *h) +{ + return !h->first; +} + +static inline void __hlist_del(struct hlist_node *n) +{ + struct hlist_node *next = n->next; + struct hlist_node **pprev = n->pprev; + *pprev = next; + if (next) + next->pprev = pprev; +} + +static inline void hlist_del(struct hlist_node *n) +{ + __hlist_del(n); + n->next = (struct hlist_node *)LIST_POISON1; + n->pprev = (struct hlist_node **)LIST_POISON2; +} + +static inline void hlist_del_init(struct hlist_node *n) +{ + if (!hlist_unhashed(n)) { + __hlist_del(n); + INIT_HLIST_NODE(n); + } +} + +static inline void hlist_add_head(struct hlist_node *n, struct hlist_head *h) +{ + struct hlist_node *first = h->first; + n->next = first; + if (first) + first->pprev = &n->next; + h->first = n; + n->pprev = &h->first; +} + +/* next must be != NULL */ +static inline void hlist_add_before(struct hlist_node *n, + struct hlist_node *next) +{ + n->pprev = next->pprev; + n->next = next; + next->pprev = &n->next; + *(n->pprev) = n; +} + +static inline void hlist_add_behind(struct hlist_node *n, + struct hlist_node *prev) +{ + n->next = prev->next; + prev->next = n; + n->pprev = &prev->next; + + if (n->next) + n->next->pprev = &n->next; +} + +/* after that we'll appear to be on some hlist and hlist_del will work */ +static inline void hlist_add_fake(struct hlist_node *n) +{ + n->pprev = &n->next; +} + +/* + * Move a list from one list head to another. Fixup the pprev + * reference of the first entry if it exists. 
+ */ +static inline void hlist_move_list(struct hlist_head *old, + struct hlist_head *_new) +{ + _new->first = old->first; + if (_new->first) + _new->first->pprev = &_new->first; + old->first = NULL; +} + +#define hlist_entry(ptr, type, member) container_of(ptr,type,member) + +#define hlist_for_each(pos, head) \ + for (pos = (head)->first; pos ; pos = pos->next) + +#define hlist_for_each_safe(pos, n, head) \ + for (pos = (head)->first; pos && ({ n = pos->next; 1; }); \ + pos = n) + +#define hlist_entry_safe(ptr, type, member) \ + ({ typeof(ptr) ____ptr = (ptr); \ + ____ptr ? hlist_entry(____ptr, type, member) : NULL; \ + }) + +/** + * hlist_for_each_entry - iterate over list of given type + * @pos: the type * to use as a loop cursor. + * @head: the head for your list. + * @member: the name of the hlist_node within the struct. + */ +#define hlist_for_each_entry(pos, head, member) \ + for (pos = hlist_entry_safe((head)->first, typeof(*(pos)), member);\ + pos; \ + pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member)) + +/** + * hlist_for_each_entry_continue - iterate over a hlist continuing after current point + * @pos: the type * to use as a loop cursor. + * @member: the name of the hlist_node within the struct. + */ +#define hlist_for_each_entry_continue(pos, member) \ + for (pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member);\ + pos; \ + pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member)) + +/** + * hlist_for_each_entry_from - iterate over a hlist continuing from current point + * @pos: the type * to use as a loop cursor. + * @member: the name of the hlist_node within the struct. + */ +#define hlist_for_each_entry_from(pos, member) \ + for (; pos; \ + pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member)) + +/** + * hlist_for_each_entry_safe - iterate over list of given type safe against removal of list entry + * @pos: the type * to use as a loop cursor. + * @n: another &struct hlist_node to use as temporary storage + * @head: the head for your list. + * @member: the name of the hlist_node within the struct. 
+ */ +#define hlist_for_each_entry_safe(pos, n, head, member) \ + for (pos = hlist_entry_safe((head)->first, typeof(*pos), member);\ + pos && ({ n = pos->member.next; 1; }); \ + pos = hlist_entry_safe(n, typeof(*pos), member)) + + +#endif + diff --git a/open_src/xio/src/usr/linux/printk.h b/open_src/xio/src/usr/linux/printk.h new file mode 100644 index 0000000..4bfa292 --- /dev/null +++ b/open_src/xio/src/usr/linux/printk.h @@ -0,0 +1,19 @@ +#ifndef __KERNEL_PRINTK__ +#define __KERNEL_PRINTK__ + + +enum { + DUMP_PREFIX_NONE, + DUMP_PREFIX_ADDRESS, + DUMP_PREFIX_OFFSET +}; +extern void hex_dump_to_buffer(const void *buf, size_t len, + int rowsize, int groupsize, + char *linebuf, size_t linebuflen, bool ascii); +extern void print_hex_dump(const char *level, const char *prefix_str, + int prefix_type, int rowsize, int groupsize, + const void *buf, size_t len, bool ascii); + +extern void print_hex_dump_bytes(const char *prefix_str, int prefix_type, + const void *buf, size_t len); +#endif diff --git a/open_src/xio/src/usr/linux/slab.h b/open_src/xio/src/usr/linux/slab.h new file mode 100644 index 0000000..c5e5c7c --- /dev/null +++ b/open_src/xio/src/usr/linux/slab.h @@ -0,0 +1,57 @@ +#ifndef _LINUX_SLAB_H +#define _LINUX_SLAB_H + +#include +#include +#include "xio_mem.h" + +#define ___GFP_WAIT 0x10u +#define ___GFP_IO 0x40u +#define ___GFP_FS 0x80u + +#define GFP_KERNEL (___GFP_WAIT | ___GFP_IO | ___GFP_FS) + +/* should be __bitwise__ but it is dummy */ +typedef unsigned gfp_t; + +static inline void kfree(const void *ptr) +{ + ufree((void *) ptr); +} + +static inline void *kmalloc(size_t size, gfp_t flags) +{ + /* Make sure code transfered to kernel will work as expected */ + assert(flags == GFP_KERNEL); + return umalloc(size); +} + +/** + * kcalloc - allocate memory for an array. The memory is set to zero. + * @n: number of elements. + * @size: element size. + * @flags: the type of memory to allocate (see kmalloc). + */ +static inline void *kcalloc(size_t n, size_t size, gfp_t flags) +{ + /* Make sure code transfered to kernel will work as expected */ + assert(flags == GFP_KERNEL); + return ucalloc(n, size); +} + +static inline void *vmalloc(unsigned long size) +{ + return umalloc(size); +} + +static inline void *vzalloc(unsigned long size) +{ + return ucalloc(1, size); +} + +static inline void vfree(const void *addr) +{ + ufree((void *) addr); +} + +#endif /* _LINUX_SLAB_H */ diff --git a/open_src/xio/src/usr/linux/usr.h b/open_src/xio/src/usr/linux/usr.h new file mode 100644 index 0000000..7057f5f --- /dev/null +++ b/open_src/xio/src/usr/linux/usr.h @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef XIO_USR_H +#define XIO_USR_H + +#ifndef EXPORT_SYMBOL +#define EXPORT_SYMBOL(sym) +#endif + +#endif diff --git a/open_src/xio/src/usr/transport/rdma/ib_cm.h b/open_src/xio/src/usr/transport/rdma/ib_cm.h new file mode 100644 index 0000000..1754c6a --- /dev/null +++ b/open_src/xio/src/usr/transport/rdma/ib_cm.h @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2004, 2005 Intel Corporation. All rights reserved. + * Copyright (c) 2004 Topspin Corporation. All rights reserved. + * Copyright (c) 2004 Voltaire Corporation. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#if !defined(IB_CM_H) +#define IB_CM_H + +enum ib_cm_rej_reason { + IB_CM_REJ_NO_QP = 1, + IB_CM_REJ_NO_EEC = 2, + IB_CM_REJ_NO_RESOURCES = 3, + IB_CM_REJ_TIMEOUT = 4, + IB_CM_REJ_UNSUPPORTED = 5, + IB_CM_REJ_INVALID_COMM_ID = 6, + IB_CM_REJ_INVALID_COMM_INSTANCE = 7, + IB_CM_REJ_INVALID_SERVICE_ID = 8, + IB_CM_REJ_INVALID_TRANSPORT_TYPE = 9, + IB_CM_REJ_STALE_CONN = 10, + IB_CM_REJ_RDC_NOT_EXIST = 11, + IB_CM_REJ_INVALID_GID = 12, + IB_CM_REJ_INVALID_LID = 13, + IB_CM_REJ_INVALID_SL = 14, + IB_CM_REJ_INVALID_TRAFFIC_CLASS = 15, + IB_CM_REJ_INVALID_HOP_LIMIT = 16, + IB_CM_REJ_INVALID_PACKET_RATE = 17, + IB_CM_REJ_INVALID_ALT_GID = 18, + IB_CM_REJ_INVALID_ALT_LID = 19, + IB_CM_REJ_INVALID_ALT_SL = 20, + IB_CM_REJ_INVALID_ALT_TRAFFIC_CLASS = 21, + IB_CM_REJ_INVALID_ALT_HOP_LIMIT = 22, + IB_CM_REJ_INVALID_ALT_PACKET_RATE = 23, + IB_CM_REJ_PORT_CM_REDIRECT = 24, + IB_CM_REJ_PORT_REDIRECT = 25, + IB_CM_REJ_INVALID_MTU = 26, + IB_CM_REJ_INSUFFICIENT_RESP_RESOURCES = 27, + IB_CM_REJ_CONSUMER_DEFINED = 28, + IB_CM_REJ_INVALID_RNR_RETRY = 29, + IB_CM_REJ_DUPLICATE_LOCAL_COMM_ID = 30, + IB_CM_REJ_INVALID_CLASS_VERSION = 31, + IB_CM_REJ_INVALID_FLOW_LABEL = 32, + IB_CM_REJ_INVALID_ALT_FLOW_LABEL = 33 +}; + +#endif /* IB_CM_H */ diff --git a/open_src/xio/src/usr/transport/rdma/xio_rdma_datapath.c b/open_src/xio/src/usr/transport/rdma/xio_rdma_datapath.c new file mode 100644 index 0000000..db75e16 --- /dev/null +++ b/open_src/xio/src/usr/transport/rdma/xio_rdma_datapath.c @@ -0,0 +1,5181 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ +#include <xio_os.h> +#include <infiniband/verbs.h> +#include <rdma/rdma_cma.h> + +#include "libxio.h" +#include "xio_log.h" +#include "xio_common.h" +#include "xio_observer.h" +#include "xio_protocol.h" +#include "xio_mbuf.h" +#include "xio_task.h" +#include "xio_usr_transport.h" +#include "xio_transport.h" +#include "xio_protocol.h" +#include "get_clock.h" +#include "xio_mem.h" +#include "xio_rdma_utils.h" +#include "xio_ev_data.h" +#include "xio_sg_table.h" +#include "xio_objpool.h" +#include "xio_workqueue.h" +#include "xio_context.h" +#include "xio_rdma_transport.h" + +/*---------------------------------------------------------------------------*/ +/* forward declarations */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_on_recv_req(struct xio_rdma_transport *rdma_hndl, + struct xio_task *task); +static int xio_rdma_on_recv_rsp(struct xio_rdma_transport *rdma_hndl, + struct xio_task *task); +static int xio_rdma_on_setup_msg(struct xio_rdma_transport *rdma_hndl, + struct xio_task *task); +static int xio_rdma_on_rsp_send_comp(struct xio_rdma_transport *rdma_hndl, + struct xio_task *task); +static int xio_rdma_on_req_send_comp(struct xio_rdma_transport *rdma_hndl, + struct xio_task *task); +static int xio_rdma_on_direct_rdma_comp(struct xio_rdma_transport *rdma_hndl, + struct xio_task *task, + enum xio_wc_op op); +static int xio_rdma_on_recv_nop(struct xio_rdma_transport *rdma_hndl, + struct xio_task *task); +static int xio_rdma_on_recv_cancel_req(struct xio_rdma_transport *rdma_hndl, + struct xio_task *task); +static int xio_rdma_on_recv_cancel_rsp(struct xio_rdma_transport *rdma_hndl, + struct xio_task *task); +#ifndef XIO_SRQ_ENABLE +static int xio_rdma_send_nop(struct xio_rdma_transport *rdma_hndl); +#endif +static int xio_sched_rdma_wr_req(struct xio_rdma_transport *rdma_hndl, + struct xio_task *task); +static void xio_sched_consume_cq(void *data); +static void xio_sched_poll_cq(void *data); + +static int xio_rdma_send_rdma_read_ack(struct xio_rdma_transport *rdma_hndl, + int rtid); +static int xio_rdma_on_recv_rdma_read_ack(struct xio_rdma_transport *rdma_hndl, + struct xio_task *task); +static int xio_sched_rdma_rd(struct xio_rdma_transport *rdma_hndl, + struct xio_task *task); +static int xio_rdma_post_recv_rsp(struct xio_task *task); + +/*---------------------------------------------------------------------------*/ +/* xio_post_recv */ +/*---------------------------------------------------------------------------*/ +int xio_post_recv(struct xio_rdma_transport *rdma_hndl, + struct xio_task *task, int num_recv_bufs) +{ + XIO_TO_RDMA_TASK(task, rdma_task); + struct ibv_recv_wr *bad_wr = NULL; + int retval, nr_posted; + +#ifdef XIO_SRQ_ENABLE + retval = ibv_post_srq_recv(rdma_hndl->tcq->srq->srq, + &rdma_task->rxd.recv_wr, &bad_wr); +#else + retval = ibv_post_recv(rdma_hndl->qp, &rdma_task->rxd.recv_wr, &bad_wr); +#endif + if (likely(!retval)) { + nr_posted = num_recv_bufs; + } else { + struct ibv_recv_wr *wr; + nr_posted = 0; + for (wr = &rdma_task->rxd.recv_wr; wr != bad_wr; wr = wr->next) + nr_posted++; + + xio_set_error(retval); + ERROR_LOG("ibv_post_recv failed. 
(errno=%d %s)\n", + retval, strerror(retval)); + } +#ifdef XIO_SRQ_ENABLE + rdma_hndl->tcq->srq->rqe_avail += nr_posted; +#else + rdma_hndl->rqe_avail += nr_posted; +#endif + + /* credit updates */ + rdma_hndl->credits += nr_posted; + + return retval; +} + +/*---------------------------------------------------------------------------*/ +/* xio_post_send */ +/*---------------------------------------------------------------------------*/ +static int xio_post_send(struct xio_rdma_transport *rdma_hndl, + struct xio_work_req *xio_send, + int num_send_reqs) +{ + struct ibv_send_wr *bad_wr; + int retval, nr_posted; + + /* + TRACE_LOG("num_sge:%d, len1:%d, len2:%d, send_flags:%d\n", + xio_send->send_wr.num_sge, + xio_send->send_wr.sg_list[0].length, + xio_send->send_wr.sg_list[1].length, + xio_send->send_wr.send_flags); + */ + retval = ibv_post_send(rdma_hndl->qp, &xio_send->send_wr, &bad_wr); + if (likely(!retval)) { + nr_posted = num_send_reqs; + } else { + struct ibv_send_wr *wr; + + nr_posted = 0; + for (wr = &xio_send->send_wr; wr != bad_wr; wr = wr->next) + nr_posted++; + + xio_set_error(retval); + + ERROR_LOG("ibv_post_send failed. (errno=%d %s) posted:%d/%d " \ + "sge_sz:%d, sqe_avail:%d\n", retval, strerror(retval), + nr_posted, num_send_reqs, xio_send->send_wr.num_sge, + rdma_hndl->sqe_avail); + } + rdma_hndl->sqe_avail -= nr_posted; + + return retval; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_write_sn */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_write_sn(struct xio_task *task, + uint16_t sn, uint16_t ack_sn, uint16_t credits) +{ + uint16_t *psn; + struct xio_mbuf *mbuf = &task->mbuf; + + /* save the current place */ + xio_mbuf_push(mbuf); + /* goto the first transport header*/ + xio_mbuf_set_trans_hdr(mbuf); + + /* jump over the first uint32_t */ + xio_mbuf_inc(mbuf, sizeof(uint32_t)); + + /* and set serial number */ + psn = (uint16_t *)xio_mbuf_get_curr_ptr(mbuf); + *psn = htons(sn); + + xio_mbuf_inc(mbuf, sizeof(uint16_t)); + + /* and set ack serial number */ + psn = (uint16_t *)xio_mbuf_get_curr_ptr(mbuf); + *psn = htons(ack_sn); + + xio_mbuf_inc(mbuf, sizeof(uint16_t)); + + /* and set credits */ + psn = (uint16_t *)xio_mbuf_get_curr_ptr(mbuf); + *psn = htons(credits); + + /* pop to the original place */ + xio_mbuf_pop(mbuf); + + return 0; +} + +static inline uint16_t tx_window_sz(struct xio_rdma_transport *rdma_hndl) +{ + return rdma_hndl->max_sn - rdma_hndl->sn; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_xmit */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_xmit(struct xio_rdma_transport *rdma_hndl) +{ + struct xio_task *task = NULL, *task1, *task2; + struct xio_rdma_task *rdma_task = NULL; + struct xio_rdma_task *prev_rdma_task = NULL; + struct xio_work_req *first_wr = NULL; + struct xio_work_req *curr_wr = NULL; + struct xio_work_req *last_wr = NULL; + struct xio_work_req *prev_wr = &rdma_hndl->dummy_wr; + uint16_t tx_window; + uint16_t window = 0; + uint16_t retval; + uint16_t req_nr = 0; + + tx_window = tx_window_sz(rdma_hndl); +#ifdef XIO_SRQ_ENABLE + window = min(rdma_hndl->sqe_avail, tx_window); +#else + /* save one credit for nop */ + if (rdma_hndl->peer_credits > 1) { + window = min(rdma_hndl->peer_credits - 1, tx_window); + window = min(window, rdma_hndl->sqe_avail); + } +#endif + /* + TRACE_LOG("XMIT: tx_window:%d, peer_credits:%d, 
sqe_avail:%d\n", + tx_window, + rdma_hndl->peer_credits, + rdma_hndl->sqe_avail); + */ + if (window == 0) { + xio_set_error(EAGAIN); + return -1; + } + + /* if "ready to send queue" is not empty */ + while (rdma_hndl->tx_ready_tasks_num) { + task = list_first_entry( + &rdma_hndl->tx_ready_list, + struct xio_task, tasks_list_entry); + + rdma_task = (struct xio_rdma_task *)task->dd_data; + + /* prefetch next buffer */ + if (rdma_hndl->tx_ready_tasks_num > 2) { + task1 = list_first_entry_or_null( + &task->tasks_list_entry, + struct xio_task, tasks_list_entry); + if (task1) { + xio_prefetch(task1->mbuf.buf.head); + task2 = list_first_entry_or_null( + &task1->tasks_list_entry, + struct xio_task, + tasks_list_entry); + if (task2) + xio_prefetch(task2->mbuf.buf.head); + } + } + + /* phantom task */ + if (rdma_task->phantom_idx) { + if (req_nr >= window) + break; + curr_wr = &rdma_task->rdmad; + + prev_wr->send_wr.next = &curr_wr->send_wr; + + prev_rdma_task = rdma_task; + prev_wr = curr_wr; + req_nr++; + rdma_hndl->tx_ready_tasks_num--; + + rdma_task->txd.send_wr.send_flags &= ~IBV_SEND_SIGNALED; + + list_move_tail(&task->tasks_list_entry, + &rdma_hndl->in_flight_list); + continue; + } + if (rdma_task->out_ib_op == XIO_IB_RDMA_WRITE) { + if (req_nr >= (window - 1)) + break; + + curr_wr = &rdma_task->rdmad; + /* prepare it for rdma wr and concatenate the send + * wr to it */ + rdma_task->rdmad.send_wr.next = &rdma_task->txd.send_wr; + rdma_task->txd.send_wr.send_flags |= IBV_SEND_SIGNALED; + + curr_wr = &rdma_task->rdmad; + last_wr = &rdma_task->txd; + + req_nr++; + } else if (rdma_task->out_ib_op == XIO_IB_RDMA_WRITE_DIRECT || + rdma_task->out_ib_op == XIO_IB_RDMA_READ_DIRECT) { + if (req_nr >= window) + break; + rdma_task->rdmad.send_wr.send_flags |= + IBV_SEND_SIGNALED; + curr_wr = &rdma_task->rdmad; + last_wr = curr_wr; + } else { + if (req_nr >= window) + break; + curr_wr = &rdma_task->txd; + last_wr = curr_wr; + } + if (rdma_task->out_ib_op != XIO_IB_RDMA_WRITE_DIRECT && + rdma_task->out_ib_op != XIO_IB_RDMA_READ_DIRECT) { + xio_rdma_write_sn(task, rdma_hndl->sn, + rdma_hndl->ack_sn, + rdma_hndl->credits); + rdma_task->sn = rdma_hndl->sn; + rdma_hndl->sn++; + rdma_hndl->sim_peer_credits += rdma_hndl->credits; + rdma_hndl->credits = 0; + rdma_hndl->peer_credits--; + } + if (IS_REQUEST(task->tlv_type) || + task->tlv_type == XIO_MSG_TYPE_RDMA) + rdma_hndl->reqs_in_flight_nr++; + else if (IS_RESPONSE(task->tlv_type)) + rdma_hndl->rsps_in_flight_nr++; + else + ERROR_LOG("Unexpected tlv_type %u\n", task->tlv_type); + + prev_wr->send_wr.next = &curr_wr->send_wr; + prev_wr = last_wr; + + prev_rdma_task = rdma_task; + req_nr++; + rdma_hndl->tx_ready_tasks_num--; + list_move_tail(&task->tasks_list_entry, + &rdma_hndl->in_flight_list); + } + if (req_nr) { + first_wr = container_of(rdma_hndl->dummy_wr.send_wr.next, + struct xio_work_req, send_wr); + prev_rdma_task->txd.send_wr.next = NULL; + if (tx_window_sz(rdma_hndl) < 1 || + rdma_hndl->sqe_avail < req_nr + 1) + prev_rdma_task->txd.send_wr.send_flags |= + IBV_SEND_SIGNALED; + retval = xio_post_send(rdma_hndl, first_wr, req_nr); + if (unlikely(retval != 0)) { + ERROR_LOG("xio_post_send failed\n"); + return -1; + } + } + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_xmit_rdma_rd */ +/*---------------------------------------------------------------------------*/ +static int xio_xmit_rdma_rd_(struct xio_rdma_transport *rdma_hndl, + struct list_head *rdma_rd_list, + struct list_head 
*rdma_rd_in_flight_list, + int *rdma_rd_in_flight, + int *kick_rdma_rd) +{ + struct xio_task *task = NULL; + struct xio_rdma_task *rdma_task = NULL; + struct xio_work_req *first_wr = NULL; + struct xio_work_req *prev_wr = &rdma_hndl->dummy_wr; + struct xio_work_req *curr_wr = NULL; + int num_reqs = 0; + int err; + + if (list_empty(rdma_rd_list) || + rdma_hndl->sqe_avail == 0) + goto exit; + + do { + task = list_first_entry( + rdma_rd_list, + struct xio_task, tasks_list_entry); + list_move_tail(&task->tasks_list_entry, + rdma_rd_in_flight_list); + rdma_task = (struct xio_rdma_task *)task->dd_data; + + /* pending "sends" that were delayed for rdma read completion + * are moved to wait in the in_flight list + * because of the need to keep order + */ + if (rdma_task->out_ib_op == XIO_IB_RECV) { + (*rdma_rd_in_flight)++; + continue; + } + + /* prepare it for rdma read */ + curr_wr = &rdma_task->rdmad; + prev_wr->send_wr.next = &curr_wr->send_wr; + prev_wr = &rdma_task->rdmad; + + num_reqs++; + } while (!list_empty(rdma_rd_list) && + rdma_hndl->sqe_avail > num_reqs); + + if (num_reqs) { + first_wr = container_of(rdma_hndl->dummy_wr.send_wr.next, + struct xio_work_req, send_wr); + prev_wr->send_wr.next = NULL; + (*rdma_rd_in_flight) += num_reqs; + /* submit the chain of rdma-rd requests, start from the first */ + err = xio_post_send(rdma_hndl, first_wr, num_reqs); + if (unlikely(err)) + ERROR_LOG("xio_post_send failed\n"); + + /* ToDo: error handling */ + } + +exit: + *kick_rdma_rd = !list_empty(rdma_rd_list); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_xmit_rdma_rd_req */ +/*---------------------------------------------------------------------------*/ +static inline int xio_xmit_rdma_rd_req(struct xio_rdma_transport *rdma_hndl) +{ + return xio_xmit_rdma_rd_(rdma_hndl, + &rdma_hndl->rdma_rd_req_list, + &rdma_hndl->rdma_rd_req_in_flight_list, + &rdma_hndl->rdma_rd_req_in_flight, + &rdma_hndl->kick_rdma_rd_req); +} + +/*---------------------------------------------------------------------------*/ +/* xio_xmit_rdma_rd_rsp */ +/*---------------------------------------------------------------------------*/ +static inline int xio_xmit_rdma_rd_rsp(struct xio_rdma_transport *rdma_hndl) +{ + return xio_xmit_rdma_rd_(rdma_hndl, + &rdma_hndl->rdma_rd_rsp_list, + &rdma_hndl->rdma_rd_rsp_in_flight_list, + &rdma_hndl->rdma_rd_rsp_in_flight, + &rdma_hndl->kick_rdma_rd_rsp); +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_rearm_rq */ +/*---------------------------------------------------------------------------*/ +int xio_rdma_rearm_rq(struct xio_rdma_transport *rdma_hndl) +{ + struct xio_task *first_task = NULL; + struct xio_task *task = NULL; + struct xio_task *prev_task = NULL; + struct xio_rdma_task *rdma_task = NULL; + struct xio_rdma_task *prev_rdma_task = NULL; + int num_to_post; + int i; + +#ifdef XIO_SRQ_ENABLE + num_to_post = SRQ_DEPTH - rdma_hndl->tcq->srq->rqe_avail; +#else + num_to_post = rdma_hndl->rq_depth + EXTRA_RQE - rdma_hndl->rqe_avail; +#endif + for (i = 0; i < num_to_post; i++) { + /* get ready to receive message */ + task = xio_rdma_primary_task_alloc(rdma_hndl); + if (task == 0) { + ERROR_LOG("primary tasks pool is empty\n"); + return -1; + } + /* initialize the rxd */ + rdma_task = (struct xio_rdma_task *)task->dd_data; + if (!first_task) + first_task = task; + else + prev_rdma_task->rxd.recv_wr.next = + &rdma_task->rxd.recv_wr; + + prev_task = task; + prev_rdma_task = 
rdma_task; + rdma_task->out_ib_op = XIO_IB_RECV; +#ifdef XIO_SRQ_ENABLE + list_add_tail(&task->tasks_list_entry, + &rdma_hndl->tcq->srq->rx_list); +#else + list_add_tail(&task->tasks_list_entry, &rdma_hndl->rx_list); +#endif + } + if (prev_task) { + prev_rdma_task->rxd.recv_wr.next = NULL; + xio_post_recv(rdma_hndl, first_task, num_to_post); + } + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_rx_error_handler */ +/*---------------------------------------------------------------------------*/ +static inline int xio_rdma_rx_error_handler( + struct xio_rdma_transport *rdma_hndl, + struct xio_task *task) +{ + /* remove the task from rx list */ + xio_tasks_pool_put(task); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_tx_error_handler */ +/*---------------------------------------------------------------------------*/ +static inline int xio_rdma_tx_error_handler( + struct xio_rdma_transport *rdma_hndl, + struct xio_task *task) +{ + /* remove the task from in-flight list */ + xio_tasks_pool_put(task); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_rd_error_handler */ +/*---------------------------------------------------------------------------*/ +static inline int xio_rdma_rd_error_handler( + struct xio_rdma_transport *rdma_hndl, + struct xio_task *task) +{ + /* remove the task from rdma rd in-flight list */ + xio_tasks_pool_put(task); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_wr_error_handler */ +/*---------------------------------------------------------------------------*/ +static inline int xio_rdma_wr_error_handler( + struct xio_rdma_transport *rdma_hndl, + struct xio_task *task) +{ + XIO_TO_RDMA_TASK(task, rdma_task); + + if (rdma_task->out_ib_op == XIO_IB_RDMA_WRITE_DIRECT) + return 0; + + /* wait for the concatenated "send" */ + rdma_task->out_ib_op = XIO_IB_SEND; + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_handle_task_error */ +/*---------------------------------------------------------------------------*/ +static void xio_handle_task_error(struct xio_task *task) +{ + XIO_TO_RDMA_TASK(task, rdma_task); + XIO_TO_RDMA_HNDL(task, rdma_hndl); + + switch (rdma_task->out_ib_op) { + case XIO_IB_RECV: + /* this should be the Flush, no task has been created yet */ + xio_rdma_rx_error_handler(rdma_hndl, task); + break; + case XIO_IB_SEND: + /* the task should be completed now */ + xio_rdma_tx_error_handler(rdma_hndl, task); + break; + case XIO_IB_RDMA_READ: + case XIO_IB_RDMA_READ_DIRECT: + xio_rdma_rd_error_handler(rdma_hndl, task); + break; + case XIO_IB_RDMA_WRITE: + case XIO_IB_RDMA_WRITE_DIRECT: + xio_rdma_wr_error_handler(rdma_hndl, task); + break; + default: + ERROR_LOG("unknown out_ib_op: task:%p, type:0x%x, " \ + "magic:0x%x, out_ib_op:0x%x\n", + task, task->tlv_type, + task->magic, rdma_task->out_ib_op); + break; + } +} + +/*---------------------------------------------------------------------------*/ +/* xio_handle_wc_error */ +/*---------------------------------------------------------------------------*/ +static void xio_handle_wc_error(struct ibv_wc *wc, struct xio_srq *srq) +{ + struct xio_task *task = (struct xio_task *)ptr_from_int64(wc->wr_id); + struct xio_rdma_task *rdma_task = NULL; + struct xio_rdma_transport *rdma_hndl = NULL; + int retval; + 
struct xio_key_int32 key; + + /* complete in case all flush errors were consumed */ + if (task && task->dd_data == ptr_from_int64(XIO_BEACON_WRID)) { + rdma_hndl = container_of(task, + struct xio_rdma_transport, + beacon_task); + + rdma_hndl->beacon_sent = 0; + TRACE_LOG("beacon rdma_hndl:%p\n", rdma_hndl); + xio_set_timewait_timer(rdma_hndl); + kref_put(&rdma_hndl->base.kref, xio_rdma_close_cb); + return; + } + if (task && task->dd_data) { + rdma_task = (struct xio_rdma_task *)task->dd_data; + if (srq) { + key.id = wc->qp_num; + HT_LOOKUP(&srq->ht_rdma_hndl, &key, rdma_hndl, + rdma_hndl_htbl); + } else { + rdma_hndl = (struct xio_rdma_transport *)task->context; + } + } + + if (wc->status == IBV_WC_WR_FLUSH_ERR) { + TRACE_LOG("rdma_hndl:%p, rdma_task:%p, task:%p, " \ + "wr_id:0x%llx, " \ + "err:%s, vendor_err:0x%x\n", + rdma_hndl, rdma_task, task, + wc->wr_id, + ibv_wc_status_str(wc->status), + wc->vendor_err); + } else { + if (rdma_hndl) { + ERROR_LOG("[%s] - state:%d, rdma_hndl:%p, " \ + "rdma_task:%p, task:%p, wr_id:0x%lx, " \ + "err:%s, vendor_err:0x%x, " \ + "byte_len:%d, opcode:0x%x\n", + rdma_hndl->base.is_client ? + "client" : "server", + rdma_hndl->state, + rdma_hndl, rdma_task, task, + wc->wr_id, + ibv_wc_status_str(wc->status), + wc->vendor_err, + wc->byte_len, + wc->opcode); + if (task->omsg) + xio_msg_dump(task->omsg); + } else { + ERROR_LOG("wr_id:0x%lx, err:%s, vendor_err:0x%x " \ + "byte_len:%d, opcode:0x%x\n", + wc->wr_id, + ibv_wc_status_str(wc->status), + wc->vendor_err, + wc->byte_len, + wc->opcode); + } + ERROR_LOG("qp_num:0x%x, src_qp:0x%x, wc_flags:0x%x, " \ + "pkey_index:%d, slid:%d, sl:0x%x, dlid_path_bits:0x%x\n", + wc->qp_num, wc->src_qp, wc->wc_flags, wc->pkey_index, + wc->slid, wc->sl, wc->dlid_path_bits); + } + if (task && rdma_task) + xio_handle_task_error(task); + + /* temporary */ + if (wc->status != IBV_WC_WR_FLUSH_ERR) { + if (rdma_hndl) { + ERROR_LOG("cq error reported. calling " \ + "rdma_disconnect. rdma_hndl:%p\n", + rdma_hndl); + retval = rdma_disconnect(rdma_hndl->cm_id); + if (retval) + ERROR_LOG("rdma_hndl:%p rdma_disconnect" \ + "failed, %m\n", rdma_hndl); + } else { + /* TODO: handle each error specifically */ + ERROR_LOG("ASSERT: program abort\n"); + exit(0); + } + } +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_idle_handler */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_idle_handler(struct xio_rdma_transport *rdma_hndl) +{ + if (rdma_hndl->state != XIO_TRANSPORT_STATE_CONNECTED || + !rdma_hndl->primary_pool_cls.task_lookup) + return 0; + + /* Does the local have resources to send message? */ + if (!rdma_hndl->sqe_avail) + return 0; + + /* Try to do some useful work, want to spend time before calling the + * pool, this increase the chance that more messages will arrive + * and request notify will not be necessary + */ + + if (rdma_hndl->kick_rdma_rd_req) + xio_xmit_rdma_rd_req(rdma_hndl); + + if (rdma_hndl->kick_rdma_rd_rsp) + xio_xmit_rdma_rd_rsp(rdma_hndl); + + /* Does the local have resources to send message? + * xio_xmit_rdma_rd may consumed the sqe_avail + */ + if (!rdma_hndl->sqe_avail) + return 0; + +#ifndef XIO_SRQ_ENABLE + /* Can the peer receive messages? 
*/ + if (!rdma_hndl->peer_credits) + return 0; +#endif + + /* If we have real messages to send there is no need for + * a special NOP message as credits are piggybacked + */ + if (rdma_hndl->tx_ready_tasks_num) { + xio_rdma_xmit(rdma_hndl); + return 0; + } + +#ifndef XIO_SRQ_ENABLE + /* Does the peer have already maximum credits? */ + if (rdma_hndl->sim_peer_credits >= MAX_RECV_WR) + return 0; + + /* Does the local have any credits to send? */ + if (!rdma_hndl->credits) + return 0; + + TRACE_LOG("peer_credits:%d, credits:%d sim_peer_credits:%d\n", + rdma_hndl->peer_credits, rdma_hndl->credits, + rdma_hndl->sim_peer_credits); + + xio_rdma_send_nop(rdma_hndl); +#endif + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_rx_handler */ +/*---------------------------------------------------------------------------*/ +static XIO_F_ALWAYS_INLINE int xio_rdma_rx_handler( + struct xio_rdma_transport *rdma_hndl, + struct xio_task *task) +{ + struct list_head *task_prev; + struct xio_task *task1, *task2; + int must_send = 0; + int retval; + + /* prefetch next buffer */ + if (likely(task->tasks_list_entry.next != + task->tasks_list_entry.prev)) { + task1 = list_entry(task->tasks_list_entry.next, + struct xio_task, tasks_list_entry); + task_prev = task->tasks_list_entry.prev; + xio_prefetch(task1->mbuf.buf.head); + } else { + task1 = NULL; + task_prev = NULL; + } + + /* rearm the receive queue */ + /* + if ((rdma_hndl->state == XIO_TRANSPORT_STATE_CONNECTED) && + (rdma_hndl->rqe_avail <= rdma_hndl->rq_depth + 1)) + xio_rdma_rearm_rq(rdma_hndl); + */ + + retval = xio_mbuf_read_first_tlv(&task->mbuf); + + task->tlv_type = xio_mbuf_tlv_type(&task->mbuf); + list_move_tail(&task->tasks_list_entry, &rdma_hndl->io_list); +#ifdef XIO_SRQ_ENABLE + rdma_hndl->tcq->srq->rqe_avail--; +#else + rdma_hndl->rqe_avail--; + rdma_hndl->sim_peer_credits--; +#endif + /* call recv completion */ + switch (task->tlv_type) { + case XIO_CREDIT_NOP: + xio_rdma_on_recv_nop(rdma_hndl, task); +#ifdef XIO_SRQ_ENABLE + if (rdma_hndl->tcq->srq->rqe_avail <= SRQ_DEPTH + 1) +#else + if (rdma_hndl->rqe_avail <= rdma_hndl->rq_depth + 1) +#endif + xio_rdma_rearm_rq(rdma_hndl); + must_send = 1; + break; + case XIO_RDMA_READ_ACK: + xio_rdma_on_recv_rdma_read_ack(rdma_hndl, task); +#ifdef XIO_SRQ_ENABLE + if (rdma_hndl->tcq->srq->rqe_avail <= SRQ_DEPTH + 1) +#else + if (rdma_hndl->rqe_avail <= rdma_hndl->rq_depth + 1) +#endif + xio_rdma_rearm_rq(rdma_hndl); + must_send = 1; + break; + case XIO_NEXUS_SETUP_REQ: + case XIO_NEXUS_SETUP_RSP: + xio_rdma_on_setup_msg(rdma_hndl, task); + break; + case XIO_CANCEL_REQ: + xio_rdma_on_recv_cancel_req(rdma_hndl, task); + break; + case XIO_CANCEL_RSP: + xio_rdma_on_recv_cancel_rsp(rdma_hndl, task); + break; + default: + /* rearm the receive queue */ +#ifdef XIO_SRQ_ENABLE + if (rdma_hndl->tcq->srq->rqe_avail <= SRQ_DEPTH + 1) +#else + if (rdma_hndl->rqe_avail <= rdma_hndl->rq_depth + 1) +#endif + xio_rdma_rearm_rq(rdma_hndl); + if (IS_REQUEST(task->tlv_type)) + xio_rdma_on_recv_req(rdma_hndl, task); + else if (IS_RESPONSE(task->tlv_type)) + xio_rdma_on_recv_rsp(rdma_hndl, task); + else + ERROR_LOG("unknown message type:0x%x\n", + task->tlv_type); + break; + } + /* + if (rdma_hndl->state != XIO_TRANSPORT_STATE_CONNECTED) + return retval; + */ + + /* transmit ready packets */ + if (!must_send && rdma_hndl->tx_ready_tasks_num) + must_send = (tx_window_sz(rdma_hndl) >= SEND_THRESHOLD); + + /* resource are now available and rdma rd requests are pending 
kick + * them + */ + if (rdma_hndl->kick_rdma_rd_req) + xio_xmit_rdma_rd_req(rdma_hndl); + + if (rdma_hndl->kick_rdma_rd_rsp) + xio_xmit_rdma_rd_rsp(rdma_hndl); + + if (must_send) + xio_rdma_xmit(rdma_hndl); + + /* prefetch next buffer */ + if (task1) { + if (task1->tasks_list_entry.next != task_prev) { + task2 = list_entry(task1->tasks_list_entry.next, + struct xio_task, tasks_list_entry); + xio_prefetch(task2); + } + } + return retval; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_tx_comp_handler */ +/*---------------------------------------------------------------------------*/ +static XIO_F_ALWAYS_INLINE int xio_rdma_tx_comp_handler( + struct xio_rdma_transport *rdma_hndl, + struct xio_task *task) +{ + struct xio_task *ptask, *next_ptask; + struct xio_rdma_task *rdma_task; + int found = 0; + int removed = 0; + + /* If we got a completion, it means all the previous tasks should've + been sent by now - due to ordering */ + list_for_each_entry_safe(ptask, next_ptask, &rdma_hndl->in_flight_list, + tasks_list_entry) { + list_move_tail(&ptask->tasks_list_entry, + &rdma_hndl->tx_comp_list); + removed++; + rdma_task = (struct xio_rdma_task *)ptask->dd_data; + + rdma_hndl->sqe_avail++; + + /* phantom task */ + if (rdma_task->phantom_idx) { + xio_tasks_pool_put(ptask); + continue; + } + + /* rdma wr utilizes two wqe but appears only once in the + * in flight list + */ + if (rdma_task->out_ib_op == XIO_IB_RDMA_WRITE) + rdma_hndl->sqe_avail++; + + if (IS_RDMA_RD_ACK(ptask->tlv_type)) { + rdma_hndl->rsps_in_flight_nr--; + xio_tasks_pool_put(ptask); + } else if (IS_REQUEST(ptask->tlv_type)) { + rdma_hndl->max_sn++; + rdma_hndl->reqs_in_flight_nr--; + xio_rdma_on_req_send_comp(rdma_hndl, ptask); + xio_tasks_pool_put(ptask); + } else if (IS_RESPONSE(ptask->tlv_type)) { + rdma_hndl->max_sn++; + rdma_hndl->rsps_in_flight_nr--; + xio_rdma_on_rsp_send_comp(rdma_hndl, ptask); + } else if (IS_NOP(ptask->tlv_type)) { + rdma_hndl->rsps_in_flight_nr--; + xio_tasks_pool_put(ptask); + } else if (ptask->tlv_type == XIO_MSG_TYPE_RDMA) { + if (rdma_task->out_ib_op == XIO_IB_RDMA_WRITE_DIRECT) { + rdma_hndl->reqs_in_flight_nr--; + xio_rdma_on_direct_rdma_comp( + rdma_hndl, ptask, + XIO_WC_OP_RDMA_WRITE); + xio_tasks_pool_put(ptask); + } + } else { + ERROR_LOG("unexpected task %p tlv %u type:0x%x id:%d " \ + "magic:0x%x\n", + ptask, ptask->tlv_type, rdma_task->out_ib_op, + ptask->ltid, ptask->magic); + continue; + } + if (ptask == task) { + found = 1; + break; + } + } + /* resource are now available and rdma rd requests are pending kick + * them + */ + if (rdma_hndl->kick_rdma_rd_req) + xio_xmit_rdma_rd_req(rdma_hndl); + + if (rdma_hndl->kick_rdma_rd_rsp) + xio_xmit_rdma_rd_rsp(rdma_hndl); + + if (rdma_hndl->tx_ready_tasks_num) + xio_rdma_xmit(rdma_hndl); + + if (!found && removed) + ERROR_LOG("not found but removed %d type:0x%x\n", + removed, task->tlv_type); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_rd_req_comp_handler */ +/*---------------------------------------------------------------------------*/ +static void xio_direct_rdma_rd_comp_handler( + struct xio_rdma_transport *rdma_hndl, + struct xio_task *task) +{ + XIO_TO_RDMA_TASK(task, rdma_task); + + rdma_hndl->sqe_avail++; + + if (rdma_task->phantom_idx == 0) { + rdma_hndl->reqs_in_flight_nr--; + xio_rdma_on_direct_rdma_comp(rdma_hndl, task, + XIO_WC_OP_RDMA_READ); + } else { + xio_tasks_pool_put(task); + xio_xmit_rdma_rd_req(rdma_hndl); + 
} +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_rd_req_comp_handler */ +/*---------------------------------------------------------------------------*/ +static XIO_F_ALWAYS_INLINE void xio_rdma_rd_req_comp_handler( + struct xio_rdma_transport *rdma_hndl, + struct xio_task *task) +{ + XIO_TO_RDMA_TASK(task, rdma_task); + union xio_transport_event_data event_data; + struct xio_transport_base *transport = + (struct xio_transport_base *)rdma_hndl; + + rdma_hndl->rdma_rd_req_in_flight--; + rdma_hndl->sqe_avail++; + + if (rdma_task->phantom_idx == 0) { + if (task->state == XIO_TASK_STATE_CANCEL_PENDING) { + TRACE_LOG("[%d] - **** message is canceled\n", + rdma_task->sn); + xio_rdma_cancel_rsp(transport, task, XIO_E_MSG_CANCELED, + NULL, 0); + xio_tasks_pool_put(task); + xio_xmit_rdma_rd_req(rdma_hndl); + return; + } + + list_move_tail(&task->tasks_list_entry, &rdma_hndl->io_list); + + xio_xmit_rdma_rd_req(rdma_hndl); + + /* fill notification event */ + event_data.msg.op = XIO_WC_OP_RECV; + event_data.msg.task = task; + + xio_transport_notify_observer(&rdma_hndl->base, + XIO_TRANSPORT_EVENT_NEW_MESSAGE, + &event_data); + + while (rdma_hndl->rdma_rd_req_in_flight) { + task = list_first_entry( + &rdma_hndl->rdma_rd_req_in_flight_list, + struct xio_task, tasks_list_entry); + + rdma_task = (struct xio_rdma_task *)task->dd_data; + + if (rdma_task->out_ib_op != XIO_IB_RECV) + break; + + /* tasks that arrived in Send/Receive while pending + * "RDMA READ" tasks were in flight was fenced. + */ + rdma_hndl->rdma_rd_req_in_flight--; + list_move_tail(&task->tasks_list_entry, + &rdma_hndl->io_list); + event_data.msg.op = XIO_WC_OP_RECV; + event_data.msg.task = task; + + xio_transport_notify_observer( + &rdma_hndl->base, + XIO_TRANSPORT_EVENT_NEW_MESSAGE, + &event_data); + } + } else { + xio_tasks_pool_put(task); + xio_xmit_rdma_rd_req(rdma_hndl); + } +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_rd_rsp_comp_handler */ +/*---------------------------------------------------------------------------*/ +static XIO_F_ALWAYS_INLINE void xio_rdma_rd_rsp_comp_handler( + struct xio_rdma_transport *rdma_hndl, + struct xio_task *task) +{ + XIO_TO_RDMA_TASK(task, rdma_task); + union xio_transport_event_data event_data; + struct xio_transport_base *transport = + (struct xio_transport_base *)rdma_hndl; + + rdma_hndl->rdma_rd_rsp_in_flight--; + rdma_hndl->sqe_avail++; + + if (rdma_task->phantom_idx == 0) { + if (task->state == XIO_TASK_STATE_CANCEL_PENDING) { + TRACE_LOG("[%d] - **** message is canceled\n", + rdma_task->sn); + xio_rdma_cancel_rsp(transport, task, XIO_E_MSG_CANCELED, + NULL, 0); + xio_tasks_pool_put(task); + xio_xmit_rdma_rd_rsp(rdma_hndl); + return; + } + + list_move_tail(&task->tasks_list_entry, &rdma_hndl->io_list); + + /* notify the peer that it can free resources */ + xio_rdma_send_rdma_read_ack(rdma_hndl, task->rtid); + + xio_xmit_rdma_rd_rsp(rdma_hndl); + + /* copy from task->in to sender_task->in */ + xio_rdma_post_recv_rsp(task); + + /* fill notification event */ + event_data.msg.op = XIO_WC_OP_RECV; + event_data.msg.task = task; + + xio_transport_notify_observer(&rdma_hndl->base, + XIO_TRANSPORT_EVENT_NEW_MESSAGE, + &event_data); + + while (rdma_hndl->rdma_rd_rsp_in_flight) { + task = list_first_entry( + &rdma_hndl->rdma_rd_rsp_in_flight_list, + struct xio_task, tasks_list_entry); + + rdma_task = (struct xio_rdma_task *)task->dd_data; + + if (rdma_task->out_ib_op != XIO_IB_RECV) + break; + + /* 
tasks that arrived in Send/Receive while pending + * "RDMA READ" tasks were in flight was fenced. + */ + rdma_hndl->rdma_rd_rsp_in_flight--; + list_move_tail(&task->tasks_list_entry, + &rdma_hndl->io_list); + event_data.msg.op = XIO_WC_OP_RECV; + event_data.msg.task = task; + + xio_transport_notify_observer( + &rdma_hndl->base, + XIO_TRANSPORT_EVENT_NEW_MESSAGE, + &event_data); + } + } else { + xio_tasks_pool_put(task); + xio_xmit_rdma_rd_rsp(rdma_hndl); + } +} + +/*---------------------------------------------------------------------------*/ +/* xio_handle_wc */ +/*---------------------------------------------------------------------------*/ +static XIO_F_ALWAYS_INLINE void xio_handle_wc(struct ibv_wc *wc, + int last_in_rxq, struct xio_srq *srq) +{ + struct xio_task *task = (struct xio_task *)ptr_from_int64(wc->wr_id); + int opcode = wc->opcode; + struct xio_key_int32 key; + struct xio_rdma_transport *rdma_hndl; + + if (srq) { + key.id = wc->qp_num; + HT_LOOKUP(&srq->ht_rdma_hndl, &key, rdma_hndl, rdma_hndl_htbl); + } else { + rdma_hndl = (struct xio_rdma_transport *)task->context; + } + + /* + TRACE_LOG("received opcode :%s [%x]\n", + ibv_wc_opcode_str(wc->opcode), wc->opcode); + */ + + switch (opcode) { + case IBV_WC_RECV: + task->last_in_rxq = last_in_rxq; + xio_rdma_rx_handler(rdma_hndl, task); + break; + case IBV_WC_SEND: + case IBV_WC_RDMA_WRITE: + if (opcode == IBV_WC_SEND || + (opcode == IBV_WC_RDMA_WRITE && + task->tlv_type == XIO_MSG_TYPE_RDMA)) + xio_rdma_tx_comp_handler(rdma_hndl, task); + break; + case IBV_WC_RDMA_READ: + task->last_in_rxq = last_in_rxq; + if (IS_REQUEST(task->tlv_type)) + xio_rdma_rd_req_comp_handler(rdma_hndl, task); + else if (IS_RESPONSE(task->tlv_type)) + xio_rdma_rd_rsp_comp_handler(rdma_hndl, task); + else if (task->tlv_type == XIO_MSG_TYPE_RDMA) + xio_direct_rdma_rd_comp_handler(rdma_hndl, task); + else + ERROR_LOG("Unexpected tlv_type %u\n", task->tlv_type); + break; + default: + ERROR_LOG("unknown opcode :%s [%x]\n", + ibv_wc_opcode_str(wc->opcode), wc->opcode); + break; + } +} + +/* + * Could read as many entries as possible without blocking, but + * that just fills up a list of tasks. Instead pop out of here + * so that tx progress, like issuing rdma reads and writes, can + * happen periodically. 
+ */ +static int xio_poll_cq(struct xio_cq *tcq, int max_wc, int timeout_us) +{ + int err = 0; + int stop = 0, tlv_type; + int wclen = max_wc, i, numwc = 0; + int timeouts_num = 0; + int polled = 0, last_in_rxq = -1; + cycles_t timeout; + cycles_t start_time = 0; + struct ibv_wc *wc; + struct xio_task *task; + struct xio_rdma_task *rdma_task; + + for (;;) { + if (wclen > tcq->wc_array_len) + wclen = tcq->wc_array_len; + + if (xio_context_is_loop_stopping(tcq->ctx) && polled) { + err = 0; /* same as in budget */ + stop = 1; + break; + } + err = ibv_poll_cq(tcq->cq, wclen, tcq->wc_array); + polled = 1; + if (err == 0) { /* no completions retrieved */ + if (timeout_us == 0) + break; + /* wait timeout before going out */ + if (timeouts_num == 0) { + start_time = get_cycles(); + } else { + /*calculate it again, need to spend time */ + timeout = timeout_us * g_mhz; + if (timeout_us > 0 && + (get_cycles() - start_time) > timeout) + break; + } + if (xio_context_is_loop_stopping(tcq->ctx)) { + err = 0; /* same as in budget */ + stop = 1; + break; + } + + timeouts_num++; + continue; + } + + if (unlikely(err < 0)) { + ERROR_LOG("ibv_poll_cq failed\n"); + break; + } + timeouts_num = 0; + + wc = &tcq->wc_array[err - 1]; + for (i = err - 1; i >= 0; i--) { + if (wc->status == IBV_WC_SUCCESS && + (wc->opcode == IBV_WC_RECV || wc->opcode == IBV_WC_RDMA_READ)) { + task = (struct xio_task *) + ptr_from_int64(wc->wr_id); + rdma_task = (struct xio_rdma_task *)task->dd_data; + if (!rdma_task->phantom_idx) { + tlv_type = xio_mbuf_read_type(&task->mbuf); + if (IS_APPLICATION_MSG(tlv_type)) { + last_in_rxq = i; + break; + } + } + } + wc--; + } + wc = &tcq->wc_array[0]; + for (i = 0; i < err; i++) { + if (likely(wc->status == IBV_WC_SUCCESS)) + xio_handle_wc(wc, (i == last_in_rxq), tcq->srq); + else + xio_handle_wc_error(wc, tcq->srq); + wc++; + } + numwc += err; + if (numwc == max_wc) { + err = 1; + break; + } + wclen = max_wc - numwc; + } + + return stop ? -1 : err; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rearm_completions */ +/*---------------------------------------------------------------------------*/ +static void xio_rearm_completions(struct xio_cq *tcq) +{ + int err; + + err = ibv_req_notify_cq(tcq->cq, 0); + if (unlikely(err)) { + ERROR_LOG("ibv_req_notify_cq failed. 
(errno=%d %m)\n", + errno); + } + + memset(&tcq->consume_cq_event, 0, + sizeof(tcq->consume_cq_event)); + tcq->consume_cq_event.handler = xio_sched_consume_cq; + tcq->consume_cq_event.data = tcq; + + xio_context_add_event(tcq->ctx, &tcq->consume_cq_event); + + tcq->num_delayed_arm = 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_poll_cq_armable */ +/*---------------------------------------------------------------------------*/ +static void xio_poll_cq_armable(struct xio_cq *tcq) +{ + int err = -1; + + if (++tcq->num_poll_cq < NUM_POLL_CQ) + err = xio_poll_cq(tcq, MAX_POLL_WC, tcq->ctx->polling_timeout); + if (unlikely(err < 0)) { + struct xio_rdma_transport *rdma_hndl; + + xio_rearm_completions(tcq); + list_for_each_entry(rdma_hndl, + &tcq->trans_list, trans_list_entry) { + xio_rdma_idle_handler(rdma_hndl); + } + return; + } + + if (err == 0 && (++tcq->num_delayed_arm == MAX_NUM_DELAYED_ARM)) { + /* no more completions on cq, give up and arm the interrupts */ + xio_rearm_completions(tcq); + } else { + memset(&tcq->poll_cq_event, 0, sizeof(tcq->poll_cq_event)); + tcq->poll_cq_event.handler = xio_sched_poll_cq; + tcq->poll_cq_event.data = tcq; + + xio_context_add_event(tcq->ctx, &tcq->poll_cq_event); + } +} + +/* xio_sched_consume_cq() is scheduled to consume completion events that + could arrive after the cq had been seen empty, but just before + the interrupts were re-armed. + Intended to consume those remaining completions only, the function + does not re-arm interrupts, but polls the cq until it's empty. + As we always limit the number of completions polled at a time, we may + need to schedule this functions few times. + It may happen that during this process new completions occur, and + we get an interrupt about that. Some of the "new" completions may be + processed by the self-scheduling xio_sched_consume_cq(), which is + a good thing, because we don't need to wait for the interrupt event. + When the interrupt notification arrives, its handler will remove the + scheduled event, and call xio_poll_cq_armable(), so that the polling + cycle resumes normally. +*/ +static void xio_sched_consume_cq(void *data) +{ + struct xio_cq *tcq = (struct xio_cq *)data; + int err; + + if (++tcq->num_poll_cq >= NUM_POLL_CQ) + return; + + err = xio_poll_cq(tcq, MAX_POLL_WC, tcq->ctx->polling_timeout); + if (err > 0) { + memset(&tcq->consume_cq_event, 0, + sizeof(tcq->consume_cq_event)); + tcq->consume_cq_event.handler = xio_sched_consume_cq; + tcq->consume_cq_event.data = tcq; + + xio_context_add_event(tcq->ctx, &tcq->consume_cq_event); + } +} + +/* Scheduled to poll cq after a completion event has been + received and acknowledged, if no more completions are found + the interrupts are re-armed */ +static void xio_sched_poll_cq(void *data) +{ + struct xio_rdma_transport *rdma_hndl; + struct xio_cq *tcq = (struct xio_cq *)data; + + xio_poll_cq_armable(tcq); + + list_for_each_entry(rdma_hndl, + &tcq->trans_list, trans_list_entry) { + xio_rdma_idle_handler(rdma_hndl); + } +} + +/* + * Called from main event loop when a CQ notification is available. 
+ */ +void xio_cq_event_handler(int fd __attribute__ ((unused)), + int events __attribute__ ((unused)), + void *data) +{ + void *cq_context; + struct ibv_cq *cq; + struct xio_cq *tcq = (struct xio_cq *)data; + int err; + + err = ibv_get_cq_event(tcq->channel, &cq, &cq_context); + if (unlikely(err != 0)) { + /* Just print the log message, if that was a serious problem, + it will express itself elsewhere */ + ERROR_LOG("failed to retrieve CQ event, cq:%p\n", cq); + return; + } + tcq->cq_events_that_need_ack++; + tcq->num_poll_cq = 0; + /* if a poll was previously scheduled, remove it, + as it will be scheduled when necessary */ + xio_context_disable_event(&tcq->poll_cq_event); + xio_context_disable_event(&tcq->consume_cq_event); + + xio_poll_cq_armable(tcq); + + /* accumulate number of cq events that need to + * be acked, and periodically ack them + */ + if (tcq->cq_events_that_need_ack == MAX_ACKED_CQE/*UINT_MAX*/) { + ibv_ack_cq_events(tcq->cq, MAX_ACKED_CQE/*UINT_MAX*/); + tcq->cq_events_that_need_ack = 0; + } +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_poll_completions */ +/*---------------------------------------------------------------------------*/ +void xio_rdma_poll_completions(struct xio_cq *tcq, int timeout_us) +{ + void *cq_context; + struct ibv_cq *cq; + int err; + struct xio_rdma_transport *rdma_hndl; + int cq_rearmed = 0; + + err = ibv_get_cq_event(tcq->channel, &cq, &cq_context); + if (!err) { + tcq->cq_events_that_need_ack++; + cq_rearmed = 1; + } else if (errno != EAGAIN) { + /* Just print the log message, if that was a serious problem, + it will express itself elsewhere */ + ERROR_LOG("failed to retrieve CQ event, cq:%p\n", cq); + return; + } + /* if a poll was previously scheduled, remove it, + as it will be scheduled when necessary */ + xio_context_disable_event(&tcq->poll_cq_event); + xio_context_disable_event(&tcq->consume_cq_event); + + xio_poll_cq(tcq, MAX_POLL_WC, timeout_us); + /* TODO rearm interrupts optimization */ + if (cq_rearmed == 1) { + err = ibv_req_notify_cq(tcq->cq, 0); + if (unlikely(err)) { + ERROR_LOG("ibv_req_notify_cq failed. 
(errno=%d %m)\n", + errno); + } + } + list_for_each_entry(rdma_hndl, + &tcq->trans_list, trans_list_entry) + xio_rdma_idle_handler(rdma_hndl); + + /* accumulate number of cq events that need to + * be acked, and periodically ack them + */ + if (tcq->cq_events_that_need_ack == MAX_ACKED_CQE/*UINT_MAX*/) { + ibv_ack_cq_events(tcq->cq, MAX_ACKED_CQE/*UINT_MAX*/); + tcq->cq_events_that_need_ack = 0; + } +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_write_req_header */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_write_req_header(struct xio_rdma_transport *rdma_hndl, + struct xio_task *task, + struct xio_rdma_req_hdr *req_hdr) +{ + XIO_TO_RDMA_TASK(task, rdma_task); + struct xio_rdma_req_hdr *tmp_req_hdr; + struct xio_sge *tmp_sge; + struct xio_sge sge; + struct ibv_mr *mr; + struct xio_sg_table_ops *sgtbl_ops; + void *sgtbl; + void *sg; + size_t hdr_len; + uint32_t i; + + sgtbl = xio_sg_table_get(&task->omsg->in); + sgtbl_ops = (struct xio_sg_table_ops *) + xio_sg_table_ops_get(task->omsg->in.sgl_type); + + /* point to transport header */ + xio_mbuf_set_trans_hdr(&task->mbuf); + tmp_req_hdr = (struct xio_rdma_req_hdr *) + xio_mbuf_get_curr_ptr(&task->mbuf); + + /* pack relevant values */ + tmp_req_hdr->version = req_hdr->version; + tmp_req_hdr->flags = req_hdr->flags; + PACK_SVAL(req_hdr, tmp_req_hdr, req_hdr_len); + /* sn shall be coded later */ + /* ack_sn shall be coded later */ + /* credits shall be coded later */ + PACK_LVAL(req_hdr, tmp_req_hdr, ltid); + tmp_req_hdr->in_ib_op = req_hdr->in_ib_op; + tmp_req_hdr->out_ib_op = req_hdr->out_ib_op; + PACK_SVAL(req_hdr, tmp_req_hdr, in_num_sge); + PACK_SVAL(req_hdr, tmp_req_hdr, out_num_sge); + PACK_SVAL(req_hdr, tmp_req_hdr, ulp_hdr_len); + PACK_SVAL(req_hdr, tmp_req_hdr, ulp_pad_len); + /*remain_data_len is not used */ + PACK_LLVAL(req_hdr, tmp_req_hdr, ulp_imm_len); + + tmp_sge = (struct xio_sge *)((uint8_t *)tmp_req_hdr + + sizeof(struct xio_rdma_req_hdr)); + + /* IN: requester expect small input written via send */ + sg = sge_first(sgtbl_ops, sgtbl); + if (req_hdr->in_ib_op == XIO_IB_SEND) { + for (i = 0; i < req_hdr->in_num_sge; i++) { + sge.addr = 0; + sge.length = sge_length(sgtbl_ops, sg); + sge.stag = 0; + PACK_LLVAL(&sge, tmp_sge, addr); + PACK_LVAL(&sge, tmp_sge, length); + PACK_LVAL(&sge, tmp_sge, stag); + tmp_sge++; + sg = sge_next(sgtbl_ops, sgtbl, sg); + } + } + /* IN: requester expect big input written rdma write */ + if (req_hdr->in_ib_op == XIO_IB_RDMA_WRITE) { + for (i = 0; i < req_hdr->in_num_sge; i++) { + sge.addr = uint64_from_ptr( + rdma_task->read_reg_mem[i].addr); + sge.length = rdma_task->read_reg_mem[i].length; + if (rdma_task->read_reg_mem[i].mr) { + mr = xio_rdma_mr_lookup( + rdma_task->read_reg_mem[i].mr, + rdma_hndl->tcq->dev); + if (!mr) + goto cleanup; + + sge.stag = mr->rkey; + } else { + sge.stag = 0; + } + PACK_LLVAL(&sge, tmp_sge, addr); + PACK_LVAL(&sge, tmp_sge, length); + PACK_LVAL(&sge, tmp_sge, stag); + tmp_sge++; + } + } + /* OUT: requester want to write data via rdma read */ + if (req_hdr->out_ib_op == XIO_IB_RDMA_READ) { + for (i = 0; i < req_hdr->out_num_sge; i++) { + sge.addr = uint64_from_ptr( + rdma_task->write_reg_mem[i].addr); + sge.length = rdma_task->write_reg_mem[i].length; + if (rdma_task->write_reg_mem[i].mr) { + mr = xio_rdma_mr_lookup( + rdma_task->write_reg_mem[i].mr, + rdma_hndl->tcq->dev); + if (!mr) + goto cleanup; + + sge.stag = mr->rkey; + } else { + sge.stag = 0; + } + 
PACK_LLVAL(&sge, tmp_sge, addr); + PACK_LVAL(&sge, tmp_sge, length); + PACK_LVAL(&sge, tmp_sge, stag); + tmp_sge++; + } + } + if (req_hdr->out_ib_op == XIO_IB_SEND) { + for (i = 0; i < req_hdr->out_num_sge; i++) { + sge.addr = 0; + sge.length = sge_length(sgtbl_ops, sg); + sge.stag = 0; + PACK_LLVAL(&sge, tmp_sge, addr); + PACK_LVAL(&sge, tmp_sge, length); + PACK_LVAL(&sge, tmp_sge, stag); + tmp_sge++; + sg = sge_next(sgtbl_ops, sgtbl, sg); + } + } + hdr_len = sizeof(struct xio_rdma_req_hdr); + hdr_len += sizeof(struct xio_sge) * (req_hdr->in_num_sge + + req_hdr->out_num_sge); +#ifdef EYAL_TODO + print_hex_dump_bytes("post_send: ", DUMP_PREFIX_ADDRESS, + task->mbuf.curr, + hdr_len + 16); +#endif + xio_mbuf_inc(&task->mbuf, hdr_len); + + return 0; + +cleanup: + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_read_req_header */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_read_req_header(struct xio_rdma_transport *rdma_hndl, + struct xio_task *task, + struct xio_rdma_req_hdr *req_hdr) +{ + XIO_TO_RDMA_TASK(task, rdma_task); + struct xio_rdma_req_hdr *tmp_req_hdr; + struct xio_sge *tmp_sge; + int i; + size_t hdr_len; + + /* point to transport header */ + xio_mbuf_set_trans_hdr(&task->mbuf); + tmp_req_hdr = (struct xio_rdma_req_hdr *) + xio_mbuf_get_curr_ptr(&task->mbuf); + + req_hdr->version = tmp_req_hdr->version; + req_hdr->flags = tmp_req_hdr->flags; + UNPACK_SVAL(tmp_req_hdr, req_hdr, req_hdr_len); + + if (unlikely(req_hdr->req_hdr_len != sizeof(struct xio_rdma_req_hdr))) { + ERROR_LOG( + "header length's read failed. arrived:%d expected:%zd\n", + req_hdr->req_hdr_len, sizeof(struct xio_rdma_req_hdr)); + return -1; + } + UNPACK_SVAL(tmp_req_hdr, req_hdr, sn); + UNPACK_SVAL(tmp_req_hdr, req_hdr, credits); + UNPACK_LVAL(tmp_req_hdr, req_hdr, ltid); + req_hdr->in_ib_op = tmp_req_hdr->in_ib_op; + req_hdr->out_ib_op = tmp_req_hdr->out_ib_op; + + UNPACK_SVAL(tmp_req_hdr, req_hdr, in_num_sge); + UNPACK_SVAL(tmp_req_hdr, req_hdr, out_num_sge); + UNPACK_SVAL(tmp_req_hdr, req_hdr, ulp_hdr_len); + UNPACK_SVAL(tmp_req_hdr, req_hdr, ulp_pad_len); + + /* remain_data_len not in use */ + UNPACK_LLVAL(tmp_req_hdr, req_hdr, ulp_imm_len); + + tmp_sge = (struct xio_sge *)((uint8_t *)tmp_req_hdr + + sizeof(struct xio_rdma_req_hdr)); + + rdma_task->sn = req_hdr->sn; + + /* params for SEND/RDMA WRITE */ + for (i = 0; i < req_hdr->in_num_sge; i++) { + UNPACK_LLVAL(tmp_sge, &rdma_task->req_in_sge[i], addr); + UNPACK_LVAL(tmp_sge, &rdma_task->req_in_sge[i], length); + UNPACK_LVAL(tmp_sge, &rdma_task->req_in_sge[i], stag); + tmp_sge++; + } + rdma_task->req_in_num_sge = i; + + /* params for SEND/RDMA_READ */ + for (i = 0; i < req_hdr->out_num_sge; i++) { + UNPACK_LLVAL(tmp_sge, &rdma_task->req_out_sge[i], addr); + UNPACK_LVAL(tmp_sge, &rdma_task->req_out_sge[i], length); + UNPACK_LVAL(tmp_sge, &rdma_task->req_out_sge[i], stag); + tmp_sge++; + } + rdma_task->req_out_num_sge = i; + + hdr_len = sizeof(struct xio_rdma_req_hdr); + hdr_len += sizeof(struct xio_sge) * (req_hdr->in_num_sge + + req_hdr->out_num_sge); + + xio_mbuf_inc(&task->mbuf, hdr_len); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_write_rsp_header */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_write_rsp_header(struct xio_rdma_transport *rdma_hndl, + struct xio_task *task, + struct xio_rdma_rsp_hdr *rsp_hdr) +{ + 
XIO_TO_RDMA_TASK(task, rdma_task); + struct xio_sge sge; + struct xio_rdma_rsp_hdr *tmp_rsp_hdr; + struct xio_sge *tmp_sge; + struct ibv_mr *mr; + size_t hdr_len; + uint32_t *wr_len; + int i; + + /* point to transport header */ + xio_mbuf_set_trans_hdr(&task->mbuf); + tmp_rsp_hdr = (struct xio_rdma_rsp_hdr *) + xio_mbuf_get_curr_ptr(&task->mbuf); + + /* pack relevant values */ + tmp_rsp_hdr->version = rsp_hdr->version; + tmp_rsp_hdr->flags = rsp_hdr->flags; + PACK_SVAL(rsp_hdr, tmp_rsp_hdr, rsp_hdr_len); + /* sn shall be coded later */ + /* ack_sn shall be coded later */ + /* credits shall be coded later */ + PACK_LVAL(rsp_hdr, tmp_rsp_hdr, rtid); + tmp_rsp_hdr->out_ib_op = rsp_hdr->out_ib_op; + PACK_LVAL(rsp_hdr, tmp_rsp_hdr, status); + PACK_SVAL(rsp_hdr, tmp_rsp_hdr, out_num_sge); + PACK_LVAL(rsp_hdr, tmp_rsp_hdr, ltid); + PACK_SVAL(rsp_hdr, tmp_rsp_hdr, ulp_hdr_len); + PACK_SVAL(rsp_hdr, tmp_rsp_hdr, ulp_pad_len); + /* remain_data_len not in use */ + PACK_LLVAL(rsp_hdr, tmp_rsp_hdr, ulp_imm_len); + + hdr_len = sizeof(struct xio_rdma_rsp_hdr); + + /* OUT: responder want to write data via rdma write */ + if (rsp_hdr->out_ib_op == XIO_IB_RDMA_WRITE) { + wr_len = (uint32_t *)((uint8_t *)tmp_rsp_hdr + + sizeof(struct xio_rdma_rsp_hdr)); + + /* params for RDMA WRITE */ + for (i = 0; i < rsp_hdr->out_num_sge; i++) { + *wr_len = htonl(rdma_task->rsp_out_sge[i].length); + wr_len++; + } + hdr_len += sizeof(uint32_t) * rsp_hdr->out_num_sge; + } + if (rsp_hdr->out_ib_op == XIO_IB_RDMA_READ) { + tmp_sge = (struct xio_sge *)((uint8_t *)tmp_rsp_hdr + + sizeof(struct xio_rdma_rsp_hdr)); + + /* OUT: responder want to write data via rdma read */ + for (i = 0; i < rsp_hdr->out_num_sge; i++) { + sge.addr = uint64_from_ptr( + rdma_task->write_reg_mem[i].addr); + sge.length = rdma_task->write_reg_mem[i].length; + if (rdma_task->write_reg_mem[i].mr) { + mr = xio_rdma_mr_lookup( + rdma_task->write_reg_mem[i].mr, + rdma_hndl->tcq->dev); + if (!mr) + goto cleanup; + + sge.stag = mr->rkey; + } else { + sge.stag = 0; + } + PACK_LLVAL(&sge, tmp_sge, addr); + PACK_LVAL(&sge, tmp_sge, length); + PACK_LVAL(&sge, tmp_sge, stag); + tmp_sge++; + } + hdr_len += sizeof(struct xio_sge) * rsp_hdr->out_num_sge; + } + + xio_mbuf_inc(&task->mbuf, hdr_len); + +#ifdef EYAL_TODO + print_hex_dump_bytes("post_send: ", DUMP_PREFIX_ADDRESS, + task->mbuf.tlv.head, 64); +#endif + return 0; + +cleanup: + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_read_rsp_header */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_read_rsp_header(struct xio_rdma_transport *rdma_hndl, + struct xio_task *task, + struct xio_rdma_rsp_hdr *rsp_hdr) +{ + XIO_TO_RDMA_TASK(task, rdma_task); + struct xio_rdma_rsp_hdr *tmp_rsp_hdr; + struct xio_sge *tmp_sge; + size_t hdr_len; + uint32_t *wr_len; + int i; + + /* point to transport header */ + xio_mbuf_set_trans_hdr(&task->mbuf); + tmp_rsp_hdr = (struct xio_rdma_rsp_hdr *) + xio_mbuf_get_curr_ptr(&task->mbuf); + + rsp_hdr->version = tmp_rsp_hdr->version; + rsp_hdr->flags = tmp_rsp_hdr->flags; + UNPACK_SVAL(tmp_rsp_hdr, rsp_hdr, rsp_hdr_len); + + if (unlikely(rsp_hdr->rsp_hdr_len != sizeof(struct xio_rdma_rsp_hdr))) { + ERROR_LOG( + "header length's read failed. 
arrived:%d expected:%zd\n", + rsp_hdr->rsp_hdr_len, sizeof(struct xio_rdma_rsp_hdr)); + return -1; + } + + UNPACK_SVAL(tmp_rsp_hdr, rsp_hdr, sn); + /* ack_sn not used */ + UNPACK_SVAL(tmp_rsp_hdr, rsp_hdr, credits); + UNPACK_LVAL(tmp_rsp_hdr, rsp_hdr, rtid); + rsp_hdr->out_ib_op = tmp_rsp_hdr->out_ib_op; + UNPACK_LVAL(tmp_rsp_hdr, rsp_hdr, status); + UNPACK_SVAL(tmp_rsp_hdr, rsp_hdr, out_num_sge); + UNPACK_LVAL(tmp_rsp_hdr, rsp_hdr, ltid); + UNPACK_SVAL(tmp_rsp_hdr, rsp_hdr, ulp_hdr_len); + UNPACK_SVAL(tmp_rsp_hdr, rsp_hdr, ulp_pad_len); + /* remain_data_len not in use */ + UNPACK_LLVAL(tmp_rsp_hdr, rsp_hdr, ulp_imm_len); + + hdr_len = sizeof(struct xio_rdma_rsp_hdr); + if (rsp_hdr->out_ib_op == XIO_IB_RDMA_WRITE) { + wr_len = (uint32_t *)((uint8_t *)tmp_rsp_hdr + + sizeof(struct xio_rdma_rsp_hdr)); + + /* params for RDMA WRITE */ + for (i = 0; i < rsp_hdr->out_num_sge; i++) { + rdma_task->rsp_out_sge[i].length = ntohl(*wr_len); + wr_len++; + } + rdma_task->rsp_out_num_sge = rsp_hdr->out_num_sge; + + hdr_len += sizeof(uint32_t) * rsp_hdr->out_num_sge; + } + if (rsp_hdr->out_ib_op == XIO_IB_RDMA_READ) { + tmp_sge = (struct xio_sge *)((uint8_t *)tmp_rsp_hdr + + sizeof(struct xio_rdma_rsp_hdr)); + + /* params for RDMA_READ */ + for (i = 0; i < rsp_hdr->out_num_sge; i++) { + UNPACK_LLVAL(tmp_sge, &rdma_task->req_out_sge[i], + addr); + UNPACK_LVAL(tmp_sge, &rdma_task->req_out_sge[i], + length); + UNPACK_LVAL(tmp_sge, &rdma_task->req_out_sge[i], + stag); + tmp_sge++; + } + rdma_task->req_out_num_sge = i; + hdr_len += sizeof(struct xio_sge) * rsp_hdr->out_num_sge; + } + + xio_mbuf_inc(&task->mbuf, hdr_len); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_prep_req_header */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_prep_req_header(struct xio_rdma_transport *rdma_hndl, + struct xio_task *task, + uint16_t ulp_hdr_len, + uint16_t ulp_pad_len, + uint64_t ulp_imm_len, + uint32_t status) +{ + XIO_TO_RDMA_TASK(task, rdma_task); + struct xio_rdma_req_hdr req_hdr; + + if (unlikely(!IS_REQUEST(task->tlv_type))) { + ERROR_LOG("unknown message type\n"); + return -1; + } + + /* write the headers */ + + /* fill request header */ + req_hdr.version = XIO_REQ_HEADER_VERSION; + req_hdr.req_hdr_len = sizeof(req_hdr); + req_hdr.ltid = task->ltid; + req_hdr.in_ib_op = rdma_task->in_ib_op; + req_hdr.out_ib_op = rdma_task->out_ib_op; + req_hdr.flags = 0; + + if (test_bits(XIO_MSG_FLAG_PEER_WRITE_RSP, &task->omsg_flags)) + set_bits(XIO_MSG_FLAG_PEER_WRITE_RSP, &req_hdr.flags); + else if (test_bits(XIO_MSG_FLAG_LAST_IN_BATCH, &task->omsg_flags)) + set_bits(XIO_MSG_FLAG_LAST_IN_BATCH, &req_hdr.flags); + + req_hdr.ulp_hdr_len = ulp_hdr_len; + req_hdr.ulp_pad_len = ulp_pad_len; + req_hdr.ulp_imm_len = ulp_imm_len; + req_hdr.in_num_sge = rdma_task->read_num_reg_mem; + req_hdr.out_num_sge = rdma_task->write_num_reg_mem; + + if (xio_rdma_write_req_header(rdma_hndl, task, &req_hdr) != 0) + goto cleanup; + + /* write the payload header */ + if (ulp_hdr_len) { + if (xio_mbuf_write_array( + &task->mbuf, + task->omsg->out.header.iov_base, + task->omsg->out.header.iov_len) != 0) + goto cleanup; + } + + /* write the pad between header and data */ + if (ulp_pad_len) + xio_mbuf_inc(&task->mbuf, ulp_pad_len); + + return 0; + +cleanup: + xio_set_error(XIO_E_MSG_SIZE); + ERROR_LOG("xio_rdma_write_req_header failed\n"); + return -1; +} + +/*---------------------------------------------------------------------------*/ 
+/* xio_rdma_prep_rsp_header */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_prep_rsp_header(struct xio_rdma_transport *rdma_hndl, + struct xio_task *task, + uint16_t ulp_hdr_len, + uint16_t ulp_pad_len, + uint64_t ulp_imm_len, + uint32_t status) +{ + XIO_TO_RDMA_TASK(task, rdma_task); + struct xio_rdma_rsp_hdr rsp_hdr; + + if (unlikely(!IS_RESPONSE(task->tlv_type))) { + ERROR_LOG("unknown message type\n"); + return -1; + } + + /* fill response header */ + rsp_hdr.version = XIO_RSP_HEADER_VERSION; + rsp_hdr.rsp_hdr_len = sizeof(rsp_hdr); + rsp_hdr.rtid = task->rtid; + rsp_hdr.ltid = task->ltid; + rsp_hdr.out_ib_op = rdma_task->out_ib_op; + rsp_hdr.flags = XIO_HEADER_FLAG_NONE; + if (rdma_task->out_ib_op == XIO_IB_RDMA_READ) + rsp_hdr.out_num_sge = rdma_task->write_num_reg_mem; + else + rsp_hdr.out_num_sge = rdma_task->rsp_out_num_sge; + + rsp_hdr.ulp_hdr_len = ulp_hdr_len; + rsp_hdr.ulp_pad_len = ulp_pad_len; + rsp_hdr.ulp_imm_len = ulp_imm_len; + rsp_hdr.status = status; + if (xio_rdma_write_rsp_header(rdma_hndl, task, &rsp_hdr) != 0) + goto cleanup; + + /* write the payload header */ + if (ulp_hdr_len) { + if (xio_mbuf_write_array( + &task->mbuf, + task->omsg->out.header.iov_base, + task->omsg->out.header.iov_len) != 0) + goto cleanup; + } + + /* write the pad between header and data */ + if (ulp_pad_len) + xio_mbuf_inc(&task->mbuf, ulp_pad_len); + + return 0; + +cleanup: + xio_set_error(XIO_E_MSG_SIZE); + ERROR_LOG("xio_rdma_write_rsp_header failed\n"); + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_write_send_data */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_write_send_data(struct xio_task *task) +{ + XIO_TO_RDMA_TASK(task, rdma_task); + XIO_TO_RDMA_HNDL(task, rdma_hndl); + struct xio_mr *xmr; + struct xio_device *dev = rdma_hndl->tcq->dev; + struct ibv_mr *mr; + struct ibv_sge *sge; + struct xio_sg_table_ops *sgtbl_ops; + void *sgtbl; + void *sg; + size_t i; + + sgtbl = xio_sg_table_get(&task->omsg->out); + sgtbl_ops = (struct xio_sg_table_ops *) + xio_sg_table_ops_get(task->omsg->out.sgl_type); + + /* user provided mr */ + sg = sge_first(sgtbl_ops, sgtbl); + if (sge_mr(sgtbl_ops, sg)) { + sge = &rdma_task->txd.sge[1]; + for_each_sge(sgtbl, sgtbl_ops, sg, i) { + xmr = (struct xio_mr *)sge_mr(sgtbl_ops, sg); + if (unlikely(!xmr)) { + ERROR_LOG("failed to find mr on iov\n"); + goto cleanup; + } + + /* get the corresponding key of the + * outgoing adapter */ + mr = xio_rdma_mr_lookup(xmr, dev); + if (unlikely(!mr)) { + ERROR_LOG("failed to find memory " \ + "handle\n"); + goto cleanup; + } + /* copy the iovec */ + /* send it on registered memory */ + sge->addr = uint64_from_ptr(sge_addr(sgtbl_ops, sg)); + sge->length = (uint32_t)sge_length(sgtbl_ops, sg); + sge->lkey = mr->lkey; + sge++; + } + rdma_task->txd.send_wr.num_sge = + tbl_nents(sgtbl_ops, sgtbl) + 1; + } else { + /* copy to internal buffer */ + for_each_sge(sgtbl, sgtbl_ops, sg, i) { + /* copy the data into internal buffer */ + if (xio_mbuf_write_array( + &task->mbuf, + sge_addr(sgtbl_ops, sg), + sge_length(sgtbl_ops, sg)) != 0) + goto cleanup; + } + rdma_task->txd.send_wr.num_sge = 1; + } + + return 0; + +cleanup: + xio_set_error(XIO_E_MSG_SIZE); + ERROR_LOG("xio_rdma_send_msg failed\n"); + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_prep_rsp_out_data */ 
+/*---------------------------------------------------------------------------*/ +static int xio_rdma_prep_rsp_out_data( + struct xio_rdma_transport *rdma_hndl, + struct xio_task *task) +{ + XIO_TO_RDMA_TASK(task, rdma_task); + struct xio_rdma_rsp_hdr rsp_hdr; + struct xio_sg_table_ops *sgtbl_ops; + void *sgtbl; + void *sg; + struct xio_reg_mem *write_reg_mem; + size_t retval; + uint64_t xio_hdr_len; + uint64_t ulp_imm_len; + uint16_t ulp_hdr_len; + uint16_t ulp_pad_len = 0; + uint32_t i; + int enforce_write_rsp; + + sgtbl = xio_sg_table_get(&task->omsg->out); + sgtbl_ops = (struct xio_sg_table_ops *) + xio_sg_table_ops_get(task->omsg->out.sgl_type); + + /* calculate headers */ + ulp_hdr_len = task->omsg->out.header.iov_len; + ulp_imm_len = tbl_length(sgtbl_ops, sgtbl); + + xio_hdr_len = xio_mbuf_get_curr_offset(&task->mbuf); + xio_hdr_len += sizeof(rsp_hdr); + xio_hdr_len += rdma_task->req_in_num_sge * sizeof(struct xio_sge); + + enforce_write_rsp = (task->imsg_flags && + (task->imsg_flags & + XIO_HEADER_FLAG_PEER_WRITE_RSP)); + + /* + if (rdma_hndl->max_inline_buf_sz < xio_hdr_len + ulp_hdr_len) { + ERROR_LOG("header size %lu exceeds max header %lu\n", + ulp_hdr_len, + rdma_hndl->max_inline_buf_sz - xio_hdr_len); + xio_set_error(XIO_E_MSG_SIZE); + goto cleanup; + } + */ + /* initialize the txd */ + rdma_task->txd.send_wr.num_sge = 1; + + if (g_options.inline_xio_data_align && ulp_imm_len) { + uint16_t hdr_len = xio_hdr_len + ulp_hdr_len; + + ulp_pad_len = ALIGN(hdr_len, g_options.inline_xio_data_align) - + hdr_len; + } + + /* Small data is outgoing via SEND unless the requester explicitly + * insisted on RDMA operation and provided resources. + * One sge is reserved for the header + */ + if ((ulp_imm_len == 0) || + (!enforce_write_rsp && + (tbl_nents(sgtbl_ops, sgtbl) <= + (size_t)(rdma_hndl->max_sge - 1)) && + ((xio_hdr_len + ulp_hdr_len + ulp_pad_len + ulp_imm_len) < + (uint64_t)rdma_hndl->max_inline_buf_sz))) { + rdma_task->out_ib_op = XIO_IB_SEND; + /* write xio header to the buffer */ + retval = xio_rdma_prep_rsp_header( + rdma_hndl, task, + ulp_hdr_len, ulp_pad_len, ulp_imm_len, + XIO_E_SUCCESS); + if (retval) + goto cleanup; + + /* if there is data, set it to buffer or directly to the sge */ + if (ulp_imm_len) { + retval = xio_rdma_write_send_data(task); + if (retval) + goto cleanup; + } else { + /* no data at all */ + tbl_set_nents(sgtbl_ops, sgtbl, 0); + } + } else { + if (rdma_task->req_in_sge[0].addr && + rdma_task->req_in_sge[0].length && + rdma_task->req_in_sge[0].stag) { + /* the data is sent via RDMA_WRITE */ + rdma_task->out_ib_op = XIO_IB_RDMA_WRITE; + + /* prepare rdma write */ + xio_sched_rdma_wr_req(rdma_hndl, task); + + /* and the header is sent via SEND */ + /* write xio header to the buffer */ + retval = xio_rdma_prep_rsp_header( + rdma_hndl, task, + ulp_hdr_len, 0, ulp_imm_len, + XIO_E_SUCCESS); + } else { +#if 0 + DEBUG_LOG("partial completion of request due " \ + "to missing, response buffer\n"); + + rdma_task->out_ib_op = XIO_IB_SEND; + + /* the client did not provide buffer for response */ + retval = xio_rdma_prep_rsp_header( + rdma_hndl, task, + ulp_hdr_len, 0, 0, + XIO_E_RSP_BUF_SIZE_MISMATCH); + + tbl_set_nents(sgtbl_ops, sgtbl, 0); +#else + /* the data is outgoing via SEND but the peer will do + * RDMA_READ */ + rdma_task->out_ib_op = XIO_IB_RDMA_READ; + /* user provided mr */ + sg = sge_first(sgtbl_ops, sgtbl); + if (sge_mr(sgtbl_ops, sg)) { + write_reg_mem = rdma_task->write_reg_mem; + for_each_sge(sgtbl, sgtbl_ops, sg, i) { + write_reg_mem->addr = + 
sge_addr(sgtbl_ops, sg); + write_reg_mem->priv = NULL; + write_reg_mem->mr = (struct xio_mr *) + sge_mr(sgtbl_ops, sg); + write_reg_mem->length = + sge_length(sgtbl_ops, sg); + write_reg_mem++; + } + } else { + if (!rdma_hndl->rdma_mempool) { + xio_set_error(XIO_E_NO_BUFS); + ERROR_LOG("message /read/write " \ + "failed - library's " \ + "memory pool disabled\n"); + goto cleanup1; + } + + /* user did not provide mr - + * take buffers from pool and do copy */ + write_reg_mem = rdma_task->write_reg_mem; + for_each_sge(sgtbl, sgtbl_ops, sg, i) { + retval = xio_mempool_alloc( + rdma_hndl->rdma_mempool, + sge_length(sgtbl_ops, sg), + write_reg_mem); + if (unlikely(retval)) { + rdma_task->write_num_reg_mem + = i; + xio_set_error(ENOMEM); + ERROR_LOG("mempool is empty" \ + "for %zd bytes\n", + sge_length(sgtbl_ops, + sg)); + goto cleanup1; + } + + write_reg_mem->length = + sge_length(sgtbl_ops, sg); + + /* copy the data to the buffer */ + memcpy(write_reg_mem->addr, + sge_addr(sgtbl_ops, sg), + sge_length(sgtbl_ops, sg)); + write_reg_mem++; + } + } + rdma_task->write_num_reg_mem = + tbl_nents(sgtbl_ops, sgtbl); + + /* write xio header to the buffer */ + retval = xio_rdma_prep_rsp_header( + rdma_hndl, task, + ulp_hdr_len, 0, 0, XIO_E_SUCCESS); + + if (unlikely(retval)) { + ERROR_LOG("Failed to write header\n"); + goto cleanup1; + } + +#endif + } + } + + return 0; +#if 1 +cleanup1: + for (i = 0; i < rdma_task->write_num_reg_mem; i++) + xio_mempool_free(&rdma_task->write_reg_mem[i]); + + rdma_task->write_num_reg_mem = 0; + + return -1; +#endif + +cleanup: + xio_set_error(XIO_E_MSG_SIZE); + ERROR_LOG("xio_rdma_send_msg failed\n"); + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_prep_req_out_data */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_prep_req_out_data( + struct xio_rdma_transport *rdma_hndl, + struct xio_task *task) +{ + XIO_TO_RDMA_TASK(task, rdma_task); + struct xio_vmsg *vmsg = &task->omsg->out; + struct xio_sg_table_ops *sgtbl_ops; + void *sgtbl; + void *sg; + uint64_t xio_hdr_len; + uint64_t xio_max_hdr_len; + uint64_t ulp_imm_len; + size_t retval; + uint16_t ulp_hdr_len; + uint16_t ulp_pad_len = 0; + unsigned int i; + int nents; + int tx_by_sr; + + sgtbl = xio_sg_table_get(&task->omsg->out); + + sgtbl_ops = (struct xio_sg_table_ops *) + xio_sg_table_ops_get(task->omsg->out.sgl_type); + nents = tbl_nents(sgtbl_ops, sgtbl); + + /* calculate headers */ + ulp_hdr_len = vmsg->header.iov_len; + ulp_imm_len = tbl_length(sgtbl_ops, sgtbl); + + xio_hdr_len = xio_mbuf_get_curr_offset(&task->mbuf); + xio_hdr_len += sizeof(struct xio_rdma_req_hdr); + xio_hdr_len += sizeof(struct xio_sge) * rdma_task->req_in_num_sge; + xio_max_hdr_len = xio_hdr_len + sizeof(struct xio_sge) * nents; + + if (g_options.inline_xio_data_align && ulp_imm_len) { + uint16_t hdr_len = xio_hdr_len + ulp_hdr_len; + + ulp_pad_len = ALIGN(hdr_len, g_options.inline_xio_data_align) - + hdr_len; + } + + /* initialize the txd */ + rdma_task->txd.send_wr.num_sge = 1; + + if (test_bits(XIO_MSG_FLAG_PEER_READ_REQ, &task->omsg_flags) && nents) + tx_by_sr = 0; + else + /* test for using send/receive or rdma_read */ + tx_by_sr = (nents <= (rdma_hndl->max_sge - 1) && + ((ulp_hdr_len + ulp_pad_len + + ulp_imm_len + xio_max_hdr_len) <= + rdma_hndl->max_inline_buf_sz) && + (((int)(ulp_imm_len) <= + g_options.max_inline_xio_data) || + ulp_imm_len == 0)); + + /* The data is outgoing via SEND + * One sge is reserved for the 
header + */ + if (tx_by_sr) { + rdma_task->out_ib_op = XIO_IB_SEND; + /* user has small request - no rdma operation expected */ + rdma_task->write_num_reg_mem = 0; + + /* write xio header to the buffer */ + retval = xio_rdma_prep_req_header( + rdma_hndl, task, + ulp_hdr_len, ulp_pad_len, ulp_imm_len, + XIO_E_SUCCESS); + if (unlikely(retval)) + return -1; + + /* if there is data, set it to buffer or directly to the sge */ + if (ulp_imm_len) { + retval = xio_rdma_write_send_data(task); + if (unlikely(retval)) + return -1; + } + } else { + /* the data is outgoing via SEND but the peer will do + * RDMA_READ */ + rdma_task->out_ib_op = XIO_IB_RDMA_READ; + /* user provided mr */ + sg = sge_first(sgtbl_ops, sgtbl); + if (sge_mr(sgtbl_ops, sg)) { + for_each_sge(sgtbl, sgtbl_ops, sg, i) { + rdma_task->write_reg_mem[i].addr = + sge_addr(sgtbl_ops, sg); + rdma_task->write_reg_mem[i].priv = NULL; + rdma_task->write_reg_mem[i].mr = + (struct xio_mr *)sge_mr(sgtbl_ops, sg); + rdma_task->write_reg_mem[i].length = + sge_length(sgtbl_ops, sg); + } + } else { + if (!rdma_hndl->rdma_mempool) { + xio_set_error(XIO_E_NO_BUFS); + ERROR_LOG( + "message /read/write failed - " \ + "library's memory pool disabled\n"); + goto cleanup; + } + + /* user did not provide mr - take buffers from pool + * and do copy */ + for_each_sge(sgtbl, sgtbl_ops, sg, i) { + retval = xio_mempool_alloc( + rdma_hndl->rdma_mempool, + sge_length(sgtbl_ops, sg), + &rdma_task->write_reg_mem[i]); + if (unlikely(retval)) { + rdma_task->write_num_reg_mem = i; + xio_set_error(ENOMEM); + ERROR_LOG( + "mempool is empty for %zd bytes\n", + sge_length(sgtbl_ops, sg)); + goto cleanup; + } + + rdma_task->write_reg_mem[i].length = + sge_length(sgtbl_ops, sg); + + /* copy the data to the buffer */ + memcpy(rdma_task->write_reg_mem[i].addr, + sge_addr(sgtbl_ops, sg), + sge_length(sgtbl_ops, sg)); + } + } + rdma_task->write_num_reg_mem = tbl_nents(sgtbl_ops, sgtbl); + + /* write xio header to the buffer */ + retval = xio_rdma_prep_req_header( + rdma_hndl, task, + ulp_hdr_len, 0, 0, XIO_E_SUCCESS); + + if (unlikely(retval)) { + ERROR_LOG("Failed to write header\n"); + goto cleanup; + } + } + + return 0; + +cleanup: + for (i = 0; i < rdma_task->write_num_reg_mem; i++) + xio_mempool_free(&rdma_task->write_reg_mem[i]); + + rdma_task->write_num_reg_mem = 0; + + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_prep_req_in_data */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_prep_req_in_data( + struct xio_rdma_transport *rdma_hndl, + struct xio_task *task) +{ + XIO_TO_RDMA_TASK(task, rdma_task); + size_t hdr_len; + size_t xio_hdr_len; + size_t data_len; + struct xio_vmsg *vmsg = &task->omsg->in; + unsigned int i; + int retval; + struct xio_sg_table_ops *sgtbl_ops; + void *sgtbl; + void *sg; + int enforce_write_rsp; + int nents; + + sgtbl = xio_sg_table_get(&task->omsg->in); + sgtbl_ops = (struct xio_sg_table_ops *) + xio_sg_table_ops_get(task->omsg->in.sgl_type); + nents = tbl_nents(sgtbl_ops, sgtbl); + + if (nents == 0) { + rdma_task->in_ib_op = XIO_IB_SEND; + rdma_task->read_num_reg_mem = 0; + return 0; + } + + data_len = tbl_length(sgtbl_ops, sgtbl); + hdr_len = vmsg->header.iov_len; + if (hdr_len && hdr_len >= rdma_hndl->peer_max_header) { + ERROR_LOG("hdr_len=%d is bigger than peer_max_reader=%d\n", + hdr_len, rdma_hndl->peer_max_header); + return -1; + } + + /* before working on the out - current place after the session header */ + 
xio_hdr_len = xio_mbuf_get_curr_offset(&task->mbuf); + xio_hdr_len += sizeof(struct xio_rdma_rsp_hdr); + xio_hdr_len += sizeof(struct xio_sge) * nents; + + /* requester may insist on RDMA for small buffers to eliminate copy + * from receive buffers to user buffers + */ + enforce_write_rsp = task->omsg_flags & XIO_MSG_FLAG_PEER_WRITE_RSP; + if (!enforce_write_rsp && + data_len + hdr_len + xio_hdr_len < rdma_hndl->max_inline_buf_sz) { + /* user has small response - no rdma operation expected */ + rdma_task->in_ib_op = XIO_IB_SEND; + rdma_task->read_num_reg_mem = (data_len) ? nents : 0; + } else { + /* user provided buffers with length for RDMA WRITE */ + /* user provided mr */ + rdma_task->in_ib_op = XIO_IB_RDMA_WRITE; + sg = sge_first(sgtbl_ops, sgtbl); + if (sge_mr(sgtbl_ops, sg)) { + for_each_sge(sgtbl, sgtbl_ops, sg, i) { + rdma_task->read_reg_mem[i].addr = + sge_addr(sgtbl_ops, sg); + rdma_task->read_reg_mem[i].priv = NULL; + rdma_task->read_reg_mem[i].mr = + (struct xio_mr *)sge_mr(sgtbl_ops, sg); + rdma_task->read_reg_mem[i].length = + sge_length(sgtbl_ops, sg); + } + } else { + if (!rdma_hndl->rdma_mempool) { + xio_set_error(XIO_E_NO_BUFS); + ERROR_LOG( + "message /read/write failed - " \ + "library's memory pool disabled\n"); + goto cleanup; + } + + /* user did not provide mr */ + for_each_sge(sgtbl, sgtbl_ops, sg, i) { + retval = xio_mempool_alloc( + rdma_hndl->rdma_mempool, + sge_length(sgtbl_ops, sg), + &rdma_task->read_reg_mem[i]); + + if (unlikely(retval)) { + rdma_task->read_num_reg_mem = i; + xio_set_error(ENOMEM); + ERROR_LOG( + "mempool is empty for %zd bytes\n", + sge_length(sgtbl_ops, sg)); + goto cleanup; + } + rdma_task->read_reg_mem[i].length = + sge_length(sgtbl_ops, sg); + } + } + rdma_task->read_num_reg_mem = nents; + } + /* + if (rdma_task->read_num_reg_mem > rdma_hndl->peer_max_out_iovsz) { + ERROR_LOG("request in iovlen %d is bigger then peer " \ + "max out iovlen %d\n", + rdma_task->read_num_reg_mem, + rdma_hndl->peer_max_out_iovsz); + goto cleanup; + } + */ + + return 0; + +cleanup: + for (i = 0; i < rdma_task->read_num_reg_mem; i++) + xio_mempool_free(&rdma_task->read_reg_mem[i]); + + rdma_task->req_in_num_sge = 0; + xio_set_error(EMSGSIZE); + + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* verify_req_send_limits */ +/*---------------------------------------------------------------------------*/ +static int verify_req_send_limits(const struct xio_rdma_transport *rdma_hndl) +{ + if (rdma_hndl->reqs_in_flight_nr + rdma_hndl->rsps_in_flight_nr > + rdma_hndl->max_tx_ready_tasks_num) { + DEBUG_LOG("over limits reqs_in_flight_nr=%u, "\ + "rsps_in_flight_nr=%u, max_tx_ready_tasks_num=%u\n", + rdma_hndl->reqs_in_flight_nr, + rdma_hndl->rsps_in_flight_nr, + rdma_hndl->max_tx_ready_tasks_num); + xio_set_error(EAGAIN); + return -1; + } + + if (rdma_hndl->reqs_in_flight_nr >= + rdma_hndl->max_tx_ready_tasks_num - 1) { + DEBUG_LOG("over limits reqs_in_flight_nr=%u, " \ + "max_tx_ready_tasks_num=%u\n", + rdma_hndl->reqs_in_flight_nr, + rdma_hndl->max_tx_ready_tasks_num); + + xio_set_error(EAGAIN); + return -1; + } + /* tx ready is full - refuse request */ + if (rdma_hndl->tx_ready_tasks_num >= + rdma_hndl->max_tx_ready_tasks_num) { + DEBUG_LOG("over limits tx_ready_tasks_num=%u, "\ + "max_tx_ready_tasks_num=%u\n", + rdma_hndl->tx_ready_tasks_num, + rdma_hndl->max_tx_ready_tasks_num); + xio_set_error(EAGAIN); + return -1; + } + return 0; +} + 
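/*
 * Editor's note: illustrative sketch only, not part of this patch.
 * The xio_rdma_prep_*_data routines above all make the same basic choice:
 * if the xio header + user header + padding + payload fits in one inline
 * send buffer (and the peer did not explicitly insist on an RDMA
 * operation), the message goes out via SEND; otherwise the payload is
 * registered/exposed and transferred with RDMA READ/WRITE.  The decision
 * reduces to a predicate like the one below (hypothetical names).
 */
#include <stdbool.h>
#include <stdint.h>
#include <stddef.h>

static bool demo_send_fits_inline(size_t xio_hdr_len, size_t ulp_hdr_len,
				  size_t ulp_pad_len, uint64_t ulp_imm_len,
				  size_t nents, int max_sge,
				  size_t max_inline_buf_sz,
				  bool peer_wants_rdma)
{
	if (peer_wants_rdma && ulp_imm_len)
		return false;		/* requester insisted on RDMA */
	if (nents > (size_t)(max_sge - 1))
		return false;		/* one sge is reserved for the header */
	return xio_hdr_len + ulp_hdr_len + ulp_pad_len + ulp_imm_len <
	       max_inline_buf_sz;
}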
+/*---------------------------------------------------------------------------*/ +/* verify_rsp_send_limits */ +/*---------------------------------------------------------------------------*/ +static int verify_rsp_send_limits(const struct xio_rdma_transport *rdma_hndl) +{ + if (rdma_hndl->reqs_in_flight_nr + rdma_hndl->rsps_in_flight_nr > + rdma_hndl->max_tx_ready_tasks_num) { + DEBUG_LOG("over limits reqs_in_flight_nr=%u, "\ + "rsps_in_flight_nr=%u, max_tx_ready_tasks_num=%u\n", + rdma_hndl->reqs_in_flight_nr, + rdma_hndl->rsps_in_flight_nr, + rdma_hndl->max_tx_ready_tasks_num); + xio_set_error(EAGAIN); + return -1; + } + + if (rdma_hndl->rsps_in_flight_nr >= + rdma_hndl->max_tx_ready_tasks_num - 1) { + DEBUG_LOG("over limits rsps_in_flight_nr=%u, " \ + "max_tx_ready_tasks_num=%u\n", + rdma_hndl->rsps_in_flight_nr, + rdma_hndl->max_tx_ready_tasks_num); + + xio_set_error(EAGAIN); + return -1; + } + /* tx ready is full - refuse request */ + if (rdma_hndl->tx_ready_tasks_num >= + rdma_hndl->max_tx_ready_tasks_num) { + DEBUG_LOG("over limits tx_ready_tasks_num=%u, "\ + "max_tx_ready_tasks_num=%u\n", + rdma_hndl->tx_ready_tasks_num, + rdma_hndl->max_tx_ready_tasks_num); + xio_set_error(EAGAIN); + return -1; + } + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* kick_send_and_read */ +/*---------------------------------------------------------------------------*/ +static int kick_send_and_read(struct xio_rdma_transport *rdma_hndl, + struct xio_task *task, + int must_send) +{ + int retval = 0; + + /* transmit only if available */ + if (test_bits(XIO_MSG_FLAG_LAST_IN_BATCH, &task->omsg->flags) || + task->is_control) { + must_send = 1; + } else { + if (tx_window_sz(rdma_hndl) >= SEND_THRESHOLD) + must_send = 1; + } + + /* resource are now available and rdma rd requests are pending kick + * them + */ + if (rdma_hndl->kick_rdma_rd_req) { + retval = xio_xmit_rdma_rd_req(rdma_hndl); + if (retval) { + retval = xio_errno(); + if (retval != EAGAIN) { + ERROR_LOG("xio_xmit_rdma_rd_req failed. %s\n", + xio_strerror(retval)); + return -1; + } + retval = 0; + } + } + if (rdma_hndl->kick_rdma_rd_rsp) { + retval = xio_xmit_rdma_rd_rsp(rdma_hndl); + if (retval) { + retval = xio_errno(); + if (retval != EAGAIN) { + ERROR_LOG("xio_xmit_rdma_rd_rsp failed. %s\n", + xio_strerror(retval)); + return -1; + } + retval = 0; + } + } + if (must_send) { + retval = xio_rdma_xmit(rdma_hndl); + if (retval) { + retval = xio_errno(); + if (retval != EAGAIN) { + ERROR_LOG("xio_xmit_rdma failed. 
%s\n", + xio_strerror(retval)); + return -1; + } + retval = 0; + } + } + return retval; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_send_req */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_send_req(struct xio_rdma_transport *rdma_hndl, + struct xio_task *task) +{ + XIO_TO_RDMA_TASK(task, rdma_task); + struct xio_mbuf *mbuf = &task->mbuf; + struct xio_work_req *txd; + struct ibv_sge *sge; + uint64_t payload; + size_t retval; + size_t sge_len; + int i; + int must_send = 0; + + if (unlikely(verify_req_send_limits(rdma_hndl))) + return -1; + + /* prepare buffer for RDMA response */ + retval = xio_rdma_prep_req_in_data(rdma_hndl, task); + if (unlikely(retval != 0)) { + ERROR_LOG("rdma_prep_req_in_data failed\n"); + return -1; + } + /* prepare the out message */ + retval = xio_rdma_prep_req_out_data(rdma_hndl, task); + if (unlikely(retval != 0)) { + ERROR_LOG("rdma_prep_req_out_data failed\n"); + return -1; + } + + payload = xio_mbuf_tlv_payload_len(mbuf); + + /* add tlv */ + if (xio_mbuf_write_tlv(mbuf, task->tlv_type, payload) != 0) { + ERROR_LOG("write tlv failed\n"); + xio_set_error(EOVERFLOW); + return -1; + } + + txd = &rdma_task->txd; + sge = &txd->sge[0]; + + /* set the length */ + sge->length = xio_mbuf_get_curr_offset(mbuf); + sge_len = sge->length; + + /* validate header */ + if (unlikely(XIO_TLV_LEN + payload != sge_len)) { + ERROR_LOG("header validation failed\n"); + return -1; + } + xio_task_addref(task); + + /* check for inline */ + txd->send_wr.send_flags = 0; + + sge++; + for (i = 1; i < txd->send_wr.num_sge; i++) { + sge_len += sge->length; + sge++; + } + + if (sge_len < (size_t)rdma_hndl->max_inline_data) + txd->send_wr.send_flags |= IBV_SEND_INLINE; + + if (IS_FIN(task->tlv_type)) { + txd->send_wr.send_flags |= IBV_SEND_FENCE; + must_send = 1; + } + + if (unlikely(++rdma_hndl->req_sig_cnt >= HARD_CQ_MOD || + task->is_control || + task->omsg->flags & XIO_MSG_FLAG_IMM_SEND_COMP)) { + /* avoid race between send completion and response arrival */ + txd->send_wr.send_flags |= IBV_SEND_SIGNALED; + rdma_hndl->req_sig_cnt = 0; + } + + rdma_task->out_ib_op = XIO_IB_SEND; + + list_move_tail(&task->tasks_list_entry, &rdma_hndl->tx_ready_list); + + rdma_hndl->tx_ready_tasks_num++; + + return kick_send_and_read(rdma_hndl, task, must_send); +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_send_rsp */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_send_rsp(struct xio_rdma_transport *rdma_hndl, + struct xio_task *task) +{ + XIO_TO_RDMA_TASK(task, rdma_task); + struct xio_mbuf *mbuf = &task->mbuf; + struct xio_work_req *txd; + struct ibv_sge *sge; + size_t retval; + size_t sge_len; + uint64_t payload; + int i; + int must_send = 0; + + if (unlikely(verify_rsp_send_limits(rdma_hndl))) + return -1; + + /* prepare the out message */ + retval = xio_rdma_prep_rsp_out_data(rdma_hndl, task); + if (unlikely(retval != 0)) { + ERROR_LOG("rdma_prep_rsp_out_data failed\n"); + goto cleanup; + } + + payload = xio_mbuf_tlv_payload_len(mbuf); + + /* add tlv */ + if (xio_mbuf_write_tlv(mbuf, task->tlv_type, payload) != 0) + goto cleanup; + + txd = &rdma_task->txd; + sge = &txd->sge[0]; + + /* set the length */ + sge->length = xio_mbuf_get_curr_offset(mbuf); + sge_len = sge->length; + + /* validate header */ + if (unlikely(XIO_TLV_LEN + payload != sge_len)) { + ERROR_LOG("header validation 
failed\n"); + goto cleanup; + } + + txd->send_wr.send_flags = 0; + if (++rdma_hndl->rsp_sig_cnt >= SOFT_CQ_MOD || task->is_control || + task->omsg->flags & XIO_MSG_FLAG_IMM_SEND_COMP) { + rdma_task->txd.send_wr.send_flags |= IBV_SEND_SIGNALED; + rdma_hndl->rsp_sig_cnt = 0; + } + + /* check for inline */ + if (rdma_task->out_ib_op == XIO_IB_SEND || + rdma_task->out_ib_op == XIO_IB_RDMA_READ) { + sge++; + for (i = 1; i < txd->send_wr.num_sge; i++) { + sge_len += sge->length; + sge++; + } + + if (sge_len < (size_t)rdma_hndl->max_inline_data) + txd->send_wr.send_flags |= IBV_SEND_INLINE; + + list_move_tail(&task->tasks_list_entry, + &rdma_hndl->tx_ready_list); + rdma_hndl->tx_ready_tasks_num++; + } + + if (IS_FIN(task->tlv_type)) { + rdma_task->txd.send_wr.send_flags |= IBV_SEND_FENCE; + must_send = 1; + } + if (rdma_task->out_ib_op == XIO_IB_RDMA_READ) + xio_task_addref(task); + + return kick_send_and_read(rdma_hndl, task, must_send); +cleanup: + xio_set_error(XIO_E_MSG_SIZE); + ERROR_LOG("xio_rdma_send_msg failed\n"); + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_on_rsp_send_comp */ +/*---------------------------------------------------------------------------*/ +int xio_rdma_on_rsp_send_comp(struct xio_rdma_transport *rdma_hndl, + struct xio_task *task) +{ + XIO_TO_RDMA_TASK(task, rdma_task); + union xio_transport_event_data event_data; + + if (rdma_task->out_ib_op == XIO_IB_RDMA_READ) { + xio_tasks_pool_put(task); + return 0; + } + + if (IS_CANCEL(task->tlv_type)) + return 0; + + event_data.msg.op = XIO_WC_OP_SEND; + event_data.msg.task = task; + + xio_transport_notify_observer(&rdma_hndl->base, + XIO_TRANSPORT_EVENT_SEND_COMPLETION, + &event_data); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_on_req_send_comp */ +/*---------------------------------------------------------------------------*/ +int xio_rdma_on_req_send_comp(struct xio_rdma_transport *rdma_hndl, + struct xio_task *task) +{ + union xio_transport_event_data event_data; + + if (IS_CANCEL(task->tlv_type)) + return 0; + + event_data.msg.op = XIO_WC_OP_SEND; + event_data.msg.task = task; + + xio_transport_notify_observer(&rdma_hndl->base, + XIO_TRANSPORT_EVENT_SEND_COMPLETION, + &event_data); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_on_direct_rdma_comp */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_on_direct_rdma_comp(struct xio_rdma_transport *rdma_hndl, + struct xio_task *task, + enum xio_wc_op op) +{ + union xio_transport_event_data event_data; + + event_data.msg.op = op; + event_data.msg.task = task; + xio_transport_notify_observer( + &rdma_hndl->base, + XIO_TRANSPORT_EVENT_DIRECT_RDMA_COMPLETION, + &event_data); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_post_recv_rsp */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_post_recv_rsp(struct xio_task *task) +{ + struct xio_msg *imsg; + struct xio_msg *omsg; + struct xio_sg_table_ops *isgtbl_ops; + void *isgtbl; + struct xio_sg_table_ops *osgtbl_ops; + void *osgtbl; + + omsg = task->sender_task->omsg; + imsg = &task->imsg; + isgtbl = xio_sg_table_get(&imsg->in); + isgtbl_ops = (struct xio_sg_table_ops *) + xio_sg_table_ops_get(imsg->in.sgl_type); + osgtbl = xio_sg_table_get(&omsg->in); + osgtbl_ops 
= (struct xio_sg_table_ops *) + xio_sg_table_ops_get(omsg->in.sgl_type); + + /* use provided only length - set user + * pointers */ + tbl_clone(osgtbl_ops, osgtbl, isgtbl_ops, isgtbl); + + /* also set bits */ + if (test_bits(XIO_MSG_HINT_ASSIGNED_DATA_IN_BUF, &imsg->hints)) + set_bits(XIO_MSG_HINT_ASSIGNED_DATA_IN_BUF, &omsg->hints); + else + clr_bits(XIO_MSG_HINT_ASSIGNED_DATA_IN_BUF, &omsg->hints); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_on_recv_rsp */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_on_recv_rsp(struct xio_rdma_transport *rdma_hndl, + struct xio_task *task) +{ + XIO_TO_RDMA_TASK(task, rdma_task); + XIO_TO_RDMA_TASK(task, rdma_sender_task); + struct xio_task *sender_task; + union xio_transport_event_data event_data; + struct xio_rdma_rsp_hdr rsp_hdr; + struct xio_sg_table_ops *isgtbl_ops; + struct xio_sg_table_ops *osgtbl_ops; + struct xio_msg *imsg; + struct xio_msg *omsg; + void *ulp_hdr; + void *isgtbl; + void *osgtbl; + void *sg; + unsigned int i; + int retval = 0; + + /* read the response header */ + retval = xio_rdma_read_rsp_header(rdma_hndl, task, &rsp_hdr); + if (retval != 0) { + xio_set_error(XIO_E_MSG_INVALID); + goto cleanup; + } + /* update receive + send window */ + if (rdma_hndl->exp_sn == rsp_hdr.sn) { + rdma_hndl->exp_sn++; + rdma_hndl->ack_sn = rsp_hdr.sn; + rdma_hndl->peer_credits += rsp_hdr.credits; + } else { + ERROR_LOG("ERROR: expected sn:%d, arrived sn:%d\n", + rdma_hndl->exp_sn, rsp_hdr.sn); + } + /* read the sn */ + rdma_task->sn = rsp_hdr.sn; + + /* find the sender task */ + sender_task = + xio_rdma_primary_task_lookup(rdma_hndl, + rsp_hdr.rtid); + task->sender_task = sender_task; + rdma_sender_task = (struct xio_rdma_task *)sender_task->dd_data; + task->rtid = rsp_hdr.ltid; + + /* mark the sender task as arrived */ + sender_task->state = XIO_TASK_STATE_RESPONSE_RECV; + + omsg = sender_task->omsg; + imsg = &task->imsg; + isgtbl = xio_sg_table_get(&imsg->in); + isgtbl_ops = (struct xio_sg_table_ops *) + xio_sg_table_ops_get(imsg->in.sgl_type); + osgtbl = xio_sg_table_get(&omsg->in); + osgtbl_ops = (struct xio_sg_table_ops *) + xio_sg_table_ops_get(omsg->in.sgl_type); + + clr_bits(XIO_MSG_HINT_ASSIGNED_DATA_IN_BUF, &imsg->hints); + + ulp_hdr = xio_mbuf_get_curr_ptr(&task->mbuf); + /* msg from received message */ + if (rsp_hdr.ulp_hdr_len) { + imsg->in.header.iov_base = ulp_hdr; + imsg->in.header.iov_len = rsp_hdr.ulp_hdr_len; + } else { + imsg->in.header.iov_base = NULL; + imsg->in.header.iov_len = 0; + } + task->status = rsp_hdr.status; + + /* handle the headers */ + if (omsg->in.header.iov_base) { + /* copy header to user buffers */ + size_t hdr_len = 0; + + if (imsg->in.header.iov_len > omsg->in.header.iov_len) { + hdr_len = omsg->in.header.iov_len; + task->status = XIO_E_MSG_SIZE; + } else { + hdr_len = imsg->in.header.iov_len; + task->status = XIO_E_SUCCESS; + } + if (hdr_len && imsg->in.header.iov_base) + memcpy(omsg->in.header.iov_base, + imsg->in.header.iov_base, + hdr_len); + else + *((char *)omsg->in.header.iov_base) = 0; + + omsg->in.header.iov_len = hdr_len; + } else { + /* no copy - just pointers */ + memclonev(&omsg->in.header, 1, &imsg->in.header, 1); + } + + switch (rsp_hdr.out_ib_op) { + case XIO_IB_SEND: + /* if data arrived, set the pointers */ + if (rsp_hdr.ulp_imm_len) { + tbl_set_nents(isgtbl_ops, isgtbl, 1); + sg = sge_first(isgtbl_ops, isgtbl); + sge_set_addr(isgtbl_ops, sg, + (ulp_hdr + 
imsg->in.header.iov_len + + rsp_hdr.ulp_pad_len)); + sge_set_length(isgtbl_ops, sg, + rsp_hdr.ulp_imm_len); + } else { + tbl_set_nents(isgtbl_ops, isgtbl, 0); + } + if (tbl_nents(osgtbl_ops, osgtbl)) { + /* deep copy */ + if (tbl_nents(isgtbl_ops, isgtbl)) { + size_t idata_len = + tbl_length(isgtbl_ops, isgtbl); + size_t odata_len = + tbl_length(osgtbl_ops, osgtbl); + + if (idata_len > odata_len) { + task->status = XIO_E_MSG_SIZE; + goto partial_msg; + } else { + task->status = XIO_E_SUCCESS; + } + sg = sge_first(osgtbl_ops, osgtbl); + if (sge_addr(osgtbl_ops, sg)) { + /* user provided buffer so do copy */ + tbl_copy(osgtbl_ops, osgtbl, + isgtbl_ops, isgtbl); + } else { + /* use provided only length - set user + * pointers */ + tbl_clone(osgtbl_ops, osgtbl, + isgtbl_ops, isgtbl); + } + } else { + tbl_set_nents(osgtbl_ops, osgtbl, + tbl_nents(isgtbl_ops, isgtbl)); + } + } else { + tbl_clone(osgtbl_ops, osgtbl, + isgtbl_ops, isgtbl); + } + break; + case XIO_IB_RDMA_WRITE: + if (rdma_task->rsp_out_num_sge > + rdma_sender_task->read_num_reg_mem) { + ERROR_LOG("local in data_iovec is too small %d < %d\n", + rdma_sender_task->read_num_reg_mem, + rdma_task->rsp_out_num_sge); + goto partial_msg; + } + + tbl_set_nents(isgtbl_ops, isgtbl, + rdma_task->rsp_out_num_sge); + + sg = sge_first(isgtbl_ops, isgtbl); + for (i = 0; i < rdma_task->rsp_out_num_sge; i++) { + sge_set_addr(isgtbl_ops, sg, + ptr_from_int64( + rdma_sender_task->read_reg_mem[i].addr)); + sge_set_length(isgtbl_ops, sg, + rdma_task->rsp_out_sge[i].length); + sg = sge_next(isgtbl_ops, isgtbl, sg); + } + + if (tbl_nents(osgtbl_ops, osgtbl)) { + /* user provided mr */ + sg = sge_first(osgtbl_ops, osgtbl); + if (sge_mr(osgtbl_ops, sg)) { + void *isg; + /* data was copied directly to user buffer */ + /* need to update the buffer length */ + for_each_sge(isgtbl, isgtbl_ops, isg, i) { + sge_set_length( + osgtbl_ops, sg, + sge_length(isgtbl_ops, + isg)); + + sg = sge_next(osgtbl_ops, + osgtbl, sg); + } + tbl_set_nents(osgtbl_ops, osgtbl, + tbl_nents(isgtbl_ops, isgtbl)); + } else { + /* user provided buffer but not mr */ + /* deep copy */ + if (sge_addr(osgtbl_ops, sg)) { + tbl_copy(osgtbl_ops, osgtbl, + isgtbl_ops, isgtbl); + /* put buffers back to pool */ + for ( + i = 0; + i < rdma_sender_task->read_num_reg_mem; + i++) { + xio_mempool_free( + &rdma_sender_task->read_reg_mem[i]); + rdma_sender_task->read_reg_mem[i].priv = + NULL; + } + rdma_sender_task->read_num_reg_mem = 0; + } else { + /* use provided only length - set user + * pointers */ + tbl_clone(osgtbl_ops, osgtbl, + isgtbl_ops, isgtbl); + } + } + } else { + ERROR_LOG("empty out message\n"); + } + break; + case XIO_IB_RDMA_READ: + /* schedule request for RDMA READ. 
in case of error + * don't schedule the rdma read operation */ + /*TRACE_LOG("scheduling rdma read\n");*/ + retval = xio_sched_rdma_rd(rdma_hndl, task); + if (retval == 0) + return 0; + ERROR_LOG("scheduling rdma read failed\n"); + break; + + default: + ERROR_LOG("%s unexpected op 0x%x\n", __func__, + rsp_hdr.out_ib_op); + break; + } + + /* must delay the send due to pending rdma read responses + * if not user will get out of order messages - need fence + */ + if (!list_empty(&rdma_hndl->rdma_rd_rsp_list)) { + list_move_tail(&task->tasks_list_entry, + &rdma_hndl->rdma_rd_rsp_list); + rdma_hndl->kick_rdma_rd_rsp = 1; + return 0; + } + if (rdma_hndl->rdma_rd_rsp_in_flight) { + rdma_hndl->rdma_rd_rsp_in_flight++; + list_move_tail(&task->tasks_list_entry, + &rdma_hndl->rdma_rd_rsp_in_flight_list); + return 0; + } + +partial_msg: + /* fill notification event */ + event_data.msg.op = XIO_WC_OP_RECV; + event_data.msg.task = task; + + /* notify the upper layer of received message */ + xio_transport_notify_observer(&rdma_hndl->base, + XIO_TRANSPORT_EVENT_NEW_MESSAGE, + &event_data); + return 0; + +cleanup: + retval = xio_errno(); + ERROR_LOG("xio_rdma_on_recv_rsp failed. (errno=%d %s)\n", + retval, xio_strerror(retval)); + xio_transport_notify_observer_error(&rdma_hndl->base, retval); + + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_prep_rdma_op */ +/*---------------------------------------------------------------------------*/ +static int xio_prep_rdma_op( + struct xio_task *task, + struct xio_rdma_transport *rdma_hndl, + enum xio_ib_op_code xio_out_ib_op, + enum ibv_wr_opcode opcode, + struct xio_sge *lsg_list, size_t lsize, size_t *out_lsize, + struct xio_sge *rsg_list, size_t rsize, size_t *out_rsize, + uint32_t op_size, + int max_sge, + int signaled, + struct list_head *target_list, + int tasks_number) +{ + XIO_TO_RDMA_TASK(task, rdma_task); + struct xio_task *tmp_task; + struct xio_rdma_task *tmp_rdma_task; + struct xio_work_req *rdmad = &rdma_task->rdmad; + struct xio_task *ptask, *next_ptask; + uint64_t laddr = lsg_list[0].addr; + uint64_t raddr = rsg_list[0].addr; + uint64_t raddr_base = raddr; + uint32_t llen = lsg_list[0].length; + uint32_t rlen = rsg_list[0].length; + uint32_t lkey = lsg_list[0].stag; + uint32_t rkey = rsg_list[0].stag; + unsigned int l = 0, r = 0, k = 0; + uint32_t tot_len = 0; + uint32_t int_len = 0; + uint32_t rint_len = 0; + int task_idx; + + LIST_HEAD(tmp_list); + + if (lsize < 1 || rsize < 1) { + ERROR_LOG("iovec size < 1 lsize:%zd, rsize:%zd\n", + lsize, rsize); + return -1; + } + + task_idx = tasks_number - 1; + + if (task_idx == 0) { + tmp_task = task; + } else { + /* take new task */ + tmp_task = xio_tasks_pool_get(rdma_hndl->phantom_tasks_pool, + rdma_hndl); + if (unlikely(!tmp_task)) { + ERROR_LOG("phantom tasks pool is empty\n"); + return -1; + } + } + tmp_rdma_task = + (struct xio_rdma_task *)tmp_task->dd_data; + rdmad = &tmp_rdma_task->rdmad; + + while (1) { + if (rlen < llen) { + rdmad->send_wr.num_sge = k + 1; + rdmad->send_wr.wr_id = + uint64_from_ptr(tmp_task); + rdmad->send_wr.next = NULL; + rdmad->send_wr.opcode = opcode; + rdmad->send_wr.send_flags = + (signaled ? 
IBV_SEND_SIGNALED : 0); + rdmad->send_wr.wr.rdma.remote_addr = raddr_base; + rdmad->send_wr.wr.rdma.rkey = rkey; + + rdmad->sge[k].addr = laddr; + rdmad->sge[k].length = rlen; + rdmad->sge[k].lkey = lkey; + k = 0; + + tot_len += rlen; + int_len += rlen; + tmp_rdma_task->out_ib_op = xio_out_ib_op; + tmp_rdma_task->phantom_idx = task_idx; + + /* close the task */ + list_move_tail(&tmp_task->tasks_list_entry, &tmp_list); + /* advance the remote index */ + r++; + if (r == rsize) { + lsg_list[l].length = int_len; + int_len = 0; + l++; + break; + } + task_idx--; + /* Is this the last task */ + if (task_idx) { + /* take new task */ + tmp_task = xio_tasks_pool_get( + rdma_hndl->phantom_tasks_pool, + rdma_hndl); + if (unlikely(!tmp_task)) { + ERROR_LOG( + "phantom tasks pool is empty\n"); + goto cleanup; + } + } else { + tmp_task = task; + } + + tmp_rdma_task = + (struct xio_rdma_task *)tmp_task->dd_data; + rdmad = &tmp_rdma_task->rdmad; + + llen -= rlen; + laddr += rlen; + raddr = rsg_list[r].addr; + rlen = rsg_list[r].length; + rkey = rsg_list[r].stag; + raddr_base = raddr; + } else if (llen < rlen) { + rdmad->sge[k].addr = laddr; + rdmad->sge[k].length = llen; + rdmad->sge[k].lkey = lkey; + tot_len += llen; + int_len += llen; + rint_len += llen; + + lsg_list[l].length = int_len; + int_len = 0; + /* advance the local index */ + l++; + k++; + if (l == lsize || k == (unsigned int)max_sge - 1) { + rdmad->send_wr.num_sge = k; + rdmad->send_wr.wr_id = + uint64_from_ptr(tmp_task); + rdmad->send_wr.next = NULL; + rdmad->send_wr.opcode = opcode; + rdmad->send_wr.send_flags = + (signaled ? IBV_SEND_SIGNALED : 0); + rdmad->send_wr.wr.rdma.remote_addr = raddr_base; + rdmad->send_wr.wr.rdma.rkey = rkey; + tmp_rdma_task->out_ib_op = xio_out_ib_op; + tmp_rdma_task->phantom_idx = task_idx; + /* close the task */ + list_move_tail(&tmp_task->tasks_list_entry, + &tmp_list); + + if (l == lsize) { + rsg_list[r].length = rint_len; + rint_len = 0; + r++; + break; + } + + /* if we are here then k == max_sge - 1 */ + + task_idx--; + /* Is this the last task */ + if (task_idx) { + /* take new task */ + tmp_task = xio_tasks_pool_get( + rdma_hndl->phantom_tasks_pool, + rdma_hndl); + if (unlikely(!tmp_task)) { + ERROR_LOG( + "phantom tasks pool is empty\n"); + goto cleanup; + } + } else { + tmp_task = task; + } + + tmp_rdma_task = + (struct xio_rdma_task *) + tmp_task->dd_data; + rdmad = &tmp_rdma_task->rdmad; + k = 0; + } + rlen -= llen; + raddr += llen; + + laddr = lsg_list[l].addr; + llen = lsg_list[l].length; + lkey = lsg_list[l].stag; + } else { + rdmad->send_wr.num_sge = k + 1; + rdmad->send_wr.wr_id = + uint64_from_ptr(tmp_task); + rdmad->send_wr.next = NULL; + rdmad->send_wr.opcode = opcode; + rdmad->send_wr.send_flags = + (signaled ? 
IBV_SEND_SIGNALED : 0); + rdmad->send_wr.wr.rdma.remote_addr = raddr_base; + rdmad->send_wr.wr.rdma.rkey = rkey; + + rdmad->sge[k].addr = laddr; + rdmad->sge[k].length = llen; + rdmad->sge[k].lkey = lkey; + k = 0; + + tot_len += llen; + int_len += llen; + rint_len += llen; + tmp_rdma_task->out_ib_op = xio_out_ib_op; + tmp_rdma_task->phantom_idx = task_idx; + + /* close the task */ + list_move_tail(&tmp_task->tasks_list_entry, + &tmp_list); + + lsg_list[l].length = int_len; + int_len = 0; + rsg_list[r].length = rint_len; + rint_len = 0; + /* advance the remote and local indices */ + r++; + l++; + if ((l == lsize) || (r == rsize)) + break; + + task_idx--; + /* Is this the last task */ + if (task_idx) { + /* take new task */ + tmp_task = + xio_tasks_pool_get( + rdma_hndl->phantom_tasks_pool, + rdma_hndl); + if (unlikely(!tmp_task)) { + ERROR_LOG( + "phantom tasks pool is empty\n"); + goto cleanup; + } + } else { + tmp_task = task; + } + + tmp_rdma_task = + (struct xio_rdma_task *)tmp_task->dd_data; + rdmad = &tmp_rdma_task->rdmad; + + laddr = lsg_list[l].addr; + llen = lsg_list[l].length; + lkey = lsg_list[l].stag; + + raddr = rsg_list[r].addr; + rlen = rsg_list[r].length; + rkey = rsg_list[r].stag; + raddr_base = raddr; + } + } + *out_lsize = l; + *out_rsize = r; + + if (tot_len < op_size) { + ERROR_LOG("iovec exhausted\n"); + goto cleanup; + } + + list_splice_tail(&tmp_list, target_list); + + return 0; +cleanup: + + /* list does not contain the original task */ + list_for_each_entry_safe(ptask, next_ptask, &tmp_list, + tasks_list_entry) { + /* the tmp tasks are returned back to pool */ + xio_tasks_pool_put(ptask); + } + + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* init_lsg_list */ +/*---------------------------------------------------------------------------*/ +static void init_lsg_list(struct xio_rdma_transport *rdma_hndl, + struct xio_task *task, + struct xio_sge *lsg_list, + size_t *lsg_list_len, + size_t *llen) +{ + struct xio_sg_table_ops *sgtbl_ops = (struct xio_sg_table_ops *) + xio_sg_table_ops_get(task->omsg->out.sgl_type); + void *sgtbl = (struct xio_sg_table_ops *) + xio_sg_table_get(&task->omsg->out); + struct ibv_mr *mr; + void *sg; + unsigned int i; + + *lsg_list_len = tbl_nents(sgtbl_ops, sgtbl); + *llen = 0; + + for_each_sge(sgtbl, sgtbl_ops, sg, i) { + lsg_list[i].addr = uint64_from_ptr(sge_addr(sgtbl_ops, sg)); + lsg_list[i].length = sge_length(sgtbl_ops, sg); + mr = xio_rdma_mr_lookup((struct xio_mr *)sge_mr(sgtbl_ops, sg), + rdma_hndl->tcq->dev); + lsg_list[i].stag = mr->lkey; + *llen += lsg_list[i].length; + } +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_perform_direct_rdma */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_perform_direct_rdma(struct xio_rdma_transport *rdma_hndl, + struct xio_task *task) +{ + enum xio_ib_op_code out_ib_op; + enum ibv_wr_opcode wr_opcode; + struct xio_sge lsg_list[XIO_MAX_IOV]; + size_t lsg_list_len; + size_t llen; + int retval = 0; + size_t lsg_out_list_len = 0; + size_t rsg_out_list_len = 0; + int tasks_used = 0; + + if (unlikely(verify_req_send_limits(rdma_hndl))) + return -1; + + init_lsg_list(rdma_hndl, task, lsg_list, &lsg_list_len, &llen); + if (unlikely(task->omsg->rdma.length < llen)) { + ERROR_LOG("peer provided too small iovec\n"); + task->status = XIO_E_REM_USER_BUF_OVERFLOW; + return -1; + } + + retval = xio_validate_rdma_op( + lsg_list, lsg_list_len, + 
task->omsg->rdma.rsg_list, + task->omsg->rdma.nents, + llen, + rdma_hndl->max_sge, + &tasks_used); + if (unlikely(retval)) { + ERROR_LOG("failed to validate input scatter lists\n"); + task->status = XIO_E_MSG_INVALID; + return -1; + } + out_ib_op = task->omsg->rdma.is_read ? XIO_IB_RDMA_READ_DIRECT : + XIO_IB_RDMA_WRITE_DIRECT; + wr_opcode = task->omsg->rdma.is_read ? IBV_WR_RDMA_READ : + IBV_WR_RDMA_WRITE; + + retval = xio_prep_rdma_op(task, rdma_hndl, + out_ib_op, + wr_opcode, + lsg_list, lsg_list_len, &lsg_out_list_len, + task->omsg->rdma.rsg_list, + task->omsg->rdma.nents, + &rsg_out_list_len, + llen, + rdma_hndl->max_sge, + 0, + &rdma_hndl->tx_ready_list, tasks_used); + if (unlikely(retval)) { + ERROR_LOG("failed to allocate tasks\n"); + task->status = XIO_E_NO_BUFS; + return -1; + } + rdma_hndl->tx_ready_tasks_num += tasks_used; + + return kick_send_and_read(rdma_hndl, task, 0 /* must_send */); +} + +/*---------------------------------------------------------------------------*/ +/* xio_set_msg_in_data_iovec */ +/*---------------------------------------------------------------------------*/ +static inline void xio_set_msg_in_data_iovec(struct xio_task *task, + struct xio_sge *lsg_list, + size_t lsize) +{ + struct xio_sg_table_ops *sgtbl_ops; + void *sgtbl; + void *sg; + unsigned int i; + + sgtbl = (struct xio_sg_table_ops *) + xio_sg_table_get(&task->imsg.in); + sgtbl_ops = (struct xio_sg_table_ops *) + xio_sg_table_ops_get(task->imsg.in.sgl_type); + sg = sge_first(sgtbl_ops, sgtbl); + + for (i = 0; i < lsize; i++) { + sge_set_length(sgtbl_ops, sg, lsg_list[i].length); + sg = sge_next(sgtbl_ops, sgtbl, sg); + } + tbl_set_nents(sgtbl_ops, sgtbl, lsize); +} + +/*---------------------------------------------------------------------------*/ +/* xio_sched_rdma_rd */ +/*---------------------------------------------------------------------------*/ +static int xio_sched_rdma_rd(struct xio_rdma_transport *rdma_hndl, + struct xio_task *task) +{ + XIO_TO_RDMA_TASK(task, rdma_task); + + unsigned int i; + int retval; + int user_assign_flag = 0; + size_t llen = 0, rlen = 0; + int tasks_used = 0; + struct xio_sge lsg_list[XIO_MAX_IOV]; + size_t lsg_list_len; + size_t lsg_out_list_len = 0; + size_t rsg_out_list_len = 0; + struct ibv_mr *mr; + struct xio_sg_table_ops *sgtbl_ops; + void *sgtbl; + void *sg; + struct list_head *rdma_rd_list; + + /* peer got request for rdma read */ + + /* need for buffer to do rdma read. 
there are two options: */ + /* option 1: user provides call back that fills application memory */ + /* option 2: use internal buffer pool */ + + /* hint the upper layer of sizes */ + sgtbl = (struct xio_sg_table_ops *)xio_sg_table_get(&task->imsg.in); + sgtbl_ops = (struct xio_sg_table_ops *) + xio_sg_table_ops_get(task->imsg.in.sgl_type); + tbl_set_nents(sgtbl_ops, sgtbl, rdma_task->req_out_num_sge); + for_each_sge(sgtbl, sgtbl_ops, sg, i) { + sge_set_addr(sgtbl_ops, sg, NULL); + sge_set_length(sgtbl_ops, sg, + rdma_task->req_out_sge[i].length); + rlen += rdma_task->req_out_sge[i].length; + rdma_task->read_reg_mem[i].priv = NULL; + } + + sgtbl = xio_sg_table_get(&task->imsg.out); + sgtbl_ops = (struct xio_sg_table_ops *) + xio_sg_table_ops_get(task->imsg.out.sgl_type); + if (rdma_task->req_in_num_sge) { + tbl_set_nents(sgtbl_ops, sgtbl, rdma_task->req_in_num_sge); + for_each_sge(sgtbl, sgtbl_ops, sg, i) { + sge_set_addr(sgtbl_ops, sg, NULL); + sge_set_length(sgtbl_ops, sg, + rdma_task->req_in_sge[i].length); + sge_set_mr(sgtbl_ops, sg, NULL); + rdma_task->write_reg_mem[i].priv = NULL; + } + } else { + tbl_set_nents(sgtbl_ops, sgtbl, 0); + } + sgtbl = xio_sg_table_get(&task->imsg.in); + sgtbl_ops = (struct xio_sg_table_ops *) + xio_sg_table_ops_get(task->imsg.in.sgl_type); + + xio_transport_assign_in_buf(&rdma_hndl->base, task, &user_assign_flag); + + if (user_assign_flag) { + /* if user does not have buffers ignore */ + if (tbl_nents(sgtbl_ops, sgtbl) == 0) { + WARN_LOG("application has not provided buffers\n"); + WARN_LOG("rdma read is ignored\n"); + task->status = XIO_E_NO_USER_BUFS; + return -1; + } + for_each_sge(sgtbl, sgtbl_ops, sg, i) { + if (!sge_mr(sgtbl_ops, sg)) { + ERROR_LOG("application has not provided mr\n"); + ERROR_LOG("rdma read is ignored\n"); + task->status = XIO_E_NO_USER_MR; + return -1; + } + if (!sge_addr(sgtbl_ops, sg)) { + ERROR_LOG("application has provided " \ + "null address\n"); + ERROR_LOG("rdma read is ignored\n"); + task->status = XIO_E_NO_USER_BUFS; + return -1; + } + llen += sge_length(sgtbl_ops, sg); + } + if (rlen > llen) { + ERROR_LOG("application provided too small iovec\n"); + ERROR_LOG("remote peer want to write %zd bytes while " \ + "local peer provided buffer size %zd bytes\n", + rlen, llen); + ERROR_LOG("rdma read is ignored\n"); + task->status = XIO_E_USER_BUF_OVERFLOW; + return -1; + } + set_bits(XIO_MSG_HINT_ASSIGNED_DATA_IN_BUF, &task->imsg.hints); + } else { + if (!rdma_hndl->rdma_mempool) { + ERROR_LOG( + "message /read/write failed - " \ + "library's memory pool disabled\n"); + task->status = XIO_E_NO_BUFS; + goto cleanup; + } + + tbl_set_nents(sgtbl_ops, sgtbl, rdma_task->req_out_num_sge); + for_each_sge(sgtbl, sgtbl_ops, sg, i) { + retval = xio_mempool_alloc( + rdma_hndl->rdma_mempool, + rdma_task->req_out_sge[i].length, + &rdma_task->read_reg_mem[i]); + + if (unlikely(retval)) { + rdma_task->read_num_reg_mem = i; + ERROR_LOG("mempool is empty for %zd bytes\n", + rdma_task->read_reg_mem[i].length); + + task->status = ENOMEM; + goto cleanup; + } + sge_set_addr(sgtbl_ops, sg, + rdma_task->read_reg_mem[i].addr); + sge_set_length(sgtbl_ops, sg, + rdma_task->read_reg_mem[i].length); + sge_set_mr(sgtbl_ops, sg, + rdma_task->read_reg_mem[i].mr); + + llen += rdma_task->read_reg_mem[i].length; + } + rdma_task->read_num_reg_mem = rdma_task->req_out_num_sge; + } + for_each_sge(sgtbl, sgtbl_ops, sg, i) { + lsg_list[i].addr = + uint64_from_ptr(sge_addr(sgtbl_ops, sg)); + lsg_list[i].length = + uint64_from_ptr(sge_length(sgtbl_ops, sg)); + + mr = 
xio_rdma_mr_lookup((struct xio_mr *)sge_mr(sgtbl_ops, sg), + rdma_hndl->tcq->dev); + lsg_list[i].stag = mr->rkey; + } + lsg_list_len = tbl_nents(sgtbl_ops, sgtbl); + + if (!task->sender_task) + rdma_rd_list = &rdma_hndl->rdma_rd_req_list; + else + rdma_rd_list = &rdma_hndl->rdma_rd_rsp_list; + + retval = xio_validate_rdma_op( + lsg_list, lsg_list_len, + rdma_task->req_out_sge, + rdma_task->req_out_num_sge, + min(rlen, llen), + rdma_hndl->max_sge, + &tasks_used); + if (retval) { + ERROR_LOG("failed to validate input iovecs\n"); + ERROR_LOG("rdma read is ignored\n"); + task->status = XIO_E_MSG_INVALID; + goto cleanup; + } + + retval = xio_prep_rdma_op(task, rdma_hndl, + XIO_IB_RDMA_READ, + IBV_WR_RDMA_READ, + lsg_list, + lsg_list_len, &lsg_out_list_len, + rdma_task->req_out_sge, + rdma_task->req_out_num_sge, + &rsg_out_list_len, + min(rlen, llen), + rdma_hndl->max_sge, + 1, + rdma_rd_list, tasks_used); + if (retval) { + ERROR_LOG("failed to allocate tasks\n"); + ERROR_LOG("rdma read is ignored\n"); + task->status = XIO_E_WRITE_FAILED; + goto cleanup; + } + + /* prepare the in side of the message */ + xio_set_msg_in_data_iovec(task, lsg_list, lsg_out_list_len); + + if (!task->sender_task) + xio_xmit_rdma_rd_req(rdma_hndl); + else + xio_xmit_rdma_rd_rsp(rdma_hndl); + + return 0; +cleanup: + xio_set_error(task->status); + for (i = 0; i < rdma_task->read_num_reg_mem; i++) + xio_mempool_free(&rdma_task->read_reg_mem[i]); + + rdma_task->read_num_reg_mem = 0; + + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_set_rsp_out_sge */ +/*---------------------------------------------------------------------------*/ +static inline void xio_set_rsp_out_sge(struct xio_task *task, + struct xio_sge *rsg_list, + size_t rsize) +{ + XIO_TO_RDMA_TASK(task, rdma_task); + + unsigned int i; + + for (i = 0; i < rsize; i++) + rdma_task->rsp_out_sge[i].length = rsg_list[i].length; + + rdma_task->rsp_out_num_sge = rsize; +} + +/*---------------------------------------------------------------------------*/ +/* xio_sched_rdma_wr_req */ +/*---------------------------------------------------------------------------*/ +static int xio_sched_rdma_wr_req(struct xio_rdma_transport *rdma_hndl, + struct xio_task *task) +{ + XIO_TO_RDMA_TASK(task, rdma_task); + + struct xio_sg_table_ops *sgtbl_ops; + struct xio_sge lsg_list[XIO_MAX_IOV]; + struct ibv_mr *mr; + void *sgtbl; + void *sg; + size_t lsg_list_len; + size_t lsg_out_list_len = 0; + size_t rsg_out_list_len = 0; + size_t rlen = 0, llen = 0; + int tasks_used = 0; + unsigned int i; + int retval = 0; + + sgtbl = xio_sg_table_get(&task->omsg->out); + sgtbl_ops = (struct xio_sg_table_ops *) + xio_sg_table_ops_get(task->omsg->out.sgl_type); + sg = sge_first(sgtbl_ops, sgtbl); + + /* user did not provided mr */ + if (!sge_mr(sgtbl_ops, sg)) { + if (!rdma_hndl->rdma_mempool) { + xio_set_error(XIO_E_NO_BUFS); + ERROR_LOG( + "message /read/write failed - " \ + "library's memory pool disabled\n"); + goto cleanup; + } + /* user did not provide mr - take buffers from pool + * and do copy */ + for_each_sge(sgtbl, sgtbl_ops, sg, i) { + retval = xio_mempool_alloc( + rdma_hndl->rdma_mempool, + sge_length(sgtbl_ops, sg), + &rdma_task->write_reg_mem[i]); + if (unlikely(retval)) { + rdma_task->write_num_reg_mem = i; + xio_set_error(ENOMEM); + ERROR_LOG("mempool is empty for %zd bytes\n", + sge_length(sgtbl_ops, sg)); + goto cleanup; + } + lsg_list[i].addr = uint64_from_ptr( + rdma_task->write_reg_mem[i].addr); + lsg_list[i].length = 
sge_length(sgtbl_ops, sg); + mr = xio_rdma_mr_lookup(rdma_task->write_reg_mem[i].mr, + rdma_hndl->tcq->dev); + lsg_list[i].stag = mr->lkey; + + llen += lsg_list[i].length; + + /* copy the data to the buffer */ + memcpy(rdma_task->write_reg_mem[i].addr, + sge_addr(sgtbl_ops, sg), + sge_length(sgtbl_ops, sg)); + } + } else { + for_each_sge(sgtbl, sgtbl_ops, sg, i) { + lsg_list[i].addr = uint64_from_ptr( + sge_addr(sgtbl_ops, sg)); + lsg_list[i].length = sge_length(sgtbl_ops, sg); + mr = xio_rdma_mr_lookup((struct xio_mr *) + sge_mr(sgtbl_ops, sg), + rdma_hndl->tcq->dev); + lsg_list[i].stag = mr->lkey; + + llen += lsg_list[i].length; + } + } + lsg_list_len = tbl_nents(sgtbl_ops, sgtbl); + + for (i = 0; i < rdma_task->req_in_num_sge; i++) + rlen += rdma_task->req_in_sge[i].length; + + if (rlen < llen) { + ERROR_LOG("peer provided too small iovec\n"); + ERROR_LOG("rdma write is ignored\n"); + task->status = XIO_E_REM_USER_BUF_OVERFLOW; + goto cleanup; + } + retval = xio_validate_rdma_op( + lsg_list, lsg_list_len, + rdma_task->req_in_sge, + rdma_task->req_in_num_sge, + min(rlen, llen), + rdma_hndl->max_sge, + &tasks_used); + if (retval) { + ERROR_LOG("failed to invalidate input iovecs\n"); + ERROR_LOG("rdma write is ignored\n"); + task->status = XIO_E_MSG_INVALID; + goto cleanup; + } + + retval = xio_prep_rdma_op(task, rdma_hndl, + XIO_IB_RDMA_WRITE, + IBV_WR_RDMA_WRITE, + lsg_list, lsg_list_len, &lsg_out_list_len, + rdma_task->req_in_sge, + rdma_task->req_in_num_sge, + &rsg_out_list_len, + min(rlen, llen), + rdma_hndl->max_sge, + 0, + &rdma_hndl->tx_ready_list, tasks_used); + if (retval) { + ERROR_LOG("failed to allocate tasks\n"); + ERROR_LOG("rdma write is ignored\n"); + task->status = XIO_E_READ_FAILED; + goto cleanup; + } + /* prepare response to peer */ + xio_set_rsp_out_sge(task, rdma_task->req_in_sge, rsg_out_list_len); + + /* xio_prep_rdma_op used splice to transfer "tasks_used" to + * tx_ready_list + */ + rdma_hndl->tx_ready_tasks_num += tasks_used; + return 0; +cleanup: + for (i = 0; i < rdma_task->write_num_reg_mem; i++) + xio_mempool_free(&rdma_task->write_reg_mem[i]); + + rdma_task->write_num_reg_mem = 0; + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_on_recv_req */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_on_recv_req(struct xio_rdma_transport *rdma_hndl, + struct xio_task *task) +{ + XIO_TO_RDMA_TASK(task, rdma_task); + + union xio_transport_event_data event_data; + struct xio_rdma_req_hdr req_hdr; + struct xio_sg_table_ops *sgtbl_ops; + struct xio_msg *imsg; + void *ulp_hdr; + void *sgtbl; + void *sg; + unsigned int i; + int retval = 0; + + /* read header */ + retval = xio_rdma_read_req_header(rdma_hndl, task, &req_hdr); + if (retval != 0) { + xio_set_error(XIO_E_MSG_INVALID); + goto cleanup; + } + if (rdma_hndl->exp_sn == req_hdr.sn) { + rdma_hndl->exp_sn++; + rdma_hndl->ack_sn = req_hdr.sn; + rdma_hndl->peer_credits += req_hdr.credits; + } else { + ERROR_LOG("ERROR: sn expected:%d, " \ + "sn arrived:%d out_ib_op:%u %u %u\n", + rdma_hndl->exp_sn, req_hdr.sn, + req_hdr.out_ib_op, req_hdr.in_num_sge, + req_hdr.out_num_sge); + } + /* save originator identifier */ + task->imsg_flags = req_hdr.flags; + task->rtid = req_hdr.ltid; + + imsg = &task->imsg; + sgtbl = xio_sg_table_get(&imsg->out); + sgtbl_ops = (struct xio_sg_table_ops *) + xio_sg_table_ops_get(imsg->out.sgl_type); + + ulp_hdr = xio_mbuf_get_curr_ptr(&task->mbuf); + + imsg->type = (enum 
xio_msg_type)task->tlv_type; + imsg->in.header.iov_len = req_hdr.ulp_hdr_len; + clr_bits(XIO_MSG_HINT_ASSIGNED_DATA_IN_BUF, &imsg->hints); + + if (req_hdr.ulp_hdr_len) + imsg->in.header.iov_base = ulp_hdr; + else + imsg->in.header.iov_base = NULL; + + /* hint upper layer about expected response */ + if (rdma_task->req_in_num_sge) { + tbl_set_nents(sgtbl_ops, sgtbl, rdma_task->req_in_num_sge); + for_each_sge(sgtbl, sgtbl_ops, sg, i) { + sge_set_addr(sgtbl_ops, sg, NULL); + sge_set_length(sgtbl_ops, sg, + rdma_task->req_in_sge[i].length); + sge_set_mr(sgtbl_ops, sg, NULL); + } + } else { + tbl_set_nents(sgtbl_ops, sgtbl, 0); + } + + switch (req_hdr.out_ib_op) { + case XIO_IB_SEND: + sgtbl = xio_sg_table_get(&imsg->in); + sgtbl_ops = (struct xio_sg_table_ops *) + xio_sg_table_ops_get(imsg->in.sgl_type); + if (req_hdr.ulp_imm_len) { + /* incoming data via SEND */ + /* if data arrived, set the pointers */ + tbl_set_nents(sgtbl_ops, sgtbl, 1); + sg = sge_first(sgtbl_ops, sgtbl); + sge_set_addr(sgtbl_ops, sg, + (ulp_hdr + imsg->in.header.iov_len + + req_hdr.ulp_pad_len)); + sge_set_length(sgtbl_ops, sg, req_hdr.ulp_imm_len); + } else { + /* no data at all */ + tbl_set_nents(sgtbl_ops, sgtbl, 0); + } + break; + case XIO_IB_RDMA_READ: + /* schedule request for RDMA READ. in case of error + * don't schedule the rdma read operation */ + /*TRACE_LOG("scheduling rdma read\n");*/ + retval = xio_sched_rdma_rd(rdma_hndl, task); + if (retval == 0) + return 0; + ERROR_LOG("scheduling rdma read failed\n"); + break; + default: + ERROR_LOG("unexpected out_ib_op\n"); + xio_set_error(XIO_E_MSG_INVALID); + task->status = XIO_E_MSG_INVALID; + break; + }; + + /* must delay the send due to pending rdma read requests + * if not user will get out of order messages - need fence + */ + if (!list_empty(&rdma_hndl->rdma_rd_req_list)) { + list_move_tail(&task->tasks_list_entry, + &rdma_hndl->rdma_rd_req_list); + rdma_hndl->kick_rdma_rd_req = 1; + return 0; + } + if (rdma_hndl->rdma_rd_req_in_flight) { + rdma_hndl->rdma_rd_req_in_flight++; + list_move_tail(&task->tasks_list_entry, + &rdma_hndl->rdma_rd_req_in_flight_list); + return 0; + } + /* fill notification event */ + event_data.msg.op = XIO_WC_OP_RECV; + event_data.msg.task = task; + + xio_transport_notify_observer(&rdma_hndl->base, + XIO_TRANSPORT_EVENT_NEW_MESSAGE, + &event_data); + + return 0; + +cleanup: + retval = xio_errno(); + ERROR_LOG("xio_rdma_on_recv_req failed. 
(errno=%d %s)\n", retval, + xio_strerror(retval)); + xio_transport_notify_observer_error(&rdma_hndl->base, retval); + + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_write_setup_msg */ +/*---------------------------------------------------------------------------*/ +static void xio_rdma_write_setup_msg(struct xio_rdma_transport *rdma_hndl, + struct xio_task *task, + struct xio_rdma_setup_msg *msg) +{ + struct xio_rdma_setup_msg *tmp_msg; + struct xio_rkey_tbl_pack *ptbl; + struct xio_rkey_tbl *tbl; + int i; + + /* set the mbuf after tlv header */ + xio_mbuf_set_val_start(&task->mbuf); + + /* jump after connection setup header */ + if (rdma_hndl->base.is_client) + xio_mbuf_inc(&task->mbuf, + sizeof(struct xio_nexus_setup_req)); + else + xio_mbuf_inc(&task->mbuf, + sizeof(struct xio_nexus_setup_rsp)); + + tmp_msg = (struct xio_rdma_setup_msg *) + xio_mbuf_get_curr_ptr(&task->mbuf); + + /* pack relevant values */ + PACK_LLVAL(msg, tmp_msg, buffer_sz); + PACK_SVAL(msg, tmp_msg, sq_depth); + PACK_SVAL(msg, tmp_msg, rq_depth); + PACK_SVAL(msg, tmp_msg, credits); + PACK_LVAL(msg, tmp_msg, max_in_iovsz); + PACK_LVAL(msg, tmp_msg, max_out_iovsz); + PACK_SVAL(msg, tmp_msg, rkey_tbl_size); + PACK_LVAL(msg, tmp_msg, max_header_len); + +#ifdef EYAL_TODO + print_hex_dump_bytes("post_send: ", DUMP_PREFIX_ADDRESS, + task->mbuf.tlv.head, + 64); +#endif + xio_mbuf_inc(&task->mbuf, sizeof(struct xio_rdma_setup_msg)); + + if (!msg->rkey_tbl_size) + return; + + tbl = rdma_hndl->rkey_tbl; + ptbl = (struct xio_rkey_tbl_pack *)xio_mbuf_get_curr_ptr(&task->mbuf); + for (i = 0; i < rdma_hndl->rkey_tbl_size; i++) { + PACK_LVAL(tbl, ptbl, old_rkey); + PACK_LVAL(tbl, ptbl, new_rkey); + tbl++; + xio_mbuf_inc(&task->mbuf, sizeof(struct xio_rkey_tbl_pack)); + ptbl = (struct xio_rkey_tbl_pack *) + xio_mbuf_get_curr_ptr(&task->mbuf); + } +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_read_setup_msg */ +/*---------------------------------------------------------------------------*/ +static void xio_rdma_read_setup_msg(struct xio_rdma_transport *rdma_hndl, + struct xio_task *task, + struct xio_rdma_setup_msg *msg) +{ + struct xio_rdma_setup_msg *tmp_msg; + struct xio_rkey_tbl_pack *ptbl; + struct xio_rkey_tbl *tbl; + int i; + + /* set the mbuf after tlv header */ + xio_mbuf_set_val_start(&task->mbuf); + + /* jump after connection setup header */ + if (rdma_hndl->base.is_client) + xio_mbuf_inc(&task->mbuf, + sizeof(struct xio_nexus_setup_rsp)); + else + xio_mbuf_inc(&task->mbuf, + sizeof(struct xio_nexus_setup_req)); + + tmp_msg = (struct xio_rdma_setup_msg *) + xio_mbuf_get_curr_ptr(&task->mbuf); + + /* pack relevant values */ + UNPACK_LLVAL(tmp_msg, msg, buffer_sz); + UNPACK_SVAL(tmp_msg, msg, sq_depth); + UNPACK_SVAL(tmp_msg, msg, rq_depth); + UNPACK_SVAL(tmp_msg, msg, credits); + UNPACK_LVAL(tmp_msg, msg, max_in_iovsz); + UNPACK_LVAL(tmp_msg, msg, max_out_iovsz); + UNPACK_SVAL(tmp_msg, msg, rkey_tbl_size); + UNPACK_LVAL(tmp_msg, msg, max_header_len); + +#ifdef EYAL_TODO + print_hex_dump_bytes("post_send: ", DUMP_PREFIX_ADDRESS, + task->mbuf.curr, + 64); +#endif + xio_mbuf_inc(&task->mbuf, sizeof(struct xio_rdma_setup_msg)); + + if (!msg->rkey_tbl_size) + return; + + rdma_hndl->peer_rkey_tbl = (struct xio_rkey_tbl *) + calloc(msg->rkey_tbl_size, sizeof(*tbl)); + if (unlikely(!rdma_hndl->peer_rkey_tbl)) { + ERROR_LOG("calloc failed. 
(errno=%m)\n"); + xio_strerror(ENOMEM); + msg->rkey_tbl_size = -1; + return; + } + + tbl = rdma_hndl->peer_rkey_tbl; + ptbl = (struct xio_rkey_tbl_pack *)xio_mbuf_get_curr_ptr(&task->mbuf); + for (i = 0; i < msg->rkey_tbl_size; i++) { + UNPACK_LVAL(ptbl, tbl, old_rkey); + UNPACK_LVAL(ptbl, tbl, new_rkey); + tbl++; + xio_mbuf_inc(&task->mbuf, sizeof(struct xio_rkey_tbl_pack)); + ptbl = (struct xio_rkey_tbl_pack *) + xio_mbuf_get_curr_ptr(&task->mbuf); + } + rdma_hndl->peer_rkey_tbl_size = msg->rkey_tbl_size; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_send_setup_req */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_send_setup_req(struct xio_rdma_transport *rdma_hndl, + struct xio_task *task) +{ + XIO_TO_RDMA_TASK(task, rdma_task); + + struct xio_rdma_setup_msg req; + uint16_t payload; + + req.buffer_sz = xio_rdma_get_inline_buffer_size(); + req.sq_depth = rdma_hndl->sq_depth; + req.rq_depth = rdma_hndl->rq_depth; + req.credits = 0; + req.max_in_iovsz = rdma_options.max_in_iovsz; + req.max_out_iovsz = rdma_options.max_out_iovsz; + req.rkey_tbl_size = rdma_hndl->rkey_tbl_size; + req.max_header_len = g_options.max_inline_xio_hdr; + + xio_rdma_write_setup_msg(rdma_hndl, task, &req); + + payload = xio_mbuf_tlv_payload_len(&task->mbuf); + + /* add tlv */ + if (xio_mbuf_write_tlv(&task->mbuf, task->tlv_type, payload) != 0) + return -1; + + TRACE_LOG("rdma send setup request\n"); + + /* set the length */ + rdma_task->txd.sge[0].length = xio_mbuf_data_length(&task->mbuf); + + rdma_task->txd.send_wr.send_flags = IBV_SEND_SIGNALED; + if (rdma_task->txd.sge[0].length < (size_t)rdma_hndl->max_inline_data) + rdma_task->txd.send_wr.send_flags |= IBV_SEND_INLINE; + + rdma_task->txd.send_wr.next = NULL; + rdma_task->out_ib_op = XIO_IB_SEND; + rdma_task->txd.send_wr.num_sge = 1; + + xio_task_addref(task); + rdma_hndl->reqs_in_flight_nr++; + list_move_tail(&task->tasks_list_entry, &rdma_hndl->in_flight_list); + + rdma_hndl->peer_credits--; + xio_post_send(rdma_hndl, &rdma_task->txd, 1); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_send_setup_rsp */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_send_setup_rsp(struct xio_rdma_transport *rdma_hndl, + struct xio_task *task) +{ + XIO_TO_RDMA_TASK(task, rdma_task); + + uint16_t payload; + + rdma_hndl->sim_peer_credits += rdma_hndl->credits; + + rdma_hndl->setup_rsp.credits = rdma_hndl->credits; + rdma_hndl->setup_rsp.max_in_iovsz = rdma_options.max_in_iovsz; + rdma_hndl->setup_rsp.max_out_iovsz = rdma_options.max_out_iovsz; + rdma_hndl->setup_rsp.buffer_sz = rdma_hndl->membuf_sz; + rdma_hndl->setup_rsp.max_header_len = g_options.max_inline_xio_hdr; + + xio_rdma_write_setup_msg(rdma_hndl, task, &rdma_hndl->setup_rsp); + rdma_hndl->credits = 0; + + payload = xio_mbuf_tlv_payload_len(&task->mbuf); + + /* add tlv */ + if (xio_mbuf_write_tlv(&task->mbuf, task->tlv_type, payload) != 0) + return -1; + + TRACE_LOG("rdma send setup response\n"); + + /* set the length */ + rdma_task->txd.sge[0].length = xio_mbuf_data_length(&task->mbuf); + rdma_task->txd.send_wr.send_flags = IBV_SEND_SIGNALED; + if (rdma_task->txd.sge[0].length < (size_t)rdma_hndl->max_inline_data) + rdma_task->txd.send_wr.send_flags |= IBV_SEND_INLINE; + rdma_task->txd.send_wr.next = NULL; + rdma_task->out_ib_op = XIO_IB_SEND; + rdma_task->txd.send_wr.num_sge = 1; + + 
rdma_hndl->rsps_in_flight_nr++; + list_move(&task->tasks_list_entry, &rdma_hndl->in_flight_list); + + rdma_hndl->peer_credits--; + xio_post_send(rdma_hndl, &rdma_task->txd, 1); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_on_setup_msg */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_on_setup_msg(struct xio_rdma_transport *rdma_hndl, + struct xio_task *task) +{ + union xio_transport_event_data event_data; + struct xio_rdma_setup_msg *rsp = &rdma_hndl->setup_rsp; + uint64_t local_buf_size; + + if (rdma_hndl->base.is_client) { + struct xio_task *sender_task = NULL; + + if (!list_empty(&rdma_hndl->in_flight_list)) + sender_task = list_first_entry( + &rdma_hndl->in_flight_list, + struct xio_task, tasks_list_entry); + else if (!list_empty(&rdma_hndl->tx_comp_list)) + sender_task = list_first_entry( + &rdma_hndl->tx_comp_list, + struct xio_task, tasks_list_entry); + else + ERROR_LOG("could not find sender task\n"); + + task->sender_task = sender_task; + xio_rdma_read_setup_msg(rdma_hndl, task, rsp); + /* get the initial credits */ + rdma_hndl->peer_credits += rsp->credits; + } else { + struct xio_rdma_setup_msg req; + + xio_rdma_read_setup_msg(rdma_hndl, task, &req); + + /* current implementation is symmetric */ + local_buf_size = xio_rdma_get_inline_buffer_size(); + rsp->buffer_sz = min(req.buffer_sz, local_buf_size); + rsp->sq_depth = max(req.sq_depth, rdma_hndl->rq_depth); + rsp->rq_depth = max(req.rq_depth, rdma_hndl->sq_depth); + rsp->max_in_iovsz = req.max_in_iovsz; + rsp->max_out_iovsz = req.max_out_iovsz; + rsp->max_header_len = req.max_header_len; + } + + /* save the values */ + rdma_hndl->rq_depth = rsp->rq_depth; + rdma_hndl->sq_depth = rsp->sq_depth; + rdma_hndl->membuf_sz = rsp->buffer_sz; + rdma_hndl->max_inline_buf_sz = rsp->buffer_sz; + rdma_hndl->peer_max_in_iovsz = rsp->max_in_iovsz; + rdma_hndl->peer_max_out_iovsz = rsp->max_out_iovsz; + rdma_hndl->peer_max_header = rsp->max_header_len; + + /* initialize send window */ + rdma_hndl->sn = 0; + rdma_hndl->ack_sn = ~0; + rdma_hndl->credits = 0; + rdma_hndl->max_sn = rdma_hndl->sq_depth; + + /* initialize receive window */ + rdma_hndl->exp_sn = 0; + rdma_hndl->max_exp_sn = 0; + + rdma_hndl->max_tx_ready_tasks_num = rdma_hndl->sq_depth; + + /* fill notification event */ + event_data.msg.op = XIO_WC_OP_RECV; + event_data.msg.task = task; + + TRACE_LOG("setup complete. 
send_buf_sz:%d\n", + rdma_hndl->max_inline_buf_sz); + + xio_transport_notify_observer(&rdma_hndl->base, + XIO_TRANSPORT_EVENT_NEW_MESSAGE, + &event_data); + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_write_rdma_read_ack_hdr */ +/*---------------------------------------------------------------------------*/ +static void xio_write_rdma_read_ack_hdr(struct xio_rdma_transport *rdma_hndl, + struct xio_task *task, + struct xio_rdma_read_ack_hdr *rra) +{ + struct xio_rdma_read_ack_hdr *tmp_rra; + + xio_mbuf_reset(&task->mbuf); + + /* set start of the tlv */ + if (xio_mbuf_tlv_start(&task->mbuf) != 0) + return; + + /* set the mbuf after tlv header */ + xio_mbuf_set_val_start(&task->mbuf); + + /* get the pointer */ + tmp_rra = (struct xio_rdma_read_ack_hdr *) + xio_mbuf_get_curr_ptr(&task->mbuf); + + /* pack relevant values */ + PACK_SVAL(rra, tmp_rra, hdr_len); + PACK_LVAL(rra, tmp_rra, rtid); + + xio_mbuf_inc(&task->mbuf, sizeof(*rra)); +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_send_rdma_read_ack */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_send_rdma_read_ack(struct xio_rdma_transport *rdma_hndl, + int rtid) +{ + uint64_t payload; + struct xio_task *task; + struct xio_rdma_task *rdma_task; + struct xio_rdma_read_ack_hdr rra = { + .hdr_len = sizeof(rra), + .rtid = rtid, + }; + + task = xio_rdma_primary_task_alloc(rdma_hndl); + if (!task) { + ERROR_LOG("primary tasks pool is empty\n"); + return -1; + } + task->omsg = NULL; + + task->tlv_type = XIO_RDMA_READ_ACK; + rdma_task = (struct xio_rdma_task *)task->dd_data; + + /* write the message */ + xio_write_rdma_read_ack_hdr(rdma_hndl, task, &rra); + + payload = xio_mbuf_tlv_payload_len(&task->mbuf); + + /* add tlv */ + if (xio_mbuf_write_tlv(&task->mbuf, task->tlv_type, payload) != 0) + return -1; + + /* set the length */ + rdma_task->txd.sge[0].length = xio_mbuf_data_length(&task->mbuf); + rdma_task->txd.send_wr.send_flags = 0; + if (rdma_task->txd.sge[0].length < (size_t)rdma_hndl->max_inline_data) + rdma_task->txd.send_wr.send_flags |= IBV_SEND_INLINE; + + rdma_task->txd.send_wr.next = NULL; + rdma_task->out_ib_op = XIO_IB_SEND; + rdma_task->txd.send_wr.num_sge = 1; + + rdma_hndl->rsps_in_flight_nr++; + list_add_tail(&task->tasks_list_entry, &rdma_hndl->in_flight_list); + + rdma_hndl->peer_credits--; + xio_post_send(rdma_hndl, &rdma_task->txd, 1); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_read_rdma_read_ack_hdr */ +/*---------------------------------------------------------------------------*/ +static void xio_read_rdma_read_ack_hdr(struct xio_rdma_transport *rdma_hndl, + struct xio_task *task, + struct xio_rdma_read_ack_hdr *rra) +{ + struct xio_rdma_read_ack_hdr *tmp_rra; + + /* goto to the first tlv and set the mbuf after tlv header */ + xio_mbuf_set_val_start(&task->mbuf); + + /* get the pointer */ + tmp_rra = (struct xio_rdma_read_ack_hdr *) + xio_mbuf_get_curr_ptr(&task->mbuf); + + /* pack relevant values */ + UNPACK_SVAL(tmp_rra, rra, hdr_len); + UNPACK_LVAL(tmp_rra, rra, rtid); + + xio_mbuf_inc(&task->mbuf, sizeof(*rra)); +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_on_recv_rdma_read_ack */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_on_recv_rdma_read_ack(struct xio_rdma_transport 
*rdma_hndl, + struct xio_task *task) +{ + struct xio_rdma_read_ack_hdr rra; + union xio_transport_event_data event_data; + struct xio_task *req_task; + + xio_read_rdma_read_ack_hdr(rdma_hndl, task, &rra); + + /* the rx task is returned back to pool */ + xio_tasks_pool_put(task); + + /* find the sender task */ + req_task = xio_rdma_primary_task_lookup(rdma_hndl, rra.rtid); + + event_data.msg.op = XIO_WC_OP_SEND; + event_data.msg.task = req_task; + + xio_transport_notify_observer(&rdma_hndl->base, + XIO_TRANSPORT_EVENT_SEND_COMPLETION, + &event_data); + + return 0; +} + +#ifndef XIO_SRQ_ENABLE +/*---------------------------------------------------------------------------*/ +/* xio_rdma_write_nop */ +/*---------------------------------------------------------------------------*/ +static void xio_rdma_write_nop(struct xio_rdma_transport *rdma_hndl, + struct xio_task *task, struct xio_nop_hdr *nop) +{ + struct xio_nop_hdr *tmp_nop; + + xio_mbuf_reset(&task->mbuf); + + /* set start of the tlv */ + if (xio_mbuf_tlv_start(&task->mbuf) != 0) + return; + + /* set the mbuf after tlv header */ + xio_mbuf_set_val_start(&task->mbuf); + + /* get the pointer */ + tmp_nop = (struct xio_nop_hdr *)xio_mbuf_get_curr_ptr(&task->mbuf); + + /* pack relevant values */ + PACK_SVAL(nop, tmp_nop, hdr_len); + PACK_SVAL(nop, tmp_nop, sn); + PACK_SVAL(nop, tmp_nop, ack_sn); + PACK_SVAL(nop, tmp_nop, credits); + tmp_nop->opcode = nop->opcode; + tmp_nop->flags = nop->flags; + +#ifdef EYAL_TODO + print_hex_dump_bytes("write_nop: ", DUMP_PREFIX_ADDRESS, + task->mbuf.tlv.head, + 64); +#endif + xio_mbuf_inc(&task->mbuf, sizeof(*nop)); +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_send_nop */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_send_nop(struct xio_rdma_transport *rdma_hndl) +{ + uint64_t payload; + struct xio_task *task; + struct xio_rdma_task *rdma_task; + struct xio_nop_hdr nop = { + .hdr_len = sizeof(nop), + .sn = rdma_hndl->sn, + .ack_sn = rdma_hndl->ack_sn, + .credits = rdma_hndl->credits, + .opcode = 0, + .flags = 0 + }; + + TRACE_LOG("SEND_NOP\n"); + + task = xio_rdma_primary_task_alloc(rdma_hndl); + if (!task) { + ERROR_LOG("primary tasks pool is empty\n"); + return -1; + } + task->omsg = NULL; + + task->tlv_type = XIO_CREDIT_NOP; + rdma_task = (struct xio_rdma_task *)task->dd_data; + + /* write the message */ + xio_rdma_write_nop(rdma_hndl, task, &nop); + rdma_hndl->sim_peer_credits += rdma_hndl->credits; + rdma_hndl->credits = 0; + + payload = xio_mbuf_tlv_payload_len(&task->mbuf); + + /* add tlv */ + if (xio_mbuf_write_tlv(&task->mbuf, task->tlv_type, payload) != 0) + return -1; + + /* set the length */ + rdma_task->txd.sge[0].length = xio_mbuf_data_length(&task->mbuf); + rdma_task->txd.send_wr.send_flags = + IBV_SEND_SIGNALED | IBV_SEND_FENCE; + if (rdma_task->txd.sge[0].length < (size_t)rdma_hndl->max_inline_data) + rdma_task->txd.send_wr.send_flags |= IBV_SEND_INLINE; + + rdma_task->txd.send_wr.next = NULL; + rdma_task->out_ib_op = XIO_IB_SEND; + rdma_task->txd.send_wr.num_sge = 1; + + rdma_hndl->rsps_in_flight_nr++; + list_add_tail(&task->tasks_list_entry, &rdma_hndl->in_flight_list); + + rdma_hndl->peer_credits--; + xio_post_send(rdma_hndl, &rdma_task->txd, 1); + + return 0; +} +#endif +/*---------------------------------------------------------------------------*/ +/* xio_rdma_read_nop */ +/*---------------------------------------------------------------------------*/ +static void 
xio_rdma_read_nop(struct xio_rdma_transport *rdma_hndl, + struct xio_task *task, struct xio_nop_hdr *nop) +{ + struct xio_nop_hdr *tmp_nop; + + /* goto to the first tlv and set the mbuf after tlv header */ + xio_mbuf_set_val_start(&task->mbuf); + + /* get the pointer */ + tmp_nop = (struct xio_nop_hdr *)xio_mbuf_get_curr_ptr(&task->mbuf); + + /* pack relevant values */ + UNPACK_SVAL(tmp_nop, nop, hdr_len); + UNPACK_SVAL(tmp_nop, nop, sn); + UNPACK_SVAL(tmp_nop, nop, ack_sn); + UNPACK_SVAL(tmp_nop, nop, credits); + nop->opcode = tmp_nop->opcode; + nop->flags = tmp_nop->flags; + +#ifdef EYAL_TODO + print_hex_dump_bytes("post_send: ", DUMP_PREFIX_ADDRESS, + task->mbuf.tlv.head, + 64); +#endif + xio_mbuf_inc(&task->mbuf, sizeof(*nop)); +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_on_recv_nop */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_on_recv_nop(struct xio_rdma_transport *rdma_hndl, + struct xio_task *task) +{ + struct xio_nop_hdr nop; + + TRACE_LOG("RECV_NOP\n"); + xio_rdma_read_nop(rdma_hndl, task, &nop); + + if (rdma_hndl->exp_sn == nop.sn) + rdma_hndl->peer_credits += nop.credits; + else + ERROR_LOG("ERROR: sn expected:%d, sn arrived:%d\n", + rdma_hndl->exp_sn, nop.sn); + + /* the rx task is returned back to pool */ + xio_tasks_pool_put(task); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_send_cancel */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_send_cancel(struct xio_rdma_transport *rdma_hndl, + uint32_t tlv_type, + struct xio_rdma_cancel_hdr *cancel_hdr, + void *ulp_msg, size_t ulp_msg_sz) +{ + uint64_t payload; + uint16_t ulp_hdr_len; + int retval; + struct xio_task *task; + struct xio_rdma_task *rdma_task; + void *buff; + + task = xio_rdma_primary_task_alloc(rdma_hndl); + if (!task) { + ERROR_LOG("primary tasks pool is empty\n"); + return -1; + } + xio_mbuf_reset(&task->mbuf); + + /* set start of the tlv */ + if (xio_mbuf_tlv_start(&task->mbuf) != 0) + return -1; + + task->tlv_type = tlv_type; + rdma_task = (struct xio_rdma_task *)task->dd_data; + rdma_task->out_ib_op = XIO_IB_SEND; + rdma_task->write_num_reg_mem = 0; + rdma_task->read_num_reg_mem = 0; + + ulp_hdr_len = sizeof(*cancel_hdr) + sizeof(uint16_t) + ulp_msg_sz; + rdma_hndl->dummy_msg.out.header.iov_base = ucalloc(1, ulp_hdr_len); + rdma_hndl->dummy_msg.out.header.iov_len = ulp_hdr_len; + + /* write the message */ + /* get the pointer */ + buff = rdma_hndl->dummy_msg.out.header.iov_base; + + /* pack relevant values */ + buff += xio_write_uint16(cancel_hdr->hdr_len, 0, (uint8_t *)buff); + buff += xio_write_uint16(cancel_hdr->sn, 0, (uint8_t *)buff); + buff += xio_write_uint32(cancel_hdr->result, 0, (uint8_t *)buff); + buff += xio_write_uint16((uint16_t)(ulp_msg_sz), 0, (uint8_t *)buff); + buff += xio_write_array((const uint8_t *)ulp_msg, ulp_msg_sz, 0, + (uint8_t *)buff); + + task->omsg = &rdma_hndl->dummy_msg; + + /* write xio header to the buffer */ + retval = xio_rdma_prep_req_header( + rdma_hndl, task, + ulp_hdr_len, 0, 0, + XIO_E_SUCCESS); + if (retval) + return -1; + + payload = xio_mbuf_tlv_payload_len(&task->mbuf); + + /* add tlv */ + if (xio_mbuf_write_tlv(&task->mbuf, task->tlv_type, payload) != 0) + return -1; + + /* set the length */ + rdma_task->txd.sge[0].length = xio_mbuf_data_length(&task->mbuf); + rdma_task->txd.send_wr.send_flags = IBV_SEND_SIGNALED; + if 
(rdma_task->txd.sge[0].length < (size_t)rdma_hndl->max_inline_data) + rdma_task->txd.send_wr.send_flags |= IBV_SEND_INLINE; + + rdma_task->txd.send_wr.next = NULL; + rdma_task->txd.send_wr.num_sge = 1; + + task->omsg = NULL; + free(rdma_hndl->dummy_msg.out.header.iov_base); + + rdma_hndl->tx_ready_tasks_num++; + list_move_tail(&task->tasks_list_entry, &rdma_hndl->tx_ready_list); + + xio_rdma_xmit(rdma_hndl); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_send */ +/*---------------------------------------------------------------------------*/ +int xio_rdma_send(struct xio_transport_base *transport, + struct xio_task *task) +{ + void *rdma_hndl = transport; + int retval = -1; + + switch (task->tlv_type) { + case XIO_NEXUS_SETUP_REQ: + retval = xio_rdma_send_setup_req( + (struct xio_rdma_transport *)rdma_hndl, task); + break; + case XIO_NEXUS_SETUP_RSP: + retval = xio_rdma_send_setup_rsp( + (struct xio_rdma_transport *)rdma_hndl, task); + break; + case XIO_MSG_TYPE_RDMA: + retval = xio_rdma_perform_direct_rdma( + (struct xio_rdma_transport *)rdma_hndl, task); + break; + default: + if (IS_REQUEST(task->tlv_type)) + retval = xio_rdma_send_req( + (struct xio_rdma_transport *)rdma_hndl, task); + else if (IS_RESPONSE(task->tlv_type)) + retval = xio_rdma_send_rsp( + (struct xio_rdma_transport *)rdma_hndl, task); + else + ERROR_LOG("unknown message type:0x%x\n", + task->tlv_type); + break; + } + + return retval; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_cancel_req_handler */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_cancel_req_handler(struct xio_rdma_transport *rdma_hndl, + struct xio_rdma_cancel_hdr *cancel_hdr, + void *ulp_msg, size_t ulp_msg_sz) +{ + union xio_transport_event_data event_data; + struct xio_task *ptask, *next_ptask; + struct xio_rdma_task *rdma_task; + int found = 0; + + /* start by looking for the task rdma_rd */ + list_for_each_entry_safe(ptask, next_ptask, + &rdma_hndl->rdma_rd_req_list, + tasks_list_entry) { + rdma_task = (struct xio_rdma_task *)ptask->dd_data; + if (rdma_task->phantom_idx == 0 && + rdma_task->sn == cancel_hdr->sn) { + TRACE_LOG("[%u] - message found on rdma_rd_list\n", + cancel_hdr->sn); + ptask->state = XIO_TASK_STATE_CANCEL_PENDING; + found = 1; + break; + } + } + if (!found) { + list_for_each_entry_safe(ptask, next_ptask, + &rdma_hndl->rdma_rd_req_in_flight_list, + tasks_list_entry) { + rdma_task = (struct xio_rdma_task *)ptask->dd_data; + if (rdma_task->phantom_idx == 0 && + rdma_task->sn == cancel_hdr->sn) { + TRACE_LOG("[%u] - message found on " \ + "rdma_rd_in_flight_list\n", + cancel_hdr->sn); + ptask->state = XIO_TASK_STATE_CANCEL_PENDING; + found = 1; + break; + } + } + } + if (!found) { + TRACE_LOG("[%u] - was not found\n", cancel_hdr->sn); + /* fill notification event */ + event_data.cancel.ulp_msg = ulp_msg; + event_data.cancel.ulp_msg_sz = ulp_msg_sz; + event_data.cancel.task = NULL; + event_data.cancel.result = (enum xio_status)0; + + xio_transport_notify_observer( + &rdma_hndl->base, + XIO_TRANSPORT_EVENT_CANCEL_REQUEST, + &event_data); + } + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_cancel_rsp_handler */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_cancel_rsp_handler(struct xio_rdma_transport *rdma_hndl, + struct xio_rdma_cancel_hdr 
*cancel_hdr, + void *ulp_msg, size_t ulp_msg_sz) +{ + union xio_transport_event_data event_data; + struct xio_task *ptask, *next_ptask; + struct xio_rdma_task *rdma_task; + struct xio_task *task_to_cancel = NULL; + + if ((cancel_hdr->result == XIO_E_MSG_CANCELED) || + (cancel_hdr->result == XIO_E_MSG_CANCEL_FAILED)) { + /* look in the in_flight */ + list_for_each_entry_safe(ptask, next_ptask, + &rdma_hndl->in_flight_list, + tasks_list_entry) { + rdma_task = (struct xio_rdma_task *)ptask->dd_data; + if (rdma_task->sn == cancel_hdr->sn) { + task_to_cancel = ptask; + break; + } + } + if (!task_to_cancel) { + /* look in the tx_comp */ + list_for_each_entry_safe(ptask, next_ptask, + &rdma_hndl->tx_comp_list, + tasks_list_entry) { + rdma_task = (struct xio_rdma_task *) + ptask->dd_data; + if (rdma_task->sn == cancel_hdr->sn) { + task_to_cancel = ptask; + break; + } + } + } + + if (!task_to_cancel) { + ERROR_LOG("[%u] - Failed to found canceled message\n", + cancel_hdr->sn); + /* fill notification event */ + event_data.cancel.ulp_msg = ulp_msg; + event_data.cancel.ulp_msg_sz = ulp_msg_sz; + event_data.cancel.task = NULL; + event_data.cancel.result = XIO_E_MSG_NOT_FOUND; + + xio_transport_notify_observer( + &rdma_hndl->base, + XIO_TRANSPORT_EVENT_CANCEL_RESPONSE, + &event_data); + return 0; + } + } + + /* fill notification event */ + event_data.cancel.ulp_msg = ulp_msg; + event_data.cancel.ulp_msg_sz = ulp_msg_sz; + event_data.cancel.task = task_to_cancel; + event_data.cancel.result = (enum xio_status)cancel_hdr->result; + + xio_transport_notify_observer(&rdma_hndl->base, + XIO_TRANSPORT_EVENT_CANCEL_RESPONSE, + &event_data); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_on_recv_cancel_rsp */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_on_recv_cancel_rsp(struct xio_rdma_transport *rdma_hndl, + struct xio_task *task) +{ + XIO_TO_RDMA_TASK(task, rdma_task); + + int retval = 0; + struct xio_rdma_rsp_hdr rsp_hdr; + struct xio_msg *imsg; + void *ulp_hdr; + void *buff; + uint16_t ulp_msg_sz; + struct xio_rdma_cancel_hdr cancel_hdr; + struct xio_sg_table_ops *sgtbl_ops; + void *sgtbl; + void *sg; + + /* read the response header */ + retval = xio_rdma_read_rsp_header(rdma_hndl, task, &rsp_hdr); + if (retval != 0) { + xio_set_error(XIO_E_MSG_INVALID); + goto cleanup; + } + /* update receive + send window */ + if (rdma_hndl->exp_sn == rsp_hdr.sn) { + rdma_hndl->exp_sn++; + rdma_hndl->ack_sn = rsp_hdr.sn; + rdma_hndl->peer_credits += rsp_hdr.credits; + } else { + ERROR_LOG("ERROR: expected sn:%d, arrived sn:%d\n", + rdma_hndl->exp_sn, rsp_hdr.sn); + } + /* read the sn */ + rdma_task->sn = rsp_hdr.sn; + + imsg = &task->imsg; + sgtbl = xio_sg_table_get(&imsg->in); + sgtbl_ops = (struct xio_sg_table_ops *) + xio_sg_table_ops_get(imsg->in.sgl_type); + sg = sge_first(sgtbl_ops, sgtbl); + + ulp_hdr = xio_mbuf_get_curr_ptr(&task->mbuf); + + imsg->type = (enum xio_msg_type)task->tlv_type; + imsg->in.header.iov_len = rsp_hdr.ulp_hdr_len; + imsg->in.header.iov_base = ulp_hdr; + sge_set_addr(sgtbl_ops, sg, NULL); + tbl_set_nents(sgtbl_ops, sgtbl, 0); + + buff = imsg->in.header.iov_base; + buff += xio_read_uint16(&cancel_hdr.hdr_len, 0, (const uint8_t *)buff); + buff += xio_read_uint16(&cancel_hdr.sn, 0, (const uint8_t *)buff); + buff += xio_read_uint32(&cancel_hdr.result, 0, (const uint8_t *)buff); + buff += xio_read_uint16(&ulp_msg_sz, 0, (const uint8_t *)buff); + + 
xio_rdma_cancel_rsp_handler(rdma_hndl, &cancel_hdr, + buff, ulp_msg_sz); + /* return the the cancel response task to pool */ + xio_tasks_pool_put(task); + + return 0; +cleanup: + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_on_recv_cancel_req */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_on_recv_cancel_req(struct xio_rdma_transport *rdma_hndl, + struct xio_task *task) +{ + XIO_TO_RDMA_TASK(task, rdma_task); + int retval = 0; + struct xio_rdma_cancel_hdr cancel_hdr; + struct xio_rdma_req_hdr req_hdr; + struct xio_msg *imsg; + void *ulp_hdr; + void *buff; + uint16_t ulp_msg_sz; + struct xio_sg_table_ops *sgtbl_ops; + void *sgtbl; + void *sg; + + /* read header */ + retval = xio_rdma_read_req_header(rdma_hndl, task, &req_hdr); + if (retval != 0) { + xio_set_error(XIO_E_MSG_INVALID); + goto cleanup; + } + if (rdma_hndl->exp_sn == req_hdr.sn) { + rdma_hndl->exp_sn++; + rdma_hndl->ack_sn = req_hdr.sn; + rdma_hndl->peer_credits += req_hdr.credits; + } else { + ERROR_LOG("ERROR: sn expected:%d, sn arrived:%d\n", + rdma_hndl->exp_sn, req_hdr.sn); + } + + /* read the sn */ + rdma_task->sn = req_hdr.sn; + + imsg = &task->imsg; + sgtbl = xio_sg_table_get(&imsg->in); + sgtbl_ops = (struct xio_sg_table_ops *) + xio_sg_table_ops_get(imsg->in.sgl_type); + sg = sge_first(sgtbl_ops, sgtbl); + + ulp_hdr = xio_mbuf_get_curr_ptr(&task->mbuf); + + /* set header pointers */ + imsg->type = (enum xio_msg_type)task->tlv_type; + imsg->in.header.iov_len = req_hdr.ulp_hdr_len; + imsg->in.header.iov_base = ulp_hdr; + sge_set_addr(sgtbl_ops, sg, NULL); + tbl_set_nents(sgtbl_ops, sgtbl, 0); + + buff = imsg->in.header.iov_base; + buff += xio_read_uint16(&cancel_hdr.hdr_len, 0, (const uint8_t *)buff); + buff += xio_read_uint16(&cancel_hdr.sn, 0, (const uint8_t *)buff); + buff += xio_read_uint32(&cancel_hdr.result, 0, (const uint8_t *)buff); + buff += xio_read_uint16(&ulp_msg_sz, 0, (const uint8_t *)buff); + + xio_rdma_cancel_req_handler(rdma_hndl, &cancel_hdr, + buff, ulp_msg_sz); + /* return the the cancel request task to pool */ + xio_tasks_pool_put(task); + + return 0; + +cleanup: + retval = xio_errno(); + ERROR_LOG("xio_rdma_on_recv_req failed. 
(errno=%d %s)\n", retval, + xio_strerror(retval)); + xio_transport_notify_observer_error(&rdma_hndl->base, retval); + + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_cancel_req */ +/*---------------------------------------------------------------------------*/ +int xio_rdma_cancel_req(struct xio_transport_base *transport, + struct xio_msg *req, uint64_t stag, + void *ulp_msg, size_t ulp_msg_sz) +{ + struct xio_rdma_transport *rdma_hndl = + (struct xio_rdma_transport *)transport; + struct xio_task *ptask, *next_ptask; + union xio_transport_event_data event_data; + struct xio_rdma_task *rdma_task; + struct xio_rdma_cancel_hdr cancel_hdr = { + .hdr_len = sizeof(cancel_hdr), + .result = 0 + }; + + /* look in the tx_ready */ + list_for_each_entry_safe(ptask, next_ptask, &rdma_hndl->tx_ready_list, + tasks_list_entry) { + if (ptask->omsg && + (ptask->omsg->sn == req->sn) && + (ptask->stag == stag)) { + TRACE_LOG("[%lu] - message found on tx_ready_list\n", + req->sn); + + /* return decrease ref count from task */ + xio_tasks_pool_put(ptask); + rdma_hndl->tx_ready_tasks_num--; + list_move_tail(&ptask->tasks_list_entry, + &rdma_hndl->tx_comp_list); + + /* fill notification event */ + event_data.cancel.ulp_msg = ulp_msg; + event_data.cancel.ulp_msg_sz = ulp_msg_sz; + event_data.cancel.task = ptask; + event_data.cancel.result = XIO_E_MSG_CANCELED; + + xio_transport_notify_observer( + &rdma_hndl->base, + XIO_TRANSPORT_EVENT_CANCEL_RESPONSE, + &event_data); + return 0; + } + } + /* look in the in_flight */ + list_for_each_entry_safe(ptask, next_ptask, &rdma_hndl->in_flight_list, + tasks_list_entry) { + if (ptask->omsg && + (ptask->omsg->sn == req->sn) && + (ptask->stag == stag) && + (ptask->state != XIO_TASK_STATE_RESPONSE_RECV)) { + TRACE_LOG("[%lu] - message found on in_flight_list\n", + req->sn); + + rdma_task = (struct xio_rdma_task *)ptask->dd_data; + cancel_hdr.sn = rdma_task->sn; + + xio_rdma_send_cancel(rdma_hndl, XIO_CANCEL_REQ, + &cancel_hdr, + ulp_msg, ulp_msg_sz); + return 0; + } + } + /* look in the tx_comp */ + list_for_each_entry_safe(ptask, next_ptask, &rdma_hndl->tx_comp_list, + tasks_list_entry) { + if (ptask->omsg && + (ptask->omsg->sn == req->sn) && + (ptask->stag == stag) && + (ptask->state != XIO_TASK_STATE_RESPONSE_RECV)) { + TRACE_LOG("[%lu] - message found on tx_comp_list\n", + req->sn); + rdma_task = (struct xio_rdma_task *)ptask->dd_data; + cancel_hdr.sn = rdma_task->sn; + + xio_rdma_send_cancel(rdma_hndl, XIO_CANCEL_REQ, + &cancel_hdr, + ulp_msg, ulp_msg_sz); + return 0; + } + } + TRACE_LOG("[%lu] - message not found on tx path\n", req->sn); + + /* fill notification event */ + event_data.cancel.ulp_msg = ulp_msg; + event_data.cancel.ulp_msg_sz = ulp_msg_sz; + event_data.cancel.task = NULL; + event_data.cancel.result = XIO_E_MSG_NOT_FOUND; + + xio_transport_notify_observer(&rdma_hndl->base, + XIO_TRANSPORT_EVENT_CANCEL_RESPONSE, + &event_data); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_cancel_rsp */ +/*---------------------------------------------------------------------------*/ +int xio_rdma_cancel_rsp(struct xio_transport_base *transport, + struct xio_task *task, enum xio_status result, + void *ulp_msg, size_t ulp_msg_sz) +{ + struct xio_rdma_transport *rdma_hndl = + (struct xio_rdma_transport *)transport; + struct xio_rdma_task *rdma_task; + + struct xio_rdma_cancel_hdr cancel_hdr = { + .hdr_len = sizeof(cancel_hdr), + .result = result, + }; 
+ + if (task) { + rdma_task = (struct xio_rdma_task *)task->dd_data; + cancel_hdr.sn = rdma_task->sn; + } else { + cancel_hdr.sn = 0; + } + + /* fill dummy transport header since was handled by upper layer + */ + return xio_rdma_send_cancel(rdma_hndl, XIO_CANCEL_RSP, + &cancel_hdr, ulp_msg, ulp_msg_sz); +} diff --git a/open_src/xio/src/usr/transport/rdma/xio_rdma_management.c b/open_src/xio/src/usr/transport/rdma/xio_rdma_management.c new file mode 100644 index 0000000..25509a3 --- /dev/null +++ b/open_src/xio/src/usr/transport/rdma/xio_rdma_management.c @@ -0,0 +1,3730 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ +#include +#include +#include + +#include "libxio.h" +#include "xio_log.h" +#include "xio_common.h" +#include "xio_observer.h" +#include "xio_protocol.h" +#include "xio_mbuf.h" +#include "xio_task.h" +#include "xio_usr_transport.h" +#include "xio_transport.h" +#include "xio_protocol.h" +#include "get_clock.h" +#include "xio_mem.h" +#include "xio_mempool.h" +#include "xio_rdma_utils.h" +#include "xio_ev_data.h" +#include "xio_ev_loop.h" +#include "xio_sg_table.h" +#include "xio_objpool.h" +#include "xio_workqueue.h" +#include "xio_context.h" +#include "xio_rdma_transport.h" +#include "xio_context_priv.h" + +/* default option values */ +#define XIO_OPTVAL_DEF_ENABLE_MEM_POOL 1 +#define XIO_OPTVAL_DEF_ENABLE_DMA_LATENCY 0 +#define XIO_OPTVAL_DEF_MAX_IN_IOVSZ XIO_IOVLEN +#define XIO_OPTVAL_DEF_MAX_OUT_IOVSZ XIO_IOVLEN +#define XIO_OPTVAL_DEF_QP_CAP_MAX_INLINE_DATA (200) + +/*---------------------------------------------------------------------------*/ +/* globals */ +/*---------------------------------------------------------------------------*/ +static spinlock_t mngmt_lock; +static pthread_rwlock_t dev_lock; +static pthread_rwlock_t cm_lock; +static pthread_once_t ctor_key_once = PTHREAD_ONCE_INIT; +static pthread_once_t dtor_key_once = PTHREAD_ONCE_INIT; + +spinlock_t dev_list_lock; /* devices list lock */ +LIST_HEAD(dev_list); +LIST_HEAD(dev_del_list); +static LIST_HEAD(cm_list); + +static struct xio_dev_tdata dev_tdata; + +static int cdl_fd = -1; + +static int rdma_num_devices; /*= 0;*/ + +/* rdma options */ +struct xio_rdma_options rdma_options = { + .enable_mem_pool = XIO_OPTVAL_DEF_ENABLE_MEM_POOL, + .enable_dma_latency = XIO_OPTVAL_DEF_ENABLE_DMA_LATENCY, + .max_in_iovsz = XIO_OPTVAL_DEF_MAX_IN_IOVSZ, + .max_out_iovsz = XIO_OPTVAL_DEF_MAX_OUT_IOVSZ, + .qp_cap_max_inline_data = XIO_OPTVAL_DEF_QP_CAP_MAX_INLINE_DATA, +}; + +/*---------------------------------------------------------------------------*/ +/* forward declaration */ +/*---------------------------------------------------------------------------*/ +static struct xio_transport_base *xio_rdma_open( + struct xio_transport *transport, + struct xio_context *ctx, + struct xio_observer *observer, + uint32_t trans_attr_mask, + struct xio_transport_init_attr *attr); + +static int xio_rdma_reject(struct xio_transport_base *transport); +static void xio_rdma_close(struct xio_transport_base *transport); +static struct xio_cm_channel *xio_cm_channel_get(struct xio_context *ctx); +static void xio_rdma_post_close(struct xio_transport_base *trans_hndl); +static int xio_rdma_flush_all_tasks(struct xio_rdma_transport *rdma_hndl); +static void xio_device_release(struct xio_device *dev); + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_get_max_header_size */ +/*---------------------------------------------------------------------------*/ +int xio_rdma_get_max_header_size(void) +{ + int req_hdr = XIO_TRANSPORT_OFFSET + sizeof(struct xio_rdma_req_hdr); + int rsp_hdr = XIO_TRANSPORT_OFFSET + sizeof(struct xio_rdma_rsp_hdr); + int iovsz = rdma_options.max_out_iovsz + rdma_options.max_in_iovsz; + + req_hdr += iovsz * sizeof(struct xio_sge); + rsp_hdr += rdma_options.max_out_iovsz * sizeof(struct xio_sge); + + return max(req_hdr, rsp_hdr); +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_get_inline_buffer_size */ +/*---------------------------------------------------------------------------*/ +int xio_rdma_get_inline_buffer_size(void) +{ + int 
inline_buf_sz = xio_rdma_get_max_header_size() + + g_options.max_inline_xio_hdr + + g_options.max_inline_xio_data; + inline_buf_sz = ALIGN(inline_buf_sz, 1024); + + return inline_buf_sz; +} + +/*---------------------------------------------------------------------------*/ +/* xio_async_ev_handler */ +/*---------------------------------------------------------------------------*/ +static void xio_async_ev_handler(int fd, int events, void *user_context) +{ + char *dev_name = NULL; + struct ibv_async_event async_event; + struct xio_device *dev = (struct xio_device *)user_context; + + dev_name = dev->verbs->device->name; + + while (1) { + if (ibv_get_async_event(dev->verbs, &async_event)) { + if (errno == EAGAIN) + return; + + xio_set_error(errno); + ERROR_LOG("ibv_get_async_event failed. (errno=%d %m)\n", + errno); + return; + } + if (async_event.event_type == IBV_EVENT_QP_LAST_WQE_REACHED) { + DEBUG_LOG("ibv_get_async_event: dev:%s evt: %s\n", + dev_name, + ibv_event_type_str(async_event.event_type)); + } else { + ERROR_LOG("ibv_get_async_event: dev:%s evt: %s\n", + dev_name, + ibv_event_type_str(async_event.event_type)); + + if (async_event.event_type == IBV_EVENT_COMM_EST) { + struct xio_rdma_transport *rdma_hndl; + + rdma_hndl = (struct xio_rdma_transport *) + async_event.element.qp->qp_context; + /* force "connection established" event */ + rdma_notify(rdma_hndl->cm_id, + IBV_EVENT_COMM_EST); + } + } + ibv_ack_async_event(&async_event); + } +} + +/*---------------------------------------------------------------------------*/ +/* device thread callback */ +/*---------------------------------------------------------------------------*/ +static void *device_thread_cb(void *data) +{ + cpu_set_t cpuset; + pthread_t thread; + + /* set affinity to thread */ + thread = pthread_self(); + + CPU_ZERO(&cpuset); + CPU_SET(0, &cpuset); /* bind the devices thread to first core */ + + pthread_setaffinity_np(thread, sizeof(cpu_set_t), &cpuset); + + /* the default xio supplied main loop */ + xio_ev_loop_run(dev_tdata.async_loop); + + /* normal exit phase */ + TRACE_LOG("devices thread exit signaled\n"); + + /* destroy the default loop */ + xio_ev_loop_destroy(dev_tdata.async_loop); + dev_tdata.async_loop = NULL; + + return NULL; +} + +/*---------------------------------------------------------------------------*/ +/* xio_device_thread_init */ +/*---------------------------------------------------------------------------*/ +static int xio_device_thread_init(void) +{ + int ret; + + /* open default event loop */ + dev_tdata.async_loop = xio_ev_loop_create(); + if (!dev_tdata.async_loop) { + ERROR_LOG("xio_ev_loop_init failed\n"); + return -1; + } + ret = pthread_create(&dev_tdata.dev_thread, NULL, + device_thread_cb, NULL); + if (ret < 0) { + ERROR_LOG("pthread_create failed. 
%m\n"); + /* destroy the default loop */ + xio_ev_loop_destroy(dev_tdata.async_loop); + dev_tdata.async_loop = NULL; + return -1; + } + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_device_thread_stop */ +/*---------------------------------------------------------------------------*/ +static void xio_device_thread_stop(void) +{ + xio_ev_loop_stop(dev_tdata.async_loop); + + pthread_join(dev_tdata.dev_thread, NULL); +} + +/*---------------------------------------------------------------------------*/ +/* xio_device_thread_add_device */ +/*---------------------------------------------------------------------------*/ +int xio_device_thread_add_device(struct xio_device *dev) +{ + int retval; + + retval = fcntl(dev->verbs->async_fd, F_GETFL, 0); + if (retval != -1) { + retval = fcntl(dev->verbs->async_fd, F_SETFL, + retval | O_NONBLOCK); + } + if (retval == -1) { + xio_set_error(errno); + ERROR_LOG("fcntl failed. (errno=%d %m)\n", errno); + return -1; + } + + /* add to epoll */ + retval = xio_ev_loop_add( + dev_tdata.async_loop, + dev->verbs->async_fd, + XIO_POLLIN, + xio_async_ev_handler, + dev); + if (retval != 0) { + xio_set_error(errno); + ERROR_LOG("ev_loop_add failed. (errno=%d %m)\n", errno); + return -1; + } + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_device_thread_remove_device */ +/*---------------------------------------------------------------------------*/ +int xio_device_thread_remove_device(struct xio_device *dev) +{ + if (dev_tdata.async_loop) + return xio_ev_loop_del( + dev_tdata.async_loop, + dev->verbs->async_fd); + return 0; +} + +#ifdef HAVE_IBV_MODIFY_CQ +/*---------------------------------------------------------------------------*/ +/* xio_cq_modify - use to throttle rates */ +/*---------------------------------------------------------------------------*/ +static int xio_cq_modify(struct xio_cq *tcq, int cq_count, int cq_pariod) +{ + struct ibv_cq_attr cq_attr; + int retval; + + memset(&cq_attr, 0, sizeof(cq_attr)); + + cq_attr.comp_mask = IBV_CQ_ATTR_MODERATION; + cq_attr.moderation.cq_count = cq_count; + cq_attr.moderation.cq_period = cq_pariod; + + retval = ibv_modify_cq(tcq->cq, &cq_attr, + IBV_CQ_MODERATION); + if (unlikely(retval)) + ERROR_LOG("ibv_modify_cq failed. (errno=%d %m)\n", errno); + + return retval; +} +#endif + +#ifdef XIO_SRQ_ENABLE +/*---------------------------------------------------------------------------*/ +/* xio_srq_get */ +/*---------------------------------------------------------------------------*/ +static struct xio_srq *xio_srq_get(struct xio_rdma_transport *rdma_hndl, + struct xio_cq *tcq) +{ + struct xio_srq *srq; + struct ibv_srq_init_attr srq_init_attr; + + if (tcq->srq) + return tcq->srq; + srq = (struct xio_srq *)ucalloc(1, sizeof(struct xio_srq)); + if (!srq) { + xio_set_error(ENOMEM); + ERROR_LOG("ucalloc failed. 
%m\n"); + return NULL; + } + + memset(&srq_init_attr, 0, sizeof(srq_init_attr)); + + srq_init_attr.attr.max_wr = SRQ_DEPTH; + srq_init_attr.attr.max_sge = 1; + + srq->srq = ibv_create_srq(rdma_hndl->dev->pd, &srq_init_attr); + if (!srq->srq) { + xio_set_error(errno); + ERROR_LOG("creation of shared receive queue failed " \ + "(errno=%d %m)\n", errno); + goto cleanup; + } + + HT_INIT(&srq->ht_rdma_hndl, xio_int32_hash, xio_int32_cmp, + xio_int32_cp); + INIT_LIST_HEAD(&srq->rx_list); + + tcq->srq = srq; + return srq; + +cleanup: + free(srq); + return NULL; +} + +/*---------------------------------------------------------------------------*/ +/* xio_srq_destroy */ +/*---------------------------------------------------------------------------*/ +static int xio_srq_destroy(struct xio_srq *srq) +{ + if (!list_empty(&srq->rx_list)) { + TRACE_LOG("rx_list not empty!\n"); + xio_transport_flush_task_list(&srq->rx_list); + } + if (ibv_destroy_srq(srq->srq)) { + ERROR_LOG("ibv_destroy_srq failed\n"); + return -1; + } + free(srq); + return 0; +} +#endif + +/*---------------------------------------------------------------------------*/ +/* xio_cq_down */ +/*---------------------------------------------------------------------------*/ +static void xio_cq_down(struct kref *kref) +{ + struct xio_cq *tcq = container_of(kref, struct xio_cq, kref); + int retval; + + pthread_rwlock_wrlock(&tcq->dev->cq_lock); + list_del(&tcq->cq_list_entry); + pthread_rwlock_unlock(&tcq->dev->cq_lock); + + if (!list_empty(&tcq->trans_list)) + ERROR_LOG("rdma_hndl memory leakage\n"); + + xio_context_disable_event(&tcq->consume_cq_event); + xio_context_disable_event(&tcq->poll_cq_event); + + xio_context_unreg_observer(tcq->ctx, &tcq->observer); + +#ifdef XIO_SRQ_ENABLE + xio_srq_destroy(tcq->srq); +#endif + + if (tcq->cq_events_that_need_ack != 0) { + ibv_ack_cq_events(tcq->cq, + tcq->cq_events_that_need_ack); + tcq->cq_events_that_need_ack = 0; + } + + retval = xio_context_del_ev_handler( + tcq->ctx, + tcq->channel->fd); + if (retval) + ERROR_LOG("ev_loop_del_cb failed. (errno=%d %m)\n", + errno); + + /* the event loop may be release by the time this function is called */ + retval = ibv_destroy_cq(tcq->cq); + if (retval) + ERROR_LOG("ibv_destroy_cq failed. (errno=%d %m)\n", errno); + + retval = ibv_destroy_comp_channel(tcq->channel); + if (retval) + ERROR_LOG("ibv_destroy_comp_channel failed. 
(errno=%d %m)\n", + errno); + + XIO_OBSERVER_DESTROY(&tcq->observer); + + ufree(tcq->wc_array); + ufree(tcq); +} + +/*---------------------------------------------------------------------------*/ +/* xio_cq_release */ +/*---------------------------------------------------------------------------*/ +static inline void xio_cq_release(struct xio_cq *tcq) +{ + kref_put(&tcq->kref, xio_cq_down); +} + +/*---------------------------------------------------------------------------*/ +/* xio_on_context_event */ +/*---------------------------------------------------------------------------*/ +static int xio_on_context_event(void *observer, void *sender, + int event, void *event_data) +{ + struct xio_cq *cq = (struct xio_cq *)observer; + + if (event == XIO_CONTEXT_EVENT_POST_CLOSE) { + TRACE_LOG("context: [close] ctx:%p\n", sender); + xio_cq_release(cq); + } + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_cq_create */ +/*---------------------------------------------------------------------------*/ +static struct xio_cq *xio_cq_get(struct xio_device *dev, + struct xio_context *ctx) +{ + struct xio_cq *tcq; + int retval; + int comp_vec = 0; + int alloc_sz; +#ifdef HAVE_IBV_MODIFY_CQ + int throttle = 0; +#endif + + list_for_each_entry(tcq, &dev->cq_list, cq_list_entry) { + if (tcq->ctx == ctx) { + kref_get(&tcq->kref); + return tcq; + } + } + tcq = (struct xio_cq *)ucalloc(1, sizeof(struct xio_cq)); + if (!tcq) { + xio_set_error(ENOMEM); + ERROR_LOG("ucalloc failed. %m\n"); + goto cleanup; + } + tcq->ctx = ctx; + + tcq->wc_array_len = MAX_POLL_WC; + /* allocate device wc array */ + tcq->wc_array = (struct ibv_wc *)ucalloc(tcq->wc_array_len, + sizeof(struct ibv_wc)); + if (!tcq->wc_array) { + xio_set_error(errno); + ERROR_LOG("ev_loop_add failed. (errno=%d %m)\n", errno); + goto cleanup1; + } + + tcq->alloc_sz = min(dev->device_attr.max_cqe, CQE_ALLOC_SIZE); + tcq->max_cqe = dev->device_attr.max_cqe; + alloc_sz = tcq->alloc_sz; + + /* set com_vector to cpu */ + comp_vec = ctx->cpuid % dev->verbs->num_comp_vectors; + + tcq->channel = ibv_create_comp_channel(dev->verbs); + if (!tcq->channel) { + xio_set_error(errno); + ERROR_LOG("ibv_create_comp_channel failed. (errno=%d %m)\n", + errno); + goto cleanup2; + } + retval = fcntl(tcq->channel->fd, F_GETFL, 0); + if (retval != -1) { + retval = fcntl(tcq->channel->fd, F_SETFL, + retval | O_NONBLOCK); + } + if (retval == -1) { + xio_set_error(errno); + ERROR_LOG("fcntl failed. (errno=%d %m)\n", errno); + goto cleanup2; + } + + /* add to epoll */ + retval = xio_context_add_ev_handler( + ctx, + tcq->channel->fd, + XIO_POLLIN, + xio_cq_event_handler, + tcq); + if (retval) { + xio_set_error(errno); + ERROR_LOG("ev_loop_add_cb failed. (errno=%d %m)\n", errno); + goto cleanup3; + } + + tcq->cq = ibv_create_cq(dev->verbs, alloc_sz, tcq, + tcq->channel, comp_vec); + TRACE_LOG("comp_vec:%d\n", comp_vec); + if (!tcq->cq) { + xio_set_error(errno); + ERROR_LOG("ibv_create_cq failed. (errno=%d %m)\n", errno); + if (errno == ENOMEM) + xio_validate_ulimit_memlock(); + goto cleanup4; + } + +#ifdef HAVE_IBV_MODIFY_CQ + if (throttle) + retval = xio_cq_modify(tcq, 5, 5); +#endif + + retval = ibv_req_notify_cq(tcq->cq, 0); + if (retval) { + xio_set_error(errno); + ERROR_LOG("ibv_req_notify_cq failed. 
(errno=%d %m)\n", + errno); + goto cleanup5; + } + + /* set cq depth params */ + tcq->dev = dev; + tcq->cq_depth = tcq->cq->cqe; + tcq->cqe_avail = tcq->cq->cqe; + + INIT_LIST_HEAD(&tcq->trans_list); + + list_add(&tcq->cq_list_entry, &dev->cq_list); + + /* One reference count for the context and one for the rdma handle */ + kref_init(&tcq->kref); + kref_get(&tcq->kref); + + /* set the tcq to be the observer for context events */ + XIO_OBSERVER_INIT(&tcq->observer, tcq, xio_on_context_event); + xio_context_reg_observer(ctx, &tcq->observer); + + xio_context_set_poll_completions_fn( + ctx, + (poll_completions_fn_t)xio_rdma_poll_completions, + tcq); + + return tcq; + +cleanup5: + retval = ibv_destroy_cq(tcq->cq); + if (retval) + ERROR_LOG("ibv_destroy_cq failed. (errno=%d %m)\n", errno); +cleanup4: + (void)xio_context_del_ev_handler(ctx, tcq->channel->fd); +cleanup3: + retval = ibv_destroy_comp_channel(tcq->channel); + if (retval) + ERROR_LOG("ibv_destroy_comp_channel failed. (errno=%d %m)\n", + errno); +cleanup2: + ufree(tcq->wc_array); +cleanup1: + ufree(tcq); +cleanup: + return NULL; +} + +/*---------------------------------------------------------------------------*/ +/* xio_device_init */ +/*---------------------------------------------------------------------------*/ +static struct xio_device *xio_device_init(struct ibv_context *ib_ctx) +{ + struct xio_device *dev; + int retval; + + dev = (struct xio_device *)ucalloc(1, sizeof(*dev)); + if (!dev) { + xio_set_error(errno); + ERROR_LOG("ucalloc failed. (errno=%d %m)\n", errno); + return NULL; + } + dev->verbs = ib_ctx; + + dev->pd = ibv_alloc_pd(dev->verbs); + if (!dev->pd) { + xio_set_error(errno); + ERROR_LOG("ibv_alloc_pd failed. (errno=%d %m)\n", errno); + goto cleanup; + } + retval = ibv_xio_query_device(dev->verbs, &dev->device_attr); + if (retval < 0) { + ERROR_LOG("ibv_query_device failed. (errno=%d %m)\n", errno); + goto cleanup1; + } + + retval = xio_device_thread_add_device(dev); + if (retval) { + ERROR_LOG( + "xio_device_thread_add_device failed. 
(errno=%d %m)\n", + errno); + goto cleanup1; + } + + INIT_LIST_HEAD(&dev->cq_list); + /* Initialize list of MR for this device */ + INIT_LIST_HEAD(&dev->xm_list); + INIT_LIST_HEAD(&dev->dev_list_entry); + pthread_rwlock_init(&dev->cq_lock, NULL); + kref_init(&dev->kref); + TRACE_LOG("rdma device: [new] %p\n", dev); + + return dev; + +cleanup1: + ibv_dealloc_pd(dev->pd); +cleanup: + ufree(dev); + + ERROR_LOG("rdma device: [new] failed\n"); + return NULL; +} + +/*---------------------------------------------------------------------------*/ +/* xio_device_lookup */ +/*---------------------------------------------------------------------------*/ +static struct xio_device *xio_device_lookup(struct ibv_context *verbs) +{ + struct xio_device *dev; + + /* Actually we should compare GUID(s) assume device is released and + * a new device gets the memory allocated for the old one + */ + spin_lock(&dev_list_lock); + /* Loop on known devices (need locking) */ + list_for_each_entry(dev, &dev_list, dev_list_entry) { + if (dev->verbs == verbs) { + /* increment device reference count */ + xio_device_get(dev); + spin_unlock(&dev_list_lock); + return dev; + } + } + spin_unlock(&dev_list_lock); + + return NULL; +} + +/*---------------------------------------------------------------------------*/ +/* xio_device_lookup_init */ +/*---------------------------------------------------------------------------*/ +static struct xio_device *xio_device_lookup_init(struct ibv_context *verbs) +{ + struct xio_device *dev; + + if (!verbs) { + xio_set_error(ENODEV); + ERROR_LOG("NULL ibv_context\n"); + return NULL; + } + + dev = xio_device_lookup(verbs); + if (dev) + goto exit; + + /* Connection on new device */ + TRACE_LOG("Connection via new device %s\n", + ibv_get_device_name(verbs->device)); + + dev = xio_device_init(verbs); + if (!dev) { + ERROR_LOG("Couldn't allocate device %s\n", + ibv_get_device_name(verbs->device)); + goto cleanup0; + } + + /* Update all MR with new device */ + if (xio_reg_mr_add_dev(dev)) { + ERROR_LOG("Couldn't allocate device %s\n", + ibv_get_device_name(verbs->device)); + goto cleanup1; + } + + /* Add reference count on behalf of the new connection */ + xio_device_get(dev); + + /* Add the new device */ + spin_lock(&dev_list_lock); + list_add(&dev->dev_list_entry, &dev_list); + spin_unlock(&dev_list_lock); + +exit: + return dev; + +cleanup1: + xio_device_release(dev); + +cleanup0: + return NULL; +} + +/*---------------------------------------------------------------------------*/ +/* xio_device_down */ +/*---------------------------------------------------------------------------*/ +void xio_device_down(struct kref *kref) +{ + struct xio_device *dev = container_of(kref, struct xio_device, kref); + int retval; + + spin_lock(&dev_list_lock); + list_del(&dev->dev_list_entry); + spin_unlock(&dev_list_lock); + + xio_dereg_mr_by_dev(dev); + + retval = ibv_dealloc_pd(dev->pd); + if (retval) + ERROR_LOG("ibv_dealloc_pd failed. (errno=%d %s)\n", + retval, strerror(retval)); + + ufree(dev); +} + +/*---------------------------------------------------------------------------*/ +/* xio_device_release */ +/*---------------------------------------------------------------------------*/ +static void xio_device_release(struct xio_device *dev) +{ + int retval; + + TRACE_LOG("rdma device: [close] dev:%p\n", dev); + + retval = xio_device_thread_remove_device(dev); + if (retval) { + ERROR_LOG( + "xio_device_thread_add_device failed. 
(errno=%d %m)\n", + errno); + } + + /* don't delete the fd - the loop may not exist at this stage */ + if (!list_empty(&dev->cq_list)) + ERROR_LOG("cq memory leakage\n"); + + pthread_rwlock_destroy(&dev->cq_lock); + + spin_lock(&dev_list_lock); + list_move_tail(&dev->dev_list_entry, &dev_del_list); + spin_unlock(&dev_list_lock); + + /* ibv_dealloc_pd will be called from xio_device_down (kerf) */ + xio_device_put(dev); +} + +/*---------------------------------------------------------------------------*/ +/* xio_device_list_check */ +/*---------------------------------------------------------------------------*/ +static void xio_device_list_check(void) +{ + struct ibv_context **ctx_list; + int num_devices = 0; + + rdma_num_devices = 0; + + ctx_list = rdma_get_devices(&num_devices); + if (!ctx_list) + return; + + if (!*ctx_list || num_devices == 0) + goto exit; + + rdma_num_devices = num_devices; +exit: + rdma_free_devices(ctx_list); +} + +/*---------------------------------------------------------------------------*/ +/* xio_device_list_init */ +/*---------------------------------------------------------------------------*/ +static int xio_device_list_init(void) +{ + struct ibv_context **ctx_list; + struct xio_device *dev; + int num_devices = 0, i; + int retval = 0; + + INIT_LIST_HEAD(&dev_list); + + rdma_num_devices = 0; + + ctx_list = rdma_get_devices(&num_devices); + if (!ctx_list) { + xio_set_error(errno); + ERROR_LOG("Failed to get IB devices list\n"); + return -1; + } + + if (!*ctx_list) { + xio_set_error(ENODEV); + ERROR_LOG("No IB devices found\n"); + retval = -1; + goto exit; + } + + rdma_num_devices = num_devices; + + for (i = 0; i < num_devices; ++i) { + dev = xio_device_init(ctx_list[i]); + if (!dev) { + ERROR_LOG("Couldn't allocate device %s\n", + ibv_get_device_name(ctx_list[i]->device)); + retval = -1; + goto exit; + } + pthread_rwlock_wrlock(&dev_lock); + list_add(&dev->dev_list_entry, &dev_list); + pthread_rwlock_unlock(&dev_lock); + } + +exit: + rdma_free_devices(ctx_list); + + return retval; +} + +/*---------------------------------------------------------------------------*/ +/* xio_device_list_release */ +/*---------------------------------------------------------------------------*/ +static void xio_device_list_release(void) +{ + struct xio_device *dev, *next; + + /* free devices */ + pthread_rwlock_wrlock(&dev_lock); + list_for_each_entry_safe(dev, next, &dev_list, dev_list_entry) { + /* xio_device_release needs to do list_move -> _init */ + list_del_init(&dev->dev_list_entry); + xio_device_release(dev); + } + pthread_rwlock_unlock(&dev_lock); +} + +/*---------------------------------------------------------------------------*/ +/* xio_cm_channel_down */ +/*---------------------------------------------------------------------------*/ +void xio_cm_channel_down(struct kref *kref) +{ + struct xio_cm_channel *channel = + container_of(kref, struct xio_cm_channel, kref); + + pthread_rwlock_wrlock(&cm_lock); + list_del(&channel->channels_list_entry); + pthread_rwlock_unlock(&cm_lock); + (void)xio_context_del_ev_handler(channel->ctx, channel->cm_channel->fd); + rdma_destroy_event_channel(channel->cm_channel); + ufree(channel); +} + +/*---------------------------------------------------------------------------*/ +/* xio_cm_channel_release */ +/*---------------------------------------------------------------------------*/ +static inline void xio_cm_channel_release(struct xio_cm_channel *channel) +{ + kref_put(&channel->kref, xio_cm_channel_down); +} + 
+/*---------------------------------------------------------------------------*/ +/* xio_rdma_context_shutdown */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_context_shutdown(struct xio_transport_base *trans_hndl, + struct xio_context *ctx) +{ + struct xio_rdma_transport *rdma_hndl = + (struct xio_rdma_transport *)trans_hndl; + + DEBUG_LOG("context: [shutdown] trans_hndl:%p\n", trans_hndl); + + /*due to long timewait - force ignoring */ + rdma_hndl->ignore_timewait = 1; + rdma_hndl->ignore_disconnect = 1; + + xio_context_destroy_wait(ctx); + xio_rdma_close(trans_hndl); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_cq_alloc_slots */ +/*---------------------------------------------------------------------------*/ +int xio_cq_alloc_slots(struct xio_cq *tcq, int cqe_num) +{ + if (cqe_num < tcq->cqe_avail) { + tcq->cqe_avail -= cqe_num; + return 0; + } else if (tcq->cq_depth + tcq->alloc_sz < tcq->max_cqe) { + int cqe = tcq->cq->cqe; + int retval = ibv_resize_cq(tcq->cq, + (tcq->cq_depth + tcq->alloc_sz)); + if (retval != 0 || (cqe == tcq->cq->cqe)) { + ERROR_LOG("ibv_resize_cq failed. %m, cqe:%d\n", cqe); + return -1; + } + tcq->cq_depth += (tcq->cq->cqe - cqe); + tcq->cqe_avail += (tcq->cq->cqe - cqe); + DEBUG_LOG("cq_resize: expected:%d, actual:%d\n", + tcq->cq_depth, tcq->cq->cqe); + tcq->cqe_avail -= cqe_num; + return 0; + } + ERROR_LOG("cq overflow reached\n"); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_cq_free_slots */ +/*---------------------------------------------------------------------------*/ +static int xio_cq_free_slots(struct xio_cq *tcq, int cqe_num) +{ + if (tcq->cqe_avail + cqe_num <= tcq->cq_depth) { + tcq->cqe_avail += cqe_num; + return 0; + } + ERROR_LOG("cq allocation error"); + + return 0; +} + +#ifdef XIO_SRQ_ENABLE +/*---------------------------------------------------------------------------*/ +/* xio_srq_qp_added */ +/*---------------------------------------------------------------------------*/ +static void xio_srq_qp_added(struct xio_rdma_transport *rdma_hndl, + struct xio_srq *srq) +{ + HT_INSERT(&srq->ht_rdma_hndl, &rdma_hndl->qp->qp_num, rdma_hndl, + rdma_hndl_htbl); + DEBUG_LOG("adding rdma hndl %p with id %d\n", rdma_hndl, + rdma_hndl->qp->qp_num); +} + +/*---------------------------------------------------------------------------*/ +/* xio_srq_qp_deleted */ +/*---------------------------------------------------------------------------*/ +static void xio_srq_qp_deleted(struct xio_rdma_transport *rdma_hndl, + struct xio_srq *srq) +{ + struct xio_key_int32 key; + struct xio_rdma_transport *c; + + key.id = rdma_hndl->qp->qp_num; + + HT_LOOKUP(&srq->ht_rdma_hndl, &key, c, rdma_hndl_htbl); + HT_REMOVE(&srq->ht_rdma_hndl, c, rdma_hndl, rdma_hndl_htbl); + DEBUG_LOG("removing rdma hndl %p with id %d\n", rdma_hndl, + rdma_hndl->qp->qp_num); +} +#endif + +/*---------------------------------------------------------------------------*/ +/* xio_qp_create */ +/*---------------------------------------------------------------------------*/ +static int xio_qp_create(struct xio_rdma_transport *rdma_hndl) +{ + struct xio_cq *tcq; +#ifdef XIO_SRQ_ENABLE + struct xio_srq *srq; +#endif + struct xio_device *dev = rdma_hndl->dev; + struct ibv_qp_init_attr qp_init_attr; + struct ibv_qp_attr qp_attr; + int retval = 0; + + tcq = xio_cq_get(dev, rdma_hndl->base.ctx); + if (!tcq) { + ERROR_LOG("cq 
initialization failed\n"); + return -1; + } + retval = xio_cq_alloc_slots(tcq, MAX_CQE_PER_QP); + if (retval != 0) { + ERROR_LOG("cq full capacity reached\n"); + goto release_cq; + } + + memset(&qp_init_attr, 0, sizeof(qp_init_attr)); + + qp_init_attr.qp_context = rdma_hndl; + qp_init_attr.qp_type = IBV_QPT_RC; + qp_init_attr.send_cq = tcq->cq; + qp_init_attr.recv_cq = tcq->cq; + +#ifdef XIO_SRQ_ENABLE + srq = xio_srq_get(rdma_hndl, tcq); + if (!srq) { + ERROR_LOG("srq initialization failed\n"); + goto release_cq; + } + qp_init_attr.srq = srq->srq; +#else + qp_init_attr.cap.max_recv_wr = MAX_RECV_WR + EXTRA_RQE; + qp_init_attr.cap.max_recv_sge = 1; + +#endif + + qp_init_attr.cap.max_send_wr = MAX_SEND_WR; + qp_init_attr.cap.max_send_sge = min(rdma_options.max_out_iovsz + 1, + dev->device_attr.max_sge); + qp_init_attr.cap.max_inline_data = rdma_options.qp_cap_max_inline_data; + + /* only generate completion queue entries if requested */ + qp_init_attr.sq_sig_all = 0; + + retval = rdma_create_qp(rdma_hndl->cm_id, dev->pd, &qp_init_attr); + if (retval) { + xio_set_error(errno); + ERROR_LOG("rdma_create_qp failed. (errno=%d %m)\n", errno); + if (errno == ENOMEM) + xio_validate_ulimit_memlock(); + goto free_slots; + } + rdma_hndl->tcq = tcq; + rdma_hndl->qp = rdma_hndl->cm_id->qp; + rdma_hndl->sqe_avail = MAX_SEND_WR; + +#ifdef XIO_SRQ_ENABLE + xio_srq_qp_added(rdma_hndl, srq); +#endif + rdma_hndl->beacon_task.dd_data = ptr_from_int64(XIO_BEACON_WRID); + rdma_hndl->beacon_task.context = (void *)rdma_hndl; + rdma_hndl->beacon.wr_id = uint64_from_ptr(&rdma_hndl->beacon_task); + rdma_hndl->beacon.opcode = IBV_WR_SEND; + + memset(&qp_attr, 0, sizeof(qp_attr)); + if (ibv_query_qp(rdma_hndl->qp, &qp_attr, 0, &qp_init_attr) != 0) + ERROR_LOG("ibv_query_qp failed. (errno=%d %m)\n", errno); + rdma_hndl->max_inline_data = qp_attr.cap.max_inline_data; + rdma_hndl->max_sge = min(rdma_options.max_out_iovsz + 1, + dev->device_attr.max_sge); + + list_add(&rdma_hndl->trans_list_entry, &tcq->trans_list); + + DEBUG_LOG("rdma qp: [new] handle:%p, qp:0x%x, max inline:%d\n", + rdma_hndl, + rdma_hndl->qp->qp_num, + rdma_hndl->max_inline_data); + + return 0; + +free_slots: + xio_cq_free_slots(tcq, MAX_CQE_PER_QP); + +release_cq: + + xio_cq_release(tcq); + + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_qp_release */ +/*---------------------------------------------------------------------------*/ +static void xio_qp_release(struct xio_rdma_transport *rdma_hndl) +{ + if (rdma_hndl->qp) { + TRACE_LOG("rdma qp: [close] handle:%p, qp:%p\n", rdma_hndl, + rdma_hndl->qp); +#ifdef XIO_SRQ_ENABLE + xio_srq_qp_deleted(rdma_hndl, rdma_hndl->tcq->srq); +#endif + xio_cq_free_slots(rdma_hndl->tcq, MAX_CQE_PER_QP); + list_del(&rdma_hndl->trans_list_entry); + rdma_destroy_qp(rdma_hndl->cm_id); + xio_cq_release(rdma_hndl->tcq); + rdma_hndl->qp = NULL; + } +} + +/*---------------------------------------------------------------------------*/ +/* xio_rxd_init */ +/*---------------------------------------------------------------------------*/ +static void xio_rxd_init(struct xio_work_req *rxd, + struct xio_task *task, + void *buf, unsigned size, + struct ibv_mr *srmr) +{ + struct ibv_recv_wr *recv_wr = &rxd->recv_wr; + struct ibv_sge *sg_list = rxd->sge; + + recv_wr->wr_id = uint64_from_ptr(task); + recv_wr->next = NULL; + recv_wr->sg_list = sg_list; + recv_wr->num_sge = size ? 
1 : 0; + + if (size) { + /* set the first element */ + sg_list->addr = uint64_from_ptr(buf); + sg_list->length = size; + sg_list->lkey = srmr->lkey; + } +} + +/*---------------------------------------------------------------------------*/ +/* xio_txd_init */ +/*---------------------------------------------------------------------------*/ +static void xio_txd_init(struct xio_work_req *txd, + struct xio_task *task, + void *buf, unsigned size, + struct ibv_mr *srmr) +{ + struct ibv_send_wr *send_wr = &txd->send_wr; + struct ibv_sge *sg_list = txd->sge; + + send_wr->wr_id = uint64_from_ptr(task); + send_wr->next = NULL; + send_wr->sg_list = sg_list; + send_wr->num_sge = size ? 1 : 0; + send_wr->opcode = IBV_WR_SEND; + + if (size) { + /* set the first element */ + sg_list->addr = uint64_from_ptr(buf); + sg_list->length = size; + sg_list->lkey = srmr->lkey; + } + + /*txd->send_wr.send_flags = IBV_SEND_SIGNALED; */ +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdmad_init */ +/*---------------------------------------------------------------------------*/ +static void xio_rdmad_init(struct xio_work_req *rdmad, + struct xio_task *task) +{ + struct ibv_send_wr *send_wr = &rdmad->send_wr; + struct ibv_sge *sg_list = rdmad->sge; + + send_wr->wr_id = uint64_from_ptr(task); + send_wr->sg_list = sg_list; + send_wr->num_sge = 1; + send_wr->next = NULL; + send_wr->send_flags = IBV_SEND_SIGNALED; + + /* to be set before posting: + rdmad->iser_out_ib_op, rdmad->send_wr.out_ib_op + rdmad->sge.addr, rdmad->sge.length + rdmad->send_wr.wr.rdma.(remote_addr,rkey) */ +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_task_init */ +/*---------------------------------------------------------------------------*/ +static void xio_rdma_task_init(struct xio_task *task, + struct xio_rdma_transport *rdma_hndl, + void *buf, + unsigned long size, + struct ibv_mr *srmr) +{ + XIO_TO_RDMA_TASK(task, rdma_task); + + xio_txd_init(&rdma_task->txd, task, buf, size, srmr); + xio_rxd_init(&rdma_task->rxd, task, buf, size, srmr); + xio_rdmad_init(&rdma_task->rdmad, task); + + /* initialize the mbuf */ + if (buf) + xio_mbuf_init(&task->mbuf, buf, size, 0); +} + +/*---------------------------------------------------------------------------*/ +/* xio_txd_reinit */ +/*---------------------------------------------------------------------------*/ +static void xio_xd_reinit(struct xio_work_req *xd, + size_t xd_nr, + struct ibv_mr *srmr) +{ + unsigned int i; + + if (!srmr || !xd || !xd->sge) + return; + + for (i = 0; i < xd_nr; i++) { + if (!xd->sge[i].lkey) + break; + xd->sge[i].lkey = srmr->lkey; + } +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_task_reinit */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_task_reinit(struct xio_task *task, + struct xio_rdma_transport *rdma_hndl, + struct ibv_mr *srmr) +{ + XIO_TO_RDMA_TASK(task, rdma_task); + + xio_xd_reinit(&rdma_task->rxd, rdma_hndl->max_sge, srmr); + xio_xd_reinit(&rdma_task->txd, rdma_hndl->max_sge, srmr); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_flush_all_tasks */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_flush_all_tasks(struct xio_rdma_transport *rdma_hndl) +{ + if (!list_empty(&rdma_hndl->in_flight_list)) { + TRACE_LOG("in_flight_list not empty!\n"); + 
xio_transport_flush_task_list(&rdma_hndl->in_flight_list); + /* for task that attached to senders with ref count = 2 */ + xio_transport_flush_task_list(&rdma_hndl->in_flight_list); + } + + if (!list_empty(&rdma_hndl->rdma_rd_req_in_flight_list)) { + TRACE_LOG("rdma_rd_req_in_flight_list not empty!\n"); + xio_transport_flush_task_list( + &rdma_hndl->rdma_rd_req_in_flight_list); + } + if (!list_empty(&rdma_hndl->rdma_rd_req_list)) { + TRACE_LOG("rdma_rd_req_list not empty!\n"); + xio_transport_flush_task_list(&rdma_hndl->rdma_rd_req_list); + } + if (!list_empty(&rdma_hndl->rdma_rd_rsp_in_flight_list)) { + TRACE_LOG("rdma_rd_rsp_in_flight_list not empty!\n"); + xio_transport_flush_task_list( + &rdma_hndl->rdma_rd_rsp_in_flight_list); + } + if (!list_empty(&rdma_hndl->rdma_rd_rsp_list)) { + TRACE_LOG("rdma_rd_rsp_list not empty!\n"); + xio_transport_flush_task_list(&rdma_hndl->rdma_rd_rsp_list); + } + if (!list_empty(&rdma_hndl->tx_comp_list)) { + TRACE_LOG("tx_comp_list not empty!\n"); + xio_transport_flush_task_list(&rdma_hndl->tx_comp_list); + } + if (!list_empty(&rdma_hndl->io_list)) { + TRACE_LOG("io_list not empty!\n"); + xio_transport_flush_task_list(&rdma_hndl->io_list); + } + + if (!list_empty(&rdma_hndl->tx_ready_list)) { + TRACE_LOG("tx_ready_list not empty!\n"); + xio_transport_flush_task_list(&rdma_hndl->tx_ready_list); + /* for task that attached to senders with ref count = 2 */ + xio_transport_flush_task_list(&rdma_hndl->tx_ready_list); + } + + if (!list_empty(&rdma_hndl->rx_list)) { + TRACE_LOG("rx_list not empty!\n"); + xio_transport_flush_task_list(&rdma_hndl->rx_list); + } + + rdma_hndl->kick_rdma_rd_req = 0; + rdma_hndl->rdma_rd_req_in_flight = 0; + rdma_hndl->kick_rdma_rd_rsp = 0; + rdma_hndl->rdma_rd_rsp_in_flight = 0; + rdma_hndl->reqs_in_flight_nr = 0; + rdma_hndl->rsps_in_flight_nr = 0; + rdma_hndl->tx_ready_tasks_num = 0; + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_initial_pool_slab_pre_create */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_initial_pool_slab_pre_create( + struct xio_transport_base *transport_hndl, + int alloc_nr, void *pool_dd_data, void *slab_dd_data) +{ + struct xio_rdma_tasks_slab *rdma_slab = + (struct xio_rdma_tasks_slab *)slab_dd_data; + uint32_t pool_size; + int retval; + + rdma_slab->buf_size = CONN_SETUP_BUF_SIZE; + pool_size = rdma_slab->buf_size * alloc_nr; + + retval = xio_mem_alloc(pool_size, &rdma_slab->reg_mem); + if (retval) { + ERROR_LOG("xio_mem_alloc conn_setup pool failed, %m\n"); + if (errno == ENOMEM) + xio_validate_ulimit_memlock(); + return -1; + } + rdma_slab->data_pool = rdma_slab->reg_mem.addr; + rdma_slab->data_mr = rdma_slab->reg_mem.mr; + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_initial_task_alloc */ +/*---------------------------------------------------------------------------*/ +static inline struct xio_task *xio_rdma_initial_task_alloc( + struct xio_rdma_transport *rdma_hndl) +{ + return rdma_hndl->initial_pool_cls.task_get( + rdma_hndl->initial_pool_cls.pool, + rdma_hndl); +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_primary_task_alloc */ +/*---------------------------------------------------------------------------*/ +struct xio_task *xio_rdma_primary_task_alloc( + struct xio_rdma_transport *rdma_hndl) +{ + return rdma_hndl->primary_pool_cls.task_get( + 
rdma_hndl->primary_pool_cls.pool, + rdma_hndl); +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_primary_task_lookup */ +/*---------------------------------------------------------------------------*/ +struct xio_task *xio_rdma_primary_task_lookup( + struct xio_rdma_transport *rdma_hndl, + int tid) +{ + if (rdma_hndl->primary_pool_cls.task_lookup) + return rdma_hndl->primary_pool_cls.task_lookup( + rdma_hndl->primary_pool_cls.pool, tid); + return NULL; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_task_free */ +/*---------------------------------------------------------------------------*/ +inline void xio_rdma_task_free(struct xio_rdma_transport *rdma_hndl, + struct xio_task *task) +{ + if (rdma_hndl->primary_pool_cls.task_put) + return rdma_hndl->primary_pool_cls.task_put(task); +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_initial_pool_post_create */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_initial_pool_post_create( + struct xio_transport_base *transport_hndl, + void *pool, void *pool_dd_data) +{ + struct xio_task *task; + struct xio_rdma_transport *rdma_hndl = + (struct xio_rdma_transport *)transport_hndl; + struct xio_rdma_task *rdma_task; + int retval; + + if (!rdma_hndl) + return 0; + + rdma_hndl->initial_pool_cls.pool = pool; + + task = xio_rdma_initial_task_alloc(rdma_hndl); + if (!task) { + ERROR_LOG("failed to get task\n"); + } else { + DEBUG_LOG("post_recv conn_setup rx task:%p\n", task); + retval = xio_post_recv(rdma_hndl, task, 1); + if (retval) + ERROR_LOG("xio_post_recv failed\n"); + + /* assuming that both sides posted one recv wr for initial + * negotiation + */ + rdma_hndl->peer_credits = 1; + rdma_hndl->sim_peer_credits = 1; + rdma_task = (struct xio_rdma_task *)task->dd_data; + + rdma_task->out_ib_op = XIO_IB_RECV; + + /* When using SRQ the rx_list used is that of the SRQ and not + * the rdma_hndl. However, in this case the initial pool is not + * created (we don't reach this flow) so it's not necessary to + * handle the SRQ case here. 
*/ + list_add_tail(&task->tasks_list_entry, &rdma_hndl->rx_list); + } + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_task_pre_put */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_task_pre_put( + struct xio_transport_base *trans_hndl, + struct xio_task *task) +{ + XIO_TO_RDMA_TASK(task, rdma_task); + unsigned int i; + + /* recycle RDMA buffers back to pool */ + + /* put buffers back to pool */ + if (rdma_task->read_num_reg_mem) { + for (i = 0; i < rdma_task->read_num_reg_mem; i++) { + if (rdma_task->read_reg_mem[i].priv) { + xio_mempool_free(&rdma_task->read_reg_mem[i]); + rdma_task->read_reg_mem[i].priv = NULL; + } + } + rdma_task->read_num_reg_mem = 0; + } + + if (rdma_task->write_num_reg_mem) { + for (i = 0; i < rdma_task->write_num_reg_mem; i++) { + if (rdma_task->write_reg_mem[i].priv) { + xio_mempool_free(&rdma_task->write_reg_mem[i]); + rdma_task->write_reg_mem[i].priv = NULL; + } + } + rdma_task->write_num_reg_mem = 0; + } + /* + rdma_task->req_write_num_reg_mem = 0; + rdma_task->rsp_write_num_reg_mem = 0; + rdma_task->req_read_num_reg_mem = 0; + rdma_task->req_recv_num_sge = 0; + + rdma_task->txd.send_wr.num_sge = 1; + rdma_task->out_ib_op = XIO_IB_NULL; + rdma_task->phantom_idx = 0; + rdma_task->sn = 0; + */ + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_initial_pool_slab_destroy */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_initial_pool_slab_destroy( + struct xio_transport_base *transport_hndl, + void *pool_dd_data, void *slab_dd_data) +{ + struct xio_rdma_tasks_slab *rdma_slab = + (struct xio_rdma_tasks_slab *)slab_dd_data; + + xio_mem_free(&rdma_slab->reg_mem); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_initial_pool_slab_init_task */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_initial_pool_slab_init_task( + struct xio_transport_base *transport_hndl, + void *pool_dd_data, + void *slab_dd_data, int tid, struct xio_task *task) +{ + struct xio_rdma_transport *rdma_hndl = + (struct xio_rdma_transport *)transport_hndl; + struct xio_rdma_tasks_slab *rdma_slab = + (struct xio_rdma_tasks_slab *)slab_dd_data; + void *buf = rdma_slab->data_pool + tid * rdma_slab->buf_size; + char *ptr; + struct ibv_mr *data_mr; + + XIO_TO_RDMA_TASK(task, rdma_task); + + if (!rdma_hndl /*|| rdma_task->buf*/) + return 0; + + /* fill xio_rdma_task */ + ptr = (char *)rdma_task; + ptr += sizeof(struct xio_rdma_task); + + /* fill xio_work_req */ + rdma_task->txd.sge = (struct ibv_sge *)ptr; + ptr += sizeof(struct ibv_sge); + + rdma_task->rxd.sge = (struct ibv_sge *)ptr; + ptr += sizeof(struct ibv_sge); + /*****************************************/ + + data_mr = xio_rdma_mr_lookup(rdma_slab->data_mr, + rdma_hndl->tcq->dev); + xio_rdma_task_init( + task, + rdma_hndl, + buf, + rdma_slab->buf_size, + data_mr); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_initial_pool_get_params */ +/*---------------------------------------------------------------------------*/ +static void xio_rdma_initial_pool_get_params( + struct xio_transport_base *transport_hndl, + int *start_nr, int *max_nr, int *alloc_nr, + int *pool_dd_sz, int *slab_dd_sz, int *task_dd_sz) +{ + *start_nr = 10 * 
NUM_CONN_SETUP_TASKS;
+	*alloc_nr = 10 * NUM_CONN_SETUP_TASKS;
+	*max_nr = 10 * NUM_CONN_SETUP_TASKS;
+
+	*pool_dd_sz = 0;
+	*slab_dd_sz = sizeof(struct xio_rdma_tasks_slab);
+	*task_dd_sz = sizeof(struct xio_rdma_task) +
+			2 * sizeof(struct ibv_sge);
+}
+
+static struct xio_tasks_pool_ops initial_tasks_pool_ops = {
+	.pool_get_params = xio_rdma_initial_pool_get_params,
+	.slab_pre_create = xio_rdma_initial_pool_slab_pre_create,
+	.slab_destroy = xio_rdma_initial_pool_slab_destroy,
+	.slab_init_task = xio_rdma_initial_pool_slab_init_task,
+	.pool_post_create = xio_rdma_initial_pool_post_create
+};
+
+/*---------------------------------------------------------------------------*/
+/* xio_rdma_phantom_pool_slab_init_task */
+/*---------------------------------------------------------------------------*/
+static int xio_rdma_phantom_pool_slab_init_task(
+		struct xio_transport_base *transport_hndl,
+		void *pool_dd_data,
+		void *slab_dd_data, int tid, struct xio_task *task)
+{
+	struct xio_rdma_transport *rdma_hndl =
+		(struct xio_rdma_transport *)transport_hndl;
+	char *ptr;
+
+	XIO_TO_RDMA_TASK(task, rdma_task);
+
+	/* set the task to point to hndl */
+	task->context = transport_hndl;
+
+	/* fill xio_rdma_task */
+	ptr = (char *)rdma_task;
+	ptr += sizeof(struct xio_rdma_task);
+
+	/* fill xio_work_req */
+	rdma_task->rdmad.sge = (struct ibv_sge *)ptr;
+	/*ptr += rdma_hndl->max_sge*sizeof(struct ibv_sge);*/
+	/*****************************************/
+
+	rdma_task->out_ib_op = (enum xio_ib_op_code)0x200;
+	xio_rdma_task_init(
+			task,
+			rdma_hndl,
+			NULL,
+			0,
+			NULL);
+
+	return 0;
+}
+
+/*---------------------------------------------------------------------------*/
+/* xio_rdma_phantom_pool_create */
+/*---------------------------------------------------------------------------*/
+static int xio_rdma_phantom_pool_create(struct xio_rdma_transport *rdma_hndl)
+{
+	struct xio_tasks_pool_params params;
+
+	memset(&params, 0, sizeof(params));
+
+	params.start_nr = NUM_START_PHANTOM_POOL_TASKS;
+	params.max_nr = NUM_MAX_PHANTOM_POOL_TASKS;
+	params.alloc_nr = NUM_ALLOC_PHANTOM_POOL_TASKS;
+	params.pool_dd_data_sz = 0;
+	params.slab_dd_data_sz = sizeof(struct xio_rdma_tasks_slab);
+	params.task_dd_data_sz = sizeof(struct xio_rdma_task) +
+				 rdma_hndl->max_sge *
+				 sizeof(struct ibv_sge);
+
+	params.pool_hooks.context = rdma_hndl;
+	params.pool_hooks.slab_init_task =
+		(int (*)(void *, void *, void *, int, struct xio_task *))
+		xio_rdma_phantom_pool_slab_init_task;
+	params.pool_hooks.slab_uninit_task = NULL;
+	params.pool_hooks.task_pre_put =
+		(int (*)(void *, struct xio_task *))xio_rdma_task_pre_put;
+
+	/* initialize the tasks pool */
+	rdma_hndl->phantom_tasks_pool = xio_tasks_pool_create(&params);
+	if (!rdma_hndl->phantom_tasks_pool) {
+		ERROR_LOG("xio_tasks_pool_create failed\n");
+		goto cleanup;
+	}
+
+	return 0;
+
+cleanup:
+	return -1;
+}
+
+/*---------------------------------------------------------------------------*/
+/* xio_rdma_phantom_pool_destroy */
+/*---------------------------------------------------------------------------*/
+static int xio_rdma_phantom_pool_destroy(struct xio_rdma_transport *rdma_hndl)
+{
+	if (!rdma_hndl->phantom_tasks_pool)
+		return -1;
+
+	xio_tasks_pool_destroy(rdma_hndl->phantom_tasks_pool);
+	return 0;
+}
+
+/*---------------------------------------------------------------------------*/
+/* xio_rdma_primary_pool_slab_pre_create */
+/*---------------------------------------------------------------------------*/
+static int xio_rdma_primary_pool_slab_pre_create(
+
struct xio_transport_base *transport_hndl, + int alloc_nr, void *pool_dd_data, void *slab_dd_data) +{ + struct xio_rdma_transport *rdma_hndl = + (struct xio_rdma_transport *)transport_hndl; + struct xio_rdma_tasks_slab *rdma_slab = + (struct xio_rdma_tasks_slab *)slab_dd_data; + size_t inline_buf_sz = xio_rdma_get_inline_buffer_size(); + size_t alloc_sz = alloc_nr * ALIGN(inline_buf_sz, PAGE_SIZE); + int retval; + + if (alloc_sz == 0) { + xio_set_error(EINVAL); + ERROR_LOG("primary_pre_slab_create failed. alloc_nr:%d, " \ + "membuf_sz:%zu\n", + alloc_nr, rdma_hndl->membuf_sz); + return -1; + } + rdma_slab->alloc_nr = alloc_nr; + rdma_slab->buf_size = inline_buf_sz; + + if (disable_huge_pages) { + retval = xio_mem_alloc(alloc_sz, &rdma_slab->reg_mem); + if (retval == -1) { + xio_set_error(ENOMEM); + ERROR_LOG("xio_alloc rdma pool sz:%zu failed\n", + alloc_sz); + return -1; + } + } else { + /* maybe allocation of with unuma_alloc can provide better + * performance? + */ + rdma_slab->data_pool = umalloc_huge_pages(alloc_sz); + if (!rdma_slab->data_pool) { + xio_set_error(ENOMEM); + ERROR_LOG("malloc rdma pool sz:%zu failed\n", + alloc_sz); + return -1; + } + retval = xio_mem_register(rdma_slab->data_pool, + alloc_sz, &rdma_slab->reg_mem); + if (retval == -1) { + ERROR_LOG("xio_mem_register rdma pool sz:%zu failed\n", + alloc_sz); + ufree_huge_pages(rdma_slab->data_pool); + if (errno == ENOMEM) + xio_validate_ulimit_memlock(); + return -1; + } + } + rdma_slab->data_pool = rdma_slab->reg_mem.addr; + rdma_slab->data_mr = rdma_slab->reg_mem.mr; + + DEBUG_LOG("pool buf:%p, mr:%p\n", + rdma_slab->data_pool, rdma_slab->data_mr); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_primary_pool_slab_post_create */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_primary_pool_slab_post_create( + struct xio_transport_base *transport_hndl, + void *pool_dd_data, void *slab_dd_data) +{ + struct xio_rdma_transport *rdma_hndl = + (struct xio_rdma_transport *)transport_hndl; + struct xio_rdma_tasks_slab *rdma_slab = + (struct xio_rdma_tasks_slab *)slab_dd_data; + struct ibv_mr *data_mr; + + if (!rdma_slab->data_mr) + return 0; + + /* With reconnect can use another HCA */ + data_mr = xio_rdma_mr_lookup( + rdma_slab->data_mr, + rdma_hndl->tcq->dev); + if (data_mr) + return 0; + + if (!disable_huge_pages) { + size_t alloc_sz = rdma_slab->buf_size * rdma_slab->alloc_nr; + int retval = xio_mem_dereg(&rdma_slab->reg_mem); + + if (retval != 0) + ERROR_LOG("xio_mem_dreg failed\n"); + + retval = xio_mem_register(rdma_slab->data_pool, + alloc_sz, &rdma_slab->reg_mem); + if (retval == -1) { + ERROR_LOG("xio_mem_register rdma pool sz:%zu failed\n", + alloc_sz); + ufree_huge_pages(rdma_slab->data_pool); + if (errno == ENOMEM) + xio_validate_ulimit_memlock(); + return -1; + } + rdma_slab->data_mr = rdma_slab->reg_mem.mr; + return 0; + } + ERROR_LOG("can't re register allocated memory\n"); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_primary_pool_post_create */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_primary_pool_post_create( + struct xio_transport_base *transport_hndl, + void *pool, void *pool_dd_data) +{ + struct xio_rdma_transport *rdma_hndl = + (struct xio_rdma_transport *)transport_hndl; + + if (!rdma_hndl) + return 0; + + rdma_hndl->primary_pool_cls.pool = pool; + + 
xio_rdma_rearm_rq(rdma_hndl); + + /* late creation */ + xio_rdma_phantom_pool_create(rdma_hndl); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_primary_pool_slab_destroy */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_primary_pool_slab_destroy( + struct xio_transport_base *transport_hndl, + void *pool_dd_data, void *slab_dd_data) +{ + struct xio_rdma_tasks_slab *rdma_slab = + (struct xio_rdma_tasks_slab *)slab_dd_data; + + if (disable_huge_pages) { + int retval = xio_mem_free(&rdma_slab->reg_mem); + + if (retval != 0) + ERROR_LOG("xio_mem_dreg failed\n"); + } else { + int retval = xio_mem_dereg(&rdma_slab->reg_mem); + + if (retval != 0) + ERROR_LOG("xio_mem_dreg failed\n"); + ufree_huge_pages(rdma_slab->data_pool); + } + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_primary_pool_slab_remap_task */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_primary_pool_slab_remap_task( + struct xio_transport_base *old_th, + struct xio_transport_base *new_th, + void *pool_dd_data, void *slab_dd_data, + struct xio_task *task) +{ + struct xio_rdma_transport *old_hndl = + (struct xio_rdma_transport *)old_th; + struct xio_rdma_transport *new_hndl = + (struct xio_rdma_transport *)new_th; + struct xio_rdma_tasks_slab *rdma_slab = + (struct xio_rdma_tasks_slab *)slab_dd_data; + struct ibv_mr *data_mr; + + task->context = new_th; + + /* if the same device is used then there is no need to remap */ + if (old_hndl && old_hndl->tcq->dev == new_hndl->tcq->dev) + return 0; + + data_mr = xio_rdma_mr_lookup(rdma_slab->data_mr, new_hndl->tcq->dev); + xio_rdma_task_reinit(task, new_hndl, data_mr); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_primary_pool_slab_init_task */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_primary_pool_slab_init_task( + struct xio_transport_base *transport_hndl, + void *pool_dd_data, + void *slab_dd_data, int tid, struct xio_task *task) +{ + struct xio_rdma_transport *rdma_hndl = + (struct xio_rdma_transport *)transport_hndl; + struct xio_rdma_tasks_slab *rdma_slab = + (struct xio_rdma_tasks_slab *)slab_dd_data; + void *buf = rdma_slab->data_pool + tid * ALIGN(rdma_slab->buf_size, + PAGE_SIZE); + int max_iovsz = max(rdma_options.max_out_iovsz, + rdma_options.max_in_iovsz) + 1; + int max_sge; + char *ptr; + struct ibv_mr *data_mr; + + XIO_TO_RDMA_TASK(task, rdma_task); + + if (!rdma_hndl || !rdma_hndl->tcq) + return 0; + + max_sge = min(rdma_hndl->max_sge, max_iovsz); + + /* fill xio_rdma_task */ + ptr = (char *)rdma_task; + ptr += sizeof(struct xio_rdma_task); + + /* fill xio_work_req */ + rdma_task->txd.sge = (struct ibv_sge *)ptr; + ptr += max_sge * sizeof(struct ibv_sge); + rdma_task->rxd.sge = (struct ibv_sge *)ptr; + ptr += sizeof(struct ibv_sge); + rdma_task->rdmad.sge = (struct ibv_sge *)ptr; + ptr += max_sge * sizeof(struct ibv_sge); + + rdma_task->read_reg_mem = (struct xio_reg_mem *)ptr; + ptr += max_iovsz * sizeof(struct xio_reg_mem); + rdma_task->write_reg_mem = (struct xio_reg_mem *)ptr; + ptr += max_iovsz * sizeof(struct xio_reg_mem); + + rdma_task->req_in_sge = (struct xio_sge *)ptr; + ptr += max_iovsz * sizeof(struct xio_sge); + rdma_task->req_out_sge = (struct xio_sge *)ptr; + ptr += max_iovsz * sizeof(struct 
xio_sge); + rdma_task->rsp_out_sge = (struct xio_sge *)ptr; + ptr += max_iovsz * sizeof(struct xio_sge); + /*****************************************/ + + rdma_task->out_ib_op = (enum xio_ib_op_code)0x200; + data_mr = xio_rdma_mr_lookup(rdma_slab->data_mr, rdma_hndl->tcq->dev); + + xio_rdma_task_init( + task, + rdma_hndl, + buf, + rdma_slab->buf_size, + data_mr); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_primary_pool_get_params */ +/*---------------------------------------------------------------------------*/ +static void xio_rdma_primary_pool_get_params( + struct xio_transport_base *transport_hndl, + int *start_nr, int *max_nr, int *alloc_nr, + int *pool_dd_sz, int *slab_dd_sz, int *task_dd_sz) +{ + struct xio_rdma_transport *rdma_hndl = + (struct xio_rdma_transport *)transport_hndl; + int max_iovsz = max(rdma_options.max_out_iovsz, + rdma_options.max_in_iovsz) + 1; + int max_sge; + int queued_nr; + + if (rdma_hndl) + max_sge = min(rdma_hndl->max_sge, max_iovsz); + else + max_sge = min(XIO_DEV_ATTR_MAX_SGE, max_iovsz); + + queued_nr = g_options.snd_queue_depth_msgs + + g_options.rcv_queue_depth_msgs + + MAX_CQE_PER_QP; /* for ibv_post_recv */ + + if (rdma_hndl) + *start_nr = rdma_hndl->rq_depth + EXTRA_RQE + SEND_QE; + else + *start_nr = NUM_START_PRIMARY_POOL_TASKS; + + *alloc_nr = NUM_ALLOC_PRIMARY_POOL_TASKS; + *max_nr = max(queued_nr, *start_nr); + + *pool_dd_sz = 0; + *slab_dd_sz = sizeof(struct xio_rdma_tasks_slab); + *task_dd_sz = sizeof(struct xio_rdma_task) + + (max_sge + 1 + max_sge) * sizeof(struct ibv_sge) + + 2 * max_iovsz * sizeof(struct xio_reg_mem) + + 3 * max_iovsz * sizeof(struct xio_sge); +} + +static struct xio_tasks_pool_ops primary_tasks_pool_ops = { + .pool_get_params = xio_rdma_primary_pool_get_params, + .slab_pre_create = xio_rdma_primary_pool_slab_pre_create, + .slab_post_create = xio_rdma_primary_pool_slab_post_create, + .slab_destroy = xio_rdma_primary_pool_slab_destroy, + .slab_init_task = xio_rdma_primary_pool_slab_init_task, + .slab_remap_task = xio_rdma_primary_pool_slab_remap_task, + .pool_post_create = xio_rdma_primary_pool_post_create, + .task_pre_put = xio_rdma_task_pre_put, +}; + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_post_close */ +/*---------------------------------------------------------------------------*/ +static void xio_rdma_post_close(struct xio_transport_base *trans_base) +{ + struct xio_rdma_transport *rdma_hndl = + (struct xio_rdma_transport *)trans_base; + + if (rdma_hndl->handler_nesting) { + rdma_hndl->state = XIO_TRANSPORT_STATE_DESTROYED; + return; + } + TRACE_LOG("rdma transport: [post close] handle:%p, qp:%p\n", + rdma_hndl, rdma_hndl->qp); + + xio_ctx_del_delayed_work(rdma_hndl->base.ctx, + &rdma_hndl->timewait_timeout_work); + + xio_ctx_del_delayed_work(rdma_hndl->base.ctx, + &rdma_hndl->disconnect_timeout_work); + + xio_context_disable_event(&rdma_hndl->timewait_exit_event); + + xio_context_disable_event(&rdma_hndl->close_event); + + xio_observable_unreg_all_observers(&rdma_hndl->base.observable); + + xio_rdma_phantom_pool_destroy(rdma_hndl); + + xio_qp_release(rdma_hndl); + + if (rdma_hndl->cm_id) { + TRACE_LOG("call rdma_destroy_id\n"); + rdma_destroy_id(rdma_hndl->cm_id); + rdma_hndl->cm_id = NULL; + } + + xio_cm_channel_release(rdma_hndl->cm_channel); + + xio_context_destroy_resume(rdma_hndl->base.ctx); + + if (rdma_hndl->rkey_tbl) { + ufree(rdma_hndl->rkey_tbl); + rdma_hndl->rkey_tbl = NULL; + } + if 
(rdma_hndl->peer_rkey_tbl) { + ufree(rdma_hndl->peer_rkey_tbl); + rdma_hndl->peer_rkey_tbl = NULL; + } + + if (trans_base->portal_uri) { + ufree(trans_base->portal_uri); + trans_base->portal_uri = NULL; + } + + XIO_OBSERVABLE_DESTROY(&rdma_hndl->base.observable); + /* last chance to flush all tasks */ + xio_rdma_flush_all_tasks(rdma_hndl); + + ufree(rdma_hndl); +} + +/*---------------------------------------------------------------------------*/ +/* on_cm_addr_resolved */ +/*---------------------------------------------------------------------------*/ +static void on_cm_addr_resolved(struct rdma_cm_event *ev, + struct xio_rdma_transport *rdma_hndl) +{ + int retval = 0; + + rdma_hndl->dev = xio_device_lookup_init(rdma_hndl->cm_id->verbs); + if (!rdma_hndl->dev) { + ERROR_LOG("failed find/init device. " \ + "rdma_hndl:%p, cm_id->verbs:%p\n", rdma_hndl, + rdma_hndl->cm_id->verbs); + goto notify_err0; + } + + if (test_bits(XIO_TRANSPORT_ATTR_TOS, &rdma_hndl->trans_attr_mask)) { + retval = rdma_set_option(rdma_hndl->cm_id, RDMA_OPTION_ID, + RDMA_OPTION_ID_TOS, + &rdma_hndl->trans_attr.tos, + sizeof(rdma_hndl->trans_attr.tos)); + if (unlikely(retval)) { + xio_set_error(errno); + ERROR_LOG("set TOS option failed. %m\n"); + } + DEBUG_LOG("set TOS option success. mask:0x%x, tos:0x%x\n", + rdma_hndl->trans_attr_mask, + rdma_hndl->trans_attr.tos); + } + + retval = rdma_resolve_route(rdma_hndl->cm_id, ROUTE_RESOLVE_TIMEOUT); + if (retval) { + xio_set_error(errno); + DEBUG_LOG("rdma_resolve_route failed. (errno=%d %m)\n", errno); + goto notify_err1; + } + + return; + +notify_err1: + xio_device_put(rdma_hndl->dev); + +notify_err0: + xio_transport_notify_observer_error(&rdma_hndl->base, xio_errno()); +} + +/*---------------------------------------------------------------------------*/ +/* on_cm_route_resolved */ +/*---------------------------------------------------------------------------*/ +static void on_cm_route_resolved(struct rdma_cm_event *ev, + struct xio_rdma_transport *rdma_hndl) +{ + int retval = 0; + struct rdma_conn_param cm_params; + + if (!rdma_hndl->cm_id || !rdma_hndl->cm_id->verbs) { + xio_set_error(ENODEV); + ERROR_LOG("NULL ibv_context. rdma_hndl:%p, cm_id:%p\n", + rdma_hndl, rdma_hndl->cm_id); + /* do not notify error in this case since it may + * already was notified */ + return; + } + retval = xio_qp_create(rdma_hndl); + if (unlikely(retval != 0)) { + ERROR_LOG("internal logic error in create_endpoint\n"); + goto notify_err0; + } + + memset(&cm_params, 0, sizeof(cm_params)); +#ifdef XIO_SRQ_ENABLE + cm_params.rnr_retry_count = 7; /* 7 - infinite retry */ +#else + cm_params.rnr_retry_count = 3; +#endif + cm_params.retry_count = 3; + + /* + * When choosing the responder resources for a ULP, it is usually + * best to use the maximum value of the HCA. If the other side is + * not going to use RDMA read, then it should zero out the + * initiator_depth in the REP, which will zero out the local + * responder_resources when we program the QP. Generally, the + * initiator_depth should be either set to 0 or + * min(max_qp_rd_atom, max_send_wr). Use 0 if RDMA read is + * never going to be sent from this side. + */ + cm_params.responder_resources = + rdma_hndl->tcq->dev->device_attr.max_qp_rd_atom; + cm_params.initiator_depth = + rdma_hndl->tcq->dev->device_attr.max_qp_init_rd_atom; + + /* connect to peer */ + retval = rdma_connect(rdma_hndl->cm_id, &cm_params); + if (retval != 0) { + xio_set_error(ENOMEM); + DEBUG_LOG("rdma_connect failed. 
(errno=%d %m)\n", errno); + goto notify_err1; + } + rdma_hndl->client_responder_resources = cm_params.responder_resources; + rdma_hndl->client_initiator_depth = cm_params.initiator_depth; + rdma_hndl->state = XIO_TRANSPORT_STATE_CONNECTING; + + return; + +notify_err1: + xio_qp_release(rdma_hndl); +notify_err0: + xio_transport_notify_observer_error(&rdma_hndl->base, xio_errno()); +} + +/*---------------------------------------------------------------------------*/ +/* on_cm_connect_request */ +/*---------------------------------------------------------------------------*/ +static void on_cm_connect_request(struct rdma_cm_event *ev, + struct xio_rdma_transport *parent_hndl) +{ + struct xio_rdma_transport *child_hndl; + union xio_transport_event_data event_data; + int retval = 0; + struct rdma_cm_id *cm_id = ev->id; + struct xio_device *dev; + + dev = xio_device_lookup_init(cm_id->verbs); + if (!dev) { + ERROR_LOG("failed find/init device\n"); + retval = rdma_reject(ev->id, NULL, 0); + if (retval) { + xio_set_error(errno); + ERROR_LOG("rdma_reject failed. (errno=%d %m)\n", errno); + } + + goto notify_err1; + } + + child_hndl = (struct xio_rdma_transport *)xio_rdma_open( + parent_hndl->transport, + parent_hndl->base.ctx, + NULL, + 0, NULL); + if (!child_hndl) { + ERROR_LOG("failed to open rdma transport\n"); + retval = rdma_reject(ev->id, NULL, 0); + if (retval) { + xio_set_error(errno); + ERROR_LOG("rdma_reject failed. (errno=%d %m)\n", errno); + } + + goto notify_err2; + } + child_hndl->state = XIO_TRANSPORT_STATE_CONNECTING; + + child_hndl->cm_id = ev->id; + /* Parent handle i.e. listener doesn't have a CQ */ + child_hndl->tcq = NULL; + child_hndl->dev = dev; + ev->id->context = child_hndl; + child_hndl->client_initiator_depth = + ev->param.conn.initiator_depth; + child_hndl->client_responder_resources = + ev->param.conn.responder_resources; + + /* initiator is dst, target is src */ + memcpy(&child_hndl->base.peer_addr, + &child_hndl->cm_id->route.addr.dst_storage, + sizeof(child_hndl->base.peer_addr)); + memcpy(&child_hndl->base.local_addr, + &child_hndl->cm_id->route.addr.src_storage, + sizeof(child_hndl->base.local_addr)); + child_hndl->base.proto = XIO_PROTO_RDMA; + + retval = xio_qp_create(child_hndl); + if (unlikely(retval != 0)) { + ERROR_LOG("failed to create qp\n"); + xio_rdma_reject((struct xio_transport_base *)child_hndl); + goto notify_err3; + } + + event_data.new_connection.child_trans_hndl = + (struct xio_transport_base *)child_hndl; + xio_transport_notify_observer(&parent_hndl->base, + XIO_TRANSPORT_EVENT_NEW_CONNECTION, + &event_data); + + return; + +notify_err3: + xio_rdma_close((struct xio_transport_base *)child_hndl); +notify_err2: + xio_device_put(dev); +notify_err1: + xio_transport_notify_observer_error(&parent_hndl->base, xio_errno()); +} + +/*---------------------------------------------------------------------------*/ +/* on_cm_refused */ +/*---------------------------------------------------------------------------*/ +static void on_cm_refused(struct rdma_cm_event *ev, + struct xio_rdma_transport *rdma_hndl) +{ + DEBUG_LOG("on_cm_refused. rdma_hndl:%p, reason:%s, state:%s\n", + rdma_hndl, xio_cm_rej_reason_str(ev->status), + xio_transport_state_str(rdma_hndl->state)); + + /* we get CM_ESTABLISHED and afterward we get cm_refused. It looks like + * cm state machine error. 
+ */ + if (rdma_hndl->state == XIO_TRANSPORT_STATE_CONNECTED) { + /* one for beacon */ + kref_put(&rdma_hndl->base.kref, xio_rdma_close_cb); + /* one for timedwait_exit */ + kref_put(&rdma_hndl->base.kref, xio_rdma_close_cb); + rdma_hndl->state = XIO_TRANSPORT_STATE_ERROR; + } + xio_transport_notify_observer(&rdma_hndl->base, + XIO_TRANSPORT_EVENT_REFUSED, NULL); +} + +/*---------------------------------------------------------------------------*/ +/* on_cm_established */ +/*---------------------------------------------------------------------------*/ +static void on_cm_established(struct rdma_cm_event *ev, + struct xio_rdma_transport *rdma_hndl) +{ + /* initiator is dst, target is src */ + memcpy(&rdma_hndl->base.peer_addr, + &rdma_hndl->cm_id->route.addr.dst_storage, + sizeof(rdma_hndl->base.peer_addr)); + memcpy(&rdma_hndl->base.local_addr, + &rdma_hndl->cm_id->route.addr.src_storage, + sizeof(rdma_hndl->base.local_addr)); + + rdma_hndl->state = XIO_TRANSPORT_STATE_CONNECTED; + + /* one for beacon */ + kref_get(&rdma_hndl->base.kref); + /* one for timedwait_exit */ + kref_get(&rdma_hndl->base.kref); + + xio_transport_notify_observer(&rdma_hndl->base, + XIO_TRANSPORT_EVENT_ESTABLISHED, + NULL); +} + +/* + * Handle RDMA_CM_EVENT_TIMEWAIT_EXIT which is expected to be the last + * event during the life cycle of a connection, when it had been shut down + * and the network has cleared from the remaining in-flight messages. +*/ +/*---------------------------------------------------------------------------*/ +/* on_cm_timedwait_exit */ +/*---------------------------------------------------------------------------*/ +static void on_cm_timewait_exit(void *trans_hndl) +{ + struct xio_rdma_transport *rdma_hndl = + (struct xio_rdma_transport *)trans_hndl; + + TRACE_LOG("on_cm_timedwait_exit rdma_hndl:%p state:%s\n", + rdma_hndl, xio_transport_state_str(rdma_hndl->state)); + + if (rdma_hndl->timewait_nr) + return; + rdma_hndl->timewait_nr = 1; + + xio_ctx_del_delayed_work(rdma_hndl->base.ctx, + &rdma_hndl->timewait_timeout_work); + + xio_rdma_flush_all_tasks(rdma_hndl); + + if (rdma_hndl->state == XIO_TRANSPORT_STATE_DISCONNECTED) { + xio_transport_notify_observer(&rdma_hndl->base, + XIO_TRANSPORT_EVENT_DISCONNECTED, + NULL); + } + /* if beacon was sent but was never received as wc error then reduce + ref count */ + if (rdma_hndl->beacon_sent) { + rdma_hndl->beacon_sent = 0; + kref_put(&rdma_hndl->base.kref, xio_rdma_close_cb); + } + + kref_put(&rdma_hndl->base.kref, xio_rdma_close_cb); +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_disoconnect */ +/*---------------------------------------------------------------------------*/ +int xio_rdma_disconnect(struct xio_rdma_transport *rdma_hndl, + int send_beacon) +{ + struct ibv_send_wr *bad_wr; + int retval; + + retval = rdma_disconnect(rdma_hndl->cm_id); + if (unlikely(retval)) { + ERROR_LOG("rdma_hndl:%p rdma_disconnect failed, %m\n", + rdma_hndl); + return -1; + } + if (!send_beacon) + return 0; + + /* post an indication that all flush errors were consumed */ + retval = ibv_post_send(rdma_hndl->qp, &rdma_hndl->beacon, &bad_wr); + if (retval == ENOTCONN) { + /* softiwarp returns ENOTCONN right away if the QP is not + in RTS state. 
*/ + WARN_LOG("rdma_hndl %p failed to post beacon " \ + "- ignored because the QP is not in RTS state.\n", + rdma_hndl); + /* for beacon */ + xio_set_timewait_timer(rdma_hndl); + kref_put(&rdma_hndl->base.kref, xio_rdma_close_cb); + } else if (retval) { + ERROR_LOG("rdma_hndl %p failed to post beacon %d %d\n", + rdma_hndl, retval, errno); + return -1; + } else + rdma_hndl->beacon_sent = 1; + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_set_timewait_timer */ +/*---------------------------------------------------------------------------*/ +void xio_set_timewait_timer(struct xio_rdma_transport *rdma_hndl) +{ + int retval; + int timeout; + + if (rdma_hndl->timewait_nr) + return; + + /* from context shutdown */ + if (rdma_hndl->ignore_timewait) + timeout = XIO_TIMEWAIT_EXIT_FAST_TIMEOUT; + else + timeout = XIO_TIMEWAIT_EXIT_TIMEOUT; + + /* trigger the timer */ + retval = xio_ctx_add_delayed_work( + rdma_hndl->base.ctx, + timeout, rdma_hndl, + on_cm_timewait_exit, + &rdma_hndl->timewait_timeout_work); + if (retval != 0) { + ERROR_LOG("xio_ctx_timer_add_delayed_work failed.\n"); + return; + } +} + +/*---------------------------------------------------------------------------*/ +/* on_cm_disconnected */ +/*---------------------------------------------------------------------------*/ +static void on_cm_disconnected(struct rdma_cm_event *ev, + struct xio_rdma_transport *rdma_hndl) +{ + int retval; + + if (rdma_hndl->disconnect_nr) + return; + rdma_hndl->disconnect_nr = 1; + + xio_ctx_del_delayed_work(rdma_hndl->base.ctx, + &rdma_hndl->disconnect_timeout_work); + + DEBUG_LOG("on_cm_disconnected. rdma_hndl:%p, state:%s\n", + rdma_hndl, xio_transport_state_str(rdma_hndl->state)); + + rdma_hndl->timewait_nr = 0; + + switch (rdma_hndl->state) { + case XIO_TRANSPORT_STATE_CONNECTED: + TRACE_LOG("call to rdma_disconnect. rdma_hndl:%p\n", + rdma_hndl); + rdma_hndl->state = XIO_TRANSPORT_STATE_DISCONNECTED; + retval = xio_rdma_disconnect(rdma_hndl, 1); + if (retval) + ERROR_LOG("rdma_hndl:%p rdma_disconnect failed, %m\n", + rdma_hndl); + break; + case XIO_TRANSPORT_STATE_CONNECTING: + TRACE_LOG("call to rdma_disconnect. 
rdma_hndl:%p\n", + rdma_hndl); + rdma_hndl->state = XIO_TRANSPORT_STATE_DISCONNECTED; + retval = xio_rdma_disconnect(rdma_hndl, 0); + if (retval) + ERROR_LOG("rdma_hndl:%p rdma_disconnect failed, %m\n", + rdma_hndl); + /* for beacon */ + kref_put(&rdma_hndl->base.kref, xio_rdma_close_cb); + break; + case XIO_TRANSPORT_STATE_CLOSED: + /* coming here from + * context_shutdown/rdma_close, + * don't go to disconnect state + */ + retval = xio_rdma_disconnect(rdma_hndl, 1); + if (retval) + ERROR_LOG("rdma_hndl:%p rdma_disconnect failed, " \ + "err=%d\n", rdma_hndl, retval); + break; + case XIO_TRANSPORT_STATE_INIT: + case XIO_TRANSPORT_STATE_LISTEN: + case XIO_TRANSPORT_STATE_DISCONNECTED: + case XIO_TRANSPORT_STATE_RECONNECT: + case XIO_TRANSPORT_STATE_DESTROYED: + case XIO_TRANSPORT_STATE_ERROR: + break; + } +} + +/*---------------------------------------------------------------------------*/ +/* on_cm_disconnected */ +/*---------------------------------------------------------------------------*/ +static inline void xio_disconnect_handler(void *rdma_hndl) +{ + on_cm_disconnected(NULL, (struct xio_rdma_transport *)rdma_hndl); +} + +/*---------------------------------------------------------------------------*/ +/* xio_set_disconnect_timer */ +/*---------------------------------------------------------------------------*/ +void xio_set_disconnect_timer(struct xio_rdma_transport *rdma_hndl) +{ + int retval; + int timeout; + + if (rdma_hndl->disconnect_nr) + return; + + /* from context shutdown */ + timeout = XIO_DISCONNECT_TIMEOUT; + + /* trigger the timer */ + retval = xio_ctx_add_delayed_work( + rdma_hndl->base.ctx, + timeout, rdma_hndl, + xio_disconnect_handler, + &rdma_hndl->disconnect_timeout_work); + if (retval != 0) { + ERROR_LOG("xio_ctx_timer_add_delayed_work failed.\n"); + return; + } +} + +/*---------------------------------------------------------------------------*/ +/* on_cm_device_release */ +/*---------------------------------------------------------------------------*/ +static void on_cm_device_release(struct rdma_cm_event *ev, + struct xio_rdma_transport *rdma_hndl) +{ + struct xio_device *dev; + + if (!rdma_hndl->cm_id) + return; + + dev = xio_device_lookup(rdma_hndl->cm_id->verbs); + if (!dev) { + ERROR_LOG("device release, device not found\n"); + return; + } + + xio_device_release(dev); +} + +/*---------------------------------------------------------------------------*/ +/* on_cm_error */ +/*---------------------------------------------------------------------------*/ +static void on_cm_error(struct rdma_cm_event *ev, + struct xio_rdma_transport *rdma_hndl) +{ + int reason; + + DEBUG_LOG("rdma transport [error] %s, rdma_hndl:%p\n", + rdma_event_str(ev->event), rdma_hndl); + + switch (ev->event) { + case RDMA_CM_EVENT_CONNECT_ERROR: + reason = XIO_E_CONNECT_ERROR; + break; + case RDMA_CM_EVENT_ADDR_ERROR: + reason = XIO_E_ADDR_ERROR; + break; + case RDMA_CM_EVENT_ROUTE_ERROR: + reason = XIO_E_ROUTE_ERROR; + break; + case RDMA_CM_EVENT_UNREACHABLE: + reason = XIO_E_UNREACHABLE; + break; + default: + reason = XIO_E_NOT_SUPPORTED; + break; + }; + xio_transport_notify_observer_error(&rdma_hndl->base, reason); +} + +/*---------------------------------------------------------------------------*/ +/* xio_close_handler */ +/*---------------------------------------------------------------------------*/ +void xio_close_handler(void *hndl) +{ + xio_rdma_post_close((struct xio_transport_base *)hndl); +} + +/*---------------------------------------------------------------------------*/ 
+/* xio_handle_cm_event */ +/*---------------------------------------------------------------------------*/ +static void xio_handle_cm_event(struct rdma_cm_event *ev, + struct xio_rdma_transport *rdma_hndl) +{ + DEBUG_LOG("cm event: [%s], hndl:%p, status:%d\n", + rdma_event_str(ev->event), rdma_hndl, ev->status); + + rdma_hndl->handler_nesting++; + switch (ev->event) { + case RDMA_CM_EVENT_ADDR_RESOLVED: + on_cm_addr_resolved(ev, rdma_hndl); + break; + case RDMA_CM_EVENT_ROUTE_RESOLVED: + on_cm_route_resolved(ev, rdma_hndl); + break; + case RDMA_CM_EVENT_CONNECT_REQUEST: + on_cm_connect_request(ev, rdma_hndl); + break; + case RDMA_CM_EVENT_ESTABLISHED: + on_cm_established(ev, rdma_hndl); + break; + case RDMA_CM_EVENT_REJECTED: + on_cm_refused(ev, rdma_hndl); + break; + case RDMA_CM_EVENT_ADDR_CHANGE: + case RDMA_CM_EVENT_DISCONNECTED: + on_cm_disconnected(ev, rdma_hndl); + break; + case RDMA_CM_EVENT_TIMEWAIT_EXIT: + /* The caller of this callback i.e. cma_ib_handler is holding + * cma_disable_callback, thus rdma_destroy_id should not + * be called in xio_rdma_close_complete! this is prevented as + * rdma_hndl->handler_nesting > 0. We return one to ensure that + * cma_ib_handler will call + */ + memset(&rdma_hndl->timewait_exit_event, 0, + sizeof(rdma_hndl->timewait_exit_event)); + rdma_hndl->timewait_exit_event.handler = on_cm_timewait_exit; + rdma_hndl->timewait_exit_event.data = rdma_hndl; + + xio_context_add_event(rdma_hndl->base.ctx, + &rdma_hndl->timewait_exit_event); + break; + + case RDMA_CM_EVENT_MULTICAST_JOIN: + case RDMA_CM_EVENT_MULTICAST_ERROR: + ERROR_LOG("Unrelated event:%d, %s - ignored\n", ev->event, + rdma_event_str(ev->event)); + break; + + case RDMA_CM_EVENT_DEVICE_REMOVAL: + on_cm_device_release(ev, rdma_hndl); + break; + + case RDMA_CM_EVENT_CONNECT_RESPONSE: + break; + + case RDMA_CM_EVENT_CONNECT_ERROR: + case RDMA_CM_EVENT_ADDR_ERROR: + case RDMA_CM_EVENT_ROUTE_ERROR: + case RDMA_CM_EVENT_UNREACHABLE: + default: + on_cm_error(ev, rdma_hndl); + break; + }; + rdma_hndl->handler_nesting--; + + /* state can be modified to destroyed (side effect) */ + if (rdma_hndl->state == XIO_TRANSPORT_STATE_DESTROYED) { + /* user space code calls here, xio_rdma_post_close which may + * call rdma_destroy_id which is not allowed in an handler + */ + memset(&rdma_hndl->close_event, 0, + sizeof(rdma_hndl->close_event)); + rdma_hndl->close_event.handler = xio_close_handler; + rdma_hndl->close_event.data = rdma_hndl; + + /* tell "poller mechanism" */ + xio_context_add_event(rdma_hndl->base.ctx, + &rdma_hndl->close_event); + } +} + +/*---------------------------------------------------------------------------*/ +/* xio_cma_handler */ +/*---------------------------------------------------------------------------*/ +static void xio_cma_handler(int fd, int events, void *user_context) +{ + struct rdma_event_channel *p_cm_channel = + (struct rdma_event_channel *)(user_context); + struct rdma_cm_event *ev, lev; + struct xio_rdma_transport *rdma_hndl; + int retval; + + do { + /* get the event */ + retval = rdma_get_cm_event(p_cm_channel, &ev); + if (retval) { + if (errno == EAGAIN) + break; + xio_set_error(errno); + ERROR_LOG("rdma_get_cm_event failed. 
" \ + "(errno=%d %m)\n", errno); + break; + } + + rdma_hndl = (struct xio_rdma_transport *)ev->id->context; + + lev = *ev; + + /* ack the event */ + rdma_ack_cm_event(ev); + + /* and handle it */ + xio_handle_cm_event(&lev, rdma_hndl); + } while (1); +} + +/*---------------------------------------------------------------------------*/ +/* xio_cm_channel_get */ +/*---------------------------------------------------------------------------*/ +static struct xio_cm_channel *xio_cm_channel_get(struct xio_context *ctx) +{ + struct xio_cm_channel *channel; + int retval; + + pthread_rwlock_rdlock(&cm_lock); + list_for_each_entry(channel, &cm_list, channels_list_entry) { + if (channel->ctx == ctx) { + pthread_rwlock_unlock(&cm_lock); + kref_get(&channel->kref); + return channel; + } + } + pthread_rwlock_unlock(&cm_lock); + + channel = (struct xio_cm_channel *) + ucalloc(1, sizeof(struct xio_cm_channel)); + if (!channel) { + ERROR_LOG("rdma_create_event_channel failed " \ + "(errno=%d %m)\n", errno); + return NULL; + } + + channel->cm_channel = rdma_create_event_channel(); + if (!channel->cm_channel) { + ERROR_LOG("rdma_create_event_channel failed " \ + "(errno=%d %m)\n", errno); + goto free; + } + /* turn the file descriptor to non blocking */ + retval = fcntl(channel->cm_channel->fd, F_GETFL, 0); + if (retval != -1) { + retval = fcntl(channel->cm_channel->fd, F_SETFL, + retval | O_NONBLOCK); + } + if (retval == -1) { + xio_set_error(errno); + ERROR_LOG("fcntl failed. (errno=%d %m)\n", errno); + goto cleanup; + } + + retval = xio_context_add_ev_handler( + ctx, + channel->cm_channel->fd, + XIO_POLLIN, + xio_cma_handler, + channel->cm_channel); + if (retval != 0) { + xio_set_error(errno); + ERROR_LOG("Adding to event loop failed (errno=%d %m)\n", + errno); + goto cleanup; + } + channel->ctx = ctx; + + pthread_rwlock_wrlock(&cm_lock); + list_add(&channel->channels_list_entry, &cm_list); + pthread_rwlock_unlock(&cm_lock); + + /* One reference count for the rdma handle */ + kref_init(&channel->kref); + + return channel; + +cleanup: + rdma_destroy_event_channel(channel->cm_channel); +free: + ufree(channel); + + return NULL; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_open */ +/*---------------------------------------------------------------------------*/ +static struct xio_transport_base *xio_rdma_open( + struct xio_transport *transport, + struct xio_context *ctx, + struct xio_observer *observer, + uint32_t trans_attr_mask, + struct xio_transport_init_attr *attr) +{ + struct xio_rdma_transport *rdma_hndl; + + /*allocate rdma handle */ + rdma_hndl = (struct xio_rdma_transport *) + ucalloc(1, sizeof(struct xio_rdma_transport)); + if (!rdma_hndl) { + xio_set_error(ENOMEM); + ERROR_LOG("ucalloc failed. %m\n"); + return NULL; + } + if (attr && trans_attr_mask) { + memcpy(&rdma_hndl->trans_attr, attr, sizeof(*attr)); + rdma_hndl->trans_attr_mask = trans_attr_mask; + } + + XIO_OBSERVABLE_INIT(&rdma_hndl->base.observable, rdma_hndl); + + if (rdma_options.enable_mem_pool) { + rdma_hndl->rdma_mempool = + xio_transport_mempool_get(ctx, 1); + if (!rdma_hndl->rdma_mempool) { + xio_set_error(ENOMEM); + ERROR_LOG("allocating rdma mempool failed. 
%m\n"); + goto cleanup; + } + } + rdma_hndl->base.portal_uri = NULL; + rdma_hndl->base.proto = XIO_PROTO_RDMA; + kref_init(&rdma_hndl->base.kref); + rdma_hndl->transport = transport; + rdma_hndl->cm_id = NULL; + rdma_hndl->qp = NULL; + rdma_hndl->tcq = NULL; + rdma_hndl->base.ctx = ctx; + + if (rdma_hndl->base.ctx->rq_depth) { + //user chose to confgure rq depth + rdma_hndl->rq_depth = max(g_options.max_in_iovsz, rdma_hndl->base.ctx->rq_depth); + } else { + rdma_hndl->rq_depth = MAX_RECV_WR; + } + rdma_hndl->sq_depth = g_options.max_out_iovsz + 1; + + rdma_hndl->peer_credits = 0; + rdma_hndl->cm_channel = xio_cm_channel_get(ctx); + rdma_hndl->max_inline_buf_sz = xio_rdma_get_inline_buffer_size(); + + + /* + DEBUG_LOG("max_inline_buf:%d\n", rdma_hndl->max_inline_buf_sz); + */ + if (!rdma_hndl->cm_channel) { + TRACE_LOG("rdma transport: failed to allocate cm_channel\n"); + goto cleanup; + } + if (observer) + xio_observable_reg_observer(&rdma_hndl->base.observable, + observer); + + INIT_LIST_HEAD(&rdma_hndl->in_flight_list); + INIT_LIST_HEAD(&rdma_hndl->rdma_rd_req_in_flight_list); + INIT_LIST_HEAD(&rdma_hndl->rdma_rd_rsp_in_flight_list); + INIT_LIST_HEAD(&rdma_hndl->tx_ready_list); + INIT_LIST_HEAD(&rdma_hndl->tx_comp_list); + INIT_LIST_HEAD(&rdma_hndl->rx_list); + INIT_LIST_HEAD(&rdma_hndl->io_list); + INIT_LIST_HEAD(&rdma_hndl->rdma_rd_req_list); + INIT_LIST_HEAD(&rdma_hndl->rdma_rd_rsp_list); + + TRACE_LOG("xio_rdma_open: [new] handle:%p\n", rdma_hndl); + + return (struct xio_transport_base *)rdma_hndl; + +cleanup: + if (rdma_hndl->cm_channel) + xio_cm_channel_release(rdma_hndl->cm_channel); + + ufree(rdma_hndl); + + return NULL; +} + +/* + * Start closing connection. Transfer IB QP to error state. + * This will be followed by WC error and buffers flush events. + * We also should expect DISCONNECTED and TIMEWAIT_EXIT events. + * Only after the draining is over we are sure to have reclaimed + * all buffers (and tasks). After the RDMA CM events are collected, + * the connection QP may be destroyed, and its number may be recycled. + */ +/*---------------------------------------------------------------------------*/ +/* xio_rdma_close_cb */ +/*---------------------------------------------------------------------------*/ +void xio_rdma_close_cb(struct kref *kref) +{ + struct xio_transport_base *transport = container_of( + kref, struct xio_transport_base, kref); + struct xio_rdma_transport *rdma_hndl = + (struct xio_rdma_transport *)transport; + + xio_transport_notify_observer( + transport, + XIO_TRANSPORT_EVENT_CLOSED, + NULL); + + xio_rdma_post_close((struct xio_transport_base *)rdma_hndl); +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_close */ +/*---------------------------------------------------------------------------*/ +static void xio_rdma_close(struct xio_transport_base *transport) +{ + struct xio_rdma_transport *rdma_hndl = + (struct xio_rdma_transport *)transport; + int retval = 0; + + /* now it is zero */ + DEBUG_LOG("xio_rmda_close: [close] handle:%p, qp:%p state:%s\n", + rdma_hndl, rdma_hndl->qp, + xio_transport_state_str(rdma_hndl->state)); + + switch (rdma_hndl->state) { + case XIO_TRANSPORT_STATE_LISTEN: + rdma_hndl->state = XIO_TRANSPORT_STATE_CLOSED; + break; + case XIO_TRANSPORT_STATE_CONNECTED: + TRACE_LOG("call to rdma_disconnect. 
rdma_hndl:%p\n", + rdma_hndl); + + rdma_hndl->state = XIO_TRANSPORT_STATE_CLOSED; + retval = xio_rdma_disconnect(rdma_hndl, 0); + if (retval) + DEBUG_LOG("handle:%p rdma_disconnect failed, " \ + "%m\n", rdma_hndl); + + if (rdma_hndl->ignore_disconnect && + !rdma_hndl->disconnect_nr) { + xio_ctx_del_delayed_work( + rdma_hndl->base.ctx, + &rdma_hndl->disconnect_timeout_work); + xio_set_disconnect_timer(rdma_hndl); + } + break; + case XIO_TRANSPORT_STATE_DISCONNECTED: + rdma_hndl->state = XIO_TRANSPORT_STATE_CLOSED; + + if (rdma_hndl->ignore_timewait && + !rdma_hndl->timewait_nr) { + xio_ctx_del_delayed_work( + rdma_hndl->base.ctx, + &rdma_hndl->timewait_timeout_work); + xio_set_timewait_timer(rdma_hndl); + } + break; + case XIO_TRANSPORT_STATE_CLOSED: + return; + default: + rdma_hndl->state = XIO_TRANSPORT_STATE_CLOSED; + break; + } + + kref_put(&transport->kref, xio_rdma_close_cb); +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_dup2 */ +/* makes new_trans_hndl be the copy of old_trans_hndl, closes new_trans_hndl */ +/* Note old and new are in dup2 terminology opposite to reconnect terms */ +/* --------------------------------------------------------------------------*/ +static int xio_rdma_dup2(struct xio_transport_base *old_trans_hndl, + struct xio_transport_base **new_trans_hndl) +{ + int ret = 0; + + struct xio_rdma_transport *old_hndl = + (struct xio_rdma_transport *)old_trans_hndl; + struct xio_rdma_transport *new_hndl = + (struct xio_rdma_transport *)*new_trans_hndl; + + /* if device is not the same an R_KEY replacement table is created */ + if (old_hndl->tcq->dev != new_hndl->tcq->dev) { + /* new is actually the old one we want to replace */ + ret = xio_rkey_table_create(new_hndl->tcq->dev, + old_hndl->tcq->dev, + &old_hndl->rkey_tbl, + &old_hndl->rkey_tbl_size); + if (ret) { + ERROR_LOG("rkey table creation failed\n"); + return -1; + } + } + + xio_rdma_close(*new_trans_hndl); + + /* nexus layer will call close which will only decrement */ + /*kref_get(&old_trans_hndl->kref);*/ + + *new_trans_hndl = old_trans_hndl; + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_new_rkey */ +/*---------------------------------------------------------------------------*/ + +static int xio_new_rkey(struct xio_rdma_transport *rdma_hndl, uint32_t *key) +{ + int i; + + if (!*key) + return 0; + + for (i = 0; i < rdma_hndl->peer_rkey_tbl_size; i++) { + if (rdma_hndl->peer_rkey_tbl[i].old_rkey == *key) { + *key = rdma_hndl->peer_rkey_tbl[i].new_rkey; + return 0; + } + } + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_update_task */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_update_task(struct xio_transport_base *trans_hndl, + struct xio_task *task) +{ + struct xio_rdma_transport *rdma_hndl = + (struct xio_rdma_transport *)trans_hndl; + XIO_TO_RDMA_TASK(task, rdma_task); + unsigned int i; + + for (i = 0; i < rdma_task->req_in_num_sge; i++) { + if (xio_new_rkey(rdma_hndl, &rdma_task->req_in_sge[i].stag)) + return -1; + } + + for (i = 0; i < rdma_task->req_out_num_sge; i++) { + if (xio_new_rkey(rdma_hndl, &rdma_task->req_out_sge[i].stag)) + return -1; + } + + return 0; +} + +static int xio_rdma_update_rkey(struct xio_transport_base *trans_hndl, + uint32_t *rkey) +{ + struct xio_rdma_transport *rdma_hndl = + (struct xio_rdma_transport *)trans_hndl; + + return 
xio_new_rkey(rdma_hndl, rkey); +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_accept */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_accept(struct xio_transport_base *transport) +{ + struct xio_rdma_transport *rdma_hndl = + (struct xio_rdma_transport *)transport; + int retval; + struct rdma_conn_param cm_params; + + memset(&cm_params, 0, sizeof(cm_params)); + /* + * Limit the responder resources requested by the remote + * to our capabilities. Note that the kernel swaps + * req->responder_resources and req->initiator_depth, so + * that req->responder_resources is actually the active + * side's initiator depth. + */ + if (rdma_hndl->client_responder_resources > + rdma_hndl->tcq->dev->device_attr.max_qp_rd_atom) + cm_params.responder_resources = + rdma_hndl->tcq->dev->device_attr.max_qp_rd_atom; + else + cm_params.responder_resources = + rdma_hndl->client_responder_resources; + + /* + * Note: if this side of the connection is never going to + * use RDMA read operations, then initiator_depth can be set + * to 0 here. + */ + if (rdma_hndl->client_initiator_depth > + rdma_hndl->tcq->dev->device_attr.max_qp_init_rd_atom) + cm_params.initiator_depth = + rdma_hndl->tcq->dev->device_attr.max_qp_init_rd_atom; + else + cm_params.initiator_depth = rdma_hndl->client_initiator_depth; + + /* "accept" the connection */ + retval = rdma_accept(rdma_hndl->cm_id, &cm_params); + if (retval) { + xio_set_error(errno); + DEBUG_LOG("rdma_accept failed. (errno=%d %m)\n", errno); + return -1; + } + rdma_hndl->client_responder_resources = cm_params.responder_resources; + rdma_hndl->client_initiator_depth = cm_params.initiator_depth; + + TRACE_LOG("rdma transport: [accept] handle:%p\n", rdma_hndl); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_reject */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_reject(struct xio_transport_base *transport) +{ + struct xio_rdma_transport *rdma_hndl = + (struct xio_rdma_transport *)transport; + int retval; + + /* "reject" the connection */ + retval = rdma_reject(rdma_hndl->cm_id, NULL, 0); + if (retval) { + xio_set_error(errno); + DEBUG_LOG("rdma_reject failed. (errno=%d %m)\n", errno); + return -1; + } + DEBUG_LOG("rdma transport: [reject] handle:%p\n", rdma_hndl); + + return 0; +} + +static int xio_rdma_do_connect(struct xio_transport_base *trans_hndl, + const char *out_if_addr) +{ + struct xio_rdma_transport *rdma_hndl = + (struct xio_rdma_transport *)trans_hndl; + union xio_sockaddr sa; + int retval = 0; + + /* resolve the portal_uri */ + if (xio_uri_to_ss(trans_hndl->portal_uri, &sa.sa_stor) == -1) { + xio_set_error(XIO_E_ADDR_ERROR); + ERROR_LOG("address [%s] resolving failed\n", + trans_hndl->portal_uri); + return -1; + } + + /* create cm id */ + retval = rdma_create_id(rdma_hndl->cm_channel->cm_channel, + &rdma_hndl->cm_id, + rdma_hndl, RDMA_PS_TCP); + if (retval) { + xio_set_error(errno); + ERROR_LOG("rdma_create id failed. (errno=%d %m)\n", errno); + goto exit1; + } + + if (out_if_addr) { + union xio_sockaddr if_sa; + + if (xio_host_port_to_ss(out_if_addr, + &if_sa.sa_stor) == -1) { + xio_set_error(XIO_E_ADDR_ERROR); + ERROR_LOG("outgoing interface [%s] resolving failed\n", + out_if_addr); + goto exit2; + } + retval = rdma_bind_addr(rdma_hndl->cm_id, &if_sa.sa); + if (retval) { + xio_set_error(errno); + ERROR_LOG("rdma_bind_addr failed. 
(errno=%d %m)\n", + errno); + goto exit2; + } + } + retval = rdma_resolve_addr(rdma_hndl->cm_id, NULL, &sa.sa, + ADDR_RESOLVE_TIMEOUT); + if (retval) { + xio_set_error(errno); + ERROR_LOG("rdma_resolve_addr failed. (errno=%d %m)\n", errno); + goto exit2; + } + + return 0; + +exit2: + TRACE_LOG("call rdma_destroy_id\n"); + rdma_destroy_id(rdma_hndl->cm_id); +exit1: + rdma_hndl->cm_id = NULL; + + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_connect */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_connect(struct xio_transport_base *trans_hndl, + const char *portal_uri, const char *out_if_addr) +{ + trans_hndl->is_client = 1; + + if (!portal_uri) { + xio_set_error(XIO_E_ADDR_ERROR); + ERROR_LOG("address [%s] resolving failed\n", portal_uri); + goto exit1; + } + + /* allocate memory for portal_uri */ + trans_hndl->portal_uri = strdup(portal_uri); + if (!trans_hndl->portal_uri) { + xio_set_error(ENOMEM); + ERROR_LOG("calloc failed. %m\n"); + goto exit1; + } + + if (xio_rdma_do_connect(trans_hndl, out_if_addr) < 0) + goto exit2; + + return 0; + +exit2: + free(trans_hndl->portal_uri); + trans_hndl->portal_uri = NULL; + +exit1: + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_listen */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_listen(struct xio_transport_base *transport, + const char *portal_uri, + uint16_t *src_port, int backlog) +{ + struct xio_rdma_transport *rdma_hndl = + (struct xio_rdma_transport *)transport; + union xio_sockaddr sa; + int retval = 0; + uint16_t sport; + + /* resolve the portal_uri */ + if (xio_uri_to_ss(portal_uri, &sa.sa_stor) == -1) { + xio_set_error(XIO_E_ADDR_ERROR); + DEBUG_LOG("address [%s] resolving failed\n", portal_uri); + return -1; + } + rdma_hndl->base.is_client = 0; + /*is_server = 1; */ + + /* create cm id */ + retval = rdma_create_id(rdma_hndl->cm_channel->cm_channel, + &rdma_hndl->cm_id, + rdma_hndl, RDMA_PS_TCP); + if (retval) { + xio_set_error(errno); + DEBUG_LOG("rdma_create id failed. (errno=%d %m)\n", errno); + goto exit1; + } + + retval = rdma_bind_addr(rdma_hndl->cm_id, &sa.sa); + if (retval) { + xio_set_error(errno); + DEBUG_LOG("rdma_bind_addr failed. (errno=%d %m)\n", errno); + goto exit2; + } + + /* 0 == maximum backlog */ + retval = rdma_listen(rdma_hndl->cm_id, backlog); + if (retval) { + xio_set_error(errno); + DEBUG_LOG("rdma_listen failed. 
(errno=%d %m)\n", errno); + goto exit2; + } + + sport = ntohs(rdma_get_src_port(rdma_hndl->cm_id)); + if (src_port) + *src_port = sport; + + rdma_hndl->state = XIO_TRANSPORT_STATE_LISTEN; + DEBUG_LOG("listen on [%s] src_port:%d\n", portal_uri, sport); + + return 0; + +exit2: + TRACE_LOG("call rdma_destroy_id\n"); + rdma_destroy_id(rdma_hndl->cm_id); +exit1: + rdma_hndl->cm_id = NULL; + + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_enable_fork_support */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_enable_fork_support(void) +{ + int retval; + + if (!disable_huge_pages) + setenv("RDMAV_HUGEPAGES_SAFE", "YES", 1); + retval = ibv_fork_init(); + if (retval) { + ERROR_LOG("ibv_fork_init failed (errno=%d %s)\n", + retval, strerror(retval)); + xio_set_error(errno); + return -1; + } + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_set_opt */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_set_opt(void *xio_obj, + int optname, const void *optval, int optlen) +{ + switch (optname) { + case XIO_OPTNAME_ENABLE_MEM_POOL: + VALIDATE_SZ(sizeof(int)); + rdma_options.enable_mem_pool = *((int *)optval); + return 0; + case XIO_OPTNAME_ENABLE_DMA_LATENCY: + VALIDATE_SZ(sizeof(int)); + rdma_options.enable_dma_latency = *((int *)optval); + return 0; + case XIO_OPTNAME_MAX_IN_IOVLEN: + VALIDATE_SZ(sizeof(int)); + rdma_options.max_in_iovsz = *((int *)optval); + return 0; + case XIO_OPTNAME_MAX_OUT_IOVLEN: + VALIDATE_SZ(sizeof(int)); + rdma_options.max_out_iovsz = *((int *)optval); + return 0; + case XIO_OPTNAME_QP_CAP_MAX_INLINE_DATA: + VALIDATE_SZ(sizeof(int)); + rdma_options.qp_cap_max_inline_data = *((int *)optval); + return 0; + case XIO_OPTNAME_ENABLE_FORK_INIT: + return xio_rdma_enable_fork_support(); + default: + break; + } + xio_set_error(XIO_E_NOT_SUPPORTED); + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_get_opt */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_get_opt(void *xio_obj, + int optname, void *optval, int *optlen) +{ + switch (optname) { + case XIO_OPTNAME_ENABLE_MEM_POOL: + *((int *)optval) = rdma_options.enable_mem_pool; + *optlen = sizeof(int); + return 0; + case XIO_OPTNAME_ENABLE_DMA_LATENCY: + *((int *)optval) = rdma_options.enable_dma_latency; + *optlen = sizeof(int); + return 0; + case XIO_OPTNAME_MAX_IN_IOVLEN: + *((int *)optval) = rdma_options.max_in_iovsz; + *optlen = sizeof(int); + return 0; + case XIO_OPTNAME_MAX_OUT_IOVLEN: + *((int *)optval) = rdma_options.max_out_iovsz; + *optlen = sizeof(int); + return 0; + case XIO_OPTNAME_QP_CAP_MAX_INLINE_DATA: + *((int *)optval) = rdma_options.qp_cap_max_inline_data; + *optlen = sizeof(int); + return 0; + case XIO_OPTNAME_RDMA_NUM_DEVICES: + *((int *)optval) = rdma_num_devices; + *optlen = sizeof(int); + return 0; + default: + break; + } + xio_set_error(XIO_E_NOT_SUPPORTED); + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_transport_modify */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_transport_modify(struct xio_transport_base *trans_hndl, + struct xio_transport_attr *attr, + int attr_mask) +{ + struct xio_rdma_transport *rdma_hndl = + (struct xio_rdma_transport 
*)trans_hndl; + int ret; + int modified = 0; + + if (test_bits(XIO_TRANSPORT_ATTR_TOS, &attr_mask)) { + ret = rdma_set_option(rdma_hndl->cm_id, RDMA_OPTION_ID, + RDMA_OPTION_ID_TOS, + &attr->tos, sizeof(attr->tos)); + if (unlikely(ret)) { + ERROR_LOG("set TOS option failed. %m\n"); + xio_set_error(errno); + return -1; + } + set_bits(XIO_TRANSPORT_ATTR_TOS, &rdma_hndl->trans_attr_mask); + rdma_hndl->trans_attr.tos = attr->tos; + modified = 1; + } + + if (modified) + return 0; + + xio_set_error(XIO_E_NOT_SUPPORTED); + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_transport_query */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_transport_query(struct xio_transport_base *trans_hndl, + struct xio_transport_attr *attr, + int attr_mask) +{ + struct xio_rdma_transport *rdma_hndl = + (struct xio_rdma_transport *)trans_hndl; + int queried = 0; + + if (test_bits(XIO_TRANSPORT_ATTR_TOS, &attr_mask)) { + if (test_bits(XIO_TRANSPORT_ATTR_TOS, + &rdma_hndl->trans_attr_mask)) { + attr->tos = rdma_hndl->trans_attr.tos; + queried = 1; + } else { + goto not_supported; + } + } + + if (queried) + return 0; + +not_supported: + xio_set_error(XIO_E_NOT_SUPPORTED); + return -1; +} + +/* + * To dynamically control C-states, open the file /dev/cpu_dma_latency and + * write the maximum allowable latency to it. This will prevent C-states with + * transition latencies higher than the specified value from being used, as + * long as the file /dev/cpu_dma_latency is kept open. + * Writing a maximum allowable latency of 0 will keep the processors in C0 + * (like using kernel parameter ―idle=poll), and writing 1 should force + * the processors to C1 when idle. Higher values could also be written to + * restrict the use of C-states with latency greater than the value written. 
+ * + * http://en.community.dell.com/techcenter/extras/m/white_papers/20227764/download.aspx + */ + +/*---------------------------------------------------------------------------*/ +/* xio_set_cpu_latency */ +/*---------------------------------------------------------------------------*/ +static int xio_set_cpu_latency(int *fd) +{ + int32_t latency = 0; + + if (!rdma_options.enable_dma_latency) + return 0; + + DEBUG_LOG("setting latency to %d us\n", latency); + *fd = open("/dev/cpu_dma_latency", O_WRONLY); + if (*fd < 0) { + ERROR_LOG( + "open /dev/cpu_dma_latency %m - need root permissions\n"); + return -1; + } + if (write(*fd, &latency, sizeof(latency)) != sizeof(latency)) { + ERROR_LOG( + "write to /dev/cpu_dma_latency %m - need root permissions\n"); + close(*fd); + *fd = -1; + return -1; + } + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_init */ +/*---------------------------------------------------------------------------*/ +static void xio_rdma_init(void) +{ + int retval = 0; + + INIT_LIST_HEAD(&cm_list); + + spin_lock_init(&mngmt_lock); + pthread_rwlock_init(&dev_lock, NULL); + pthread_rwlock_init(&cm_lock, NULL); + + /* set cpu latency until process is down */ + xio_set_cpu_latency(&cdl_fd); + + retval = xio_device_thread_init(); + if (retval != 0) { + ERROR_LOG("Failed to initialize devices thread\n"); + return; + } + + retval = xio_device_list_init(); + if (retval != 0) { + ERROR_LOG("Failed to initialize device list\n"); + return; + } + + /* storage for all memory registrations */ + xio_mr_list_init(); +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_transport_init */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_transport_init(struct xio_transport *transport) +{ + pthread_once(&ctor_key_once, xio_rdma_init); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_release */ +/*---------------------------------------------------------------------------*/ +static void xio_rdma_release(void) +{ + if (cdl_fd >= 0) + close(cdl_fd); + + /* free all redundant registered memory */ + xio_mr_list_free(); + + xio_device_thread_stop(); + + /* free devices */ + xio_device_list_release(); + + if (!list_empty(&cm_list)) + ERROR_LOG("cm_channel memory leakage\n"); + + pthread_rwlock_destroy(&dev_lock); + pthread_rwlock_destroy(&cm_lock); +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_transport_release */ +/*---------------------------------------------------------------------------*/ +static void xio_rdma_transport_release(struct xio_transport *transport) +{ + if (ctor_key_once == PTHREAD_ONCE_INIT) + return; + + pthread_once(&dtor_key_once, xio_rdma_release); +} + +/*---------------------------------------------------------------------------*/ +/* xio_is_valid_in_req */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_is_valid_in_req(struct xio_msg *msg) +{ + struct xio_vmsg *vmsg = &msg->in; + struct xio_sg_table_ops *sgtbl_ops; + void *sgtbl; + void *sge; + int32_t nents, max_nents; + unsigned int i; + int mr_found = 0; + + sgtbl = xio_sg_table_get(vmsg); + sgtbl_ops = (struct xio_sg_table_ops *) + xio_sg_table_ops_get(vmsg->sgl_type); + nents = tbl_nents(sgtbl_ops, sgtbl); + max_nents = tbl_max_nents(sgtbl_ops, sgtbl); + + if ((nents > rdma_options.max_in_iovsz) 
|| + (nents > max_nents)) + return 0; + + if (vmsg->sgl_type == XIO_SGL_TYPE_IOV && nents > XIO_IOVLEN) + return 0; + + if (vmsg->header.iov_base && (vmsg->header.iov_len == 0)) + return 0; + + for_each_sge(sgtbl, sgtbl_ops, sge, i) { + if (sge_mr(sgtbl_ops, sge)) + mr_found++; + if (!sge_addr(sgtbl_ops, sge)) { + if (sge_mr(sgtbl_ops, sge)) + return 0; + } else { + if (sge_length(sgtbl_ops, sge) == 0) + return 0; + } + } + if (mr_found != nents && mr_found) + return 0; + + return 1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_is_valid_out_msg */ +/*---------------------------------------------------------------------------*/ +static int xio_rdma_is_valid_out_msg(struct xio_msg *msg) +{ + struct xio_vmsg *vmsg = &msg->out; + struct xio_sg_table_ops *sgtbl_ops; + void *sgtbl; + void *sge; + int32_t nents, max_nents; + unsigned int i; + int mr_found = 0; + + sgtbl = xio_sg_table_get(&msg->out); + sgtbl_ops = (struct xio_sg_table_ops *) + xio_sg_table_ops_get(msg->out.sgl_type); + nents = tbl_nents(sgtbl_ops, sgtbl); + max_nents = tbl_max_nents(sgtbl_ops, sgtbl); + + if ((nents > rdma_options.max_out_iovsz) || + (nents > max_nents)) { + ERROR_LOG("sgl exceeded allowed size (nents=%zu, max_nents=%zu, max_out_iovsz=%zu)\n", + nents, max_nents, rdma_options.max_out_iovsz); + return 0; + } + + if (vmsg->sgl_type == XIO_SGL_TYPE_IOV && nents > XIO_IOVLEN) { + ERROR_LOG("sgl (iovec) too big (nents=%zu, max=%zu)\n", XIO_IOVLEN); + return 0; + } + + if (!vmsg->header.iov_base && (vmsg->header.iov_len != 0)) { + ERROR_LOG("Header ptr is NULL (vmsg=%p)\n", vmsg); + return 0; + } + + if (vmsg->header.iov_len > (size_t)g_options.max_inline_xio_hdr){ + ERROR_LOG("Header length exceeds max (len=%zu, max=%zu)\n", + vmsg->header.iov_len, (size_t)g_options.max_inline_xio_hdr); + return 0; + } + + for_each_sge(sgtbl, sgtbl_ops, sge, i) { + if (sge_mr(sgtbl_ops, sge)) + mr_found++; + if (!sge_addr(sgtbl_ops, sge) || + (sge_length(sgtbl_ops, sge) == 0)) + return 0; + } + + if (mr_found != nents && mr_found){ + ERROR_LOG( + "not all entries has mr (mr_found=%d, nents=%zu)\n", + mr_found, nents); + return 0; + } + + return 1; +} + +/* task pools management */ +/*---------------------------------------------------------------------------*/ +/* xio_rdma_get_pools_ops */ +/*---------------------------------------------------------------------------*/ +static void xio_rdma_get_pools_ops( + struct xio_transport_base *trans_hndl, + struct xio_tasks_pool_ops **initial_pool_ops, + struct xio_tasks_pool_ops **primary_pool_ops) +{ + *initial_pool_ops = &initial_tasks_pool_ops; + *primary_pool_ops = &primary_tasks_pool_ops; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_set_pools_cls */ +/*---------------------------------------------------------------------------*/ +static void xio_rdma_set_pools_cls( + struct xio_transport_base *trans_hndl, + struct xio_tasks_pool_cls *initial_pool_cls, + struct xio_tasks_pool_cls *primary_pool_cls) +{ + struct xio_rdma_transport *rdma_hndl = + (struct xio_rdma_transport *)trans_hndl; + + if (initial_pool_cls) + rdma_hndl->initial_pool_cls = *initial_pool_cls; + if (primary_pool_cls) + rdma_hndl->primary_pool_cls = *primary_pool_cls; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_transport_constructor */ +/*---------------------------------------------------------------------------*/ +void xio_rdma_transport_constructor(void) +{ + /* 
this must be before calling setenv as libibverbs may crash + * see libibverbs/src/device.c clone_env + */ + xio_device_list_check(); + + /* Mellanox OFED's User Manual */ + setenv("RDMAV_HUGEPAGES_SAFE", "1", 0); + setenv("MLX_QP_ALLOC_TYPE", "PREFER_CONTIG", 0); + setenv("MLX_CQ_ALLOC_TYPE", "PREFER_CONTIG", 0); + + /* Mellanox OFED's User Manual */ + /* + setenv("MLX_QP_ALLOC_TYPE","PREFER_CONTIG", 1); + setenv("MLX_CQ_ALLOC_TYPE","ALL", 1); + setenv("MLX_MR_ALLOC_TYPE","ALL", 1); + */ + if (0) + xio_rdma_enable_fork_support(); + + spin_lock_init(&dev_list_lock); +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_transport_destructor */ +/*---------------------------------------------------------------------------*/ +void xio_rdma_transport_destructor(void) +{ + ctor_key_once = PTHREAD_ONCE_INIT; + dtor_key_once = PTHREAD_ONCE_INIT; +} + +struct xio_transport xio_rdma_transport = { + .name = "rdma", + .ctor = xio_rdma_transport_constructor, + .dtor = xio_rdma_transport_destructor, + .init = xio_rdma_transport_init, + .release = xio_rdma_transport_release, + .context_shutdown = xio_rdma_context_shutdown, + .open = xio_rdma_open, + .connect = xio_rdma_connect, + .listen = xio_rdma_listen, + .accept = xio_rdma_accept, + .reject = xio_rdma_reject, + .close = xio_rdma_close, + .dup2 = xio_rdma_dup2, + .update_task = xio_rdma_update_task, + .update_rkey = xio_rdma_update_rkey, + .send = xio_rdma_send, + .poll = NULL, + .set_opt = xio_rdma_set_opt, + .get_opt = xio_rdma_get_opt, + .cancel_req = xio_rdma_cancel_req, + .cancel_rsp = xio_rdma_cancel_rsp, + .get_pools_setup_ops = xio_rdma_get_pools_ops, + .set_pools_cls = xio_rdma_set_pools_cls, + .modify = xio_rdma_transport_modify, + .query = xio_rdma_transport_query, + + .validators_cls.is_valid_in_req = xio_rdma_is_valid_in_req, + .validators_cls.is_valid_out_msg = xio_rdma_is_valid_out_msg, +}; + +/*---------------------------------------------------------------------------*/ +/* xio_is_rdma_dev_exist */ +/*---------------------------------------------------------------------------*/ +int xio_is_rdma_dev_exist() +{ + + struct ibv_device **dev_list; + int num_devices = 0; + int retval = 0; + + dev_list = ibv_get_device_list(&num_devices); + if (!dev_list) + return -1; + + if (!*dev_list || num_devices == 0) { + retval = -1; + goto exit; + } + +exit: + ibv_free_device_list(dev_list); + + return retval; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_get_transport_func_list */ +/*---------------------------------------------------------------------------*/ +struct xio_transport *xio_rdma_get_transport_func_list(void) +{ + /* we wish to compile and link with rdma but + * infiniband devices are not installed on the machines. + * this case ignore rdma (only tcp is available for usage) + */ + if (xio_is_rdma_dev_exist() == -1) { + DEBUG_LOG("no capable device installed\n"); + INIT_LIST_HEAD(&dev_list); + return NULL; + } + + return &xio_rdma_transport; +} diff --git a/open_src/xio/src/usr/transport/rdma/xio_rdma_transport.h b/open_src/xio/src/usr/transport/rdma/xio_rdma_transport.h new file mode 100644 index 0000000..22a0f0d --- /dev/null +++ b/open_src/xio/src/usr/transport/rdma/xio_rdma_transport.h @@ -0,0 +1,580 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. 
+ * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef XIO_RDMA_TRANSPORT_H +#define XIO_RDMA_TRANSPORT_H + +#include +#include "xio_hash.h" + +/*---------------------------------------------------------------------------*/ +/* externals */ +/*---------------------------------------------------------------------------*/ +extern int page_size; +extern double g_mhz; +extern struct xio_rdma_options rdma_options; +extern struct list_head dev_list; +extern spinlock_t dev_list_lock; + +#define XIO_DISCONNECT_TIMEOUT 100 /* 100 mili */ +#define XIO_TIMEWAIT_EXIT_TIMEOUT 60000 /* 1 minute */ +#define XIO_TIMEWAIT_EXIT_FAST_TIMEOUT 0 /* 0 milliseconds */ + +/* poll_cq definitions */ +#define MAX_RDMA_ADAPTERS 64 /* 64 adapters per unit */ +#define MAX_POLL_WC 128 +#define NUM_POLL_CQ 16 + +#define ADDR_RESOLVE_TIMEOUT 1000 +#define ROUTE_RESOLVE_TIMEOUT 1000 + + /* 256 rdma_write + 1 send */ +#define MAX_SEND_WR (XIO_MAX_IOV + 1) +#define MAX_RECV_WR (XIO_MAX_IOV) +#define EXTRA_RQE 32 +#define SEND_QE NUM_START_PRIMARY_POOL_TASKS - EXTRA_RQE - MAX_RECV_WR +#define MAX_ACKED_CQE 128 +#define XIO_DEV_ATTR_MAX_SGE 30 + +#define MAX_CQE_PER_QP (MAX_SEND_WR + MAX_RECV_WR + EXTRA_RQE) +#define CQE_ALLOC_SIZE (10 * MAX_CQE_PER_QP) + +#define BUDGET_SIZE 1024 +#define MAX_NUM_DELAYED_ARM 16 + +#define NUM_START_PHANTOM_POOL_TASKS 0 +#define NUM_ALLOC_PHANTOM_POOL_TASKS 512 +#define NUM_MAX_PHANTOM_POOL_TASKS 32768 + +#define SOFT_CQ_MOD 8 +#define HARD_CQ_MOD 64 +#define SEND_THRESHOLD 8 +#define SRQ_DEPTH 1024 + +#define XIO_BEACON_WRID 0xfffffffffffffffeULL + +#define PAGE_SIZE page_size +/* see if a pointer is page aligned. 
*/ +#define IS_PAGE_ALIGNED(ptr) \ + (((PAGE_SIZE - 1) & (intptr_t)(ptr)) == 0) + +#define XIO_TO_RDMA_TASK(xt, rt) \ + struct xio_rdma_task *(rt) = \ + (struct xio_rdma_task *)(xt)->dd_data +#define XIO_TO_RDMA_HNDL(xt, rh) \ + struct xio_rdma_transport *(rh) = \ + (struct xio_rdma_transport *)(xt)->context + +#ifdef HAVE_MPAGES_EXP +# define IBV_XIO_ACCESS_ALLOCATE_MR IBV_EXP_ACCESS_ALLOCATE_MR +# define IBV_IS_MPAGES_AVAIL(_attr) ((_attr)->exp_device_cap_flags \ + & IBV_EXP_DEVICE_MR_ALLOCATE) +# define ibv_xio_device_attr ibv_exp_device_attr +# define ibv_xio_query_device ibv_exp_query_device +# define ibv_xio_reg_mr ibv_exp_reg_mr +#else +# ifdef HAVE_MPAGES +# define IBV_XIO_ACCESS_ALLOCATE_MR IBV_ACCESS_ALLOCATE_MR +# define IBV_IS_MPAGES_AVAIL(_attr) ((_attr)->device_cap_flags \ + & IBV_DEVICE_MR_ALLOCATE) +# else +# define IBV_XIO_ACCESS_ALLOCATE_MR (0) +# define IBV_IS_MPAGES_AVAIL(_attr) (0) +# endif + +# define ibv_xio_device_attr ibv_device_attr +# define ibv_xio_query_device ibv_query_device + +struct ibv_exp_reg_mr_in { + struct ibv_pd *pd; + void *addr; + size_t length; + int exp_access; + uint32_t comp_mask; +}; + +static inline struct ibv_mr *ibv_xio_reg_mr(struct ibv_exp_reg_mr_in *in) +{ + return ibv_reg_mr(in->pd, in->addr, in->length, in->exp_access); +} +#endif + +/*---------------------------------------------------------------------------*/ +/* enums */ +/*---------------------------------------------------------------------------*/ +enum xio_ib_op_code { + XIO_IB_NULL, + XIO_IB_RECV = 1, + XIO_IB_SEND, + XIO_IB_RDMA_WRITE, + XIO_IB_RDMA_READ, + XIO_IB_RDMA_WRITE_DIRECT, + XIO_IB_RDMA_READ_DIRECT +}; + +struct xio_transport_base; +struct xio_rdma_transport; + +/*---------------------------------------------------------------------------*/ +struct xio_rdma_options { + int enable_mem_pool; + int enable_dma_latency; + int max_in_iovsz; + int max_out_iovsz; + int qp_cap_max_inline_data; +}; + +#define XIO_REQ_HEADER_VERSION 1 + +struct __attribute__((__packed__)) xio_rdma_req_hdr { + uint8_t version; /* request version */ + uint8_t flags; + uint16_t req_hdr_len; /* req header length */ + uint16_t sn; /* serial number */ + uint16_t ack_sn; /* ack serial number */ + + uint16_t credits; /* peer send credits */ + uint32_t ltid; /* originator identifier*/ + uint8_t in_ib_op; /* opcode for peers */ + uint8_t out_ib_op; + + uint16_t in_num_sge; + uint16_t out_num_sge; + uint32_t pad1; + + uint16_t ulp_hdr_len; /* ulp header length */ + uint16_t ulp_pad_len; /* pad_len length */ + uint32_t remain_data_len;/* remaining data length */ + uint64_t ulp_imm_len; /* ulp data length */ +}; + +#define XIO_RSP_HEADER_VERSION 1 + +struct __attribute__((__packed__)) xio_rdma_rsp_hdr { + uint8_t version; /* response version */ + uint8_t flags; + uint16_t rsp_hdr_len; /* rsp header length */ + uint16_t sn; /* serial number */ + uint16_t ack_sn; /* ack serial number */ + + uint16_t credits; /* peer send credits */ + uint32_t rtid; /* originator identifier*/ + uint8_t out_ib_op; /* opcode for peers */ + uint8_t pad; + + uint16_t pad1; + uint16_t out_num_sge; + uint32_t status; /* status */ + + uint32_t ltid; /* local task id */ + uint16_t ulp_hdr_len; /* ulp header length */ + uint16_t ulp_pad_len; /* pad_len length */ + + uint32_t remain_data_len;/* remaining data length */ + + uint64_t ulp_imm_len; /* ulp data length */ +}; + +struct __attribute__((__packed__)) xio_rdma_setup_msg { + uint16_t credits; /* peer send credits */ + uint16_t sq_depth; + uint16_t rq_depth; + uint16_t 
rkey_tbl_size; + uint64_t buffer_sz; + uint32_t max_in_iovsz; + uint32_t max_out_iovsz; + uint32_t max_header_len; + uint32_t pad; +}; + +struct __attribute__((__packed__)) xio_nop_hdr { + uint16_t hdr_len; /* req header length */ + uint16_t sn; /* serial number */ + uint16_t ack_sn; /* ack serial number */ + uint16_t credits; /* peer send credits */ + uint8_t opcode; /* opcode for peers */ + uint8_t flags; /* not used */ + uint16_t pad; +}; + +struct __attribute__((__packed__)) xio_rdma_read_ack_hdr { + uint16_t hdr_len; /* req header length */ + uint32_t rtid; /* remote task id */ +}; + +struct __attribute__((__packed__)) xio_rdma_cancel_hdr { + uint16_t hdr_len; /* req header length */ + uint16_t sn; /* serial number */ + uint32_t result; +}; + +struct xio_work_req { + union { + struct ibv_send_wr send_wr; + struct ibv_recv_wr recv_wr; + }; + struct ibv_sge *sge; +}; + +struct xio_rdma_task { + enum xio_ib_op_code out_ib_op; + enum xio_ib_op_code in_ib_op; + + /* The buffer mapped with the 3 xio_work_req + * used to transfer the headers + */ + struct xio_work_req txd; + struct xio_work_req rxd; + struct xio_work_req rdmad; + + /* User (from vmsg) or pool buffer used for */ + uint16_t read_num_reg_mem; + uint16_t write_num_reg_mem; + uint32_t pad0; + struct xio_reg_mem *read_reg_mem; + struct xio_reg_mem *write_reg_mem; + + /* What this side got from the peer for RDMA R/W + */ + uint16_t req_in_num_sge; + uint16_t req_out_num_sge; + uint16_t rsp_out_num_sge; + uint16_t pad1; + + /* can serve send/rdma write */ + struct xio_sge *req_in_sge; + + /* can serve send/rdma read */ + struct xio_sge *req_out_sge; + + /* can serve send/rdma read response/rdma write */ + struct xio_sge *rsp_out_sge; + + unsigned int phantom_idx; + uint16_t sn; + uint8_t rflags; + uint8_t pad; +}; + +struct xio_cq { + struct ibv_cq *cq; + struct ibv_comp_channel *channel; + struct xio_context *ctx; + struct xio_device *dev; + struct xio_ev_data consume_cq_event; + struct xio_ev_data poll_cq_event; + struct ibv_wc *wc_array; + int32_t wc_array_len; + int32_t cq_events_that_need_ack; + int32_t max_cqe; /* max snd elements */ + int32_t cq_depth; /* current cq depth */ + int32_t alloc_sz; /* allocation factor */ + int32_t cqe_avail; /* free elements */ + struct kref kref; /* utilization counter */ + int32_t num_delayed_arm; + int32_t num_poll_cq; + int32_t pad; + struct list_head trans_list; /* list of all transports + * attached to this cq + */ + struct list_head cq_list_entry; /* list of all + cq per device */ + struct xio_observer observer; + struct xio_srq *srq; +}; + +struct xio_srq { + HT_HEAD(, rdma_hndl, HASHTABLE_PRIME_SMALL) ht_rdma_hndl; + struct ibv_srq *srq; + struct list_head rx_list; + int rqe_avail; /* recv queue elements + avail */ + int pad; +}; + +struct xio_device { + struct list_head cq_list; + struct list_head dev_list_entry; /* list of all + xio devices */ + pthread_rwlock_t cq_lock; + struct ibv_context *verbs; + struct ibv_pd *pd; + struct ibv_xio_device_attr device_attr; + struct list_head xm_list; /* list of xio_mr_elem */ + struct kref kref; + uint32_t kref_pad; +}; + +struct xio_mr_elem { + struct xio_device *dev; + struct ibv_mr *mr; + struct list_head dm_list_entry; /* entry in mr list */ + struct list_head xm_list_entry; /* entry in dev list */ +}; + +struct xio_rdma_tasks_slab { + /* memory for non-rdma send/recv */ + void *data_pool; + + /* memory registration for data */ + struct xio_mr *data_mr; + struct xio_reg_mem reg_mem; + int buf_size; + int alloc_nr; +}; + +struct 
__attribute__((__packed__)) xio_rkey_tbl_pack { + uint32_t old_rkey; + uint32_t new_rkey; +}; + +struct xio_rkey_tbl { + uint32_t old_rkey; + uint32_t new_rkey; +}; + +struct xio_rdma_transport { + struct xio_transport_base base; + struct xio_cq *tcq; + struct ibv_qp *qp; + struct xio_mempool *rdma_mempool; + struct xio_tasks_pool *phantom_tasks_pool; + + struct list_head trans_list_entry; + + /* tasks queues */ + struct list_head tx_ready_list; + struct list_head tx_comp_list; + struct list_head in_flight_list; + struct list_head rx_list; + struct list_head io_list; + struct list_head rdma_rd_req_list; + struct list_head rdma_rd_req_in_flight_list; + struct list_head rdma_rd_rsp_list; + struct list_head rdma_rd_rsp_in_flight_list; + + /* rx parameters */ + int rq_depth; /* max rcv per qp + allowed */ + int rqe_avail; /* recv queue elements + avail */ + uint16_t sim_peer_credits; /* simulates the peer + * credits management + * to control nop + * sends + */ + uint16_t credits; /* the ack this + peer sends */ + uint16_t peer_credits; + + uint16_t pad; + uint32_t peer_max_header; + + /* fast path params */ + int rdma_rd_req_in_flight; + int rdma_rd_rsp_in_flight; + int sqe_avail; + enum xio_transport_state state; + + /* tx parameters */ + int kick_rdma_rd_req; + int kick_rdma_rd_rsp; + int reqs_in_flight_nr; + int rsps_in_flight_nr; + int tx_ready_tasks_num; + int max_tx_ready_tasks_num; + int max_inline_data; + size_t max_inline_buf_sz; + int max_sge; + uint16_t req_sig_cnt; + uint16_t rsp_sig_cnt; + /* sender window parameters */ + uint16_t sn; /* serial number */ + uint16_t ack_sn; /* serial number */ + + uint16_t max_sn; /* upper edge of + sender's window + 1 */ + + /* receiver window parameters */ + uint16_t exp_sn; /* lower edge of + receiver's window */ + + uint16_t max_exp_sn; /* upper edge of + receiver's window + 1 */ + + uint16_t pad1; + + /* control path params */ + int sq_depth; /* max snd allowed */ + uint16_t client_initiator_depth; + uint16_t client_responder_resources; + + uint32_t peer_max_in_iovsz; + uint32_t peer_max_out_iovsz; + int32_t handler_nesting; + /* connection's flow control */ + size_t membuf_sz; + + struct xio_transport *transport; + struct xio_cm_channel *cm_channel; + struct rdma_cm_id *cm_id; + struct xio_tasks_pool_cls initial_pool_cls; + struct xio_tasks_pool_cls primary_pool_cls; + + struct xio_rdma_setup_msg setup_rsp; + + /* for reconnect */ + struct xio_device *dev; + struct xio_rkey_tbl *rkey_tbl; + struct xio_rkey_tbl *peer_rkey_tbl; + + /* for reconnect */ + uint16_t rkey_tbl_size; + uint16_t peer_rkey_tbl_size; + + uint32_t ignore_timewait:1; + uint32_t timewait_nr:1; /* flag */ + uint32_t ignore_disconnect:1; + uint32_t disconnect_nr:1; /* flag */ + uint32_t beacon_sent:1; + uint32_t reserved:27; + + /* too big to be on stack - use as temporaries */ + union { + struct xio_msg dummy_msg; + struct xio_work_req dummy_wr; + }; + struct xio_ev_data close_event; + struct xio_ev_data timewait_exit_event; + xio_delayed_work_handle_t timewait_timeout_work; + xio_delayed_work_handle_t disconnect_timeout_work; + struct ibv_send_wr beacon; + struct xio_task beacon_task; + uint32_t trans_attr_mask; + struct xio_transport_attr trans_attr; + struct xio_srq *xio_srq; + HT_ENTRY(rdma_hndl, xio_key_int32) rdma_hndl_htbl; +}; + +struct xio_cm_channel { + struct rdma_event_channel *cm_channel; + struct xio_context *ctx; + struct list_head channels_list_entry; + struct kref kref; /* utilization counter */ + int pad; +}; + +struct xio_dev_tdata { + pthread_t 
dev_thread; + void *async_loop; +}; + +/* xio_rdma_verbs.c */ +void xio_mr_list_init(void); +int xio_mr_list_free(void); +const char *ibv_wc_opcode_str(enum ibv_wc_opcode opcode); + +void xio_cq_event_handler(int fd, int events, void *data); +int xio_post_recv(struct xio_rdma_transport *rdma_hndl, + struct xio_task *task, int num_recv_bufs); +int xio_rdma_rearm_rq(struct xio_rdma_transport *rdma_hndl); + +int xio_rdma_send(struct xio_transport_base *transport, + struct xio_task *task); +int xio_rdma_poll(struct xio_transport_base *transport, + long min_nr, long nr, + struct timespec *ts_timeout); + +int xio_rdma_cancel_req(struct xio_transport_base *transport, + struct xio_msg *req, uint64_t stag, + void *ulp_msg, size_t ulp_msg_sz); + +int xio_rdma_cancel_rsp(struct xio_transport_base *transport, + struct xio_task *task, enum xio_status result, + void *ulp_msg, size_t ulp_msg_sz); + +/* xio_rdma_management.c */ +int xio_rdma_get_max_header_size(void); + +int xio_rdma_get_inline_buffer_size(void); + +struct xio_task *xio_rdma_primary_task_alloc( + struct xio_rdma_transport *rdma_hndl); + +struct xio_task *xio_rdma_primary_task_lookup( + struct xio_rdma_transport *rdma_hndl, + int tid); + +void xio_rdma_task_free(struct xio_rdma_transport *rdma_hndl, + struct xio_task *task); + +static inline void xio_device_get(struct xio_device *dev) +{ + kref_get(&dev->kref); +} + +void xio_rdma_close_cb(struct kref *kref); + +void xio_device_down(struct kref *kref); + +static inline void xio_device_put(struct xio_device *dev) +{ + kref_put(&dev->kref, xio_device_down); +} + +void xio_set_timewait_timer(struct xio_rdma_transport *rdma_hndl); + +/*---------------------------------------------------------------------------*/ +/* xio_reg_mr_add_dev */ +/* add a new discovered device to a the mr list */ +/*---------------------------------------------------------------------------*/ +int xio_reg_mr_add_dev(struct xio_device *dev); + +/*---------------------------------------------------------------------------*/ +/* xio_dereg_mr_by_dev */ +/*---------------------------------------------------------------------------*/ +int xio_dereg_mr_by_dev(struct xio_device *dev); + +/*---------------------------------------------------------------------------*/ +/* xio_rkey_table_create */ +/*---------------------------------------------------------------------------*/ +int xio_rkey_table_create(struct xio_device *old, struct xio_device *_new, + struct xio_rkey_tbl **htbl, uint16_t *len); + +void xio_rdma_poll_completions(struct xio_cq *tcq, int timeout_us); + +#endif /* XIO_RDMA_TRANSPORT_H */ diff --git a/open_src/xio/src/usr/transport/rdma/xio_rdma_utils.c b/open_src/xio/src/usr/transport/rdma/xio_rdma_utils.c new file mode 100644 index 0000000..07a0e5e --- /dev/null +++ b/open_src/xio/src/usr/transport/rdma/xio_rdma_utils.c @@ -0,0 +1,247 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. 
+ * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#include +#include +#include +#include + +#include "libxio.h" +#include "xio_log.h" +#include "xio_common.h" +#include "xio_observer.h" +#include "xio_transport.h" +#include "xio_usr_transport.h" +#include "xio_mempool.h" +#include "xio_protocol.h" +#include "xio_mbuf.h" +#include "xio_task.h" +#include "xio_rdma_utils.h" +#include "xio_ev_data.h" +#include "xio_objpool.h" +#include "xio_workqueue.h" +#include "xio_context.h" +#include "xio_rdma_transport.h" + +/*---------------------------------------------------------------------------*/ +/* xio_validate_rdma_op */ +/*---------------------------------------------------------------------------*/ +int xio_validate_rdma_op( + struct xio_sge *lsg_list, size_t lsize, + struct xio_sge *rsg_list, size_t rsize, + int op_size, + int max_sge, + int *tasks_used) +{ + unsigned int l = 0, + r = 0; + uint64_t laddr = lsg_list[0].addr; + uint64_t raddr = rsg_list[0].addr; + uint32_t llen = lsg_list[0].length; + uint32_t rlen = rsg_list[0].length; + int32_t tot_len = 0; + int k = 0; + + if (lsize < 1 || rsize < 1) { + ERROR_LOG("iovec size < 1 lsize:%zd, rsize:%zd\n", + lsize, rsize); + *tasks_used = 0; + return -1; + } + + /* At least one task */ + *tasks_used = 1; + + while (1) { + if (rlen < llen) { + r++; + tot_len += rlen; + if (r == rsize) + break; + llen -= rlen; + laddr += rlen; + raddr = rsg_list[r].addr; + rlen = rsg_list[r].length; + (*tasks_used)++; + k = 0; + } else if (llen < rlen) { + l++; + tot_len += llen; + if (l == lsize) + break; + k++; + if (k == max_sge - 1) { + /* reached last index */ + (*tasks_used)++; + k = 0; + } + rlen -= llen; + raddr += llen; + laddr = lsg_list[l].addr; + llen = lsg_list[l].length; + } else { + l++; + r++; + tot_len += llen; + if ((l == lsize) || (r == 
rsize)) + break; + laddr = lsg_list[l].addr; + llen = lsg_list[l].length; + raddr = rsg_list[r].addr; + rlen = rsg_list[r].length; + (*tasks_used)++; + k = 0; + } + } + + /* not enough buffers to complete */ + if (tot_len < op_size) { + *tasks_used = 0; + ERROR_LOG("iovec exhausted\n"); + return -1; + } + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_cm_rej_reason_str */ +/*---------------------------------------------------------------------------*/ +const char *xio_cm_rej_reason_str(int reason) +{ + switch (reason) { + case IB_CM_REJ_NO_QP: + return "No QP"; + case IB_CM_REJ_NO_EEC: + return "No EEC"; + case IB_CM_REJ_NO_RESOURCES: + return "No Resources"; + case IB_CM_REJ_TIMEOUT: + return "Timeout"; + case IB_CM_REJ_UNSUPPORTED: + return "Unsupported"; + case IB_CM_REJ_INVALID_COMM_ID: + return "Invalid COMM ID"; + case IB_CM_REJ_INVALID_COMM_INSTANCE: + return "Invalid COMM Instance"; + case IB_CM_REJ_INVALID_SERVICE_ID: + return "Invalid Service ID"; + case IB_CM_REJ_INVALID_TRANSPORT_TYPE: + return "Invalid Transport Type"; + case IB_CM_REJ_STALE_CONN: + return "Stale Connection"; + case IB_CM_REJ_RDC_NOT_EXIST: + return "RDC not exist"; + case IB_CM_REJ_INVALID_GID: + return "Invalid GID"; + case IB_CM_REJ_INVALID_LID: + return "Invalid LID"; + case IB_CM_REJ_INVALID_SL: + return "Invalid SL"; + case IB_CM_REJ_INVALID_TRAFFIC_CLASS: + return "Invalid Traffic Class"; + case IB_CM_REJ_INVALID_HOP_LIMIT: + return "Invalid HOP Limit"; + case IB_CM_REJ_INVALID_PACKET_RATE: + return "Invalid Packet Rate"; + case IB_CM_REJ_INVALID_ALT_GID: + return "Invalid Alt GID"; + case IB_CM_REJ_INVALID_ALT_LID: + return "Invalid Alt LID"; + case IB_CM_REJ_INVALID_ALT_SL: + return "Invalid Alt SL"; + case IB_CM_REJ_INVALID_ALT_TRAFFIC_CLASS: + return "Invalid Alt Traffic Class"; + case IB_CM_REJ_INVALID_ALT_HOP_LIMIT: + return "Invalid Alt HOP Limit"; + case IB_CM_REJ_INVALID_ALT_PACKET_RATE: + return "Invalid Alt Packet Rate"; + case IB_CM_REJ_PORT_CM_REDIRECT: + return "Invalid Alt Packet Rate"; + case IB_CM_REJ_PORT_REDIRECT: + return "Port Redirect"; + case IB_CM_REJ_INVALID_MTU: + return "Invalid MTU"; + case IB_CM_REJ_INSUFFICIENT_RESP_RESOURCES: + return "Invalid Response Resources"; + case IB_CM_REJ_CONSUMER_DEFINED: + return "Consumer Defined"; + case IB_CM_REJ_INVALID_RNR_RETRY: + return "Invalid RNR Retry"; + case IB_CM_REJ_DUPLICATE_LOCAL_COMM_ID: + return "Duplicate Local Comm ID"; + case IB_CM_REJ_INVALID_CLASS_VERSION: + return "Invalid Class Version"; + case IB_CM_REJ_INVALID_FLOW_LABEL: + return "Invalid Flow Label"; + case IB_CM_REJ_INVALID_ALT_FLOW_LABEL: + return "Invalid Alt Flow Label"; + default: + return "Unknown error"; + }; +} + +void xio_validate_ulimit_memlock(void) +{ + struct rlimit mlock_limit; + + if (getrlimit(RLIMIT_MEMLOCK, &mlock_limit)) { + ERROR_LOG("getrlimit call failed. 
(errno=%d %m)\n", errno); + return; + } + if (mlock_limit.rlim_cur != RLIM_INFINITY) { + WARN_LOG("Verify that Max Locked Memory (ulimit -l) " \ + "setting is on unlimited (current is %ld)\n", + mlock_limit.rlim_cur); + } +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_mr_lookup */ +/*---------------------------------------------------------------------------*/ +struct ibv_mr *xio_rdma_mr_lookup(const struct xio_mr *tmr, + const struct xio_device *dev) +{ + struct xio_mr_elem *tmr_elem; + const struct list_head *dm_list = &tmr->dm_list; + + list_for_each_entry(tmr_elem, dm_list, dm_list_entry) { + if (dev == tmr_elem->dev) + return tmr_elem->mr; + } + return NULL; +} diff --git a/open_src/xio/src/usr/transport/rdma/xio_rdma_utils.h b/open_src/xio/src/usr/transport/rdma/xio_rdma_utils.h new file mode 100644 index 0000000..2cb6ad6 --- /dev/null +++ b/open_src/xio/src/usr/transport/rdma/xio_rdma_utils.h @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ +#ifndef XIO_RDMA_UTILS_H +#define XIO_RDMA_UTILS_H + +/* full memory barrier */ +#define cpu_relax() __sync_synchronize() + +int xio_validate_rdma_op( + struct xio_sge *lsge, size_t lsize, + struct xio_sge *rsge, size_t rsize, + int op_size, + int max_sge, + int *tasks_used); + +const char *xio_cm_rej_reason_str(int reason); + +void xio_validate_ulimit_memlock(void); + +struct xio_device; +struct ibv_mr *xio_rdma_mr_lookup(const struct xio_mr *tmr, + const struct xio_device *dev); + +#endif /*XIO_RDMA_UTILS_H */ diff --git a/open_src/xio/src/usr/transport/rdma/xio_rdma_verbs.c b/open_src/xio/src/usr/transport/rdma/xio_rdma_verbs.c new file mode 100644 index 0000000..3e7e306 --- /dev/null +++ b/open_src/xio/src/usr/transport/rdma/xio_rdma_verbs.c @@ -0,0 +1,687 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ +#include +#include +#include +#include "libxio.h" +#include "xio_log.h" +#include "xio_common.h" +#include "xio_observer.h" +#include "xio_transport.h" +#include "xio_protocol.h" +#include "get_clock.h" +#include "xio_mem.h" +#include "xio_usr_transport.h" +#include "xio_mempool.h" +#include "xio_protocol.h" +#include "xio_mbuf.h" +#include "xio_task.h" +#include "xio_rdma_utils.h" +#include "xio_ev_data.h" +#include "xio_objpool.h" +#include "xio_workqueue.h" +#include "xio_context.h" +#include "xio_rdma_transport.h" + +/*---------------------------------------------------------------------------*/ +/* globals */ +/*---------------------------------------------------------------------------*/ +static LIST_HEAD(mr_list); +static spinlock_t mr_list_lock; +static uint32_t mr_num; /* checkpatch doesn't like initializing static vars */ + +/*---------------------------------------------------------------------------*/ +/* xio_register_transport */ +/*---------------------------------------------------------------------------*/ +static int xio_register_transport(void) +{ + static int init_transport; + + /* this may the first call in application so initialize the rdma */ + if (!init_transport) { + struct xio_transport *transport = xio_get_transport("rdma"); + + if (!transport) + return 0; + + init_transport = 1; + } + + return init_transport; +} + +/*---------------------------------------------------------------------------*/ +/* xio_mem_register_no_dev */ +/*---------------------------------------------------------------------------*/ +static inline int xio_mem_register_no_dev(void *addr, size_t length, + struct xio_reg_mem *reg_mem) +{ + static struct xio_mr dummy_mr; + + reg_mem->addr = addr; + reg_mem->length = length; + reg_mem->mr = &dummy_mr; + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_mem_dereg_no_dev */ +/*---------------------------------------------------------------------------*/ +static inline int xio_mem_dereg_no_dev(struct xio_reg_mem *reg_mem) +{ + reg_mem->mr = NULL; + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_mem_alloc_no_dev */ +/*---------------------------------------------------------------------------*/ +static int xio_mem_alloc_no_dev(size_t length, struct xio_reg_mem *reg_mem) +{ + size_t real_size; + int alloced = 0; + + real_size = ALIGN(length, page_size); + reg_mem->addr = umemalign(page_size, real_size); + if (!reg_mem->addr) { + ERROR_LOG("xio_memalign failed. sz:%zu\n", real_size); + goto cleanup; + } + /*memset(reg_mem->addr, 0, real_size);*/ + alloced = 1; + + xio_mem_register_no_dev(reg_mem->addr, length, reg_mem); + if (!reg_mem->mr) { + ERROR_LOG("xio_reg_mr failed. 
addr:%p, length:%d\n", + reg_mem->addr, length, access); + + goto cleanup1; + } + reg_mem->length = length; + + return 0; + +cleanup1: + if (alloced) + ufree(reg_mem->addr); +cleanup: + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_mem_free_no_dev */ +/*---------------------------------------------------------------------------*/ +static int xio_mem_free_no_dev(struct xio_reg_mem *reg_mem) +{ + int retval = 0; + + if (reg_mem->addr) + ufree(reg_mem->addr); + + retval = xio_mem_dereg_no_dev(reg_mem); + + return retval; +} + +/*---------------------------------------------------------------------------*/ +/* ibv_wc_opcode_str */ +/*---------------------------------------------------------------------------*/ +const char *ibv_wc_opcode_str(enum ibv_wc_opcode opcode) +{ + switch (opcode) { + case IBV_WC_SEND: return "IBV_WC_SEND"; + case IBV_WC_RDMA_WRITE: return "IBV_WC_RDMA_WRITE"; + case IBV_WC_RDMA_READ: return "IBV_WC_RDMA_READ"; + case IBV_WC_COMP_SWAP: return "IBV_WC_COMP_SWAP"; + case IBV_WC_FETCH_ADD: return "IBV_WC_FETCH_ADD"; + case IBV_WC_BIND_MW: return "IBV_WC_BIND_MW"; + /* recv-side: inbound completion */ + case IBV_WC_RECV: return "IBV_WC_RECV"; + case IBV_WC_RECV_RDMA_WITH_IMM: return "IBV_WC_RECV_RDMA_WITH_IMM"; + default: return "IBV_WC_UNKNOWN"; + }; +} + +/*---------------------------------------------------------------------------*/ +/* xio_dereg_mr */ +/*---------------------------------------------------------------------------*/ +static int xio_dereg_mr(struct xio_mr *tmr) +{ + struct xio_mr *ptmr, *tmp_ptmr; + struct xio_mr_elem *tmr_elem, *tmp_tmr_elem; + int retval, found = 0; + + spin_lock(&mr_list_lock); + list_for_each_entry_safe(ptmr, tmp_ptmr, &mr_list, mr_list_entry) { + if (ptmr == tmr) { + list_del(&tmr->mr_list_entry); + found = 1; + break; + } + } + spin_unlock(&mr_list_lock); + + if (found) { + list_for_each_entry_safe(tmr_elem, tmp_tmr_elem, &tmr->dm_list, + dm_list_entry) { + retval = ibv_dereg_mr(tmr_elem->mr); + if (unlikely(retval != 0)) { + xio_set_error(errno); + ERROR_LOG("ibv_dereg_mr failed, %m\n"); + } + /* Remove the item from the list. */ + spin_lock(&dev_list_lock); + list_del(&tmr_elem->dm_list_entry); + list_del(&tmr_elem->xm_list_entry); + spin_unlock(&dev_list_lock); + ufree(tmr_elem); + } + ufree(tmr); + } + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_reg_mr_ex_dev */ +/*---------------------------------------------------------------------------*/ +static struct xio_mr_elem *xio_reg_mr_ex_dev(struct xio_device *dev, + void **addr, size_t length, + uint64_t access) +{ + struct xio_mr_elem *mr_elem; + struct ibv_mr *mr; + int retval; + struct ibv_exp_reg_mr_in reg_mr_in; + int alloc_mr = !(*addr); + + reg_mr_in.pd = dev->pd; + reg_mr_in.addr = *addr; + reg_mr_in.length = length; + reg_mr_in.exp_access = access; + reg_mr_in.comp_mask = 0; + + TRACE_LOG("before ibv_reg_mr\n"); + mr = ibv_xio_reg_mr(®_mr_in); + TRACE_LOG("after ibv_reg_mr\n"); + if (unlikely(!mr)) { + xio_set_error(errno); + if (!alloc_mr) + ERROR_LOG("ibv_reg_mr failed, %m. 
" \ + "addr:%p, length:%zd, access:0x%lx\n", + *addr, length, access); + if (errno == ENOMEM) + xio_validate_ulimit_memlock(); + return NULL; + } + mr_elem = (struct xio_mr_elem *)ucalloc(1, sizeof(*mr_elem)); + if (unlikely(!mr_elem)) + goto cleanup; + + mr_elem->dev = dev; + mr_elem->mr = mr; + + return mr_elem; + +cleanup: + retval = ibv_dereg_mr(mr); + if (retval) { + xio_set_error(errno); + ERROR_LOG("ibv_dereg_mr failed, %m\n"); + } + + return NULL; +} + +/*---------------------------------------------------------------------------*/ +/* xio_reg_mr_ex */ +/*---------------------------------------------------------------------------*/ +static struct xio_mr *xio_reg_mr_ex(void **addr, size_t length, uint64_t access) +{ + struct xio_mr *tmr; + struct xio_mr_elem *tmr_elem; + struct xio_device *dev; + int retval; + static int init_transport = 1; + + /* Show a warning in case the memory is non aligned */ + if ((access & IBV_XIO_ACCESS_ALLOCATE_MR) == 0 && + ((uintptr_t)(*addr) & (page_size - 1)) != 0) { + WARN_LOG("Unaligned memory for address %p: length is %d while page size is %d.\n.", *addr, length, page_size); + } + /* this may the first call in application so initialize the rdma */ + if (init_transport) { + struct xio_transport *transport = xio_get_transport("rdma"); + + if (!transport) { + ERROR_LOG("invalid protocol. proto: rdma\n"); + xio_set_error(XIO_E_ADDR_ERROR); + return NULL; + } + init_transport = 0; + } + + spin_lock(&dev_list_lock); + if (list_empty(&dev_list)) { + ERROR_LOG("dev_list is empty\n"); + spin_unlock(&dev_list_lock); + goto cleanup2; + } + spin_unlock(&dev_list_lock); + + tmr = (struct xio_mr *)ucalloc(1, sizeof(*tmr)); + if (unlikely(!tmr)) { + xio_set_error(errno); + ERROR_LOG("malloc failed. (errno=%d %m)\n", errno); + goto cleanup2; + } + INIT_LIST_HEAD(&tmr->dm_list); + /* xio_dereg_mr may be called on error path and it will call + * list_del on mr_list_entry, make sure it is initialized + */ + INIT_LIST_HEAD(&tmr->mr_list_entry); + + spin_lock(&dev_list_lock); + list_for_each_entry(dev, &dev_list, dev_list_entry) { + tmr_elem = xio_reg_mr_ex_dev(dev, addr, length, access); + if (!tmr_elem) { + xio_set_error(errno); + spin_unlock(&dev_list_lock); + goto cleanup1; + } + list_add(&tmr_elem->dm_list_entry, &tmr->dm_list); + list_add(&tmr_elem->xm_list_entry, &dev->xm_list); + + if (access & IBV_XIO_ACCESS_ALLOCATE_MR) { + access &= ~IBV_XIO_ACCESS_ALLOCATE_MR; + *addr = tmr_elem->mr->addr; + } + } + spin_unlock(&dev_list_lock); + + /* For dynamically discovered devices */ + tmr->addr = *addr; + tmr->length = length; + tmr->access = access; + + spin_lock(&mr_list_lock); + mr_num++; + list_add(&tmr->mr_list_entry, &mr_list); + spin_unlock(&mr_list_lock); + + return tmr; + +cleanup1: + retval = xio_dereg_mr(tmr); + if (retval != 0) + ERROR_LOG("xio_dereg_mr failed\n"); +cleanup2: + return NULL; +} + +/*---------------------------------------------------------------------------*/ +/* xio_dereg_mr_by_dev */ +/*---------------------------------------------------------------------------*/ +int xio_dereg_mr_by_dev(struct xio_device *dev) +{ + struct xio_mr_elem *tmr_elem, *tmp_tmr_elem; + int retval; + LIST_HEAD(tmp_list); + + spin_lock(&dev_list_lock); + if (list_empty(&dev->xm_list)) { + spin_unlock(&dev_list_lock); + return 0; + } + + list_splice_tail(&dev->xm_list, &tmp_list); + INIT_LIST_HEAD(&dev->xm_list); + list_for_each_entry_safe(tmr_elem, tmp_tmr_elem, &tmp_list, + xm_list_entry) + list_del(&tmr_elem->dm_list_entry); + spin_unlock(&dev_list_lock); + + 
list_for_each_entry_safe(tmr_elem, tmp_tmr_elem, &tmp_list, + xm_list_entry) { + if (tmr_elem->mr) { + retval = ibv_dereg_mr(tmr_elem->mr); + if (unlikely(retval != 0)) { + xio_set_error(errno); + ERROR_LOG("ibv_dereg_mr failed, %m\n"); + } + } + /* Remove the item from the lists. */ + list_del(&tmr_elem->xm_list_entry); + ufree(tmr_elem); + } + + return 0; +} + +/* The following functions is implemented in xio_connection.c, + * We prefer not to add an include dependency on xio_connection here */ +struct xio_msg; +const struct xio_transport_base *xio_req_to_transport_base( + const struct xio_msg *req); + +static inline const struct xio_device *xio_req_to_device( + const struct xio_msg *req) +{ + struct xio_rdma_transport *transport = (struct xio_rdma_transport *) + xio_req_to_transport_base(req); + return transport->tcq->dev; +} + +static inline const struct xio_device *xio_rsp_to_device( + const struct xio_msg *rsp) +{ + return xio_req_to_device(rsp->request); +} + +uint32_t xio_lookup_rkey_by_request(const struct xio_reg_mem *reg_mem, + const struct xio_msg *req) +{ + return xio_rdma_mr_lookup(reg_mem->mr, xio_req_to_device(req))->rkey; +} + +uint32_t xio_lookup_rkey_by_response(const struct xio_reg_mem *reg_mem, + const struct xio_msg *rsp) +{ + return xio_rdma_mr_lookup(reg_mem->mr, xio_rsp_to_device(rsp))->rkey; +} + +/*---------------------------------------------------------------------------*/ +/* xio_reg_mr_add_dev */ +/* add a new discovered device to a the mr list */ +/*---------------------------------------------------------------------------*/ +int xio_reg_mr_add_dev(struct xio_device *dev) +{ + struct xio_mr *tmr; + struct xio_mr_elem *tmr_elem; + + spin_lock(&dev_list_lock); + spin_lock(&mr_list_lock); + list_for_each_entry(tmr, &mr_list, mr_list_entry) { + tmr_elem = xio_reg_mr_ex_dev(dev, + &tmr->addr, tmr->length, + tmr->access); + if (unlikely(!tmr_elem)) { + xio_set_error(errno); + ERROR_LOG("ibv_reg_mr failed, %m\n"); + spin_unlock(&mr_list_lock); + spin_unlock(&dev_list_lock); + goto cleanup; + } + list_add(&tmr_elem->dm_list_entry, &tmr->dm_list); + list_add(&tmr_elem->xm_list_entry, &dev->xm_list); + } + spin_unlock(&mr_list_lock); + spin_unlock(&dev_list_lock); + + return 0; + +cleanup: + xio_dereg_mr_by_dev(dev); + + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_mem_register */ +/*---------------------------------------------------------------------------*/ +int xio_mem_register(void *addr, size_t length, struct xio_reg_mem *reg_mem) +{ + if (!addr || length == 0) { + xio_set_error(EINVAL); + return -1; + } + if (list_empty(&dev_list)) { + if (!xio_register_transport() && list_empty(&dev_list)) + return xio_mem_register_no_dev(addr, length, reg_mem); + } + + reg_mem->mr = xio_reg_mr_ex(&addr, length, + IBV_ACCESS_LOCAL_WRITE | + IBV_ACCESS_REMOTE_WRITE | + IBV_ACCESS_REMOTE_READ); + if (!reg_mem->mr) + return -1; + + reg_mem->addr = addr; + reg_mem->length = length; + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_mem_dereg */ +/*---------------------------------------------------------------------------*/ +int xio_mem_dereg(struct xio_reg_mem *reg_mem) +{ + int retval; + + if (!reg_mem->mr) { + xio_set_error(EINVAL); + return -1; + } + if (list_empty(&dev_list)) + return xio_mem_dereg_no_dev(reg_mem); + + retval = xio_dereg_mr(reg_mem->mr); + + reg_mem->mr = NULL; + + return retval; +} + 
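+/*
+ * Usage sketch (illustrative only, compiled out): how a caller is expected to
+ * pair xio_mem_register()/xio_mem_dereg() above. The helper name is made up
+ * for the example.
+ */
+#if 0
+static int example_register_app_buffer(void *buf, size_t len)
+{
+	struct xio_reg_mem reg_mem;
+
+	/* registers the buffer against every known device, or falls back
+	 * to the dummy-MR path when no RDMA device is present */
+	if (xio_mem_register(buf, len, &reg_mem) != 0)
+		return -1;
+
+	/* ... reg_mem.mr may now be attached to outgoing sg entries ... */
+
+	return xio_mem_dereg(&reg_mem);
+}
+#endif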
+/*---------------------------------------------------------------------------*/
+/* xio_mem_alloc */
+/*---------------------------------------------------------------------------*/
+int xio_mem_alloc(size_t length, struct xio_reg_mem *reg_mem)
+{
+	struct xio_device *dev;
+	size_t real_size;
+	uint64_t access;
+
+	if (length == 0 || !reg_mem) {
+		xio_set_error(EINVAL);
+		ERROR_LOG("xio_mem_alloc failed. length:%zu\n", length);
+		return -1;
+	}
+	if (list_empty(&dev_list)) {
+		if (!xio_register_transport() && list_empty(&dev_list))
+			return xio_mem_alloc_no_dev(length, reg_mem);
+	}
+
+	access = IBV_ACCESS_LOCAL_WRITE |
+		 IBV_ACCESS_REMOTE_WRITE |
+		 IBV_ACCESS_REMOTE_READ;
+
+	dev = list_first_entry(&dev_list, struct xio_device, dev_list_entry);
+
+	if (dev && IBV_IS_MPAGES_AVAIL(&dev->device_attr)) {
+		access |= IBV_XIO_ACCESS_ALLOCATE_MR;
+		reg_mem->addr = NULL;
+		reg_mem->mr = xio_reg_mr_ex(&reg_mem->addr, length, access);
+		if (reg_mem->mr) {
+			reg_mem->length = length;
+			reg_mem->mr->addr_alloced = 0;
+			goto exit;
+		}
+		WARN_LOG("Contig pages allocation failed. (errno=%d %m)\n",
+			 errno);
+	}
+
+	real_size = ALIGN(length, page_size);
+	reg_mem->addr = umemalign(page_size, real_size);
+	if (unlikely(!reg_mem->addr)) {
+		xio_set_error(ENOMEM);
+		ERROR_LOG("memalign failed. sz:%zu\n", real_size);
+		goto cleanup;
+	}
+	reg_mem->mr = xio_reg_mr_ex(&reg_mem->addr, length, access);
+	if (unlikely(!reg_mem->mr)) {
+		ERROR_LOG("xio_reg_mr_ex failed. " \
+			  "addr:%p, length:%zu, access:0x%lx\n",
+			  reg_mem->addr, length, access);
+
+		goto cleanup1;
+	}
+	/*memset(reg_mem->addr, 0, length);*/
+	reg_mem->length = length;
+	reg_mem->mr->addr_alloced = 1;
+
+exit:
+	return 0;
+
+cleanup1:
+	ufree(reg_mem->addr);
+cleanup:
+	return -1;
+}
+
+/*---------------------------------------------------------------------------*/
+/* xio_mem_free */
+/*---------------------------------------------------------------------------*/
+int xio_mem_free(struct xio_reg_mem *reg_mem)
+{
+	int retval;
+
+	if (!reg_mem->mr) {
+		xio_set_error(EINVAL);
+		return -1;
+	}
+	if (list_empty(&dev_list))
+		return xio_mem_free_no_dev(reg_mem);
+
+	if (reg_mem->mr->addr_alloced) {
+		ufree(reg_mem->addr);
+		reg_mem->addr = NULL;
+		reg_mem->mr->addr_alloced = 0;
+	}
+
+	retval = xio_dereg_mr(reg_mem->mr);
+
+	reg_mem->mr = NULL;
+
+	return retval;
+}
+
+/*---------------------------------------------------------------------------*/
+/* xio_mr_list_init */
+/*---------------------------------------------------------------------------*/
+void xio_mr_list_init(void)
+{
+	INIT_LIST_HEAD(&mr_list);
+	spin_lock_init(&mr_list_lock);
+}
+
+/*---------------------------------------------------------------------------*/
+/* xio_mr_list_free */
+/*---------------------------------------------------------------------------*/
+int xio_mr_list_free(void)
+{
+	struct xio_mr *tmr;
+
+	while (!list_empty(&mr_list)) {
+		tmr = list_first_entry(&mr_list, struct xio_mr, mr_list_entry);
+		xio_dereg_mr(tmr);
+	}
+
+	return 0;
+}
+
+/*---------------------------------------------------------------------------*/
+/* xio_rkey_table_create */
+/*---------------------------------------------------------------------------*/
+int xio_rkey_table_create(struct xio_device *old, struct xio_device *_new,
+			  struct xio_rkey_tbl **htbl, uint16_t *len)
+{
+	struct xio_rkey_tbl *tbl, *te;
+	struct list_head *old_h, *new_h;
+	struct list_head *old_n, *new_n;
+	struct xio_mr_elem *old_e, *new_e;
+
+	if (!mr_num) {
+		/* This is O.K. memory wasn't yet allocated and registered */
+		*len = 0;
+		return 0;
+	}
+
+	tbl = (struct xio_rkey_tbl *)ucalloc(mr_num, sizeof(*tbl));
+	if (!tbl) {
+		*len = 0;
+		xio_set_error(ENOMEM);
+		return -1;
+	}
+
+	/* MR elements are arranged in a matrix like fashion, where MR is one
+	 * axis and device is the other axis
+	 */
+	old_h = &old->xm_list;
+	new_h = &_new->xm_list;
+	te = tbl;
+
+	for (old_n = old_h->next, new_n = new_h->next;
+	     old_n != old_h && new_n != new_h;
+	     old_n = old_n->next, new_n = new_n->next) {
+		old_e = list_entry(old_n, struct xio_mr_elem, xm_list_entry);
+		new_e = list_entry(new_n, struct xio_mr_elem, xm_list_entry);
+		te->old_rkey = old_e->mr->rkey;
+		te->new_rkey = new_e->mr->rkey;
+		te++;
+	}
+
+	if (old_n != old_h || new_n != new_h) {
+		/* one list terminated before the other; this is a program
+		 * error, there should be an entry per device
+		 */
+		ERROR_LOG("bug\n");
+		goto cleanup;
+	}
+
+	*len = mr_num;
+	*htbl = tbl;
+	return 0;
+
+cleanup:
+	ufree(tbl);
+	*len = 0;
+	return -1;
+}
diff --git a/open_src/xio/src/usr/transport/tcp/xio_tcp_datapath.c b/open_src/xio/src/usr/transport/tcp/xio_tcp_datapath.c
new file mode 100644
index 0000000..84564f2
--- /dev/null
+++ b/open_src/xio/src/usr/transport/tcp/xio_tcp_datapath.c
@@ -0,0 +1,3787 @@
+/*
+ * Copyright (c) 2013 Mellanox Technologies®. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two licenses.
+ * You may choose to be licensed under the terms of the GNU General Public
+ * License (GPL) Version 2, available from the file COPYING in the main
+ * directory of this source tree, or the Mellanox Technologies® BSD license
+ * below:
+ *
+ * - Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * - Neither the name of the Mellanox Technologies® nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */ +#include +#include "libxio.h" +#include "xio_log.h" +#include "xio_common.h" +#include "xio_observer.h" +#include "xio_protocol.h" +#include "xio_mbuf.h" +#include "xio_task.h" +#include "xio_sg_table.h" +#include "xio_transport.h" +#include "xio_usr_transport.h" +#include "xio_ev_data.h" +#include "xio_objpool.h" +#include "xio_workqueue.h" +#include "xio_context.h" +#include "xio_tcp_transport.h" +#include "xio_mem.h" + +extern struct xio_tcp_options tcp_options; + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_send_work */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_send_work(int fd, void **buf, uint32_t *len, int block) +{ + int retval; + + while (*len) { + retval = send(fd, (const char *)*buf, *len, MSG_NOSIGNAL); + if (retval < 0) { + if (xio_get_last_socket_error() != XIO_EAGAIN) { + xio_set_error(xio_get_last_socket_error()); + ERROR_LOG("sendmsg failed. (errno=%d)\n", + xio_get_last_socket_error()); + /* ORK todo how to recover on remote side?*/ + return -1; + } else if (!block) { + xio_set_error(xio_get_last_socket_error()); + /* ORK todo set epollout event + * to trigger send again */ + /* ORK todo polling on sendmsg few more times + * before returning*/ + return -1; + } + } else { + *len -= retval; + inc_ptr(*buf, retval); + } + } + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_sendmsg_work */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_sendmsg_work(int fd, + struct xio_tcp_work_req *xio_send, + int block) +{ + int retval = 0, tmp_bytes, sent_bytes = 0; + int eagain_count = TX_EAGAIN_RETRY; + unsigned int i; + + while (xio_send->tot_iov_byte_len) { + retval = sendmsg(fd, &xio_send->msg, MSG_NOSIGNAL); + if (retval < 0) { + if (xio_get_last_socket_error() != XIO_EAGAIN) { + xio_set_error(xio_get_last_socket_error()); + DEBUG_LOG("sendmsg failed. 
(errno=%d)\n", + xio_get_last_socket_error()); + return -1; + } else if (!block && (eagain_count-- == 0)) { + xio_set_error(xio_get_last_socket_error()); + return -1; + } + } else { + sent_bytes += retval; + xio_send->tot_iov_byte_len -= retval; + + if (xio_send->tot_iov_byte_len == 0) { + xio_send->msg.msg_iovlen = 0; + break; + } + + tmp_bytes = 0; + for (i = 0; i < xio_send->msg.msg_iovlen; i++) { + if (xio_send->msg.msg_iov[i].iov_len + + tmp_bytes < (size_t)retval) { + tmp_bytes += + xio_send->msg.msg_iov[i].iov_len; + } else { + xio_send->msg.msg_iov[i].iov_len -= + (retval - tmp_bytes); + inc_ptr( + xio_send->msg.msg_iov[i].iov_base, + retval - tmp_bytes); + xio_send->msg.msg_iov = + &xio_send->msg.msg_iov[i]; + xio_send->msg.msg_iovlen -= i; + break; + } + } + + eagain_count = TX_EAGAIN_RETRY; + } + } + + return sent_bytes; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_write_setup_msg */ +/*---------------------------------------------------------------------------*/ +static void xio_tcp_write_setup_msg(struct xio_tcp_transport *tcp_hndl, + struct xio_task *task, + struct xio_tcp_setup_msg *msg) +{ + struct xio_tcp_setup_msg *tmp_msg; + + /* set the mbuf after tlv header */ + xio_mbuf_set_val_start(&task->mbuf); + + /* jump after connection setup header */ + if (tcp_hndl->base.is_client) + xio_mbuf_inc(&task->mbuf, + sizeof(struct xio_nexus_setup_req)); + else + xio_mbuf_inc(&task->mbuf, + sizeof(struct xio_nexus_setup_rsp)); + + tmp_msg = (struct xio_tcp_setup_msg *) + xio_mbuf_get_curr_ptr(&task->mbuf); + + /* pack relevant values */ + PACK_LLVAL(msg, tmp_msg, buffer_sz); + PACK_LVAL(msg, tmp_msg, max_in_iovsz); + PACK_LVAL(msg, tmp_msg, max_out_iovsz); + PACK_LVAL(msg, tmp_msg, max_header_len); + +#ifdef EYAL_TODO + print_hex_dump_bytes("post_send: ", DUMP_PREFIX_ADDRESS, + task->mbuf.tlv.head, + 64); +#endif + xio_mbuf_inc(&task->mbuf, sizeof(struct xio_tcp_setup_msg)); +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_read_setup_msg */ +/*---------------------------------------------------------------------------*/ +static void xio_tcp_read_setup_msg(struct xio_tcp_transport *tcp_hndl, + struct xio_task *task, + struct xio_tcp_setup_msg *msg) +{ + struct xio_tcp_setup_msg *tmp_msg; + + /* set the mbuf after tlv header */ + xio_mbuf_set_val_start(&task->mbuf); + + /* jump after connection setup header */ + if (tcp_hndl->base.is_client) + xio_mbuf_inc(&task->mbuf, + sizeof(struct xio_nexus_setup_rsp)); + else + xio_mbuf_inc(&task->mbuf, + sizeof(struct xio_nexus_setup_req)); + + tmp_msg = (struct xio_tcp_setup_msg *) + xio_mbuf_get_curr_ptr(&task->mbuf); + + /* pack relevant values */ + UNPACK_LLVAL(tmp_msg, msg, buffer_sz); + UNPACK_LVAL(tmp_msg, msg, max_in_iovsz); + UNPACK_LVAL(tmp_msg, msg, max_out_iovsz); + UNPACK_LVAL(tmp_msg, msg, max_header_len); + +#ifdef EYAL_TODO + print_hex_dump_bytes("post_send: ", DUMP_PREFIX_ADDRESS, + task->mbuf.curr, + 64); +#endif + xio_mbuf_inc(&task->mbuf, sizeof(struct xio_tcp_setup_msg)); +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_send_setup_req */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_send_setup_req(struct xio_tcp_transport *tcp_hndl, + struct xio_task *task) +{ + XIO_TO_TCP_TASK(task, tcp_task); + uint16_t payload; + struct xio_tcp_setup_msg req; + + DEBUG_LOG("xio_tcp_send_setup_req\n"); + + req.buffer_sz = 
xio_tcp_get_inline_buffer_size(); + req.max_in_iovsz = tcp_options.max_in_iovsz; + req.max_out_iovsz = tcp_options.max_out_iovsz; + req.max_header_len = g_options.max_inline_xio_hdr; + + xio_tcp_write_setup_msg(tcp_hndl, task, &req); + + payload = xio_mbuf_tlv_payload_len(&task->mbuf); + + /* add tlv */ + if (xio_mbuf_write_tlv(&task->mbuf, task->tlv_type, payload) != 0) + return -1; + + TRACE_LOG("tcp send setup request\n"); + + /* set the length */ + tcp_task->txd.msg_iov[0].iov_len = xio_mbuf_data_length(&task->mbuf); + tcp_task->txd.msg_len = 1; + tcp_task->txd.tot_iov_byte_len = tcp_task->txd.msg_iov[0].iov_len; + tcp_task->txd.msg.msg_iov = tcp_task->txd.msg_iov; + tcp_task->txd.msg.msg_iovlen = tcp_task->txd.msg_len; + + tcp_task->out_tcp_op = XIO_TCP_SEND; + + xio_task_addref(task); + + xio_tcp_sendmsg_work(tcp_hndl->sock.cfd, &tcp_task->txd, 1); + + list_move_tail(&task->tasks_list_entry, &tcp_hndl->in_flight_list); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_send_setup_rsp */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_send_setup_rsp(struct xio_tcp_transport *tcp_hndl, + struct xio_task *task) +{ + XIO_TO_TCP_TASK(task, tcp_task); + uint16_t payload; + struct xio_tcp_setup_msg *rsp = &tcp_hndl->setup_rsp; + + DEBUG_LOG("xio_tcp_send_setup_rsp\n"); + + rsp->max_in_iovsz = tcp_options.max_in_iovsz; + rsp->max_out_iovsz = tcp_options.max_out_iovsz; + rsp->buffer_sz = tcp_hndl->membuf_sz; + rsp->max_header_len = g_options.max_inline_xio_hdr; + + xio_tcp_write_setup_msg(tcp_hndl, task, rsp); + + payload = xio_mbuf_tlv_payload_len(&task->mbuf); + + /* add tlv */ + if (xio_mbuf_write_tlv(&task->mbuf, task->tlv_type, payload) != 0) + return -1; + + TRACE_LOG("tcp send setup response\n"); + + /* set the length */ + tcp_task->txd.msg_iov[0].iov_len = xio_mbuf_data_length(&task->mbuf); + tcp_task->txd.msg_len = 1; + tcp_task->txd.tot_iov_byte_len = tcp_task->txd.msg_iov[0].iov_len; + tcp_task->txd.msg.msg_iov = tcp_task->txd.msg_iov; + tcp_task->txd.msg.msg_iovlen = tcp_task->txd.msg_len; + + tcp_task->out_tcp_op = XIO_TCP_SEND; + + xio_tcp_sendmsg_work(tcp_hndl->sock.cfd, &tcp_task->txd, 1); + + list_move(&task->tasks_list_entry, &tcp_hndl->in_flight_list); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_rdma_on_setup_msg */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_on_setup_msg(struct xio_tcp_transport *tcp_hndl, + struct xio_task *task) +{ + union xio_transport_event_data event_data; + struct xio_tcp_setup_msg *rsp = &tcp_hndl->setup_rsp; + uint64_t local_buf_size; + + DEBUG_LOG("xio_tcp_on_setup_msg\n"); + + if (tcp_hndl->base.is_client) { + struct xio_task *sender_task = NULL; + + if (!list_empty(&tcp_hndl->in_flight_list)) + sender_task = list_first_entry( + &tcp_hndl->in_flight_list, + struct xio_task, tasks_list_entry); + else if (!list_empty(&tcp_hndl->tx_comp_list)) + sender_task = list_first_entry( + &tcp_hndl->tx_comp_list, + struct xio_task, tasks_list_entry); + else + ERROR_LOG("could not find sender task\n"); + + task->sender_task = sender_task; + xio_tcp_read_setup_msg(tcp_hndl, task, rsp); + } else { + struct xio_tcp_setup_msg req; + + xio_tcp_read_setup_msg(tcp_hndl, task, &req); + + /* current implementation is symmetric */ + local_buf_size = xio_tcp_get_inline_buffer_size(); + rsp->buffer_sz = min(req.buffer_sz, local_buf_size); + 
rsp->max_in_iovsz = req.max_in_iovsz; + rsp->max_out_iovsz = req.max_out_iovsz; + rsp->max_header_len = req.max_header_len; + } + + tcp_hndl->max_inline_buf_sz = (size_t)rsp->buffer_sz; + tcp_hndl->membuf_sz = (size_t)rsp->buffer_sz; + tcp_hndl->peer_max_in_iovsz = rsp->max_in_iovsz; + tcp_hndl->peer_max_out_iovsz = rsp->max_out_iovsz; + tcp_hndl->peer_max_header = rsp->max_header_len; + + tcp_hndl->sn = 0; + + tcp_hndl->state = XIO_TRANSPORT_STATE_CONNECTED; + + /* fill notification event */ + event_data.msg.op = XIO_WC_OP_RECV; + event_data.msg.task = task; + + list_move_tail(&task->tasks_list_entry, &tcp_hndl->io_list); + + xio_transport_notify_observer(&tcp_hndl->base, + XIO_TRANSPORT_EVENT_NEW_MESSAGE, + &event_data); + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_send_connect_msg */ +/*---------------------------------------------------------------------------*/ +int xio_tcp_send_connect_msg(int fd, struct xio_tcp_connect_msg *msg) +{ + int retval; + struct xio_tcp_connect_msg smsg; + uint32_t size = sizeof(struct xio_tcp_connect_msg); + void *buf = &smsg; + + smsg.sock_type = (enum xio_tcp_sock_type) + htonl((uint32_t)msg->sock_type); + PACK_SVAL(msg, &smsg, second_port); + PACK_SVAL(msg, &smsg, pad); + + retval = xio_tcp_send_work(fd, &buf, &size, 1); + if (retval < 0) { + if (xio_get_last_socket_error() == XIO_EAGAIN) { + /* ORK todo set event */ + } else { + ERROR_LOG("send return with %d. (errno=%d %m)\n", + retval, xio_get_last_socket_error()); + return retval; + } + } + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_write_req_header */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_write_req_header(struct xio_tcp_transport *tcp_hndl, + struct xio_task *task, + struct xio_tcp_req_hdr *req_hdr) +{ + XIO_TO_TCP_TASK(task, tcp_task); + struct xio_tcp_req_hdr *tmp_req_hdr; + struct xio_sge *tmp_sge; + struct xio_sge sge; + size_t hdr_len; + uint32_t i; + struct xio_sg_table_ops *sgtbl_ops; + void *sgtbl; + void *sg; + + sgtbl = xio_sg_table_get(&task->omsg->in); + sgtbl_ops = (struct xio_sg_table_ops *) + xio_sg_table_ops_get(task->omsg->in.sgl_type); + + /* point to transport header */ + xio_mbuf_set_trans_hdr(&task->mbuf); + tmp_req_hdr = (struct xio_tcp_req_hdr *) + xio_mbuf_get_curr_ptr(&task->mbuf); + + /* pack relevant values */ + tmp_req_hdr->version = req_hdr->version; + tmp_req_hdr->flags = req_hdr->flags; + PACK_SVAL(req_hdr, tmp_req_hdr, req_hdr_len); + PACK_LVAL(req_hdr, tmp_req_hdr, ltid); + tmp_req_hdr->in_tcp_op = req_hdr->in_tcp_op; + tmp_req_hdr->out_tcp_op = req_hdr->out_tcp_op; + + PACK_SVAL(req_hdr, tmp_req_hdr, in_num_sge); + PACK_SVAL(req_hdr, tmp_req_hdr, out_num_sge); + PACK_SVAL(req_hdr, tmp_req_hdr, ulp_hdr_len); + PACK_SVAL(req_hdr, tmp_req_hdr, ulp_pad_len); + /*remain_data_len is not used */ + PACK_LLVAL(req_hdr, tmp_req_hdr, ulp_imm_len); + + tmp_sge = (struct xio_sge *)((uint8_t *)tmp_req_hdr + + sizeof(struct xio_tcp_req_hdr)); + + /* IN: requester expect small input written via send */ + sg = sge_first(sgtbl_ops, sgtbl); + if (req_hdr->in_tcp_op == XIO_TCP_SEND) { + for (i = 0; i < req_hdr->in_num_sge; i++) { + sge.addr = 0; + sge.length = sge_length(sgtbl_ops, sg); + sge.stag = 0; + PACK_LLVAL(&sge, tmp_sge, addr); + PACK_LVAL(&sge, tmp_sge, length); + PACK_LVAL(&sge, tmp_sge, stag); + tmp_sge++; + sg = sge_next(sgtbl_ops, sgtbl, sg); + } + } + /* IN: requester expect big 
input written rdma write */ + if (req_hdr->in_tcp_op == XIO_TCP_WRITE) { + for (i = 0; i < req_hdr->in_num_sge; i++) { + sge.addr = uint64_from_ptr(tcp_task->read_reg_mem[i].addr); + sge.length = tcp_task->read_reg_mem[i].length; + sge.stag = 0; + PACK_LLVAL(&sge, tmp_sge, addr); + PACK_LVAL(&sge, tmp_sge, length); + PACK_LVAL(&sge, tmp_sge, stag); + tmp_sge++; + } + } + /* OUT: requester want to write data via rdma read */ + if (req_hdr->out_tcp_op == XIO_TCP_READ) { + for (i = 0; i < req_hdr->out_num_sge; i++) { + sge.addr = uint64_from_ptr(tcp_task->write_reg_mem[i].addr); + sge.length = tcp_task->write_reg_mem[i].length; + sge.stag = 0; + PACK_LLVAL(&sge, tmp_sge, addr); + PACK_LVAL(&sge, tmp_sge, length); + PACK_LVAL(&sge, tmp_sge, stag); + tmp_sge++; + } + } + if (req_hdr->out_tcp_op == XIO_TCP_SEND) { + for (i = 0; i < req_hdr->out_num_sge; i++) { + sge.addr = 0; + sge.length = sge_length(sgtbl_ops, sg); + sge.stag = 0; + PACK_LLVAL(&sge, tmp_sge, addr); + PACK_LVAL(&sge, tmp_sge, length); + PACK_LVAL(&sge, tmp_sge, stag); + tmp_sge++; + sg = sge_next(sgtbl_ops, sgtbl, sg); + } + } + + hdr_len = sizeof(struct xio_tcp_req_hdr); + hdr_len += sizeof(struct xio_sge) * (req_hdr->in_num_sge + + req_hdr->out_num_sge); +#ifdef EYAL_TODO + print_hex_dump_bytes("post_send: ", DUMP_PREFIX_ADDRESS, + task->mbuf.curr, + hdr_len + 16); +#endif + xio_mbuf_inc(&task->mbuf, hdr_len); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_prep_req_header */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_prep_req_header(struct xio_tcp_transport *tcp_hndl, + struct xio_task *task, + uint16_t ulp_hdr_len, + uint16_t ulp_pad_len, + uint64_t ulp_imm_len, + uint32_t status) +{ + XIO_TO_TCP_TASK(task, tcp_task); + struct xio_tcp_req_hdr req_hdr; + + if (unlikely(!IS_REQUEST(task->tlv_type))) { + ERROR_LOG("unknown message type\n"); + return -1; + } + + /* write the headers */ + + /* fill request header */ + req_hdr.version = XIO_TCP_REQ_HEADER_VERSION; + req_hdr.req_hdr_len = sizeof(req_hdr); + req_hdr.ltid = task->ltid; + req_hdr.in_tcp_op = tcp_task->in_tcp_op; + req_hdr.out_tcp_op = tcp_task->out_tcp_op; + req_hdr.flags = 0; + + if (test_bits(XIO_MSG_FLAG_PEER_WRITE_RSP, &task->omsg_flags)) + set_bits(XIO_MSG_FLAG_PEER_WRITE_RSP, &req_hdr.flags); + else if (test_bits(XIO_MSG_FLAG_LAST_IN_BATCH, &task->omsg_flags)) + set_bits(XIO_MSG_FLAG_LAST_IN_BATCH, &req_hdr.flags); + + req_hdr.ulp_hdr_len = ulp_hdr_len; + req_hdr.ulp_pad_len = ulp_pad_len; + req_hdr.ulp_imm_len = ulp_imm_len; + req_hdr.in_num_sge = tcp_task->read_num_reg_mem; + req_hdr.out_num_sge = tcp_task->write_num_reg_mem; + + if (xio_tcp_write_req_header(tcp_hndl, task, &req_hdr) != 0) + goto cleanup; + + tcp_task->txd.ctl_msg_len = xio_mbuf_tlv_len(&task->mbuf); + + /* write the payload header */ + if (ulp_hdr_len) { + if (xio_mbuf_write_array( + &task->mbuf, + task->omsg->out.header.iov_base, + task->omsg->out.header.iov_len) != 0) + goto cleanup; + } + + /* write the pad between header and data */ + if (ulp_pad_len) + xio_mbuf_inc(&task->mbuf, ulp_pad_len); + + return 0; + +cleanup: + xio_set_error(XIO_E_MSG_SIZE); + ERROR_LOG("xio_rdma_write_req_header failed\n"); + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_write_send_data */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_write_send_data( + struct 
xio_tcp_transport *tcp_hndl, + struct xio_task *task) +{ + XIO_TO_TCP_TASK(task, tcp_task); + size_t i; + size_t byte_len = 0; + struct xio_sg_table_ops *sgtbl_ops; + void *sgtbl; + void *sg; + + sgtbl = xio_sg_table_get(&task->omsg->out); + sgtbl_ops = (struct xio_sg_table_ops *) + xio_sg_table_ops_get(task->omsg->out.sgl_type); + + /* user provided mr */ + sg = sge_first(sgtbl_ops, sgtbl); + if (sge_mr(sgtbl_ops, sg) || !tcp_options.enable_mr_check) { + for_each_sge(sgtbl, sgtbl_ops, sg, i) { + tcp_task->txd.msg_iov[i + 1].iov_base = + sge_addr(sgtbl_ops, sg); + tcp_task->txd.msg_iov[i + 1].iov_len = + sge_length(sgtbl_ops, sg); + + byte_len += sge_length(sgtbl_ops, sg); + } + tcp_task->txd.msg_len = + tbl_nents(sgtbl_ops, sgtbl) + 1; + tcp_task->txd.tot_iov_byte_len = byte_len; + } else { + /* copy to internal buffer */ + for_each_sge(sgtbl, sgtbl_ops, sg, i) { + /* copy the data into internal buffer */ + if (xio_mbuf_write_array( + &task->mbuf, + sge_addr(sgtbl_ops, sg), + sge_length(sgtbl_ops, sg)) != 0) + goto cleanup; + } + tcp_task->txd.msg_len = 1; + tcp_task->txd.tot_iov_byte_len = 0; + } + + return 0; + +cleanup: + xio_set_error(XIO_E_MSG_SIZE); + ERROR_LOG("xio_tcp_send_msg failed\n"); + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_prep_req_out_data */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_prep_req_out_data( + struct xio_tcp_transport *tcp_hndl, + struct xio_task *task) +{ + XIO_TO_TCP_TASK(task, tcp_task); + struct xio_vmsg *vmsg = &task->omsg->out; + uint64_t xio_hdr_len; + uint64_t xio_max_hdr_len; + uint64_t ulp_hdr_len; + uint64_t ulp_pad_len = 0; + uint64_t ulp_imm_len; + size_t retval; + unsigned int i; + struct xio_sg_table_ops *sgtbl_ops; + void *sgtbl; + void *sg; + int tx_by_sr; + int nents; + + sgtbl = xio_sg_table_get(&task->omsg->out); + sgtbl_ops = (struct xio_sg_table_ops *) + xio_sg_table_ops_get(task->omsg->out.sgl_type); + nents = tbl_nents(sgtbl_ops, sgtbl); + + /* calculate headers */ + ulp_hdr_len = vmsg->header.iov_len; + ulp_imm_len = tbl_length(sgtbl_ops, sgtbl); + + xio_hdr_len = xio_mbuf_get_curr_offset(&task->mbuf); + xio_hdr_len += sizeof(struct xio_tcp_req_hdr); + xio_hdr_len += sizeof(struct xio_sge) * tcp_task->read_num_reg_mem; + xio_max_hdr_len = xio_hdr_len + sizeof(struct xio_sge) * nents; + + if (g_options.inline_xio_data_align && ulp_imm_len) { + uint16_t hdr_len = xio_hdr_len + ulp_hdr_len; + + ulp_pad_len = ALIGN(hdr_len, g_options.inline_xio_data_align) - + hdr_len; + } + + /* + if (tcp_hndl->max_inline_buf_sz < (xio_hdr_len + ulp_hdr_len)) { + ERROR_LOG("header size %lu exceeds max header %lu\n", + ulp_hdr_len, tcp_hndl->max_inline_buf_sz - + xio_hdr_len); + xio_set_error(XIO_E_MSG_SIZE); + return -1; + } + */ + /* test for using send/receive or rdma_read */ + if (test_bits(XIO_MSG_FLAG_PEER_READ_REQ, &task->omsg_flags) && nents) + tx_by_sr = 0; + else + tx_by_sr = (((ulp_hdr_len + ulp_pad_len + + ulp_imm_len + xio_max_hdr_len) <= + tcp_hndl->max_inline_buf_sz) && + (((int)(ulp_imm_len) <= + g_options.max_inline_xio_data) || + ulp_imm_len == 0)); + + /* the data is outgoing via SEND */ + if (tx_by_sr) { + tcp_task->out_tcp_op = XIO_TCP_SEND; + /* user has small request - no rdma operation expected */ + tcp_task->write_num_reg_mem = 0; + + /* write xio header to the buffer */ + retval = xio_tcp_prep_req_header( + tcp_hndl, task, + (uint16_t)ulp_hdr_len, + (uint16_t)ulp_pad_len, ulp_imm_len, + XIO_E_SUCCESS); + if 
(retval) + return -1; + + /* if there is data, set it to buffer or directly to the sge */ + if (ulp_imm_len) { + retval = xio_tcp_write_send_data(tcp_hndl, task); + if (retval) + return -1; + } else { + tcp_task->txd.tot_iov_byte_len = 0; + tcp_task->txd.msg_len = 1; + } + } else { + tcp_task->out_tcp_op = XIO_TCP_READ; + sg = sge_first(sgtbl_ops, sgtbl); + if (sge_mr(sgtbl_ops, sg) || !tcp_options.enable_mr_check) { + for_each_sge(sgtbl, sgtbl_ops, sg, i) { + tcp_task->write_reg_mem[i].addr = + sge_addr(sgtbl_ops, sg); + tcp_task->write_reg_mem[i].priv = NULL; + tcp_task->write_reg_mem[i].mr = + (struct xio_mr *)sge_mr(sgtbl_ops, sg); + tcp_task->write_reg_mem[i].length = + sge_length(sgtbl_ops, sg); + } + } else { + if (!tcp_hndl->tcp_mempool) { + xio_set_error(XIO_E_NO_BUFS); + ERROR_LOG("message /read/write failed - " \ + "library's memory pool disabled\n"); + goto cleanup; + } + + /* user did not provide mr - take buffers from pool + * and do copy */ + for_each_sge(sgtbl, sgtbl_ops, sg, i) { + retval = xio_mempool_alloc( + tcp_hndl->tcp_mempool, + sge_length(sgtbl_ops, sg), + &tcp_task->write_reg_mem[i]); + if (unlikely(retval)) { + tcp_task->write_num_reg_mem = i; + xio_set_error(ENOMEM); + ERROR_LOG("mempool is empty " \ + "for %zd bytes\n", + sge_length(sgtbl_ops, sg)); + goto cleanup; + } + + tcp_task->write_reg_mem[i].length = + sge_length(sgtbl_ops, sg); + + /* copy the data to the buffer */ + memcpy(tcp_task->write_reg_mem[i].addr, + sge_addr(sgtbl_ops, sg), + sge_length(sgtbl_ops, sg)); + } + } + tcp_task->write_num_reg_mem = tbl_nents(sgtbl_ops, sgtbl); + + if (ulp_imm_len) { + tcp_task->txd.tot_iov_byte_len = 0; + for (i = 0; i < tcp_task->write_num_reg_mem; i++) { + tcp_task->txd.msg_iov[i + 1].iov_base = + tcp_task->write_reg_mem[i].addr; + tcp_task->txd.msg_iov[i + 1].iov_len = + tcp_task->write_reg_mem[i].length; + tcp_task->txd.tot_iov_byte_len += + tcp_task->write_reg_mem[i].length; + } + tcp_task->txd.msg_len = tcp_task->write_num_reg_mem + 1; + } else { + tcp_task->txd.tot_iov_byte_len = 0; + tcp_task->txd.msg_len = 1; + } + + /* write xio header to the buffer */ + retval = xio_tcp_prep_req_header( + tcp_hndl, task, + (uint16_t)ulp_hdr_len, 0, 0, XIO_E_SUCCESS); + + if (retval) { + ERROR_LOG("Failed to write header\n"); + goto cleanup; + } + } + + return 0; + +cleanup: + for (i = 0; i < tcp_task->write_num_reg_mem; i++) + xio_mempool_free(&tcp_task->write_reg_mem[i]); + + tcp_task->write_num_reg_mem = 0; + + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_on_rsp_send_comp */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_on_rsp_send_comp(struct xio_tcp_transport *tcp_hndl, + struct xio_task *task) +{ + union xio_transport_event_data event_data; + + if (IS_CANCEL(task->tlv_type)) { + xio_tasks_pool_put(task); + return 0; + } + + event_data.msg.op = XIO_WC_OP_SEND; + event_data.msg.task = task; + + xio_transport_notify_observer(&tcp_hndl->base, + XIO_TRANSPORT_EVENT_SEND_COMPLETION, + &event_data); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_on_req_send_comp */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_on_req_send_comp(struct xio_tcp_transport *tcp_hndl, + struct xio_task *task) +{ + union xio_transport_event_data event_data; + + if (IS_CANCEL(task->tlv_type)) + return 0; + + event_data.msg.op = XIO_WC_OP_SEND; + 
event_data.msg.task = task; + + xio_transport_notify_observer(&tcp_hndl->base, + XIO_TRANSPORT_EVENT_SEND_COMPLETION, + &event_data); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_tx_comp_handler */ +/*---------------------------------------------------------------------------*/ +static void xio_tcp_tx_completion_handler(void *xio_task) +{ + struct xio_task *ptask, *next_ptask; + int found = 0; + int removed = 0; + struct xio_task *task = (struct xio_task *)xio_task; + + XIO_TO_TCP_HNDL(task, tcp_hndl); + + list_for_each_entry_safe(ptask, next_ptask, &tcp_hndl->in_flight_list, + tasks_list_entry) { + list_move_tail(&ptask->tasks_list_entry, + &tcp_hndl->tx_comp_list); + removed++; + if (IS_REQUEST(ptask->tlv_type)) { + xio_tcp_on_req_send_comp(tcp_hndl, ptask); + xio_tasks_pool_put(ptask); + } else if (IS_RESPONSE(ptask->tlv_type)) { + xio_tcp_on_rsp_send_comp(tcp_hndl, ptask); + } else { + ERROR_LOG("unexpected task %p id:%d magic:0x%lx\n", + ptask, + ptask->ltid, ptask->magic); + continue; + } + if (ptask == task) { + found = 1; + break; + } + } + + if (!found && removed) + ERROR_LOG("not found but removed %d type:0x%x\n", + removed, task->tlv_type); + + tcp_hndl->tx_comp_cnt = 0; + + /* after work completion - report disconnect */ + if (tcp_hndl->state == XIO_TRANSPORT_STATE_DISCONNECTED) { + xio_context_add_event(tcp_hndl->base.ctx, &tcp_hndl->disconnect_event); + } else { + if (tcp_hndl->tx_ready_tasks_num) + xio_tcp_xmit(tcp_hndl); + } +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_disconnect_helper */ +/*---------------------------------------------------------------------------*/ +void xio_tcp_disconnect_helper(void *xio_tcp_hndl) +{ + struct xio_tcp_transport *tcp_hndl = (struct xio_tcp_transport *) + xio_tcp_hndl; + + if (tcp_hndl->state >= XIO_TRANSPORT_STATE_DISCONNECTED) + return; + + tcp_hndl->state = XIO_TRANSPORT_STATE_DISCONNECTED; + + /* flush all tasks in completion */ + if (!list_empty(&tcp_hndl->in_flight_list)) { + struct xio_task *task = NULL; + + task = list_last_entry(&tcp_hndl->in_flight_list, + struct xio_task, + tasks_list_entry); + if (task) { + XIO_TO_TCP_TASK(task, tcp_task); + + xio_ctx_add_work(tcp_hndl->base.ctx, task, + xio_tcp_tx_completion_handler, + &tcp_task->comp_work); + } + } else { + /* call disconnect if no message to flush other wise defer */ + xio_context_add_event(tcp_hndl->base.ctx, + &tcp_hndl->disconnect_event); + } +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_write_sn */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_write_sn(struct xio_task *task, uint16_t sn) +{ + uint16_t *psn; + + /* save the current place */ + xio_mbuf_push(&task->mbuf); + /* goto the first transport header*/ + xio_mbuf_set_trans_hdr(&task->mbuf); + + /* jump over the first uint32_t */ + xio_mbuf_inc(&task->mbuf, sizeof(uint32_t)); + + /* and set serial number */ + psn = (uint16_t *)xio_mbuf_get_curr_ptr(&task->mbuf); + *psn = htons(sn); + + /* pop to the original place */ + xio_mbuf_pop(&task->mbuf); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_xmit */ +/*---------------------------------------------------------------------------*/ +int xio_tcp_xmit(struct xio_tcp_transport *tcp_hndl) +{ + struct xio_task *task = NULL, *task_success = NULL, + *next_task = NULL; + struct 
xio_tcp_task *tcp_task = NULL, *next_tcp_task = NULL; + int retval = 0, retval2 = 0; + int imm_comp = 0; + int batch_nr = TX_BATCH, batch_count = 0, tmp_count; + unsigned int i; + unsigned int iov_len; + uint64_t bytes_sent; + + if (tcp_hndl->tx_ready_tasks_num == 0 || + tcp_hndl->tx_comp_cnt > COMPLETION_BATCH_MAX || + tcp_hndl->state != XIO_TRANSPORT_STATE_CONNECTED) { + xio_set_error(XIO_EAGAIN); + return -1; + } + + task = list_first_entry(&tcp_hndl->tx_ready_list, struct xio_task, + tasks_list_entry); + + /* if "ready to send queue" is not empty */ + while (likely(tcp_hndl->tx_ready_tasks_num && + (tcp_hndl->tx_comp_cnt < COMPLETION_BATCH_MAX))) { + next_task = list_first_entry_or_null(&task->tasks_list_entry, + struct xio_task, + tasks_list_entry); + next_tcp_task = next_task ? + (struct xio_tcp_task *)next_task->dd_data : NULL; + + tcp_task = (struct xio_tcp_task *)task->dd_data; + + switch (tcp_task->txd.stage) { + case XIO_TCP_TX_BEFORE: + xio_tcp_write_sn(task, tcp_hndl->sn); + tcp_task->sn = tcp_hndl->sn; + tcp_hndl->sn++; + tcp_task->txd.stage = XIO_TCP_TX_IN_SEND_CTL; + /*fallthrough*/ + case XIO_TCP_TX_IN_SEND_CTL: + /* for single socket, ctl_msg_len is zero */ + if (tcp_task->txd.ctl_msg_len == 0) { + tcp_task->txd.stage = XIO_TCP_TX_IN_SEND_DATA; + break; + } + + tcp_hndl->tmp_work.msg_iov[batch_count].iov_base = + tcp_task->txd.ctl_msg; + tcp_hndl->tmp_work.msg_iov[batch_count].iov_len = + tcp_task->txd.ctl_msg_len; + ++tcp_hndl->tmp_work.msg_len; + tcp_hndl->tmp_work.tot_iov_byte_len += + tcp_task->txd.ctl_msg_len; + + ++batch_count; + if (batch_count != batch_nr && + batch_count != tcp_hndl->tx_ready_tasks_num && + next_task && + next_tcp_task->txd.stage + <= XIO_TCP_TX_IN_SEND_CTL) { + task = next_task; + break; + } + + tcp_hndl->tmp_work.msg.msg_iov = + tcp_hndl->tmp_work.msg_iov; + tcp_hndl->tmp_work.msg.msg_iovlen = + tcp_hndl->tmp_work.msg_len; + + retval = xio_tcp_sendmsg_work(tcp_hndl->sock.cfd, + &tcp_hndl->tmp_work, 0); + + task = list_first_entry(&tcp_hndl->tx_ready_list, + struct xio_task, + tasks_list_entry); + iov_len = tcp_hndl->tmp_work.msg_len - + tcp_hndl->tmp_work.msg.msg_iovlen; + for (i = 0; i < iov_len; i++) { + tcp_task = (struct xio_tcp_task *)task->dd_data; + tcp_task->txd.stage = XIO_TCP_TX_IN_SEND_DATA; + tcp_task->txd.ctl_msg_len = 0; + task = list_first_entry_or_null( + &task->tasks_list_entry, + struct xio_task, + tasks_list_entry); + } + if (tcp_hndl->tmp_work.msg.msg_iovlen) { + tcp_task = (struct xio_tcp_task *)task->dd_data; + tcp_task->txd.ctl_msg = + tcp_hndl->tmp_work.msg.msg_iov[0].iov_base; + tcp_task->txd.ctl_msg_len = + tcp_hndl->tmp_work.msg.msg_iov[0].iov_len; + } + tcp_hndl->tmp_work.msg_len = 0; + tcp_hndl->tmp_work.tot_iov_byte_len = 0; + batch_count = 0; + + if (retval < 0) { + if (xio_get_last_socket_error() == + XIO_ECONNRESET || + xio_get_last_socket_error() == + XIO_ECONNABORTED || + /*EPIPE is not rellevant for Windows*/ + xio_get_last_socket_error() == EPIPE) { + DEBUG_LOG("tcp trans got reset "); + DEBUG_LOG("tcp_hndl=%p\n", tcp_hndl); + xio_tcp_disconnect_helper(tcp_hndl); + return 0; + } + + if (xio_get_last_socket_error() != XIO_EAGAIN) + return -1; + + /* for eagain, add event for ready for write*/ + retval = xio_context_modify_ev_handler( + tcp_hndl->base.ctx, + tcp_hndl->sock.cfd, + XIO_POLLIN | XIO_POLLRDHUP | + XIO_POLLOUT); + if (retval != 0) + ERROR_LOG("modify events failed.\n"); + + retval = -1; + goto handle_completions; + } + + task = list_first_entry( + &tcp_hndl->tx_ready_list, + struct xio_task, 
tasks_list_entry); + + break; + case XIO_TCP_TX_IN_SEND_DATA: + + for (i = 0; i < tcp_task->txd.msg.msg_iovlen; i++) { + tcp_hndl->tmp_work.msg_iov + [tcp_hndl->tmp_work.msg_len].iov_base = + tcp_task->txd.msg.msg_iov[i].iov_base; + tcp_hndl->tmp_work.msg_iov + [tcp_hndl->tmp_work.msg_len].iov_len = + tcp_task->txd.msg.msg_iov[i].iov_len; + ++tcp_hndl->tmp_work.msg_len; + } + tcp_hndl->tmp_work.tot_iov_byte_len += + tcp_task->txd.tot_iov_byte_len; + + ++batch_count; + if (batch_count != batch_nr && + batch_count != tcp_hndl->tx_ready_tasks_num && + next_task && + (next_tcp_task->txd.stage == + XIO_TCP_TX_IN_SEND_DATA) && + (next_tcp_task->txd.msg.msg_iovlen + + tcp_hndl->tmp_work.msg_len) < IOV_MAX) { + task = next_task; + break; + } + + tcp_hndl->tmp_work.msg.msg_iov = + tcp_hndl->tmp_work.msg_iov; + tcp_hndl->tmp_work.msg.msg_iovlen = + tcp_hndl->tmp_work.msg_len; + + bytes_sent = tcp_hndl->tmp_work.tot_iov_byte_len; + retval = xio_tcp_sendmsg_work(tcp_hndl->sock.dfd, + &tcp_hndl->tmp_work, 0); + bytes_sent -= tcp_hndl->tmp_work.tot_iov_byte_len; + + task = list_first_entry(&tcp_hndl->tx_ready_list, + struct xio_task, + tasks_list_entry); + iov_len = tcp_hndl->tmp_work.msg_len - + tcp_hndl->tmp_work.msg.msg_iovlen; + tmp_count = batch_count; + while (tmp_count) { + tcp_task = (struct xio_tcp_task *)task->dd_data; + + if (tcp_task->txd.msg.msg_iovlen > iov_len) + break; + + iov_len -= tcp_task->txd.msg.msg_iovlen; + bytes_sent -= tcp_task->txd.tot_iov_byte_len; + + tcp_hndl->tx_ready_tasks_num--; + + list_move_tail(&task->tasks_list_entry, + &tcp_hndl->in_flight_list); + + task_success = task; + + ++tcp_hndl->tx_comp_cnt; + + imm_comp = imm_comp || task->is_control || + (task->omsg && + (task->omsg->flags & + XIO_MSG_FLAG_IMM_SEND_COMP)); + + --tmp_count; + + task = list_first_entry( + &tcp_hndl->tx_ready_list, + struct xio_task, tasks_list_entry); + } + if (tcp_hndl->tmp_work.msg.msg_iovlen) { + tcp_task = (struct xio_tcp_task *)task->dd_data; + tcp_task->txd.msg.msg_iov = + &tcp_task->txd.msg.msg_iov[iov_len]; + tcp_task->txd.msg.msg_iov[0].iov_base = + tcp_hndl->tmp_work.msg.msg_iov[0].iov_base; + tcp_task->txd.msg.msg_iov[0].iov_len = + tcp_hndl->tmp_work.msg.msg_iov[0].iov_len; + tcp_task->txd.msg.msg_iovlen -= iov_len; + tcp_task->txd.tot_iov_byte_len -= bytes_sent; + } + + tcp_hndl->tmp_work.msg_len = 0; + tcp_hndl->tmp_work.tot_iov_byte_len = 0; + batch_count = 0; + + if (retval < 0) { + if (xio_get_last_socket_error() == + XIO_ECONNRESET || + xio_get_last_socket_error() == + XIO_ECONNABORTED || + /*EPIPE is not relevant for Windows*/ + xio_get_last_socket_error() == EPIPE) { + DEBUG_LOG("tcp trans got reset "); + DEBUG_LOG("tcp_hndl=%p\n", tcp_hndl); + xio_tcp_disconnect_helper(tcp_hndl); + return 0; + } + + if (xio_get_last_socket_error() != XIO_EAGAIN) + return -1; + + /* for eagain, add event for ready for write*/ + retval = xio_context_modify_ev_handler( + tcp_hndl->base.ctx, + tcp_hndl->sock.dfd, + XIO_POLLIN | XIO_POLLRDHUP | + XIO_POLLOUT); + if (retval != 0) + ERROR_LOG("modify events failed.\n"); + + retval = -1; + goto handle_completions; + } + + task = list_first_entry(&tcp_hndl->tx_ready_list, + struct xio_task, + tasks_list_entry); + + break; + default: + ERROR_LOG("unknown TX stage %d\n", tcp_task->txd.stage); + break; + } + } + +handle_completions: + if (task_success && + (tcp_hndl->tx_comp_cnt >= COMPLETION_BATCH_MAX || + imm_comp)) { + tcp_task = (struct xio_tcp_task *)task_success->dd_data; + retval2 = xio_ctx_add_work(tcp_hndl->base.ctx, + task_success, + 
xio_tcp_tx_completion_handler, + &tcp_task->comp_work); + if (retval2 != 0) { + ERROR_LOG("xio_ctx_add_work failed.\n"); + return retval2; + } + } + xio_context_disable_event(&tcp_hndl->flush_tx_event); + + return retval < 0 ? retval : 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_prep_req_in_data */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_prep_req_in_data(struct xio_tcp_transport *tcp_hndl, + struct xio_task *task) +{ + XIO_TO_TCP_TASK(task, tcp_task); + size_t hdr_len; + size_t data_len; + size_t xio_hdr_len; + struct xio_vmsg *vmsg = &task->omsg->in; + unsigned int i; + int retval; + struct xio_sg_table_ops *sgtbl_ops; + void *sgtbl; + void *sg; + int nents; + + sgtbl = xio_sg_table_get(&task->omsg->in); + sgtbl_ops = (struct xio_sg_table_ops *) + xio_sg_table_ops_get(task->omsg->in.sgl_type); + nents = tbl_nents(sgtbl_ops, sgtbl); + + if (nents == 0) { + tcp_task->in_tcp_op = XIO_TCP_SEND; + tcp_task->read_num_reg_mem = 0; + return 0; + } + + data_len = tbl_length(sgtbl_ops, sgtbl); + hdr_len = vmsg->header.iov_len; + if (hdr_len && hdr_len >= tcp_hndl->peer_max_header) { + ERROR_LOG("hdr_len=%d is bigger than peer_max_reader=%d\n", + hdr_len, tcp_hndl->peer_max_header); + return -1; + } + + /* before working on the out - current place after the session header */ + xio_hdr_len = xio_mbuf_get_curr_offset(&task->mbuf); + xio_hdr_len += sizeof(struct xio_tcp_rsp_hdr); + xio_hdr_len += sizeof(struct xio_sge) * nents; + + /* requester may insist on RDMA for small buffers to eliminate copy + * from receive buffers to user buffers + */ + if (!(task->omsg_flags & XIO_MSG_FLAG_PEER_WRITE_RSP) && + data_len + hdr_len + xio_hdr_len < tcp_hndl->max_inline_buf_sz) { + /* user has small response - no rdma operation expected */ + tcp_task->in_tcp_op = XIO_TCP_SEND; + tcp_task->read_num_reg_mem = (data_len) ? 
tbl_nents(sgtbl_ops, sgtbl) : 0; + } else { + /* user provided buffers with length for RDMA WRITE */ + /* user provided mr */ + tcp_task->in_tcp_op = XIO_TCP_WRITE; + sg = sge_first(sgtbl_ops, sgtbl); + if (sge_addr(sgtbl_ops, sg) && + (sge_mr(sgtbl_ops, sg) || !tcp_options.enable_mr_check)) { + for_each_sge(sgtbl, sgtbl_ops, sg, i) { + tcp_task->read_reg_mem[i].addr = + sge_addr(sgtbl_ops, sg); + tcp_task->read_reg_mem[i].priv = NULL; + tcp_task->read_reg_mem[i].mr = + (struct xio_mr *)sge_mr(sgtbl_ops, sg); + tcp_task->read_reg_mem[i].length = + sge_length(sgtbl_ops, sg); + } + } else { + if (!tcp_hndl->tcp_mempool) { + xio_set_error(XIO_E_NO_BUFS); + ERROR_LOG("message /read/write failed - " \ + "library's memory pool disabled\n"); + goto cleanup; + } + + /* user did not provide mr */ + for_each_sge(sgtbl, sgtbl_ops, sg, i) { + retval = xio_mempool_alloc( + tcp_hndl->tcp_mempool, + sge_length(sgtbl_ops, sg), + &tcp_task->read_reg_mem[i]); + + if (unlikely(retval)) { + tcp_task->read_num_reg_mem = i; + xio_set_error(ENOMEM); + ERROR_LOG( + "mempool is empty for %zd bytes\n", + sge_length(sgtbl_ops, sg)); + goto cleanup; + } + tcp_task->read_reg_mem[i].length = + sge_length(sgtbl_ops, sg); + } + } + tcp_task->read_num_reg_mem = nents; + } + if (tcp_task->read_num_reg_mem > tcp_hndl->peer_max_out_iovsz) { + ERROR_LOG("request in iovlen %d is bigger " \ + "than peer max out iovlen %d\n", + tcp_task->read_num_reg_mem, + tcp_hndl->peer_max_out_iovsz); + goto cleanup; + } + + return 0; + +cleanup: + for (i = 0; i < tcp_task->read_num_reg_mem; i++) + xio_mempool_free(&tcp_task->read_reg_mem[i]); + + tcp_task->read_num_reg_mem = 0; + xio_set_error(EMSGSIZE); + + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_single_sock_set_txd */ +/*---------------------------------------------------------------------------*/ +size_t xio_tcp_single_sock_set_txd(struct xio_task *task) +{ + XIO_TO_TCP_TASK(task, tcp_task); + size_t iov_len; + size_t tlv_len; + + tcp_task->txd.ctl_msg_len = 0; + + iov_len = xio_mbuf_get_curr_offset(&task->mbuf); + tcp_task->txd.msg_iov[0].iov_len = iov_len; + + tlv_len = iov_len - XIO_TLV_LEN; + if (tcp_task->out_tcp_op == XIO_TCP_SEND) + tlv_len += (size_t)tcp_task->txd.tot_iov_byte_len; + + tcp_task->txd.tot_iov_byte_len += iov_len; + + tcp_task->txd.msg.msg_iov = tcp_task->txd.msg_iov; + tcp_task->txd.msg.msg_iovlen = tcp_task->txd.msg_len; + + return tlv_len; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_dual_sock_set_txd */ +/*---------------------------------------------------------------------------*/ +size_t xio_tcp_dual_sock_set_txd(struct xio_task *task) +{ + XIO_TO_TCP_TASK(task, tcp_task); + size_t iov_len; + /* is this is an application message than send the user header + * with the XIO message to be used with assign_data_in_buf + */ + if (IS_APPLICATION_MSG(task->tlv_type)) { + iov_len = xio_mbuf_get_curr_offset(&task->mbuf); + tcp_task->txd.ctl_msg_len = iov_len; + tcp_task->txd.msg_iov[0].iov_len = iov_len; + /* header is sent with XIO management data*/ + --tcp_task->txd.msg_len; + /* set the send location considering the user header */ + if (iov_len == 0) + tcp_task->txd.msg.msg_iov = tcp_task->txd.msg_iov; + else + tcp_task->txd.msg.msg_iov = &tcp_task->txd.msg_iov[1]; + } else { + iov_len = xio_mbuf_get_curr_offset(&task->mbuf) + - tcp_task->txd.ctl_msg_len; + tcp_task->txd.msg_iov[0].iov_len = iov_len; + inc_ptr(tcp_task->txd.msg_iov[0].iov_base, + 
tcp_task->txd.ctl_msg_len); + + tcp_task->txd.tot_iov_byte_len += iov_len; + + if (tcp_task->txd.msg_iov[0].iov_len == 0) { + tcp_task->txd.msg.msg_iov = &tcp_task->txd.msg_iov[1]; + --tcp_task->txd.msg_len; + } else { + tcp_task->txd.msg.msg_iov = tcp_task->txd.msg_iov; + } + } + tcp_task->txd.msg.msg_iovlen = tcp_task->txd.msg_len; + return tcp_task->txd.ctl_msg_len - XIO_TLV_LEN; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_send_req */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_send_req(struct xio_tcp_transport *tcp_hndl, + struct xio_task *task) +{ + XIO_TO_TCP_TASK(task, tcp_task); + size_t retval; + size_t tlv_len; + + /* prepare buffer for response */ + retval = xio_tcp_prep_req_in_data(tcp_hndl, task); + if (unlikely(retval != 0)) { + ERROR_LOG("tcp_prep_req_in_data failed\n"); + return -1; + } + + /* prepare the out message */ + retval = xio_tcp_prep_req_out_data(tcp_hndl, task); + if (unlikely(retval != 0)) { + ERROR_LOG("tcp_prep_req_out_data failed\n"); + return -1; + } + + /* set the length */ + tlv_len = tcp_hndl->sock.ops->set_txd(task); + + /* add tlv */ + if (xio_mbuf_write_tlv(&task->mbuf, task->tlv_type, tlv_len) != 0) { + ERROR_LOG("write tlv failed\n"); + xio_set_error(EOVERFLOW); + return -1; + } + + xio_task_addref(task); + + tcp_task->out_tcp_op = XIO_TCP_SEND; + + list_move_tail(&task->tasks_list_entry, &tcp_hndl->tx_ready_list); + + tcp_hndl->tx_ready_tasks_num++; + + retval = xio_tcp_xmit(tcp_hndl); + if (retval) { + if (xio_errno() != XIO_EAGAIN) { + DEBUG_LOG("xio_tcp_xmit failed\n"); + return -1; + } + xio_context_add_event(tcp_hndl->base.ctx, + &tcp_hndl->flush_tx_event); + retval = 0; + } + + return retval; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_write_rsp_header */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_write_rsp_header(struct xio_tcp_transport *tcp_hndl, + struct xio_task *task, + struct xio_tcp_rsp_hdr *rsp_hdr) +{ + XIO_TO_TCP_TASK(task, tcp_task); + struct xio_tcp_rsp_hdr *tmp_rsp_hdr; + uint32_t *wr_len; + int i; + size_t hdr_len; + + /* point to transport header */ + xio_mbuf_set_trans_hdr(&task->mbuf); + tmp_rsp_hdr = (struct xio_tcp_rsp_hdr *) + xio_mbuf_get_curr_ptr(&task->mbuf); + + /* pack relevant values */ + tmp_rsp_hdr->version = rsp_hdr->version; + tmp_rsp_hdr->flags = rsp_hdr->flags; + PACK_SVAL(rsp_hdr, tmp_rsp_hdr, rsp_hdr_len); + PACK_LVAL(rsp_hdr, tmp_rsp_hdr, ltid); + PACK_LVAL(rsp_hdr, tmp_rsp_hdr, rtid); + tmp_rsp_hdr->out_tcp_op = rsp_hdr->out_tcp_op; + PACK_LVAL(rsp_hdr, tmp_rsp_hdr, status); + PACK_SVAL(rsp_hdr, tmp_rsp_hdr, out_num_sge); + PACK_SVAL(rsp_hdr, tmp_rsp_hdr, ulp_hdr_len); + PACK_SVAL(rsp_hdr, tmp_rsp_hdr, ulp_pad_len); + /* remain_data_len not in use */ + PACK_LLVAL(rsp_hdr, tmp_rsp_hdr, ulp_imm_len); + + if (rsp_hdr->out_num_sge) { + wr_len = (uint32_t *)((uint8_t *)tmp_rsp_hdr + + sizeof(struct xio_tcp_rsp_hdr)); + + /* params for RDMA WRITE equivalent*/ + for (i = 0; i < rsp_hdr->out_num_sge; i++) { + *wr_len = htonl(tcp_task->rsp_out_sge[i].length); + wr_len++; + } + } + + hdr_len = sizeof(struct xio_tcp_rsp_hdr); + hdr_len += sizeof(uint32_t) * rsp_hdr->out_num_sge; + + xio_mbuf_inc(&task->mbuf, hdr_len); + +#ifdef EYAL_TODO + print_hex_dump_bytes("post_send: ", DUMP_PREFIX_ADDRESS, + task->mbuf.tlv.head, 64); +#endif + return 0; +} + 
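+/*
+ * Illustrative sketch (compiled out, helper name invented): the deferred
+ * transmit pattern used by xio_tcp_send_req() above and xio_tcp_send_rsp()
+ * below. When xio_tcp_xmit() cannot make progress it reports XIO_EAGAIN and
+ * the sender arms flush_tx_event instead of surfacing an error.
+ */
+#if 0
+static int example_submit_or_defer(struct xio_tcp_transport *tcp_hndl)
+{
+	if (xio_tcp_xmit(tcp_hndl) == 0)
+		return 0;
+
+	if (xio_errno() != XIO_EAGAIN)
+		return -1;
+
+	/* socket or completion budget is full: retry from the event loop */
+	xio_context_add_event(tcp_hndl->base.ctx, &tcp_hndl->flush_tx_event);
+	return 0;
+}
+#endif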
+/*---------------------------------------------------------------------------*/ +/* xio_tcp_prep_rsp_header */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_prep_rsp_header(struct xio_tcp_transport *tcp_hndl, + struct xio_task *task, + uint16_t ulp_hdr_len, + uint16_t ulp_pad_len, + uint64_t ulp_imm_len, + uint32_t status) +{ + XIO_TO_TCP_TASK(task, tcp_task); + struct xio_tcp_rsp_hdr rsp_hdr; + + if (unlikely(!IS_RESPONSE(task->tlv_type))) { + ERROR_LOG("unknown message type\n"); + return -1; + } + + /* fill response header */ + rsp_hdr.version = XIO_TCP_RSP_HEADER_VERSION; + rsp_hdr.rsp_hdr_len = sizeof(rsp_hdr); + rsp_hdr.rtid = task->rtid; + rsp_hdr.ltid = task->ltid; + rsp_hdr.out_tcp_op = tcp_task->out_tcp_op; + rsp_hdr.flags = XIO_HEADER_FLAG_NONE; + rsp_hdr.out_num_sge = tcp_task->rsp_out_num_sge; + rsp_hdr.ulp_hdr_len = ulp_hdr_len; + rsp_hdr.ulp_pad_len = ulp_pad_len; + rsp_hdr.ulp_imm_len = ulp_imm_len; + rsp_hdr.status = status; + if (xio_tcp_write_rsp_header(tcp_hndl, task, &rsp_hdr) != 0) + goto cleanup; + + tcp_task->txd.ctl_msg_len = xio_mbuf_tlv_len(&task->mbuf); + + /* write the payload header */ + if (ulp_hdr_len) { + if (xio_mbuf_write_array( + &task->mbuf, + task->omsg->out.header.iov_base, + task->omsg->out.header.iov_len) != 0) + goto cleanup; + } + + /* write the pad between header and data */ + if (ulp_pad_len) + xio_mbuf_inc(&task->mbuf, ulp_pad_len); + + return 0; + +cleanup: + xio_set_error(XIO_E_MSG_SIZE); + ERROR_LOG("xio_tcp_write_rsp_header failed\n"); + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_prep_rsp_wr_data */ +/*---------------------------------------------------------------------------*/ +int xio_tcp_prep_rsp_wr_data(struct xio_tcp_transport *tcp_hndl, + struct xio_task *task) +{ + XIO_TO_TCP_TASK(task, tcp_task); + unsigned int i, llen = 0, rlen = 0; + int retval; + struct xio_sg_table_ops *sgtbl_ops; + void *sgtbl; + void *sg; + + sgtbl = xio_sg_table_get(&task->omsg->out); + sgtbl_ops = (struct xio_sg_table_ops *) + xio_sg_table_ops_get(task->omsg->out.sgl_type); + + /* user did not provided mr */ + sg = sge_first(sgtbl_ops, sgtbl); + if (!sge_mr(sgtbl_ops, sg) && + tcp_options.enable_mr_check) { + if (!tcp_hndl->tcp_mempool) { + xio_set_error(XIO_E_NO_BUFS); + ERROR_LOG("message /read/write failed - " \ + "library's memory pool disabled\n"); + goto cleanup; + } + /* user did not provide mr - take buffers from pool + * and do copy */ + for_each_sge(sgtbl, sgtbl_ops, sg, i) { + retval = xio_mempool_alloc( + tcp_hndl->tcp_mempool, + sge_length(sgtbl_ops, sg), + &tcp_task->write_reg_mem[i]); + if (unlikely(retval)) { + tcp_task->write_num_reg_mem = i; + xio_set_error(ENOMEM); + ERROR_LOG("mempool is empty for %zd bytes\n", + sge_length(sgtbl_ops, sg)); + goto cleanup; + } + + /* copy the data to the buffer */ + memcpy(tcp_task->write_reg_mem[i].addr, + sge_addr(sgtbl_ops, sg), + sge_length(sgtbl_ops, sg)); + + tcp_task->txd.msg_iov[i + 1].iov_base = + tcp_task->write_reg_mem[i].addr; + tcp_task->txd.msg_iov[i + 1].iov_len = + sge_length(sgtbl_ops, sg); + llen += sge_length(sgtbl_ops, sg); + } + } else { + for_each_sge(sgtbl, sgtbl_ops, sg, i) { + tcp_task->txd.msg_iov[i + 1].iov_base = + sge_addr(sgtbl_ops, sg); + tcp_task->txd.msg_iov[i + 1].iov_len = + sge_length(sgtbl_ops, sg); + llen += sge_length(sgtbl_ops, sg); + } + } + + tcp_task->txd.msg_len = + tbl_nents(sgtbl_ops, sgtbl) + 1; + tcp_task->txd.tot_iov_byte_len = llen; + + 
for (i = 0; i < tcp_task->req_in_num_sge; i++) + rlen += tcp_task->req_in_sge[i].length; + + if (rlen < llen) { + ERROR_LOG("peer provided too small iovec\n"); + ERROR_LOG("tcp write is ignored\n"); + task->status = EINVAL; + goto cleanup; + } + + i = 0; + while (llen) { + if (tcp_task->req_in_sge[i].length < llen) { + tcp_task->rsp_out_sge[i].length = + tcp_task->req_in_sge[i].length; + } else { + tcp_task->rsp_out_sge[i].length = + llen; + } + llen -= tcp_task->rsp_out_sge[i].length; + ++i; + } + tcp_task->rsp_out_num_sge = i; + + return 0; +cleanup: + for (i = 0; i < tcp_task->write_num_reg_mem; i++) + xio_mempool_free(&tcp_task->write_reg_mem[i]); + + tcp_task->write_num_reg_mem = 0; + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_send_rsp */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_send_rsp(struct xio_tcp_transport *tcp_hndl, + struct xio_task *task) +{ + XIO_TO_TCP_TASK(task, tcp_task); + uint64_t xio_hdr_len; + uint64_t ulp_hdr_len; + uint64_t ulp_pad_len = 0; + uint64_t ulp_imm_len; + size_t retval; + int enforce_write_rsp; + int tlv_len = 0; + struct xio_sg_table_ops *sgtbl_ops; + void *sgtbl; + + sgtbl = xio_sg_table_get(&task->omsg->out); + sgtbl_ops = (struct xio_sg_table_ops *) + xio_sg_table_ops_get(task->omsg->out.sgl_type); + + /* calculate headers */ + ulp_hdr_len = task->omsg->out.header.iov_len; + ulp_imm_len = tbl_length(sgtbl_ops, sgtbl); + xio_hdr_len = xio_mbuf_get_curr_offset(&task->mbuf); + xio_hdr_len += sizeof(struct xio_tcp_rsp_hdr); + xio_hdr_len += tcp_task->req_in_num_sge * sizeof(struct xio_sge); + enforce_write_rsp = task->imsg_flags & XIO_HEADER_FLAG_PEER_WRITE_RSP; + + if (g_options.inline_xio_data_align && ulp_imm_len) { + uint16_t hdr_len = xio_hdr_len + ulp_hdr_len; + + ulp_pad_len = ALIGN(hdr_len, g_options.inline_xio_data_align) - + hdr_len; + } + /* + if (tcp_hndl->max_inline_buf_sz < xio_hdr_len + ulp_hdr_len) { + ERROR_LOG("header size %lu exceeds max header %lu\n", + ulp_hdr_len, + tcp_hndl->max_inline_buf_sz - xio_hdr_len); + xio_set_error(XIO_E_MSG_SIZE); + goto cleanup; + } + */ + + /* Small data is outgoing via SEND unless the requester explicitly + * insisted on RDMA operation and provided resources. 
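+	 * The inline cut-off is max_inline_buf_sz: when header + pad +
+	 * payload stay below it (and the peer did not force a write
+	 * response), everything goes out in a single SEND.  Otherwise the
+	 * payload is streamed via the RDMA_WRITE equivalent into the
+	 * buffers the requester advertised in req_in_sge, with only the
+	 * header carried by the SEND; if no such buffer was advertised,
+	 * a XIO_E_PARTIAL_MSG response header is returned instead.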
+ */ + if ((ulp_imm_len == 0) || (!enforce_write_rsp && + ((xio_hdr_len + ulp_hdr_len + + ulp_pad_len + ulp_imm_len) + < tcp_hndl->max_inline_buf_sz))) { + tcp_task->out_tcp_op = XIO_TCP_SEND; + /* write xio header to the buffer */ + retval = xio_tcp_prep_rsp_header( + tcp_hndl, task, + (uint16_t)ulp_hdr_len, + (uint16_t)ulp_pad_len, ulp_imm_len, + XIO_E_SUCCESS); + if (retval) + goto cleanup; + + /* if there is data, set it to buffer or directly to the sge */ + if (ulp_imm_len) { + retval = xio_tcp_write_send_data(tcp_hndl, task); + if (retval) + goto cleanup; + } + } else { + if (tcp_task->req_in_sge[0].addr && + tcp_task->req_in_sge[0].length) { + /* the data is sent via RDMA_WRITE equivalent*/ + tcp_task->out_tcp_op = XIO_TCP_WRITE; + /* prepare rdma write equivalent */ + retval = xio_tcp_prep_rsp_wr_data(tcp_hndl, task); + if (retval) + goto cleanup; + + /* and the header is sent via SEND */ + /* write xio header to the buffer */ + retval = xio_tcp_prep_rsp_header( + tcp_hndl, task, + (uint16_t)ulp_hdr_len, 0, ulp_imm_len, + XIO_E_SUCCESS); + if (retval) + goto cleanup; + } else { + ERROR_LOG("partial completion of request due " \ + "to missing, response buffer\n"); + + /* the client did not provide buffer for response */ + retval = xio_tcp_prep_rsp_header( + tcp_hndl, task, + (uint16_t)ulp_hdr_len, 0, 0, + XIO_E_PARTIAL_MSG); + goto cleanup; + } + } + + if (ulp_imm_len == 0) { + /* no data at all */ + tbl_set_nents(sgtbl_ops, sgtbl, 0); + tcp_task->txd.tot_iov_byte_len = 0; + tcp_task->txd.msg_len = 1; + } + + /* set the length */ + tlv_len = tcp_hndl->sock.ops->set_txd(task); + + /* add tlv */ + if (xio_mbuf_write_tlv(&task->mbuf, task->tlv_type, tlv_len) != 0) + goto cleanup; + + list_move_tail(&task->tasks_list_entry, &tcp_hndl->tx_ready_list); + + tcp_hndl->tx_ready_tasks_num++; + + retval = xio_tcp_xmit(tcp_hndl); + if (retval) { + /* no need xio_get_last_error here */ + retval = xio_errno(); + if (retval != XIO_EAGAIN) { + ERROR_LOG("xio_xmit_tcp failed. %s\n", + xio_strerror(retval)); + return -1; + } + xio_context_add_event(tcp_hndl->base.ctx, + &tcp_hndl->flush_tx_event); + retval = 0; + } + + return retval; + +cleanup: + xio_set_error(XIO_E_MSG_SIZE); + ERROR_LOG("xio_tcp_send_msg failed\n"); + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_read_req_header */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_read_req_header(struct xio_tcp_transport *tcp_hndl, + struct xio_task *task, + struct xio_tcp_req_hdr *req_hdr) +{ + XIO_TO_TCP_TASK(task, tcp_task); + struct xio_tcp_req_hdr *tmp_req_hdr; + struct xio_sge *tmp_sge; + int i; + size_t hdr_len; + + /* point to transport header */ + xio_mbuf_set_trans_hdr(&task->mbuf); + tmp_req_hdr = (struct xio_tcp_req_hdr *) + xio_mbuf_get_curr_ptr(&task->mbuf); + + req_hdr->version = tmp_req_hdr->version; + req_hdr->flags = tmp_req_hdr->flags; + UNPACK_SVAL(tmp_req_hdr, req_hdr, req_hdr_len); + + if (unlikely(req_hdr->req_hdr_len != sizeof(struct xio_tcp_req_hdr))) { + ERROR_LOG( + "header length's read failed. 
arrived:%d expected:%zd\n", + req_hdr->req_hdr_len, sizeof(struct xio_tcp_req_hdr)); + return -1; + } + + UNPACK_SVAL(tmp_req_hdr, req_hdr, sn); + UNPACK_LVAL(tmp_req_hdr, req_hdr, ltid); + req_hdr->out_tcp_op = tmp_req_hdr->out_tcp_op; + req_hdr->in_tcp_op = tmp_req_hdr->in_tcp_op; + + UNPACK_SVAL(tmp_req_hdr, req_hdr, in_num_sge); + UNPACK_SVAL(tmp_req_hdr, req_hdr, out_num_sge); + UNPACK_SVAL(tmp_req_hdr, req_hdr, ulp_hdr_len); + UNPACK_SVAL(tmp_req_hdr, req_hdr, ulp_pad_len); + + /* remain_data_len not in use */ + UNPACK_LLVAL(tmp_req_hdr, req_hdr, ulp_imm_len); + + tmp_sge = (struct xio_sge *)((uint8_t *)tmp_req_hdr + + sizeof(struct xio_tcp_req_hdr)); + + tcp_task->sn = req_hdr->sn; + + /* params for SEND/RDMA_WRITE */ + for (i = 0; i < req_hdr->in_num_sge; i++) { + UNPACK_LLVAL(tmp_sge, &tcp_task->req_in_sge[i], addr); + UNPACK_LVAL(tmp_sge, &tcp_task->req_in_sge[i], length); + UNPACK_LVAL(tmp_sge, &tcp_task->req_in_sge[i], stag); + tmp_sge++; + } + tcp_task->req_in_num_sge = i; + + /* params for RDMA_READ */ + for (i = 0; i < req_hdr->out_num_sge; i++) { + UNPACK_LLVAL(tmp_sge, &tcp_task->req_out_sge[i], addr); + UNPACK_LVAL(tmp_sge, &tcp_task->req_out_sge[i], length); + UNPACK_LVAL(tmp_sge, &tcp_task->req_out_sge[i], stag); + tmp_sge++; + } + tcp_task->req_out_num_sge = i; + + hdr_len = sizeof(struct xio_tcp_req_hdr); + hdr_len += sizeof(struct xio_sge) * (req_hdr->in_num_sge + + req_hdr->out_num_sge); + + xio_mbuf_inc(&task->mbuf, hdr_len); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_read_rsp_header */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_read_rsp_header(struct xio_tcp_transport *tcp_hndl, + struct xio_task *task, + struct xio_tcp_rsp_hdr *rsp_hdr) +{ + XIO_TO_TCP_TASK(task, tcp_task); + struct xio_tcp_rsp_hdr *tmp_rsp_hdr; + uint32_t *wr_len; + int i; + size_t hdr_len; + + /* point to transport header */ + xio_mbuf_set_trans_hdr(&task->mbuf); + tmp_rsp_hdr = (struct xio_tcp_rsp_hdr *) + xio_mbuf_get_curr_ptr(&task->mbuf); + + rsp_hdr->version = tmp_rsp_hdr->version; + rsp_hdr->flags = tmp_rsp_hdr->flags; + UNPACK_SVAL(tmp_rsp_hdr, rsp_hdr, rsp_hdr_len); + + if (unlikely(rsp_hdr->rsp_hdr_len != sizeof(struct xio_tcp_rsp_hdr))) { + ERROR_LOG( + "header length's read failed. 
arrived:%d expected:%zd\n", + rsp_hdr->rsp_hdr_len, sizeof(struct xio_tcp_rsp_hdr)); + return -1; + } + + UNPACK_SVAL(tmp_rsp_hdr, rsp_hdr, sn); + UNPACK_LVAL(tmp_rsp_hdr, rsp_hdr, rtid); + UNPACK_LVAL(tmp_rsp_hdr, rsp_hdr, ltid); + rsp_hdr->out_tcp_op = tmp_rsp_hdr->out_tcp_op; + UNPACK_LVAL(tmp_rsp_hdr, rsp_hdr, status); + UNPACK_SVAL(tmp_rsp_hdr, rsp_hdr, out_num_sge); + UNPACK_SVAL(tmp_rsp_hdr, rsp_hdr, ulp_hdr_len); + UNPACK_SVAL(tmp_rsp_hdr, rsp_hdr, ulp_pad_len); + /* remain_data_len not in use */ + UNPACK_LLVAL(tmp_rsp_hdr, rsp_hdr, ulp_imm_len); + + if (rsp_hdr->out_num_sge) { + wr_len = (uint32_t *)((uint8_t *)tmp_rsp_hdr + + sizeof(struct xio_tcp_rsp_hdr)); + + /* params for RDMA WRITE */ + for (i = 0; i < rsp_hdr->out_num_sge; i++) { + tcp_task->rsp_out_sge[i].length = ntohl(*wr_len); + wr_len++; + } + tcp_task->rsp_out_num_sge = rsp_hdr->out_num_sge; + } + + hdr_len = sizeof(struct xio_tcp_rsp_hdr); + hdr_len += sizeof(uint32_t) * rsp_hdr->out_num_sge; + + xio_mbuf_inc(&task->mbuf, hdr_len); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_notify_assign_in_buf */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_assign_in_buf(struct xio_tcp_transport *tcp_hndl, + struct xio_task *task, int *is_assigned) +{ + union xio_transport_event_data event_data = {}; + + event_data.assign_in_buf.task = task; + + xio_transport_notify_observer(&tcp_hndl->base, + XIO_TRANSPORT_EVENT_ASSIGN_IN_BUF, + &event_data); + + *is_assigned = event_data.assign_in_buf.is_assigned; + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_alloc_data_buf */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_alloc_data_buf(struct xio_tcp_transport *tcp_hndl, + struct xio_task *task) +{ + union xio_transport_event_data event_data = {}; + + event_data.alloc_data_buf.task = task; + event_data.alloc_data_buf.is_assigned = 0; + xio_transport_notify_observer(&tcp_hndl->base, + XIO_TRANSPORT_EVENT_ALLOC_DATA_BUF, + &event_data); + return event_data.assign_in_buf.is_assigned; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_alloc_head_buf */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_alloc_head_buf(struct xio_tcp_transport *tcp_hndl, + struct xio_task *task, struct xio_iovec *header) +{ + union xio_transport_event_data event_data = {}; + + event_data.alloc_head_buf.task = task; + event_data.alloc_head_buf.is_assigned = 0; + event_data.alloc_head_buf.header = header; + xio_transport_notify_observer(&tcp_hndl->base, + XIO_TRANSPORT_EVENT_ALLOC_HEAD_BUF, + &event_data); + return event_data.assign_in_buf.is_assigned; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_recv_ctl_work */ +/*---------------------------------------------------------------------------*/ +int xio_tcp_recv_ctl_work(struct xio_tcp_transport *tcp_hndl, int fd, + struct xio_tcp_work_req *xio_recv, int block) +{ + int retval; + int bytes_to_copy; + + if (xio_recv->tot_iov_byte_len == 0) + return 1; + + if (xio_recv->msg.msg_iovlen > 1 || + xio_recv->tot_iov_byte_len != xio_recv->msg.msg_iov[0].iov_len) { + ERROR_LOG("expecting only 1 sized iovec\n"); + return 0; + } + + while (xio_recv->tot_iov_byte_len) { + while (tcp_hndl->tmp_rx_buf_len == 0) { + retval = recv(fd, (char 
*)tcp_hndl->tmp_rx_buf, + TMP_RX_BUF_SIZE, 0); + if (retval > 0) { + tcp_hndl->tmp_rx_buf_len = retval; + tcp_hndl->tmp_rx_buf_cur = tcp_hndl->tmp_rx_buf; + } else if (retval == 0) { + /*so errno is not EAGAIN*/ + xio_set_error(XIO_ECONNABORTED); + DEBUG_LOG("tcp transport got EOF,tcp_hndl=%p\n", + tcp_hndl); + return 0; + } else { + if (xio_get_last_socket_error() == XIO_EAGAIN) { + if (!block) { + xio_set_error( + xio_get_last_socket_error()); + return -1; + } + } else if (xio_get_last_socket_error() == + XIO_ECONNRESET || + xio_get_last_socket_error() == + XIO_ECONNABORTED) { + xio_set_error( + xio_get_last_socket_error()); + DEBUG_LOG("recv failed.(errno=%d)\n", + xio_get_last_socket_error()); + return 0; + } else { + xio_set_error( + xio_get_last_socket_error()); + ERROR_LOG("recv failed.(errno=%d)\n", + xio_get_last_socket_error()); + return -1; + } + } + } + bytes_to_copy = xio_recv->tot_iov_byte_len > + tcp_hndl->tmp_rx_buf_len ? + (int)tcp_hndl->tmp_rx_buf_len : + (int)xio_recv->tot_iov_byte_len; + memcpy(xio_recv->msg.msg_iov[0].iov_base, + tcp_hndl->tmp_rx_buf_cur, bytes_to_copy); + inc_ptr(tcp_hndl->tmp_rx_buf_cur, bytes_to_copy); + inc_ptr(xio_recv->msg.msg_iov[0].iov_base, bytes_to_copy); + tcp_hndl->tmp_rx_buf_len -= bytes_to_copy; + xio_recv->msg.msg_iov[0].iov_len -= bytes_to_copy; + xio_recv->tot_iov_byte_len -= bytes_to_copy; + } + + xio_recv->msg.msg_iovlen = 0; + + return 1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_recvmsg_work */ +/*---------------------------------------------------------------------------*/ +int xio_tcp_recvmsg_work(struct xio_tcp_transport *tcp_hndl, int fd, + struct xio_tcp_work_req *xio_recv, int block) +{ + unsigned int i; + int retval; + int recv_bytes = 0, tmp_bytes; + + if (xio_recv->tot_iov_byte_len == 0) + return 1; + + while (xio_recv->tot_iov_byte_len) { + retval = recvmsg(fd, &xio_recv->msg, 0); + if (retval > 0) { + recv_bytes += retval; + xio_recv->tot_iov_byte_len -= retval; + + if (xio_recv->tot_iov_byte_len == 0) { + xio_recv->msg.msg_iovlen = 0; + break; + } + + tmp_bytes = 0; + for (i = 0; i < xio_recv->msg.msg_iovlen; i++) { + if (xio_recv->msg.msg_iov[i].iov_len + + tmp_bytes <= (size_t)retval) { + tmp_bytes += + xio_recv->msg.msg_iov[i].iov_len; + } else { + xio_recv->msg.msg_iov[i].iov_len -= + (retval - tmp_bytes); + inc_ptr( + xio_recv->msg.msg_iov[i].iov_base, + retval - tmp_bytes); + xio_recv->msg.msg_iov = + &xio_recv->msg.msg_iov[i]; + xio_recv->msg.msg_iovlen -= i; + break; + } + } + } else if (retval == 0) { + xio_set_error(ECONNABORTED); /*so errno is not EAGAIN*/ + DEBUG_LOG("tcp transport got EOF, tcp_hndl=%p\n", + tcp_hndl); + return 0; + } else { + if (xio_get_last_socket_error() == XIO_EAGAIN) { + if (!block) { + xio_set_error( + xio_get_last_socket_error()); + return -1; + } + } else if (xio_get_last_socket_error() == + XIO_ECONNRESET || + xio_get_last_socket_error() == + XIO_ECONNABORTED) { + xio_set_error(xio_get_last_socket_error()); + DEBUG_LOG("recvmsg failed. (errno=%d)\n", + xio_get_last_socket_error()); + return 0; + } + xio_set_error(xio_get_last_socket_error()); + ERROR_LOG("recvmsg failed. 
(errno=%d)\n", + xio_get_last_socket_error()); + return -1; + } + } + + return recv_bytes; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_single_sock_set_rxd */ +/*---------------------------------------------------------------------------*/ +void xio_tcp_single_sock_set_rxd(struct xio_task *task, + void *buf, uint32_t len) +{ + XIO_TO_TCP_TASK(task, tcp_task); + tcp_task->rxd.tot_iov_byte_len = 0; + tcp_task->rxd.msg_len = 0; +} +void xio_tcp_single_sock_set_rxd_iov(struct xio_task *task, + void *msg_iov, uint32_t msg_len, uint64_t total_len) +{ + XIO_TO_TCP_TASK(task, tcp_task); + tcp_task->rxd.tot_iov_byte_len = 0; + tcp_task->rxd.msg_len = 0; +} +/*---------------------------------------------------------------------------*/ +/* xio_tcp_dual_sock_set_rxd */ +/*---------------------------------------------------------------------------*/ +void xio_tcp_dual_sock_set_rxd(struct xio_task *task, + void *buf, uint32_t len) +{ + XIO_TO_TCP_TASK(task, tcp_task); + if (IS_APPLICATION_MSG(task->tlv_type)) + inc_ptr(buf, task->imsg.in.header.iov_len); + tcp_task->rxd.msg_iov[0].iov_base = buf; + tcp_task->rxd.msg_iov[0].iov_len = len; + tcp_task->rxd.tot_iov_byte_len = len; + if (len) { + tcp_task->rxd.msg_len = 1; + tcp_task->rxd.msg.msg_iovlen = 1; + tcp_task->rxd.msg.msg_iov = tcp_task->rxd.msg_iov; + } else { + tcp_task->rxd.msg_len = 0; + tcp_task->rxd.msg.msg_iovlen = 0; + } +} +/*---------------------------------------------------------------------------*/ +/* xio_tcp_dual_sock_set_rxd */ +/*---------------------------------------------------------------------------*/ +void xio_tcp_dual_sock_set_rxd_iov(struct xio_task *task, + void *msg_iov, uint32_t msg_len, uint64_t total_len) +{ + struct iovec *ptr_iov = (struct iovec *)msg_iov; + XIO_TO_TCP_TASK(task, tcp_task); + if(!ptr_iov || !msg_len || !total_len){ + tcp_task->rxd.msg_len = 0; + tcp_task->rxd.msg.msg_iovlen = 0; + tcp_task->rxd.tot_iov_byte_len = 0; + return; + } + if (IS_APPLICATION_MSG(task->tlv_type)){ + tcp_task->rxd.msg_iov[0].iov_base = ptr_iov[0].iov_base; + tcp_task->rxd.msg_iov[0].iov_len = ptr_iov[0].iov_len; + tcp_task->rxd.tot_iov_byte_len = total_len; + tcp_task->rxd.msg_len = msg_len; + if (msg_len) { + tcp_task->rxd.msg.msg_iovlen = msg_len; + tcp_task->rxd.msg.msg_iov = (struct iovec *)ptr_iov; + } else { + tcp_task->rxd.msg_len = 0; + tcp_task->rxd.msg.msg_iovlen = 0; + } + } +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_rd_req_header */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_rd_req_header(struct xio_tcp_transport *tcp_hndl, + struct xio_task *task) +{ + XIO_TO_TCP_TASK(task, tcp_task); + unsigned int i, vec_size = 0; + int retval; + int user_assign_flag = 0; + size_t rlen = 0, llen = 0; + struct xio_sg_table_ops *sgtbl_ops; + void *sgtbl; + void *sg; + + /* responder side got request for rdma read */ + + /* need for buffer to do rdma read. 
there are two options: */ + /* option 1: user provides call back that fills application memory */ + /* option 2: use internal buffer pool */ + + /* hint the upper layer of sizes */ + sgtbl = xio_sg_table_get(&task->imsg.in); + sgtbl_ops = (struct xio_sg_table_ops *) + xio_sg_table_ops_get(task->imsg.in.sgl_type); + tbl_set_nents(sgtbl_ops, sgtbl, tcp_task->req_out_num_sge); + for_each_sge(sgtbl, sgtbl_ops, sg, i) { + sge_set_addr(sgtbl_ops, sg, NULL); + sge_set_length(sgtbl_ops, sg, + tcp_task->req_out_sge[i].length); + rlen += tcp_task->req_out_sge[i].length; + tcp_task->read_reg_mem[i].priv = NULL; + } + sgtbl = xio_sg_table_get(&task->imsg.out); + sgtbl_ops = (struct xio_sg_table_ops *) + xio_sg_table_ops_get(task->imsg.out.sgl_type); + if (tcp_task->req_in_num_sge) { + tbl_set_nents(sgtbl_ops, sgtbl, tcp_task->req_in_num_sge); + for_each_sge(sgtbl, sgtbl_ops, sg, i) { + sge_set_addr(sgtbl_ops, sg, NULL); + sge_set_length(sgtbl_ops, sg, + tcp_task->req_in_sge[i].length); + tcp_task->write_reg_mem[i].priv = NULL; + } + } else { + tbl_set_nents(sgtbl_ops, sgtbl, 0); + } + sgtbl = xio_sg_table_get(&task->imsg.in); + sgtbl_ops = (struct xio_sg_table_ops *) + xio_sg_table_ops_get(task->imsg.in.sgl_type); + + xio_tcp_assign_in_buf(tcp_hndl, task, &user_assign_flag); + if (user_assign_flag) { + /* if user does not have buffers ignore */ + if (tbl_nents(sgtbl_ops, sgtbl) == 0) { + WARN_LOG("application has not provided buffers\n"); + WARN_LOG("tcp read is ignored\n"); + task->status = XIO_E_NO_USER_BUFS; + return -1; + } + + for_each_sge(sgtbl, sgtbl_ops, sg, i) { + if (!sge_mr(sgtbl_ops, sg) && tcp_options.enable_mr_check) { + ERROR_LOG("application has not provided mr\n"); + ERROR_LOG("tcp read is ignored\n"); + task->status = XIO_E_NO_USER_MR; + return -1; + } + if (!sge_addr(sgtbl_ops, sg)) { + ERROR_LOG("application has provided " \ + "null address\n"); + ERROR_LOG("tcp read is ignored\n"); + task->status = XIO_E_NO_USER_BUFS; + return -1; + } + llen += sge_length(sgtbl_ops, sg); + vec_size++; + if (llen > rlen) { + sge_set_length(sgtbl_ops, sg, rlen - + (llen - + sge_length(sgtbl_ops, sg))); + tcp_task->req_out_sge[i].length = + sge_length(sgtbl_ops, sg); + break; + } + tcp_task->req_out_sge[i].length = + sge_length(sgtbl_ops, sg); + } + if (rlen > llen) { + ERROR_LOG("application provided too small iovec\n"); + ERROR_LOG("remote peer want to write %zd bytes while" \ + "local peer provided buffer size %zd bytes\n", + rlen, llen); + ERROR_LOG("tcp read is ignored\n"); + task->status = XIO_E_USER_BUF_OVERFLOW; + return -1; + } + + tcp_task->req_out_num_sge = vec_size; + tbl_set_nents(sgtbl_ops, sgtbl, vec_size); + set_bits(XIO_MSG_HINT_ASSIGNED_DATA_IN_BUF, &task->imsg.hints); + } else { + if (!tcp_hndl->tcp_mempool) { + ERROR_LOG("message /read/write failed - " \ + "library's memory pool disabled\n"); + task->status = XIO_E_NO_BUFS; + goto cleanup; + } + + tbl_set_nents(sgtbl_ops, sgtbl, tcp_task->req_out_num_sge); + for_each_sge(sgtbl, sgtbl_ops, sg, i) { + retval = xio_mempool_alloc( + tcp_hndl->tcp_mempool, + tcp_task->req_out_sge[i].length, + &tcp_task->read_reg_mem[i]); + + if (retval) { + tcp_task->read_num_reg_mem = i; + ERROR_LOG("mempool is empty for %zd bytes\n", + tcp_task->read_reg_mem[i].length); + + task->status = ENOMEM; + goto cleanup; + } + sge_set_addr(sgtbl_ops, sg, + tcp_task->read_reg_mem[i].addr); + sge_set_length(sgtbl_ops, sg, + tcp_task->read_reg_mem[i].length); + sge_set_mr(sgtbl_ops, sg, + tcp_task->read_reg_mem[i].mr); + } + tcp_task->read_num_reg_mem = 
tcp_task->req_out_num_sge; + } + + sg = sge_first(sgtbl_ops, sgtbl); + for (i = 0; i < tcp_task->req_out_num_sge; i++) { + tcp_task->rxd.msg_iov[i + 1].iov_base = + sge_addr(sgtbl_ops, sg); + tcp_task->rxd.msg_iov[i + 1].iov_len = + tcp_task->req_out_sge[i].length; + sge_set_length(sgtbl_ops, sg, + tcp_task->req_out_sge[i].length); + sg = sge_next(sgtbl_ops, sgtbl, sg); + } + tcp_task->rxd.msg_len += tcp_task->req_out_num_sge; + + /* prepare the in side of the message */ + tcp_task->rxd.tot_iov_byte_len += rlen; + if (tcp_task->rxd.msg.msg_iovlen) + tcp_task->rxd.msg.msg_iov = tcp_task->rxd.msg_iov; + else + tcp_task->rxd.msg.msg_iov = &tcp_task->rxd.msg_iov[1]; + tcp_task->rxd.msg.msg_iovlen = tcp_task->rxd.msg_len; + + return 0; +cleanup: + for (i = 0; i < tcp_task->read_num_reg_mem; i++) + xio_mempool_free(&tcp_task->read_reg_mem[i]); + + tcp_task->read_num_reg_mem = 0; + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_on_recv_req_header */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_on_recv_req_header(struct xio_tcp_transport *tcp_hndl, + struct xio_task *task) +{ + XIO_TO_TCP_TASK(task, tcp_task); + int retval = 0; + struct xio_tcp_req_hdr req_hdr; + struct xio_msg *imsg; + void *ulp_hdr; + unsigned int i; + struct xio_sg_table_ops *sgtbl_ops; + void *sgtbl; + void *sg; + uint32_t inents = 0; + /* read header */ + retval = xio_tcp_read_req_header(tcp_hndl, task, &req_hdr); + if (retval != 0) { + xio_set_error(XIO_E_MSG_INVALID); + goto cleanup; + } + + /* save originator identifier */ + task->rtid = req_hdr.ltid; + task->imsg_flags = req_hdr.flags; + + imsg = &task->imsg; + sgtbl = xio_sg_table_get(&imsg->out); + sgtbl_ops = (struct xio_sg_table_ops *) + xio_sg_table_ops_get(imsg->out.sgl_type); + + ulp_hdr = xio_mbuf_get_curr_ptr(&task->mbuf); + + imsg->type = (enum xio_msg_type)task->tlv_type; + imsg->in.header.iov_len = req_hdr.ulp_hdr_len; + + clr_bits(XIO_MSG_HINT_ASSIGNED_DATA_IN_BUF, &imsg->hints); + + if (req_hdr.ulp_hdr_len){ + // 预分配内存 + imsg->in.header.iov_base = ulp_hdr; + } + else + imsg->in.header.iov_base = NULL; + /* hint upper layer about expected response */ + if (tcp_task->req_in_num_sge) { + tbl_set_nents(sgtbl_ops, sgtbl, tcp_task->req_in_num_sge); + for_each_sge(sgtbl, sgtbl_ops, sg, i) { + sge_set_addr(sgtbl_ops, sg, NULL); + sge_set_length(sgtbl_ops, sg, + tcp_task->req_in_sge[i].length); + sge_set_mr(sgtbl_ops, sg, NULL); + } + } else { + tbl_set_nents(sgtbl_ops, sgtbl, 0); + } + + tcp_task->out_tcp_op = (enum xio_tcp_op_code)req_hdr.out_tcp_op; + tcp_task->in_tcp_op = (enum xio_tcp_op_code)req_hdr.in_tcp_op; + switch (req_hdr.out_tcp_op) { + case XIO_TCP_SEND: //TCP模式下 + if (IS_APPLICATION_MSG(task->tlv_type)){ + /* we already got the header with the XIO management */ + imsg->in.total_data_len = req_hdr.ulp_imm_len; + if(xio_tcp_alloc_data_buf(tcp_hndl, task)){ + sgtbl = vmsg_base_sglist(&imsg->in); + inents = vmsg_sglist_nents(&imsg->in); + tcp_hndl->sock.ops->set_rxd_iov(task, sgtbl, inents, req_hdr.ulp_imm_len); + set_bits(XIO_MSG_HINT_USER_ALLOC_DATA_BUF, &imsg->hints); + break; + }else + tcp_hndl->sock.ops->set_rxd(task, ulp_hdr, + (uint32_t)req_hdr.ulp_imm_len); + } + else + // 协议消息,则内部分配内存处理 + tcp_hndl->sock.ops->set_rxd(task, ulp_hdr, + req_hdr.ulp_hdr_len + + req_hdr.ulp_pad_len + + (uint32_t)req_hdr.ulp_imm_len); + sgtbl = xio_sg_table_get(&imsg->in); + sgtbl_ops = (struct xio_sg_table_ops *) + 
xio_sg_table_ops_get(imsg->in.sgl_type); + if (req_hdr.ulp_imm_len) { + /* incoming data via SEND */ + /* if data arrived, set the pointers */ + tbl_set_nents(sgtbl_ops, sgtbl, 1); + sg = sge_first(sgtbl_ops, sgtbl); + + sge_set_addr(sgtbl_ops, sg, + sum_to_ptr(ulp_hdr, + imsg->in.header.iov_len + + req_hdr.ulp_pad_len)); + sge_set_length(sgtbl_ops, sg, + (size_t)req_hdr.ulp_imm_len); + } else { + /* no data at all */ + tbl_set_nents(sgtbl_ops, sgtbl, 0); + } + break; + case XIO_TCP_READ: + /* we already got the header with the XIO management */ + tcp_hndl->sock.ops->set_rxd(task, ulp_hdr, + (uint32_t)req_hdr.ulp_imm_len); + /* handle RDMA READ equivalent. */ + TRACE_LOG("tcp read header\n"); + retval = xio_tcp_rd_req_header(tcp_hndl, task); + if (unlikely(retval)) { + ERROR_LOG("tcp read header failed\n"); + goto cleanup; + } + break; + default: + ERROR_LOG("unexpected opcode\n"); + xio_set_error(XIO_E_MSG_INVALID); + task->status = XIO_E_MSG_INVALID; + break; + }; + + return 0; + +cleanup: + retval = xio_errno(); /* no need get_last_socket_error() */ + ERROR_LOG("xio_tcp_on_recv_req failed. (errno=%d %s)\n", retval, + xio_strerror(retval)); + xio_transport_notify_observer_error(&tcp_hndl->base, retval); + + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_on_recv_req_data */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_on_recv_req_data(struct xio_tcp_transport *tcp_hndl, + struct xio_task *task) +{ + XIO_TO_TCP_TASK(task, tcp_task); + union xio_transport_event_data event_data; + + switch (tcp_task->out_tcp_op) { + case XIO_TCP_SEND: + break; + case XIO_TCP_READ: + /* handle RDMA READ equivalent. */ + TRACE_LOG("tcp read data\n"); + break; + default: + ERROR_LOG("unexpected opcode\n"); + break; + }; + + /* fill notification event */ + event_data.msg.op = XIO_WC_OP_RECV; + event_data.msg.task = task; + + list_move_tail(&task->tasks_list_entry, &tcp_hndl->io_list); + + xio_transport_notify_observer(&tcp_hndl->base, + XIO_TRANSPORT_EVENT_NEW_MESSAGE, + &event_data); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_on_recv_rsp_header */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_on_recv_rsp_header(struct xio_tcp_transport *tcp_hndl, + struct xio_task *task) +{ + XIO_TO_TCP_TASK(task, tcp_task); + int retval = 0; + struct xio_tcp_rsp_hdr rsp_hdr; + struct xio_msg *imsg; + void *ulp_hdr; + struct xio_tcp_task *tcp_sender_task; + unsigned int i; + struct xio_sg_table_ops *isgtbl_ops; + void *isgtbl; + void *sg; + uint32_t inents = 0; + + /* read the response header */ + retval = xio_tcp_read_rsp_header(tcp_hndl, task, &rsp_hdr); + if (retval != 0) { + xio_set_error(XIO_E_MSG_INVALID); + goto cleanup; + } + /* read the sn */ + tcp_task->sn = rsp_hdr.sn; + + /* find the sender task */ + task->sender_task = + xio_tcp_primary_task_lookup(tcp_hndl, rsp_hdr.rtid); + task->rtid = rsp_hdr.ltid; + + tcp_sender_task = (struct xio_tcp_task *)task->sender_task->dd_data; + + /* mark the sender task as arrived */ + task->sender_task->state = XIO_TASK_STATE_RESPONSE_RECV; + + imsg = &task->imsg; + imsg->type = (enum xio_msg_type)task->tlv_type; + + isgtbl = xio_sg_table_get(&imsg->in); + isgtbl_ops = (struct xio_sg_table_ops *) + xio_sg_table_ops_get(imsg->in.sgl_type); + + clr_bits(XIO_MSG_HINT_ASSIGNED_DATA_IN_BUF, &imsg->hints); + + ulp_hdr = 
xio_mbuf_get_curr_ptr(&task->mbuf); + /* msg from received message */ + if (rsp_hdr.ulp_hdr_len) { + imsg->in.header.iov_base = ulp_hdr; + imsg->in.header.iov_len = rsp_hdr.ulp_hdr_len; + } else { + imsg->in.header.iov_base = NULL; + imsg->in.header.iov_len = 0; + } + task->status = rsp_hdr.status; + + tcp_task->out_tcp_op = (enum xio_tcp_op_code)rsp_hdr.out_tcp_op; + + switch (rsp_hdr.out_tcp_op) { + case XIO_TCP_SEND: + if (IS_APPLICATION_MSG(task->tlv_type)){ + /* we already got the header with the XIO management */ + imsg->in.total_data_len = rsp_hdr.ulp_imm_len; + imsg->user_context = task->sender_task->omsg->user_context; + if(xio_tcp_alloc_data_buf(tcp_hndl, task)){ + isgtbl = vmsg_base_sglist(&imsg->in); + inents = vmsg_sglist_nents(&imsg->in); + tcp_hndl->sock.ops->set_rxd_iov(task, isgtbl, inents, rsp_hdr.ulp_imm_len); + set_bits(XIO_MSG_HINT_USER_ALLOC_DATA_BUF, &imsg->hints); + break; + }else + tcp_hndl->sock.ops->set_rxd(task, ulp_hdr, + (uint32_t)rsp_hdr.ulp_imm_len); + }else + tcp_hndl->sock.ops->set_rxd(task, ulp_hdr, + rsp_hdr.ulp_hdr_len + rsp_hdr.ulp_pad_len + + (uint32_t)rsp_hdr.ulp_imm_len); + /* if data arrived, set the pointers */ + if (rsp_hdr.ulp_imm_len) { + tbl_set_nents(isgtbl_ops, isgtbl, 1); + sg = sge_first(isgtbl_ops, isgtbl); + sge_set_addr(isgtbl_ops, sg, + sum_to_ptr(ulp_hdr, + imsg->in.header.iov_len + + rsp_hdr.ulp_pad_len)); + sge_set_length(isgtbl_ops, sg, + (size_t)rsp_hdr.ulp_imm_len); + } else { + tbl_set_nents(isgtbl_ops, isgtbl, 0); + } + break; + case XIO_TCP_WRITE: + /* the data size is set later this is only for the header size */ + tcp_hndl->sock.ops->set_rxd(task->sender_task, ulp_hdr, 0); + if (tcp_task->rsp_out_num_sge > + tcp_sender_task->read_num_reg_mem) { + ERROR_LOG("local in data_iovec is too small %d < %d\n", + tcp_sender_task->read_num_reg_mem, + tcp_task->rsp_out_num_sge); + goto partial_msg; + } + + tbl_set_nents(isgtbl_ops, isgtbl, + tcp_task->rsp_out_num_sge); + sg = sge_first(isgtbl_ops, isgtbl); + for (i = 0; i < tcp_task->rsp_out_num_sge; i++) { + sge_set_addr(isgtbl_ops, sg, + tcp_sender_task->read_reg_mem[i].addr); + sge_set_length(isgtbl_ops, sg, + tcp_task->rsp_out_sge[i].length); + tcp_sender_task->rxd.msg_iov[i + 1].iov_base = + tcp_sender_task->read_reg_mem[i].addr; + tcp_sender_task->rxd.msg_iov[i + 1].iov_len = + tcp_task->rsp_out_sge[i].length; + sg = sge_next(isgtbl_ops, isgtbl, sg); + } + + tcp_sender_task->rxd.msg_len += + tcp_task->rsp_out_num_sge; + tcp_sender_task->rxd.tot_iov_byte_len += + rsp_hdr.ulp_imm_len; + if (tcp_sender_task->rxd.msg.msg_iovlen) + tcp_sender_task->rxd.msg.msg_iov = + tcp_sender_task->rxd.msg_iov; + else + tcp_sender_task->rxd.msg.msg_iov = + &tcp_sender_task->rxd.msg_iov[1]; + tcp_sender_task->rxd.msg.msg_iovlen = + tcp_sender_task->rxd.msg_len; + break; + default: + ERROR_LOG("unexpected opcode %d\n", rsp_hdr.out_tcp_op); + break; + } + +partial_msg: + return 0; + +cleanup: + retval = xio_errno(); + ERROR_LOG("xio_tcp_on_recv_rsp failed. 
(errno=%d %s)\n", + retval, xio_strerror(retval)); + xio_transport_notify_observer_error(&tcp_hndl->base, retval); + + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_on_recv_rsp_data */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_on_recv_rsp_data(struct xio_tcp_transport *tcp_hndl, + struct xio_task *task) +{ + XIO_TO_TCP_TASK(task, tcp_task); + union xio_transport_event_data event_data; + struct xio_msg *imsg; + struct xio_msg *omsg; + unsigned int i; + struct xio_tcp_task *tcp_sender_task; + struct xio_sg_table_ops *isgtbl_ops; + void *isgtbl; + struct xio_sg_table_ops *osgtbl_ops; + void *osgtbl; + void *sg; + + omsg = task->sender_task->omsg; + imsg = &task->imsg; + isgtbl = xio_sg_table_get(&imsg->in); + isgtbl_ops = (struct xio_sg_table_ops *) + xio_sg_table_ops_get(imsg->in.sgl_type); + osgtbl = xio_sg_table_get(&omsg->in); + osgtbl_ops = (struct xio_sg_table_ops *) + xio_sg_table_ops_get(omsg->in.sgl_type); + + if (test_bits(XIO_MSG_HINT_USER_ALLOC_DATA_BUF, &imsg->hints)){ + omsg->in = imsg->in; + tbl_set_nents(isgtbl_ops, isgtbl, 0); + sge_set_addr(isgtbl_ops, isgtbl, NULL); + goto partial_msg; + } + /* handle the headers */ + if (omsg->in.header.iov_base) { + /* copy header to user buffers */ + size_t hdr_len = 0; + + if (imsg->in.header.iov_len > omsg->in.header.iov_len) { + hdr_len = omsg->in.header.iov_len; + task->status = XIO_E_MSG_SIZE; + } else { + hdr_len = imsg->in.header.iov_len; + task->status = XIO_E_SUCCESS; + } + if (hdr_len) + memcpy(omsg->in.header.iov_base, + imsg->in.header.iov_base, + hdr_len); + else + *((char *)omsg->in.header.iov_base) = 0; + + omsg->in.header.iov_len = hdr_len; + } else { + /* no copy - just pointers */ + memclonev(&omsg->in.header, 1, &imsg->in.header, 1); + } + + switch (tcp_task->out_tcp_op) { + case XIO_TCP_SEND: + if (tbl_nents(osgtbl_ops, osgtbl)) { + /* deep copy */ + if (tbl_nents(isgtbl_ops, isgtbl)) { + size_t idata_len = + tbl_length(isgtbl_ops, isgtbl); + size_t odata_len = + tbl_length(osgtbl_ops, osgtbl); + if (idata_len > odata_len) { + task->status = XIO_E_MSG_SIZE; + goto partial_msg; + } else { + task->status = XIO_E_SUCCESS; + } + sg = sge_first(osgtbl_ops, osgtbl); + if (sge_addr(osgtbl_ops, sg)) { + /* user provided buffer so do copy */ + tbl_copy(osgtbl_ops, osgtbl, + isgtbl_ops, isgtbl); + } else { + /* use provided only length - set user + * pointers */ + tbl_clone(osgtbl_ops, osgtbl, + isgtbl_ops, isgtbl); + } + } else { + tbl_set_nents(osgtbl_ops, osgtbl, + tbl_nents(isgtbl_ops, isgtbl)); + } + } else { + tbl_clone(osgtbl_ops, osgtbl, + isgtbl_ops, isgtbl); + } + break; + case XIO_TCP_WRITE: + /* user provided mr */ + sg = sge_first(osgtbl_ops, osgtbl); + if (sge_addr(osgtbl_ops, sg) && + (sge_mr(osgtbl_ops, sg) || !tcp_options.enable_mr_check)) { + void *isg; + /* data was copied directly to user buffer */ + /* need to update the buffer length */ + for_each_sge(isgtbl, isgtbl_ops, isg, i) { + sge_set_length(osgtbl_ops, sg, + sge_length(isgtbl_ops, isg)); + sg = sge_next(osgtbl_ops, osgtbl, sg); + } + tbl_set_nents(osgtbl_ops, osgtbl, + tbl_nents(isgtbl_ops, isgtbl)); + } else { + /* user provided buffer but not mr */ + /* deep copy */ + if (sge_addr(osgtbl_ops, sg)) { + tbl_copy(osgtbl_ops, osgtbl, + isgtbl_ops, isgtbl); + tcp_sender_task = (struct xio_tcp_task *) + task->sender_task->dd_data; + /* put buffers back to pool */ + for (i = 0; i < tcp_sender_task->read_num_reg_mem; + i++) { + 
xio_mempool_free( + &tcp_sender_task->read_reg_mem[i]); + tcp_sender_task->read_reg_mem[i].priv = + NULL; + } + tcp_sender_task->read_num_reg_mem = 0; + } else { + /* use provided only length - set user + * pointers */ + tbl_clone(osgtbl_ops, osgtbl, + isgtbl_ops, isgtbl); + } + } + break; + default: + ERROR_LOG("unexpected opcode %d\n", tcp_task->out_tcp_op); + break; + } + +partial_msg: + + /* fill notification event */ + event_data.msg.op = XIO_WC_OP_RECV; + event_data.msg.task = task; + + list_move_tail(&task->tasks_list_entry, &tcp_hndl->io_list); + + /* notify the upper layer of received message */ + xio_transport_notify_observer(&tcp_hndl->base, + XIO_TRANSPORT_EVENT_NEW_MESSAGE, + &event_data); + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_cancel_req_handler */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_cancel_req_handler(struct xio_tcp_transport *tcp_hndl, + void *ulp_msg, size_t ulp_msg_sz) +{ + union xio_transport_event_data event_data; + + /* fill notification event */ + event_data.cancel.ulp_msg = ulp_msg; + event_data.cancel.ulp_msg_sz = ulp_msg_sz; + event_data.cancel.task = NULL; + event_data.cancel.result = (enum xio_status)0; + + xio_transport_notify_observer(&tcp_hndl->base, + XIO_TRANSPORT_EVENT_CANCEL_REQUEST, + &event_data); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_cancel_rsp_handler */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_cancel_rsp_handler(struct xio_tcp_transport *tcp_hndl, + struct xio_tcp_cancel_hdr *cancel_hdr, + void *ulp_msg, size_t ulp_msg_sz) +{ + union xio_transport_event_data event_data; + struct xio_task *ptask, *next_ptask; + struct xio_tcp_task *tcp_task; + struct xio_task *task_to_cancel = NULL; + + if ((cancel_hdr->result == XIO_E_MSG_CANCELED) || + (cancel_hdr->result == XIO_E_MSG_CANCEL_FAILED)) { + /* look in the in_flight */ + list_for_each_entry_safe(ptask, next_ptask, + &tcp_hndl->in_flight_list, + tasks_list_entry) { + tcp_task = (struct xio_tcp_task *)ptask->dd_data; + if (tcp_task->sn == cancel_hdr->sn) { + task_to_cancel = ptask; + break; + } + } + if (!task_to_cancel) { + /* look in the tx_comp */ + list_for_each_entry_safe(ptask, next_ptask, + &tcp_hndl->tx_comp_list, + tasks_list_entry) { + tcp_task = (struct xio_tcp_task *) + ptask->dd_data; + if (tcp_task->sn == cancel_hdr->sn) { + task_to_cancel = ptask; + break; + } + } + } + + if (!task_to_cancel) { + ERROR_LOG("[%u] - Failed to found canceled message\n", + cancel_hdr->sn); + /* fill notification event */ + event_data.cancel.ulp_msg = ulp_msg; + event_data.cancel.ulp_msg_sz = ulp_msg_sz; + event_data.cancel.task = NULL; + event_data.cancel.result = XIO_E_MSG_NOT_FOUND; + + xio_transport_notify_observer( + &tcp_hndl->base, + XIO_TRANSPORT_EVENT_CANCEL_RESPONSE, + &event_data); + return 0; + } + } + + /* fill notification event */ + event_data.cancel.ulp_msg = ulp_msg; + event_data.cancel.ulp_msg_sz = ulp_msg_sz; + event_data.cancel.task = task_to_cancel; + event_data.cancel.result = (enum xio_status)cancel_hdr->result; + + xio_transport_notify_observer(&tcp_hndl->base, + XIO_TRANSPORT_EVENT_CANCEL_RESPONSE, + &event_data); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_on_recv_cancel_rsp_data */ 
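+/* (unpacks the cancel header carried in imsg->in.header and hands the      */
+/*  embedded user message to xio_tcp_cancel_rsp_handler)                    */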
+/*---------------------------------------------------------------------------*/ +static int xio_tcp_on_recv_cancel_rsp_data(struct xio_tcp_transport *tcp_hndl, + struct xio_task *task) +{ + struct xio_msg *imsg; + void *buff; + uint16_t ulp_msg_sz; + struct xio_tcp_cancel_hdr cancel_hdr; + struct xio_sg_table_ops *sgtbl_ops; + void *sgtbl; + void *sg; + + imsg = &task->imsg; + sgtbl = xio_sg_table_get(&task->imsg.in); + sgtbl_ops = (struct xio_sg_table_ops *) + xio_sg_table_ops_get(task->imsg.in.sgl_type); + sg = sge_first(sgtbl_ops, sgtbl); + sge_set_addr(sgtbl_ops, sg, NULL); + tbl_set_nents(sgtbl_ops, sgtbl, 0); + + buff = imsg->in.header.iov_base; + inc_ptr(buff, xio_read_uint16(&cancel_hdr.hdr_len, 0, + (const uint8_t *)buff)); + inc_ptr(buff, xio_read_uint16(&cancel_hdr.sn, 0, + (const uint8_t *)buff)); + inc_ptr(buff, xio_read_uint32(&cancel_hdr.result, 0, + (const uint8_t *)buff)); + inc_ptr(buff, xio_read_uint16(&ulp_msg_sz, 0, + (const uint8_t *)buff)); + + list_move_tail(&task->tasks_list_entry, &tcp_hndl->io_list); + + xio_tcp_cancel_rsp_handler(tcp_hndl, &cancel_hdr, + buff, ulp_msg_sz); + /* return the the cancel response task to pool */ + xio_tasks_pool_put(task); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_on_recv_cancel_rsp_header */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_on_recv_cancel_rsp_header( + struct xio_tcp_transport *tcp_hndl, + struct xio_task *task) +{ + XIO_TO_TCP_TASK(task, tcp_task); + struct xio_tcp_rsp_hdr rsp_hdr; + struct xio_msg *imsg; + void *ulp_hdr; + int retval = 0; + + /* read the response header */ + retval = xio_tcp_read_rsp_header(tcp_hndl, task, &rsp_hdr); + if (retval != 0) { + xio_set_error(XIO_E_MSG_INVALID); + return -1; + } + + /* read the sn */ + tcp_task->sn = rsp_hdr.sn; + + imsg = &task->imsg; + ulp_hdr = xio_mbuf_get_curr_ptr(&task->mbuf); + + imsg->type = (enum xio_msg_type)task->tlv_type; + imsg->in.header.iov_len = rsp_hdr.ulp_hdr_len; + imsg->in.header.iov_base = ulp_hdr; + + tcp_hndl->sock.ops->set_rxd(task, ulp_hdr, rsp_hdr.ulp_hdr_len); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_on_recv_cancel_req_data */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_on_recv_cancel_req_data(struct xio_tcp_transport *tcp_hndl, + struct xio_task *task) +{ + struct xio_tcp_cancel_hdr cancel_hdr; + struct xio_msg *imsg; + void *buff; + uint16_t ulp_msg_sz; + struct xio_sg_table_ops *sgtbl_ops; + void *sgtbl; + void *sg; + + imsg = &task->imsg; + sgtbl = xio_sg_table_get(&task->imsg.in); + sgtbl_ops = (struct xio_sg_table_ops *) + xio_sg_table_ops_get(task->imsg.in.sgl_type); + sg = sge_first(sgtbl_ops, sgtbl); + sge_set_addr(sgtbl_ops, sg, NULL); + tbl_set_nents(sgtbl_ops, sgtbl, 0); + + buff = imsg->in.header.iov_base; + inc_ptr(buff, xio_read_uint16(&cancel_hdr.hdr_len, 0, + (const uint8_t *)buff)); + inc_ptr(buff, xio_read_uint16(&cancel_hdr.sn, 0, + (const uint8_t *)buff)); + inc_ptr(buff, xio_read_uint32(&cancel_hdr.result, 0, + (const uint8_t *)buff)); + inc_ptr(buff, xio_read_uint16(&ulp_msg_sz, 0, + (const uint8_t *)buff)); + + list_move_tail(&task->tasks_list_entry, &tcp_hndl->io_list); + + xio_tcp_cancel_req_handler(tcp_hndl, buff, ulp_msg_sz); + /* return the the cancel request task to pool */ + xio_tasks_pool_put(task); + + return 0; +} + 
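+/*---------------------------------------------------------------------------*/
+/* xio_tcp_unpack_cancel_hdr - illustrative sketch, not referenced	     */
+/*---------------------------------------------------------------------------*/
+/*
+ * Both cancel data paths above decode the same on-wire layout from the
+ * received ULP header: hdr_len, sn, result and ulp_msg_sz, followed by
+ * the user's cancel payload.  The helper below is a minimal sketch of
+ * that shared unpacking, assuming - as the callers above do - that the
+ * xio_read_* helpers return the number of bytes consumed.  The name
+ * xio_tcp_unpack_cancel_hdr is introduced here only for illustration
+ * and is not used by the surrounding code.
+ *
+ * Usage sketch:
+ *	buff = xio_tcp_unpack_cancel_hdr(imsg->in.header.iov_base,
+ *					 &cancel_hdr, &ulp_msg_sz);
+ */
+static inline void *xio_tcp_unpack_cancel_hdr(void *buff,
+				struct xio_tcp_cancel_hdr *cancel_hdr,
+				uint16_t *ulp_msg_sz)
+{
+	/* fixed-size fields, in wire order */
+	inc_ptr(buff, xio_read_uint16(&cancel_hdr->hdr_len, 0,
+				      (const uint8_t *)buff));
+	inc_ptr(buff, xio_read_uint16(&cancel_hdr->sn, 0,
+				      (const uint8_t *)buff));
+	inc_ptr(buff, xio_read_uint32(&cancel_hdr->result, 0,
+				      (const uint8_t *)buff));
+	inc_ptr(buff, xio_read_uint16(ulp_msg_sz, 0,
+				      (const uint8_t *)buff));
+
+	/* buff now points at the ulp_msg payload of *ulp_msg_sz bytes */
+	return buff;
+}
+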
+/*---------------------------------------------------------------------------*/ +/* xio_tcp_on_recv_cancel_req_header */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_on_recv_cancel_req_header( + struct xio_tcp_transport *tcp_hndl, + struct xio_task *task) +{ + XIO_TO_TCP_TASK(task, tcp_task); + struct xio_tcp_req_hdr req_hdr; + struct xio_msg *imsg; + void *ulp_hdr; + int retval = 0; + + /* read header */ + retval = xio_tcp_read_req_header(tcp_hndl, task, &req_hdr); + if (retval != 0) { + xio_set_error(XIO_E_MSG_INVALID); + goto cleanup; + } + + /* read the sn */ + tcp_task->sn = req_hdr.sn; + + imsg = &task->imsg; + ulp_hdr = xio_mbuf_get_curr_ptr(&task->mbuf); + + /* set header pointers */ + imsg->type = (enum xio_msg_type)task->tlv_type; + imsg->in.header.iov_len = req_hdr.ulp_hdr_len; + imsg->in.header.iov_base = ulp_hdr; + + tcp_hndl->sock.ops->set_rxd(task, ulp_hdr, req_hdr.ulp_hdr_len); + + return 0; + +cleanup: + retval = xio_errno(); /* no need get_last_socket_error() */ + ERROR_LOG("recv_cancel_req_header failed. (errno=%d %s)\n", retval, + xio_strerror(retval)); + xio_transport_notify_observer_error(&tcp_hndl->base, retval); + + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_send_cancel */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_send_cancel(struct xio_tcp_transport *tcp_hndl, + uint32_t tlv_type, + struct xio_tcp_cancel_hdr *cancel_hdr, + void *ulp_msg, size_t ulp_msg_sz) +{ + uint64_t tlv_len; + uint16_t ulp_hdr_len; + int retval; + struct xio_task *task; + struct xio_tcp_task *tcp_task; + void *buff; + + task = xio_tcp_primary_task_alloc(tcp_hndl); + if (!task) { + ERROR_LOG("primary tasks pool is empty\n"); + return -1; + } + xio_mbuf_reset(&task->mbuf); + + /* set start of the tlv */ + if (xio_mbuf_tlv_start(&task->mbuf) != 0) + return -1; + + task->tlv_type = tlv_type; + tcp_task = (struct xio_tcp_task *)task->dd_data; + tcp_task->out_tcp_op = XIO_TCP_SEND; + tcp_task->write_num_reg_mem = 0; + tcp_task->read_num_reg_mem = 0; + + ulp_hdr_len = sizeof(*cancel_hdr) + sizeof(uint16_t) + ulp_msg_sz; + tcp_hndl->dummy_msg.out.header.iov_base = ucalloc(1, ulp_hdr_len); + tcp_hndl->dummy_msg.out.header.iov_len = ulp_hdr_len; + + /* write the message */ + /* get the pointer */ + buff = tcp_hndl->dummy_msg.out.header.iov_base; + + /* pack relevant values */ + inc_ptr(buff, xio_write_uint16(cancel_hdr->hdr_len, 0, + (uint8_t *)buff)); + inc_ptr(buff, xio_write_uint16(cancel_hdr->sn, 0, + (uint8_t *)buff)); + inc_ptr(buff, xio_write_uint32(cancel_hdr->result, 0, + (uint8_t *)buff)); + inc_ptr(buff, xio_write_uint16((uint16_t)(ulp_msg_sz), 0, + (uint8_t *)buff)); + inc_ptr(buff, xio_write_array((const uint8_t *)ulp_msg, ulp_msg_sz, 0, + (uint8_t *)buff)); + + task->omsg = &tcp_hndl->dummy_msg; + + /* write xio header to the buffer */ + if (IS_REQUEST(task->tlv_type)) { + retval = xio_tcp_prep_req_header( + tcp_hndl, task, + ulp_hdr_len, 0, 0, + XIO_E_SUCCESS); + } else { + retval = xio_tcp_prep_rsp_header( + tcp_hndl, task, + ulp_hdr_len, 0, 0, + XIO_E_SUCCESS); + } + + if (retval) + return -1; + + /* set the length */ + tcp_task->txd.msg_len = 1; + tcp_task->txd.tot_iov_byte_len = 0; + + tlv_len = tcp_hndl->sock.ops->set_txd(task); + + /* add tlv */ + if (xio_mbuf_write_tlv(&task->mbuf, task->tlv_type, (uint16_t)tlv_len) + != 0) + return -1; + + task->omsg = NULL; + 
free(tcp_hndl->dummy_msg.out.header.iov_base); + + tcp_hndl->tx_ready_tasks_num++; + list_move_tail(&task->tasks_list_entry, &tcp_hndl->tx_ready_list); + + xio_tcp_xmit(tcp_hndl); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_send */ +/*---------------------------------------------------------------------------*/ +int xio_tcp_send(struct xio_transport_base *transport, + struct xio_task *task) +{ + struct xio_tcp_transport *tcp_hndl = + (struct xio_tcp_transport *)transport; + int retval = -1; + + switch (task->tlv_type) { + case XIO_NEXUS_SETUP_REQ: + retval = xio_tcp_send_setup_req(tcp_hndl, task); + break; + case XIO_NEXUS_SETUP_RSP: + retval = xio_tcp_send_setup_rsp(tcp_hndl, task); + break; + default: + if (IS_REQUEST(task->tlv_type)) + retval = xio_tcp_send_req(tcp_hndl, task); + else if (IS_RESPONSE(task->tlv_type)) + retval = xio_tcp_send_rsp(tcp_hndl, task); + else + ERROR_LOG("unknown message type:0x%x\n", + task->tlv_type); + break; + } + + return retval; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_get_data_rxd */ +/*---------------------------------------------------------------------------*/ +struct xio_tcp_work_req *xio_tcp_get_data_rxd(struct xio_task *task) +{ + XIO_TO_TCP_TASK(task, tcp_task); + struct xio_tcp_task *tcp_sender_task; + + switch (tcp_task->out_tcp_op) { + case XIO_TCP_SEND: + case XIO_TCP_READ: + return &tcp_task->rxd; + case XIO_TCP_WRITE: + tcp_sender_task = (struct xio_tcp_task *) + task->sender_task->dd_data; + return &tcp_sender_task->rxd; + default: + ERROR_LOG("unexpected opcode %d\n", tcp_task->out_tcp_op); + break; + } + + return NULL; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_rx_data_handler */ +/*---------------------------------------------------------------------------*/ +int xio_tcp_rx_data_handler(struct xio_tcp_transport *tcp_hndl, int batch_nr) +{ + int retval = 0, recvmsg_retval = 0; + struct xio_tcp_task *tcp_task, *next_tcp_task; + struct xio_task *task, *next_task/*, *task1 = NULL, *task2*/; + unsigned int i, last_in_rxq; + int batch_count = 0, tmp_count = 0, ret_count = 0; + unsigned int iov_len; + uint64_t bytes_recv; + struct xio_tcp_work_req *rxd_work, *next_rxd_work; + + task = list_first_entry_or_null(&tcp_hndl->rx_list, + struct xio_task, + tasks_list_entry); + + while (task && batch_count < batch_nr) { + tcp_task = (struct xio_tcp_task *)task->dd_data; + + if (tcp_task->rxd.stage != XIO_TCP_RX_IO_DATA) + break; + + next_task = list_first_entry_or_null( + &task->tasks_list_entry, + struct xio_task, tasks_list_entry); + next_tcp_task = next_task ? (struct xio_tcp_task *) + next_task->dd_data : NULL; + next_rxd_work = (next_tcp_task && + next_tcp_task->rxd.stage == XIO_TCP_RX_IO_DATA) + ? xio_tcp_get_data_rxd(next_task) : NULL; + + /* An Accelio application runs on Side A would crush, + * when it connects Side B by a port binded by an + * application (not accelio) run on Side B. + */ + rxd_work = xio_tcp_get_data_rxd(task); + if (!rxd_work) { + ERROR_LOG("rxd_work is NULL! 
Disconnect!\n"); + xio_tcp_disconnect_helper(tcp_hndl); + return -1; + } + + for (i = 0; i < rxd_work->msg.msg_iovlen; i++) { + tcp_hndl->tmp_work.msg_iov + [tcp_hndl->tmp_work.msg_len].iov_base = + rxd_work->msg.msg_iov[i].iov_base; + tcp_hndl->tmp_work.msg_iov + [tcp_hndl->tmp_work.msg_len].iov_len = + rxd_work->msg.msg_iov[i].iov_len; + ++tcp_hndl->tmp_work.msg_len; + } + tcp_hndl->tmp_work.tot_iov_byte_len += + rxd_work->tot_iov_byte_len; + + ++batch_count; + ++tmp_count; + + if (batch_count != batch_nr && next_rxd_work && + (next_rxd_work->msg.msg_iovlen + tcp_hndl->tmp_work.msg_len) + < IOV_MAX) { + task = next_task; + continue; + } + + tcp_hndl->tmp_work.msg.msg_iov = tcp_hndl->tmp_work.msg_iov; + tcp_hndl->tmp_work.msg.msg_iovlen = tcp_hndl->tmp_work.msg_len; + + bytes_recv = tcp_hndl->tmp_work.tot_iov_byte_len; + recvmsg_retval = xio_tcp_recvmsg_work(tcp_hndl, + tcp_hndl->sock.dfd, + &tcp_hndl->tmp_work, 0); + bytes_recv -= tcp_hndl->tmp_work.tot_iov_byte_len; + + task = list_first_entry(&tcp_hndl->rx_list, + struct xio_task, tasks_list_entry); + iov_len = tcp_hndl->tmp_work.msg_len - + tcp_hndl->tmp_work.msg.msg_iovlen; + for (i = 0; i < (unsigned int)tmp_count; i++) { + tcp_task = (struct xio_tcp_task *)task->dd_data; + rxd_work = xio_tcp_get_data_rxd(task); + if (!rxd_work){ + ERROR_LOG("rxd_work is NULL! Disconnect!\n"); + break; + } + if (rxd_work->msg.msg_iovlen > iov_len) + break; + + iov_len -= rxd_work->msg.msg_iovlen; + bytes_recv -= rxd_work->tot_iov_byte_len; + + task = list_first_entry(&task->tasks_list_entry, + struct xio_task, + tasks_list_entry); + } + if (tcp_hndl->tmp_work.msg.msg_iovlen) { + tcp_task = (struct xio_tcp_task *)task->dd_data; + rxd_work = xio_tcp_get_data_rxd(task); + if (!rxd_work){ + ERROR_LOG("rxd_work is NULL! 
Disconnect!\n"); + continue; + } + rxd_work->msg.msg_iov = &rxd_work->msg.msg_iov[iov_len]; + rxd_work->msg.msg_iov[0].iov_base = + tcp_hndl->tmp_work.msg.msg_iov[0].iov_base; + rxd_work->msg.msg_iov[0].iov_len = + tcp_hndl->tmp_work.msg.msg_iov[0].iov_len; + rxd_work->msg.msg_iovlen -= iov_len; + rxd_work->tot_iov_byte_len -= bytes_recv; + } + + tcp_hndl->tmp_work.msg_len = 0; + tcp_hndl->tmp_work.tot_iov_byte_len = 0; + + /* look for the maximum last in rxq index */ + tmp_count = 0; + last_in_rxq = 0; + list_for_each_entry(task, &tcp_hndl->rx_list, tasks_list_entry) { + if (IS_APPLICATION_MSG(task->tlv_type)) + last_in_rxq = (int)tmp_count; + if (++tmp_count == (int)i) + break; + } + tmp_count = 0; + + task = list_first_entry(&tcp_hndl->rx_list, struct xio_task, + tasks_list_entry); + while (i--) { + task->last_in_rxq = (ret_count == (int)last_in_rxq); + ++ret_count; + tcp_task = (struct xio_tcp_task *)task->dd_data; + switch (task->tlv_type) { + case XIO_CANCEL_REQ: + xio_tcp_on_recv_cancel_req_data(tcp_hndl, task); + break; + case XIO_CANCEL_RSP: + xio_tcp_on_recv_cancel_rsp_data(tcp_hndl, task); + break; + default: + if (IS_REQUEST(task->tlv_type)) { + retval = + xio_tcp_on_recv_req_data(tcp_hndl, + task); + } else if (IS_RESPONSE(task->tlv_type)) { + retval = + xio_tcp_on_recv_rsp_data(tcp_hndl, + task); + } else { + ERROR_LOG("unknown message type:0x%x\n", + task->tlv_type); + } + if (retval < 0) + return retval; + } + + task = list_first_entry(&tcp_hndl->rx_list, + struct xio_task, + tasks_list_entry); + } + + if (recvmsg_retval == 0) { + DEBUG_LOG("tcp transport got EOF, tcp_hndl=%p\n", + tcp_hndl); + if (tcp_task->out_tcp_op == XIO_TCP_READ) { /*TODO needed?*/ + for (i = 0; i < tcp_task->read_num_reg_mem; i++) { + xio_mempool_free( + &tcp_task->read_reg_mem[i]); + } + tcp_task->read_num_reg_mem = 0; + } + xio_tcp_disconnect_helper(tcp_hndl); + return -1; + } else if (recvmsg_retval < 0) { + break; + } + + task = list_first_entry_or_null(&tcp_hndl->rx_list, + struct xio_task, + tasks_list_entry); + } + + if (tcp_hndl->tx_ready_tasks_num) { + retval = xio_tcp_xmit(tcp_hndl); + if (retval < 0) { + if (xio_errno() != XIO_EAGAIN) { + ERROR_LOG("xio_tcp_xmit failed\n"); + return -1; + } + return ret_count; + } + } + + return ret_count; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_rx_ctl_handler */ +/*---------------------------------------------------------------------------*/ +int xio_tcp_rx_ctl_handler(struct xio_tcp_transport *tcp_hndl, int batch_nr) +{ + int retval = 0; + struct xio_tcp_task *tcp_task; + struct xio_task *task, *task_next; + int exit; + int count; + + task = list_first_entry_or_null(&tcp_hndl->rx_list, + struct xio_task, + tasks_list_entry); + + count = 0; + exit = 0; + while (task && (&task->tasks_list_entry != &tcp_hndl->rx_list) && + (count < batch_nr) && !exit) { + tcp_task = (struct xio_tcp_task *)task->dd_data; + + switch (tcp_task->rxd.stage) { + case XIO_TCP_RX_START: + /* ORK todo find a better place to rearm rx_list?*/ + if (tcp_hndl->state == + XIO_TRANSPORT_STATE_CONNECTED || + tcp_hndl->state == + XIO_TRANSPORT_STATE_DISCONNECTED) { + task_next = + xio_tcp_primary_task_alloc(tcp_hndl); + if (!task_next) { + ERROR_LOG( + "primary task pool is empty\n"); + exit = 1; + continue; + } else { + list_add_tail( + &task_next->tasks_list_entry, + &tcp_hndl->rx_list); + } + } + tcp_task->rxd.tot_iov_byte_len = sizeof(struct xio_tlv); + tcp_task->rxd.msg.msg_iov = tcp_task->rxd.msg_iov; + 
tcp_task->rxd.msg.msg_iovlen = 1; + tcp_task->rxd.stage = XIO_TCP_RX_TLV; + /*fallthrough*/ + case XIO_TCP_RX_TLV: + retval = tcp_hndl->sock.ops->rx_ctl_work( + tcp_hndl, + tcp_hndl->sock.cfd, + &tcp_task->rxd, 0); + if (retval == 0) { + DEBUG_LOG("tcp transport got EOF,tcp_hndl=%p\n", + tcp_hndl); + if (count) { + exit = 1; + break; + } + xio_tcp_disconnect_helper(tcp_hndl); + return -1; + } else if (retval < 0) { + exit = 1; + break; + } + retval = xio_mbuf_read_first_tlv(&task->mbuf); + if (retval < 0) { + exit = 1; + break; + } + tcp_task->rxd.msg.msg_iov[0].iov_base = + tcp_task->rxd.msg_iov[1].iov_base; + tcp_task->rxd.msg.msg_iov[0].iov_len = + (size_t)task->mbuf.tlv.len; + tcp_task->rxd.msg.msg_iovlen = 1; + tcp_task->rxd.tot_iov_byte_len = task->mbuf.tlv.len; + tcp_task->rxd.stage = XIO_TCP_RX_HEADER; + /*fallthrough*/ + case XIO_TCP_RX_HEADER: + retval = tcp_hndl->sock.ops->rx_ctl_work( + tcp_hndl, + tcp_hndl->sock.cfd, + &tcp_task->rxd, 0); + if (retval == 0) { + DEBUG_LOG("tcp transport got EOF,tcp_hndl=%p\n", + tcp_hndl); + if (count) { + exit = 1; + break; + } + xio_tcp_disconnect_helper(tcp_hndl); + return -1; + } else if (retval < 0) { + exit = 1; + break; + } + task->tlv_type = xio_mbuf_tlv_type(&task->mbuf); + /* call recv completion */ + switch (task->tlv_type) { + case XIO_NEXUS_SETUP_REQ: + case XIO_NEXUS_SETUP_RSP: + xio_tcp_on_setup_msg(tcp_hndl, task); + return 1; + case XIO_CANCEL_REQ: + xio_tcp_on_recv_cancel_req_header(tcp_hndl, + task); + break; + case XIO_CANCEL_RSP: + xio_tcp_on_recv_cancel_rsp_header(tcp_hndl, + task); + break; + default: + if (IS_REQUEST(task->tlv_type)) + retval = + xio_tcp_on_recv_req_header(tcp_hndl, + task); + else if (IS_RESPONSE(task->tlv_type)) + retval = + xio_tcp_on_recv_rsp_header(tcp_hndl, + task); + else + ERROR_LOG("unknown message type:0x%x\n", + task->tlv_type); + if (unlikely(retval < 0)) { + ERROR_LOG("error reading header\n"); + return retval; + } + } + tcp_task->rxd.stage = XIO_TCP_RX_IO_DATA; + /*fallthrough*/ + case XIO_TCP_RX_IO_DATA: + ++count; + break; + default: + ERROR_LOG("unknown stage type:%d\n", + tcp_task->rxd.stage); + break; + } + task = list_first_entry(&task->tasks_list_entry, + struct xio_task, tasks_list_entry); + } + + if (count == 0) + return 0; + + retval = tcp_hndl->sock.ops->rx_data_handler(tcp_hndl, batch_nr); + if (unlikely(retval < 0)) + return retval; + count = retval; + + if (tcp_hndl->tx_ready_tasks_num) { + retval = xio_tcp_xmit(tcp_hndl); + if (retval < 0) { + if (xio_errno() != XIO_EAGAIN) { + ERROR_LOG("xio_tcp_xmit failed\n"); + return -1; + } + return count; + } + } + + return count; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_poll */ +/*---------------------------------------------------------------------------*/ +int xio_tcp_poll(struct xio_transport_base *transport, + long min_nr, long max_nr, + struct timespec *ts_timeout) +{ + struct xio_tcp_transport *tcp_hndl; + int nr_comp = 0, recv_counter; + cycles_t timeout = -1; + cycles_t start_time = get_cycles(); + + if (min_nr > max_nr) + return -1; + + if (ts_timeout) + timeout = (cycles_t)(timespec_to_usecs(ts_timeout) * g_mhz); + + tcp_hndl = (struct xio_tcp_transport *)transport; + + if (tcp_hndl->state != XIO_TRANSPORT_STATE_CONNECTED) { + ERROR_LOG("tcp transport is not connected, state=%d\n", + tcp_hndl->state); + return -1; + } + + while (1) { + /* ORK todo blocking recv with timeout?*/ + recv_counter = tcp_hndl->sock.ops->rx_ctl_handler(tcp_hndl); + if (recv_counter < 0 && 
xio_errno() != XIO_EAGAIN) + break; + + nr_comp += recv_counter; + max_nr -= recv_counter; + if (nr_comp >= min_nr || max_nr <= 0) + break; + if ((get_cycles() - start_time) >= timeout) + break; + } + + return nr_comp; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_cancel_req */ +/*---------------------------------------------------------------------------*/ +int xio_tcp_cancel_req(struct xio_transport_base *transport, + struct xio_msg *req, uint64_t stag, + void *ulp_msg, size_t ulp_msg_sz) +{ + struct xio_tcp_transport *tcp_hndl = + (struct xio_tcp_transport *)transport; + struct xio_task *ptask, *next_ptask; + union xio_transport_event_data event_data; + struct xio_tcp_task *tcp_task; + struct xio_tcp_cancel_hdr cancel_hdr = { + .hdr_len = sizeof(cancel_hdr), + }; + + /* look in the tx_ready */ + list_for_each_entry_safe(ptask, next_ptask, &tcp_hndl->tx_ready_list, + tasks_list_entry) { + if (ptask->omsg && + (ptask->omsg->sn == req->sn) && + (ptask->stag == stag)) { + TRACE_LOG("[%lu] - message found on tx_ready_list\n", + req->sn); + + tcp_task = (struct xio_tcp_task *)ptask->dd_data; + + if (tcp_task->txd.stage != XIO_TCP_TX_BEFORE) + goto send_cancel; + + /* return decrease ref count from task */ + xio_tasks_pool_put(ptask); + tcp_hndl->tx_ready_tasks_num--; + list_move_tail(&ptask->tasks_list_entry, + &tcp_hndl->tx_comp_list); + + /* fill notification event */ + event_data.cancel.ulp_msg = ulp_msg; + event_data.cancel.ulp_msg_sz = ulp_msg_sz; + event_data.cancel.task = ptask; + event_data.cancel.result = XIO_E_MSG_CANCELED; + + xio_transport_notify_observer( + &tcp_hndl->base, + XIO_TRANSPORT_EVENT_CANCEL_RESPONSE, + &event_data); + return 0; + } + } + /* look in the in_flight */ + list_for_each_entry_safe(ptask, next_ptask, &tcp_hndl->in_flight_list, + tasks_list_entry) { + if (ptask->omsg && + (ptask->omsg->sn == req->sn) && + (ptask->stag == stag) && + (ptask->state != XIO_TASK_STATE_RESPONSE_RECV)) { + TRACE_LOG("[%lu] - message found on in_flight_list\n", + req->sn); + goto send_cancel; + } + } + /* look in the tx_comp */ + list_for_each_entry_safe(ptask, next_ptask, &tcp_hndl->tx_comp_list, + tasks_list_entry) { + if (ptask->omsg && + (ptask->omsg->sn == req->sn) && + (ptask->stag == stag) && + (ptask->state != XIO_TASK_STATE_RESPONSE_RECV)) { + TRACE_LOG("[%lu] - message found on tx_comp_list\n", + req->sn); + goto send_cancel; + } + } + TRACE_LOG("[%lu] - message not found on tx path\n", req->sn); + + /* fill notification event */ + event_data.cancel.ulp_msg = ulp_msg; + event_data.cancel.ulp_msg_sz = ulp_msg_sz; + event_data.cancel.task = NULL; + event_data.cancel.result = XIO_E_MSG_NOT_FOUND; + + xio_transport_notify_observer(&tcp_hndl->base, + XIO_TRANSPORT_EVENT_CANCEL_RESPONSE, + &event_data); + + return 0; + +send_cancel: + + TRACE_LOG("[%lu] - send cancel request\n", req->sn); + + tcp_task = (struct xio_tcp_task *)ptask->dd_data; + cancel_hdr.sn = tcp_task->sn; + + xio_tcp_send_cancel(tcp_hndl, XIO_CANCEL_REQ, &cancel_hdr, + ulp_msg, ulp_msg_sz); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_cancel_rsp */ +/*---------------------------------------------------------------------------*/ +int xio_tcp_cancel_rsp(struct xio_transport_base *transport, + struct xio_task *task, enum xio_status result, + void *ulp_msg, size_t ulp_msg_sz) +{ + struct xio_tcp_transport *tcp_hndl = + (struct xio_tcp_transport *)transport; + struct xio_tcp_task *tcp_task; + struct 
xio_tcp_cancel_hdr cancel_hdr = { + .hdr_len = sizeof(cancel_hdr), + .result = result, + }; + + cancel_hdr.hdr_len = sizeof(cancel_hdr); + cancel_hdr.result = result; + + if (task) { + tcp_task = (struct xio_tcp_task *)task->dd_data; + cancel_hdr.sn = tcp_task->sn; + } else { + /* 0 might be a valid sn for another task */ + if ((cancel_hdr.result == XIO_E_MSG_CANCELED) || + (cancel_hdr.result == XIO_E_MSG_CANCEL_FAILED)) { + ERROR_LOG("task cannot be null if result is " \ + "MSG_CANCELED or MSG_CANCEL_FAILED\n"); + return -1; + } + cancel_hdr.sn = 0; + } + + /* fill dummy transport header since was handled by upper layer + */ + return xio_tcp_send_cancel(tcp_hndl, XIO_CANCEL_RSP, + &cancel_hdr, ulp_msg, ulp_msg_sz); +} diff --git a/open_src/xio/src/usr/transport/tcp/xio_tcp_management.c b/open_src/xio/src/usr/transport/tcp/xio_tcp_management.c new file mode 100644 index 0000000..295f666 --- /dev/null +++ b/open_src/xio/src/usr/transport/tcp/xio_tcp_management.c @@ -0,0 +1,2663 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include +#include +#include +#include "libxio.h" +#include "xio_log.h" +#include "xio_common.h" +#include "xio_observer.h" +#include "xio_protocol.h" +#include "xio_mbuf.h" +#include "xio_task.h" +#include "xio_mempool.h" +#include "xio_sg_table.h" +#include "xio_transport.h" +#include "xio_usr_transport.h" +#include "xio_ev_data.h" +#include "xio_objpool.h" +#include "xio_workqueue.h" +#include "xio_context.h" +#include "xio_tcp_transport.h" +#include "xio_mem.h" + +/* default option values */ +#define XIO_OPTVAL_DEF_ENABLE_MEM_POOL 1 +#define XIO_OPTVAL_DEF_ENABLE_MR_CHECK 0 +#define XIO_OPTVAL_DEF_TCP_ENABLE_DMA_LATENCY 0 +#define XIO_OPTVAL_DEF_TCP_MAX_IN_IOVSZ XIO_IOVLEN +#define XIO_OPTVAL_DEF_TCP_MAX_OUT_IOVSZ XIO_IOVLEN +#define XIO_OPTVAL_DEF_TCP_NO_DELAY 0 +#define XIO_OPTVAL_DEF_TCP_SO_SNDBUF 4194304 +#define XIO_OPTVAL_DEF_TCP_SO_RCVBUF 4194304 +#define XIO_OPTVAL_DEF_TCP_DUAL_SOCK 1 + +/*---------------------------------------------------------------------------*/ +/* globals */ +/*---------------------------------------------------------------------------*/ +static spinlock_t mngmt_lock; +static thread_once_t ctor_key_once = THREAD_ONCE_INIT; +static thread_once_t dtor_key_once = THREAD_ONCE_INIT; +static struct xio_tcp_socket_ops single_sock_ops; +static struct xio_tcp_socket_ops dual_sock_ops; +extern struct xio_transport xio_tcp_transport; + +static int cdl_fd = -1; + +/* tcp options */ +struct xio_tcp_options tcp_options = { + XIO_OPTVAL_DEF_ENABLE_MEM_POOL, /*enable_mem_pool*/ + XIO_OPTVAL_DEF_TCP_ENABLE_DMA_LATENCY, /*enable_dma_latency*/ + XIO_OPTVAL_DEF_ENABLE_MR_CHECK, /*enable_mr_check*/ + XIO_OPTVAL_DEF_TCP_MAX_IN_IOVSZ, /*max_in_iovsz*/ + XIO_OPTVAL_DEF_TCP_MAX_OUT_IOVSZ, /*max_out_iovsz*/ + XIO_OPTVAL_DEF_TCP_NO_DELAY, /*tcp_no_delay*/ + XIO_OPTVAL_DEF_TCP_SO_SNDBUF, /*tcp_so_sndbuf*/ + XIO_OPTVAL_DEF_TCP_SO_RCVBUF, /*tcp_so_rcvbuf*/ + XIO_OPTVAL_DEF_TCP_DUAL_SOCK, /*tcp_dual_sock*/ + 0 /*pad*/ +}; + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_get_max_header_size */ +/*---------------------------------------------------------------------------*/ +int xio_tcp_get_max_header_size(void) +{ + int req_hdr = XIO_TRANSPORT_OFFSET + sizeof(struct xio_tcp_req_hdr); + int rsp_hdr = XIO_TRANSPORT_OFFSET + sizeof(struct xio_tcp_rsp_hdr); + int iovsz = tcp_options.max_out_iovsz + tcp_options.max_in_iovsz; + + req_hdr += iovsz * sizeof(struct xio_sge); + rsp_hdr += tcp_options.max_out_iovsz * sizeof(struct xio_sge); + + return max(req_hdr, rsp_hdr); +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_get_inline_buffer_size */ +/*---------------------------------------------------------------------------*/ +int xio_tcp_get_inline_buffer_size(void) +{ + int inline_buf_sz = ALIGN(xio_tcp_get_max_header_size() + + g_options.max_inline_xio_hdr + + g_options.max_inline_xio_data, 1024); + return inline_buf_sz; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_flush_all_tasks */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_flush_all_tasks(struct xio_tcp_transport *tcp_hndl) +{ + if (!list_empty(&tcp_hndl->in_flight_list)) { + TRACE_LOG("in_flight_list not empty!\n"); + xio_transport_flush_task_list(&tcp_hndl->in_flight_list); + /* for task that attached to senders with ref count = 2 */ + xio_transport_flush_task_list(&tcp_hndl->in_flight_list); + } + + if 
(!list_empty(&tcp_hndl->tx_comp_list)) { + TRACE_LOG("tx_comp_list not empty!\n"); + xio_transport_flush_task_list(&tcp_hndl->tx_comp_list); + } + if (!list_empty(&tcp_hndl->io_list)) { + TRACE_LOG("io_list not empty!\n"); + xio_transport_flush_task_list(&tcp_hndl->io_list); + } + + if (!list_empty(&tcp_hndl->tx_ready_list)) { + TRACE_LOG("tx_ready_list not empty!\n"); + xio_transport_flush_task_list(&tcp_hndl->tx_ready_list); + /* for task that attached to senders with ref count = 2 */ + xio_transport_flush_task_list(&tcp_hndl->tx_ready_list); + } + + if (!list_empty(&tcp_hndl->rx_list)) { + TRACE_LOG("rx_list not empty!\n"); + xio_transport_flush_task_list(&tcp_hndl->rx_list); + } + + tcp_hndl->tx_ready_tasks_num = 0; + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* on_sock_close */ +/*---------------------------------------------------------------------------*/ +static void on_sock_close(struct xio_tcp_transport *tcp_hndl) +{ + TRACE_LOG("on_sock_close tcp_hndl:%p, state:%d\n\n", + tcp_hndl, tcp_hndl->state); + + xio_tcp_flush_all_tasks(tcp_hndl); + + xio_transport_notify_observer(&tcp_hndl->base, + XIO_TRANSPORT_EVENT_CLOSED, + NULL); + + tcp_hndl->state = XIO_TRANSPORT_STATE_DESTROYED; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_single_sock_del_ev_handlers */ +/*---------------------------------------------------------------------------*/ +int xio_tcp_single_sock_del_ev_handlers(struct xio_tcp_transport *tcp_hndl) +{ + int retval; + + if (tcp_hndl->in_epoll[0]) + return 0; + + /* remove from epoll */ + retval = xio_context_del_ev_handler(tcp_hndl->base.ctx, + tcp_hndl->sock.cfd); + + if (retval) { + ERROR_LOG("tcp_hndl:%p fd=%d del_ev_handler failed, %m\n", + tcp_hndl, tcp_hndl->sock.cfd); + } + + return retval; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_dual_sock_del_ev_handlers */ +/*---------------------------------------------------------------------------*/ +int xio_tcp_dual_sock_del_ev_handlers(struct xio_tcp_transport *tcp_hndl) +{ + int retval1 = 0, retval2 = 0; + + /* remove from epoll */ + if (tcp_hndl->in_epoll[0]) { + retval1 = xio_context_del_ev_handler(tcp_hndl->base.ctx, + tcp_hndl->sock.cfd); + if (retval1) { + ERROR_LOG("tcp_hndl:%p fd=%d del_ev_handler failed, %m\n", + tcp_hndl, tcp_hndl->sock.cfd); + } + } + /* remove from epoll */ + if (tcp_hndl->in_epoll[1]) { + retval2 = xio_context_del_ev_handler(tcp_hndl->base.ctx, + tcp_hndl->sock.dfd); + + if (retval2) { + ERROR_LOG("tcp_hndl:%p fd=%d del_ev_handler failed, %m\n", + tcp_hndl, tcp_hndl->sock.dfd); + } + } + + return retval1 | retval2; +} + +/*---------------------------------------------------------------------------*/ +/* on_sock_disconnected */ +/*---------------------------------------------------------------------------*/ +void on_sock_disconnected(struct xio_tcp_transport *tcp_hndl, + int passive_close) +{ + struct xio_tcp_pending_conn *pconn, *next_pconn; + int retval; + + TRACE_LOG("on_sock_disconnected. tcp_hndl:%p, state:%d\n", + tcp_hndl, tcp_hndl->state); + if (tcp_hndl->state == XIO_TRANSPORT_STATE_DISCONNECTED) { + TRACE_LOG("call to close. 
tcp_hndl:%p\n", + tcp_hndl); + tcp_hndl->state = XIO_TRANSPORT_STATE_CLOSED; + + xio_context_disable_event(&tcp_hndl->flush_tx_event); + xio_context_disable_event(&tcp_hndl->ctl_rx_event); + + if (tcp_hndl->sock.ops->del_ev_handlers) + tcp_hndl->sock.ops->del_ev_handlers(tcp_hndl); + + if (!passive_close && !tcp_hndl->is_listen) { /*active close*/ + tcp_hndl->sock.ops->shutdown(&tcp_hndl->sock); + } + tcp_hndl->sock.ops->close(&tcp_hndl->sock); + + list_for_each_entry_safe(pconn, next_pconn, + &tcp_hndl->pending_conns, + conns_list_entry) { + retval = xio_context_del_ev_handler(tcp_hndl->base.ctx, + pconn->fd); + if (retval) { + ERROR_LOG( + "removing conn handler failed.(errno=%d %m)\n", + xio_get_last_socket_error()); + } + list_del(&pconn->conns_list_entry); + ufree(pconn); + } + + if (passive_close) { + xio_transport_notify_observer( + &tcp_hndl->base, + XIO_TRANSPORT_EVENT_DISCONNECTED, + NULL); + } + } +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_post_close */ +/*---------------------------------------------------------------------------*/ +static void xio_tcp_post_close(struct xio_tcp_transport *tcp_hndl) +{ + TRACE_LOG("tcp transport: [post close] handle:%p\n", + tcp_hndl); + + xio_context_disable_event(&tcp_hndl->disconnect_event); + + xio_observable_unreg_all_observers(&tcp_hndl->base.observable); + + if (tcp_hndl->tmp_rx_buf) { + ufree(tcp_hndl->tmp_rx_buf); + tcp_hndl->tmp_rx_buf = NULL; + } + + ufree(tcp_hndl->base.portal_uri); + + XIO_OBSERVABLE_DESTROY(&tcp_hndl->base.observable); + + ufree(tcp_hndl); +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_close_cb */ +/*---------------------------------------------------------------------------*/ +static void xio_tcp_close_cb(struct kref *kref) +{ + struct xio_transport_base *transport = container_of( + kref, struct xio_transport_base, kref); + struct xio_tcp_transport *tcp_hndl = + (struct xio_tcp_transport *)transport; + + /* now it is zero */ + TRACE_LOG("xio_tcp_close: [close] handle:%p, fd:%d\n", + tcp_hndl, tcp_hndl->sock.cfd); + + switch (tcp_hndl->state) { + case XIO_TRANSPORT_STATE_LISTEN: + case XIO_TRANSPORT_STATE_CONNECTED: + tcp_hndl->state = XIO_TRANSPORT_STATE_DISCONNECTED; + /*fallthrough*/ + case XIO_TRANSPORT_STATE_DISCONNECTED: + on_sock_disconnected(tcp_hndl, 0); + /*fallthrough*/ + case XIO_TRANSPORT_STATE_CLOSED: + on_sock_close(tcp_hndl); + break; + default: + xio_transport_notify_observer( + &tcp_hndl->base, + XIO_TRANSPORT_EVENT_CLOSED, + NULL); + tcp_hndl->state = XIO_TRANSPORT_STATE_DESTROYED; + break; + } + + if (tcp_hndl->state == XIO_TRANSPORT_STATE_DESTROYED) + xio_tcp_post_close(tcp_hndl); +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_close */ +/*---------------------------------------------------------------------------*/ +static void xio_tcp_close(struct xio_transport_base *transport) +{ + int was = atomic_read(&transport->kref.refcount); + + /* this is only for debugging - please note that the combination of + * atomic_read and kref_put is not atomic - please remove if this + * error does not pop up. Otherwise contact me and report bug. + */ + + /* was already 0 */ + if (!was) { + ERROR_LOG("xio_tcp_close double close. 
handle:%p\n", + transport); + return; + } + + kref_put(&transport->kref, xio_tcp_close_cb); +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_single_sock_shutdown */ +/*---------------------------------------------------------------------------*/ +int xio_tcp_single_sock_shutdown(struct xio_tcp_socket *sock) +{ + int retval; + + retval = shutdown(sock->cfd, SHUT_RDWR); + if (retval) { + xio_set_error(xio_get_last_socket_error()); + DEBUG_LOG("tcp shutdown failed. (errno=%d %m)\n", + xio_get_last_socket_error()); + } + + return retval; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_single_sock_close */ +/*---------------------------------------------------------------------------*/ +int xio_tcp_single_sock_close(struct xio_tcp_socket *sock) +{ + int retval; + + retval = xio_closesocket(sock->cfd); + if (retval) { + xio_set_error(xio_get_last_socket_error()); + DEBUG_LOG("tcp close failed. (errno=%d %m)\n", + xio_get_last_socket_error()); + } + + return retval; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_dual_sock_shutdown */ +/*---------------------------------------------------------------------------*/ +int xio_tcp_dual_sock_shutdown(struct xio_tcp_socket *sock) +{ + int retval1, retval2; + + retval1 = shutdown(sock->cfd, SHUT_RDWR); + if (retval1) { + xio_set_error(xio_get_last_socket_error()); + DEBUG_LOG("tcp shutdown failed. (errno=%d %m)\n", + xio_get_last_socket_error()); + } + + retval2 = shutdown(sock->dfd, SHUT_RDWR); + if (retval2) { + xio_set_error(xio_get_last_socket_error()); + DEBUG_LOG("tcp shutdown failed. (errno=%d %m)\n", + xio_get_last_socket_error()); + } + + return (retval1 | retval2); +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_dual_sock_close */ +/*---------------------------------------------------------------------------*/ +int xio_tcp_dual_sock_close(struct xio_tcp_socket *sock) +{ + int retval1, retval2; + + retval1 = xio_closesocket(sock->cfd); + if (retval1) { + xio_set_error(xio_get_last_socket_error()); + DEBUG_LOG("tcp close failed. (errno=%d %m)\n", + xio_get_last_socket_error()); + } + + retval2 = xio_closesocket(sock->dfd); + if (retval2) { + xio_set_error(xio_get_last_socket_error()); + DEBUG_LOG("tcp close failed. 
(errno=%d %m)\n", + xio_get_last_socket_error()); + } + + return (retval1 | retval2); +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_reject */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_reject(struct xio_transport_base *transport) +{ + struct xio_tcp_transport *tcp_hndl = + (struct xio_tcp_transport *)transport; + int retval; + + tcp_hndl->sock.ops->shutdown(&tcp_hndl->sock); + + retval = tcp_hndl->sock.ops->close(&tcp_hndl->sock); + if (retval) + return -1; + + TRACE_LOG("tcp transport: [reject] handle:%p\n", tcp_hndl); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_context_shutdown */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_context_shutdown(struct xio_transport_base *trans_hndl, + struct xio_context *ctx) +{ + struct xio_tcp_transport *tcp_hndl = + (struct xio_tcp_transport *)trans_hndl; + + TRACE_LOG("tcp transport context_shutdown handle:%p\n", tcp_hndl); + + switch (tcp_hndl->state) { + case XIO_TRANSPORT_STATE_INIT: + DEBUG_LOG("shutting context while tcp_hndl=%p state is INIT?\n", + tcp_hndl); + /*fallthrough*/ + case XIO_TRANSPORT_STATE_LISTEN: + case XIO_TRANSPORT_STATE_CONNECTING: + case XIO_TRANSPORT_STATE_CONNECTED: + tcp_hndl->state = XIO_TRANSPORT_STATE_DISCONNECTED; + /*fallthrough*/ + case XIO_TRANSPORT_STATE_DISCONNECTED: + on_sock_disconnected(tcp_hndl, 0); + break; + default: + break; + } + + tcp_hndl->state = XIO_TRANSPORT_STATE_DESTROYED; + xio_tcp_flush_all_tasks(tcp_hndl); + + xio_transport_notify_observer(&tcp_hndl->base, + XIO_TRANSPORT_EVENT_CLOSED, + NULL); + + xio_tcp_post_close(tcp_hndl); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_disconnect_handler */ +/*---------------------------------------------------------------------------*/ +static void xio_tcp_disconnect_handler(void *xio_tcp_hndl) +{ + struct xio_tcp_transport *tcp_hndl = (struct xio_tcp_transport *) + xio_tcp_hndl; + on_sock_disconnected(tcp_hndl, 1); +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_flush_tx_handler */ +/*---------------------------------------------------------------------------*/ +void xio_tcp_flush_tx_handler(void *xio_tcp_hndl) +{ + struct xio_tcp_transport *tcp_hndl = (struct xio_tcp_transport *) + xio_tcp_hndl; + xio_tcp_xmit(tcp_hndl); +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_single_sock_rx_ctl_handler */ +/*---------------------------------------------------------------------------*/ +int xio_tcp_single_sock_rx_ctl_handler(struct xio_tcp_transport *tcp_hndl) +{ + return xio_tcp_rx_ctl_handler(tcp_hndl, 1); +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_dual_sock_rx_ctl_handler */ +/*---------------------------------------------------------------------------*/ +int xio_tcp_dual_sock_rx_ctl_handler(struct xio_tcp_transport *tcp_hndl) +{ + return xio_tcp_rx_ctl_handler(tcp_hndl, RX_BATCH); +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_consume_ctl_rx */ +/*---------------------------------------------------------------------------*/ +void xio_tcp_consume_ctl_rx(void *xio_tcp_hndl) +{ + struct xio_tcp_transport *tcp_hndl = (struct xio_tcp_transport *) + xio_tcp_hndl; + int 
retval = 0, count = 0; + + xio_context_disable_event(&tcp_hndl->ctl_rx_event); + + do { + retval = tcp_hndl->sock.ops->rx_ctl_handler(tcp_hndl); + ++count; + } while (retval > 0 && count < RX_POLL_NR_MAX); + + if (/*retval > 0 && */ tcp_hndl->tmp_rx_buf_len && + tcp_hndl->state == XIO_TRANSPORT_STATE_CONNECTED) { + xio_context_add_event(tcp_hndl->base.ctx, + &tcp_hndl->ctl_rx_event); + } +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_ctl_ready_ev_handler */ +/*---------------------------------------------------------------------------*/ +void xio_tcp_ctl_ready_ev_handler(int fd, int events, void *user_context) +{ + struct xio_tcp_transport *tcp_hndl = (struct xio_tcp_transport *) + user_context; + + if (events & XIO_POLLOUT) { + xio_context_modify_ev_handler(tcp_hndl->base.ctx, fd, + XIO_POLLIN | XIO_POLLRDHUP); + xio_tcp_xmit(tcp_hndl); + } + + if (events & XIO_POLLIN) + xio_tcp_consume_ctl_rx(tcp_hndl); + + if (events & (XIO_POLLHUP | XIO_POLLRDHUP | XIO_POLLERR)) { + DEBUG_LOG("epoll returned with error events=%d for fd=%d\n", + events, fd); + xio_tcp_disconnect_helper(tcp_hndl); + } + + /* ORK todo add work instead of poll_nr? */ +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_data_ready_ev_handler */ +/*---------------------------------------------------------------------------*/ +void xio_tcp_data_ready_ev_handler(int fd, int events, void *user_context) +{ + struct xio_tcp_transport *tcp_hndl = (struct xio_tcp_transport *) + user_context; + int retval = 0, count = 0; + + if (events & XIO_POLLOUT) { + xio_context_modify_ev_handler(tcp_hndl->base.ctx, fd, + XIO_POLLIN | XIO_POLLRDHUP); + xio_tcp_xmit(tcp_hndl); + } + + if (events & XIO_POLLIN) { + do { + retval = tcp_hndl->sock.ops->rx_data_handler( + tcp_hndl, RX_BATCH); + ++count; + } while (retval > 0 && count < RX_POLL_NR_MAX); + } + + if (events & (XIO_POLLHUP | XIO_POLLRDHUP | XIO_POLLERR)) { + DEBUG_LOG("epoll returned with error events=%d for fd=%d\n", + events, fd); + xio_tcp_disconnect_helper(tcp_hndl); + } +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_single_sock_add_ev_handlers */ +/*---------------------------------------------------------------------------*/ +int xio_tcp_single_sock_add_ev_handlers(struct xio_tcp_transport *tcp_hndl) +{ + /* add to epoll */ + int retval = xio_context_add_ev_handler( + tcp_hndl->base.ctx, + tcp_hndl->sock.cfd, + XIO_POLLIN | XIO_POLLRDHUP, + xio_tcp_ctl_ready_ev_handler, + tcp_hndl); + + if (retval) { + ERROR_LOG("setting connection handler failed. (errno=%d %m)\n", + xio_get_last_socket_error()); + } + tcp_hndl->in_epoll[0] = 1; + + return retval; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_dual_sock_add_ev_handlers */ +/*---------------------------------------------------------------------------*/ +int xio_tcp_dual_sock_add_ev_handlers(struct xio_tcp_transport *tcp_hndl) +{ + int retval = 0; + + /* add to epoll */ + retval = xio_context_add_ev_handler( + tcp_hndl->base.ctx, + tcp_hndl->sock.cfd, + XIO_POLLIN | XIO_POLLRDHUP, + xio_tcp_ctl_ready_ev_handler, + tcp_hndl); + + if (retval) { + ERROR_LOG("setting connection handler failed. 
(errno=%d %m)\n", + xio_get_last_socket_error()); + return retval; + } + tcp_hndl->in_epoll[0] = 1; + + /* add to epoll */ + retval = xio_context_add_ev_handler( + tcp_hndl->base.ctx, + tcp_hndl->sock.dfd, + XIO_POLLIN | XIO_POLLRDHUP, + xio_tcp_data_ready_ev_handler, + tcp_hndl); + + if (retval) { + ERROR_LOG("setting connection handler failed. (errno=%d %m)\n", + xio_get_last_socket_error()); + (void)xio_context_del_ev_handler(tcp_hndl->base.ctx, + tcp_hndl->sock.cfd); + } + tcp_hndl->in_epoll[1] = 1; + + return retval; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_accept */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_accept(struct xio_transport_base *transport) +{ + struct xio_tcp_transport *tcp_hndl = + (struct xio_tcp_transport *)transport; + + if (tcp_hndl->sock.ops->add_ev_handlers(tcp_hndl)) { + xio_transport_notify_observer_error(&tcp_hndl->base, + XIO_E_UNSUCCESSFUL); + } + + TRACE_LOG("tcp transport: [accept] handle:%p\n", tcp_hndl); + + xio_transport_notify_observer( + &tcp_hndl->base, + XIO_TRANSPORT_EVENT_ESTABLISHED, + NULL); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_socket_create */ +/*---------------------------------------------------------------------------*/ +int xio_tcp_socket_create(void) +{ + int sock_fd, retval, optval = 1; + + sock_fd = xio_socket_non_blocking(AF_INET, SOCK_STREAM, 0); + if (sock_fd < 0) { + xio_set_error(xio_get_last_socket_error()); + ERROR_LOG("create socket failed. (errno=%d %m)\n", + xio_get_last_socket_error()); + return sock_fd; + } + + retval = setsockopt(sock_fd, + SOL_SOCKET, + SO_REUSEADDR, + (char *)&optval, + sizeof(optval)); + if (retval) { + xio_set_error(xio_get_last_socket_error()); + ERROR_LOG("setsockopt failed. (errno=%d %m)\n", + xio_get_last_socket_error()); + goto cleanup; + } + + if (tcp_options.tcp_no_delay) { + retval = setsockopt(sock_fd, + IPPROTO_TCP, + TCP_NODELAY, + (char *)&optval, + sizeof(int)); + if (retval) { + xio_set_error(xio_get_last_socket_error()); + ERROR_LOG("setsockopt failed. (errno=%d %m)\n", + xio_get_last_socket_error()); + goto cleanup; + } + } + + optval = tcp_options.tcp_so_sndbuf; + retval = setsockopt(sock_fd, SOL_SOCKET, SO_SNDBUF, + (char *)&optval, sizeof(optval)); + if (retval) { + xio_set_error(xio_get_last_socket_error()); + ERROR_LOG("setsockopt failed. (errno=%d %m)\n", + xio_get_last_socket_error()); + goto cleanup; + } + optval = tcp_options.tcp_so_rcvbuf; + retval = setsockopt(sock_fd, SOL_SOCKET, SO_RCVBUF, + (char *)&optval, sizeof(optval)); + if (retval) { + xio_set_error(xio_get_last_socket_error()); + ERROR_LOG("setsockopt failed. 
(errno=%d %m)\n", + xio_get_last_socket_error()); + goto cleanup; + } + + return sock_fd; + +cleanup: + xio_closesocket(sock_fd); + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_single_sock_create */ +/*---------------------------------------------------------------------------*/ +int xio_tcp_single_sock_create(struct xio_tcp_socket *sock) +{ + sock->cfd = xio_tcp_socket_create(); + if (sock->cfd < 0) + return -1; + + sock->dfd = sock->cfd; + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_dual_sock_create */ +/*---------------------------------------------------------------------------*/ +int xio_tcp_dual_sock_create(struct xio_tcp_socket *sock) +{ + sock->cfd = xio_tcp_socket_create(); + if (sock->cfd < 0) + return -1; + + sock->dfd = xio_tcp_socket_create(); + if (sock->dfd < 0) { + xio_closesocket(sock->cfd); + return -1; + } + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_transport_create */ +/*---------------------------------------------------------------------------*/ +struct xio_tcp_transport *xio_tcp_transport_create( + struct xio_transport *transport, + struct xio_context *ctx, + struct xio_observer *observer, + int create_socket) +{ + struct xio_tcp_transport *tcp_hndl; + + /*allocate tcp handl */ + tcp_hndl = (struct xio_tcp_transport *) + ucalloc(1, sizeof(struct xio_tcp_transport)); + if (!tcp_hndl) { + xio_set_error(ENOMEM); + ERROR_LOG("ucalloc failed. %m\n"); + return NULL; + } + + XIO_OBSERVABLE_INIT(&tcp_hndl->base.observable, tcp_hndl); + + if (tcp_options.enable_mem_pool) { + tcp_hndl->tcp_mempool = + xio_transport_mempool_get(ctx, 0); + if (!tcp_hndl->tcp_mempool) { + xio_set_error(ENOMEM); + ERROR_LOG("allocating tcp mempool failed. %m\n"); + goto cleanup; + } + } + + tcp_hndl->base.portal_uri = NULL; + tcp_hndl->base.proto = XIO_PROTO_TCP; + kref_init(&tcp_hndl->base.kref); + tcp_hndl->transport = transport; + tcp_hndl->base.ctx = ctx; + tcp_hndl->is_listen = 0; + + tcp_hndl->tmp_rx_buf = NULL; + tcp_hndl->tmp_rx_buf_cur = NULL; + tcp_hndl->tmp_rx_buf_len = 0; + + tcp_hndl->tx_ready_tasks_num = 0; + tcp_hndl->tx_comp_cnt = 0; + + memset(&tcp_hndl->tmp_work, 0, sizeof(struct xio_tcp_work_req)); + tcp_hndl->tmp_work.msg_iov = tcp_hndl->tmp_iovec; + + /* create tcp socket */ + if (create_socket) { + memcpy(tcp_hndl->sock.ops, + (tcp_options.tcp_dual_sock ? 
+ &dual_sock_ops : &single_sock_ops), + sizeof(*tcp_hndl->sock.ops)); + if (tcp_hndl->sock.ops->open(&tcp_hndl->sock)) + goto cleanup; + } + + /* from now on don't allow changes */ + tcp_hndl->max_inline_buf_sz = xio_tcp_get_inline_buffer_size(); + tcp_hndl->membuf_sz = tcp_hndl->max_inline_buf_sz; + + if (observer) + xio_observable_reg_observer(&tcp_hndl->base.observable, + observer); + + INIT_LIST_HEAD(&tcp_hndl->in_flight_list); + INIT_LIST_HEAD(&tcp_hndl->tx_ready_list); + INIT_LIST_HEAD(&tcp_hndl->tx_comp_list); + INIT_LIST_HEAD(&tcp_hndl->rx_list); + INIT_LIST_HEAD(&tcp_hndl->io_list); + + INIT_LIST_HEAD(&tcp_hndl->pending_conns); + + memset(&tcp_hndl->flush_tx_event, 0, sizeof(struct xio_ev_data)); + tcp_hndl->flush_tx_event.handler = xio_tcp_flush_tx_handler; + tcp_hndl->flush_tx_event.data = tcp_hndl; + + memset(&tcp_hndl->ctl_rx_event, 0, sizeof(struct xio_ev_data)); + tcp_hndl->ctl_rx_event.handler = xio_tcp_consume_ctl_rx; + tcp_hndl->ctl_rx_event.data = tcp_hndl; + + memset(&tcp_hndl->disconnect_event, 0, sizeof(struct xio_ev_data)); + tcp_hndl->disconnect_event.handler = xio_tcp_disconnect_handler; + tcp_hndl->disconnect_event.data = tcp_hndl; + + TRACE_LOG("xio_tcp_open: [new] handle:%p\n", tcp_hndl); + + return tcp_hndl; + +cleanup: + ufree(tcp_hndl); + + return NULL; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_handle_pending_conn */ +/*---------------------------------------------------------------------------*/ +void xio_tcp_handle_pending_conn(int fd, + struct xio_tcp_transport *parent_hndl, + int error) +{ + int retval; + struct xio_tcp_pending_conn *pconn, *next_pconn; + struct xio_tcp_pending_conn *pending_conn = NULL, *matching_conn = NULL; + struct xio_tcp_pending_conn *ctl_conn = NULL, *data_conn = NULL; + void *buf; + int cfd = 0, dfd = 0, is_single = 1; + socklen_t len = 0; + struct xio_tcp_transport *child_hndl = NULL; + union xio_transport_event_data ev_data; + + list_for_each_entry_safe(pconn, next_pconn, + &parent_hndl->pending_conns, + conns_list_entry) { + if (pconn->fd == fd) { + pending_conn = pconn; + break; + } + } + + if (!pending_conn) { + ERROR_LOG("could not find pending fd [%d] on the list\n", fd); + goto cleanup2; + } + + if (error) { + DEBUG_LOG("epoll returned with error=%d for fd=%d\n", + error, fd); + goto cleanup1; + } + + buf = &pending_conn->msg; + inc_ptr(buf, sizeof(struct xio_tcp_connect_msg) - + pending_conn->waiting_for_bytes); + while (pending_conn->waiting_for_bytes) { + retval = recv(fd, (char *)buf, + pending_conn->waiting_for_bytes, 0); + if (retval > 0) { + pending_conn->waiting_for_bytes -= retval; + inc_ptr(buf, retval); + } else if (retval == 0) { + ERROR_LOG("got EOF while establishing connection\n"); + goto cleanup1; + } else { + if (xio_get_last_socket_error() != XIO_EAGAIN) { + ERROR_LOG("recv return with errno=%d\n", + xio_get_last_socket_error()); + goto cleanup1; + } + return; + } + } + + pending_conn->msg.sock_type = (enum xio_tcp_sock_type) + ntohl((uint32_t)pending_conn->msg.sock_type); + UNPACK_SVAL(&pending_conn->msg, &pending_conn->msg, second_port); + UNPACK_SVAL(&pending_conn->msg, &pending_conn->msg, pad); + + if (pending_conn->msg.sock_type == XIO_TCP_SINGLE_SOCK) { + ctl_conn = pending_conn; + goto single_sock; + } + + is_single = 0; + + list_for_each_entry_safe(pconn, next_pconn, + &parent_hndl->pending_conns, + conns_list_entry) { + if (pconn->waiting_for_bytes) + continue; + + if (pconn->sa.sa.sa_family == AF_INET) { + if ((pconn->msg.second_port == + 
ntohs(pending_conn->sa.sa_in.sin_port)) && + (pconn->sa.sa_in.sin_addr.s_addr == + pending_conn->sa.sa_in.sin_addr.s_addr)) { + matching_conn = pconn; + if (ntohs(matching_conn->sa.sa_in.sin_port) != + pending_conn->msg.second_port) { + ERROR_LOG("ports mismatch\n"); + return; + } + break; + } + } else if (pconn->sa.sa.sa_family == AF_INET6) { + if ((pconn->msg.second_port == + ntohs(pending_conn->sa.sa_in6.sin6_port)) && + !memcmp(&pconn->sa.sa_in6.sin6_addr, + &pending_conn->sa.sa_in6.sin6_addr, + sizeof(pconn->sa.sa_in6.sin6_addr))) { + matching_conn = pconn; + if (ntohs(matching_conn->sa.sa_in6.sin6_port) + != pending_conn->msg.second_port) { + ERROR_LOG("ports mismatch\n"); + return; + } + break; + } + } else { + ERROR_LOG("unknown family %d\n", + pconn->sa.sa.sa_family); + } + } + + if (!matching_conn) + return; + + if (pending_conn->msg.sock_type == XIO_TCP_CTL_SOCK) { + ctl_conn = pending_conn; + data_conn = matching_conn; + } else if (pending_conn->msg.sock_type == XIO_TCP_DATA_SOCK) { + ctl_conn = matching_conn; + data_conn = pending_conn; + } + cfd = ctl_conn->fd; + dfd = data_conn->fd; + + retval = xio_context_del_ev_handler(parent_hndl->base.ctx, + data_conn->fd); + list_del(&data_conn->conns_list_entry); + if (retval) { + ERROR_LOG("removing connection handler failed.(errno=%d %m)\n", + xio_get_last_socket_error()); + } + ufree(data_conn); + +single_sock: + + list_del(&ctl_conn->conns_list_entry); + retval = xio_context_del_ev_handler(parent_hndl->base.ctx, + ctl_conn->fd); + if (retval) { + ERROR_LOG("removing connection handler failed.(errno=%d %m)\n", + xio_get_last_socket_error()); + } + + child_hndl = xio_tcp_transport_create(parent_hndl->transport, + parent_hndl->base.ctx, + NULL, + 0); + if (!child_hndl) { + ERROR_LOG("failed to create tcp child\n"); + xio_transport_notify_observer_error(&parent_hndl->base, + xio_errno()); + ufree(ctl_conn); + goto cleanup3; + } + + memcpy(&child_hndl->base.peer_addr, + &ctl_conn->sa.sa_stor, + sizeof(child_hndl->base.peer_addr)); + ufree(ctl_conn); + + if (is_single) { + child_hndl->sock.cfd = fd; + child_hndl->sock.dfd = fd; + memcpy(child_hndl->sock.ops, &single_sock_ops, + sizeof(*child_hndl->sock.ops)); + + } else { + child_hndl->sock.cfd = cfd; + child_hndl->sock.dfd = dfd; + memcpy(child_hndl->sock.ops, &dual_sock_ops, + sizeof(*child_hndl->sock.ops)); + + child_hndl->tmp_rx_buf = ucalloc(1, TMP_RX_BUF_SIZE); + if (!child_hndl->tmp_rx_buf) { + xio_set_error(ENOMEM); + ERROR_LOG("ucalloc failed. %m\n"); + goto cleanup3; + } + child_hndl->tmp_rx_buf_cur = child_hndl->tmp_rx_buf; + } + + len = sizeof(child_hndl->base.local_addr); + retval = getsockname(child_hndl->sock.cfd, + (struct sockaddr *)&child_hndl->base.local_addr, + &len); + if (retval) { + xio_set_error(xio_get_last_socket_error()); + ERROR_LOG("tcp getsockname failed. 
(errno=%d %m)\n", + xio_get_last_socket_error()); + } + + child_hndl->state = XIO_TRANSPORT_STATE_CONNECTING; + + ev_data.new_connection.child_trans_hndl = + (struct xio_transport_base *)child_hndl; + xio_transport_notify_observer((struct xio_transport_base *)parent_hndl, + XIO_TRANSPORT_EVENT_NEW_CONNECTION, + &ev_data); + + return; + +cleanup1: + list_del(&pending_conn->conns_list_entry); + ufree(pending_conn); +cleanup2: + /* remove from epoll */ + retval = xio_context_del_ev_handler(parent_hndl->base.ctx, fd); + if (retval) { + ERROR_LOG( + "removing connection handler failed.(errno=%d %m)\n", + xio_get_last_socket_error()); + } +cleanup3: + if (is_single) { + xio_closesocket(fd); + } else { + xio_closesocket(cfd); + xio_closesocket(dfd); + } + + if (child_hndl) + xio_tcp_post_close(child_hndl); +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_pending_conn_ev_handler */ +/*---------------------------------------------------------------------------*/ +void xio_tcp_pending_conn_ev_handler(int fd, int events, void *user_context) +{ + struct xio_tcp_transport *tcp_hndl = (struct xio_tcp_transport *) + user_context; + + xio_tcp_handle_pending_conn( + fd, tcp_hndl, + events & + (XIO_POLLHUP | XIO_POLLRDHUP | XIO_POLLERR)); +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_new_connection */ +/*---------------------------------------------------------------------------*/ +void xio_tcp_new_connection(struct xio_tcp_transport *parent_hndl) +{ + int retval; + socklen_t len = sizeof(struct sockaddr_storage); + struct xio_tcp_pending_conn *pending_conn; + + /*allocate pending fd struct */ + pending_conn = (struct xio_tcp_pending_conn *) + ucalloc(1, sizeof(struct xio_tcp_pending_conn)); + if (!pending_conn) { + xio_set_error(ENOMEM); + ERROR_LOG("ucalloc failed. %m\n"); + xio_transport_notify_observer_error(&parent_hndl->base, + xio_errno()); + return; + } + + pending_conn->waiting_for_bytes = sizeof(struct xio_tcp_connect_msg); + + /* "accept" the connection */ + retval = xio_accept_non_blocking( + parent_hndl->sock.cfd, + (struct sockaddr *)&pending_conn->sa.sa_stor, + &len); + if (retval < 0) { + xio_set_error(xio_get_last_socket_error()); + ERROR_LOG("tcp accept failed. 
(errno=%d %m)\n", + xio_get_last_socket_error()); + ufree(pending_conn); + return; + } + pending_conn->fd = retval; + + list_add_tail(&pending_conn->conns_list_entry, + &parent_hndl->pending_conns); + + /* add to epoll */ + retval = xio_context_add_ev_handler( + parent_hndl->base.ctx, + pending_conn->fd, + XIO_POLLIN | XIO_POLLRDHUP, + xio_tcp_pending_conn_ev_handler, + parent_hndl); + if (retval) + ERROR_LOG("adding pending_conn_ev_handler failed\n"); +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_listener_ev_handler */ +/*---------------------------------------------------------------------------*/ +void xio_tcp_listener_ev_handler(int fd, int events, void *user_context) +{ + struct xio_tcp_transport *tcp_hndl = (struct xio_tcp_transport *) + user_context; + + if (events & XIO_POLLIN) + xio_tcp_new_connection(tcp_hndl); + + if ((events & (XIO_POLLHUP | XIO_POLLERR))) { + DEBUG_LOG("epoll returned with error events=%d for fd=%d\n", + events, fd); + xio_tcp_disconnect_helper(tcp_hndl); + } +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_listen */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_listen(struct xio_transport_base *transport, + const char *portal_uri, uint16_t *src_port, + int backlog) +{ + struct xio_tcp_transport *tcp_hndl = + (struct xio_tcp_transport *)transport; + union xio_sockaddr sa; + int sa_len; + int retval = 0; + uint16_t sport; + + /* resolve the portal_uri */ + sa_len = xio_uri_to_ss(portal_uri, &sa.sa_stor); + if (sa_len == -1) { + xio_set_error(XIO_E_ADDR_ERROR); + ERROR_LOG("address [%s] resolving failed\n", portal_uri); + goto exit1; + } + tcp_hndl->base.is_client = 0; + + /* bind */ + retval = bind(tcp_hndl->sock.cfd, + (struct sockaddr *)&sa.sa_stor, + sa_len); + if (retval) { + xio_set_error(xio_get_last_socket_error()); + ERROR_LOG("tcp bind failed. (errno=%d %m)\n", + xio_get_last_socket_error()); + goto exit1; + } + + tcp_hndl->is_listen = 1; + + retval = listen(tcp_hndl->sock.cfd, + backlog > 0 ? backlog : MAX_BACKLOG); + if (retval) { + xio_set_error(xio_get_last_socket_error()); + ERROR_LOG("tcp listen failed. (errno=%d %m)\n", + xio_get_last_socket_error()); + goto exit1; + } + + /* add to epoll */ + retval = xio_context_add_ev_handler( + tcp_hndl->base.ctx, + tcp_hndl->sock.cfd, + XIO_POLLIN, + xio_tcp_listener_ev_handler, + tcp_hndl); + if (retval) { + ERROR_LOG("xio_context_add_ev_handler failed.\n"); + goto exit1; + } + tcp_hndl->in_epoll[0] = 1; + + retval = getsockname(tcp_hndl->sock.cfd, + (struct sockaddr *)&sa.sa_stor, + (socklen_t *)&sa_len); + if (retval) { + xio_set_error(xio_get_last_socket_error()); + ERROR_LOG("getsockname failed. 
(errno=%d %m)\n", + xio_get_last_socket_error()); + goto exit; + } + + switch (sa.sa_stor.ss_family) { + case AF_INET: + sport = ntohs(sa.sa_in.sin_port); + break; + case AF_INET6: + sport = ntohs(sa.sa_in6.sin6_port); + break; + default: + xio_set_error(XIO_E_ADDR_ERROR); + ERROR_LOG("invalid family type %d.\n", sa.sa_stor.ss_family); + goto exit; + } + + if (src_port) + *src_port = sport; + + tcp_hndl->state = XIO_TRANSPORT_STATE_LISTEN; + DEBUG_LOG("listen on [%s] src_port:%d\n", portal_uri, sport); + + return 0; + +exit1: + tcp_hndl->sock.ops->del_ev_handlers = NULL; +exit: + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_conn_established_helper */ +/*---------------------------------------------------------------------------*/ +void xio_tcp_conn_established_helper(int fd, + struct xio_tcp_transport *tcp_hndl, + struct xio_tcp_connect_msg *msg, + int error) +{ + int retval = 0; + int so_error = 0; + socklen_t len = sizeof(so_error); + + /* remove from epoll */ + retval = xio_context_del_ev_handler(tcp_hndl->base.ctx, + tcp_hndl->sock.cfd); + if (retval) { + ERROR_LOG("removing connection handler failed.(errno=%d %m)\n", + xio_get_last_socket_error()); + goto cleanup; + } + + retval = getsockopt(tcp_hndl->sock.cfd, + SOL_SOCKET, + SO_ERROR, + (char *)&so_error, + &len); + if (retval) { + ERROR_LOG("getsockopt failed. (errno=%d %m)\n", + xio_get_last_socket_error()); + so_error = xio_get_last_socket_error(); + } + if (so_error || error) { + DEBUG_LOG("fd=%d connection establishment failed\n", + tcp_hndl->sock.cfd); + DEBUG_LOG("so_error=%d, epoll_error=%d\n", so_error, error); + tcp_hndl->sock.ops->del_ev_handlers = NULL; + goto cleanup; + } + + /* add to epoll */ + retval = tcp_hndl->sock.ops->add_ev_handlers(tcp_hndl); + if (retval) { + ERROR_LOG("setting connection handler failed. (errno=%d %m)\n", + xio_get_last_socket_error()); + goto cleanup; + } + + len = sizeof(tcp_hndl->base.peer_addr); + retval = getpeername(tcp_hndl->sock.cfd, + (struct sockaddr *)&tcp_hndl->base.peer_addr, + &len); + if (retval) { + xio_set_error(xio_get_last_socket_error()); + ERROR_LOG("tcp getpeername failed. (errno=%d %m)\n", + xio_get_last_socket_error()); + so_error = xio_get_last_socket_error(); + goto cleanup; + } + tcp_hndl->state = XIO_TRANSPORT_STATE_CONNECTING; + + retval = xio_tcp_send_connect_msg(tcp_hndl->sock.cfd, msg); + if (retval) + goto cleanup; + + xio_transport_notify_observer(&tcp_hndl->base, + XIO_TRANSPORT_EVENT_ESTABLISHED, + NULL); + + return; + +cleanup: + if (so_error == XIO_ECONNREFUSED) + xio_transport_notify_observer(&tcp_hndl->base, + XIO_TRANSPORT_EVENT_REFUSED, + NULL); + else + xio_transport_notify_observer_error(&tcp_hndl->base, + so_error ? 
so_error : + XIO_E_CONNECT_ERROR); +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_single_conn_established_ev_handler */ +/*---------------------------------------------------------------------------*/ +void xio_tcp_single_conn_established_ev_handler(int fd, + int events, void *user_context) +{ + struct xio_tcp_transport *tcp_hndl = (struct xio_tcp_transport *) + user_context; + struct xio_tcp_connect_msg msg; + + msg.sock_type = XIO_TCP_SINGLE_SOCK; + msg.second_port = 0; + msg.pad = 0; + xio_tcp_conn_established_helper( + fd, tcp_hndl, &msg, + events & + (XIO_POLLERR | XIO_POLLHUP | XIO_POLLRDHUP)); +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_cfd_conn_established_ev_handler */ +/*---------------------------------------------------------------------------*/ +void xio_tcp_cfd_conn_established_ev_handler(int fd, + int events, void *user_context) +{ + struct xio_tcp_transport *tcp_hndl = (struct xio_tcp_transport *) + user_context; + struct xio_tcp_connect_msg msg; + + msg.sock_type = XIO_TCP_CTL_SOCK; + msg.second_port = tcp_hndl->sock.port_dfd; + msg.pad = 0; + xio_tcp_conn_established_helper( + fd, tcp_hndl, &msg, + events & + (XIO_POLLERR | XIO_POLLHUP | XIO_POLLRDHUP)); +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_dfd_conn_established_ev_handler */ +/*---------------------------------------------------------------------------*/ +void xio_tcp_dfd_conn_established_ev_handler(int fd, + int events, void *user_context) +{ + struct xio_tcp_transport *tcp_hndl = (struct xio_tcp_transport *) + user_context; + int retval = 0; + int so_error = 0; + socklen_t so_error_len = sizeof(so_error); + struct xio_tcp_connect_msg msg; + + /* remove from epoll */ + retval = xio_context_del_ev_handler(tcp_hndl->base.ctx, + tcp_hndl->sock.dfd); + if (retval) { + ERROR_LOG("removing connection handler failed.(errno=%d %m)\n", + xio_get_last_socket_error()); + goto cleanup; + } + + retval = getsockopt(tcp_hndl->sock.dfd, + SOL_SOCKET, + SO_ERROR, + (char *)&so_error, + &so_error_len); + if (retval) { + ERROR_LOG("getsockopt failed. (errno=%d %m)\n", + xio_get_last_socket_error()); + so_error = xio_get_last_socket_error(); + } + if (so_error || + (events & (XIO_POLLERR | XIO_POLLHUP | XIO_POLLRDHUP))) { + DEBUG_LOG("fd=%d connection establishment failed\n", + tcp_hndl->sock.dfd); + DEBUG_LOG("so_error=%d, epoll_events=%d\n", so_error, events); + tcp_hndl->sock.ops->del_ev_handlers = NULL; + goto cleanup; + } + + /* add to epoll */ + retval = xio_context_add_ev_handler( + tcp_hndl->base.ctx, + tcp_hndl->sock.cfd, + XIO_POLLOUT | XIO_POLLRDHUP, + xio_tcp_cfd_conn_established_ev_handler, + tcp_hndl); + if (retval) { + ERROR_LOG("setting connection handler failed. (errno=%d %m)\n", + xio_get_last_socket_error()); + goto cleanup; + } + tcp_hndl->in_epoll[0] = 1; + + msg.sock_type = XIO_TCP_DATA_SOCK; + msg.second_port = tcp_hndl->sock.port_cfd; + msg.pad = 0; + retval = xio_tcp_send_connect_msg(tcp_hndl->sock.dfd, & + msg); + if (retval) + goto cleanup; + + return; + +cleanup: + if (so_error == XIO_ECONNREFUSED) + xio_transport_notify_observer(&tcp_hndl->base, + XIO_TRANSPORT_EVENT_REFUSED, + NULL); + else + xio_transport_notify_observer_error(&tcp_hndl->base, + so_error ? 
so_error : + XIO_E_CONNECT_ERROR); +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_connect_helper */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_connect_helper(int fd, struct sockaddr *sa, + socklen_t sa_len, uint16_t *bound_port, + struct sockaddr_storage *lss) +{ + int retval; + union xio_sockaddr *lsa = (union xio_sockaddr *)lss; + struct sockaddr_storage sa_stor; + socklen_t lsa_len = sizeof(struct sockaddr_storage); + + retval = connect(fd, sa, sa_len); + if (retval) { + if (xio_get_last_socket_error() == XIO_EINPROGRESS) { + /*set iomux for write event*/ + } else { + xio_set_error(xio_get_last_socket_error()); + ERROR_LOG("tcp connect failed. (errno=%d %m)\n", + xio_get_last_socket_error()); + return retval; + } + } else { + /*handle in ev_handler*/ + } + + if (!lss) + lsa = (union xio_sockaddr *)&sa_stor; + + retval = getsockname(fd, &lsa->sa, &lsa_len); + if (retval) { + xio_set_error(xio_get_last_socket_error()); + ERROR_LOG("tcp getsockname failed. (errno=%d %m)\n", + xio_get_last_socket_error()); + return retval; + } + + if (lsa->sa.sa_family == AF_INET) { + *bound_port = ntohs(lsa->sa_in.sin_port); + } else if (lsa->sa.sa_family == AF_INET6) { + *bound_port = ntohs(lsa->sa_in6.sin6_port); + } else { + ERROR_LOG("getsockname unknown family = %d\n", + lsa->sa.sa_family); + return -1; + } + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_single_sock_connect */ +/*---------------------------------------------------------------------------*/ +int xio_tcp_single_sock_connect(struct xio_tcp_transport *tcp_hndl, + struct sockaddr *sa, + socklen_t sa_len) +{ + int retval; + + retval = xio_tcp_connect_helper(tcp_hndl->sock.cfd, sa, sa_len, + &tcp_hndl->sock.port_cfd, + &tcp_hndl->base.local_addr); + if (retval) + return retval; + + /* add to epoll */ + retval = xio_context_add_ev_handler( + tcp_hndl->base.ctx, + tcp_hndl->sock.cfd, + XIO_POLLOUT | XIO_POLLRDHUP, + xio_tcp_single_conn_established_ev_handler, + tcp_hndl); + if (retval) { + ERROR_LOG("setting connection handler failed. (errno=%d %m)\n", + xio_get_last_socket_error()); + return retval; + } + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_dual_sock_connect */ +/*---------------------------------------------------------------------------*/ +int xio_tcp_dual_sock_connect(struct xio_tcp_transport *tcp_hndl, + struct sockaddr *sa, + socklen_t sa_len) +{ + int retval; + + tcp_hndl->tmp_rx_buf = ucalloc(1, TMP_RX_BUF_SIZE); + if (!tcp_hndl->tmp_rx_buf) { + xio_set_error(ENOMEM); + ERROR_LOG("ucalloc failed. %m\n"); + return -1; + } + tcp_hndl->tmp_rx_buf_cur = tcp_hndl->tmp_rx_buf; + + retval = xio_tcp_connect_helper(tcp_hndl->sock.cfd, sa, sa_len, + &tcp_hndl->sock.port_cfd, + &tcp_hndl->base.local_addr); + if (retval) + return retval; + + retval = xio_tcp_connect_helper(tcp_hndl->sock.dfd, sa, sa_len, + &tcp_hndl->sock.port_dfd, + NULL); + if (retval) + return retval; + + /* add to epoll */ + retval = xio_context_add_ev_handler( + tcp_hndl->base.ctx, + tcp_hndl->sock.dfd, + XIO_POLLOUT | XIO_POLLRDHUP, + xio_tcp_dfd_conn_established_ev_handler, + tcp_hndl); + if (retval) { + ERROR_LOG("setting connection handler failed. 
(errno=%d %m)\n", + xio_get_last_socket_error()); + return retval; + } + tcp_hndl->in_epoll[1] = 1; + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_connect */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_connect(struct xio_transport_base *transport, + const char *portal_uri, const char *out_if_addr) +{ + struct xio_tcp_transport *tcp_hndl = + (struct xio_tcp_transport *)transport; + union xio_sockaddr rsa; + socklen_t rsa_len = 0; + int retval = 0; + + /* resolve the portal_uri */ + rsa_len = xio_uri_to_ss(portal_uri, &rsa.sa_stor); + if (rsa_len == (socklen_t)-1) { + xio_set_error(XIO_E_ADDR_ERROR); + ERROR_LOG("address [%s] resolving failed\n", portal_uri); + goto exit1; + } + /* allocate memory for portal_uri */ + tcp_hndl->base.portal_uri = strdup(portal_uri); + if (!tcp_hndl->base.portal_uri) { + xio_set_error(ENOMEM); + ERROR_LOG("strdup failed. %m\n"); + goto exit1; + } + tcp_hndl->base.is_client = 1; + + if (out_if_addr) { + union xio_sockaddr if_sa; + int sa_len; + + sa_len = xio_host_port_to_ss(out_if_addr, &if_sa.sa_stor); + if (sa_len == -1) { + xio_set_error(XIO_E_ADDR_ERROR); + ERROR_LOG("outgoing interface [%s] resolving failed\n", + out_if_addr); + goto exit; + } + retval = bind(tcp_hndl->sock.cfd, + (struct sockaddr *)&if_sa.sa_stor, + sa_len); + if (retval) { + xio_set_error(xio_get_last_socket_error()); + ERROR_LOG("tcp bind failed. (errno=%d %m)\n", + xio_get_last_socket_error()); + goto exit; + } + } + + /* connect */ + retval = tcp_hndl->sock.ops->connect(tcp_hndl, + (struct sockaddr *)&rsa.sa_stor, + rsa_len); + if (retval) + goto exit; + + return 0; + +exit: + ufree(tcp_hndl->base.portal_uri); +exit1: + tcp_hndl->sock.ops->del_ev_handlers = NULL; + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_open */ +/*---------------------------------------------------------------------------*/ +static struct xio_transport_base *xio_tcp_open( + struct xio_transport *transport, + struct xio_context *ctx, + struct xio_observer *observer, + uint32_t trans_attr_mask, + struct xio_transport_init_attr *attr) +{ + struct xio_tcp_transport *tcp_hndl; + + tcp_hndl = xio_tcp_transport_create(transport, ctx, observer, 1); + if (!tcp_hndl) { + ERROR_LOG("failed. to create tcp transport%m\n"); + return NULL; + } + if (attr && trans_attr_mask) { + memcpy(&tcp_hndl->trans_attr, attr, sizeof(*attr)); + tcp_hndl->trans_attr_mask = trans_attr_mask; + } + + return (struct xio_transport_base *)tcp_hndl; +} + +/* + * To dynamically control C-states, open the file /dev/cpu_dma_latency and + * write the maximum allowable latency to it. This will prevent C-states with + * transition latencies higher than the specified value from being used, as + * long as the file /dev/cpu_dma_latency is kept open. + * Writing a maximum allowable latency of 0 will keep the processors in C0 + * (like using kernel parameter ―idle=poll), and writing 1 should force + * the processors to C1 when idle. Higher values could also be written to + * restrict the use of C-states with latency greater than the value written. 
+ * + * http://en.community.dell.com/techcenter/extras/m/white_papers/20227764/download.aspx + */ + +/*---------------------------------------------------------------------------*/ +/* xio_set_cpu_latency */ +/*---------------------------------------------------------------------------*/ +static int xio_set_cpu_latency(int *fd) +{ + int32_t latency = 0; + + if (!tcp_options.enable_dma_latency) + return 0; + + DEBUG_LOG("setting latency to %d us\n", latency); + *fd = open("/dev/cpu_dma_latency", O_WRONLY); + if (*fd < 0) { + ERROR_LOG( + "open /dev/cpu_dma_latency %m - need root permissions\n"); + return -1; + } + if (write(*fd, &latency, sizeof(latency)) != sizeof(latency)) { + ERROR_LOG( + "write to /dev/cpu_dma_latency %m - need root permissions\n"); + close(*fd); + *fd = -1; + return -1; + } + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_init */ +/*---------------------------------------------------------------------------*/ +static void xio_tcp_init(void) +{ + spin_lock_init(&mngmt_lock); + + /* set cpu latency until process is down */ + xio_set_cpu_latency(&cdl_fd); +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_init_msvc */ +/* This function is required for MSVC compilation under Windows */ +/*---------------------------------------------------------------------------*/ +int CALLBACK xio_tcp_init_msvc(thread_once_t *a, void *b, void **c) +{ + xio_tcp_init(); + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_transport_init */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_transport_init(struct xio_transport *transport) +{ + thread_once(&ctor_key_once, xio_tcp_init); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_release */ +/*---------------------------------------------------------------------------*/ +static void xio_tcp_release(void) +{ + if (cdl_fd >= 0) + xio_closesocket(cdl_fd); + + /*ORK todo close everything? 
see xio_cq_release*/ +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_release_msvc */ +/* This function is required for MSVC compilation under Windows */ +/*---------------------------------------------------------------------------*/ +int CALLBACK xio_tcp_release_msvc(thread_once_t *a, void *b, void **c) +{ + xio_tcp_release(); + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_transport_constructor */ +/*---------------------------------------------------------------------------*/ +void xio_tcp_transport_constructor(void) +{ +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_transport_destructor */ +/*---------------------------------------------------------------------------*/ +void xio_tcp_transport_destructor(void) +{ + reset_thread_once_t(&ctor_key_once); + reset_thread_once_t(&dtor_key_once); +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_transport_release */ +/*---------------------------------------------------------------------------*/ +static void xio_tcp_transport_release(struct xio_transport *transport) +{ + if (is_reset_thread_once_t(&ctor_key_once)) + return; + + thread_once(&dtor_key_once, xio_tcp_release); +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_rxd_init */ +/*---------------------------------------------------------------------------*/ +static void xio_tcp_rxd_init(struct xio_tcp_work_req *rxd, + void *buf, unsigned size) +{ + rxd->msg_iov[0].iov_base = buf; + rxd->msg_iov[0].iov_len = sizeof(struct xio_tlv); + rxd->msg_iov[1].iov_base = sum_to_ptr(rxd->msg_iov[0].iov_base, + rxd->msg_iov[0].iov_len); + rxd->msg_iov[1].iov_len = size - sizeof(struct xio_tlv); + rxd->msg_len = 2; + + rxd->tot_iov_byte_len = 0; + + rxd->stage = XIO_TCP_RX_START; + rxd->msg.msg_control = NULL; + rxd->msg.msg_controllen = 0; + rxd->msg.msg_flags = 0; + rxd->msg.msg_name = NULL; + rxd->msg.msg_namelen = 0; + rxd->msg.msg_iov = NULL; + rxd->msg.msg_iovlen = 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_txd_init */ +/*---------------------------------------------------------------------------*/ +static void xio_tcp_txd_init(struct xio_tcp_work_req *txd, + void *buf, unsigned size) +{ + txd->ctl_msg = buf; + txd->ctl_msg_len = 0; + txd->msg_iov[0].iov_base = buf; + txd->msg_iov[0].iov_len = size; + txd->msg_len = 1; + txd->tot_iov_byte_len = 0; + + txd->stage = XIO_TCP_TX_BEFORE; + txd->msg.msg_control = NULL; + txd->msg.msg_controllen = 0; + txd->msg.msg_flags = 0; + txd->msg.msg_name = NULL; + txd->msg.msg_namelen = 0; + txd->msg.msg_iov = NULL; + txd->msg.msg_iovlen = 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_task_init */ +/*---------------------------------------------------------------------------*/ +static void xio_tcp_task_init(struct xio_task *task, + struct xio_tcp_transport *tcp_hndl, + void *buf, + unsigned long size) +{ + XIO_TO_TCP_TASK(task, tcp_task); + + xio_tcp_rxd_init(&tcp_task->rxd, buf, size); + xio_tcp_txd_init(&tcp_task->txd, buf, size); + + /* initialize the mbuf */ + xio_mbuf_init(&task->mbuf, buf, size, 0); +} + +/* task pools management */ +/*---------------------------------------------------------------------------*/ +/* xio_tcp_initial_pool_slab_pre_create */ 
+/*---------------------------------------------------------------------------*/ +static int xio_tcp_initial_pool_slab_pre_create( + struct xio_transport_base *transport_hndl, + int alloc_nr, + void *pool_dd_data, void *slab_dd_data) +{ + struct xio_tcp_tasks_slab *tcp_slab = + (struct xio_tcp_tasks_slab *)slab_dd_data; + uint32_t pool_size; + + tcp_slab->buf_size = CONN_SETUP_BUF_SIZE; + pool_size = tcp_slab->buf_size * alloc_nr; + + tcp_slab->data_pool = ucalloc(pool_size * alloc_nr, sizeof(uint8_t)); + if (!tcp_slab->data_pool) { + xio_set_error(ENOMEM); + ERROR_LOG("ucalloc conn_setup_data_pool sz: %u failed\n", + pool_size); + return -1; + } + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_initial_task_alloc */ +/*---------------------------------------------------------------------------*/ +static inline struct xio_task *xio_tcp_initial_task_alloc( + struct xio_tcp_transport *tcp_hndl) +{ + if (tcp_hndl->initial_pool_cls.task_get) { + struct xio_task *task = tcp_hndl->initial_pool_cls.task_get( + tcp_hndl->initial_pool_cls.pool, + tcp_hndl); + return task; + } + return NULL; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_primary_task_alloc */ +/*---------------------------------------------------------------------------*/ +struct xio_task *xio_tcp_primary_task_alloc( + struct xio_tcp_transport *tcp_hndl) +{ + if (tcp_hndl->primary_pool_cls.task_get) { + struct xio_task *task = tcp_hndl->primary_pool_cls.task_get( + tcp_hndl->primary_pool_cls.pool, + tcp_hndl); + return task; + } + return NULL; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_primary_task_lookup */ +/*---------------------------------------------------------------------------*/ +struct xio_task *xio_tcp_primary_task_lookup( + struct xio_tcp_transport *tcp_hndl, + int tid) +{ + if (tcp_hndl->primary_pool_cls.task_lookup) + return tcp_hndl->primary_pool_cls.task_lookup( + tcp_hndl->primary_pool_cls.pool, tid); + return NULL; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_task_free */ +/*---------------------------------------------------------------------------*/ +inline void xio_tcp_task_free(struct xio_tcp_transport *tcp_hndl, + struct xio_task *task) +{ + if (tcp_hndl->primary_pool_cls.task_put) + return tcp_hndl->primary_pool_cls.task_put(task); +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_initial_pool_post_create */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_initial_pool_post_create( + struct xio_transport_base *transport_hndl, + void *pool, void *pool_dd_data) +{ + struct xio_task *task; + struct xio_tcp_task *tcp_task; + struct xio_tcp_transport *tcp_hndl = + (struct xio_tcp_transport *)transport_hndl; + + if (!tcp_hndl) + return 0; + + tcp_hndl->initial_pool_cls.pool = pool; + + task = xio_tcp_initial_task_alloc(tcp_hndl); + if (!task) { + ERROR_LOG("failed to get task\n"); + } else { + list_add_tail(&task->tasks_list_entry, &tcp_hndl->rx_list); + tcp_task = (struct xio_tcp_task *)task->dd_data; + tcp_task->out_tcp_op = XIO_TCP_RECV; + } + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_initial_pool_slab_destroy */ +/*---------------------------------------------------------------------------*/ +static int 
xio_tcp_initial_pool_slab_destroy( + struct xio_transport_base *transport_hndl, + void *pool_dd_data, void *slab_dd_data) +{ + struct xio_tcp_tasks_slab *tcp_slab = + (struct xio_tcp_tasks_slab *)slab_dd_data; + + ufree(tcp_slab->data_pool); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_initial_pool_slab_init_task */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_initial_pool_slab_init_task( + struct xio_transport_base *transport_hndl, + void *pool_dd_data, void *slab_dd_data, + int tid, struct xio_task *task) +{ + struct xio_tcp_transport *tcp_hndl = + (struct xio_tcp_transport *)transport_hndl; + struct xio_tcp_tasks_slab *tcp_slab = + (struct xio_tcp_tasks_slab *)slab_dd_data; + void *buf = sum_to_ptr(tcp_slab->data_pool, + tid * ALIGN(tcp_slab->buf_size, PAGE_SIZE)); + char *ptr; + + XIO_TO_TCP_TASK(task, tcp_task); + + /* fill xio_tcp_task */ + ptr = (char *)tcp_task; + ptr += sizeof(struct xio_tcp_task); + + /* fill xio_tcp_work_req */ + tcp_task->txd.msg_iov = (struct iovec *)ptr; + ptr += sizeof(struct iovec); + + tcp_task->rxd.msg_iov = (struct iovec *)ptr; + ptr += 2 * sizeof(struct iovec); + /*****************************************/ + + xio_tcp_task_init( + task, + tcp_hndl, + buf, + tcp_slab->buf_size); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_initial_pool_get_params */ +/*---------------------------------------------------------------------------*/ +static void xio_tcp_initial_pool_get_params( + struct xio_transport_base *transport_hndl, + int *start_nr, int *max_nr, int *alloc_nr, + int *pool_dd_sz, int *slab_dd_sz, int *task_dd_sz) +{ + + *start_nr = 10 * NUM_CONN_SETUP_TASKS; + *alloc_nr = 10 * NUM_CONN_SETUP_TASKS; + *max_nr = 10 * NUM_CONN_SETUP_TASKS; + + *pool_dd_sz = 0; + *slab_dd_sz = sizeof(struct xio_tcp_tasks_slab); + *task_dd_sz = sizeof(struct xio_tcp_task) + 3 * sizeof(struct iovec); +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_task_pre_put */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_task_pre_put( + struct xio_transport_base *trans_hndl, + struct xio_task *task) +{ + XIO_TO_TCP_TASK(task, tcp_task); + XIO_TO_TCP_HNDL(task, tcp_hndl); + unsigned int i; + + /* recycle TCP buffers back to pool */ + + /* put buffers back to pool */ + + for (i = 0; i < tcp_task->read_num_reg_mem; i++) { + if (tcp_task->read_reg_mem[i].priv) { + xio_mempool_free(&tcp_task->read_reg_mem[i]); + tcp_task->read_reg_mem[i].priv = NULL; + } + } + tcp_task->read_num_reg_mem = 0; + + for (i = 0; i < tcp_task->write_num_reg_mem; i++) { + if (tcp_task->write_reg_mem[i].priv) { + xio_mempool_free(&tcp_task->write_reg_mem[i]); + tcp_task->write_reg_mem[i].priv = NULL; + } + } + tcp_task->write_num_reg_mem = 0; + tcp_task->req_in_num_sge = 0; + tcp_task->req_out_num_sge = 0; + tcp_task->rsp_out_num_sge = 0; + tcp_task->sn = 0; + + tcp_task->out_tcp_op = XIO_TCP_NULL; + + xio_tcp_rxd_init(&tcp_task->rxd, + task->mbuf.buf.head, + task->mbuf.buf.buflen); + xio_tcp_txd_init(&tcp_task->txd, + task->mbuf.buf.head, + task->mbuf.buf.buflen); + + xio_ctx_del_work(tcp_hndl->base.ctx, &tcp_task->comp_work); + + return 0; +} + +static struct xio_tasks_pool_ops initial_tasks_pool_ops; +/*---------------------------------------------------------------------------*/ +static void init_initial_tasks_pool_ops(void) 
+{ + initial_tasks_pool_ops.pool_get_params = + xio_tcp_initial_pool_get_params; + initial_tasks_pool_ops.slab_pre_create = + xio_tcp_initial_pool_slab_pre_create; + initial_tasks_pool_ops.slab_destroy = + xio_tcp_initial_pool_slab_destroy; + initial_tasks_pool_ops.slab_init_task = + xio_tcp_initial_pool_slab_init_task; + initial_tasks_pool_ops.pool_post_create = + xio_tcp_initial_pool_post_create; + initial_tasks_pool_ops.task_pre_put = + xio_tcp_task_pre_put; +}; + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_primary_pool_slab_pre_create */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_primary_pool_slab_pre_create( + struct xio_transport_base *transport_hndl, + int alloc_nr, void *pool_dd_data, void *slab_dd_data) +{ + struct xio_tcp_tasks_slab *tcp_slab = + (struct xio_tcp_tasks_slab *)slab_dd_data; + size_t inline_buf_sz = xio_tcp_get_inline_buffer_size(); + size_t alloc_sz = alloc_nr * ALIGN(inline_buf_sz, PAGE_SIZE); + int retval; + + tcp_slab->buf_size = inline_buf_sz; + + if (disable_huge_pages) { + retval = xio_mem_alloc(alloc_sz, &tcp_slab->reg_mem); + if (retval) { + xio_set_error(ENOMEM); + ERROR_LOG("xio_alloc tcp pool sz:%zu failed\n", + alloc_sz); + return -1; + } + tcp_slab->data_pool = tcp_slab->reg_mem.addr; + } else { + /* maybe allocation of with unuma_alloc can provide better + * performance? + */ + tcp_slab->data_pool = umalloc_huge_pages(alloc_sz); + if (!tcp_slab->data_pool) { + xio_set_error(ENOMEM); + ERROR_LOG("malloc tcp pool sz:%zu failed\n", + alloc_sz); + return -1; + } + } + + DEBUG_LOG("pool buf:%p\n", tcp_slab->data_pool); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_primary_pool_post_create */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_primary_pool_post_create( + struct xio_transport_base *transport_hndl, + void *pool, void *pool_dd_data) +{ + struct xio_task *task = NULL; + struct xio_tcp_task *tcp_task = NULL; + int i; + struct xio_tcp_transport *tcp_hndl = + (struct xio_tcp_transport *)transport_hndl; + + if (!tcp_hndl) + return 0; + + tcp_hndl->primary_pool_cls.pool = pool; + + for (i = 0; i < RX_LIST_POST_NR; i++) { + /* get ready to receive message */ + task = xio_tcp_primary_task_alloc(tcp_hndl); + if (task == 0) { + ERROR_LOG("primary task pool is empty\n"); + return -1; + } + tcp_task = (struct xio_tcp_task *)task->dd_data; + tcp_task->out_tcp_op = XIO_TCP_RECV; + list_add_tail(&task->tasks_list_entry, &tcp_hndl->rx_list); + } + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_primary_pool_slab_destroy */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_primary_pool_slab_destroy( + struct xio_transport_base *transport_hndl, + void *pool_dd_data, void *slab_dd_data) +{ + struct xio_tcp_tasks_slab *tcp_slab = + (struct xio_tcp_tasks_slab *)slab_dd_data; + + if (tcp_slab->reg_mem.addr) + xio_mem_free(&tcp_slab->reg_mem); + else + ufree_huge_pages(tcp_slab->data_pool); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_primary_pool_slab_init_task */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_primary_pool_slab_init_task( + struct xio_transport_base *transport_hndl, + void *pool_dd_data, + void 
*slab_dd_data, int tid, struct xio_task *task) +{ + struct xio_tcp_transport *tcp_hndl = + (struct xio_tcp_transport *)transport_hndl; + struct xio_tcp_tasks_slab *tcp_slab = + (struct xio_tcp_tasks_slab *)slab_dd_data; + void *buf = sum_to_ptr(tcp_slab->data_pool, tid * tcp_slab->buf_size); + int max_iovsz = max(tcp_options.max_out_iovsz, + tcp_options.max_in_iovsz) + 1; + char *ptr; + + XIO_TO_TCP_TASK(task, tcp_task); + + /* fill xio_tco_task */ + ptr = (char *)tcp_task; + ptr += sizeof(struct xio_tcp_task); + + /* fill xio_tcp_work_req */ + tcp_task->txd.msg_iov = (struct iovec *)ptr; + ptr += (max_iovsz + 1) * sizeof(struct iovec); + tcp_task->rxd.msg_iov = (struct iovec *)ptr; + ptr += (max_iovsz + 1) * sizeof(struct iovec); + + tcp_task->read_reg_mem = (struct xio_reg_mem *)ptr; + ptr += max_iovsz * sizeof(struct xio_reg_mem); + tcp_task->write_reg_mem = (struct xio_reg_mem *)ptr; + ptr += max_iovsz * sizeof(struct xio_reg_mem); + + tcp_task->req_in_sge = (struct xio_sge *)ptr; + ptr += max_iovsz * sizeof(struct xio_sge); + tcp_task->req_out_sge = (struct xio_sge *)ptr; + ptr += max_iovsz * sizeof(struct xio_sge); + tcp_task->rsp_out_sge = (struct xio_sge *)ptr; + ptr += max_iovsz * sizeof(struct xio_sge); + /*****************************************/ + + tcp_task->out_tcp_op = (enum xio_tcp_op_code)0x200; + xio_tcp_task_init( + task, + tcp_hndl, + buf, + tcp_slab->buf_size); + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_primary_pool_get_params */ +/*---------------------------------------------------------------------------*/ +static void xio_tcp_primary_pool_get_params( + struct xio_transport_base *transport_hndl, + int *start_nr, int *max_nr, int *alloc_nr, + int *pool_dd_sz, int *slab_dd_sz, int *task_dd_sz) +{ + int max_iovsz = max(tcp_options.max_out_iovsz, + tcp_options.max_in_iovsz) + 1; + + /* per transport */ + *start_nr = NUM_START_PRIMARY_POOL_TASKS; + *alloc_nr = NUM_ALLOC_PRIMARY_POOL_TASKS; + *max_nr = max((g_options.snd_queue_depth_msgs + + g_options.rcv_queue_depth_msgs), *start_nr); + + *pool_dd_sz = 0; + *slab_dd_sz = sizeof(struct xio_tcp_tasks_slab); + *task_dd_sz = sizeof(struct xio_tcp_task) + + (2 * (max_iovsz + 1)) * sizeof(struct iovec) + + 2 * max_iovsz * sizeof(struct xio_reg_mem) + + 3 * max_iovsz * sizeof(struct xio_sge); +} + +static struct xio_tasks_pool_ops primary_tasks_pool_ops; +/*---------------------------------------------------------------------------*/ +static void init_primary_tasks_pool_ops(void) +{ + primary_tasks_pool_ops.pool_get_params = + xio_tcp_primary_pool_get_params; + primary_tasks_pool_ops.slab_pre_create = + xio_tcp_primary_pool_slab_pre_create; + primary_tasks_pool_ops.slab_destroy = + xio_tcp_primary_pool_slab_destroy; + primary_tasks_pool_ops.slab_init_task = + xio_tcp_primary_pool_slab_init_task; + primary_tasks_pool_ops.pool_post_create = + xio_tcp_primary_pool_post_create; + primary_tasks_pool_ops.task_pre_put = xio_tcp_task_pre_put; +}; + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_get_pools_ops */ +/*---------------------------------------------------------------------------*/ +static void xio_tcp_get_pools_ops(struct xio_transport_base *trans_hndl, + struct xio_tasks_pool_ops **initial_pool_ops, + struct xio_tasks_pool_ops **primary_pool_ops) +{ + *initial_pool_ops = &initial_tasks_pool_ops; + *primary_pool_ops = &primary_tasks_pool_ops; +} + 
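Editor's note: the two primary-pool callbacks above, xio_tcp_primary_pool_get_params() and xio_tcp_primary_pool_slab_init_task(), only work together because the first reserves exactly the per-task space (*task_dd_sz) that the second later carves into iovec, xio_reg_mem and xio_sge arrays, in that order. The standalone C sketch below is not part of the patch; the structure stand-ins, option values and main() driver are illustrative assumptions. It reproduces the size formula so the two callbacks can be cross-checked: if the carving order or element counts in slab_init_task change, this formula has to change with them or the pointer arithmetic walks past the allocation.

#include <stdio.h>
#include <sys/uio.h>	/* struct iovec */

/*
 * Stand-ins for the real xio structures; only their sizes matter for
 * this calculation, so the field layout here is illustrative.
 */
struct sketch_reg_mem { void *addr; size_t length; void *mr; void *priv; };
struct sketch_sge { unsigned long long addr; unsigned int length; unsigned int stag; };

/* Mirrors the *task_dd_sz formula in xio_tcp_primary_pool_get_params(). */
static size_t primary_task_dd_sz(int max_in_iovsz, int max_out_iovsz,
				 size_t tcp_task_sz)
{
	int max_iovsz = (max_in_iovsz > max_out_iovsz ?
			 max_in_iovsz : max_out_iovsz) + 1;

	return tcp_task_sz +
	       (size_t)(2 * (max_iovsz + 1)) * sizeof(struct iovec) +
	       (size_t)(2 * max_iovsz) * sizeof(struct sketch_reg_mem) +
	       (size_t)(3 * max_iovsz) * sizeof(struct sketch_sge);
}

int main(void)
{
	/* illustrative values only; the real defaults come from tcp_options */
	int max_in_iovsz = 128, max_out_iovsz = 128;
	size_t tcp_task_sz = 256;	/* placeholder for sizeof(struct xio_tcp_task) */

	printf("per-task dd area: %zu bytes\n",
	       primary_task_dd_sz(max_in_iovsz, max_out_iovsz, tcp_task_sz));
	return 0;
}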
+/*---------------------------------------------------------------------------*/ +/* xio_tcp_set_pools_cls */ +/*---------------------------------------------------------------------------*/ +static void xio_tcp_set_pools_cls(struct xio_transport_base *trans_hndl, + struct xio_tasks_pool_cls *initial_pool_cls, + struct xio_tasks_pool_cls *primary_pool_cls) +{ + struct xio_tcp_transport *tcp_hndl = + (struct xio_tcp_transport *)trans_hndl; + + if (initial_pool_cls) + tcp_hndl->initial_pool_cls = *initial_pool_cls; + if (primary_pool_cls) + tcp_hndl->primary_pool_cls = *primary_pool_cls; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_set_opt */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_set_opt(void *xio_obj, + int optname, const void *optval, int optlen) +{ + switch (optname) { + case XIO_OPTNAME_ENABLE_MEM_POOL: + VALIDATE_SZ(sizeof(int)); + tcp_options.enable_mem_pool = *((int *)optval); + return 0; + case XIO_OPTNAME_ENABLE_DMA_LATENCY: + VALIDATE_SZ(sizeof(int)); + tcp_options.enable_dma_latency = *((int *)optval); + return 0; + case XIO_OPTNAME_MAX_IN_IOVLEN: + VALIDATE_SZ(sizeof(int)); + tcp_options.max_in_iovsz = *((int *)optval); + return 0; + case XIO_OPTNAME_MAX_OUT_IOVLEN: + VALIDATE_SZ(sizeof(int)); + tcp_options.max_out_iovsz = *((int *)optval); + return 0; + case XIO_OPTNAME_TCP_ENABLE_MR_CHECK: + VALIDATE_SZ(sizeof(int)); + tcp_options.enable_mr_check = *((int *)optval); + return 0; + case XIO_OPTNAME_TCP_NO_DELAY: + VALIDATE_SZ(sizeof(int)); + tcp_options.tcp_no_delay = *((int *)optval); + return 0; + case XIO_OPTNAME_TCP_SO_SNDBUF: + VALIDATE_SZ(sizeof(int)); + tcp_options.tcp_so_sndbuf = *((int *)optval); + return 0; + case XIO_OPTNAME_TCP_SO_RCVBUF: + VALIDATE_SZ(sizeof(int)); + tcp_options.tcp_so_rcvbuf = *((int *)optval); + return 0; + case XIO_OPTNAME_TCP_DUAL_STREAM: + VALIDATE_SZ(sizeof(int)); + tcp_options.tcp_dual_sock = *((int *)optval); + return 0; + default: + break; + } + xio_set_error(XIO_E_NOT_SUPPORTED); + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_get_opt */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_get_opt(void *xio_obj, + int optname, void *optval, int *optlen) +{ + switch (optname) { + case XIO_OPTNAME_ENABLE_MEM_POOL: + *((int *)optval) = tcp_options.enable_mem_pool; + *optlen = sizeof(int); + return 0; + case XIO_OPTNAME_ENABLE_DMA_LATENCY: + *((int *)optval) = tcp_options.enable_dma_latency; + *optlen = sizeof(int); + return 0; + case XIO_OPTNAME_MAX_IN_IOVLEN: + *((int *)optval) = tcp_options.max_in_iovsz; + *optlen = sizeof(int); + return 0; + case XIO_OPTNAME_MAX_OUT_IOVLEN: + *((int *)optval) = tcp_options.max_out_iovsz; + *optlen = sizeof(int); + return 0; + case XIO_OPTNAME_TCP_ENABLE_MR_CHECK: + *((int *)optval) = tcp_options.enable_mr_check; + *optlen = sizeof(int); + return 0; + case XIO_OPTNAME_TCP_NO_DELAY: + *((int *)optval) = tcp_options.tcp_no_delay; + *optlen = sizeof(int); + return 0; + case XIO_OPTNAME_TCP_SO_SNDBUF: + *((int *)optval) = tcp_options.tcp_so_sndbuf; + *optlen = sizeof(int); + return 0; + case XIO_OPTNAME_TCP_SO_RCVBUF: + *((int *)optval) = tcp_options.tcp_so_rcvbuf; + *optlen = sizeof(int); + return 0; + case XIO_OPTNAME_TCP_DUAL_STREAM: + *((int *)optval) = tcp_options.tcp_dual_sock; + *optlen = sizeof(int); + return 0; + default: + break; + } + xio_set_error(XIO_E_NOT_SUPPORTED); + return 
-1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_is_valid_in_req */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_is_valid_in_req(struct xio_msg *msg) +{ + unsigned int i; + unsigned int mr_found = 0; + struct xio_vmsg *vmsg = &msg->in; + struct xio_sg_table_ops *sgtbl_ops; + void *sgtbl; + void *sge; + unsigned long nents, max_nents; + + sgtbl = xio_sg_table_get(&msg->in); + sgtbl_ops = (struct xio_sg_table_ops *) + xio_sg_table_ops_get(msg->in.sgl_type); + nents = tbl_nents(sgtbl_ops, sgtbl); + max_nents = tbl_max_nents(sgtbl_ops, sgtbl); + + if ((nents > (unsigned long)tcp_options.max_in_iovsz) || + (nents > max_nents) || + (max_nents > (unsigned long)tcp_options.max_in_iovsz)) { + return 0; + } + + if (vmsg->sgl_type == XIO_SGL_TYPE_IOV && nents > XIO_IOVLEN) + return 0; + + if (vmsg->header.iov_base && + (vmsg->header.iov_len == 0)) + return 0; + + for_each_sge(sgtbl, sgtbl_ops, sge, i) { + if (sge_mr(sgtbl_ops, sge)) + mr_found++; + if (!sge_addr(sgtbl_ops, sge)) { + if (sge_mr(sgtbl_ops, sge)) + return 0; + } else { + if (sge_length(sgtbl_ops, sge) == 0) + return 0; + } + } + if (tcp_options.enable_mr_check && + (mr_found != nents) && mr_found) + return 0; + + return 1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_is_valid_out_msg */ +/*---------------------------------------------------------------------------*/ +static int xio_tcp_is_valid_out_msg(struct xio_msg *msg) +{ + unsigned int i; + unsigned int mr_found = 0; + struct xio_vmsg *vmsg = &msg->out; + struct xio_sg_table_ops *sgtbl_ops; + void *sgtbl; + void *sge; + unsigned long nents, max_nents; + + sgtbl = xio_sg_table_get(&msg->out); + sgtbl_ops = (struct xio_sg_table_ops *) + xio_sg_table_ops_get(msg->out.sgl_type); + nents = tbl_nents(sgtbl_ops, sgtbl); + max_nents = tbl_max_nents(sgtbl_ops, sgtbl); + + if ((nents > (unsigned long)tcp_options.max_out_iovsz) || + (nents > max_nents) || + (max_nents > (unsigned long)tcp_options.max_out_iovsz)) + return 0; + + if (vmsg->sgl_type == XIO_SGL_TYPE_IOV && nents > XIO_IOVLEN) + return 0; + + if ((vmsg->header.iov_base && + (vmsg->header.iov_len == 0)) || + (!vmsg->header.iov_base && + (vmsg->header.iov_len != 0))) + return 0; + + if (vmsg->header.iov_len > (size_t)g_options.max_inline_xio_hdr) + return 0; + + for_each_sge(sgtbl, sgtbl_ops, sge, i) { + if (sge_mr(sgtbl_ops, sge)) + mr_found++; + if (!sge_addr(sgtbl_ops, sge) || + (sge_length(sgtbl_ops, sge) == 0)) + return 0; + } + + if (tcp_options.enable_mr_check && + (mr_found != nents) && mr_found) + return 0; + + return 1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_dup2 */ +/* makes new_trans_hndl be the copy of old_trans_hndl, closes new_trans_hndl */ +/* Note old and new are in dup2 terminology opposite to reconnect terms */ +/* --------------------------------------------------------------------------*/ +static int xio_tcp_dup2(struct xio_transport_base *old_trans_hndl, + struct xio_transport_base **new_trans_hndl) +{ + xio_tcp_close(*new_trans_hndl); + + /* conn layer will call close which will only decrement */ + /*kref_get(&old_trans_hndl->kref);*/ + *new_trans_hndl = old_trans_hndl; + + return 0; +} + +/*---------------------------------------------------------------------------*/ +static void init_single_sock_ops(void) +{ + single_sock_ops.open = xio_tcp_single_sock_create; + single_sock_ops.add_ev_handlers = 
xio_tcp_single_sock_add_ev_handlers; + single_sock_ops.del_ev_handlers = xio_tcp_single_sock_del_ev_handlers; + single_sock_ops.connect = xio_tcp_single_sock_connect; + single_sock_ops.set_txd = xio_tcp_single_sock_set_txd; + single_sock_ops.set_rxd = xio_tcp_single_sock_set_rxd; + single_sock_ops.set_rxd_iov = xio_tcp_single_sock_set_rxd_iov; + single_sock_ops.rx_ctl_work = xio_tcp_recvmsg_work; + single_sock_ops.rx_ctl_handler = xio_tcp_single_sock_rx_ctl_handler; + single_sock_ops.rx_data_handler = xio_tcp_rx_data_handler; + single_sock_ops.shutdown = xio_tcp_single_sock_shutdown; + single_sock_ops.close = xio_tcp_single_sock_close; +}; + +/*---------------------------------------------------------------------------*/ +static void init_dual_sock_ops(void) +{ + dual_sock_ops.open = xio_tcp_dual_sock_create; + dual_sock_ops.add_ev_handlers = xio_tcp_dual_sock_add_ev_handlers; + dual_sock_ops.del_ev_handlers = xio_tcp_dual_sock_del_ev_handlers; + dual_sock_ops.connect = xio_tcp_dual_sock_connect; + dual_sock_ops.set_txd = xio_tcp_dual_sock_set_txd; + dual_sock_ops.set_rxd = xio_tcp_dual_sock_set_rxd; + dual_sock_ops.set_rxd_iov = xio_tcp_dual_sock_set_rxd_iov; + dual_sock_ops.rx_ctl_work = xio_tcp_recv_ctl_work; + dual_sock_ops.rx_ctl_handler = xio_tcp_dual_sock_rx_ctl_handler; + dual_sock_ops.rx_data_handler = xio_tcp_rx_data_handler; + dual_sock_ops.shutdown = xio_tcp_dual_sock_shutdown; + dual_sock_ops.close = xio_tcp_dual_sock_close; +}; + +struct xio_transport xio_tcp_transport; +/*---------------------------------------------------------------------------*/ +static void init_xio_tcp_transport(void) +{ + xio_tcp_transport.name = "tcp"; + xio_tcp_transport.ctor = xio_tcp_transport_constructor; + xio_tcp_transport.dtor = xio_tcp_transport_destructor; + xio_tcp_transport.init = xio_tcp_transport_init; + xio_tcp_transport.release = xio_tcp_transport_release; + xio_tcp_transport.context_shutdown = xio_tcp_context_shutdown; + xio_tcp_transport.open = xio_tcp_open; + xio_tcp_transport.connect = xio_tcp_connect; + xio_tcp_transport.listen = xio_tcp_listen; + xio_tcp_transport.accept = xio_tcp_accept; + xio_tcp_transport.reject = xio_tcp_reject; + xio_tcp_transport.close = xio_tcp_close; + xio_tcp_transport.dup2 = xio_tcp_dup2; + /* .update_task = xio_tcp_update_task;*/ + xio_tcp_transport.send = xio_tcp_send; + xio_tcp_transport.poll = xio_tcp_poll; + xio_tcp_transport.set_opt = xio_tcp_set_opt; + xio_tcp_transport.get_opt = xio_tcp_get_opt; + xio_tcp_transport.cancel_req = xio_tcp_cancel_req; + xio_tcp_transport.cancel_rsp = xio_tcp_cancel_rsp; + xio_tcp_transport.get_pools_setup_ops = xio_tcp_get_pools_ops; + xio_tcp_transport.set_pools_cls = xio_tcp_set_pools_cls; + + xio_tcp_transport.validators_cls.is_valid_in_req = + xio_tcp_is_valid_in_req; + xio_tcp_transport.validators_cls.is_valid_out_msg = + xio_tcp_is_valid_out_msg; +} + +/*---------------------------------------------------------------------------*/ +static void init_static_structs(void) +{ + init_initial_tasks_pool_ops(); + init_primary_tasks_pool_ops(); + init_single_sock_ops(); + init_dual_sock_ops(); + init_xio_tcp_transport(); +} + +/*---------------------------------------------------------------------------*/ +/* xio_tcp_get_transport_func_list */ +/*---------------------------------------------------------------------------*/ +struct xio_transport *xio_tcp_get_transport_func_list(void) +{ + init_static_structs(); + return &xio_tcp_transport; +} diff --git a/open_src/xio/src/usr/transport/tcp/xio_tcp_transport.h 
b/open_src/xio/src/usr/transport/tcp/xio_tcp_transport.h new file mode 100644 index 0000000..02865a8 --- /dev/null +++ b/open_src/xio/src/usr/transport/tcp/xio_tcp_transport.h @@ -0,0 +1,411 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef XIO_TCP_TRANSPORT_H_ +#define XIO_TCP_TRANSPORT_H_ + +struct xio_tcp_transport; +struct xio_tcp_socket; + +/*---------------------------------------------------------------------------*/ +/* externals */ +/*---------------------------------------------------------------------------*/ +extern double g_mhz; + +/* definitions */ +#define NUM_TASKS 54400 /* 100 * (MAX_SEND_WR + + * MAX_RECV_WR + EXTRA_RQE) + */ + +#define RX_LIST_POST_NR 31 /* Initial number of buffers + * to put in the rx_list + */ + +#define COMPLETION_BATCH_MAX 64 /* Trigger TX completion every + * COMPLETION_BATCH_MAX + * packets + */ + +#define TX_BATCH 32 /* Number of TX tasks to batch */ + +#define TX_EAGAIN_RETRY 2 /* Number of retries when send + * fail with EAGAIN before return. 
+ */ + +#define RX_POLL_NR_MAX 4 /* Max num of RX messages + * to receive in one poll + */ + +#define RX_BATCH 32 /* Number of RX tasks to batch */ + +#define MAX_BACKLOG 1024 /* listen socket max backlog */ + +#define TMP_RX_BUF_SIZE (RX_BATCH * MAX_HDR_SZ) + +#define XIO_TO_TCP_TASK(xt, tt) \ + struct xio_tcp_task *(tt) = \ + (struct xio_tcp_task *)(xt)->dd_data +#define XIO_TO_TCP_HNDL(xt, th) \ + struct xio_tcp_transport *(th) = \ + (struct xio_tcp_transport *)(xt)->context + +#define PAGE_SIZE page_size + +/*---------------------------------------------------------------------------*/ +/* enums */ +/*---------------------------------------------------------------------------*/ +enum xio_tcp_op_code { + XIO_TCP_NULL, + XIO_TCP_RECV = 1, + XIO_TCP_SEND, + XIO_TCP_WRITE, + XIO_TCP_READ +}; + +enum xio_tcp_rx_stage { + XIO_TCP_RX_START, + XIO_TCP_RX_TLV, + XIO_TCP_RX_HEADER, + XIO_TCP_RX_IO_DATA, + XIO_TCP_RX_DONE +}; + +enum xio_tcp_tx_stage { + XIO_TCP_TX_BEFORE, + XIO_TCP_TX_IN_SEND_CTL, + XIO_TCP_TX_IN_SEND_DATA, + XIO_TCP_TX_DONE +}; + +enum xio_tcp_sock_type { + XIO_TCP_SINGLE_SOCK = 1, + XIO_TCP_CTL_SOCK, + XIO_TCP_DATA_SOCK +}; + +/*---------------------------------------------------------------------------*/ +struct xio_tcp_options { + int enable_mem_pool; + int enable_dma_latency; + int enable_mr_check; + int max_in_iovsz; + int max_out_iovsz; + int tcp_no_delay; + int tcp_so_sndbuf; + int tcp_so_rcvbuf; + int tcp_dual_sock; + int pad; +}; + +#define XIO_TCP_REQ_HEADER_VERSION 1 + +PACKED_MEMORY(struct xio_tcp_req_hdr { + uint8_t version; /* request version */ + uint8_t flags; + uint16_t req_hdr_len; /* req header length */ + uint16_t sn; /* serial number */ + uint16_t pad0; + + uint32_t ltid; /* originator identifier*/ + uint16_t pad; + uint8_t in_tcp_op; /* opcode for peers */ + uint8_t out_tcp_op; + + uint16_t in_num_sge; + uint16_t out_num_sge; + uint32_t pad1; + + uint16_t ulp_hdr_len; /* ulp header length */ + uint16_t ulp_pad_len; /* pad_len length */ + uint32_t remain_data_len;/* remaining data length */ + + uint64_t ulp_imm_len; /* ulp data length */ +}); + +#define XIO_TCP_RSP_HEADER_VERSION 1 + +PACKED_MEMORY(struct xio_tcp_rsp_hdr { + uint8_t version; /* response version */ + uint8_t flags; + uint16_t rsp_hdr_len; /* rsp header length */ + uint16_t sn; /* serial number */ + uint16_t pad; + + uint32_t ltid; /* local task id */ + uint32_t rtid; /* remote task id */ + + uint8_t out_tcp_op; /* opcode for peers */ + uint8_t pad1; + uint16_t out_num_sge; + uint32_t status; /* status */ + + uint16_t ulp_hdr_len; /* ulp header length */ + uint16_t ulp_pad_len; /* pad_len length */ + uint32_t remain_data_len;/* remaining data length */ + + uint64_t ulp_imm_len; /* ulp data length */ +}); + +PACKED_MEMORY(struct xio_tcp_connect_msg { + enum xio_tcp_sock_type sock_type; + uint16_t second_port; + uint16_t pad; +}); + +PACKED_MEMORY(struct xio_tcp_setup_msg { + uint64_t buffer_sz; + uint32_t max_in_iovsz; + uint32_t max_out_iovsz; + uint32_t max_header_len; + uint32_t pad; +}); + +PACKED_MEMORY(struct xio_tcp_cancel_hdr { + uint16_t hdr_len; /* req header length */ + uint16_t sn; /* msg serial number */ + uint32_t result; +}); + +struct xio_tcp_work_req { + struct iovec *msg_iov; + uint32_t msg_len; + uint32_t pad; + uint64_t tot_iov_byte_len; + void *ctl_msg; + uint32_t ctl_msg_len; + int stage; + struct msghdr msg; +}; + +struct xio_tcp_task { + enum xio_tcp_op_code in_tcp_op; + enum xio_tcp_op_code out_tcp_op; + + struct xio_tcp_work_req txd; + struct xio_tcp_work_req rxd; + + 
+ /* User (from vmsg) or pool buffer used for */ + uint16_t read_num_reg_mem; + uint16_t write_num_reg_mem; + uint32_t pad0; + + struct xio_reg_mem *read_reg_mem; + struct xio_reg_mem *write_reg_mem; + + uint16_t req_in_num_sge; + uint16_t req_out_num_sge; + uint16_t rsp_out_num_sge; + uint16_t sn; + + /* What this side got from the peer for SEND */ + /* What this side got from the peer for RDMA equivalent R/W + */ + /* can serve send/rdma write */ + struct xio_sge *req_in_sge; + + /* can serve send/rdma read */ + struct xio_sge *req_out_sge; + + /* can serve send/rdma read response/rdma write */ + struct xio_sge *rsp_out_sge; + + xio_work_handle_t comp_work; +}; + +struct xio_tcp_tasks_slab { + void *data_pool; + struct xio_reg_mem reg_mem; + int buf_size; + int pad; +}; + +struct xio_tcp_pending_conn { + int fd; + int waiting_for_bytes; + struct xio_tcp_connect_msg msg; + union xio_sockaddr sa; + struct list_head conns_list_entry; +}; + +struct xio_tcp_socket_ops { + int (*open)(struct xio_tcp_socket *sock); + int (*add_ev_handlers)(struct xio_tcp_transport *tcp_hndl); + int (*del_ev_handlers)(struct xio_tcp_transport *tcp_hndl); + int (*connect)(struct xio_tcp_transport *tcp_hndl, + struct sockaddr *sa, socklen_t sa_len); + size_t (*set_txd)(struct xio_task *task); + void (*set_rxd)(struct xio_task *task, void *buf, uint32_t len); + void (*set_rxd_iov)(struct xio_task *task, void *msg_iov, uint32_t msg_len, uint64_t total_len); + int (*rx_ctl_work)(struct xio_tcp_transport *tcp_hndl, int fd, + struct xio_tcp_work_req *xio_recv, + int block); + int (*rx_ctl_handler)(struct xio_tcp_transport *tcp_hndl); + int (*rx_data_handler)(struct xio_tcp_transport *tcp_hndl, + int batch_nr); + int (*shutdown)(struct xio_tcp_socket *sock); + int (*close)(struct xio_tcp_socket *sock); +}; + +struct xio_tcp_socket { + int cfd; + int dfd; + uint16_t port_cfd; + uint16_t port_dfd; + int pad; + struct xio_tcp_socket_ops ops[1]; +}; + +struct xio_tcp_transport { + struct xio_transport_base base; + struct xio_mempool *tcp_mempool; + struct list_head trans_list_entry; + + /* tasks queues */ + struct list_head tx_ready_list; + struct list_head tx_comp_list; + struct list_head in_flight_list; + struct list_head rx_list; + struct list_head io_list; + + struct xio_tcp_socket sock; + uint16_t is_listen; + uint8_t in_epoll[2]; + + /* fast path params */ + enum xio_transport_state state; + + /* tx parameters */ + size_t max_inline_buf_sz; + + int tx_ready_tasks_num; + + uint16_t tx_comp_cnt; + + uint16_t sn; /* serial number */ + + /* control path params */ + + uint32_t peer_max_in_iovsz; + uint32_t peer_max_out_iovsz; + + /* connection's flow control */ + size_t membuf_sz; + + struct xio_transport *transport; + struct xio_tasks_pool_cls initial_pool_cls; + struct xio_tasks_pool_cls primary_pool_cls; + + struct xio_tcp_setup_msg setup_rsp; + + /* too big to be on stack - use as temporaries */ + union { + struct xio_msg dummy_msg; + }; + + struct list_head pending_conns; + + void *tmp_rx_buf; + void *tmp_rx_buf_cur; + uint32_t tmp_rx_buf_len; + uint32_t peer_max_header; + + uint32_t trans_attr_mask; + struct xio_transport_attr trans_attr; + + struct xio_tcp_work_req tmp_work; + struct iovec tmp_iovec[IOV_MAX]; + + struct xio_ev_data flush_tx_event; + struct xio_ev_data ctl_rx_event; + struct xio_ev_data disconnect_event; +}; + +int xio_tcp_get_max_header_size(void); + +int xio_tcp_get_inline_buffer_size(void); + +int xio_tcp_send(struct xio_transport_base *transport, + struct xio_task *task); + +int 
xio_tcp_rx_handler(struct xio_tcp_transport *tcp_hndl); + +int xio_tcp_poll(struct xio_transport_base *transport, + long min_nr, long max_nr, + struct timespec *ts_timeout); + +struct xio_task *xio_tcp_primary_task_lookup( + struct xio_tcp_transport *tcp_hndl, + int tid); + +struct xio_task *xio_tcp_primary_task_alloc( + struct xio_tcp_transport *tcp_hndl); + +void on_sock_disconnected(struct xio_tcp_transport *tcp_hndl, + int notify_observer); + +int xio_tcp_cancel_req(struct xio_transport_base *transport, + struct xio_msg *req, uint64_t stag, + void *ulp_msg, size_t ulp_msg_sz); + +int xio_tcp_cancel_rsp(struct xio_transport_base *transport, + struct xio_task *task, enum xio_status result, + void *ulp_msg, size_t ulp_msg_sz); + +int xio_tcp_send_connect_msg(int fd, struct xio_tcp_connect_msg *msg); + +size_t xio_tcp_single_sock_set_txd(struct xio_task *task); +size_t xio_tcp_dual_sock_set_txd(struct xio_task *task); +void xio_tcp_single_sock_set_rxd(struct xio_task *task, void *buf, + uint32_t len); +void xio_tcp_single_sock_set_rxd_iov(struct xio_task *task, + void *msg_iov, uint32_t msg_len, uint64_t total_len); +void xio_tcp_dual_sock_set_rxd(struct xio_task *task, void *buf, uint32_t len); +void xio_tcp_dual_sock_set_rxd_iov(struct xio_task *task, + void *msg_iov, uint32_t msg_len, uint64_t total_len); +int xio_tcp_rx_ctl_handler(struct xio_tcp_transport *tcp_hndl, int batch_nr); +int xio_tcp_rx_data_handler(struct xio_tcp_transport *tcp_hndl, int batch_nr); +int xio_tcp_recv_ctl_work(struct xio_tcp_transport *tcp_hndl, int fd, + struct xio_tcp_work_req *xio_recv, int block); +int xio_tcp_recvmsg_work(struct xio_tcp_transport *tcp_hndl, int fd, + struct xio_tcp_work_req *xio_recv, int block); + +void xio_tcp_disconnect_helper(void *xio_tcp_hndl); + +int xio_tcp_xmit(struct xio_tcp_transport *tcp_hndl); + +#endif /* XIO_TCP_TRANSPORT_H_ */ diff --git a/open_src/xio/src/usr/transport/xio_mempool.c b/open_src/xio/src/usr/transport/xio_mempool.c new file mode 100644 index 0000000..9cde0b6 --- /dev/null +++ b/open_src/xio/src/usr/transport/xio_mempool.c @@ -0,0 +1,782 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#include +#include "libxio.h" +#include "xio_log.h" +#include "xio_common.h" +#include "xio_mem.h" +#include "xio_usr_utils.h" + +/* Accelio's default mempool profile (don't expose it) */ +#define XIO_MEM_SLABS_NR 4 + +#define _16K_BLOCK_SZ (16 * 1024) +#define _16K_MIN_NR 0 +#define _16K_MAX_NR (1024 * 24) +#define _16K_ALLOC_NR 128 + +#define _64K_BLOCK_SZ (64 * 1024) +#define _64K_MIN_NR 0 +#define _64K_MAX_NR (1024 * 24) +#define _64K_ALLOC_NR 128 + +#define _256K_BLOCK_SZ (256 * 1024) +#define _256K_MIN_NR 0 +#define _256K_MAX_NR (1024 * 24) +#define _256K_ALLOC_NR 128 + +#define _1M_BLOCK_SZ (1024 * 1024) +#define _1M_MIN_NR 0 +#define _1M_MAX_NR (1024 * 24) +#define _1M_ALLOC_NR 128 + +struct xio_mempool_config g_mempool_config = { + XIO_MEM_SLABS_NR, + { + {_16K_BLOCK_SZ, _16K_MIN_NR, _16K_ALLOC_NR, _16K_MAX_NR}, + {_64K_BLOCK_SZ, _64K_MIN_NR, _64K_ALLOC_NR, _64K_MAX_NR}, + {_256K_BLOCK_SZ, _256K_MIN_NR, _256K_ALLOC_NR, _256K_MAX_NR}, + {_1M_BLOCK_SZ, _1M_MIN_NR, _1M_ALLOC_NR, _1M_MAX_NR}, + {0, 0, 0, 0}, + {0, 0, 0, 0} + } +}; + +/* #define DEBUG_MEMPOOL_MT */ + +/*---------------------------------------------------------------------------*/ +/* structures */ +/*---------------------------------------------------------------------------*/ +typedef volatile int combined_t; + +struct xio_mem_block { + struct xio_mem_slab *parent_slab; + struct xio_mr *omr; + void *buf; + struct xio_mem_block *next; + combined_t refcnt_claim; + + volatile int refcnt; + struct list_head blocks_list_entry; +}; + +struct xio_mem_region { + struct xio_mr *omr; + void *buf; + struct list_head mem_region_entry; +}; + +struct xio_mem_slab { + struct xio_mempool *pool; + struct list_head mem_regions_list; + struct xio_mem_block *free_blocks_list; + struct list_head blocks_list; + + size_t mb_size; /*memory block size */ + spinlock_t lock; + + int init_mb_nr; /* initial mb + size */ + int curr_mb_nr; /* current size */ + int max_mb_nr; /* max allowed size */ + int alloc_quantum_nr; /* number of items + per allocation */ + int used_mb_nr; + int align; + int pad; +}; + +struct xio_mempool { + uint32_t slabs_nr; /* less sentinel */ + uint32_t flags; + int nodeid; + int safe_mt; + struct xio_mem_slab *slab; +}; + +/* Lock free algorithm based on: Maged M. Michael & Michael L. Scott's + * Correction of a Memory Management Method for Lock-Free Data Structures + * of John D. Valois's Lock-Free Data Structures. Ph.D. 
Dissertation + */ +static inline int decrement_and_test_and_set(combined_t *ptr) +{ + int old, _new; + + do { + old = *ptr; + _new = old - 2; + if (_new == 0) + _new = 1; /* claimed be MP */ + } while (!xio_sync_bool_compare_and_swap(ptr, old, _new)); + + return (old - _new) & 1; +} + +/*---------------------------------------------------------------------------*/ +/* clear_lowest_bit */ +/*---------------------------------------------------------------------------*/ +static inline void clear_lowest_bit(combined_t *ptr) +{ + int old, _new; + + do { + old = *ptr; + _new = old - 1; + } while (!xio_sync_bool_compare_and_swap(ptr, old, _new)); +} + +/*---------------------------------------------------------------------------*/ +/* reclaim */ +/*---------------------------------------------------------------------------*/ +static inline void reclaim(struct xio_mem_slab *slab, struct xio_mem_block *p) +{ + struct xio_mem_block *q; + + do { + q = slab->free_blocks_list; + p->next = q; + } while (!xio_sync_bool_compare_and_swap(&slab->free_blocks_list, + q, p)); +} + +/*---------------------------------------------------------------------------*/ +/* release */ +/*---------------------------------------------------------------------------*/ +static inline void safe_release(struct xio_mem_slab *slab, + struct xio_mem_block *p) +{ + if (!p) + return; + + if (decrement_and_test_and_set(&p->refcnt_claim) == 0) + return; + + reclaim(slab, p); +} + +/*---------------------------------------------------------------------------*/ +/* release */ +/*---------------------------------------------------------------------------*/ +static inline void non_safe_release(struct xio_mem_slab *slab, + struct xio_mem_block *p) +{ + struct xio_mem_block *q; + + if (!p) + return; + + q = slab->free_blocks_list; + p->next = q; + slab->free_blocks_list = p; +} + +/*---------------------------------------------------------------------------*/ +/* safe_read */ +/*---------------------------------------------------------------------------*/ +static struct xio_mem_block *safe_read(struct xio_mem_slab *slab) +{ + struct xio_mem_block *q; + + while (1) { + q = slab->free_blocks_list; + if (!q) + return NULL; + xio_sync_fetch_and_add32(&q->refcnt_claim, 2); + /* make sure q is still the head */ + if (xio_sync_bool_compare_and_swap(&slab->free_blocks_list, + q, q)) + return q; + safe_release(slab, q); + } +} + +/*---------------------------------------------------------------------------*/ +/* new_block */ +/*---------------------------------------------------------------------------*/ +static struct xio_mem_block *safe_new_block(struct xio_mem_slab *slab) +{ + struct xio_mem_block *p; + + while (1) { + p = safe_read(slab); + if (!p) + return NULL; + + if (xio_sync_bool_compare_and_swap(&slab->free_blocks_list, + p, p->next)) { + clear_lowest_bit(&p->refcnt_claim); + return p; + } + safe_release(slab, p); + } +} + +/*---------------------------------------------------------------------------*/ +/* new_block */ +/*---------------------------------------------------------------------------*/ +static struct xio_mem_block *non_safe_new_block(struct xio_mem_slab *slab) +{ + struct xio_mem_block *p; + + if (!slab->free_blocks_list) + return NULL; + + p = slab->free_blocks_list; + slab->free_blocks_list = p->next; + p->next = NULL; + + return p; +} + +/*---------------------------------------------------------------------------*/ +/* xio_mem_slab_free */ +/*---------------------------------------------------------------------------*/ 
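/*
 * Editor's note (not part of the original Accelio source): the lock-free
 * free-list helpers above pack a reference count and a "claimed by the
 * memory pool" flag into the single refcnt_claim word.  The low bit means
 * the block is sitting on the free list; every reference adds 2.  A worked
 * sequence for one allocating thread, assuming no contention:
 *
 *   refcnt_claim == 1   block idle on the free list (claimed, 0 refs)
 *   safe_read()         fetch-and-add 2                     1 + 2 -> 3
 *   safe_new_block()    pops the head, then clear_lowest_bit()  3 - 1 -> 2
 *                       (one reference held, no longer claimed by the pool)
 *   safe_release()      decrement_and_test_and_set(): 2 - 2 would reach 0,
 *                       so it is rewritten to 1 ("claimed") and the helper
 *                       returns non-zero; reclaim() then pushes the block
 *                       back onto the free list.
 *
 * If other readers still hold temporary references (values above 2), the
 * subtraction does not reach zero and only the last releaser reclaims the
 * block, which is what makes the concurrent list traversal in safe_read()
 * safe.
 */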
+static int xio_mem_slab_free(struct xio_mem_slab *slab) +{ + struct xio_mem_region *r, *tmp_r; + + slab->free_blocks_list = NULL; + +#ifdef DEBUG_MEMPOOL_MT + if (slab->used_mb_nr) + ERROR_LOG("buffers are still in use before free: " \ + "pool:%p - slab[%p]: " \ + "size:%zd, used:%d, alloced:%d, max_alloc:%d\n", + slab->pool, slab, slab->mb_size, slab->used_mb_nr, + slab->curr_mb_nr, slab->max_mb_nr); +#endif + + if (slab->curr_mb_nr) { + list_for_each_entry_safe(r, tmp_r, &slab->mem_regions_list, + mem_region_entry) { + list_del(&r->mem_region_entry); + if (test_bits(XIO_MEMPOOL_FLAG_REG_MR, + &slab->pool->flags)) { + struct xio_reg_mem reg_mem; + + reg_mem.mr = r->omr; + xio_mem_dereg(®_mem); + } + + if (test_bits(XIO_MEMPOOL_FLAG_HUGE_PAGES_ALLOC, + &slab->pool->flags)) + ufree_huge_pages(r->buf); + else if (test_bits(XIO_MEMPOOL_FLAG_NUMA_ALLOC, + &slab->pool->flags)) + unuma_free(r->buf); + else if (test_bits(XIO_MEMPOOL_FLAG_REGULAR_PAGES_ALLOC, + &slab->pool->flags)) + ufree(r->buf); + ufree(r); + } + } + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_mem_slab_resize */ +/*---------------------------------------------------------------------------*/ +static struct xio_mem_block *xio_mem_slab_resize(struct xio_mem_slab *slab, + int alloc) +{ + char *buf; + struct xio_mem_region *region; + struct xio_mem_block *block; + struct xio_mem_block *pblock; + struct xio_mem_block *qblock; + struct xio_mem_block dummy; + int nr_blocks; + size_t region_alloc_sz; + size_t data_alloc_sz; + int i; + size_t aligned_sz; + + if (slab->curr_mb_nr == 0) { + if (slab->init_mb_nr > slab->max_mb_nr) + slab->init_mb_nr = slab->max_mb_nr; + if (slab->init_mb_nr == 0) + nr_blocks = min(slab->max_mb_nr, + slab->alloc_quantum_nr); + else + nr_blocks = slab->init_mb_nr; + } else { + nr_blocks = slab->max_mb_nr - slab->curr_mb_nr; + nr_blocks = min(nr_blocks, slab->alloc_quantum_nr); + } + if (nr_blocks <= 0) + return NULL; + + region_alloc_sz = sizeof(*region) + + nr_blocks * sizeof(struct xio_mem_block); + buf = (char *)ucalloc(region_alloc_sz, sizeof(uint8_t)); + if (!buf) + return NULL; + + /* region */ + region = (struct xio_mem_region *)buf; + buf = buf + sizeof(*region); + block = (struct xio_mem_block *)buf; + + /* region data */ + aligned_sz = ALIGN(slab->mb_size, slab->align); + data_alloc_sz = nr_blocks * aligned_sz; + + /* allocate the buffers and register them */ + if (test_bits(XIO_MEMPOOL_FLAG_HUGE_PAGES_ALLOC, + &slab->pool->flags)) + region->buf = umalloc_huge_pages(data_alloc_sz); + else if (test_bits(XIO_MEMPOOL_FLAG_NUMA_ALLOC, + &slab->pool->flags)) + region->buf = unuma_alloc(data_alloc_sz, slab->pool->nodeid); + else if (test_bits(XIO_MEMPOOL_FLAG_REGULAR_PAGES_ALLOC, + &slab->pool->flags)) + region->buf = umemalign(slab->align, data_alloc_sz); + + if (!region->buf) { + ufree(region); + return NULL; + } + + if (test_bits(XIO_MEMPOOL_FLAG_REG_MR, &slab->pool->flags)) { + struct xio_reg_mem reg_mem; + + xio_mem_register(region->buf, data_alloc_sz, ®_mem); + region->omr = reg_mem.mr; + if (!region->omr) { + if (test_bits(XIO_MEMPOOL_FLAG_HUGE_PAGES_ALLOC, + &slab->pool->flags)) + ufree_huge_pages(region->buf); + else if (test_bits(XIO_MEMPOOL_FLAG_NUMA_ALLOC, + &slab->pool->flags)) + unuma_free(region->buf); + else if (test_bits(XIO_MEMPOOL_FLAG_REGULAR_PAGES_ALLOC, + &slab->pool->flags)) + ufree(region->buf); + + ufree(region); + return NULL; + } + } + + qblock = &dummy; + pblock = block; + for (i = 0; i < nr_blocks; i++) { + 
list_add(&pblock->blocks_list_entry, &slab->blocks_list); + + pblock->parent_slab = slab; + pblock->omr = region->omr; + pblock->buf = (char *)(region->buf) + i * aligned_sz; + pblock->refcnt_claim = 1; /* free - claimed be MP */ + qblock->next = pblock; + qblock = pblock; + pblock++; + } + + /* first block given to allocator */ + if (alloc) { + if (nr_blocks == 1) + pblock = NULL; + else + pblock = block + 1; + block->next = NULL; + /* ref count 1, not claimed by MP */ + block->refcnt_claim = 2; + } else { + pblock = block; + } + /* Concatenate [pblock -- qblock] to free list + * qblock points to the last allocate block + */ + + if (slab->pool->safe_mt) { + do { + qblock->next = slab->free_blocks_list; + } while (!xio_sync_bool_compare_and_swap( + &slab->free_blocks_list, + qblock->next, pblock)); + } else { + qblock->next = slab->free_blocks_list; + slab->free_blocks_list = pblock; + } + + slab->curr_mb_nr += nr_blocks; + + list_add(®ion->mem_region_entry, &slab->mem_regions_list); + + return block; +} + +/*---------------------------------------------------------------------------*/ +/* xio_mempool_destroy */ +/*---------------------------------------------------------------------------*/ +void xio_mempool_destroy(struct xio_mempool *p) +{ + unsigned int i; + + if (!p) + return; + + for (i = 0; i < p->slabs_nr; i++) + xio_mem_slab_free(&p->slab[i]); + + ufree(p->slab); + ufree(p); +} + +/*---------------------------------------------------------------------------*/ +/* xio_mempool_dump */ +/*---------------------------------------------------------------------------*/ +void xio_mempool_dump(struct xio_mempool *p) +{ + unsigned int i; + struct xio_mem_slab *s; + + if (!p) + return; + + DEBUG_LOG("------------------------------------------------\n"); + for (i = 0; i < p->slabs_nr; i++) { + s = &p->slab[i]; + DEBUG_LOG("pool:%p - slab[%d]: " \ + "size:%zd, used:%d, alloced:%d, max_alloc:%d\n", + p, i, s->mb_size, s->used_mb_nr, + s->curr_mb_nr, s->max_mb_nr); + } + DEBUG_LOG("------------------------------------------------\n"); +} + +/*---------------------------------------------------------------------------*/ +/* xio_mempool_create */ +/*---------------------------------------------------------------------------*/ +struct xio_mempool *xio_mempool_create(int nodeid, uint32_t flags) +{ + struct xio_mempool *p; + + if (test_bits(XIO_MEMPOOL_FLAG_HUGE_PAGES_ALLOC, &flags)) { + clr_bits(XIO_MEMPOOL_FLAG_REGULAR_PAGES_ALLOC, &flags); + clr_bits(XIO_MEMPOOL_FLAG_NUMA_ALLOC, &flags); + DEBUG_LOG("mempool: using huge pages allocator\n"); + } else if (test_bits(XIO_MEMPOOL_FLAG_NUMA_ALLOC, &flags)) { + clr_bits(XIO_MEMPOOL_FLAG_REGULAR_PAGES_ALLOC, &flags); + DEBUG_LOG("mempool: using numa allocator\n"); + } else { + set_bits(XIO_MEMPOOL_FLAG_REGULAR_PAGES_ALLOC, &flags); + DEBUG_LOG("mempool: using regular allocator\n"); + } + + if (test_bits(XIO_MEMPOOL_FLAG_NUMA_ALLOC, &flags)) { + int ret; + + if (nodeid == -1) { + int cpu = xio_get_cpu(); + + nodeid = xio_numa_node_of_cpu(cpu); + } + /* pin to node */ + ret = xio_numa_run_on_node(nodeid); + if (ret) + return NULL; + } + + p = (struct xio_mempool *)ucalloc(1, sizeof(struct xio_mempool)); + if (!p) + return NULL; + + p->nodeid = nodeid; + p->flags = flags; + p->slabs_nr = 0; + p->safe_mt = 1; + p->slab = NULL; + + return p; +} + +/*---------------------------------------------------------------------------*/ +/* xio_mempool_create_prv */ +/*---------------------------------------------------------------------------*/ +struct xio_mempool 
*xio_mempool_create_prv(int nodeid, uint32_t flags) +{ + struct xio_mempool *p; + size_t i; + int ret; + + if (g_mempool_config.slabs_nr < 1 || + g_mempool_config.slabs_nr > XIO_MAX_SLABS_NR) { + xio_set_error(EINVAL); + return NULL; + } + + p = xio_mempool_create(nodeid, flags); + if (!p) + return NULL; + + for (i = 0; i < g_mempool_config.slabs_nr; i++) { + ret = xio_mempool_add_slab( + p, + g_mempool_config.slab_cfg[i].block_sz, + g_mempool_config.slab_cfg[i].init_blocks_nr, + g_mempool_config.slab_cfg[i].max_blocks_nr, + g_mempool_config.slab_cfg[i].grow_blocks_nr, + 0); /*default alignment */ + if (ret != 0) + goto cleanup; + } + + return p; + +cleanup: + xio_mempool_destroy(p); + return NULL; +} + +/*---------------------------------------------------------------------------*/ +/* size2index */ +/*---------------------------------------------------------------------------*/ +static inline int size2index(struct xio_mempool *p, size_t sz) +{ + unsigned int i; + + for (i = 0; i <= p->slabs_nr; i++) + if (sz <= p->slab[i].mb_size) + break; + + return (i == p->slabs_nr) ? -1 : (int)i; +} + +/*---------------------------------------------------------------------------*/ +/* xio_mempool_alloc */ +/*---------------------------------------------------------------------------*/ +int xio_mempool_alloc(struct xio_mempool *p, size_t length, + struct xio_reg_mem *reg_mem) +{ + int index; + struct xio_mem_slab *slab; + struct xio_mem_block *block; + int ret = 0; + + index = size2index(p, length); +retry: + if (index == -1) { + errno = EINVAL; + ret = -1; + reg_mem->addr = NULL; + reg_mem->mr = NULL; + reg_mem->priv = NULL; + reg_mem->length = 0; + goto cleanup; + } + slab = &p->slab[index]; + + if (p->safe_mt) + block = safe_new_block(slab); + else + block = non_safe_new_block(slab); + if (!block) { + if (p->safe_mt) { + spin_lock(&slab->lock); + /* we may been blocked on the spinlock while other + * thread resized the pool + */ + block = safe_new_block(slab); + } else { + block = non_safe_new_block(slab); + } + if (!block) { + block = xio_mem_slab_resize(slab, 1); + if (!block) { + if (++index == (int)p->slabs_nr || + test_bits( + XIO_MEMPOOL_FLAG_USE_SMALLEST_SLAB, + &p->flags)) + index = -1; + + if (p->safe_mt) + spin_unlock(&slab->lock); + ret = 0; + goto retry; + } + DEBUG_LOG("resizing slab size:%zd\n", slab->mb_size); + } + if (p->safe_mt) + spin_unlock(&slab->lock); + } + + reg_mem->addr = block->buf; + reg_mem->mr = block->omr; + reg_mem->priv = block; + reg_mem->length = length; + +#ifdef DEBUG_MEMPOOL_MT + __sync_fetch_and_add(&slab->used_mb_nr, 1); + if (__sync_fetch_and_add(&block->refcnt, 1) != 0) { + ERROR_LOG("pool alloc failed\n"); + abort(); /* core dump - double free */ + } +#else + slab->used_mb_nr++; +#endif + +cleanup: + +#ifdef DEBUG_MEMPOOL_MT + xio_mempool_dump(p); +#endif + return ret; +} + +/*---------------------------------------------------------------------------*/ +/* xio_mempool_free */ +/*---------------------------------------------------------------------------*/ +void xio_mempool_free(struct xio_reg_mem *reg_mem) +{ + struct xio_mem_block *block; + + if (!reg_mem || !reg_mem->priv) + return; + + block = (struct xio_mem_block *)reg_mem->priv; + +#ifdef DEBUG_MEMPOOL_MT + if (__sync_fetch_and_sub(&block->refcnt, 1) != 1) { + ERROR_LOG("pool: release failed"); + abort(); /* core dump - double free */ + } + __sync_fetch_and_sub(&block->parent_slab->used_mb_nr, 1); +#else + block->parent_slab->used_mb_nr--; +#endif + + if (block->parent_slab->pool->safe_mt) + 
safe_release(block->parent_slab, block); + else + non_safe_release(block->parent_slab, block); +} + +/*---------------------------------------------------------------------------*/ +/* xio_mempool_add_slab */ +/*---------------------------------------------------------------------------*/ +int xio_mempool_add_slab(struct xio_mempool *p, + size_t size, size_t min, size_t max, + size_t alloc_quantum_nr, int alignment) +{ + struct xio_mem_slab *new_slab; + struct xio_mem_block *block; + unsigned int ix, slab_ix, slab_shift = 0; + int align = alignment; + + slab_ix = p->slabs_nr; + if (p->slabs_nr) { + for (ix = 0; ix < p->slabs_nr; ++ix) { + if (p->slab[ix].mb_size == size) { + xio_set_error(EEXIST); + return -1; + } + if (p->slab[ix].mb_size > size) { + slab_ix = ix; + break; + } + } + } + if (!alignment) { + align = g_options.xfer_buf_align; + } else if (!is_power_of_2(alignment) || + !(alignment % sizeof(void *) == 0)) { + ERROR_LOG("invalid alignment %d\n", alignment); + xio_set_error(EINVAL); + return -1; + } + + /* expand */ + new_slab = (struct xio_mem_slab *)ucalloc(p->slabs_nr + 2, + sizeof(struct xio_mem_slab)); + if (!new_slab) { + xio_set_error(ENOMEM); + return -1; + } + + /* fill/shift slabs */ + for (ix = 0; ix < p->slabs_nr + 1; ++ix) { + if (ix == slab_ix) { + /* new slab */ + new_slab[ix].pool = p; + new_slab[ix].mb_size = size; + new_slab[ix].init_mb_nr = min; + new_slab[ix].max_mb_nr = max; + new_slab[ix].alloc_quantum_nr = alloc_quantum_nr; + new_slab[ix].align = align; + + spin_lock_init(&new_slab[ix].lock); + INIT_LIST_HEAD(&new_slab[ix].mem_regions_list); + INIT_LIST_HEAD(&new_slab[ix].blocks_list); + new_slab[ix].free_blocks_list = NULL; + if (new_slab[ix].init_mb_nr) { + (void)xio_mem_slab_resize( + &new_slab[ix], 0); + } + /* src adjust */ + slab_shift = 1; + continue; + } + /* shift it */ + new_slab[ix] = p->slab[ix - slab_shift]; + INIT_LIST_HEAD(&new_slab[ix].mem_regions_list); + list_splice_init(&p->slab[ix - slab_shift].mem_regions_list, + &new_slab[ix].mem_regions_list); + INIT_LIST_HEAD(&new_slab[ix].blocks_list); + list_splice_init(&p->slab[ix - slab_shift].blocks_list, + &new_slab[ix].blocks_list); + list_for_each_entry(block, &new_slab[ix].blocks_list, + blocks_list_entry) { + block->parent_slab = &new_slab[ix]; + } + } + + /* sentinel */ + new_slab[p->slabs_nr + 1].mb_size = SIZE_MAX; + + /* swap slabs */ + ufree(p->slab); + p->slab = new_slab; + + /* adjust length */ + (p->slabs_nr)++; + + return 0; +} + diff --git a/open_src/xio/src/usr/transport/xio_mempool.h b/open_src/xio/src/usr/transport/xio_mempool.h new file mode 100644 index 0000000..b106ebb --- /dev/null +++ b/open_src/xio/src/usr/transport/xio_mempool.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef XIO_TRANSPORT_MEMPOOL_H +#define XIO_TRANSPORT_MEMPOOL_H + +/** + * create private mempool with default allocators + * + * @param[in] nodeid numa node id. -1 if don't care + * @param[in] flags mask of mempool creation flags + * defined (@ref xio_mempool_flag) + * + * @returns success (0), or a (negative) error value + */ +struct xio_mempool *xio_mempool_create_prv(int nodeid, uint32_t flags); + +#endif + diff --git a/open_src/xio/src/usr/transport/xio_usr_transport.c b/open_src/xio/src/usr/transport/xio_usr_transport.c new file mode 100644 index 0000000..c0a0f23 --- /dev/null +++ b/open_src/xio/src/usr/transport/xio_usr_transport.c @@ -0,0 +1,194 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include "libxio.h" +#include "xio_log.h" +#include "xio_common.h" +#include "xio_observer.h" +#include "xio_protocol.h" +#include "xio_mbuf.h" +#include "xio_task.h" +#include "xio_transport.h" +#include "xio_protocol.h" +#include "xio_mem.h" +#include "xio_usr_transport.h" +#include "xio_mempool.h" +#include "xio_ev_data.h" +#include "xio_objpool.h" +#include "xio_workqueue.h" +#include "xio_context.h" + +#ifndef HAVE_INFINIBAND_VERBS_H + +/*---------------------------------------------------------------------------*/ +/* xio_mem_register */ +/*---------------------------------------------------------------------------*/ +int xio_mem_register(void *addr, size_t length, struct xio_reg_mem *reg_mem) +{ + static struct xio_mr dummy_mr; + + if (!addr || !reg_mem) { + xio_set_error(EINVAL); + return -1; + } + + reg_mem->addr = addr; + reg_mem->length = length; + reg_mem->mr = &dummy_mr; + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_mem_dereg */ +/*---------------------------------------------------------------------------*/ +int xio_mem_dereg(struct xio_reg_mem *reg_mem) +{ + reg_mem->mr = NULL; + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_mem_alloc */ +/*---------------------------------------------------------------------------*/ +int xio_mem_alloc(size_t length, struct xio_reg_mem *reg_mem) +{ + size_t real_size; + int alloced = 0; + + real_size = ALIGN(length, page_size); + reg_mem->addr = umemalign(page_size, real_size); + if (!reg_mem->addr) { + ERROR_LOG("xio_memalign failed. sz:%zu\n", real_size); + goto cleanup; + } + /*memset(reg_mem->addr, 0, real_size);*/ + alloced = 1; + + xio_mem_register(reg_mem->addr, length, reg_mem); + if (!reg_mem->mr) { + ERROR_LOG("xio_reg_mr failed. 
addr:%p, length:%d\n", + reg_mem->addr, length, access); + + goto cleanup1; + } + reg_mem->length = length; + + return 0; + +cleanup1: + if (alloced) + ufree(reg_mem->addr); +cleanup: + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_mem_free */ +/*---------------------------------------------------------------------------*/ +int xio_mem_free(struct xio_reg_mem *reg_mem) +{ + int retval = 0; + + if (reg_mem->addr) + ufree(reg_mem->addr); + + retval = xio_mem_dereg(reg_mem); + + return retval; +} + +#endif /*HAVE_INFINIBAND_VERBS_H*/ + +/*---------------------------------------------------------------------------*/ +/* xio_transport_mempool_get */ +/*---------------------------------------------------------------------------*/ +struct xio_mempool *xio_transport_mempool_get( + struct xio_context *ctx, int reg_mr) +{ + if (ctx->mempool) + return (struct xio_mempool *)ctx->mempool; + + /* user asked to force registration and rdma exist on machine*/ + if (ctx->register_internal_mempool && xio_get_transport("rdma")) + reg_mr = 1; + + ctx->mempool = xio_mempool_create_prv( + ctx->nodeid, + (reg_mr ? XIO_MEMPOOL_FLAG_REG_MR : 0) | + XIO_MEMPOOL_FLAG_HUGE_PAGES_ALLOC); + + if (!ctx->mempool) { + ERROR_LOG("xio_mempool_create failed (errno=%d %m)\n", errno); + return NULL; + } + return (struct xio_mempool *)ctx->mempool; +} + +/*---------------------------------------------------------------------------*/ +/* xio_transport_state_str */ +/*---------------------------------------------------------------------------*/ +char *xio_transport_state_str(enum xio_transport_state state) +{ + switch (state) { + case XIO_TRANSPORT_STATE_INIT: + return "INIT"; + case XIO_TRANSPORT_STATE_LISTEN: + return "LISTEN"; + case XIO_TRANSPORT_STATE_CONNECTING: + return "CONNECTING"; + case XIO_TRANSPORT_STATE_CONNECTED: + return "CONNECTED"; + case XIO_TRANSPORT_STATE_DISCONNECTED: + return "DISCONNECTED"; + case XIO_TRANSPORT_STATE_RECONNECT: + return "RECONNECT"; + case XIO_TRANSPORT_STATE_CLOSED: + return "CLOSED"; + case XIO_TRANSPORT_STATE_DESTROYED: + return "DESTROYED"; + case XIO_TRANSPORT_STATE_ERROR: + return "ERROR"; + default: + return "UNKNOWN"; + } + + return NULL; +}; + diff --git a/open_src/xio/src/usr/transport/xio_usr_transport.h b/open_src/xio/src/usr/transport/xio_usr_transport.h new file mode 100644 index 0000000..b5ed6ac --- /dev/null +++ b/open_src/xio/src/usr/transport/xio_usr_transport.h @@ -0,0 +1,136 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef XIO_COMMON_TRANSPORT_H +#define XIO_COMMON_TRANSPORT_H + +#define MAX_SGE (XIO_IOVLEN + 1) + +#define MAX_HDR_SZ 512 + +#define NUM_CONN_SETUP_TASKS 2 /* one posted for req rx, + * one for reply tx + */ +#define CONN_SETUP_BUF_SIZE 4096 + +#define NUM_START_PRIMARY_POOL_TASKS 312 /* must be enough to send few + + * fully post_recv buffers + */ +#define NUM_ALLOC_PRIMARY_POOL_TASKS 512 + +#define USECS_IN_SEC 1000000 +#define NSECS_IN_USEC 1000 + +#define VALIDATE_SZ(sz) do { \ + if (optlen != (sz)) { \ + xio_set_error(EINVAL); \ + return -1; \ + } \ + } while (0) + +#define xio_prefetch(p) __builtin_prefetch(p) + +/*---------------------------------------------------------------------------*/ +/* enums */ +/*---------------------------------------------------------------------------*/ +enum xio_transport_state { + XIO_TRANSPORT_STATE_INIT, + XIO_TRANSPORT_STATE_LISTEN, + XIO_TRANSPORT_STATE_CONNECTING, + XIO_TRANSPORT_STATE_CONNECTED, + XIO_TRANSPORT_STATE_DISCONNECTED, + XIO_TRANSPORT_STATE_RECONNECT, + XIO_TRANSPORT_STATE_CLOSED, + XIO_TRANSPORT_STATE_DESTROYED, + XIO_TRANSPORT_STATE_ERROR +}; + +struct xio_mr { + void *addr; /* for new devices */ + size_t length; /* for new devices */ + int access; /* for new devices */ + int addr_alloced; /* address was + allocated by xio */ + struct list_head dm_list; + struct list_head mr_list_entry; +}; + +/* + * The next routines deal with comparing 16 bit unsigned ints + * and worry about wraparound (automatic with unsigned arithmetic). + */ + +static inline int16_t before(uint16_t seq1, uint16_t seq2) +{ + return (int16_t)(seq1 - seq2) < 0; +} + +#define after(seq2, seq1) before(seq1, seq2) + +static inline int16_t before_eq(uint16_t seq1, uint16_t seq2) +{ + return (int16_t)(seq1 - seq2) <= 0; +} + +#define after_eq(seq2, seq1) before_eq(seq1, seq2) + +/* is s2<=s1tv_sec * USECS_IN_SEC; + retval += time_spec->tv_nsec / NSECS_IN_USEC; + + return retval; +} + +struct xio_mempool *xio_transport_mempool_get( + struct xio_context *ctx, + int reg_mr); + +char *xio_transport_state_str(enum xio_transport_state state); + +#endif /* XIO_COMMON_TRANSPORT_H */ diff --git a/open_src/xio/src/usr/xio/get_clock.c b/open_src/xio/src/usr/xio/get_clock.c new file mode 100644 index 0000000..49b74a3 --- /dev/null +++ b/open_src/xio/src/usr/xio/get_clock.c @@ -0,0 +1,254 @@ +/* + * Copyright (c) 2005 Mellanox Technologies Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Author: Michael S. Tsirkin + */ + +/* #define DEBUG 1 */ +/* #define DEBUG_DATA 1 */ +/* #define GET_CPU_MHZ_FROM_PROC 1 */ + +/* For gettimeofday */ +#define _BSD_SOURCE +#define _DEFAULT_SOURCE +#include + +#include + +#include "xio_usr_utils.h" +#include "get_clock.h" + +#ifndef GETCLOCK_DEBUG +#define _DEBUG_MODE 0 +#else +#define _DEBUG_MODE 1 +#endif + +#ifndef DEBUG_DATA +#define DEBUG_DATA 0 +#endif + +#define MEASUREMENTS 200 +#define USECSTEP 10 +#define USECSTART 100 + +/* + Use linear regression to calculate cycles per microsecond. 
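+   Each sample pairs an elapsed wall-clock interval x[i] (in usec, from
+   gettimeofday) with the elapsed cycle count y[i]; fitting y = a + b*x by
+   least squares gives b = (n*Sxy - Sx*Sy) / (n*Sxx - Sx*Sx), the cycles
+   per microsecond returned below, while a absorbs the gettimeofday() call
+   overhead and a correlation r^2 below 0.9 rejects the measurement.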
+ http://en.wikipedia.org/wiki/Linear_regression#Parameter_estimation +*/ +static double sample_get_cpu_mhz(void) +{ + struct timeval tv1, tv2; + cycles_t start; + double sx = 0, sy = 0, sxx = 0, syy = 0, sxy = 0; + double tx, ty; + int i; + + /* Regression: y = a + b x */ + long x[MEASUREMENTS]; + cycles_t y[MEASUREMENTS]; + double a; /* system call overhead in cycles */ + double b; /* cycles per microsecond */ + double r_2; + + for (i = 0; i < MEASUREMENTS; ++i) { + start = get_cycles(); + + if (gettimeofday(&tv1, NULL)) { + fprintf(stderr, "gettimeofday failed.\n"); + return 0; + } + + do { + if (gettimeofday(&tv2, NULL)) { + fprintf(stderr, "gettimeofday failed.\n"); + return 0; + } + } while ((tv2.tv_sec - tv1.tv_sec) * 1000000 + + (tv2.tv_usec - tv1.tv_usec) < USECSTART + i * USECSTEP); + + x[i] = (tv2.tv_sec - tv1.tv_sec) * 1000000 + + tv2.tv_usec - tv1.tv_usec; + y[i] = get_cycles() - start; + if (DEBUG_DATA) + fprintf(stderr, "x=%ld y=%lld\n", + x[i], (long long)y[i]); + } + + for (i = 0; i < MEASUREMENTS; ++i) { + tx = x[i]; + ty = (double)y[i]; + sx += tx; + sy += ty; + sxx += tx * tx; + syy += ty * ty; + sxy += tx * ty; + } + + b = (MEASUREMENTS * sxy - sx * sy) / (MEASUREMENTS * sxx - sx * sx); + a = (sy - b * sx) / MEASUREMENTS; + + if (_DEBUG_MODE) + fprintf(stderr, "a = %g\n", a); + if (_DEBUG_MODE) + fprintf(stderr, "b = %g\n", b); + if (_DEBUG_MODE) + fprintf(stderr, "a / b = %g\n", a / b); + r_2 = (MEASUREMENTS * sxy - sx * sy) * (MEASUREMENTS * sxy - sx * sy) / + (MEASUREMENTS * sxx - sx * sx) / + (MEASUREMENTS * syy - sy * sy); + + if (_DEBUG_MODE) + fprintf(stderr, "r^2 = %g\n", r_2); + if (r_2 < 0.9) { + fprintf(stderr, "Correlation coefficient r^2: %g < 0.9\n", r_2); + return 0; + } + + return b; +} + +static double proc_get_cpu_mhz(int no_cpu_freq_fail) +{ + FILE *f; + char buf[256]; + double mhz = 0.0; + + f = fopen("/proc/cpuinfo", "r"); + if (!f) + return 0.0; + while (fgets(buf, sizeof(buf), f)) { + double m; + int rc; + +#if defined(__ia64__) + /* Use the ITC frequency on IA64 */ + rc = sscanf(buf, "itc MHz : %lf", &m); +#elif defined(__PPC__) || defined(__PPC64__) + /* PPC has a different format as well */ + rc = sscanf(buf, "clock : %lf", &m); +#else + rc = sscanf(buf, "cpu MHz : %lf", &m); +#endif + + if (rc != 1) + continue; + + if (mhz == 0.0) { + mhz = m; + continue; + } + if (mhz != m) { + fprintf(stderr, "Conflicting CPU frequency values" \ + " detected: %lf != %lf\n", mhz, m); + if (no_cpu_freq_fail) + fprintf(stderr, "Test integrity may" \ + " be harmed !\n"); + else { + mhz = 0.0; + goto exit; + } + continue; + } + } +exit: + fclose(f); + return mhz; +} + +double get_core_freq(void) +{ + int cpu; + + FILE *f; + char buf[256]; + unsigned long khz = 0; + + cpu = xio_get_cpu(); + if (cpu < 0) { + perror("sched_getcpu"); + return 0.0; + } + + sprintf(buf, + "/sys/devices/system/cpu/cpu%d/cpufreq/scaling_max_freq", + cpu); + + f = fopen(buf, "r"); + if (!f) { + /* perror("cpufreq not supported"); */ + return 0.0; + } + + while (fgets(buf, sizeof(buf), f)) { + errno = 0; + khz = strtol(buf, NULL, 0); + if (errno) { + fclose(f); + /* perror("Can't read cpufreq"); */ + return 0; + } + fclose(f); + /* value in KHz */ + return khz / 1000.0; + } + + /* NOT FOUND */ + fprintf(stderr, "Empty cpufreq\n"); + fclose(f); + + return 0.0; +} + +double get_cpu_mhz(int no_cpu_freq_fail) +{ + double freq, sample, proc, delta; + + freq = get_core_freq(); + /* even with core freq cycles are at maximum */ + if (freq) + return freq; + + sample = sample_get_cpu_mhz(); + proc 
= proc_get_cpu_mhz(no_cpu_freq_fail); + + if (!proc || !sample) + return 0; + + delta = proc > sample ? proc - sample : sample - proc; + if (delta / proc > 0.01) { + fprintf(stderr, "Warning: measured timestamp" \ + " frequency %g differs from nominal %g MHz\n", + sample, proc); + return sample; + } + return proc; +} diff --git a/open_src/xio/src/usr/xio/get_clock.h b/open_src/xio/src/usr/xio/get_clock.h new file mode 100644 index 0000000..3890e55 --- /dev/null +++ b/open_src/xio/src/usr/xio/get_clock.h @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2005 Mellanox Technologies Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Author: Michael S. Tsirkin + */ + +#ifndef GET_CLOCK_H +#define GET_CLOCK_H + +#if defined(__x86_64__) || defined(__i386__) + +/* Note: only x86 CPUs which have rdtsc instruction are supported. */ +typedef unsigned long long cycles_t; +static inline cycles_t get_cycles(void) +{ + union { + cycles_t val; + struct { + unsigned int low; + unsigned int high; + } __attribute__((packed)); + } value; + + asm volatile ("rdtsc" : "=a" (value.low), "=d" (value.high)); + return value.val; +} +#elif defined(__PPC__) || defined(__PPC64__) +/* Note: only PPC CPUs which have mftb instruction are supported. */ +/* PPC64 has mftb */ +typedef unsigned long cycles_t; +static inline cycles_t get_cycles(void) +{ + cycles_t ret; + + asm volatile ("mftb %0" : "=r" (ret) : ); + return ret; +} +#elif defined(__ia64__) +/* Itanium2 and up has ar.itc (Itanium1 has errata) */ +typedef unsigned long cycles_t; +static inline cycles_t get_cycles(void) +{ + cycles_t ret; + + asm volatile ("mov %0=ar.itc" : "=r" (ret)); + return ret; +} +#elif defined(WIN32) +#include + +typedef LONGLONG cycles_t; +static __inline cycles_t get_cycles(void) +{ + LARGE_INTEGER performance_count; + + return QueryPerformanceCounter(&performance_count) ? 
+ performance_count.QuadPart : 0; +} + +#else +#warning get_cycles not implemented for this architecture: attempt asm/timex.h +#include +#endif + +extern double get_cpu_mhz(int); + +#endif diff --git a/open_src/xio/src/usr/xio/xio_context.c b/open_src/xio/src/usr/xio/xio_context.c new file mode 100644 index 0000000..037ec24 --- /dev/null +++ b/open_src/xio/src/usr/xio/xio_context.c @@ -0,0 +1,849 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include +#include "libxio.h" +#include "xio_log.h" +#include "xio_common.h" +#include "xio_observer.h" +#include "get_clock.h" +#include "xio_ev_data.h" +#include "xio_ev_loop.h" +#include "xio_idr.h" +#include "xio_objpool.h" +#include "xio_workqueue.h" +#include +#include "xio_timers_list.h" +#include "xio_protocol.h" +#include "xio_mbuf.h" +#include "xio_task.h" +#include "xio_transport.h" +#include "xio_context.h" +#include "xio_usr_utils.h" +#include "xio_init.h" + +#ifdef XIO_THREAD_SAFE_DEBUG +#include +#endif + +#define MSGPOOL_INIT_NR 8 +#define MSGPOOL_GROW_NR 64 + +int xio_netlink(struct xio_context *ctx); + +/*---------------------------------------------------------------------------*/ +/* xio_context_reg_observer */ +/*---------------------------------------------------------------------------*/ +int xio_context_reg_observer(struct xio_context *ctx, + struct xio_observer *observer) +{ + xio_observable_reg_observer(&ctx->observable, observer); + + return 0; +} +EXPORT_SYMBOL(xio_context_reg_observer); + +/*---------------------------------------------------------------------------*/ +/* xio_context_unreg_observer */ +/*---------------------------------------------------------------------------*/ +void xio_context_unreg_observer(struct xio_context *ctx, + struct xio_observer *observer) +{ + xio_observable_unreg_observer(&ctx->observable, observer); +} +EXPORT_SYMBOL(xio_context_unreg_observer); + +/*---------------------------------------------------------------------------*/ +/* xio_context_create */ +/*---------------------------------------------------------------------------*/ +struct xio_context *xio_context_create(struct xio_context_params *ctx_params, + int polling_timeout_us, int cpu_hint) +{ + struct xio_context *ctx = NULL; + struct xio_transport *transport; + int cpu; + + /* check if user called xio_init() */ + if (!xio_inited()) { + ERROR_LOG("xio_init() must be called before any accelio func\n"); + return NULL; + } + + xio_read_logging_level(); + + if (cpu_hint == -1) { + cpu = xio_get_cpu(); + if (cpu == -1) { + xio_set_error(errno); + return NULL; + } + } else { + cpu = cpu_hint; + } + /*pin the process to cpu */ + xio_pin_to_cpu(cpu); + /* pin to the numa node of the cpu */ + if (0) + if (-1 == xio_pin_to_node(cpu)) { + xio_set_error(errno); + ERROR_LOG("could not set affinity to cpu. %m\n"); + } + + /* allocate new context */ + ctx = (struct xio_context *)ucalloc(1, sizeof(struct xio_context)); + if (!ctx) { + xio_set_error(ENOMEM); + ERROR_LOG("calloc failed. %m\n"); + return NULL; + } + ctx->ev_loop = xio_ev_loop_create(); + ctx->run_private = 0; + + ctx->cpuid = cpu; + ctx->nodeid = xio_numa_node_of_cpu(cpu);// 此处可能存在内存泄露 + //ctx->nodeid = 0; + ctx->polling_timeout = polling_timeout_us; + ctx->worker = xio_get_current_thread_id(); + + if (ctx_params) { + ctx->user_context = ctx_params->user_context; + ctx->prealloc_xio_inline_bufs = + !!ctx_params->prealloc_xio_inline_bufs; + ctx->max_conns_per_ctx = + max(ctx_params->max_conns_per_ctx, 2); + ctx->register_internal_mempool = + !!ctx_params->register_internal_mempool; + ctx->rq_depth = ctx_params->rq_depth; + } + if (!ctx->max_conns_per_ctx) + ctx->max_conns_per_ctx = 100; + + XIO_OBSERVABLE_INIT(&ctx->observable, ctx); + INIT_LIST_HEAD(&ctx->ctx_list); + + ctx->workqueue = xio_workqueue_create(ctx); + if (!ctx->workqueue) { + xio_set_error(ENOMEM); + ERROR_LOG("context's workqueue create failed. 
%m\n"); + goto cleanup; + } + ctx->msg_pool = xio_objpool_create(sizeof(struct xio_msg), + MSGPOOL_INIT_NR, MSGPOOL_GROW_NR); + if (!ctx->msg_pool) { + xio_set_error(ENOMEM); + ERROR_LOG("context's msg_pool create failed. %m\n"); + goto cleanup1; + } + + if (-1 == xio_netlink(ctx)) + goto cleanup2; + + /* initialize rdma pools only */ + transport = xio_get_transport("rdma"); + if (transport && ctx->prealloc_xio_inline_bufs) { + int retval = xio_ctx_pool_create(ctx, XIO_PROTO_RDMA, + XIO_CONTEXT_POOL_CLASS_INITIAL); + if (retval) { + ERROR_LOG("Failed to create initial pool. ctx:%p\n", ctx); + goto cleanup2; + } + retval = xio_ctx_pool_create(ctx, XIO_PROTO_RDMA, + XIO_CONTEXT_POOL_CLASS_PRIMARY); + if (retval) { + ERROR_LOG("Failed to create primary pool. ctx:%p\n", ctx); + goto cleanup2; + } + } +#ifdef XIO_THREAD_SAFE_DEBUG + pthread_mutex_init(&ctx->dbg_thread_mutex, NULL); +#endif + spin_lock_init(&ctx->ctx_list_lock); + + DEBUG_LOG("context created. context:%p\n", ctx); + + xio_idr_add_uobj(usr_idr, ctx, "xio_context"); + return ctx; + +cleanup2: + xio_objpool_destroy(ctx->msg_pool); +cleanup1: + xio_workqueue_destroy(ctx->workqueue); +cleanup: + ufree(ctx); + return NULL; +} +EXPORT_SYMBOL(xio_context_create); + +/*---------------------------------------------------------------------------*/ +/* xio_context_reset_stop */ +/*---------------------------------------------------------------------------*/ +static inline void xio_context_reset_stop(struct xio_context *ctx) +{ + xio_ev_loop_reset_stop(ctx->ev_loop); +} + +/*---------------------------------------------------------------------------*/ +/* xio_ctx_tasks_pools_destroy */ +/*---------------------------------------------------------------------------*/ +static void xio_ctx_task_pools_destroy(struct xio_context *ctx) +{ + int i; + + for (i = 0; i < XIO_PROTO_LAST; i++) { + if (ctx->initial_tasks_pool[i]) { + xio_tasks_pool_free_tasks(ctx->initial_tasks_pool[i]); + xio_tasks_pool_destroy(ctx->initial_tasks_pool[i]); + ctx->initial_tasks_pool[i] = NULL; + } + if (ctx->primary_tasks_pool[i]) { + xio_tasks_pool_free_tasks(ctx->primary_tasks_pool[i]); + xio_tasks_pool_destroy(ctx->primary_tasks_pool[i]); + ctx->primary_tasks_pool[i] = NULL; + } + } +} + +/*---------------------------------------------------------------------------*/ +/* xio_context_destroy */ +/*---------------------------------------------------------------------------*/ +void xio_context_destroy(struct xio_context *ctx) +{ + int i; + int found; + + if (unlikely(!ctx)) + return; + + if (unlikely(ctx->is_running && !ctx->defered_destroy)) { + ctx->defered_destroy = 1; + xio_ev_loop_stop(ctx->ev_loop); + return; + } + + found = xio_idr_lookup_uobj(usr_idr, ctx); + if (found) { + xio_idr_remove_uobj(usr_idr, ctx); + } else { + ERROR_LOG("context not found:%p\n", ctx); + xio_set_error(XIO_E_USER_OBJ_NOT_FOUND); + return; + } + ctx->run_private = 0; + xio_observable_notify_all_observers(&ctx->observable, + XIO_CONTEXT_EVENT_CLOSE, NULL); + + /* allow internally to run the loop for final cleanup */ + if (ctx->run_private) { + xio_context_reset_stop(ctx); + xio_context_run_loop(ctx, XIO_INFINITE); + } + + if (ctx->run_private) + ERROR_LOG("not all observers finished! 
run_private=%d\n", + ctx->run_private); + + xio_observable_notify_all_observers(&ctx->observable, + XIO_CONTEXT_EVENT_POST_CLOSE, NULL); + + if (!xio_observable_is_empty(&ctx->observable)) + ERROR_LOG("context destroy: observers leak - %p\n", ctx); + + xio_observable_unreg_all_observers(&ctx->observable); + + if (ctx->netlink_sock) { + int fd = (int)(long)ctx->netlink_sock; + + xio_ev_loop_del(ctx->ev_loop, fd); + close(fd); + ctx->netlink_sock = NULL; + } + for (i = 0; i < XIO_STAT_LAST; i++) + if (ctx->stats.name[i]) + free(ctx->stats.name[i]); + + xio_workqueue_destroy(ctx->workqueue); + + xio_objpool_destroy(ctx->msg_pool); + + if (ctx->mempool) { + xio_mempool_destroy((struct xio_mempool *)ctx->mempool); + ctx->mempool = NULL; + } + xio_ev_loop_destroy(ctx->ev_loop); + ctx->ev_loop = NULL; + + xio_ctx_task_pools_destroy(ctx); +#ifdef XIO_THREAD_SAFE_DEBUG + pthread_mutex_destroy(&ctx->dbg_thread_mutex); +#endif + + XIO_OBSERVABLE_DESTROY(&ctx->observable); + ufree(ctx); +} +EXPORT_SYMBOL(xio_context_destroy); + +/*---------------------------------------------------------------------------*/ +/* xio_ctx_add_delayed_work */ +/*---------------------------------------------------------------------------*/ +int xio_ctx_add_delayed_work(struct xio_context *ctx, + int msec_duration, void *data, + void (*timer_fn)(void *data), + xio_ctx_delayed_work_t *work) +{ + int retval; + + /* test if delayed work is pending */ + if (xio_is_delayed_work_pending(work)) + return 0; + + retval = xio_workqueue_add_delayed_work(ctx->workqueue, + msec_duration, data, + timer_fn, work); + if (retval) { + xio_set_error(errno); + ERROR_LOG("xio_workqueue_add_delayed_work failed. %m\n"); + } + + return retval; +} + +/*---------------------------------------------------------------------------*/ +/* xio_ctx_del_delayed_work */ +/*---------------------------------------------------------------------------*/ +int xio_ctx_del_delayed_work(struct xio_context *ctx, + xio_ctx_delayed_work_t *work) +{ + int retval; + + /* test if delayed work is pending */ + if (!xio_is_delayed_work_pending(work)) + return 0; + + retval = xio_workqueue_del_delayed_work(ctx->workqueue, work); + if (retval) { + xio_set_error(errno); + ERROR_LOG("xio_workqueue_del_delayed_work failed. %m\n"); + } + + return retval; +} + +/*---------------------------------------------------------------------------*/ +/* xio_add_counter */ +/*---------------------------------------------------------------------------*/ +int xio_add_counter(struct xio_context *ctx, char *name) +{ + int i; + + for (i = XIO_STAT_USER_FIRST; i < XIO_STAT_LAST; i++) { + if (!ctx->stats.name[i]) { + ctx->stats.name[i] = strdup(name); + if (!ctx->stats.name[i]) { + ERROR_LOG("stddup failed. 
%m"); + return -1; + } + ctx->stats.counter[i] = 0; + return i; + } + } + + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_del_counter */ +/*---------------------------------------------------------------------------*/ +int xio_del_counter(struct xio_context *ctx, int counter) +{ + if (counter < XIO_STAT_USER_FIRST || counter >= XIO_STAT_LAST) { + ERROR_LOG("counter(%d) out of range\n", counter); + return -1; + } + + /* free the name and mark as free for reuse */ + free(ctx->stats.name[counter]); + ctx->stats.name[counter] = NULL; + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_modify_context */ +/*---------------------------------------------------------------------------*/ +int xio_modify_context(struct xio_context *ctx, + struct xio_context_attr *attr, + int attr_mask) +{ + if (!ctx || !attr) { + xio_set_error(EINVAL); + ERROR_LOG("invalid parameters\n"); + return -1; + } + + if (attr_mask & XIO_CONTEXT_ATTR_USER_CTX) + ctx->user_context = attr->user_context; + + return 0; +} +EXPORT_SYMBOL(xio_modify_context); + +/*---------------------------------------------------------------------------*/ +/* xio_query_context */ +/*---------------------------------------------------------------------------*/ +int xio_query_context(struct xio_context *ctx, + struct xio_context_attr *attr, + int attr_mask) +{ + if (!ctx || !attr) { + xio_set_error(EINVAL); + ERROR_LOG("invalid parameters\n"); + return -1; + } + + if (attr_mask & XIO_CONTEXT_ATTR_USER_CTX) + attr->user_context = ctx->user_context; + + return 0; +} +EXPORT_SYMBOL(xio_query_context); + +/*---------------------------------------------------------------------------*/ +/* xio_context_get_poll_fd */ +/*---------------------------------------------------------------------------*/ +int xio_context_get_poll_fd(struct xio_context *ctx) +{ + return xio_ev_loop_get_poll_fd(ctx->ev_loop); +} +EXPORT_SYMBOL(xio_context_get_poll_fd); + +/*---------------------------------------------------------------------------*/ +/* xio_context_poll_wait */ +/*---------------------------------------------------------------------------*/ +int xio_context_poll_wait(struct xio_context *ctx, int timeout_ms) +{ + return xio_ev_loop_poll_wait(ctx->ev_loop, timeout_ms); +} +EXPORT_SYMBOL(xio_context_poll_wait); + +/*---------------------------------------------------------------------------*/ +/* xio_context_add_ev_handler */ +/*---------------------------------------------------------------------------*/ +int xio_context_add_ev_handler(struct xio_context *ctx, + int fd, int events, + xio_ev_handler_t handler, + void *data) +{ + return xio_ev_loop_add(ctx->ev_loop, + fd, events, handler, data); +} +EXPORT_SYMBOL(xio_context_add_ev_handler); + +/*---------------------------------------------------------------------------*/ +/* xio_context_modify_ev_handler */ +/*---------------------------------------------------------------------------*/ +int xio_context_modify_ev_handler(struct xio_context *ctx, + int fd, int events) +{ + return xio_ev_loop_modify(ctx->ev_loop, fd, events); +} + +/*---------------------------------------------------------------------------*/ +/* xio_context_del_ev_handler */ +/*---------------------------------------------------------------------------*/ +int xio_context_del_ev_handler(struct xio_context *ctx, + int fd) +{ + return xio_ev_loop_del(ctx->ev_loop, fd); +} + 
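+/*
+ * Usage sketch (illustrative only, not part of the library code): the
+ * calling sequence the context primitives in this file are designed for,
+ * assuming the application has already called xio_init():
+ *
+ *	struct xio_context_params params;
+ *	struct xio_context *ctx;
+ *
+ *	memset(&params, 0, sizeof(params));
+ *	ctx = xio_context_create(&params, 0, -1);
+ *	if (!ctx)
+ *		return -1;
+ *	... create sessions / register event handlers on ctx ...
+ *	xio_context_run_loop(ctx, XIO_INFINITE);
+ *	xio_context_destroy(ctx);
+ *
+ * xio_context_run_loop() blocks until some handler calls
+ * xio_context_stop_loop(ctx); xio_context_destroy() is deferred
+ * automatically if it is invoked while the loop is still running.
+ */
+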
+/*---------------------------------------------------------------------------*/ +/* xio_context_run_loop */ +/*---------------------------------------------------------------------------*/ +int xio_context_run_loop(struct xio_context *ctx, int timeout_ms) +{ + int retval = 0; + +#ifdef XIO_THREAD_SAFE_DEBUG + xio_ctx_debug_thread_lock(ctx); +#endif + + ctx->is_running = 1; + retval = (timeout_ms == XIO_INFINITE) ? xio_ev_loop_run(ctx->ev_loop) : + xio_ev_loop_run_timeout(ctx->ev_loop, timeout_ms); + ctx->is_running = 0; + + if (unlikely(ctx->defered_destroy)) + xio_context_destroy(ctx); + +#ifdef XIO_THREAD_SAFE_DEBUG + xio_ctx_debug_thread_unlock(ctx); +#endif + + return retval; +} +EXPORT_SYMBOL(xio_context_run_loop); + +/*---------------------------------------------------------------------------*/ +/* xio_context_stop_loop */ +/*---------------------------------------------------------------------------*/ +void xio_context_stop_loop(struct xio_context *ctx) +{ + xio_ev_loop_stop(ctx->ev_loop); +} +EXPORT_SYMBOL(xio_context_stop_loop); + +/*---------------------------------------------------------------------------*/ +/* xio_context_is_loop_stopping */ +/*---------------------------------------------------------------------------*/ +inline int xio_context_is_loop_stopping(struct xio_context *ctx) +{ + return xio_ev_loop_is_stopping(ctx->ev_loop); +} + +/*---------------------------------------------------------------------------*/ +/* xio_ctx_add_work */ +/*---------------------------------------------------------------------------*/ +int xio_ctx_add_work(struct xio_context *ctx, + void *data, + void (*function)(void *data), + xio_ctx_work_t *work) +{ + int retval; + + /* test if work is pending */ + if (xio_is_work_pending(work)) + return 0; + + retval = xio_workqueue_add_work(ctx->workqueue, + data, function, work); + if (retval) { + xio_set_error(errno); + ERROR_LOG("xio_workqueue_add_work failed. %m\n"); + } + + return retval; +} + +/*---------------------------------------------------------------------------*/ +/* xio_ctx_set_work_destructor */ +/*---------------------------------------------------------------------------*/ +int xio_ctx_set_work_destructor( + struct xio_context *ctx, void *data, + void (*destructor)(void *data), + xio_ctx_work_t *work) +{ + int retval; + + /* test if work is pending */ + if (xio_is_work_pending(work)) + return 0; + + retval = xio_workqueue_set_work_destructor( + ctx->workqueue, + data, destructor, work); + if (retval) { + xio_set_error(errno); + ERROR_LOG("xio_workqueue_set_work_destructor failed. %m\n"); + } + + return retval; +} + +/*---------------------------------------------------------------------------*/ +/* xio_ctx_is_work_in_handler */ +/*---------------------------------------------------------------------------*/ +int xio_ctx_is_work_in_handler(struct xio_context *ctx, xio_ctx_work_t *work) +{ + /* test if work is pending */ + if (xio_is_work_pending(work)) + return 0; + + return xio_workqueue_is_work_in_handler(ctx->workqueue, work); +} + +/*---------------------------------------------------------------------------*/ +/* xio_ctx_del_work */ +/*---------------------------------------------------------------------------*/ +int xio_ctx_del_work(struct xio_context *ctx, + xio_ctx_work_t *work) + +{ + int retval; + + /* test if work is pending */ + if (!xio_is_work_pending(work)) + return 0; + + retval = xio_workqueue_del_work(ctx->workqueue, work); + if (retval) { + xio_set_error(errno); + ERROR_LOG("xio_workqueue_del_work failed. 
%m\n"); + } + + return retval; +} + +/*---------------------------------------------------------------------------*/ +/* xio_context_add_event */ +/*---------------------------------------------------------------------------*/ +int xio_context_add_event(struct xio_context *ctx, struct xio_ev_data *data) +{ + xio_ev_loop_add_event(ctx->ev_loop, data); + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_context_disable_event */ +/*---------------------------------------------------------------------------*/ +void xio_context_disable_event(struct xio_ev_data *data) +{ + xio_ev_loop_remove_event(data); +} + +/*---------------------------------------------------------------------------*/ +/* xio_context_is_pending_event */ +/*---------------------------------------------------------------------------*/ +int xio_context_is_pending_event(struct xio_ev_data *data) +{ + return xio_ev_loop_is_pending_event(data); +} + +/*---------------------------------------------------------------------------*/ +/* xio_context_poll_completions */ +/*---------------------------------------------------------------------------*/ +int xio_context_poll_completions(struct xio_context *ctx, int timeout_us) +{ + if (ctx->poll_completions_fn) + return ctx->poll_completions_fn(ctx->poll_completions_ctx, + timeout_us); + + return 0; +} +EXPORT_SYMBOL(xio_context_poll_completions); + +/* + * should be called only from loop context + */ +/*---------------------------------------------------------------------------*/ +/* xio_context_destroy_resume */ +/*---------------------------------------------------------------------------*/ +void xio_context_destroy_resume(struct xio_context *ctx) +{ + if (ctx->run_private) { + if (!--ctx->run_private) { + xio_context_stop_loop(ctx); + } + } +} +EXPORT_SYMBOL(xio_context_destroy_resume); + +/*---------------------------------------------------------------------------*/ +/* xio_context_set_poll_completions_fn */ +/*---------------------------------------------------------------------------*/ +void xio_context_set_poll_completions_fn( + struct xio_context *ctx, + poll_completions_fn_t poll_completions_fn, + void *poll_completions_ctx) +{ + ctx->poll_completions_ctx = poll_completions_ctx; + ctx->poll_completions_fn = poll_completions_fn; +} +EXPORT_SYMBOL(xio_context_set_poll_completions_fn); + +/*---------------------------------------------------------------------------*/ +/* xio_ctx_pool_create */ +/*---------------------------------------------------------------------------*/ +int xio_ctx_pool_create(struct xio_context *ctx, enum xio_proto proto, + enum xio_context_pool_class pool_cls) +{ + struct xio_tasks_pool_ops *pool_ops; + struct xio_tasks_pool **tasks_pool; + struct xio_transport *transport; + struct xio_tasks_pool_params params; + char pool_name[64]; + const char *proto_str = xio_proto_str(proto); + + /* get the transport's proto */ + transport = xio_get_transport(proto_str); + if (!transport) { + ERROR_LOG("failed to load %s transport layer.\n", proto_str); + ERROR_LOG("validate that your system support %s " \ + "and the accelio's %s module is loaded\n", + proto_str, proto_str); + xio_set_error(ENOPROTOOPT); + return -1; + } + + if (transport->get_pools_setup_ops) { + if (!ctx->primary_pool_ops[proto] || + !ctx->initial_pool_ops[proto]) + transport->get_pools_setup_ops( + NULL, + &ctx->initial_pool_ops[proto], + &ctx->primary_pool_ops[proto]); + } else { + ERROR_LOG("transport does not implement " \ + 
"\"get_pools_setup_ops\"\n"); + return -1; + } + + switch (pool_cls) { + case XIO_CONTEXT_POOL_CLASS_INITIAL: + tasks_pool = &ctx->initial_tasks_pool[proto]; + pool_ops = ctx->initial_pool_ops[proto]; + sprintf(pool_name, "ctx:%p - initial_pool_%s", ctx, proto_str); + + break; + case XIO_CONTEXT_POOL_CLASS_PRIMARY: + tasks_pool = &ctx->primary_tasks_pool[proto]; + pool_ops = ctx->primary_pool_ops[proto]; + sprintf(pool_name, "ctx:%p - primary_pool_%s", ctx, proto_str); + break; + default: + xio_set_error(EINVAL); + ERROR_LOG("unknown pool class\n"); + return -1; + }; + + /* if already exist */ + if (*tasks_pool) + return 0; + + if (!pool_ops) + return -1; + + if (!pool_ops->pool_get_params || + !pool_ops->slab_pre_create || + !pool_ops->slab_init_task || + !pool_ops->pool_post_create || + !pool_ops->slab_destroy) + return -1; + + /* get pool properties from the transport */ + memset(¶ms, 0, sizeof(params)); + + pool_ops->pool_get_params(NULL, + (int *)¶ms.start_nr, + (int *)¶ms.max_nr, + (int *)¶ms.alloc_nr, + (int *)¶ms.pool_dd_data_sz, + (int *)¶ms.slab_dd_data_sz, + (int *)¶ms.task_dd_data_sz); + params.max_nr = params.max_nr * ctx->max_conns_per_ctx; + if (ctx->prealloc_xio_inline_bufs) { + params.start_nr = params.max_nr; + params.alloc_nr = 0; + } + params.pool_hooks.slab_pre_create = + (int (*)(void *, int, void *, void *)) + pool_ops->slab_pre_create; + params.pool_hooks.slab_post_create = (int (*)(void *, void *, void *)) + pool_ops->slab_post_create; + params.pool_hooks.slab_destroy = (int (*)(void *, void *, void *)) + pool_ops->slab_destroy; + params.pool_hooks.slab_init_task = + (int (*)(void *, void *, void *, int, struct xio_task *)) + pool_ops->slab_init_task; + params.pool_hooks.slab_uninit_task = + (int (*)(void *, void *, void *, struct xio_task *)) + pool_ops->slab_uninit_task; + params.pool_hooks.slab_remap_task = + (int (*)(void *, void *, void *, void *, struct xio_task *)) + pool_ops->slab_remap_task; + params.pool_hooks.pool_pre_create = (int (*)(void *, void *, void *)) + pool_ops->pool_pre_create; + params.pool_hooks.pool_post_create = (int (*)(void *, void *, void *)) + pool_ops->pool_post_create; + params.pool_hooks.pool_destroy = (int (*)(void *, void *, void *)) + pool_ops->pool_destroy; + params.pool_hooks.task_pre_put = (int (*)(void *, struct xio_task *)) + pool_ops->task_pre_put; + params.pool_hooks.task_post_get = (int (*)(void *, struct xio_task *)) + pool_ops->task_post_get; + + params.pool_name = kstrdup(pool_name, GFP_KERNEL); + + /* initialize the tasks pool */ + *tasks_pool = xio_tasks_pool_create(¶ms); + if (!*tasks_pool) { + ERROR_LOG("xio_tasks_pool_create failed\n"); + return -1; + } + + return 0; +} + + +#ifdef XIO_THREAD_SAFE_DEBUG + +void xio_ctx_debug_thread_print_stack(int frames, void * const *callstack) +{ + char **strs; + int i; + ERROR_LOG("\tstack trace is\n"); + strs = backtrace_symbols(callstack, frames); + for (i = 0; i < frames; ++i) { + ERROR_LOG("%s\n", strs[i]); + } + free(strs); +} + +int xio_ctx_debug_thread_lock(struct xio_context *ctx) +{ + if (!pthread_mutex_trylock(&ctx->dbg_thread_mutex)) { + /* mutex was acquired - saving the current stacktrace */ + ctx->nptrs = backtrace(ctx->buffer, BACKTRACE_BUFFER_SIZE); + return 1; + } + ERROR_LOG("trying to lock an already locked lock for ctx %p\n", ctx); + xio_ctx_debug_thread_print_stack(ctx->nptrs, ctx->buffer); + ctx->nptrs = backtrace(ctx->buffer, BACKTRACE_BUFFER_SIZE); + xio_ctx_debug_thread_print_stack(ctx->nptrs, ctx->buffer); + + /*since lock was unsuccessful, wait until 
the lock becomes available */ + pthread_mutex_lock(&ctx->dbg_thread_mutex); + return 0; +} + +int xio_ctx_debug_thread_unlock(struct xio_context *ctx) +{ + pthread_mutex_unlock(&ctx->dbg_thread_mutex); + return 0; +} + +#endif + diff --git a/open_src/xio/src/usr/xio/xio_context_priv.h b/open_src/xio/src/usr/xio/xio_context_priv.h new file mode 100644 index 0000000..102f7bc --- /dev/null +++ b/open_src/xio/src/usr/xio/xio_context_priv.h @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef XIO_CONTEXT_PRIV_H_ +#define XIO_CONTEXT_PRIV_H_ + +#endif /* XIO_CONTEXT_PRIV_H_ */ diff --git a/open_src/xio/src/usr/xio/xio_ev_data.h b/open_src/xio/src/usr/xio/xio_ev_data.h new file mode 100644 index 0000000..5d8ee13 --- /dev/null +++ b/open_src/xio/src/usr/xio/xio_ev_data.h @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef XIO_EV_DATA_H +#define XIO_EV_DATA_H + +struct xio_ev_data; + +typedef void (*xio_event_handler_t)(void *data); + +/*---------------------------------------------------------------------------*/ +/* structs */ +/*---------------------------------------------------------------------------*/ +struct xio_ev_data { + union { + xio_ev_handler_t ev_handler; + xio_event_handler_t handler; + }; + union { + int fd; + int scheduled; + }; + int reserved; + void *data; + struct list_head events_list_entry; +}; + +#endif + diff --git a/open_src/xio/src/usr/xio/xio_ev_loop.c b/open_src/xio/src/usr/xio/xio_ev_loop.c new file mode 100644 index 0000000..c1d810e --- /dev/null +++ b/open_src/xio/src/usr/xio/xio_ev_loop.c @@ -0,0 +1,621 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#include +#include + +#include +#include "libxio.h" +#include "xio_log.h" +#include "xio_common.h" +#include "get_clock.h" +#include "xio_ev_data.h" +#include "xio_ev_loop.h" + +#define MAX_DELETED_EVENTS 1024 + +/*---------------------------------------------------------------------------*/ +/* structs */ +/*---------------------------------------------------------------------------*/ +struct xio_ev_loop { + int efd; + /* flags */ + volatile uint32_t in_dispatch:1; + volatile uint32_t stop_loop:1; + volatile uint32_t wakeup_armed:1; + volatile uint32_t pad:29; + + int wakeup_event; + int deleted_events_nr; + struct list_head poll_events_list; + struct list_head events_list; + struct xio_ev_data *deleted_events[MAX_DELETED_EVENTS]; +}; + +/*---------------------------------------------------------------------------*/ +/* epoll_to_xio_poll_events */ +/*---------------------------------------------------------------------------*/ +static inline uint32_t epoll_to_xio_poll_events(uint32_t epoll_events) +{ + uint32_t xio_events = 0; + + if (epoll_events & EPOLLIN) + xio_events |= XIO_POLLIN; + if (epoll_events & EPOLLOUT) + xio_events |= XIO_POLLOUT; + if (epoll_events & EPOLLRDHUP) + xio_events |= XIO_POLLRDHUP; + if (epoll_events & EPOLLET) + xio_events |= XIO_POLLET; + if (epoll_events & EPOLLONESHOT) + xio_events |= XIO_ONESHOT; + if (epoll_events & EPOLLHUP) + xio_events |= XIO_POLLHUP; + if (epoll_events & EPOLLERR) + xio_events |= XIO_POLLERR; + + return xio_events; +} + +/*---------------------------------------------------------------------------*/ +/* xio_to_epoll_poll_events */ +/*---------------------------------------------------------------------------*/ +static inline uint32_t xio_to_epoll_poll_events(uint32_t xio_events) +{ + uint32_t epoll_events = 0; + + if (xio_events & XIO_POLLIN) + epoll_events |= EPOLLIN; + if (xio_events & XIO_POLLOUT) + epoll_events |= EPOLLOUT; + if (xio_events & XIO_POLLRDHUP) + epoll_events |= EPOLLRDHUP; + if (xio_events & XIO_POLLET) + epoll_events |= EPOLLET; + if (xio_events & XIO_ONESHOT) + epoll_events |= EPOLLONESHOT; + if (xio_events & XIO_POLLHUP) + epoll_events |= EPOLLHUP; + if (xio_events & XIO_POLLERR) + epoll_events |= EPOLLERR; + + return epoll_events; +} + +/*---------------------------------------------------------------------------*/ +/* xio_event_add */ +/*---------------------------------------------------------------------------*/ +int xio_ev_loop_add(void *loop_hndl, int fd, int events, + xio_ev_handler_t handler, void *data) +{ + struct xio_ev_loop *loop = (struct xio_ev_loop *)loop_hndl; + struct epoll_event ev; + struct xio_ev_data *tev = NULL; + int err; + + memset(&ev, 0, sizeof(ev)); + ev.events = xio_to_epoll_poll_events(events); + + if (fd != loop->wakeup_event) { + tev = (struct xio_ev_data *)ucalloc(1, sizeof(*tev)); + if (!tev) { + xio_set_error(errno); + ERROR_LOG("calloc failed, %m\n"); + return -1; + } + tev->data = data; + tev->ev_handler = handler; + tev->fd = fd; + + 
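+	/*
+	 * Note (editorial, for clarity): the descriptor allocated above is
+	 * linked onto loop->poll_events_list so that xio_event_lookup() and
+	 * xio_ev_loop_del() can later find it by fd. The wakeup eventfd is
+	 * deliberately registered with ev.data.ptr == NULL, which is how the
+	 * dispatch loop below tells it apart from user file descriptors.
+	 */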
list_add(&tev->events_list_entry, &loop->poll_events_list); + } + + ev.data.ptr = tev; + err = epoll_ctl(loop->efd, EPOLL_CTL_ADD, fd, &ev); + if (err) { + if (fd != loop->wakeup_event) + list_del(&tev->events_list_entry); + xio_set_error(errno); + if (errno != EEXIST) + ERROR_LOG("epoll_ctl failed fd:%d, %m\n", fd); + else + DEBUG_LOG("epoll_ctl already exists fd:%d, %m\n", fd); + ufree(tev); + } + + return err; +} + +/*---------------------------------------------------------------------------*/ +/* xio_event_lookup */ +/*---------------------------------------------------------------------------*/ +static struct xio_ev_data *xio_event_lookup(void *loop_hndl, int fd) +{ + struct xio_ev_loop *loop = (struct xio_ev_loop *)loop_hndl; + struct xio_ev_data *tev; + + list_for_each_entry(tev, &loop->poll_events_list, events_list_entry) { + if (tev->fd == fd) + return tev; + } + return NULL; +} + +/*---------------------------------------------------------------------------*/ +/* xio_ev_loop_del */ +/*---------------------------------------------------------------------------*/ +int xio_ev_loop_del(void *loop_hndl, int fd) +{ + struct xio_ev_loop *loop = (struct xio_ev_loop *)loop_hndl; + struct xio_ev_data *tev; + int ret; + + if (fd != loop->wakeup_event) { + tev = xio_event_lookup(loop, fd); + if (!tev) { + xio_set_error(ENOENT); + ERROR_LOG("event lookup failed. fd:%d\n", fd); + return -1; + } + list_del(&tev->events_list_entry); + if (loop->deleted_events_nr < MAX_DELETED_EVENTS) { + loop->deleted_events[loop->deleted_events_nr] = tev; + loop->deleted_events_nr++; + } else { + ERROR_LOG("failed to delete event\n"); + } + } + + ret = epoll_ctl(loop->efd, EPOLL_CTL_DEL, fd, NULL); + if (ret < 0) { + xio_set_error(errno); + ERROR_LOG("epoll_ctl failed. %m\n"); + } + + return ret; +} + +/*---------------------------------------------------------------------------*/ +/* xio_ev_loop_modify */ +/*---------------------------------------------------------------------------*/ +int xio_ev_loop_modify(void *loop_hndl, int fd, int events) +{ + struct xio_ev_loop *loop = (struct xio_ev_loop *)loop_hndl; + struct epoll_event ev; + struct xio_ev_data *tev = NULL; + int retval; + + if (fd != loop->wakeup_event) { + tev = xio_event_lookup(loop, fd); + if (!tev) { + xio_set_error(ENOENT); + ERROR_LOG("event lookup failed. fd:%d\n", fd); + return -1; + } + } + + memset(&ev, 0, sizeof(ev)); + ev.events = xio_to_epoll_poll_events(events); + ev.data.ptr = tev; + + retval = epoll_ctl(loop->efd, EPOLL_CTL_MOD, fd, &ev); + if (retval != 0) { + xio_set_error(errno); + ERROR_LOG("epoll_ctl failed. efd:%d, fd:%d %m\n", + loop->efd, fd); + } + + return retval; +} + +/*---------------------------------------------------------------------------*/ +/* xio_ev_loop_create */ +/*---------------------------------------------------------------------------*/ +void *xio_ev_loop_create() +{ + struct xio_ev_loop *loop; + int retval; + eventfd_t val = 1; + + loop = (struct xio_ev_loop *)ucalloc(1, sizeof(struct xio_ev_loop)); + if (!loop) { + xio_set_error(errno); + ERROR_LOG("calloc failed. %m\n"); + return NULL; + } + + INIT_LIST_HEAD(&loop->poll_events_list); + INIT_LIST_HEAD(&loop->events_list); + + loop->stop_loop = 0; + loop->wakeup_armed = 0; + loop->deleted_events_nr = 0; + loop->efd = epoll_create(4096); + if (loop->efd == -1) { + xio_set_error(errno); + ERROR_LOG("epoll_create failed. 
%m\n"); + goto cleanup; + } + + /* prepare the wakeup eventfd */ + loop->wakeup_event = eventfd(0, EFD_NONBLOCK); + if (loop->wakeup_event == -1) { + xio_set_error(errno); + ERROR_LOG("eventfd failed. %m\n"); + goto cleanup1; + } + /* ADD & SET the wakeup fd and once application wants to arm + * just MODify the already prepared eventfd to the epoll */ + xio_ev_loop_add(loop, loop->wakeup_event, 0, NULL, NULL); + retval = eventfd_write(loop->wakeup_event, val); + if (retval != 0) + goto cleanup2; + + return loop; + +cleanup2: + close(loop->wakeup_event); +cleanup1: + close(loop->efd); +cleanup: + ufree(loop); + return NULL; +} + +/*---------------------------------------------------------------------------*/ +/* xio_ev_loop_init_event */ +/*---------------------------------------------------------------------------*/ +void xio_ev_loop_init_event(struct xio_ev_data *evt, + xio_event_handler_t event_handler, void *data) +{ + evt->handler = event_handler; + evt->scheduled = 0; + evt->data = data; + INIT_LIST_HEAD(&evt->events_list_entry); +} + +/*---------------------------------------------------------------------------*/ +/* xio_ev_loop_add_event */ +/*---------------------------------------------------------------------------*/ +void xio_ev_loop_add_event(void *_loop, struct xio_ev_data *evt) +{ + struct xio_ev_loop *loop = (struct xio_ev_loop *)_loop; + + if (!evt->scheduled) { + evt->scheduled = 1; + list_add_tail(&evt->events_list_entry, + &loop->events_list); + } +} + +/*---------------------------------------------------------------------------*/ +/* xio_ev_loop_remove_event */ +/*---------------------------------------------------------------------------*/ +void xio_ev_loop_remove_event(struct xio_ev_data *evt) +{ + if (evt->scheduled) { + evt->scheduled = 0; + list_del_init(&evt->events_list_entry); + } +} + +/*---------------------------------------------------------------------------*/ +/* xio_ev_loop_is_pending_event */ +/*---------------------------------------------------------------------------*/ +int xio_ev_loop_is_pending_event(struct xio_ev_data *evt) +{ + return evt->scheduled; +} + +/*---------------------------------------------------------------------------*/ +/* xio_ev_loop_exec_scheduled */ +/*---------------------------------------------------------------------------*/ +static int xio_ev_loop_exec_scheduled(struct xio_ev_loop *loop) +{ + struct list_head *last_sched; + struct list_head *events_list_entry; + struct xio_ev_data *tev, *tevn; + xio_event_handler_t event_handler; + void *event_data; + int work_remains = 0; + + if (!list_empty(&loop->events_list)) { + /* execute only work scheduled till now */ + last_sched = loop->events_list.prev; + list_for_each_entry_safe(tev, tevn, &loop->events_list, + events_list_entry) { + xio_ev_loop_remove_event(tev); + /* copy the relevant fields tev can be freed in + * callback + */ + event_handler = tev->handler; + event_data = tev->data; + events_list_entry = &tev->events_list_entry; + event_handler(event_data); + if (events_list_entry == last_sched) + break; + } + if (!list_empty(&loop->events_list)) + work_remains = 1; + } + return work_remains; +} + +/*---------------------------------------------------------------------------*/ +/* xio_ev_loop_deleted_event_lookup */ +/*---------------------------------------------------------------------------*/ +static inline int xio_ev_loop_deleted_event_lookup(struct xio_ev_loop *loop, + struct xio_ev_data *tev) +{ + int j; + + for (j = 0; j < loop->deleted_events_nr; j++) { + if 
(loop->deleted_events[j] == tev) + return 1; + } + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_ev_loop_run_helper */ +/*---------------------------------------------------------------------------*/ +static inline int xio_ev_loop_run_helper(void *loop_hndl, int timeout) +{ + struct xio_ev_loop *loop = (struct xio_ev_loop *)loop_hndl; + int nevent = 0, i, found = 0; + struct epoll_event events[1024]; + struct xio_ev_data *tev; + int work_remains; + int tmout; + int wait_time = timeout; + uint32_t out_events; + cycles_t start_cycle = 0; + + if (timeout != -1) + start_cycle = get_cycles(); + +retry: + work_remains = xio_ev_loop_exec_scheduled(loop); + tmout = work_remains ? 0 : timeout; + + /* free deleted event handlers */ + if (unlikely(loop->deleted_events_nr)) + while (loop->deleted_events_nr) + ufree(loop->deleted_events[--loop->deleted_events_nr]); + + nevent = epoll_wait(loop->efd, events, ARRAY_SIZE(events), tmout); + if (unlikely(nevent < 0)) { + if (errno != EINTR) { + xio_set_error(errno); + ERROR_LOG("epoll_wait failed. %m\n"); + return -1; + } + goto retry; + } else if (nevent > 0) { + /* save the epoll modify in "stop" while dispatching handlers */ + loop->in_dispatch = 1; + for (i = 0; i < nevent; i++) { + tev = (struct xio_ev_data *)events[i].data.ptr; + if (likely(tev)) { + /* look for deleted event handlers */ + if (unlikely(loop->deleted_events_nr)) { + found = + xio_ev_loop_deleted_event_lookup( + loop, tev); + if (found) + break; + + continue; + } + out_events = + epoll_to_xio_poll_events( + events[i].events); + /* (fd != loop->wakeup_event) */ + tev->ev_handler(tev->fd, out_events, + tev->data); + } else { + /* wakeup event auto-removed from epoll + * due to ONESHOT + * */ + + /* check wakeup is armed to prevent false + * wake ups + * */ + if (loop->wakeup_armed == 1) { + loop->wakeup_armed = 0; + loop->stop_loop = 1; + } + } + } + loop->in_dispatch = 0; + } else { + /* timed out */ + if (tmout || timeout == 0) + loop->stop_loop = 1; + /* TODO: timeout should be updated by the elapsed + * duration of each loop + * */ + } + /* calculate the remaining timeout */ + if (timeout != -1 && !loop->stop_loop) { + int time_passed = (int)((get_cycles() - + start_cycle)/(1000*g_mhz) + 0.5); + if (time_passed >= wait_time) + loop->stop_loop = 1; + else + timeout = wait_time - time_passed; + } + + if (likely(loop->stop_loop == 0)) { + goto retry; + } else { + /* drain events before returning */ + while (!list_empty(&loop->events_list)) + xio_ev_loop_exec_scheduled(loop); + + /* free deleted event handlers */ + while (loop->deleted_events_nr) + ufree(loop->deleted_events[--loop->deleted_events_nr]); + } + + loop->stop_loop = 0; + loop->wakeup_armed = 0; + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_ev_loop_run_timeout */ +/*---------------------------------------------------------------------------*/ +int xio_ev_loop_run_timeout(void *loop_hndl, int timeout_msec) +{ + return xio_ev_loop_run_helper(loop_hndl, timeout_msec); +} + +/*---------------------------------------------------------------------------*/ +/* xio_ev_loop_run */ +/*---------------------------------------------------------------------------*/ +int xio_ev_loop_run(void *loop_hndl) +{ + return xio_ev_loop_run_helper(loop_hndl, -1 /* block indefinitely */); +} + +/*---------------------------------------------------------------------------*/ +/* xio_ev_loop_stop */ 
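+/*
+ * Editorial note on the stop path below: xio_ev_loop_stop() marks the loop
+ * with stop_loop = 1 and, when it is not called from inside the dispatch
+ * path and the wakeup is not already armed, re-arms the pre-signalled wakeup
+ * eventfd as XIO_POLLIN | XIO_ONESHOT so a blocking epoll_wait() returns at
+ * once. An illustrative (hypothetical) usage sketch of this loop API:
+ *
+ *	void *loop = xio_ev_loop_create();
+ *	xio_ev_loop_add(loop, fd, XIO_POLLIN, my_handler, my_data);
+ *	xio_ev_loop_run(loop);        // blocks; a handler may call
+ *	                              // xio_ev_loop_stop(loop) to return
+ *	xio_ev_loop_del(loop, fd);
+ *	xio_ev_loop_destroy(loop);
+ *
+ * fd, my_handler and my_data are application-provided placeholders.
+ */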
+/*---------------------------------------------------------------------------*/ +inline void xio_ev_loop_stop(void *loop_hndl) +{ + struct xio_ev_loop *loop = (struct xio_ev_loop *)loop_hndl; + + if (!loop || loop->efd == -1) + return; + + if (loop->stop_loop == 1) + return; /* loop is already marked for stopping (and also + armed for wakeup from blocking) */ + loop->stop_loop = 1; + + if (loop->in_dispatch || loop->wakeup_armed == 1) + return; /* wakeup is still armed, probably left loop in previous + cycle due to other reasons (timeout, events) */ + loop->wakeup_armed = 1; + xio_ev_loop_modify(loop, loop->wakeup_event, + XIO_POLLIN | XIO_ONESHOT); +} + +/*---------------------------------------------------------------------------*/ +/* xio_ev_loop_destroy */ +/*---------------------------------------------------------------------------*/ +void xio_ev_loop_destroy(void *loop_hndl) +{ + struct xio_ev_loop *loop = (struct xio_ev_loop *)loop_hndl; + struct xio_ev_data *tev, *tmp_tev; + + if (!loop) + return; + + /* mark loop as stopped */ + loop->stop_loop = 1; + list_for_each_entry_safe(tev, tmp_tev, &loop->poll_events_list, + events_list_entry) { + xio_ev_loop_del(loop, tev->fd); + } + + list_for_each_entry_safe(tev, tmp_tev, &loop->events_list, + events_list_entry) { + xio_ev_loop_remove_event(tev); + } + + /* free deleted event handlers */ + while (loop->deleted_events_nr) + ufree(loop->deleted_events[--loop->deleted_events_nr]); + + xio_ev_loop_del(loop, loop->wakeup_event); + + close(loop->efd); + loop->efd = -1; + + close(loop->wakeup_event); + loop->wakeup_event = -1; + + ufree(loop); +} + +/*---------------------------------------------------------------------------*/ +/* xio_ev_loop_poll_wait */ +/*---------------------------------------------------------------------------*/ +int xio_ev_loop_poll_wait(void *loop_hndl, int timeout_ms) +{ + struct xio_ev_loop *loop = (struct xio_ev_loop *)loop_hndl; + + loop->stop_loop = 1; + return xio_ev_loop_run_helper(loop, timeout_ms); +} + +/*---------------------------------------------------------------------------*/ +/* xio_ev_loop_get_poll_fd */ +/*---------------------------------------------------------------------------*/ +int xio_ev_loop_get_poll_fd(void *loop_hndl) +{ + struct xio_ev_loop *loop = (struct xio_ev_loop *)loop_hndl; + + if (!loop_hndl) { + xio_set_error(EINVAL); + return -1; + } + return loop->efd; +} + +/*---------------------------------------------------------------------------*/ +/* xio_ev_loop_is_stopping */ +/*---------------------------------------------------------------------------*/ +inline int xio_ev_loop_is_stopping(void *loop_hndl) +{ + return loop_hndl ? ((struct xio_ev_loop *)loop_hndl)->stop_loop : 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_ev_loop_reset_stop */ +/*---------------------------------------------------------------------------*/ +void xio_ev_loop_reset_stop(void *loop_hndl) +{ + struct xio_ev_loop *loop = (struct xio_ev_loop *)loop_hndl; + + loop->stop_loop = 0; + loop->wakeup_armed = 0; +} + diff --git a/open_src/xio/src/usr/xio/xio_ev_loop.h b/open_src/xio/src/usr/xio/xio_ev_loop.h new file mode 100644 index 0000000..71ff9a0 --- /dev/null +++ b/open_src/xio/src/usr/xio/xio_ev_loop.h @@ -0,0 +1,208 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. 
+ * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef XIO_EV_LOOP_H +#define XIO_EV_LOOP_H + +/*---------------------------------------------------------------------------*/ +/* XIO default event loop API */ +/* */ +/* NoTE: xio provides default muxer implementation around epoll. 
*/ +/* users are encouraged to utilize their own implementations and provides */ +/* appropriate services to xio via the xio's context open interface */ +/*---------------------------------------------------------------------------*/ +/** + * initializes event loop handle + * + * @returns event loop handle or NULL upon error + */ +void *xio_ev_loop_create(void); + +/** + * xio_ev_loop_run - event loop main loop + * + * @param[in] loop Pointer to the event dispatcher + * + * @returns success (0), or a (negative) error value + */ +int xio_ev_loop_run(void *loop); + +/** + * event loop main loop with limited blocking duration + * + * @param[in] loop_hndl Pointer to event loop + * @param[in] timeout_msec The timeout argument specifies the minimum + * number of milliseconds that xio_ev_loop_run + * will block before exiting + * + * @returns success (0), or a (negative) error value + */ +int xio_ev_loop_run_timeout(void *loop_hndl, int timeout_msec); + +/** + * stop a running event loop main loop + * + * @param[in] loop Pointer to event loop + */ +void xio_ev_loop_stop(void *loop); + +/** + * reset stop parameters + * + * @param[in] loop Pointer to event loop + */ +void xio_ev_loop_reset_stop(void *loop_hndl); + +/** + * check if stop activated + * + * @param[in] loop Pointer to event loop + */ +int xio_ev_loop_is_stopping(void *loop_hndl); + +/** + * destroy the event loop + * + * @param[in] loop Pointer to event loop + */ +void xio_ev_loop_destroy(void *loop); + +/** + * add event handlers on dispatcher + * + * @param[in] loop the dispatcher context + * @param[in] fd the file descriptor + * @param[in] events the event signaled as defined in + * enum xio_ev_loop_events + * @param[in] handler event handler that handles the event + * @param[in] data user private data + * + * @returns success (0), or a (negative) error value + */ +int xio_ev_loop_add(void *loop, + int fd, int events, + xio_ev_handler_t handler, + void *data); + +/** + * modify event handlers on dispatcher + * + * @param[in] loop the dispatcher context + * @param[in] fd the file descriptor + * @param[in] events the event signaled as defined in + * enum xio_ev_loop_events + * + * @returns success (0), or a (negative) error value + */ +int xio_ev_loop_modify(void *loop_hndl, int fd, int events); + +/** + * delete event handlers from dispatcher + * + * @param[in] loop the dispatcher context + * @param[in] fd the file descriptor + * + * @returns success (0), or a (negative) error value + */ +int xio_ev_loop_del(void *loop, int fd); + +/** + * get context poll fd, which can be later passed to an external dispatcher + * + * @param[in] loop the dispatcher context + * + * @return fd (non-negative) on success, or -1 on error. If an error occurs, + * call xio_errno function to get the failure reason. + */ +int xio_ev_loop_get_poll_fd(void *loop); + +/** + * poll for events for a specified (possibly infinite) amount of time; + * + * this function relies on polling and waiting mechanisms applied to all file + * descriptors and other event signaling resources (e.g. 
hw event queues) + * associated with the context; these mechanisms are invoked until the first + * successful polling attempt is made; + * + * all events which became pending till then are handled and the user callbacks + * are called as appropriate for those events; then the functions exits + * + * the number of actual events handled originated by any source of events is + * guaranteed to be limited + * + * @param[in] loop Pointer to the xio loop handle + * @param[in] timeout_ms number of milliseconds to wait before exiting, + * with or without events handled + * 0 : just poll instantly, don't wait + * XIO_INFINITE: wait for at least a single event + * + * @return 0 on success, or -1 on error. If an error occurs, call + * xio_errno function to get the failure reason. + */ +int xio_ev_loop_poll_wait(void *loop, int timeout_ms); + +/** + * add event job to scheduled events queue + * + * @param[in] loop the dispatcher context + * @param[in] evt the scheduled event data + * + * @returns none + */ +void xio_ev_loop_add_event(void *loop, + struct xio_ev_data *evt); + +/** + * remove event from events queue + * + * @param[in] evt the scheduled event data + * + * @returns none + */ +void xio_ev_loop_remove_event(struct xio_ev_data *evt); + +/** + * check whether event is pending + * + * @param[in] evt the event data + * + * @returns 1 if pending, 0 if not pending + */ +int xio_ev_loop_is_pending_event(struct xio_ev_data *evt); + +#endif + diff --git a/open_src/xio/src/usr/xio/xio_init.c b/open_src/xio/src/usr/xio/xio_init.c new file mode 100644 index 0000000..d61898d --- /dev/null +++ b/open_src/xio/src/usr/xio/xio_init.c @@ -0,0 +1,166 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#include +#include "libxio.h" +#include "xio_log.h" +#include "xio_common.h" +#include "xio_tls.h" +#include "xio_sessions_cache.h" +#include "xio_nexus_cache.h" +#include "xio_observer.h" +#include "xio_transport.h" +#include "xio_idr.h" +#include "xio_init.h" + +int page_size; +double g_mhz; +struct xio_idr *usr_idr = NULL; + +#ifdef HAVE_INFINIBAND_VERBS_H +extern struct xio_transport xio_rdma_transport; +#endif + +struct xio_transport *xio_rdma_get_transport_func_list(void); +struct xio_transport *xio_tcp_get_transport_func_list(void); + +typedef struct xio_transport *(*get_transport_func_list_t)(void); + +static get_transport_func_list_t transport_func_list_tbl[] = { +#ifdef HAVE_INFINIBAND_VERBS_H + xio_rdma_get_transport_func_list, +#endif + xio_tcp_get_transport_func_list +}; + +#define transport_tbl_sz (sizeof(transport_func_list_tbl) \ + / sizeof(transport_func_list_tbl[0])) + +static struct xio_transport *transport_tbl[transport_tbl_sz]; + +static volatile int32_t ini_refcnt; /*= 0 */ +static DEFINE_MUTEX(ini_mutex); + +extern double xio_get_cpu_mhz(void); + +/*---------------------------------------------------------------------------*/ +/* xio_dtor */ +/*---------------------------------------------------------------------------*/ +static void xio_dtor(void) +{ + size_t i; + + for (i = 0; i < transport_tbl_sz; i++) { + if (transport_tbl[i] == NULL) + continue; + if (transport_tbl[i]->release) + transport_tbl[i]->release(transport_tbl[i]); + + if (transport_tbl[i]->dtor) + transport_tbl[i]->dtor(); + + xio_unreg_transport(transport_tbl[i]); + } + xio_idr_destroy(usr_idr); + xio_thread_data_destruct(); + xio_env_cleanup(); +} + +/*---------------------------------------------------------------------------*/ +/* xio_dtor */ +/*---------------------------------------------------------------------------*/ +static void xio_ctor(void) +{ + size_t i; + + xio_env_startup(); + for (i = 0; i < transport_tbl_sz; i++) + if (!transport_tbl[i]) + transport_tbl[i] = transport_func_list_tbl[i](); + + page_size = xio_get_page_size(); + if (page_size < 0) + page_size = 4096; + g_mhz = xio_get_cpu_mhz(); + xio_thread_data_construct(); + usr_idr = xio_idr_create(); + if (!usr_idr) + ERROR_LOG("usr_idr creation failed"); + sessions_cache_construct(); + nexus_cache_construct(); + + for (i = 0; i < transport_tbl_sz; i++) { + if (transport_tbl[i] == NULL) + continue; + xio_reg_transport(transport_tbl[i]); + + if (transport_tbl[i]->ctor) + transport_tbl[i]->ctor(); + } +} + +void xio_init(void) +{ + mutex_lock(&ini_mutex); + if (++ini_refcnt == 1) + xio_ctor(); + mutex_unlock(&ini_mutex); +} + +void xio_shutdown(void) +{ + mutex_lock(&ini_mutex); + if (ini_refcnt <= 0) { + ERROR_LOG("reference count < 0\n"); + abort(); + mutex_unlock(&ini_mutex); + return; + } + if (--ini_refcnt == 0) + xio_dtor(); + mutex_unlock(&ini_mutex); +} + +int xio_inited(void) +{ + int ret; + mutex_lock(&ini_mutex); + ret = ini_refcnt; + mutex_unlock(&ini_mutex); + return ret; +} + diff 
--git a/open_src/xio/src/usr/xio/xio_init.h b/open_src/xio/src/usr/xio/xio_init.h new file mode 100644 index 0000000..f1a72f8 --- /dev/null +++ b/open_src/xio/src/usr/xio/xio_init.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef XIO_INIT_H +#define XIO_INIT_H + +/*---------------------------------------------------------------------------*/ +/* xio_inited */ +/*---------------------------------------------------------------------------*/ +int xio_inited(void); + +#endif /* XIO_INIT_H */ + diff --git a/open_src/xio/src/usr/xio/xio_log.c b/open_src/xio/src/usr/xio/xio_log.c new file mode 100644 index 0000000..7626e71 --- /dev/null +++ b/open_src/xio/src/usr/xio/xio_log.c @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#include +#include "libxio.h" +#include "xio_log.h" + +void xio_vlog(const char *file, unsigned line, const char *function, + unsigned level, const char *fmt, ...); + +enum xio_log_level xio_logging_level = XIO_LOG_LEVEL_ERROR; +xio_log_fn xio_vlog_fn = xio_vlog; + +#define LOG_TIME_FMT "%04d/%02d/%02d-%02d:%02d:%02d.%05ld" + +/*---------------------------------------------------------------------------*/ +/* xio_vlog */ +/*---------------------------------------------------------------------------*/ +void xio_vlog(const char *file, unsigned line, const char *function, + unsigned level, const char *fmt, ...) +{ + va_list args; + const char *short_file; + struct timeval tv; + struct tm t; + char buf[2048]; + char buf2[256]; + int length = 0; + static const char * const level_str[] = { + "FATAL", "ERROR", "WARN", "INFO", "DEBUG", "TRACE" + }; + time_t time1; + + va_start(args, fmt); + length = vsnprintf(buf, sizeof(buf), fmt, args); + va_end(args); + buf[length] = 0; + + gettimeofday(&tv, NULL); + time1 = (time_t)tv.tv_sec; + localtime_r(&time1, &t); + + short_file = strrchr(file, '/'); + short_file = (!short_file) ? file : short_file + 1; + + snprintf(buf2, sizeof(buf2), "%s:%u", short_file, line); + /* + fprintf(stderr, + "[%012lu.%06lu] %-28s [%-5s] - %s", + tv.tv_sec, tv.tv_usec, buf2, level_str[level], buf); + */ + fprintf(stderr, + "[" LOG_TIME_FMT "] %-28s [%-5s] - %s", + t.tm_year + 1900, t.tm_mon + 1, t.tm_mday, + t.tm_hour, t.tm_min, t.tm_sec, tv.tv_usec, + buf2, + level_str[level], buf); + + fflush(stderr); +} + +/*---------------------------------------------------------------------------*/ +/* xio_read_logging_level */ +/*---------------------------------------------------------------------------*/ +void xio_read_logging_level(void) +{ + char *val = getenv("XIO_TRACE"); + int level = 0; + + if (!val) + return; + + level = atoi(val); + if (level >= XIO_LOG_LEVEL_FATAL && level <= XIO_LOG_LEVEL_TRACE) + xio_logging_level = (enum xio_log_level)level; +} + diff --git a/open_src/xio/src/usr/xio/xio_log.h b/open_src/xio/src/usr/xio/xio_log.h new file mode 100644 index 0000000..7ee1f85 --- /dev/null +++ b/open_src/xio/src/usr/xio/xio_log.h @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. 
+ * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef XIO_LOG_H +#define XIO_LOG_H + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Enable compiler checks for printf-like formatting. + * + * @param fmtargN number of formatting argument + * @param vargN number of variadic argument + */ +#define XIO_F_PRINTF(fmtarg, varg) \ + __attribute__((__format__(printf, fmtarg, varg))) + +/*---------------------------------------------------------------------------*/ +/* enum */ +/*---------------------------------------------------------------------------*/ +extern enum xio_log_level xio_logging_level; +extern xio_log_fn xio_vlog_fn; + +extern void xio_vlog(const char *file, unsigned line, const char *function, + unsigned level, const char *fmt, ...); + +#define xio_log(level, fmt, ...) \ + do { \ + if (unlikely(((level) < XIO_LOG_LEVEL_LAST) && \ + (level) <= xio_logging_level)) { \ + xio_vlog_fn(__FILE__, __LINE__, __func__, (level), \ + fmt, ## __VA_ARGS__); \ + } \ + } while (0) + +#define FATAL_LOG(fmt, ...) xio_log(XIO_LOG_LEVEL_FATAL, fmt, \ + ## __VA_ARGS__) +#define ERROR_LOG(fmt, ...) xio_log(XIO_LOG_LEVEL_ERROR, fmt, \ + ## __VA_ARGS__) +#define WARN_LOG(fmt, ...) xio_log(XIO_LOG_LEVEL_WARN, fmt,\ + ## __VA_ARGS__) +#define INFO_LOG(fmt, ...) xio_log(XIO_LOG_LEVEL_INFO, fmt,\ + ## __VA_ARGS__) +#define DEBUG_LOG(fmt, ...) xio_log(XIO_LOG_LEVEL_DEBUG, fmt,\ + ## __VA_ARGS__) +#define TRACE_LOG(fmt, ...) 
xio_log(XIO_LOG_LEVEL_TRACE, fmt,\ + ## __VA_ARGS__) + +void xio_read_logging_level(void); + +static inline int xio_set_log_level(enum xio_log_level level) +{ + xio_logging_level = level; + + return 0; +} + +static inline enum xio_log_level xio_get_log_level(void) +{ + return xio_logging_level; +} + +static inline int xio_set_log_fn(xio_log_fn fn) +{ + if (!fn) + xio_vlog_fn = xio_vlog; + else + xio_vlog_fn = fn; + + return 0; +} + +#ifdef __cplusplus +} +#endif + +#endif /* XIO_LOG_H */ diff --git a/open_src/xio/src/usr/xio/xio_mem.c b/open_src/xio/src/usr/xio/xio_mem.c new file mode 100644 index 0000000..9b10357 --- /dev/null +++ b/open_src/xio/src/usr/xio/xio_mem.c @@ -0,0 +1,197 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#include +#include +#include "xio_log.h" +#include "xio_common.h" +#include "xio_mem.h" + +#define HUGE_PAGE_SZ (2*1024*1024) +#ifndef WIN32 +int disable_huge_pages = 0; +#else +int disable_huge_pages = 1; /* bypass hugepages */ +#endif +int allocator_assigned = 0; +struct xio_mem_allocator g_mem_allocator; +struct xio_mem_allocator *mem_allocator = &g_mem_allocator; + +/*---------------------------------------------------------------------------*/ +/* malloc_huge_pages */ +/*---------------------------------------------------------------------------*/ +void *malloc_huge_pages(size_t size) +{ + int retval; + size_t real_size; + void *ptr = NULL; + + if (disable_huge_pages) { + long page_size = xio_get_page_size(); + + if (page_size < 0) { + xio_set_error(errno); + ERROR_LOG("sysconf failed. 
(errno=%d %m)\n", errno); + return NULL; + } + + real_size = ALIGN(size, page_size); + retval = xio_memalign(&ptr, page_size, real_size); + if (retval) { + ERROR_LOG("posix_memalign failed sz:%zu. %s\n", + real_size, strerror(retval)); + return NULL; + } + memset(ptr, 0, real_size); + return ptr; + } + + /* Use 1 extra page to store allocation metadata */ + /* (libhugetlbfs is more efficient in this regard) */ + real_size = ALIGN(size + HUGE_PAGE_SZ, HUGE_PAGE_SZ); + + ptr = xio_mmap(real_size); + if (!ptr || ptr == MAP_FAILED) { + /* The mmap() call failed. Try to malloc instead */ + long page_size = xio_get_page_size(); + + if (page_size < 0) { + xio_set_error(errno); + ERROR_LOG("sysconf failed. (errno=%d %m)\n", errno); + return NULL; + } + WARN_LOG("huge pages allocation failed, allocating " \ + "regular pages\n"); + + DEBUG_LOG("mmap rdma pool sz:%zu failed (errno=%d %m)\n", + real_size, errno); + real_size = ALIGN(size + HUGE_PAGE_SZ, page_size); + retval = xio_memalign(&ptr, page_size, real_size); + if (retval) { + ERROR_LOG("posix_memalign failed sz:%zu. %s\n", + real_size, strerror(retval)); + return NULL; + } + memset(ptr, 0, real_size); + real_size = 0; + } else { + DEBUG_LOG("Allocated huge page sz:%zu\n", real_size); + } + /* Save real_size since mmunmap() requires a size parameter */ + *((size_t *)ptr) = real_size; + /* Skip the page with metadata */ + return sum_to_ptr(ptr, HUGE_PAGE_SZ); +} + +/*---------------------------------------------------------------------------*/ +/* free_huge_pages */ +/*---------------------------------------------------------------------------*/ +void free_huge_pages(void *ptr) +{ + void *real_ptr; + size_t real_size; + + if (!ptr) + return; + + if (disable_huge_pages) { + free(ptr); + return; + } + + /* Jump back to the page with metadata */ + real_ptr = (char *)ptr - HUGE_PAGE_SZ; + /* Read the original allocation size */ + real_size = *((size_t *)real_ptr); + + if (real_size != 0) + /* The memory was allocated via mmap() + and must be deallocated via munmap() + */ + xio_munmap(real_ptr, real_size); + else + /* The memory was allocated via malloc() + and must be deallocated via free() + */ + free(real_ptr); +} + +/*---------------------------------------------------------------------------*/ +/* xio_numa_alloc */ +/*---------------------------------------------------------------------------*/ +void *xio_numa_alloc(size_t bytes, int node) +{ + size_t real_size = ALIGN((bytes + page_size), page_size); + void *p = xio_numa_alloc_onnode(real_size, node); + + if (!p) { + ERROR_LOG("numa_alloc_onnode failed sz:%zu. 
%m\n", + real_size); + return NULL; + } + /* force the OS to allocate physical memory for the region */ + memset(p, 0, real_size); + + /* Save real_size since numa_free() requires a size parameter */ + *((size_t *)p) = real_size; + + /* Skip the page with metadata */ + return sum_to_ptr(p, page_size); +} + +/*---------------------------------------------------------------------------*/ +/* xio_numa_free_ptr */ +/*---------------------------------------------------------------------------*/ +void xio_numa_free_ptr(void *ptr) +{ + void *real_ptr; + size_t real_size; + + if (!ptr) + return; + + /* Jump back to the page with metadata */ + real_ptr = (char *)ptr - page_size; + /* Read the original allocation size */ + real_size = *((size_t *)real_ptr); + + if (real_size != 0) + /* The memory was allocated via numa_alloc() + and must be deallocated via numa_free() + */ + xio_numa_free(real_ptr, real_size); +} diff --git a/open_src/xio/src/usr/xio/xio_mem.h b/open_src/xio/src/usr/xio/xio_mem.h new file mode 100644 index 0000000..dabd2c1 --- /dev/null +++ b/open_src/xio/src/usr/xio/xio_mem.h @@ -0,0 +1,166 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ +#ifndef XIO_MEM_H +#define XIO_MEM_H + +#include + +extern int disable_huge_pages; +extern int allocator_assigned; +extern int page_size; +extern struct xio_mem_allocator *mem_allocator; + +extern void *malloc_huge_pages(size_t size); +extern void free_huge_pages(void *ptr); +extern void *xio_numa_alloc(size_t bytes, int node); +extern void xio_numa_free_ptr(void *ptr); + +static inline void xio_disable_huge_pages(int disable) +{ + if (disable_huge_pages) + return; + disable_huge_pages = disable; +} + +static inline int xio_set_mem_allocator(struct xio_mem_allocator *allocator) +{ + if (allocator_assigned) { + /* xio_set_error(EPERM);*/ + return -1; + } + memcpy(mem_allocator, allocator, sizeof(*allocator)); + allocator_assigned = 1; + + return 0; +} + +static inline void *ucalloc(size_t nmemb, size_t size) +{ + void *ptr; + + if (allocator_assigned && mem_allocator->allocate) { + ptr = mem_allocator->allocate(nmemb*size, + mem_allocator->user_context); + if (ptr) + memset(ptr, 0, nmemb*size); + } else { + ptr = calloc(nmemb, size); + } + return ptr; +} + +static inline void *umalloc(size_t size) +{ + if (allocator_assigned && mem_allocator->allocate) + return mem_allocator->allocate(size, + mem_allocator->user_context); + else + return malloc(size); +} + +static inline void *umemalign(size_t boundary, size_t size) +{ + void *ptr; + + if (allocator_assigned && mem_allocator->memalign) { + ptr = mem_allocator->memalign(boundary, size, + mem_allocator->user_context); + } else { + if (xio_memalign(&ptr, boundary, size) != 0) + return NULL; + } + if (ptr) + memset(ptr, 0, size); + return ptr; +} + +static inline void ufree(void *ptr) +{ + if (allocator_assigned && mem_allocator->free) + mem_allocator->free(ptr, mem_allocator->user_context); +#ifndef WIN32 + /*TODO: for win, sometimes 'free' and sometimes aligned_free is needed*/ + else + free(ptr); +#endif +} + +static inline void *umalloc_huge_pages(size_t size) +{ + void *ptr; + + if (allocator_assigned && mem_allocator->malloc_huge_pages) { + ptr = mem_allocator->malloc_huge_pages( + size, mem_allocator->user_context); + if (ptr) + memset(ptr, 0, size); + } else { + ptr = malloc_huge_pages(size); + } + return ptr; +} + +static inline void ufree_huge_pages(void *ptr) +{ + if (allocator_assigned && mem_allocator->free_huge_pages) + mem_allocator->free_huge_pages(ptr, + mem_allocator->user_context); + else + free_huge_pages(ptr); +} + +static inline void *unuma_alloc(size_t size, int node) +{ + if (allocator_assigned && mem_allocator->numa_alloc) + return mem_allocator->numa_alloc(size, node, + mem_allocator->user_context); + else + return xio_numa_alloc(size, node); +} + +static inline void unuma_free(void *ptr) +{ + if (allocator_assigned && mem_allocator->numa_free) + mem_allocator->numa_free(ptr, + mem_allocator->user_context); + else + xio_numa_free_ptr(ptr); +} + +#endif + diff --git a/open_src/xio/src/usr/xio/xio_netlink.c b/open_src/xio/src/usr/xio/xio_netlink.c new file mode 100644 index 0000000..f4d6163 --- /dev/null +++ b/open_src/xio/src/usr/xio/xio_netlink.c @@ -0,0 +1,237 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. 
+ * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include "libxio.h" +#include "xio_log.h" +#include "xio_common.h" +#include "xio_observer.h" +#include "get_clock.h" +#include "xio_ev_data.h" +#include "xio_ev_loop.h" +#include "xio_idr.h" +#include "xio_objpool.h" +#include "xio_workqueue.h" +#include "xio_timers_list.h" +#include "xio_protocol.h" +#include "xio_mbuf.h" +#include "xio_task.h" +#include "xio_context.h" + +#define XIO_NETLINK_MCAST_GRP_ID 4 + +/*---------------------------------------------------------------------------*/ +/* xio_stats_handler */ +/*---------------------------------------------------------------------------*/ +static void xio_stats_handler(int fd, int events, void *data) +{ + struct xio_context *ctx = (struct xio_context *)data; + unsigned char buf[NLMSG_SPACE(1024)]; + struct nlmsghdr *nlh = (struct nlmsghdr *)buf; + struct msghdr msg; + struct iovec iov; + struct sockaddr_nl dest_addr; + uint64_t now = get_cycles(); + ssize_t ret; + char *ptr; + int i; + + /* read netlink message */ + iov.iov_base = (void *)nlh; + /* max size for receive */ + iov.iov_len = NLMSG_SPACE(1024); + + memset(&msg, 0, sizeof(msg)); + msg.msg_name = (void *)&dest_addr; + msg.msg_namelen = sizeof(dest_addr); + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + ret = recvmsg(fd, &msg, 0); + if (ret <= 0) + return; + + ptr = (char *)NLMSG_DATA(nlh); + + switch (nlh->nlmsg_type - NLMSG_MIN_TYPE) { + case 0: /* Format */ + /* counting will start now */ + memset(&ctx->stats.counter, 0, + XIO_STAT_LAST * sizeof(uint64_t)); + /* First the cycles' hertz (assumed to be fixed) */ + memcpy(ptr, &ctx->stats.hertz, sizeof(ctx->stats.hertz)); + ptr += sizeof(ctx->stats.hertz); + memcpy(ptr, &now, sizeof(now)); + ptr += sizeof(now); + /* Counters' name */ + for (i = 0; i < XIO_STAT_LAST; i++) { + if (!ctx->stats.name[i]) + continue; + strcpy(ptr, 
ctx->stats.name[i]); + /* keep the '\0' */ + ptr += strlen(ptr) + 1; + } + /* but not the last '\0' */ + ptr--; + break; + case 1: /* Statistics */ + /* Fisrt the timestamp in cycles */ + memcpy(ptr, &now, sizeof(now)); + ptr += sizeof(now); + /* for each named counter counter */ + for (i = 0; i < XIO_STAT_LAST; i++) { + if (!ctx->stats.name[i]) + continue; + memcpy((void *)ptr, &ctx->stats.counter[i], + sizeof(uint64_t)); + ptr += sizeof(uint64_t); + } + break; + default: /* Not yet implemented */ + ERROR_LOG("Unsupported message type(%d)\n", nlh->nlmsg_type); + return; + } + + /* header is in the buffer */ + nlh->nlmsg_len = ptr - (char *)buf; + iov.iov_len = nlh->nlmsg_len; + + nlh->nlmsg_pid = getpid(); + nlh->nlmsg_flags = 1; + /* don't modify type */ + + /* Send unicst */ + dest_addr.nl_groups = 0; + /* send response */ + ret = sendmsg(fd, &msg, 0); + if (ret <= 0) + return; +} + +/*---------------------------------------------------------------------------*/ +/* xio_netlink */ +/*---------------------------------------------------------------------------*/ +int xio_netlink(struct xio_context *ctx) +{ + struct sockaddr_nl nladdr; + int fd; + socklen_t addr_len; + + /* only root can bind netlink socket */ + if (geteuid() != 0) { + DEBUG_LOG("statistics monitoring disabled. " \ + "not privileged user\n"); + return 0; + } + + fd = socket(PF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, NETLINK_GENERIC); + if (fd < 0) { + xio_set_error(errno); + ERROR_LOG("socket failed. %m\n"); + return -1; + } + + nladdr.nl_family = AF_NETLINK; + nladdr.nl_pad = 0; + + /* Listen to both UC and MC + * By default the monitoring program send MC request but if + * a thread starts after the monitor program than it will miss + * the request for the format. When the monitoring program receives + * statistics from a thread that it doesn't have its format it will + * send a UC request directly to it + * + */ + nladdr.nl_pid = 0; + nladdr.nl_groups = XIO_NETLINK_MCAST_GRP_ID; + + if (bind(fd, (struct sockaddr *)&nladdr, sizeof(nladdr))) { + /* + * I suspect that accelio is broken on kernel 3.19 due + * to the following patch: + * https://patchwork.ozlabs.org/patch/429350/ + */ + if (errno == ENOENT) { + WARN_LOG("netlink bind failed. %m\n"); + close(fd); + return 0; + } + xio_set_error(errno); + ERROR_LOG("netlink bind failed. %m\n"); + goto cleanup; + } + + addr_len = sizeof(nladdr); + if (getsockname(fd, (struct sockaddr *)&nladdr, &addr_len)) { + xio_set_error(errno); + ERROR_LOG("getsockname failed. 
%m\n"); + goto cleanup; + } + + if (addr_len != sizeof(nladdr)) { + xio_set_error(EINVAL); + ERROR_LOG("invalid addr_len\n"); + goto cleanup; + } + if (nladdr.nl_family != AF_NETLINK) { + xio_set_error(EINVAL); + ERROR_LOG("invalid nl_family\n"); + goto cleanup; + } + + DEBUG_LOG("netlink socket bind to port %u\n", + nladdr.nl_pid); + + xio_ev_loop_add(ctx->ev_loop, fd, XIO_POLLIN, + xio_stats_handler, ctx); + + ctx->stats.hertz = g_mhz * 1000000.0 + 0.5; + /* Init default counters' name */ + ctx->stats.name[XIO_STAT_TX_MSG] = strdup("TX_MSG"); + ctx->stats.name[XIO_STAT_RX_MSG] = strdup("RX_MSG"); + ctx->stats.name[XIO_STAT_TX_BYTES] = strdup("TX_BYTES"); + ctx->stats.name[XIO_STAT_RX_BYTES] = strdup("RX_BYTES"); + ctx->stats.name[XIO_STAT_DELAY] = strdup("DELAY"); + ctx->stats.name[XIO_STAT_APPDELAY] = strdup("APPDELAY"); + + ctx->netlink_sock = (void *)(unsigned long)fd; + return 0; + +cleanup: + close(fd); + return -1; +} + diff --git a/open_src/xio/src/usr/xio/xio_os.h b/open_src/xio/src/usr/xio/xio_os.h new file mode 100644 index 0000000..1f90cfe --- /dev/null +++ b/open_src/xio/src/usr/xio/xio_os.h @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ +#ifndef XIO_OS_H +#define XIO_OS_H + +#include + +static inline int xio_memalign(void **memptr, size_t alignment, + size_t size); +static inline void xio_memfree(void *memptr); +static inline long xio_get_page_size(void); +static inline void *xio_mmap(size_t length); +static inline int xio_munmap(void *addr, size_t length); +static inline void *xio_numa_alloc_onnode(size_t size, int node); +static inline void xio_numa_free(void *start, size_t size); + +#include +#include "get_clock.h" + +#endif /* XIO_OS_H */ diff --git a/open_src/xio/src/usr/xio/xio_sg_iov.c b/open_src/xio/src/usr/xio/xio_sg_iov.c new file mode 100644 index 0000000..74f5887 --- /dev/null +++ b/open_src/xio/src/usr/xio/xio_sg_iov.c @@ -0,0 +1,220 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/* sg represents xio_sg_iov; */ +#include "libxio.h" +#include +#include "xio_sg_table.h" + +/*---------------------------------------------------------------------------*/ +/* xio_sgve_set_buf */ +/*---------------------------------------------------------------------------*/ +static inline void xio_sgve_set_buf(struct xio_iovec_ex *sg, const void *buf, + uint32_t buflen, void *mr) +{ + sg->iov_base = (void *)buf; + sg->iov_len = buflen; + sg->mr = (struct xio_mr *)mr; +} + +/*---------------------------------------------------------------------------*/ +/* xio_sgve_addr */ +/*---------------------------------------------------------------------------*/ +static inline void *xio_sgve_addr(struct xio_iovec_ex *sg) +{ + return sg->iov_base; +} + +/*---------------------------------------------------------------------------*/ +/* xio_sgve_set_addr */ +/*---------------------------------------------------------------------------*/ +static inline void xio_sgve_set_addr(struct xio_iovec_ex *sg, void *addr) +{ + sg->iov_base = addr; +} + +/*---------------------------------------------------------------------------*/ +/* xio_sgve_length */ +/*---------------------------------------------------------------------------*/ +static inline size_t xio_sgve_length(struct xio_iovec_ex *sg) +{ + return sg->iov_len; +} + +/*---------------------------------------------------------------------------*/ +/* xio_sgve_set_length */ +/*---------------------------------------------------------------------------*/ +static inline void xio_sgve_set_length(struct xio_iovec_ex *sg, + uint32_t length) +{ + sg->iov_len = length; +} + +/*---------------------------------------------------------------------------*/ +/* xio_sgve_mr */ +/*---------------------------------------------------------------------------*/ +static inline void *xio_sgve_mr(struct xio_iovec_ex *sg) +{ + return sg->mr; +} + +/*---------------------------------------------------------------------------*/ +/* xio_sgve_set_mr */ +/*---------------------------------------------------------------------------*/ +static inline void xio_sgve_set_mr(struct xio_iovec_ex *sg, void *mr) +{ + sg->mr = (struct xio_mr *)mr; +} + +/*---------------------------------------------------------------------------*/ +/* xio_sgve_first */ +/*---------------------------------------------------------------------------*/ +static struct xio_iovec_ex *xio_sgve_first(struct xio_sg_iov *sgv) +{ + return ((!sgv || sgv->nents == 0) ? + NULL : &sgv->sglist[0]); +} + +/*---------------------------------------------------------------------------*/ +/* xio_sgve_last */ +/*---------------------------------------------------------------------------*/ +static struct xio_iovec_ex *xio_sgve_last(struct xio_sg_iov *sgv) +{ + return ((!sgv || sgv->nents == 0) ? + NULL : &sgv->sglist[sgv->nents - 1]); +} + +/*---------------------------------------------------------------------------*/ +/* xio_sgve_next */ +/*---------------------------------------------------------------------------*/ +static struct xio_iovec_ex *xio_sgve_next(struct xio_sg_iov *sgv, + struct xio_iovec_ex *sgve) +{ + return (!sgv || sgv->nents == 0 || + (sgve == &sgv->sglist[sgv->nents - 1])) + ? 
NULL : ++sgve; +} + +/*---------------------------------------------------------------------------*/ +/* xio_sgv_sglist */ +/*---------------------------------------------------------------------------*/ +static inline struct xio_iovec_ex *xio_sgv_sglist(struct xio_sg_iov *sgv) +{ + return sgv->sglist; +} + +/*---------------------------------------------------------------------------*/ +/* xio_sgv_nents */ +/*---------------------------------------------------------------------------*/ +static inline int xio_sgv_nents(struct xio_sg_iov *sgv) +{ + return sgv->nents; +} + +/*---------------------------------------------------------------------------*/ +/* xio_sgv_max_nents */ +/*---------------------------------------------------------------------------*/ +static inline int xio_sgv_max_nents(struct xio_sg_iov *sgv) +{ + return XIO_IOVLEN; +} + +/*---------------------------------------------------------------------------*/ +/* xio_sgv_set_nents */ +/*---------------------------------------------------------------------------*/ +static inline void xio_sgv_set_nents(struct xio_sg_iov *sgv, uint32_t nents) +{ + if (!sgv || XIO_IOVLEN < nents) + return; + sgv->nents = nents; +} + +/*---------------------------------------------------------------------------*/ +/* xio_sgv_set_max_nents */ +/*---------------------------------------------------------------------------*/ +static inline void xio_sgv_set_max_nents(struct xio_sg_iov *sgv, + uint32_t max_nents) +{ + sgv->max_nents = XIO_IOVLEN; +} + +/*---------------------------------------------------------------------------*/ +/* xio_sgv_empty */ +/*---------------------------------------------------------------------------*/ +static int xio_sgv_empty(struct xio_sg_iov *sgv) +{ + return (!sgv || sgv->nents == 0); +} + +/*---------------------------------------------------------------------------*/ +/* xio_sgv_length */ +/*---------------------------------------------------------------------------*/ +static size_t xio_sgv_length(struct xio_sg_iov *sgv) +{ + size_t sz = 0; + uint32_t i; + + for (i = 0; i < sgv->nents; i++) + sz += sgv->sglist[i].iov_len; + + return sz; +} + +/*---------------------------------------------------------------------------*/ +/* sgtbl_ops_iov */ +/*---------------------------------------------------------------------------*/ +struct xio_sg_table_ops sgtbl_ops_iov = { + .sge_set_buf = (sge_set_buf_fn)xio_sgve_set_buf, + .sge_addr = (sge_addr_fn)xio_sgve_addr, + .sge_set_addr = (sge_set_addr_fn)xio_sgve_set_addr, + .sge_mr = (sge_mr_fn)xio_sgve_mr, + .sge_set_mr = (sge_set_mr_fn)xio_sgve_set_mr, + .sge_length = (sge_length_fn)xio_sgve_length, + .sge_set_length = (sge_set_length_fn)xio_sgve_set_length, + .sge_first = (sge_first_fn)xio_sgve_first, + .sge_last = (sge_last_fn)xio_sgve_last, + .sge_next = (sge_next_fn)xio_sgve_next, + .tbl_empty = (tbl_empty_fn)xio_sgv_empty, + .tbl_nents = (tbl_nents_fn)xio_sgv_nents, + .tbl_sglist = (tbl_sglist_fn)xio_sgv_sglist, + .tbl_set_nents = (tbl_set_nents_fn)xio_sgv_set_nents, + .tbl_max_nents = (tbl_max_nents_fn)xio_sgv_max_nents, + .tbl_set_max_nents = (tbl_set_max_nents_fn)xio_sgv_set_max_nents, + .tbl_length = (tbl_length_fn)xio_sgv_length, +}; + diff --git a/open_src/xio/src/usr/xio/xio_sg_iovptr.c b/open_src/xio/src/usr/xio/xio_sg_iovptr.c new file mode 100644 index 0000000..745e712 --- /dev/null +++ b/open_src/xio/src/usr/xio/xio_sg_iovptr.c @@ -0,0 +1,218 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. 
+ * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/* sg represents xio_sg_iovptr; */ +#include "libxio.h" +#include +#include "xio_sg_table.h" + +/*---------------------------------------------------------------------------*/ +/* xio_sgve_set_buf */ +/*---------------------------------------------------------------------------*/ +static inline void xio_sgve_set_buf(struct xio_iovec_ex *sg, const void *buf, + uint32_t buflen, void *mr) +{ + sg->iov_base = (void *)buf; + sg->iov_len = buflen; + sg->mr = (struct xio_mr *)mr; +} + +/*---------------------------------------------------------------------------*/ +/* xio_sgve_addr */ +/*---------------------------------------------------------------------------*/ +static inline void *xio_sgve_addr(struct xio_iovec_ex *sg) +{ + return sg->iov_base; +} + +/*---------------------------------------------------------------------------*/ +/* xio_sgve_set_addr */ +/*---------------------------------------------------------------------------*/ +static inline void xio_sgve_set_addr(struct xio_iovec_ex *sg, void *addr) +{ + sg->iov_base = addr; +} + +/*---------------------------------------------------------------------------*/ +/* xio_sgve_length */ +/*---------------------------------------------------------------------------*/ +static inline size_t xio_sgve_length(struct xio_iovec_ex *sg) +{ + return sg->iov_len; +} + +/*---------------------------------------------------------------------------*/ +/* xio_sgve_set_length */ +/*---------------------------------------------------------------------------*/ +static inline void xio_sgve_set_length(struct xio_iovec_ex *sg, + uint32_t length) +{ + sg->iov_len = length; +} + +/*---------------------------------------------------------------------------*/ +/* xio_sgve_mr */ +/*---------------------------------------------------------------------------*/ +static inline void *xio_sgve_mr(struct xio_iovec_ex *sg) +{ + return sg->mr; +} + +/*---------------------------------------------------------------------------*/ +/* xio_sgve_set_mr */ +/*---------------------------------------------------------------------------*/ +static inline void xio_sgve_set_mr(struct xio_iovec_ex *sg, void *mr) +{ + sg->mr = (struct xio_mr *)mr; +} + +/*---------------------------------------------------------------------------*/ +/* xio_sgve_first */ +/*---------------------------------------------------------------------------*/ +static inline struct xio_iovec_ex *xio_sgve_first(struct xio_sg_iovptr *sgv) +{ + return ((!sgv || sgv->nents == 0) ? NULL : &sgv->sglist[0]); +} + +/*---------------------------------------------------------------------------*/ +/* xio_sgve_last */ +/*---------------------------------------------------------------------------*/ +static inline struct xio_iovec_ex *xio_sgve_last(struct xio_sg_iovptr *sgv) +{ + return ((!sgv || sgv->nents == 0) ? + NULL : &sgv->sglist[sgv->nents - 1]); +} + +/*---------------------------------------------------------------------------*/ +/* xio_sgve_next */ +/*---------------------------------------------------------------------------*/ +static inline struct xio_iovec_ex *xio_sgve_next(struct xio_sg_iovptr *sgv, + struct xio_iovec_ex *sgve) +{ + return (!sgv || sgv->nents == 0 || + (sgve == &sgv->sglist[sgv->nents - 1]) ? 
NULL : ++sgve); +} + +/*---------------------------------------------------------------------------*/ +/* xio_sgv_sglist */ +/*---------------------------------------------------------------------------*/ +static inline struct xio_iovec_ex *xio_sgv_sglist(struct xio_sg_iovptr *sgv) +{ + return sgv->sglist; +} + +/*---------------------------------------------------------------------------*/ +/* xio_sgv_nents */ +/*---------------------------------------------------------------------------*/ +static inline int xio_sgv_nents(struct xio_sg_iovptr *sgv) +{ + return sgv->nents; +} + +/*---------------------------------------------------------------------------*/ +/* xio_sgv_max_nents */ +/*---------------------------------------------------------------------------*/ +static inline int xio_sgv_max_nents(struct xio_sg_iovptr *sgv) +{ + return sgv->max_nents; +} + +/*---------------------------------------------------------------------------*/ +/* xio_sgv_set_nents */ +/*---------------------------------------------------------------------------*/ +static inline void xio_sgv_set_nents(struct xio_sg_iovptr *sgv, uint32_t nents) +{ + if (!sgv || sgv->max_nents < nents) + return; + sgv->nents = nents; +} + +/*---------------------------------------------------------------------------*/ +/* xio_sgv_set_max_nents */ +/*---------------------------------------------------------------------------*/ +static inline void xio_sgv_set_max_nents(struct xio_sg_iovptr *sgv, + uint32_t max_nents) +{ + sgv->max_nents = max_nents; +} + +/*---------------------------------------------------------------------------*/ +/* xio_sgv_empty */ +/*---------------------------------------------------------------------------*/ +static inline int xio_sgv_empty(struct xio_sg_iovptr *sgv) +{ + return (!sgv || sgv->nents == 0); +} + +/*---------------------------------------------------------------------------*/ +/* xio_sgv_set_length */ +/*---------------------------------------------------------------------------*/ +static inline size_t xio_sgv_length(struct xio_sg_iovptr *sgv) +{ + size_t sz = 0; + uint32_t i; + + for (i = 0; i < sgv->nents; i++) + sz += sgv->sglist[i].iov_len; + + return sz; +} + +/*---------------------------------------------------------------------------*/ +/* sgtbl_ops_iovptr */ +/*---------------------------------------------------------------------------*/ +struct xio_sg_table_ops sgtbl_ops_iovptr = { + .sge_set_buf = (sge_set_buf_fn)xio_sgve_set_buf, + .sge_addr = (sge_addr_fn)xio_sgve_addr, + .sge_set_addr = (sge_set_addr_fn)xio_sgve_set_addr, + .sge_mr = (sge_mr_fn)xio_sgve_mr, + .sge_set_mr = (sge_set_mr_fn)xio_sgve_set_mr, + .sge_length = (sge_length_fn)xio_sgve_length, + .sge_set_length = (sge_set_length_fn)xio_sgve_set_length, + .sge_first = (sge_first_fn)xio_sgve_first, + .sge_last = (sge_last_fn)xio_sgve_last, + .sge_next = (sge_next_fn)xio_sgve_next, + .tbl_empty = (tbl_empty_fn)xio_sgv_empty, + .tbl_nents = (tbl_nents_fn)xio_sgv_nents, + .tbl_sglist = (tbl_sglist_fn)xio_sgv_sglist, + .tbl_set_nents = (tbl_set_nents_fn)xio_sgv_set_nents, + .tbl_max_nents = (tbl_max_nents_fn)xio_sgv_max_nents, + .tbl_set_max_nents = (tbl_set_max_nents_fn)xio_sgv_set_max_nents, + .tbl_length = (tbl_length_fn)xio_sgv_length, +}; + diff --git a/open_src/xio/src/usr/xio/xio_sg_table.c b/open_src/xio/src/usr/xio/xio_sg_table.c new file mode 100644 index 0000000..885b82c --- /dev/null +++ b/open_src/xio/src/usr/xio/xio_sg_table.c @@ -0,0 +1,232 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. 
+ * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ +#include +#include "xio_log.h" +#include "xio_sg_table.h" + +extern struct xio_sg_table_ops sgtbl_ops_iov; +extern struct xio_sg_table_ops sgtbl_ops_iovptr; + +void *xio_sg_table_ops_get(enum xio_sgl_type sgl_type) +{ + static void *vec[XIO_SGL_TYPE_LAST] = { + [XIO_SGL_TYPE_IOV] = (void *)&sgtbl_ops_iov, + [XIO_SGL_TYPE_IOV_PTR] = (void *)&sgtbl_ops_iovptr, + [XIO_SGL_TYPE_SCATTERLIST] = NULL + }; + + return vec[sgl_type]; +} +EXPORT_SYMBOL(xio_sg_table_ops_get); + +/*---------------------------------------------------------------------------*/ +/* tbl_clone */ +/*---------------------------------------------------------------------------*/ +int tbl_clone(struct xio_sg_table_ops *dtbl_ops, void *dtbl, + struct xio_sg_table_ops *stbl_ops, void *stbl) +{ + void *dsge; + void *ssge; + unsigned int i; + + if (tbl_max_nents(dtbl_ops, dtbl) < tbl_nents(stbl_ops, stbl)) { + ERROR_LOG("dest max nents is %d while src nents is %d\n", + tbl_max_nents(dtbl_ops, dtbl), + tbl_nents(stbl_ops, stbl)); + return -1; + } + + tbl_set_nents(dtbl_ops, dtbl, + tbl_nents(stbl_ops, stbl)); + ssge = sge_first(stbl_ops, stbl); + for_each_sge(dtbl, dtbl_ops, dsge, i) { + sge_set_addr(dtbl_ops, dsge, + sge_addr(stbl_ops, ssge)); + sge_set_length(dtbl_ops, dsge, + sge_length(stbl_ops, ssge)); + + ssge = sge_next(stbl_ops, stbl, ssge); + } + + return 0; +} +EXPORT_SYMBOL(tbl_clone); + +/*---------------------------------------------------------------------------*/ +/* tbl_copy */ +/*---------------------------------------------------------------------------*/ +int tbl_copy(struct xio_sg_table_ops *dtbl_ops, void *dtbl, + struct xio_sg_table_ops *stbl_ops, void *stbl) +{ + void *dsge = sge_first(dtbl_ops, dtbl); + void *ssge = sge_first(stbl_ops, stbl); + void *daddr = sge_addr(dtbl_ops, dsge); + void *saddr = sge_addr(stbl_ops, ssge); + size_t dlen = sge_length(dtbl_ops, dsge); + size_t slen = sge_length(stbl_ops, ssge); + size_t dnents = tbl_nents(dtbl_ops, dtbl); + size_t snents = tbl_nents(stbl_ops, stbl); + + size_t d = 0, + s = 0, + dst_len = 0; + + if (dnents < 1 || snents < 1) { + ERROR_LOG("nents < 1 dnents:%zd, snents:%zd\n", + dnents, snents); + return 0; + } + + while (1) { + if (slen < dlen) { + memcpy(daddr, saddr, slen); + dst_len += slen; + + s++; + ssge = sge_next(stbl_ops, stbl, ssge); + if (s == snents) { + sge_set_length(dtbl_ops, dsge, dst_len); + d++; + /*dsge = sge_next(dtbl_ops, dtbl, dsge);*/ + break; + } + dlen -= slen; + inc_ptr(daddr, slen); + saddr = sge_addr(stbl_ops, ssge); + slen = sge_length(stbl_ops, ssge); + } else if (dlen < slen) { + memcpy(daddr, saddr, dlen); + sge_set_length(dtbl_ops, dsge, (dst_len + dlen)); + dst_len = 0; + d++; + dsge = sge_next(dtbl_ops, dtbl, dsge); + if (d == dnents) + break; + slen -= dlen; + inc_ptr(saddr, dlen); + daddr = sge_addr(dtbl_ops, dsge); + dlen = sge_length(dtbl_ops, dsge); + } else { + memcpy(daddr, saddr, dlen); + sge_set_length(dtbl_ops, dsge, (dst_len + dlen)); + dst_len = 0; + + d++; + s++; + dsge = sge_next(dtbl_ops, dtbl, dsge); + ssge = sge_next(stbl_ops, stbl, ssge); + if ((d == dnents) || (s == snents)) + break; + + daddr = sge_addr(dtbl_ops, dsge); + dlen = sge_length(dtbl_ops, dsge); + saddr = sge_addr(stbl_ops, ssge); + slen = sge_length(stbl_ops, ssge); + } + } + + /* not enough buffers to complete */ + if (s < snents) { + ERROR_LOG("dest iovec exhausted\n"); + return 0; + } + tbl_set_nents(dtbl_ops, dtbl, d); + + return 0; +} +EXPORT_SYMBOL(tbl_copy); + 
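+/*
+ * Editor's note: the sketch below is illustrative only and is not part of
+ * the original accelio sources. It shows one way tbl_copy() above can be
+ * driven through the generic sg-table accessors (tbl_set_nents, sge_first,
+ * sge_set_addr, sge_set_length) for the plain XIO_SGL_TYPE_IOV layout.
+ * The helper name example_tbl_copy_iov() and the 64-byte buffers are
+ * assumptions made for the example, so the block is kept compiled out.
+ */
+#if 0
+static int example_tbl_copy_iov(void)
+{
+	struct xio_sg_table_ops *ops = (struct xio_sg_table_ops *)
+				xio_sg_table_ops_get(XIO_SGL_TYPE_IOV);
+	struct xio_sg_iov src, dst;
+	char in[64] = "payload", out[64] = {0};
+	void *sge;
+
+	memset(&src, 0, sizeof(src));
+	memset(&dst, 0, sizeof(dst));
+
+	/* one source entry holding the payload */
+	tbl_set_nents(ops, &src, 1);
+	sge = sge_first(ops, &src);
+	sge_set_addr(ops, sge, in);
+	sge_set_length(ops, sge, sizeof(in));
+
+	/* one destination entry large enough to receive it */
+	tbl_set_nents(ops, &dst, 1);
+	sge = sge_first(ops, &dst);
+	sge_set_addr(ops, sge, out);
+	sge_set_length(ops, sge, sizeof(out));
+
+	/* copies the bytes and fixes up the destination lengths/nents */
+	return tbl_copy(ops, &dst, ops, &src);
+}
+#endif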
+/*---------------------------------------------------------------------------*/ +/* tbl_copy_sg */ +/*---------------------------------------------------------------------------*/ +int tbl_copy_sg(struct xio_sg_table_ops *dtbl_ops, void *dtbl, + struct xio_sg_table_ops *stbl_ops, void *stbl) +{ + void *dsge = sge_first(dtbl_ops, dtbl); + void *ssge = sge_first(stbl_ops, stbl); + void *daddr = sge_addr(dtbl_ops, dsge); + void *saddr = sge_addr(stbl_ops, ssge); + size_t dlen = sge_length(dtbl_ops, dsge); + size_t slen = sge_length(stbl_ops, ssge); + size_t dnents = tbl_nents(dtbl_ops, dtbl); + size_t snents = tbl_nents(stbl_ops, stbl); + + size_t d = 0, + s = 0; + + if (dnents < 1 || snents < 1) { + ERROR_LOG("nents < 1 dnents:%zd, snents:%zd\n", + dnents, snents); + return 0; + } + if (dnents < snents) { + ERROR_LOG("dnents < snents dnents:%zd, snents:%zd\n", + dnents, snents); + return 0; + } + + dnents = snents; + while (1) { + if (slen <= dlen) { + dlen = slen; + memcpy(daddr, saddr, dlen); + sge_set_length(dtbl_ops, dsge, dlen); + + d++; + s++; + dsge = sge_next(dtbl_ops, dtbl, dsge); + ssge = sge_next(stbl_ops, stbl, ssge); + if ((d == dnents) || (s == snents)) + break; + + daddr = sge_addr(dtbl_ops, dsge); + dlen = sge_length(dtbl_ops, dsge); + saddr = sge_addr(stbl_ops, ssge); + slen = sge_length(stbl_ops, ssge); + } else { + ERROR_LOG("not enough buffer to complete " \ + "slen:%d dlen:%d\n", slen, dlen); + break; + } + } + tbl_set_nents(dtbl_ops, dtbl, d); + + return 0; +} +EXPORT_SYMBOL(tbl_copy_sg); + diff --git a/open_src/xio/src/usr/xio/xio_task.c b/open_src/xio/src/usr/xio/xio_task.c new file mode 100644 index 0000000..1355b11 --- /dev/null +++ b/open_src/xio/src/usr/xio/xio_task.c @@ -0,0 +1,346 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#include +#include "libxio.h" +#include "xio_log.h" +#include "xio_common.h" +#include "xio_protocol.h" +#include "xio_mbuf.h" +#include "xio_task.h" +#include "xio_mem.h" +#include + +#define XIO_TASK_MAGIC 0x58494f54 /* Hex of 'XIOT' */ + +/*---------------------------------------------------------------------------*/ +/* xio_tasks_pool_alloc_slab */ +/*---------------------------------------------------------------------------*/ +int xio_tasks_pool_alloc_slab(struct xio_tasks_pool *q, void *context) +{ + int alloc_nr; + size_t slab_alloc_sz; + size_t tasks_alloc_sz; + void *buf; + void *data, *ptr; + struct xio_tasks_slab *s; + struct xio_task *task; + int retval = 0, i; + int tot_sz; + int huge_alloc = 0; + LIST_HEAD(tmp_list); + INIT_LIST_HEAD(&tmp_list); + + if ((int)q->params.start_nr < 0 || (int)q->params.max_nr < 0 || + (int)q->params.alloc_nr < 0) { + xio_set_error(EINVAL); + return -1; + } + if (q->params.start_nr && q->curr_alloced < q->params.start_nr) + alloc_nr = min(q->params.start_nr, q->params.max_nr); + else + alloc_nr = min(q->params.alloc_nr, + q->params.max_nr - q->curr_alloced); + + if (alloc_nr == 0) + return 0; + + /* slab + private data */ + slab_alloc_sz = sizeof(struct xio_tasks_slab) + + q->params.slab_dd_data_sz + + alloc_nr * sizeof(struct xio_task *); + + /* slab data */ + tasks_alloc_sz = alloc_nr * (sizeof(struct xio_task) + + g_options.max_in_iovsz * sizeof(struct xio_iovec_ex) + + g_options.max_out_iovsz * sizeof(struct xio_iovec_ex) + + q->params.task_dd_data_sz); + + tot_sz = slab_alloc_sz + tasks_alloc_sz; + + if (tot_sz > 1 << 20) { + buf = umalloc_huge_pages(tot_sz); + huge_alloc = 1; + } else { + buf = umemalign(64, tot_sz); + } + if (!buf) { + xio_set_error(ENOMEM); + ERROR_LOG("ucalloc failed\n"); + return -1; + } + data = buf; + ptr = buf; + + /* slab */ + s = (struct xio_tasks_slab *)((char *)buf + tasks_alloc_sz); + s->dd_data = (void *)((char *)s + sizeof(struct xio_tasks_slab)); + + /* array */ + s->array = (struct xio_task **) + ((char *)(s->dd_data) + q->params.slab_dd_data_sz); + + /* fix indexes */ + s->start_idx = q->curr_idx; + s->end_idx = s->start_idx + alloc_nr - 1; + q->curr_idx = s->end_idx + 1; + s->nr = alloc_nr; + s->huge_alloc = huge_alloc; + + if (q->params.pool_hooks.slab_pre_create) { + retval = q->params.pool_hooks.slab_pre_create( + context, + alloc_nr, + q->dd_data, + s->dd_data); + if (retval) + goto cleanup; + } + + for (i = 0; i < alloc_nr; i++) { + s->array[i] = (struct xio_task *)data; + task = s->array[i]; + task->tlv_type = 0xdead; + task->ltid = s->start_idx + i; + task->magic = XIO_TASK_MAGIC; + task->pool = (void *)q; + task->slab = (void *)s; + task->dd_data = ((char *)data) + + sizeof(struct xio_task); + + data = ((char *)data) + sizeof(struct xio_task) + + q->params.task_dd_data_sz; + + task->imsg.in.sgl_type = XIO_SGL_TYPE_IOV_PTR; + task->imsg.in.pdata_iov.sglist = (struct xio_iovec_ex *)data; + task->imsg.in.pdata_iov.max_nents = g_options.max_in_iovsz; + + data = 
((char *)data) + + g_options.max_in_iovsz * sizeof(struct xio_iovec_ex); + + task->imsg.out.sgl_type = XIO_SGL_TYPE_IOV_PTR; + task->imsg.out.pdata_iov.sglist = (struct xio_iovec_ex *)data; + task->imsg.out.pdata_iov.max_nents = + g_options.max_out_iovsz; + + data = ((char *)data) + + g_options.max_out_iovsz * sizeof(struct xio_iovec_ex); + + if (q->params.pool_hooks.slab_init_task && context) { + retval = q->params.pool_hooks.slab_init_task( + context, + q->dd_data, + s->dd_data, + i, + task); + if (retval) + goto cleanup; + } + list_add_tail(&task->tasks_list_entry, &tmp_list); + } + q->curr_alloced += alloc_nr; + + list_add_tail(&s->slabs_list_entry, &q->slabs_list); + list_splice_tail(&tmp_list, &q->stack); + + if (q->params.pool_hooks.slab_post_create && context) { + retval = q->params.pool_hooks.slab_post_create( + context, + q->dd_data, + s->dd_data); + if (retval) + goto cleanup; + } + return retval; + +cleanup: + if (huge_alloc) + ufree_huge_pages(ptr); + else + ufree(ptr); + + return -1; +} +EXPORT_SYMBOL(xio_tasks_pool_alloc_slab); + +/*---------------------------------------------------------------------------*/ +/* xio_tasks_pool_create */ +/*---------------------------------------------------------------------------*/ +struct xio_tasks_pool *xio_tasks_pool_create( + struct xio_tasks_pool_params *params) +{ + struct xio_tasks_pool *q; + char *buf; + + /* pool */ + buf = (char *)ucalloc(sizeof(*q) + params->pool_dd_data_sz, 1); + if (!buf) { + xio_set_error(ENOMEM); + ERROR_LOG("ucalloc failed\n"); + return NULL; + } + q = (struct xio_tasks_pool *)buf; + if (params->pool_dd_data_sz) + q->dd_data = (void *)(q + 1); + else + q->dd_data = NULL; + + INIT_LIST_HEAD(&q->stack); + INIT_LIST_HEAD(&q->slabs_list); + + memcpy(&q->params, params, sizeof(*params)); + + if (q->params.pool_hooks.pool_pre_create) + q->params.pool_hooks.pool_pre_create( + q->params.pool_hooks.context, q, q->dd_data); + + if (q->params.start_nr) { + xio_tasks_pool_alloc_slab(q, q->params.pool_hooks.context); + if (list_empty(&q->stack)) { + ufree(q); + return NULL; + } + } + if (q->params.pool_hooks.pool_post_create) + q->params.pool_hooks.pool_post_create( + q->params.pool_hooks.context, q, q->dd_data); + + return q; +} +EXPORT_SYMBOL(xio_tasks_pool_create); + +/*---------------------------------------------------------------------------*/ +/* xio_tasks_pool_destroy */ +/*---------------------------------------------------------------------------*/ +void xio_tasks_pool_destroy(struct xio_tasks_pool *q) +{ + struct xio_tasks_slab *pslab, *next_pslab; + unsigned int i; + + list_for_each_entry_safe(pslab, next_pslab, &q->slabs_list, + slabs_list_entry) { + list_del(&pslab->slabs_list_entry); + + if (q->params.pool_hooks.slab_uninit_task) { + for (i = 0; i < pslab->nr; i++) + q->params.pool_hooks.slab_uninit_task( + pslab->array[i]->context, + q->dd_data, + pslab->dd_data, + pslab->array[i]); + } + + if (q->params.pool_hooks.slab_destroy) + q->params.pool_hooks.slab_destroy( + q->params.pool_hooks.context, + q->dd_data, + pslab->dd_data); + + /* the tmp tasks are returned back to pool */ + + if (pslab->huge_alloc) + ufree_huge_pages(pslab->array[0]); + else + ufree(pslab->array[0]); + } + + if (q->params.pool_hooks.pool_destroy) + q->params.pool_hooks.pool_destroy( + q->params.pool_hooks.context, + q, q->dd_data); + + kfree(q->params.pool_name); + + ufree(q); +} +EXPORT_SYMBOL(xio_tasks_pool_destroy); + +/*---------------------------------------------------------------------------*/ +/* xio_tasks_pool_remap */ 
+/*---------------------------------------------------------------------------*/ +void xio_tasks_pool_remap(struct xio_tasks_pool *q, void *new_context) +{ + struct xio_tasks_slab *pslab, *next_pslab; + unsigned int i; + + if (!q) + return; + + list_for_each_entry_safe(pslab, next_pslab, &q->slabs_list, + slabs_list_entry) { + if (q->params.pool_hooks.slab_post_create) + q->params.pool_hooks.slab_post_create( + new_context, + q->dd_data, + pslab->dd_data); + + if (q->params.pool_hooks.slab_remap_task) { + for (i = 0; i < pslab->nr; i++) + q->params.pool_hooks.slab_remap_task( + q->params.pool_hooks.context, + new_context, + q->dd_data, + pslab->dd_data, + pslab->array[i]); + } + } + q->params.pool_hooks.context = new_context; +} + +/*---------------------------------------------------------------------------*/ +/* xio_tasks_pool_dump_used */ +/*---------------------------------------------------------------------------*/ +void xio_tasks_pool_dump_used(struct xio_tasks_pool *q) +{ + struct xio_tasks_slab *pslab; + unsigned int i; + char *pool_name; + + list_for_each_entry(pslab, &q->slabs_list, slabs_list_entry) { + for (i = 0; i < pslab->nr; i++) + if (pslab->array[i]->tlv_type != 0xdead) { + pool_name = q->params.pool_name ? + q->params.pool_name : "unknown"; + ERROR_LOG("pool_name:%s: in use: task:%p, " \ + "type:0x%x\n", + pool_name, + pslab->array[i], + pslab->array[i]->tlv_type); + } + } +} + diff --git a/open_src/xio/src/usr/xio/xio_timers_list.h b/open_src/xio/src/usr/xio/xio_timers_list.h new file mode 100644 index 0000000..ebbab4a --- /dev/null +++ b/open_src/xio/src/usr/xio/xio_timers_list.h @@ -0,0 +1,311 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef XIO_TIMERS_LIST_H +#define XIO_TIMERS_LIST_H + +#define XIO_MS_IN_SEC 1000ULL +#define XIO_US_IN_SEC 1000000ULL +#define XIO_NS_IN_SEC 1000000000ULL +#define XIO_US_IN_MSEC 1000ULL +#define XIO_NS_IN_MSEC 1000000ULL +#define XIO_NS_IN_USEC 1000ULL +#define SAFE_LIST + +#define xio_timer_handle_t void * + +struct xio_timers_list { + struct list_head timers_head; +#ifdef SAFE_LIST + spinlock_t lock; /* timer list lock */ + int pad; +#endif +}; + +enum timers_list_rc { + TIMERS_LIST_RC_ERROR = -1, + TIMERS_LIST_RC_OK = 0, + TIMERS_LIST_RC_EMPTY = 1, + TIMERS_LIST_RC_BECAME_FIRST_ENTRY = 2, + TIMERS_LIST_RC_NOT_EMPTY = 3, +}; + +static inline void xio_timers_list_lock(struct xio_timers_list *timers_list) +{ +#ifdef SAFE_LIST + spin_lock(&timers_list->lock); +#endif +} + +static inline void xio_timers_list_unlock(struct xio_timers_list *timers_list) +{ +#ifdef SAFE_LIST + spin_unlock(&timers_list->lock); +#endif +} + +/*---------------------------------------------------------------------------*/ +/* xio_timers_list_ns_current_get */ +/*---------------------------------------------------------------------------*/ +static inline uint64_t xio_timers_list_ns_current_get(void) +{ + uint64_t ns_monotonic; + struct timespec ts; + + xio_clock_gettime(&ts); + + ns_monotonic = (ts.tv_sec*XIO_NS_IN_SEC) + (uint64_t)ts.tv_nsec; + + return ns_monotonic; +} + +/*---------------------------------------------------------------------------*/ +/* xio_timers_list_init */ +/*---------------------------------------------------------------------------*/ +static inline void xio_timers_list_init(struct xio_timers_list *timers_list) +{ + INIT_LIST_HEAD(&timers_list->timers_head); +#ifdef SAFE_LIST + spin_lock_init(&timers_list->lock); +#endif +} + +/*---------------------------------------------------------------------------*/ +/* xio_timers_list_add */ +/*---------------------------------------------------------------------------*/ +static inline enum timers_list_rc xio_timers_list_add( + struct xio_timers_list *timers_list, + struct xio_timers_list_entry *tentry) +{ + struct list_head *timer_list; + struct xio_timers_list_entry *tentry_from_list; + int found = 0; + enum timers_list_rc retval = TIMERS_LIST_RC_OK; + + list_for_each(timer_list, &timers_list->timers_head) { + tentry_from_list = list_entry(timer_list, + struct xio_timers_list_entry, + entry); + + if (time_before64(tentry->expires, + tentry_from_list->expires)) { + list_add_tail(&tentry->entry, &tentry_from_list->entry); + found = 1; + break; /* for timer iteration */ + } + } + if (found == 0) + list_add_tail(&tentry->entry, &timers_list->timers_head); + + if (list_first_entry(&timers_list->timers_head, + struct xio_timers_list_entry, entry) == tentry) + retval = TIMERS_LIST_RC_BECAME_FIRST_ENTRY; + + return retval; +} + +/*---------------------------------------------------------------------------*/ +/* xio_timers_list_add_duration */ 
+/*---------------------------------------------------------------------------*/ +static inline enum timers_list_rc xio_timers_list_add_duration( + struct xio_timers_list *timers_list, + uint64_t ns_duration, + struct xio_timers_list_entry *tentry) +{ + tentry->expires = + (xio_timers_list_ns_current_get() + ns_duration); + + return xio_timers_list_add(timers_list, tentry); +} + +/*---------------------------------------------------------------------------*/ +/* xio_timers_list_del */ +/*---------------------------------------------------------------------------*/ +static inline enum timers_list_rc xio_timers_list_del( + struct xio_timers_list *timers_list, + struct xio_timers_list_entry *tentry) +{ + enum timers_list_rc retval = TIMERS_LIST_RC_OK; + + if (list_empty(&timers_list->timers_head)) { + retval = TIMERS_LIST_RC_EMPTY; + goto unlock; + } + + list_del_init(&tentry->entry); + + if (list_empty(&timers_list->timers_head)) + retval = TIMERS_LIST_RC_EMPTY; + else + retval = TIMERS_LIST_RC_NOT_EMPTY; +unlock: + return retval; +} + +/*---------------------------------------------------------------------------*/ +/* xio_timers_list_close */ +/*---------------------------------------------------------------------------*/ +static inline void xio_timers_list_close(struct xio_timers_list *timers_list) +{ + struct xio_timers_list_entry *tentry; + + xio_timers_list_lock(timers_list); + while (!list_empty(&timers_list->timers_head)) { + tentry = list_first_entry( + &timers_list->timers_head, + struct xio_timers_list_entry, entry); + list_del_init(&tentry->entry); + } + xio_timers_list_unlock(timers_list); +} + +/*---------------------------------------------------------------------------*/ +/* xio_timers_list_expires */ +/*---------------------------------------------------------------------------*/ +static inline uint64_t xio_timers_list_expires( + struct xio_timers_list *timers_list, + struct xio_timers_list_entry *tentry) +{ + return tentry->expires; +} + +/*---------------------------------------------------------------------------*/ +/* xio_timers_list_pre_dispatch */ +/*---------------------------------------------------------------------------*/ +static inline void xio_timers_list_pre_dispatch( + struct xio_timers_list *timers_list, + struct xio_timers_list_entry *tentry) +{ + list_del_init(&tentry->entry); +} + +/*---------------------------------------------------------------------------*/ +/* xio_timers_list_post_dispatch */ +/*---------------------------------------------------------------------------*/ +static inline void xio_timers_list_post_dispatch( + struct xio_timers_list *timers_list, + struct xio_timers_list_entry *tentry) +{ +} + +/* + * returns the number of msec until the next timer will expire for + * use with epoll + */ +/*---------------------------------------------------------------------------*/ +/* xio_timers_list_ns_duration_to_expire */ +/*---------------------------------------------------------------------------*/ +static inline int64_t xio_timerlist_ns_duration_to_expire( + struct xio_timers_list *timers_list) +{ + struct xio_timers_list_entry *tentry; + int64_t current_time; + int64_t ns_duration_to_expire; + + /* + * empty list, no expire + */ + if (list_empty(&timers_list->timers_head)) + return -1; + + tentry = list_first_entry( + &timers_list->timers_head, + struct xio_timers_list_entry, entry); + + current_time = xio_timers_list_ns_current_get(); + + /* + * timer at head of list is expired, zero ns required + */ + if (time_after64(current_time, 
tentry->expires)) + return 0; + + ns_duration_to_expire = (tentry->expires - current_time); + + return ns_duration_to_expire; +} + +/* + * Expires any timers that should be expired + */ +/*---------------------------------------------------------------------------*/ +/* xio_timers_list_expire */ +/*---------------------------------------------------------------------------*/ +static inline void xio_timers_list_expire(struct xio_timers_list *timers_list) +{ + struct xio_timers_list_entry *tentry; + uint64_t current_time; + xio_delayed_work_handle_t *dwork; + xio_work_handle_t *work; + + xio_timers_list_lock(timers_list); + while (!list_empty(&timers_list->timers_head)) { + tentry = list_first_entry(&timers_list->timers_head, + struct xio_timers_list_entry, entry); + + current_time = xio_timers_list_ns_current_get(); + + if (time_before_eq64(tentry->expires, current_time)) { + xio_timers_list_pre_dispatch(timers_list, + tentry); + + xio_timers_list_unlock(timers_list); + dwork = container_of(tentry, + xio_delayed_work_handle_t, + timer); + work = &dwork->work; + work->flags &= ~XIO_WORK_PENDING; + + work->function(work->data); + + xio_timers_list_post_dispatch(timers_list, + tentry); + xio_timers_list_lock(timers_list); + } else { + break; /* for timer iteration */ + } + } + xio_timers_list_unlock(timers_list); +} + +static inline int xio_timers_list_is_empty(struct xio_timers_list *timers_list) +{ + return list_empty(&timers_list->timers_head); +} + +#endif /* XIO_TIMERS_LIST_H */ + diff --git a/open_src/xio/src/usr/xio/xio_tls.c b/open_src/xio/src/usr/xio/xio_tls.c new file mode 100644 index 0000000..4bdf813 --- /dev/null +++ b/open_src/xio/src/usr/xio/xio_tls.c @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#include +#include +#include "libxio.h" +#include "xio_tls.h" +#include "xio_common.h" + +/*---------------------------------------------------------------------------*/ +/* global tls */ +/*---------------------------------------------------------------------------*/ +static xio_tls int _xio_errno; + +/*---------------------------------------------------------------------------*/ +/* xio_thread_data_destruct */ +/*---------------------------------------------------------------------------*/ +void xio_thread_data_destruct(void) +{ +} + +/*---------------------------------------------------------------------------*/ +/* xio_thread_data_construct */ +/*---------------------------------------------------------------------------*/ +void xio_thread_data_construct(void) +{ +} + +/*---------------------------------------------------------------------------*/ +/* debugging facilities */ +/*---------------------------------------------------------------------------*/ +void xio_set_error(int errnum) { _xio_errno = errnum; } +EXPORT_SYMBOL(xio_set_error); + +/*---------------------------------------------------------------------------*/ +/* xio_errno */ +/*---------------------------------------------------------------------------*/ +int xio_errno(void) { return _xio_errno; } +EXPORT_SYMBOL(xio_errno); + diff --git a/open_src/xio/src/usr/xio/xio_tls.h b/open_src/xio/src/usr/xio/xio_tls.h new file mode 100644 index 0000000..df2b14c --- /dev/null +++ b/open_src/xio/src/usr/xio/xio_tls.h @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef XIO_TLS_H +#define XIO_TLS_H + +/*---------------------------------------------------------------------------*/ +/* xio_thread_data_construct */ +/*---------------------------------------------------------------------------*/ +void xio_thread_data_construct(void); +void xio_thread_data_destruct(void); + +#endif /* XIO_TLS_H */ + diff --git a/open_src/xio/src/usr/xio/xio_usr_utils.c b/open_src/xio/src/usr/xio/xio_usr_utils.c new file mode 100644 index 0000000..620afdc --- /dev/null +++ b/open_src/xio/src/usr/xio/xio_usr_utils.c @@ -0,0 +1,360 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ +#include +#include "libxio.h" +#include "xio_log.h" +#include "xio_common.h" +#include "xio_protocol.h" +#include "xio_sg_table.h" +#include "xio_observer.h" +#include "xio_usr_transport.h" + +static int xio_get_addr(char *dst, char *port, struct sockaddr *addr) +{ + struct addrinfo *res; + int ret; + + if (!dst) { + struct addrinfo hints; + memset(&hints, 0, sizeof(struct addrinfo)); + hints.ai_family = AF_UNSPEC; /* Allow IPv4 or IPv6 */ + hints.ai_flags = AI_PASSIVE; + ret = getaddrinfo(dst, port, &hints, &res); + } else { + ret = getaddrinfo(dst, NULL, NULL, &res); + } + if (ret) { + ERROR_LOG("getaddrinfo failed. %s\n", gai_strerror(ret)); + return ret; + } + + if (res->ai_family == PF_INET) + memcpy(addr, res->ai_addr, sizeof(struct sockaddr_in)); + else if (res->ai_family == PF_INET6) + memcpy(addr, res->ai_addr, sizeof(struct sockaddr_in6)); + else + ret = -1; + + freeaddrinfo(res); + return ret; +} + +/*---------------------------------------------------------------------------*/ +/* xio_host_port_to_ss */ +/*---------------------------------------------------------------------------*/ +int xio_host_port_to_ss(const char *buf, struct sockaddr_storage *ss) +{ + char *cp = (char *)buf; + char *tp; + int len; + char host[NI_MAXHOST]; + char port[NI_MAXSERV]; + int s = 0; + socklen_t ss_len = -1; + + /* + * [host]:port, [host]:, [host]. + * [ipv6addr]:port, [ipv6addr]:, [ipv6addr]. + */ + if (*cp == '[') { + ++cp; + tp = strchr(cp, ']'); + if (!tp) + return -1; + len = tp - cp; + strncpy(host, cp, len); + host[len] = 0; + tp++; + if (*tp == 0) { + strcpy(port, "0"); + } else if (*tp == ':') { + tp++; + if (*tp) + strcpy(port, tp); + else + strcpy(port, "0"); + } else { + strcpy(port, "0"); + } + } else { + /* + * host:port, host:, host, :port. 
+ */ + if (*cp == ':') { + strcpy(host, "0.0.0.0"); + cp++; + if (*cp) + strcpy(port, cp); + else + strcpy(port, "0"); + } else { + tp = strrchr(cp, ':'); + if (!tp) { + strcpy(host, cp); + strcpy(port, "0"); + } else { + len = tp - cp; + strncpy(host, cp, len); + host[len] = 0; + tp++; + if (*tp == 0) + strcpy(port, "0"); + else + strcpy(port, tp); + } + } + } + + /*printf("host:%s, port:%s\n", host, port); */ + + ss->ss_family = PF_INET; + + if (host[0] == '*' || host[0] == 0) + s = xio_get_addr(NULL, port, (struct sockaddr *)ss); + else + s = xio_get_addr(host, NULL, (struct sockaddr *)ss); + + if (s != 0) { + ERROR_LOG("unresolved address\n"); + return -1; + } + switch (ss->ss_family) { + case AF_INET: + ss_len = sizeof(struct sockaddr_in); + ((struct sockaddr_in *)ss)->sin_port = htons(atoi(port)); + break; + case AF_INET6: + ss_len = sizeof(struct sockaddr_in6); + ((struct sockaddr_in6 *)ss)->sin6_port = htons(atoi(port)); + break; + } + return ss_len; +} +EXPORT_SYMBOL(xio_host_port_to_ss); + +/*---------------------------------------------------------------------------*/ +/* xio_uri_to_ss */ +/*---------------------------------------------------------------------------*/ +int xio_uri_to_ss(const char *uri, struct sockaddr_storage *ss) +{ + const char *start; + char host[NI_MAXHOST]; + char port[NI_MAXSERV]; + const char *p1, *p2; + int s = 0; + int len; + socklen_t ss_len = -1; + + /* only supported protocol is rdma */ + start = strstr(uri, "://"); + if (!start) + return -1; + + if (*(start + 3) == '[') { /* IPv6 */ + p1 = strstr(start + 3, "]:"); + if (!p1) + return -1; + + len = p1 - (start + 4); + strncpy(host, (start + 4), len); + host[len] = 0; + + p2 = strchr(p1 + 2, '/'); + if (!p2) { + strcpy(port, p1 + 2); + } else { + len = (p2 - 1) - (p1 + 2); + strncpy(port, (p1 + 2), len); + port[len] = 0; + } + } else { + /* extract the resource */ + p1 = uri + strlen(uri); + p2 = NULL; + while (p1 != (start + 3)) { + if (*p1 == '/') + p2 = p1; + p1--; + if (p1 == uri) + return -1; + } + + if (!p2) { /* no resource */ + p1 = strrchr(uri, ':'); + if (!p1 || p1 == start) + return -1; + strcpy(port, (p1 + 1)); + } else { + if (*p2 != '/') + return -1; + p1 = p2; + while (*p1 != ':') { + p1--; + if (p1 == uri) + return -1; + } + + len = p2 - (p1 + 1); + + strncpy(port, p1 + 1, len); + port[len] = 0; + } + len = p1 - (start + 3); + + /* extract the address */ + strncpy(host, (start + 3), len); + host[len] = 0; + } + /*printf("host:%s port:%s\n", host, port); */ + + ss->ss_family = PF_INET; + + if (host[0] == '*' || host[0] == 0) + s = xio_get_addr(NULL, port, (struct sockaddr *)ss); + else + s = xio_get_addr(host, NULL, (struct sockaddr *)ss); + + if (s != 0) { + ERROR_LOG("unresolved address\n"); + return -1; + } + switch (ss->ss_family) { + case AF_INET: + ss_len = sizeof(struct sockaddr_in); + ((struct sockaddr_in *)ss)->sin_port = htons(atoi(port)); + break; + case AF_INET6: + ss_len = sizeof(struct sockaddr_in6); + ((struct sockaddr_in6 *)ss)->sin6_port = htons(atoi(port)); + break; + } +#if 0 + { + char buffer[NI_MAXHOST]; + const char *result = + inet_ntop(AF_INET, + &((struct sockaddr_in *)ss)->sin_addr, + buffer, sizeof(buffer)); + + printf("host:%s port:%d\n", result, + ntohs(((struct sockaddr_in *)ss)->sin_port)); + } +#endif + return ss_len; +} +EXPORT_SYMBOL(xio_uri_to_ss); + +/*---------------------------------------------------------------------------*/ +/* xio_msg_dump */ +/*---------------------------------------------------------------------------*/ +void 
xio_msg_dump(struct xio_msg *xio_msg) +{ + struct xio_sg_table_ops *sgtbl_ops; + struct xio_mr *mr; + void *sgtbl; + void *sge; + unsigned int i; + + ERROR_LOG("********************************************************\n"); + ERROR_LOG("type:0x%x\n", xio_msg->type); + if (xio_msg->type == XIO_MSG_TYPE_REQ || + xio_msg->type == XIO_ONE_WAY_REQ) + ERROR_LOG("serial number:%lld\n", xio_msg->sn); + else if (xio_msg->type == XIO_MSG_TYPE_RSP) + ERROR_LOG("response:%p, serial number:%lld\n", + xio_msg->request, + ((xio_msg->request) ? + xio_msg->request->sn : (uint64_t)-1)); + + sgtbl = xio_sg_table_get(&xio_msg->in); + sgtbl_ops = (struct xio_sg_table_ops *) + xio_sg_table_ops_get(xio_msg->in.sgl_type); + + ERROR_LOG("in header: length:%zd, address:%p\n", + xio_msg->in.header.iov_len, xio_msg->in.header.iov_base); + ERROR_LOG("in sgl type:%d max_nents:%d\n", xio_msg->in.sgl_type, + tbl_max_nents(sgtbl_ops, sgtbl)); + ERROR_LOG("in data size:%zd\n", + tbl_nents(sgtbl_ops, sgtbl)); + + for_each_sge(sgtbl, sgtbl_ops, sge, i) { + mr = (struct xio_mr *)sge_mr(sgtbl_ops, sge); + if (mr) + ERROR_LOG("in data[%d]: length:%zd, " \ + "address:%p, mr:%p " \ + "- [addr:%p, len:%d]\n", i, + sge_length(sgtbl_ops, sge), + sge_addr(sgtbl_ops, sge), + mr, mr->addr, mr->length); + else + ERROR_LOG("in data[%d]: length:%zd, " \ + "address:%p, mr:%p\n", i, + sge_length(sgtbl_ops, sge), + sge_addr(sgtbl_ops, sge), mr); + } + + sgtbl = xio_sg_table_get(&xio_msg->out); + sgtbl_ops = (struct xio_sg_table_ops *) + xio_sg_table_ops_get(xio_msg->out.sgl_type); + + ERROR_LOG("out header: length:%zd, address:%p\n", + xio_msg->out.header.iov_len, xio_msg->out.header.iov_base); + ERROR_LOG("out sgl type:%d max_nents:%d\n", xio_msg->out.sgl_type, + tbl_max_nents(sgtbl_ops, sgtbl)); + ERROR_LOG("out data size:%zd\n", tbl_nents(sgtbl_ops, sgtbl)); + + for_each_sge(sgtbl, sgtbl_ops, sge, i) { + mr = (struct xio_mr *)sge_mr(sgtbl_ops, sge); + if (mr) + ERROR_LOG("out data[%d]: length:%zd, " \ + "address:%p, mr:%p " \ + "- [addr:%p, len:%d]\n", i, + sge_length(sgtbl_ops, sge), + sge_addr(sgtbl_ops, sge), + mr, mr->addr, mr->length); + else + ERROR_LOG("out data[%d]: length:%zd, " \ + "address:%p, mr:%p\n", + i, + sge_length(sgtbl_ops, sge), + sge_addr(sgtbl_ops, sge), mr); + } + ERROR_LOG("*******************************************************\n"); +} +EXPORT_SYMBOL(xio_msg_dump); + diff --git a/open_src/xio/src/usr/xio/xio_usr_utils.h b/open_src/xio/src/usr/xio/xio_usr_utils.h new file mode 100644 index 0000000..f0a973c --- /dev/null +++ b/open_src/xio/src/usr/xio/xio_usr_utils.h @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef XIO_USR_UTILS_H +#define XIO_USR_UTILS_H + +#endif /* XIO_USR_UTILS_H */ diff --git a/open_src/xio/src/usr/xio/xio_workqueue.c b/open_src/xio/src/usr/xio/xio_workqueue.c new file mode 100644 index 0000000..b157380 --- /dev/null +++ b/open_src/xio/src/usr/xio/xio_workqueue.c @@ -0,0 +1,489 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ +#include +#include +#include + +#include "xio_log.h" +#include "xio_common.h" +#include "xio_observer.h" +#include "xio_ev_data.h" +#include "xio_objpool.h" +#include "xio_workqueue.h" +#include "xio_timers_list.h" +#include "xio_context.h" + +#define NSEC_PER_SEC 1000000000L +#define MAX_DELETED_WORKS 1024 + +enum xio_workqueue_flags { + XIO_WORKQUEUE_IN_POLL = 1 << 0, + XIO_WORKQUEUE_TIMER_ARMED = 1 << 1 +}; + +struct xio_workqueue { + struct xio_context *ctx; + struct xio_timers_list timers_list; + int timer_fd; + socket_t pipe_fd[2]; + + volatile uint32_t flags; + uint64_t deleted_works[MAX_DELETED_WORKS]; + uint32_t deleted_works_nr; + uint32_t pad; +}; + +/** + * set_normalized_timespec - set timespec sec and nsec parts and + * normalize + * + * @ts: pointer to timespec variable to be set + * @sec: seconds to set + * @nsec: nanoseconds to set + * + * Set seconds and nanoseconds field of a timespec variable and + * normalize to the timespec storage format + * + * Note: The tv_nsec part is always in the range of + * 0 <= tv_nsec < NSEC_PER_SEC + * For negative values only the tv_sec field is negative ! + */ +static void set_normalized_timespec(struct timespec *ts, + time_t sec, int64_t nsec) +{ + while (nsec >= (int64_t)XIO_NS_IN_SEC) { + nsec -= XIO_NS_IN_SEC; + ++sec; + } + while (nsec < 0) { + nsec += XIO_NS_IN_SEC; + --sec; + } + ts->tv_sec = sec; + ts->tv_nsec = (long)nsec; +} + +/*---------------------------------------------------------------------------*/ +/* xio_workqueue_rearm */ +/*---------------------------------------------------------------------------*/ +static int xio_workqueue_rearm(struct xio_workqueue *work_queue) +{ + struct itimerspec new_t = { {0, 0}, {0, 0} }; + int err; + int64_t ns_to_expire; + + if (work_queue->flags & XIO_WORKQUEUE_IN_POLL) + return 0; + if (xio_timers_list_is_empty(&work_queue->timers_list)) + return 0; + + ns_to_expire = + xio_timerlist_ns_duration_to_expire( + &work_queue->timers_list); + + if (ns_to_expire == -1) + return 0; + + if (ns_to_expire < 1) { + new_t.it_value.tv_nsec = 1; + } else { + set_normalized_timespec(&new_t.it_value, + 0, ns_to_expire); + } + /* rearm the timer */ + err = xio_timerfd_settime(work_queue->timer_fd, 0, &new_t, NULL); + if (unlikely(err < 0)) { + ERROR_LOG("timerfd_settime failed. %m\n"); + return -1; + } + work_queue->flags |= XIO_WORKQUEUE_TIMER_ARMED; + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_workqueue_disarm */ +/*---------------------------------------------------------------------------*/ +static void xio_workqueue_disarm(struct xio_workqueue *work_queue) +{ + struct itimerspec new_t = { {0, 0}, {0, 0} }; + int err; + + if (!(work_queue->flags & XIO_WORKQUEUE_TIMER_ARMED)) + return; + + err = xio_timerfd_settime(work_queue->timer_fd, 0, &new_t, NULL); + if (unlikely(err < 0)) + ERROR_LOG("timerfd_settime failed. 
%m\n"); + + work_queue->flags &= ~XIO_WORKQUEUE_TIMER_ARMED; +} + +/*---------------------------------------------------------------------------*/ +/* xio_delayed_action_handler */ +/*---------------------------------------------------------------------------*/ +static void xio_delayed_action_handler(int fd, int events, void *user_context) +{ + struct xio_workqueue *work_queue = (struct xio_workqueue *)user_context; + int64_t exp; + ssize_t s; + + /* consume the timer data in fd */ + s = xio_read(work_queue->timer_fd, &exp, sizeof(exp)); + if (s < 0) { + if (xio_get_last_socket_error() != XIO_EAGAIN) + ERROR_LOG("failed to read from timerfd, %m\n"); + return; + } + if (s != sizeof(uint64_t)) { + ERROR_LOG("failed to read from timerfd, %m\n"); + return; + } + + work_queue->flags |= XIO_WORKQUEUE_IN_POLL; + xio_timers_list_expire(&work_queue->timers_list); + xio_timers_list_lock(&work_queue->timers_list); + work_queue->flags &= ~XIO_WORKQUEUE_IN_POLL; + xio_workqueue_rearm(work_queue); + xio_timers_list_unlock(&work_queue->timers_list); +} + +/*---------------------------------------------------------------------------*/ +/* xio_work_action_handler */ +/*---------------------------------------------------------------------------*/ +static void xio_work_action_handler(int fd, int events, void *user_context) +{ + struct xio_workqueue *work_queue = (struct xio_workqueue *)user_context; + uint64_t exp; + ssize_t s; + xio_work_handle_t *work; + unsigned int i, found = 0; + + /* drain the pipe data */ + while (1) { + s = xio_read(work_queue->pipe_fd[0], &exp, sizeof(exp)); + if (s < 0) { + if (xio_get_last_socket_error() != XIO_EAGAIN) + ERROR_LOG("failed to read from pipe, %m\n"); + work_queue->deleted_works_nr = 0; + return; + } + if (s != sizeof(uint64_t)) { + ERROR_LOG("failed to read from pipe, %m\n"); + return; + } + work = (xio_work_handle_t *)ptr_from_int64(exp); + if (!work) { + ERROR_LOG("null work\n"); + return; + } + + /* scan for deleted work the may be inside the pipe */ + for (i = 0; i < work_queue->deleted_works_nr; i++) { + if (work_queue->deleted_works[i] == exp) { + found = 1; + break; + } + } + if (found) { + found = 0; + continue; + } + + if (test_bits(XIO_WORK_PENDING, &work->flags)) { + clr_bits(XIO_WORK_PENDING, &work->flags); + + set_bits(XIO_WORK_IN_HANDLER, &work->flags); + work->function(work->data); + clr_bits(XIO_WORK_IN_HANDLER, &work->flags); + if (work->destructor) + work->destructor(work->destructor_data); + } + } +} + +/*---------------------------------------------------------------------------*/ +/* xio_workqueue_create */ +/*---------------------------------------------------------------------------*/ +struct xio_workqueue *xio_workqueue_create(struct xio_context *ctx) +{ + struct xio_workqueue *work_queue; + int retval; + + work_queue = (struct xio_workqueue *)ucalloc(1, sizeof(*work_queue)); + if (!work_queue) { + ERROR_LOG("ucalloc failed. %m\n"); + return NULL; + } + + xio_timers_list_init(&work_queue->timers_list); + work_queue->ctx = ctx; + + work_queue->timer_fd = xio_timerfd_create(); + if (work_queue->timer_fd < 0) { + ERROR_LOG("timerfd_create failed. %m\n"); + goto exit; + } + + retval = xio_pipe(work_queue->pipe_fd, 0); + + if (retval < 0) { + ERROR_LOG("pipe failed. %m\n"); + goto exit1; + } + + /* add to epoll */ + retval = xio_context_add_ev_handler( + ctx, + work_queue->timer_fd, + XIO_POLLIN, + xio_delayed_action_handler, + work_queue); + if (retval) { + ERROR_LOG("ev_loop_add_cb failed. 
%m\n"); + goto exit2; + } + + /* add to epoll */ + retval = xio_context_add_ev_handler( + ctx, + work_queue->pipe_fd[0], + XIO_POLLIN, + xio_work_action_handler, + work_queue); + if (retval) { + ERROR_LOG("ev_loop_add_cb failed. %m\n"); + goto exit2; + } + + return work_queue; + +exit2: + xio_closesocket(work_queue->pipe_fd[0]); + xio_closesocket(work_queue->pipe_fd[1]); +exit1: + xio_closesocket(work_queue->timer_fd); +exit: + ufree(work_queue); + return NULL; +} + +/*---------------------------------------------------------------------------*/ +/* xio_workqueue_destroy */ +/*---------------------------------------------------------------------------*/ +int xio_workqueue_destroy(struct xio_workqueue *work_queue) +{ + int retval; + + xio_workqueue_disarm(work_queue); + + retval = xio_context_del_ev_handler( + work_queue->ctx, + work_queue->timer_fd); + if (retval) + ERROR_LOG("ev_loop_del_cb failed. %m\n"); + + retval = xio_context_del_ev_handler( + work_queue->ctx, + work_queue->pipe_fd[0]); + if (retval) + ERROR_LOG("ev_loop_del_cb failed. %m\n"); + + xio_timers_list_close(&work_queue->timers_list); + + xio_closesocket(work_queue->pipe_fd[0]); + xio_closesocket(work_queue->pipe_fd[1]); + xio_closesocket(work_queue->timer_fd); + ufree(work_queue); + + return retval; +} + +/*---------------------------------------------------------------------------*/ +/* xio_workqueue_add_delayed_work */ +/*---------------------------------------------------------------------------*/ +int xio_workqueue_add_delayed_work(struct xio_workqueue *work_queue, + int msec_duration, void *data, + void (*function)(void *data), + xio_delayed_work_handle_t *dwork) +{ + int retval = 0; + enum timers_list_rc rc; + xio_work_handle_t *work = &dwork->work; + + if (xio_is_delayed_work_pending(dwork)) { + ERROR_LOG("work already pending\n"); + xio_set_error(EEXIST); + return -1; + } + + xio_timers_list_lock(&work_queue->timers_list); + + work->function = function; + work->data = data; + work->flags |= XIO_WORK_PENDING; + + rc = xio_timers_list_add_duration( + &work_queue->timers_list, + ((uint64_t)msec_duration) * 1000000ULL, + &dwork->timer); + if (rc == TIMERS_LIST_RC_ERROR) { + ERROR_LOG("adding to timer failed\n"); + retval = -1; + goto unlock; + } + + /* if the recently add timer is now the first in list, rearm */ + /* rearm the timer */ + retval = xio_workqueue_rearm(work_queue); + if (unlikely(retval)) + ERROR_LOG("xio_workqueue_rearm failed. %m\n"); + +unlock: + xio_timers_list_unlock(&work_queue->timers_list); + return retval; +} + +/*---------------------------------------------------------------------------*/ +/* xio_workqueue_del_delayed_work */ +/*---------------------------------------------------------------------------*/ +int xio_workqueue_del_delayed_work(struct xio_workqueue *work_queue, + xio_delayed_work_handle_t *dwork) +{ + int retval = 0; + enum timers_list_rc rc; + + if (!xio_is_delayed_work_pending(dwork)) { + ERROR_LOG("work not pending\n"); + xio_set_error(EEXIST); + return -1; + } + + /* stop the timer */ + xio_workqueue_disarm(work_queue); + + xio_timers_list_lock(&work_queue->timers_list); + + dwork->work.flags &= ~XIO_WORK_PENDING; + + rc = xio_timers_list_del(&work_queue->timers_list, &dwork->timer); + if (rc == TIMERS_LIST_RC_ERROR) { + ERROR_LOG("deleting work from queue failed. queue is empty\n"); + goto unlock; + } + /* rearm the timer */ + retval = xio_workqueue_rearm(work_queue); + if (unlikely(retval)) + ERROR_LOG("xio_workqueue_rearm failed. 
%m\n"); +unlock: + xio_timers_list_unlock(&work_queue->timers_list); + return retval; +} + +/*---------------------------------------------------------------------------*/ +/* xio_workqueue_add_work */ +/*---------------------------------------------------------------------------*/ +int xio_workqueue_add_work(struct xio_workqueue *work_queue, + void *data, + void (*function)(void *data), + xio_work_handle_t *work) +{ + uint64_t exp = uint64_from_ptr(work); + int s; + + work->function = function; + work->data = data; + work->flags |= XIO_WORK_PENDING; + + s = xio_write(work_queue->pipe_fd[1], &exp, sizeof(exp)); + if (s < 0) { + ERROR_LOG("failed to write to pipe, %m\n"); + return -1; + } + if (s != sizeof(exp)) { + ERROR_LOG("failed to write to pipe, %m\n"); + return -1; + } + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_workqueue_del_work */ +/*---------------------------------------------------------------------------*/ +int xio_workqueue_del_work(struct xio_workqueue *work_queue, + xio_work_handle_t *work) +{ + if (work->flags & XIO_WORK_PENDING) { + work->flags &= ~XIO_WORK_PENDING; + if (work_queue->deleted_works_nr < MAX_DELETED_WORKS) { + work_queue->deleted_works[ + work_queue->deleted_works_nr] = + uint64_from_ptr(work); + work_queue->deleted_works_nr++; + } else { + ERROR_LOG("failed to delete work\n"); + } + + return 0; + } + return -1; +} + +/*---------------------------------------------------------------------------*/ +/* xio_workqueue_set_work_destructor */ +/*---------------------------------------------------------------------------*/ +int xio_workqueue_set_work_destructor(struct xio_workqueue *work_queue, + void *data, + void (*destructor)(void *data), + xio_work_handle_t *work) +{ + work->destructor = destructor; + work->destructor_data = data; + + return 0; +} + +/*---------------------------------------------------------------------------*/ +/* xio_workqueue_is_work_in_hanlder */ +/*---------------------------------------------------------------------------*/ +int xio_workqueue_is_work_in_handler(struct xio_workqueue *work_queue, + xio_work_handle_t *work) +{ + return test_bits(XIO_WORK_IN_HANDLER, &work->flags); +} + diff --git a/open_src/xio/src/usr/xio/xio_workqueue_priv.h b/open_src/xio/src/usr/xio/xio_workqueue_priv.h new file mode 100644 index 0000000..0b8285f --- /dev/null +++ b/open_src/xio/src/usr/xio/xio_workqueue_priv.h @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2013 Mellanox Technologies®. All rights reserved. + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License (GPL) Version 2, available from the file COPYING in the main + * directory of this source tree, or the Mellanox Technologies® BSD license + * below: + * + * - Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * - Neither the name of the Mellanox Technologies® nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef XIO_WORKQUEUE_PRIV_H +#define XIO_WORKQUEUE_PRIV_H + +enum xio_work_flags { + XIO_WORK_PENDING = 1 << 0, + XIO_WORK_IN_HANDLER = 1 << 1 +}; + +struct xio_timers_list_entry { + struct list_head entry; + uint64_t expires; +}; + +typedef struct xio_work_struct { + void (*function)(void *data); + void *data; + + void (*destructor)(void *data); + void *destructor_data; + + volatile uint32_t flags; + uint32_t pad; +} xio_work_handle_t; + +typedef struct xio_delayed_work_struct { + struct xio_work_struct work; + struct xio_timers_list_entry timer; +} xio_delayed_work_handle_t; + +/*---------------------------------------------------------------------------*/ +/* xio_is_work_pending */ +/*---------------------------------------------------------------------------*/ +static inline int xio_is_work_pending(xio_work_handle_t *work) +{ + return work->flags & XIO_WORK_PENDING; +} + +/*---------------------------------------------------------------------------*/ +/* xio_is_delayed_work_pending */ +/*---------------------------------------------------------------------------*/ +static inline int xio_is_delayed_work_pending(xio_delayed_work_handle_t *dwork) +{ + return dwork->work.flags & XIO_WORK_PENDING; +} + +#endif /* XIO_WORKQUEUE_PRIV_H */ diff --git a/open_src/xio/version.c b/open_src/xio/version.c new file mode 100644 index 0000000..8f1c7e0 --- /dev/null +++ b/open_src/xio/version.c @@ -0,0 +1 @@ +const char XIO_VERSION_STRING[] = "xio_1.7.0_release"; \ No newline at end of file diff --git a/src/common/base_log.h b/src/common/base_log.h new file mode 100644 index 0000000..6d9b393 --- /dev/null +++ b/src/common/base_log.h @@ -0,0 +1,95 @@ +/* + * Copyright(C) 2020 Ruijie Network. All rights reserved. + */ + +/*! +* \file xxx.x +* \brief xxx +* +* 包含.. +* +* \copyright 2020 Ruijie Network. All rights reserved. +* \author hongchunhua@ruijie.com.cn +* \version v1.0.0 +* \date 2020.08.05 +* \note none +*/ + +#ifndef _BASE_LOG_H_ +#define _BASE_LOG_H_ + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#define BASE_ERROR -1 +#define BASE_SUCCESS 0 + +#define BASE_DEBUG_ON +#define BASE_LOG_ERROR(format, arg...) fprintf(stdout, "[ERROR]File:|%s|%d---"format"\n", __FUNCTION__, __LINE__, ##arg) +#define BASE_LOG_NOTICE(format, arg...) fprintf(stdout, "[NOTICE]File:|%s|%d---"format"\n",__FUNCTION__, __LINE__, ##arg) + +#ifdef BASE_DEBUG_ON +#define BASE_LOG_DEBUG(format, arg...) 
fprintf(stdout, "[DEBUG]File:|%s|%d---"format"\n",__FUNCTION__, __LINE__, ##arg) +#else +#define BASE_LOG_DEBUG(format, arg...) +#endif + +#define unlikely(x) __builtin_expect(!!(x), 0) + +#define LOG_THEN_GOTO_TAG_IF_VAL_TRUE(val, tag, format, arg...) \ +do{\ + if(unlikely((val))){\ + BASE_LOG_ERROR(format,##arg);\ + goto tag;\ + }\ +}while(0); + +#define LOG_DEBUG_GOTO_TAG_IF_VAL_TRUE(val, tag, format, arg...) \ +do{\ + if(unlikely((val))){\ + BASE_LOG_DEBUG(format,##arg);\ + goto tag;\ + }\ +}while(0); + +#define LOG_THEN_RETURN_IF_VAL_TRUE(val, format, arg...) \ +do{\ + if(unlikely((val))){\ + BASE_LOG_ERROR(format,##arg);\ + return;\ + }\ +}while(0); + +#define LOG_THEN_RETURN_VAL_IF_TRUE(val, ret, format, arg...)\ +do{\ + if(unlikely((val))){\ + BASE_LOG_ERROR(format, ##arg);\ + return ret;\ + }\ +}while(0); + +#define BASE_ASSERT(condition, format, arg...) \ +do{\ + if(unlikely((condition))){\ + BASE_LOG_ERROR(format, ##arg);\ + assert(!condition);\ + }\ +}while(0); + +#define LOG_ERROR_IF_VAL_TRUE(val, format, arg...) \ +do{\ + if(unlikely((val))){\ + BASE_LOG_ERROR(format,##arg);\ + }\ +}while(0); + +#ifdef __cplusplus +} +#endif + +#endif /*XIO_API_H */ diff --git a/src/common/queue.h b/src/common/queue.h new file mode 100644 index 0000000..ff3540a --- /dev/null +++ b/src/common/queue.h @@ -0,0 +1,108 @@ +/* Copyright (c) 2013, Ben Noordhuis + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef QUEUE_H_ +#define QUEUE_H_ + +#include + +typedef void *QUEUE[2]; + +/* Private macros. */ +#define QUEUE_NEXT(q) (*(QUEUE **) &((*(q))[0])) +#define QUEUE_PREV(q) (*(QUEUE **) &((*(q))[1])) +#define QUEUE_PREV_NEXT(q) (QUEUE_NEXT(QUEUE_PREV(q))) +#define QUEUE_NEXT_PREV(q) (QUEUE_PREV(QUEUE_NEXT(q))) + +/* Public macros. */ +#define QUEUE_DATA(ptr, type, field) \ + ((type *) ((char *) (ptr) - offsetof(type, field))) + +/* Important note: mutating the list while QUEUE_FOREACH is + * iterating over its elements results in undefined behavior. 
+ */ +#define QUEUE_FOREACH(q, h) \ + for ((q) = QUEUE_NEXT(h); (q) != (h); (q) = QUEUE_NEXT(q)) + +#define QUEUE_EMPTY(q) \ + ((const QUEUE *) (q) == (const QUEUE *) QUEUE_NEXT(q)) + +#define QUEUE_HEAD(q) \ + (QUEUE_NEXT(q)) + +#define QUEUE_INIT(q) \ + do { \ + QUEUE_NEXT(q) = (q); \ + QUEUE_PREV(q) = (q); \ + } \ + while (0) + +#define QUEUE_ADD(h, n) \ + do { \ + QUEUE_PREV_NEXT(h) = QUEUE_NEXT(n); \ + QUEUE_NEXT_PREV(n) = QUEUE_PREV(h); \ + QUEUE_PREV(h) = QUEUE_PREV(n); \ + QUEUE_PREV_NEXT(h) = (h); \ + } \ + while (0) + +#define QUEUE_SPLIT(h, q, n) \ + do { \ + QUEUE_PREV(n) = QUEUE_PREV(h); \ + QUEUE_PREV_NEXT(n) = (n); \ + QUEUE_NEXT(n) = (q); \ + QUEUE_PREV(h) = QUEUE_PREV(q); \ + QUEUE_PREV_NEXT(h) = (h); \ + QUEUE_PREV(q) = (n); \ + } \ + while (0) + +#define QUEUE_MOVE(h, n) \ + do { \ + if (QUEUE_EMPTY(h)) \ + QUEUE_INIT(n); \ + else { \ + QUEUE* q = QUEUE_HEAD(h); \ + QUEUE_SPLIT(h, q, n); \ + } \ + } \ + while (0) + +#define QUEUE_INSERT_HEAD(h, q) \ + do { \ + QUEUE_NEXT(q) = QUEUE_NEXT(h); \ + QUEUE_PREV(q) = (h); \ + QUEUE_NEXT_PREV(q) = (q); \ + QUEUE_NEXT(h) = (q); \ + } \ + while (0) + +#define QUEUE_INSERT_TAIL(h, q) \ + do { \ + QUEUE_NEXT(q) = (h); \ + QUEUE_PREV(q) = QUEUE_PREV(h); \ + QUEUE_PREV_NEXT(q) = (q); \ + QUEUE_PREV(h) = (q); \ + } \ + while (0) + +#define QUEUE_REMOVE(q) \ + do { \ + QUEUE_PREV_NEXT(q) = QUEUE_NEXT(q); \ + QUEUE_NEXT_PREV(q) = QUEUE_PREV(q); \ + } \ + while (0) + +#endif /* QUEUE_H_ */ diff --git a/src/common/threadpool.c b/src/common/threadpool.c new file mode 100644 index 0000000..d5bfdf7 --- /dev/null +++ b/src/common/threadpool.c @@ -0,0 +1,372 @@ +/* + * Copyright(C) 2020 Ruijie Network. All rights reserved. + */ + +/*! +* \file xxx.x +* \brief xxx +* +* 包含.. +* +* \copyright 2020 Ruijie Network. All rights reserved. +* \author hongchunhua@ruijie.com.cn +* \version v1.0.0 +* \date 2020.08.05 +* \note none +*/ + +#include +#include +#include +#include +#include +#include +#include + +#include "base_log.h" +#include "threadpool.h" +#include "queue.h" + + +#define TP_LOG_ERROR(format, arg...) BASE_LOG_ERROR(format, ##arg) +#define TP_LOG_NOTICE(format, arg...) BASE_LOG_NOTICE(format, ##arg) +#define TP_LOG_DEBUG(format, arg...) BASE_LOG_DEBUG(format, ##arg) + +#define FLAG_TASK_ACTIVE 0 +#define FLAG_TASK_IDLE 1 +#define FLAG_TASK_EXIT 2 + +#define FLAG_WORK_INIT 0 +#define FLAG_WORK_RUN 1 +#define FLAG_WORK_DONE 2 +#define FLAG_WORK_WAIT 3 + +#define IS_SET(flag, tag) (flag&(1<mutex); + t_msg = &pool_ctx->thread[pool_ctx->idle_num]; + t_msg->work_id = pool_ctx->idle_num; + + TASK_SET_ACTIVE(t_msg->flag); + TP_LOG_DEBUG(" Eentry thread[%lu], init first.", t_msg->thread_id); + /* 线程同步*/ + sem_post(pool_ctx->sync); + for (;;) { + while (QUEUE_EMPTY(&pool_ctx->wait_to_run)) { + pool_ctx->idle_num++; + TASK_SET_IDLE(t_msg->flag); + pthread_cond_wait(&pool_ctx->cond, &pool_ctx->mutex); + TASK_CLR_IDLE(t_msg->flag); + pool_ctx->idle_num--; + if (IS_SET(t_msg->flag, FLAG_TASK_EXIT)){ + break; // 被要求退出线程 + } + } + if (IS_SET(t_msg->flag, FLAG_TASK_EXIT)){ + break; // 被要求退出线程 + } + + q = QUEUE_HEAD(&pool_ctx->wait_to_run); + + QUEUE_REMOVE(q); + QUEUE_INIT(q); /* Signal uv_cancel() that the work req is executing. 
*/
+
+		pthread_mutex_unlock(&pool_ctx->mutex);
+		to_run = QUEUE_DATA(q, struct _work, queue);
+
+		pthread_mutex_lock(&to_run->mutex);
+		CLR_FLAG(to_run->flag, FLAG_WORK_DONE);
+		SET_FLAG(to_run->flag, FLAG_WORK_RUN);
+		pthread_mutex_unlock(&to_run->mutex);
+
+		to_run->loop(to_run->usr_ctx);
+
+		pthread_mutex_lock(&to_run->mutex);
+		if (IS_SET(to_run->flag, FLAG_WORK_WAIT)) {
+			CLR_FLAG(to_run->flag, FLAG_WORK_RUN);
+			SET_FLAG(to_run->flag, FLAG_WORK_DONE);
+			pthread_cond_signal(&to_run->cond);
+			pthread_mutex_unlock(&to_run->mutex);
+		}else{
+			pthread_mutex_unlock(&to_run->mutex); // auto-free work: the worker releases it itself
+			pthread_cond_destroy(&to_run->cond);
+			pthread_mutex_destroy(&to_run->mutex);
+			free(to_run);
+		}
+
+		pthread_mutex_lock(&pool_ctx->mutex);
+
+	}
+	TASK_CLR_ACTIVE(t_msg->flag);
+	pthread_mutex_unlock(&pool_ctx->mutex);
+	TP_LOG_DEBUG("work thread[%lu] exit success.", t_msg->thread_id);
+	return NULL;
+}
+
+
+tp_handle tp_create_thread_pool(struct tp_param *p)
+{
+	uint32_t i;
+	struct _thread_pool_msg *pool = NULL;
+	uint32_t thread_num = 5;
+
+	if (p) {
+		thread_num = (p->thread_max_num > 0)?p->thread_max_num:thread_num;
+	}
+	pool = (struct _thread_pool_msg *)calloc(1, sizeof(struct _thread_pool_msg) + thread_num * sizeof(struct _thread_msg));
+	LOG_THEN_RETURN_VAL_IF_TRUE((!pool), NULL, "pool calloc fail.");
+
+	pool->thread_num = thread_num;
+	pthread_mutex_init(&pool->mutex, NULL);	/* initialize the mutex */
+	pthread_cond_init(&pool->cond, NULL);	/* initialize the condition variable */
+
+	QUEUE_INIT(&pool->wait_to_run);
+
+	pool->thread = (struct _thread_msg*)pool->ext_data;
+	LOG_THEN_GOTO_TAG_IF_VAL_TRUE((!pool->thread), error_1, "pool->thread calloc fail.");
+
+	pool->sync = (sem_t *)calloc(1, sizeof(sem_t));
+	LOG_THEN_GOTO_TAG_IF_VAL_TRUE((!pool->sync), error_1, "pool->sync calloc fail.");
+
+	if(sem_init(pool->sync, 0, 0) < 0) {
+		TP_LOG_ERROR("sem_init fail, to free pool.");
+		goto error_2;
+	}
+
+	for (i = 0; i < thread_num; i++){
+		if (pthread_create(&pool->thread[i].thread_id, NULL, task_worker, pool) < 0) {
+			TP_LOG_ERROR("create thread fail, to free pool.");
+			pool->thread[i].thread_id = 0;
+			goto error_3;
+		}
+		sem_wait(pool->sync);
+		TP_LOG_DEBUG("create thread[%lu] success, do next.", pool->thread[i].thread_id);
+	}
+	if(pool->sync)
+		free(pool->sync);
+	pool->sync = NULL;
+
+	return pool;
+error_3:
+	pthread_mutex_lock(&pool->mutex);
+	for (i = 0; i < thread_num; i++){
+		TASK_SET_EXIT(pool->thread[i].flag);
+		TP_LOG_DEBUG("notify thread[%lu] exit, wait.", pool->thread[i].thread_id);
+	}
+	pthread_cond_broadcast(&pool->cond);
+	pthread_mutex_unlock(&pool->mutex);
+	for (i = 0; i < pool->thread_num; i++){
+		TP_LOG_DEBUG("join thread[%lu] exit, do next.", pool->thread[i].thread_id);
+		if (pool->thread[i].thread_id > 0)
+			pthread_join(pool->thread[i].thread_id, NULL);
+	}
+error_2:
+	if(pool->sync)
+		free(pool->sync);
+	pool->sync = NULL;
+error_1:
+	pthread_cond_destroy(&pool->cond);
+	pthread_mutex_destroy(&pool->mutex);
+	if (pool)
+		free(pool);
+	return NULL;
+}
+
+int tp_destroy_thread_pool(tp_handle *fd)
+{
+	uint32_t i;
+	struct _thread_pool_msg *pool = (struct _thread_pool_msg *)(*fd);
+	LOG_THEN_RETURN_VAL_IF_TRUE((!pool), -1,"pool null fail.");
+
+	pthread_mutex_lock(&pool->mutex);
+	for (i = 0; i < pool->thread_num; i++){
+		if (!(IS_SET(pool->thread[i].flag, FLAG_TASK_IDLE))){
+			TP_LOG_ERROR("the thread[%lu], work_id[%u] is running, can't stop it", pool->thread[i].thread_id, i);
+			goto error;
+		}
+	}
+	for (i = 0; i < pool->thread_num; i++){
+		TASK_SET_EXIT(pool->thread[i].flag);
+		TP_LOG_DEBUG("notify thread[%lu] exit, wait.", pool->thread[i].thread_id);
+	}
+
+	pthread_cond_broadcast(&pool->cond);
+	pthread_mutex_unlock(&pool->mutex);
+
+	for (i = 0; i < pool->thread_num; i++){
+		TP_LOG_DEBUG("join[%u] thread[%lu] exit, do next.", i, pool->thread[i].thread_id);
+		if (pool->thread[i].thread_id > 0)
+			pthread_join(pool->thread[i].thread_id, NULL);
+	}
+
+	pthread_cond_destroy(&pool->cond);
+	pthread_mutex_destroy(&pool->mutex);
+	if(pool)
+		free(pool);
+	TP_LOG_DEBUG(" free pool success, exit.");
+	*fd = NULL;
+	return 0;
+error:
+	pthread_mutex_unlock(&pool->mutex);
+	return -1;
+}
+
+work_handle_t tp_post_one_work(tp_handle fd, struct tp_thread_work *w, uint8_t auto_free)
+{
+	struct _thread_pool_msg *pool = (struct _thread_pool_msg *)fd;
+	struct _work *work;
+	int ret;
+	LOG_THEN_RETURN_VAL_IF_TRUE((!pool || !w), NULL,"pool null or w null fail.");
+	LOG_THEN_RETURN_VAL_IF_TRUE(!w->loop, NULL, "work loop null, fail.");
+
+	work = (struct _work*)calloc(1, sizeof(struct _work));
+	LOG_THEN_RETURN_VAL_IF_TRUE((!work), NULL, "work calloc fail.");
+	work->loop = w->loop;
+	work->stop = w->stop;
+	work->usr_ctx = w->usr_ctx;
+
+	ret = pthread_mutex_init(&work->mutex, NULL); /* initialize the mutex */
+	LOG_THEN_GOTO_TAG_IF_VAL_TRUE((ret != 0), error, "pthread_mutex_init fail.");
+	ret = pthread_cond_init(&work->cond, NULL);	/* initialize the condition variable */
+	if (ret != 0) {
+		pthread_mutex_destroy(&work->mutex);
+		goto error;
+	}
+	SET_FLAG(work->flag, FLAG_WORK_INIT);
+	if (auto_free){
+		CLR_FLAG(work->flag, FLAG_WORK_WAIT);
+	}
+	pthread_mutex_lock(&pool->mutex);
+	QUEUE_INSERT_TAIL(&pool->wait_to_run, &work->queue);
+	if (pool->idle_num > 0)
+		pthread_cond_signal(&pool->cond);
+	pthread_mutex_unlock(&pool->mutex);
+	if (auto_free){
+		work = (struct _work*)HIDE_ADDR;
+	}
+	return work;
+error:
+	if (work)
+		free(work);
+	work = NULL;
+
+	return NULL;
+}
+
+int tp_wait_work_done(work_handle_t *w, uint32_t timeout_ms)
+{
+	struct _work *work = (struct _work *)(*w);
+	struct timespec abstime;
+	struct timeval now;
+	uint64_t nsec;
+	LOG_THEN_RETURN_VAL_IF_TRUE((!w || !work || (work == (void*)HIDE_ADDR)), -1, "work_handle_t fail.");
+	pthread_mutex_lock(&work->mutex);
+	if (IS_SET(work->flag, FLAG_WORK_DONE)){
+		goto end;
+	}
+
+	SET_FLAG(work->flag, FLAG_WORK_WAIT);
+	if (timeout_ms > 0) {
+		gettimeofday(&now, NULL);	// thread-safe
+		nsec = now.tv_usec * 1000 + (timeout_ms % 1000) * 1000000;
+		abstime.tv_sec=now.tv_sec + nsec / 1000000000 + timeout_ms / 1000;
+		abstime.tv_nsec=nsec % 1000000000;
+		pthread_cond_timedwait(&work->cond, &work->mutex, &abstime);
+	}else{
+		pthread_cond_wait(&work->cond, &work->mutex);
+	}
+
+	if (!IS_SET(work->flag, FLAG_WORK_DONE)){
+		TP_LOG_ERROR("the work wait fail.");
+		CLR_FLAG(work->flag, FLAG_WORK_WAIT);
+		pthread_mutex_unlock(&work->mutex);
+		return -1;
+	}
+
+	pthread_mutex_unlock(&work->mutex);
+end:
+	if (IS_SET(work->flag, FLAG_WORK_INIT)) {
+		pthread_cond_destroy(&work->cond);
+		pthread_mutex_destroy(&work->mutex);
+	}
+	if (work)
+		free(work);
+	*w = NULL;
+	return 0;
+}
+
+int tp_cancel_one_work(work_handle_t *w)
+{
+	struct _work *work = (struct _work *)(*w);
+	LOG_THEN_RETURN_VAL_IF_TRUE((!w || !work || (work==(void*)HIDE_ADDR)), -1, "work_handle_t fail.");
+	pthread_mutex_lock(&work->mutex);
+	if (!work->stop){
+		pthread_mutex_unlock(&work->mutex);
+		return -1;
+	}
+
+	if (IS_SET(work->flag, FLAG_WORK_DONE)){
+		goto end;
+	}
+
+	work->stop(work->usr_ctx);
+	SET_FLAG(work->flag, FLAG_WORK_WAIT);
+	pthread_cond_wait(&work->cond, &work->mutex);
+	pthread_mutex_unlock(&work->mutex);
+end:
+	if (IS_SET(work->flag, FLAG_WORK_INIT)) {
+
pthread_cond_destroy(&work->cond); + pthread_mutex_destroy(&work->mutex); + } + if (*w) + free(*w); + *w = NULL; + return 0; +} diff --git a/src/common/threadpool.h b/src/common/threadpool.h new file mode 100644 index 0000000..0555394 --- /dev/null +++ b/src/common/threadpool.h @@ -0,0 +1,63 @@ +/* + * Copyright(C) 2020 Ruijie Network. All rights reserved. + */ + +/*! +* \file xxx.x +* \brief xxx +* +* 包含.. +* +* \copyright 2020 Ruijie Network. All rights reserved. +* \author hongchunhua@ruijie.com.cn +* \version v1.0.0 +* \date 2020.08.05 +* \note none +*/ + +#ifndef _THREAD_POOL_H_ +#define _THREAD_POOL_H_ + +#include +#include + + +#ifdef __cplusplus +extern "C" { +#endif + +#define TP_ERROR -1 +#define TP_SUCCESS 0 + + +#define WORK_DONE_AUTO_FREE 1 +#define WORK_DONE_MANUAL_FREE 0 + +typedef void *tp_handle; + +struct tp_param { + uint32_t thread_max_num; /* 线程池的线程数*/ + uint32_t max_stack_size; /* 线程栈空间大小*/ + uint32_t flag; +}; + +tp_handle tp_create_thread_pool(struct tp_param *p); +int tp_destroy_thread_pool(tp_handle *fd); + +typedef void *work_handle_t; + +struct tp_thread_work { + int (*loop)(void *usr_ctx); /* 循环回调函数*/ + void (*stop)(void *usr_ctx); /* 停止循环回调函数*/ + void *usr_ctx; /* 任务上下文*/ +}; + +work_handle_t tp_post_one_work(tp_handle fd, struct tp_thread_work *w, uint8_t auto_free); +int tp_wait_work_done(work_handle_t *w, uint32_t timeout_ms); +int tp_cancel_one_work(work_handle_t *w); + +#ifdef __cplusplus +} +#endif + +#endif /*XIO_API_H */ diff --git a/src/session/arpc_client.c b/src/session/arpc_client.c new file mode 100644 index 0000000..91d891f --- /dev/null +++ b/src/session/arpc_client.c @@ -0,0 +1,467 @@ +/* + * Copyright(C) 2020 Ruijie Network. All rights reserved. + */ + +/*! +* \file xxx.x +* \brief xxx +* +* 包含.. +* +* \copyright 2020 Ruijie Network. All rights reserved. +* \author hongchunhua@ruijie.com.cn +* \version v1.0.0 +* \date 2020.08.05 +* \note none +*/ + +#include +#include +#include + +#include "queue.h" +#include "arpc_com.h" +#include "threadpool.h" + +#ifdef _DEF_SESSION_CLIENT + +#define RE_CREATE_SESSION(x_data) \ +do{\ + if (x_data->session)\ + xio_session_destroy(x_data->session);\ + x_data->session = NULL;\ + x_data->session = xio_session_create(&x_data->session_param);\ + x_data->conn_param.session = x_data->session;\ + if (!x_data->conn)\ + x_data->conn = xio_connect(&x_data->conn_param);\ +}while(0); + +#define GET_CLIENT_SESSION_DATA(client, head, fd) \ +struct arpc_client_session_data *client = NULL;\ +struct arpc_handle_ex *head = (struct arpc_handle_ex *)fd;\ +do{\ + if(head && head->handle_ex)\ + client = (struct arpc_client_session_data *)head->handle_ex;\ +}while(0); + +struct msg_hashtable { + QUEUE queue; +}; + +/* ################## struct ###################*/ +struct arpc_client_session_data { + struct xio_session *session; /* session 资源 */ + struct xio_connection *conn; /* connection 资源*/ + struct xio_session_params session_param; + struct xio_connection_params conn_param; + struct arpc_session_ops *ops; + struct msg_hashtable msg_hash_table; /*发送消息的队列*/ +}; + +/* ################## function ###################*/ + +static int session_event(struct xio_session *session, + struct xio_session_event_data *event_data, + void *cb_user_context) +{ + GET_CLIENT_SESSION_DATA(x_data, fd, cb_user_context); + + ARPC_LOG_DEBUG("#### event:%d|%s. 
reason: %s, status:%d.",event_data->event,
+			xio_session_event_str(event_data->event),
+			xio_strerror(event_data->reason), fd->status);
+
+	pthread_mutex_lock(&fd->lock);
+	if (fd->status == SESSION_STA_INIT){
+		pthread_cond_signal(&fd->cond);
+	}
+	switch (event_data->event) {
+	case XIO_SESSION_TEARDOWN_EVENT:	// session torn down, retry right away
+		fd->retry_time++;
+		if(fd->status != SESSION_STA_CLEANUP){
+			RE_CREATE_SESSION(x_data);
+			ARPC_LOG_ERROR(" try build new session fd.");
+		}else{
+			if(session)
+				xio_session_destroy(session);
+			x_data->session = NULL;
+		}
+		xio_context_stop_loop(fd->ctx);
+		break;
+	case XIO_SESSION_CONNECTION_ESTABLISHED_EVENT:
+		fd->retry_time = 0;
+		fd->reconn_interval_s = 0;
+		fd->active_conn = event_data->conn;
+		fd->status = SESSION_STA_RUN_ACTION;
+		break;
+	case XIO_SESSION_CONNECTION_TEARDOWN_EVENT:	// connection torn down, release the connection resources
+		if (event_data->conn)
+			xio_connection_destroy(event_data->conn);
+		x_data->conn = NULL;
+		fd->active_conn = NULL;
+		if (fd->status != SESSION_STA_CLEANUP)
+			fd->status = SESSION_STA_WAIT;	// link is down
+		break;
+	case XIO_SESSION_REJECT_EVENT:
+	case XIO_SESSION_CONNECTION_REFUSED_EVENT: /**< connection refused event*/
+		if (fd->retry_time > RETRY_MAX_TIME) {
+			fd->reconn_interval_s = SERVER_DOWN_WAIT_TIME;
+			if (fd->status != SESSION_STA_CLEANUP)
+				fd->status = SESSION_STA_WAIT;
+		}else{
+			fd->reconn_interval_s = 1;	// reconnect immediately
+			if (fd->status != SESSION_STA_CLEANUP)
+				fd->status = SESSION_STA_RUN;
+		}
+		break;
+	case XIO_SESSION_ERROR_EVENT:
+		fd->retry_time++;
+		break;
+	default:
+		break;
+	};
+	pthread_mutex_unlock(&fd->lock);
+	return 0;
+}
+
+static int session_established(struct xio_session *session,
+				struct xio_new_session_rsp *rsp,
+				void *cb_user_context)
+{
+	ARPC_LOG_NOTICE("session established.");
+	return 0;
+}
+
+static int msg_error(struct xio_session *session,
+			enum xio_status error,
+			enum xio_msg_direction dir,
+			struct xio_msg *rsp,
+			void *cb_user_context)
+{
+	ARPC_LOG_ERROR("msg_error message to do.
"); + return 0; +} + +static int _client_msg_header_dispatch(struct xio_session *session, + struct xio_msg *msg, + void *cb_user_context) +{ + int ret = 0; + GET_CLIENT_SESSION_DATA(client, _fd, cb_user_context); + + ARPC_LOG_DEBUG("header message type:%d", msg->type); + switch(msg->type) { + case XIO_MSG_TYPE_REQ: + ret = _process_request_header(msg, &client->ops->req_ops, IOV_DEFAULT_MAX_LEN, _fd->usr_context); + break; + case XIO_MSG_TYPE_RSP: + ret = _process_rsp_header(msg, _fd->usr_context); + break; + case XIO_MSG_TYPE_ONE_WAY: + ret = _process_oneway_header(msg, &client->ops->oneway_ops, IOV_DEFAULT_MAX_LEN, _fd->usr_context); + break; + default: + break; + } + return ret; +} +static int _client_msg_data_dispatch(struct xio_session *session, + struct xio_msg *rsp, + int last_in_rxq, + void *cb_user_context) +{ + int ret = 0; + GET_CLIENT_SESSION_DATA(client, _fd,cb_user_context); + if (!rsp) + return 0; + ARPC_LOG_DEBUG("msg_data_dispatch, msg type:%d", rsp->type); + switch(rsp->type) { + case XIO_MSG_TYPE_REQ: + ret = _process_request_data(rsp, &client->ops->req_ops, last_in_rxq, _fd->usr_context); + break; + case XIO_MSG_TYPE_RSP: + ret = _process_rsp_data(rsp, last_in_rxq); + break; + case XIO_MSG_TYPE_ONE_WAY: + ret = _process_oneway_data(rsp, &client->ops->oneway_ops, last_in_rxq, _fd->usr_context); + break; + default: + break; + } + + return ret; +} + +static int _rsp_send_complete(struct xio_session *session, + struct xio_msg *rsp, + void *conn_user_context) +{ + GET_CLIENT_SESSION_DATA(client, _fd, conn_user_context); + return _process_send_rsp_complete(rsp, &client->ops->req_ops, _fd->usr_context); +} + +static int _on_msg_delivered(struct xio_session *session, + struct xio_msg *msg, + int last_in_rxq, + void *conn_user_context) +{ + ARPC_LOG_DEBUG("_on_msg_delivered, msg type:%d", msg->type); + return 0; +} + +static int _ow_msg_send_complete(struct xio_session *session, + struct xio_msg *msg, + void *conn_user_context) +{ + struct arpc_msg *_msg = (struct arpc_msg *)msg->user_context; + LOG_THEN_RETURN_VAL_IF_TRUE((!_msg), -1, "arpc_msg_data is empty."); + return _process_send_complete(_msg); +} + +static struct xio_session_ops x_client_ops = { + .on_session_event = &session_event, + .on_session_established = &session_established, + .rev_msg_data_alloc_buf = &_client_msg_header_dispatch, + .on_msg = &_client_msg_data_dispatch, + .on_msg_send_complete = &_rsp_send_complete, + .on_msg_delivered = &_on_msg_delivered, + .on_ow_msg_send_complete = &_ow_msg_send_complete, + .on_msg_error = &msg_error +}; + +static int arpc_client_run_session(void * ctx) +{ + struct arpc_handle_ex *_fd = (struct arpc_handle_ex *)ctx; + int32_t sleep_time = 0; + if (!_fd){ + ARPC_LOG_ERROR( "fd null, exit."); + return ARPC_ERROR; + } + + ARPC_LOG_DEBUG("session run on the thread[%lu].", pthread_self()); + for(;;){ + if (xio_context_run_loop(_fd->ctx, XIO_INFINITE) < 0) + ARPC_LOG_ERROR("xio error msg: %s.", xio_strerror(xio_errno())); + + ARPC_LOG_DEBUG("xio context run loop pause..."); + pthread_mutex_lock(&_fd->lock); + sleep_time = _fd->reconn_interval_s; + if(_fd->status == SESSION_STA_CLEANUP){ + ARPC_LOG_DEBUG("session ctx[%p] thread is stop loop, exit.", _fd->ctx); + pthread_cond_broadcast(&_fd->cond); // 释放信号,让等待信号的线程退出 + pthread_mutex_unlock(&_fd->lock); + break; + } + pthread_mutex_unlock(&_fd->lock); + + if (sleep_time > 0) + sleep(sleep_time); // 恢复周期 + } + ARPC_LOG_DEBUG("exit signaled."); + + return ARPC_SUCCESS; +} + +static void arpc_client_stop_session(void * ctx) +{ + 
arpc_client_destroy_session((arpc_session_handle_t)ctx); + return; +} + +arpc_session_handle_t arpc_client_create_session(const struct arpc_client_session_param *param) +{ + int ret = 0; + work_handle_t hd; + struct arpc_client_session_data *x_data = NULL; + struct arpc_handle_ex *fd = NULL; + struct tp_thread_work thread; + /* handle*/ + fd = (struct arpc_handle_ex *)ARPC_MEM_ALLOC( sizeof(struct arpc_handle_ex) + + sizeof(struct arpc_client_session_data), + NULL); + if (!fd) { + ARPC_LOG_ERROR( "malloc error, exit "); + return NULL; + } + memset(fd, 0, sizeof(struct arpc_handle_ex) + sizeof(struct arpc_client_session_data)); + + fd->type = SESSION_CLIENT; + x_data = (struct arpc_client_session_data *)fd->handle_ex; + + pthread_mutex_init(&fd->lock, NULL); /* 初始化互斥锁 */ + pthread_cond_init(&fd->cond, NULL); /* 初始化条件变量 */ + + ret = get_uri(¶m->con, fd->uri, URI_MAX_LEN); + if ( ret < 0) { + ARPC_LOG_ERROR( "get_uri error, exit "); + goto error_2; + } + + /* context */ + fd->ctx = xio_context_create(NULL, 0, fd->affinity); + if (fd->ctx == NULL){ + ARPC_LOG_ERROR( "xio_context_create error, exit "); + goto error_2; + } + + /* session */ + (void)memset(&x_data->session_param, 0, sizeof(struct xio_session_params)); + x_data->session_param.type = XIO_SESSION_CLIENT; + x_data->session_param.ses_ops = &x_client_ops; + x_data->session_param.user_context = fd; + x_data->session_param.uri = fd->uri; + x_data->session_param.initial_sn = 1; + if (param->req_data && param->req_data_len && param->req_data_len < MAX_SESSION_REQ_DATA_LEN) { + x_data->session_param.private_data = ARPC_MEM_ALLOC(param->req_data_len, NULL); + if (x_data->session_param.private_data) { + x_data->session_param.private_data_len = param->req_data_len; + memcpy(x_data->session_param.private_data, param->req_data, param->req_data_len); + }else{ + ARPC_LOG_ERROR( "ARPC_MEM_ALLOC private_data error, exit "); + goto error_3; + } + } + + x_data->session = xio_session_create(&x_data->session_param); + if (x_data->session == NULL){ + ARPC_LOG_ERROR( "xio_session_create error, exit "); + goto error_3; + } + + /* connection */ + (void)memset(&x_data->conn_param, 0, sizeof(struct xio_connection_params)); + x_data->conn_param.session = x_data->session; + x_data->conn_param.ctx = fd->ctx; + x_data->conn_param.conn_idx = 0; + x_data->conn_param.conn_user_context = fd; + x_data->conn = xio_connect(&x_data->conn_param); + if(!x_data->conn){ + ARPC_LOG_ERROR( "xio_connect error, exit "); + goto error_4; + } + /* others*/ + fd->usr_context = param->ops_usr_ctx; + fd->active_conn = x_data->conn; + fd->status = SESSION_STA_INIT; + x_data->ops = param->ops; + + /* 线程池申请资源*/ + thread.loop = &arpc_client_run_session; + thread.stop = &arpc_client_stop_session; + thread.usr_ctx = (void*)fd; + hd = tp_post_one_work(_arpc_get_threadpool(), &thread, WORK_DONE_AUTO_FREE); + if(!hd){ + ARPC_LOG_ERROR( "tp_post_one_work error, exit "); + goto error_5; + } + pthread_mutex_lock(&fd->lock); + pthread_cond_wait(&fd->cond, &fd->lock); + pthread_mutex_unlock(&fd->lock); + ARPC_LOG_DEBUG("Create session success."); + return (arpc_session_handle_t)fd; + +error_5: + if(x_data->conn) + xio_connection_destroy(x_data->conn); + x_data->conn = NULL; + +error_4: + if(x_data->session) + xio_session_destroy(x_data->session); + x_data->session = NULL; +error_3: + if (x_data->session_param.private_data) { + ARPC_MEM_FREE(x_data->session_param.private_data, NULL); + } + x_data->session_param.private_data =NULL; + if (fd->ctx) + xio_context_destroy(fd->ctx); + fd->ctx = NULL; 
+error_2: + pthread_cond_destroy(&fd->cond); + pthread_mutex_destroy(&fd->lock); + if (fd) + free(fd); + fd = NULL; + ARPC_LOG_ERROR( "create session fail, exit."); + return NULL; +} + +int arpc_client_destroy_session(arpc_session_handle_t *fd) +{ + struct arpc_client_session_data *x_data = NULL; + struct arpc_handle_ex *_fd = (struct arpc_handle_ex *)*fd; + if(!_fd){ + return ARPC_ERROR; + } + pthread_mutex_lock(&_fd->lock); + if(_fd->type != SESSION_CLIENT){ + ARPC_LOG_ERROR( "session not client session."); + goto error; + } + + if(_fd->status != SESSION_STA_CLEANUP) { + ARPC_LOG_DEBUG( "session thread is running, status[%d].", _fd->status); + _fd->status = SESSION_STA_CLEANUP; /* 停止 session*/ + if (_fd->active_conn) + xio_disconnect(_fd->active_conn); + _fd->active_conn = NULL; + pthread_cond_wait(&_fd->cond, &_fd->lock); /* 等待退出的信号 */ + } + + x_data = (struct arpc_client_session_data *)_fd->handle_ex; + if (x_data){ + if(x_data->session) + xio_session_destroy(x_data->session); + x_data->session = NULL; + } + + if (x_data->session_param.private_data) { + ARPC_MEM_FREE(x_data->session_param.private_data, NULL); + } + x_data->session_param.private_data =NULL; + + if (_fd->ctx) + xio_context_destroy(_fd->ctx); + _fd->ctx = NULL; + pthread_cond_destroy(&_fd->cond); + + pthread_mutex_unlock(&_fd->lock); + pthread_mutex_destroy(&_fd->lock); + + if (_fd) + ARPC_MEM_FREE(_fd, NULL); + *fd = NULL; + ARPC_LOG_DEBUG( "destroy session success, exit."); + return ARPC_SUCCESS; +error: + pthread_mutex_unlock(&_fd->lock); + return ARPC_ERROR; +} + +enum arpc_session_status arpc_get_session_status(const arpc_session_handle_t fd) +{ + struct arpc_handle_ex *_fd = (struct arpc_handle_ex *)fd; + enum session_status status; + enum arpc_session_status out_sta = ARPC_SESSION_STA_NOT_EXISTED; + if(!_fd){ + return ARPC_ERROR; + } + + status = _fd->status; + switch (status) + { + case SESSION_STA_RUN: + out_sta = ARPC_SESSION_STA_RE_CON; + break; + case SESSION_STA_RUN_ACTION: + out_sta = ARPC_SESSION_STA_ACTIVE; + break; + case SESSION_STA_WAIT: + out_sta = ARPC_SESSION_STA_WAIT; + break; + default: + break; + } + return out_sta; +} +#endif diff --git a/src/session/arpc_com.c b/src/session/arpc_com.c new file mode 100644 index 0000000..33fd9a0 --- /dev/null +++ b/src/session/arpc_com.c @@ -0,0 +1,336 @@ +/* + * Copyright(C) 2020 Ruijie Network. All rights reserved. + */ + +/*! +* \file xxx.x +* \brief xxx +* +* 包含.. +* +* \copyright 2020 Ruijie Network. All rights reserved. 
+* \author hongchunhua@ruijie.com.cn
+* \version v1.0.0
+* \date 2020.08.05
+* \note none
+*/
+
+#include <stdio.h>
+#include <string.h>
+#include <pthread.h>
+#include <sys/time.h>
+#include <arpa/inet.h>
+
+#include "arpc_com.h"
+#include "threadpool.h"
+
+struct aprc_paramter{
+	int thread_max_num;
+	tp_handle thread_pool;
+};
+
+static struct aprc_paramter g_param= {
+	.thread_max_num = 5,
+	.thread_pool = NULL,
+};
+
+int get_uri(const struct arpc_con_info *param, char *uri, uint32_t uri_len)
+{
+	const char *type = NULL, *ip=NULL;
+	uint32_t port = 0;
+	if (!param || !uri) {
+		return -1;
+	}
+	switch(param->type){
+	case ARPC_E_TRANS_TCP:
+		type = "tcp";
+		ip = param->ipv4.ip;
+		port = param->ipv4.port;
+		break;
+	default:
+		ARPC_LOG_ERROR("unknown type:[%d].", param->type);
+		return -1;
+	}
+	(void)sprintf(uri, "%s://%s:%u", type, ip, port);
+	ARPC_LOG_NOTICE("uri:[%s].", uri);
+	return 0;
+}
+
+int _arpc_get_ipv4_addr(struct sockaddr_storage *src_addr, char *ip, uint32_t len, uint32_t *port)
+{
+	struct sockaddr_in *s4;
+	LOG_THEN_RETURN_VAL_IF_TRUE((!src_addr || !ip || !port), ARPC_ERROR, "input null.");
+	LOG_THEN_RETURN_VAL_IF_TRUE((src_addr->ss_family != AF_INET || len < INET_ADDRSTRLEN), ARPC_ERROR, "input invalid.");
+
+	s4 = (struct sockaddr_in *)src_addr;
+	*port = s4->sin_port;
+	inet_ntop(AF_INET, &s4->sin_addr, ip, len);
+	return ARPC_SUCCESS;
+}
+
+int arpc_init()
+{
+	struct tp_param p = {0};
+	ARPC_LOG_DEBUG( "arpc_init.");
+	xio_init();
+	p.thread_max_num = g_param.thread_max_num;
+	g_param.thread_pool = tp_create_thread_pool(&p);
+	return 0;
+}
+
+void arpc_finish()
+{
+	ARPC_LOG_DEBUG( "arpc_finish.");
+	tp_destroy_thread_pool(&g_param.thread_pool);
+	xio_shutdown();
+}
+
+tp_handle _arpc_get_threadpool()
+{
+	return g_param.thread_pool;
+}
+
+void _debug_printf_msg(struct xio_msg *rsp)
+{
+	struct xio_iovec *sglist = vmsg_base_sglist(&rsp->in);
+	char *str;
+	uint32_t nents = vmsg_sglist_nents(&rsp->in);
+	uint32_t len, i;
+	char tmp;
+	str = (char *)rsp->in.header.iov_base;
+	len = rsp->in.header.iov_len;
+	if (str) {
+		tmp = str[len -1];
+		str[len -1] = '\0';
+		ARPC_LOG_DEBUG("message header : [%llu] - %s.",(unsigned long long)(rsp->sn + 1), str);
+		str[len -1] = tmp;
+	}
+	for (i = 0; i < nents; i++) {
+		str = (char *)sglist[i].iov_base;
+		len = sglist[i].iov_len;
+		if (str) {
+			tmp = str[len -1];
+			str[len -1] = '\0';
+			ARPC_LOG_DEBUG("message data: [%llu][%d][%d] - %s\n",
+				(unsigned long long)(rsp->sn + 1),
+				i, len, str);
+			str[len -1] = tmp;
+		}
+	}
+	return;
+}
+
+int
+_arpc_wait_request_rsp(struct arpc_msg_data* pri_msg, int32_t timeout_ms)
+{
+	int ret;
+	struct timespec abstime;
+	struct timeval now;
+	uint64_t nsec;
+	gettimeofday(&now, NULL);	// thread-safe
+	nsec = now.tv_usec * 1000 + (timeout_ms % 1000) * 1000000;
+	abstime.tv_sec=now.tv_sec + nsec / 1000000000 + timeout_ms / 1000;
+	abstime.tv_nsec=nsec % 1000000000;
+	ret = pthread_cond_timedwait(&pri_msg->cond, &pri_msg->lock, &abstime);
+	return ret;
+}
+
+
+struct _async_deal_msg_param{
+	struct xio_msg *oneway_msg;
+	struct arpc_vmsg rev_iov;
+	struct _async_proc_ops ops;
+	void *usr_ctx;
+};
+
+static int _msg_async_deal(void *usr_ctx)
+{
+	struct _async_deal_msg_param *async = (struct _async_deal_msg_param *)usr_ctx;
+	uint32_t i;
+	ARPC_LOG_DEBUG("Note: msg deal on thread[%lu]...", pthread_self());// to do
+	if (!async){
+		ARPC_LOG_ERROR("usr_ctx null, exit.");// to do
+		return 0;
+	}
+	LOG_THEN_RETURN_VAL_IF_TRUE((!async->ops.proc_async_cb), ARPC_ERROR, "proc_async_cb null.");
+	async->ops.proc_async_cb(&async->rev_iov, async->usr_ctx);
+
+	// release resources
+
TO_FREE_USER_DATA_BUF(async->ops.free_cb, async->usr_ctx, async->rev_iov.vec, async->rev_iov.vec_num, i); + + if (async->oneway_msg){ + xio_release_msg(async->oneway_msg); + async->oneway_msg = NULL; + } + // free + if (async->rev_iov.head) + ARPC_MEM_FREE(async->rev_iov.head, NULL); + async->rev_iov.head = NULL; + if (async->rev_iov.vec) + ARPC_MEM_FREE(async->rev_iov.vec, NULL); + async->rev_iov.vec = NULL; + + ARPC_MEM_FREE(async, NULL); + return 0; +} + +int _post_iov_to_async_thread(struct arpc_vmsg *iov, struct xio_msg *oneway_msg, struct _async_proc_ops *ops, void *usr_ctx) +{ + struct _async_deal_msg_param *async; + struct tp_thread_work thread; + LOG_THEN_RETURN_VAL_IF_TRUE((!iov || !ops), ARPC_ERROR, "rev_iov or ops null."); + + LOG_THEN_RETURN_VAL_IF_TRUE((!iov->head_len && !iov->vec_num), ARPC_ERROR, "invalid in of iov."); + + LOG_THEN_RETURN_VAL_IF_TRUE((!ops->free_cb || !ops->proc_async_cb), ARPC_ERROR, "ops invalid."); + /* 线程池申请资源*/ + async = (struct _async_deal_msg_param*)ARPC_MEM_ALLOC(sizeof(struct _async_deal_msg_param), NULL); + if (!async){ + ARPC_LOG_ERROR( "ARPC_MEM_ALLOC fail, exit "); + return ARPC_ERROR; + } + memset(async, 0, sizeof(struct _async_deal_msg_param)); + // data buff not copy; + async->ops = *ops; + async->rev_iov.total_data = iov->total_data; + async->rev_iov.vec_num = iov->vec_num; + async->rev_iov.vec = iov->vec; + async->oneway_msg = oneway_msg; + // deep copy; + if (iov->head_len) { + async->rev_iov.head = (void*)ARPC_MEM_ALLOC(iov->head_len, NULL); + if (!async->rev_iov.head) + goto error; + async->rev_iov.head_len = iov->head_len; + memcpy(async->rev_iov.head, iov->head, iov->head_len); + } + async->usr_ctx = usr_ctx; + + thread.loop = &_msg_async_deal; + thread.stop = NULL; + thread.usr_ctx = (void*)async; + if(!tp_post_one_work(_arpc_get_threadpool(), &thread, WORK_DONE_AUTO_FREE)){ + ARPC_LOG_ERROR( "tp_post_one_work error."); + goto error; + } + iov->vec =NULL; + iov->vec_num = 0; + return 0; +error: + ARPC_LOG_ERROR( "_post_iov_to_async_thread error, exit "); + if (async->rev_iov.head) + ARPC_MEM_FREE(async->rev_iov.head, NULL); + if (async->rev_iov.vec) + ARPC_MEM_FREE(async->rev_iov.vec, NULL); + if (async) + ARPC_MEM_FREE(async, NULL); + async = NULL; + return ARPC_ERROR; +} + +int _create_header_source(struct xio_msg *msg, struct _proc_header_func *ops, uint64_t iov_max_len, void *usr_ctx) +{ + struct xio_iovec *sglist; + uint32_t nents = 0; + struct arpc_header_msg header; + uint32_t flag = 0; + uint32_t i; + int ret; + + LOG_THEN_RETURN_VAL_IF_TRUE((!msg->in.header.iov_base || !msg->in.header.iov_len), -1, "header null."); + LOG_THEN_RETURN_VAL_IF_TRUE((!msg->in.header.iov_len), -1, "header len is 0."); + LOG_THEN_RETURN_VAL_IF_TRUE((!ops->proc_head_cb), -1, "proc_head_cb null."); + + memset(&header, 0, sizeof(struct arpc_header_msg)); + header.head = msg->in.header.iov_base; + header.head_len = msg->in.header.iov_len; + header.data_len = msg->in.total_data_len; + // header process + msg->usr_flags = 0; + ret = ops->proc_head_cb(&header, usr_ctx, &flag); + if (ret != ARPC_SUCCESS || !msg->in.total_data_len){ + SET_FLAG(msg->usr_flags, FLAG_MSG_ERROR_DISCARD_DATA); // data数据不做处理 + ARPC_LOG_DEBUG("discard data, total_data_len[%lu].", msg->in.total_data_len); + return ARPC_ERROR; + } + msg->usr_flags = flag; + // alloc data buf form user define call back + if (!IS_SET(msg->usr_flags, METHOD_ALLOC_DATA_BUF)) { + ARPC_LOG_DEBUG("not need alloc data buf."); + return ARPC_SUCCESS; + } + + if (!ops->alloc_cb || !ops->free_cb) { + 
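+        /* METHOD_ALLOC_DATA_BUF was requested by the header callback, but no
+         * alloc/free hooks were registered: clear the flag and report an error. */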
CLR_FLAG(msg->usr_flags, METHOD_ALLOC_DATA_BUF); + ARPC_LOG_DEBUG("func malloc or free is null."); + return ARPC_ERROR; + } + + // 分配内存 + nents = (msg->in.total_data_len / iov_max_len + 1); + sglist = (struct xio_iovec* )ARPC_MEM_ALLOC(nents * sizeof(struct xio_iovec), NULL); + LOG_THEN_GOTO_TAG_IF_VAL_TRUE((!msg->in.data_tbl.sglist), error, "calloc fail."); + + for (i = 0; i < nents -1; i++) { + sglist[i].iov_len = iov_max_len; + sglist[i].iov_base = ops->alloc_cb(sglist[i].iov_len, usr_ctx); + LOG_THEN_GOTO_TAG_IF_VAL_TRUE((!sglist[i].iov_base), error_1, "calloc fail."); + } + sglist[i].iov_len = (msg->in.total_data_len % iov_max_len); + sglist[i].iov_base = ops->alloc_cb(sglist[i].iov_len, usr_ctx); + + // 出参 + msg->in.data_tbl.sglist = (void*)sglist; + vmsg_sglist_set_nents(&msg->in, nents); + msg->in.sgl_type = XIO_SGL_TYPE_IOV_PTR; + return 0; + +error_1: + for (i = 0; i < nents; i++) { + if (sglist[i].iov_base) + ops->free_cb(sglist[i].iov_base, usr_ctx); + sglist[i].iov_base =NULL; + } +error: + if (sglist) { + ARPC_MEM_FREE(sglist, NULL); + sglist = NULL; + } + CLR_FLAG(msg->usr_flags, METHOD_ALLOC_DATA_BUF); + return -1; +} + +int _clean_header_source(struct xio_msg *msg, mem_free_cb_t free_cb, void *usr_ctx) +{ + struct xio_iovec *sglist; + uint32_t nents; + uint32_t i; + + LOG_THEN_RETURN_VAL_IF_TRUE((!msg), ARPC_ERROR, "msg null."); + LOG_THEN_RETURN_VAL_IF_TRUE((!free_cb), ARPC_ERROR, "alloc free is null."); + + if (!IS_SET(msg->usr_flags, METHOD_ALLOC_DATA_BUF)) { + ARPC_LOG_DEBUG("not need free data buf."); + return ARPC_ERROR; + } + // 释放内存 + nents = vmsg_sglist_nents(&msg->in); + sglist = vmsg_base_sglist(&msg->in); + if (!sglist || !nents){ + ARPC_LOG_ERROR("msg buf is null, nents:%u.", nents); + return ARPC_ERROR; + } + for (i = 0; i < nents; i++) { + if (sglist[i].iov_base) + free_cb(sglist[i].iov_base, usr_ctx); + } + if (sglist) + ARPC_MEM_FREE(sglist, NULL); + + // 出参 + msg->in.data_tbl.sglist = NULL; + vmsg_sglist_set_nents(&msg->in, 0); + msg->in.sgl_type = XIO_SGL_TYPE_IOV_PTR; + CLR_FLAG(msg->usr_flags, METHOD_ALLOC_DATA_BUF); + return 0; +} diff --git a/src/session/arpc_com.h b/src/session/arpc_com.h new file mode 100644 index 0000000..89b48d2 --- /dev/null +++ b/src/session/arpc_com.h @@ -0,0 +1,185 @@ +/* + * Copyright(C) 2020 Ruijie Network. All rights reserved. + */ + +/*! +* \file xxx.x +* \brief xxx +* +* 包含.. +* +* \copyright 2020 Ruijie Network. All rights reserved. +* \author hongchunhua@ruijie.com.cn +* \version v1.0.0 +* \date 2020.08.05 +* \note none +*/ + +#ifndef _ARPC_COM_H +#define _ARPC_COM_H + +#include +#include +#include +#include + +#include "base_log.h" + +#include "libxio.h" +#include "arpc_api.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define ARPC_ERROR -1 +#define ARPC_SUCCESS 0 + +#define ARPC_LOG_ERROR(format, arg...) BASE_LOG_ERROR(format, ##arg) +#define ARPC_LOG_NOTICE(format, arg...) BASE_LOG_NOTICE(format, ##arg) +#define ARPC_LOG_DEBUG(format, arg...) BASE_LOG_DEBUG(format, ##arg) + +#define ARPC_ASSERT(condition, format, arg...) 
BASE_ASSERT(condition, format, ##arg) + +#define URI_MAX_LEN 256 +#define MAX_DATA_SEG_LEN 1024 +#define DEFAULT_DEPTH 4 + +#define RETRY_MAX_TIME 10 /* session断开自动重连次数*/ +#define SERVER_DOWN_WAIT_TIME 10 + +#define SAFE_FREE_MEM(prt) if(prt) free(prt);prt= NULL; + +#define IS_SET(flag, tag) (flag&(1< +#include + +#include "queue.h" +#include "arpc_com.h" + + +#ifdef _DEF_SESSION_CLIENT + +#define MAX_SEND_ONEWAY_END_TIME 5*1000 + +typedef int (*func_xio_send_msg)(struct xio_connection *conn, struct xio_msg *msg); + +static struct xio_msg *_arpc_create_xio_msg(struct arpc_msg *msg); +static void _arpc_destroy_xio_msg(struct arpc_msg *msg); + +static struct arpc_msg *_xio_create_arpc_msg(struct xio_msg *rsp_msg); +static void _xio_destroy_arpc_msg(struct xio_msg *rsp_msg); + +static int _alloc_buf_to_rsp_msg(struct xio_msg *rsp); +static int _free_buf_on_rsp_msg(struct xio_msg *rsp); + +/** + * 发送一个请求消息 + * @param[in] fd ,a session handle + * @param[in] msg ,a data that will send + * @return receive .0,表示发送成功,小于0则失败 + */ +int arpc_do_request(const arpc_session_handle_t fd, struct arpc_msg *msg, int32_t timeout_ms) +{ + struct arpc_handle_ex *_fd = (struct arpc_handle_ex *)fd; + struct xio_msg *req = NULL; + int ret = ARPC_ERROR; + struct arpc_msg_data *pri_msg = NULL; + struct arpc_msg *rev_msg; + + LOG_THEN_RETURN_VAL_IF_TRUE((!fd || !msg ), ARPC_ERROR, "arpc_session_handle_t fd null, exit."); + + pri_msg = (struct arpc_msg_data*)msg->handle; + pthread_mutex_lock(&pri_msg->lock); // to lock + // get msg + req = _arpc_create_xio_msg(msg); + LOG_THEN_GOTO_TAG_IF_VAL_TRUE((req == NULL), error, "_arpc_convert_xio_msg fail."); + + /*session 发送数据*/ + pthread_mutex_lock(&_fd->lock); + if(_fd->active_conn && _fd->status == SESSION_STA_RUN_ACTION) { + ret = xio_send_request(_fd->active_conn, req); + }else{ + ret = ARPC_ERROR; + ARPC_LOG_ERROR("session invalid, session status:%d.", _fd->status); + } + pthread_mutex_unlock(&_fd->lock); + MSG_SET_REQ(pri_msg->flag); + LOG_THEN_GOTO_TAG_IF_VAL_TRUE(ret, error, "xio_send_msg fail, ret:%d.", ret); + + if (msg->proc_rsp_cb && msg->clean_send_cb) { + goto end;// 可以实现完全非阻塞 + }else if (msg->proc_rsp_cb) { + SET_FLAG(pri_msg->flag, XIO_SEND_END_TO_NOTIFY); //发送完成,发信号通知 + ret = _arpc_wait_request_rsp(pri_msg, MAX_SEND_ONEWAY_END_TIME); + LOG_ERROR_IF_VAL_TRUE(ret, "receive rsp msg fail for time out or system fail."); + goto end;// 发送阻塞,接收非阻塞 + } + + // 全部等待回复 + if (timeout_ms > 0) + ret = _arpc_wait_request_rsp(pri_msg, timeout_ms); + else + ret = pthread_cond_wait(&pri_msg->cond, &pri_msg->lock); + LOG_ERROR_IF_VAL_TRUE(ret, "receive rsp msg fail for time out or system fail."); + MSG_CLR_REQ(pri_msg->flag); + + //取回复对的数据 + rev_msg = _xio_create_arpc_msg(req); + if (!rev_msg || rev_msg != msg){ + ARPC_LOG_ERROR("receive msg invalid."); + ret = ARPC_ERROR; + } + _arpc_destroy_xio_msg(msg); // 释放发送资源 + +end: + pthread_mutex_unlock(&pri_msg->lock); //un lock + return ret; + +error: + _arpc_destroy_xio_msg(msg); + pthread_mutex_unlock(&pri_msg->lock); //un lock + return ARPC_ERROR; +} + +/** + * 发送一个单向消息(接收方无需回复) + * @param[in] fd ,a session handle + * @param[in] msg ,a data that will send + * @return receive .0,表示发送成功,小于0则失败 + */ +int arpc_send_oneway_msg(const arpc_session_handle_t fd, struct arpc_msg *msg) +{ + struct arpc_handle_ex *_fd = (struct arpc_handle_ex *)fd; + struct arpc_msg_data *pri_msg = NULL; + int ret = ARPC_ERROR; + struct xio_msg *req = NULL; + + LOG_THEN_RETURN_VAL_IF_TRUE((!_fd || !msg ), ARPC_ERROR, "arpc_session_handle_t fd null, exit."); 
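+    /* One-way send: build the xio message under the per-message lock, send it
+     * under the session lock, and, when no clean_send_cb is supplied, wait (with
+     * a bounded timeout) for the send-complete callback before returning so the
+     * caller's buffers can be reused safely. */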
+ pri_msg = (struct arpc_msg_data*)msg->handle; + + pthread_mutex_lock(&pri_msg->lock); // to lock + // get msg + req = _arpc_create_xio_msg(msg); + LOG_THEN_GOTO_TAG_IF_VAL_TRUE((req == NULL), error, "_arpc_convert_xio_msg fail."); + + /*session加锁保护*/ + pthread_mutex_lock(&_fd->lock); + if(_fd->active_conn && _fd->status == SESSION_STA_RUN_ACTION) { + ret = xio_send_msg(_fd->active_conn, req); + }else{ + ret = ARPC_ERROR; + ARPC_LOG_ERROR("session invalid, session status:%d.", _fd->status); + } + pthread_mutex_unlock(&_fd->lock); + LOG_THEN_GOTO_TAG_IF_VAL_TRUE(ret, error, "xio_send_msg fail, ret:%d.", ret); + MSG_SET_REQ(pri_msg->flag); + + if(!msg->clean_send_cb){ + SET_FLAG(pri_msg->flag, XIO_SEND_END_TO_NOTIFY); //发送完成,发信号通知 + ret = _arpc_wait_request_rsp(pri_msg, MAX_SEND_ONEWAY_END_TIME); + LOG_ERROR_IF_VAL_TRUE(ret, "receive rsp msg fail for time out or system fail."); + MSG_CLR_REQ(pri_msg->flag); + } + pthread_mutex_unlock(&pri_msg->lock); //un lock + return ret; +error: + pthread_mutex_unlock(&pri_msg->lock); //un lock + return ARPC_ERROR; +} + +// 已加锁 +int _arpc_rev_request_head(struct xio_msg *in_rsp) +{ + struct arpc_msg *msg = NULL; + struct arpc_msg_data *pri_msg = NULL; + int ret = ARPC_ERROR; + struct arpc_msg *rev_msg; + + LOG_THEN_RETURN_VAL_IF_TRUE((!in_rsp), ARPC_ERROR, "in_rsp null."); + msg = (struct arpc_msg *)in_rsp->user_context; + LOG_THEN_RETURN_VAL_IF_TRUE((!msg), ARPC_ERROR, "msg invalid."); + pri_msg = (struct arpc_msg_data *)msg->handle; + pthread_mutex_lock(&pri_msg->lock); // to lock + ret = _alloc_buf_to_rsp_msg(in_rsp); + pthread_mutex_unlock(&pri_msg->lock); //un lock + return ret; +} + +// 已加锁 +int _arpc_rev_request_rsp(struct xio_msg *in_rsp) +{ + struct arpc_msg *msg = NULL; + struct arpc_msg_data *pri_msg = NULL; + int ret = ARPC_ERROR; + struct arpc_msg *rev_msg; + + LOG_THEN_RETURN_VAL_IF_TRUE((!in_rsp), ARPC_ERROR, "in_rsp null."); + msg = (struct arpc_msg *)in_rsp->user_context; + LOG_THEN_GOTO_TAG_IF_VAL_TRUE((!msg), end, "msg invalid."); + pri_msg = (struct arpc_msg_data *)msg->handle; + pthread_mutex_lock(&pri_msg->lock); // to lock + if (msg->proc_rsp_cb){ + rev_msg = _xio_create_arpc_msg(in_rsp); + if (!rev_msg || rev_msg != msg){ + ARPC_LOG_ERROR( "receive msg invalid."); + ret = ARPC_ERROR; + } + ret = msg->proc_rsp_cb(&rev_msg->receive, pri_msg->usr_ctx); + LOG_ERROR_IF_VAL_TRUE(ret, "proc_rsp_cb fail."); + _release_rsp_msg(msg); + if (in_rsp->type == XIO_MSG_TYPE_RSP) { + pthread_cond_signal(&pri_msg->cond);// 通知 + } + }else{ + pthread_cond_signal(&pri_msg->cond);// 通知 + } +end: + pthread_mutex_unlock(&pri_msg->lock); //un lock + return ret; +} + +// 无锁 +int _release_rsp_msg(struct arpc_msg *msg) +{ + struct xio_msg *rsp = NULL; + struct arpc_msg_data *pri_msg = NULL; + LOG_THEN_RETURN_VAL_IF_TRUE((!msg), ARPC_ERROR, "msg null ,fail."); + pri_msg = (struct arpc_msg_data*)msg->handle; + rsp = &(pri_msg->x_msg); + _arpc_destroy_xio_msg(msg); // 释放发送资源 + _free_buf_on_rsp_msg(rsp); // 释放自定义内存 + _xio_destroy_arpc_msg(rsp);// 释放自定义IOV指针 + xio_release_response(rsp); // 释放内部资源 + return 0; +} + +// 发送消息完成后处理方式 +int _process_send_complete(struct arpc_msg *msg) +{ + struct arpc_msg_data *pri_msg =NULL; + struct xio_msg *req = NULL; + LOG_THEN_RETURN_VAL_IF_TRUE((!msg), ARPC_ERROR, "msg null ,fail."); + pri_msg = (struct arpc_msg_data*)msg->handle; + pthread_mutex_lock(&pri_msg->lock); + req = &pri_msg->x_msg; + if (msg->clean_send_cb){ + msg->clean_send_cb(&msg->send, pri_msg->usr_ctx); + } + if (req->type == XIO_MSG_TYPE_ONE_WAY) { + 
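+        /* A one-way message never gets a response, so both the request and
+         * response flags can be dropped as soon as the send completes. */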
CLR_FLAG(pri_msg->flag, XIO_MSG_REQ); + CLR_FLAG(pri_msg->flag, XIO_MSG_RSP); + } + _arpc_destroy_xio_msg(msg); + if (IS_SET(pri_msg->flag, XIO_SEND_END_TO_NOTIFY)){ + pthread_cond_signal(&pri_msg->cond); + CLR_FLAG(pri_msg->flag, XIO_SEND_END_TO_NOTIFY); + } + pthread_mutex_unlock(&pri_msg->lock); + + return ARPC_SUCCESS; +} + +// private funciton +static struct xio_msg *_arpc_create_xio_msg(struct arpc_msg *msg) +{ + struct xio_msg *req = NULL; + uint32_t i; + struct arpc_msg_data *pri_msg = (struct arpc_msg_data*)msg->handle; + + req = &(pri_msg->x_msg); + /* header */ + LOG_THEN_RETURN_VAL_IF_TRUE((!msg->send.head || !msg->send.head_len + || msg->send.head_len > MAX_HEADER_DATA_LEN), + NULL, "msg head is invalid, header:%p, len:%u.", + msg->send.head, + msg->send.head_len); + + (void)memset(req, 0, sizeof(struct xio_msg)); + req->out.header.iov_base = msg->send.head; + req->out.header.iov_len = msg->send.head_len; + + /* data */ + req->out.sgl_type = XIO_SGL_TYPE_IOV_PTR; + LOG_DEBUG_GOTO_TAG_IF_VAL_TRUE(!msg->send.total_data, data_null, "send total_data is 0."); + LOG_THEN_GOTO_TAG_IF_VAL_TRUE((msg->send.total_data > DATA_DEFAULT_MAX_LEN), data_null, + "send total_data[%lu] is over max size[%lu].", + msg->send.total_data, + (uint64_t)DATA_DEFAULT_MAX_LEN); + LOG_THEN_GOTO_TAG_IF_VAL_TRUE(!msg->send.vec_num, data_null, "send vec_num is 0."); + LOG_THEN_GOTO_TAG_IF_VAL_TRUE(!msg->send.vec, data_null, "send vec null."); + + req->out.pdata_iov.max_nents = msg->send.vec_num; + req->out.pdata_iov.nents = msg->send.vec_num; + req->out.pdata_iov.sglist = (struct xio_iovec_ex *)ARPC_MEM_ALLOC( msg->send.vec_num * sizeof(struct xio_iovec_ex), NULL); + LOG_THEN_GOTO_TAG_IF_VAL_TRUE(!req->out.pdata_iov.sglist, data_null, "ARPC_MEM_ALLOC fail."); + + SET_FLAG(pri_msg->flag, XIO_SEND_MSG_ALLOC_BUF);// 标识分配内存 + for (i =0; i < msg->send.vec_num; i++){ + req->out.pdata_iov.sglist[i].iov_base = msg->send.vec[i].data; + req->out.pdata_iov.sglist[i].iov_len = msg->send.vec[i].len; + } + LOG_THEN_RETURN_VAL_IF_TRUE(( msg->send.vec_num && !msg->send.vec), NULL, "send vec null ,fail."); + goto end; + +data_null: + req->out.pdata_iov.max_nents =0; + req->out.pdata_iov.nents = 0; + req->out.pdata_iov.sglist = NULL; + +end: + /* receive 默认方式*/ + req->in.sgl_type = XIO_SGL_TYPE_IOV; + req->in.data_iov.max_nents = XIO_IOVLEN; + + /* 消息上下文保存*/ + req->user_context = msg; + + return req; +} +// 释放发送锁自动分配的资源 +static void _arpc_destroy_xio_msg(struct arpc_msg *msg) +{ + struct xio_msg *req = NULL; + struct arpc_msg_data *pri_msg = (struct arpc_msg_data*)msg->handle; + req = &(pri_msg->x_msg); + if (IS_SET(pri_msg->flag, XIO_SEND_MSG_ALLOC_BUF) && req->out.pdata_iov.sglist){ + ARPC_MEM_FREE(req->out.pdata_iov.sglist, NULL); + req->out.pdata_iov.sglist = NULL; + } + CLR_FLAG(pri_msg->flag, XIO_SEND_MSG_ALLOC_BUF); + memset(&req->out, 0, sizeof(struct xio_vmsg)); + return; +} + +static struct arpc_msg *_xio_create_arpc_msg(struct xio_msg *rsp_msg) +{ + struct arpc_msg *msg = NULL; + struct arpc_msg_data *pri_msg = NULL; + struct xio_iovec_ex *sglist = NULL; + uint32_t nents = 0; + uint32_t i; + int ret = -1; + + LOG_THEN_RETURN_VAL_IF_TRUE((!rsp_msg), NULL, "rsp_msg null, exit."); + LOG_THEN_RETURN_VAL_IF_TRUE((!rsp_msg->in.header.iov_base), NULL, "header null, exit."); + LOG_THEN_RETURN_VAL_IF_TRUE((!rsp_msg->in.header.iov_len), NULL, "header iov_len is 0."); + nents = vmsg_sglist_nents(&rsp_msg->in); + + msg = (struct arpc_msg *)rsp_msg->user_context; + LOG_THEN_RETURN_VAL_IF_TRUE((!msg), NULL, "msg null, exit."); + 
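+    /* Map the xio response onto the caller's arpc_msg: header and data buffers
+     * are referenced in place, no payload copy is made. */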
msg->receive.head = rsp_msg->in.header.iov_base; + msg->receive.head_len = rsp_msg->in.header.iov_len; + msg->receive.total_data = rsp_msg->in.total_data_len; + + pri_msg = (struct arpc_msg_data *)msg->handle; + if (!IS_SET(pri_msg->flag, XIO_MSG_ALLOC_BUF) && nents) { + // 内部buf, 结构体需要转换,复制指针,不做copy + sglist = vmsg_sglist(&rsp_msg->in); + msg->receive.vec = (struct arpc_iov *)ARPC_MEM_ALLOC(nents * sizeof(struct arpc_iov), NULL); + LOG_THEN_GOTO_TAG_IF_VAL_TRUE((!msg->receive.vec), end,"vec alloc is empty."); + msg->receive.vec_num = nents; + msg->receive.total_data = 0; + SET_FLAG(pri_msg->flag, XIO_RSP_IOV_ALLOC_BUF); // 标识分配内存 + for(i = 0; i < nents; i++){ + msg->receive.vec[i].data = sglist[i].iov_base; + msg->receive.vec[i].len = sglist[i].iov_len; + msg->receive.total_data +=msg->receive.vec[i].len; + } + }else{ + // 自定义buf,结构体可以强制转换 + sglist = vmsg_base_sglist(&rsp_msg->in); + msg->receive.vec = (struct arpc_iov *)sglist; + msg->receive.vec_num = nents; + } + +end: + SET_FLAG(pri_msg->flag, XIO_MSG_RSP); + return msg; +} + +// 释放内部拷贝的指针数组 +static void _xio_destroy_arpc_msg(struct xio_msg *rsp_msg) +{ + struct arpc_msg *msg = NULL; + struct arpc_msg_data *pri_msg = NULL; + struct xio_iovec_ex *sglist = NULL; + uint32_t nents = 0; + uint32_t i; + int ret = -1; + + LOG_THEN_RETURN_IF_VAL_TRUE((!rsp_msg), "rsp_msg null, exit."); + msg = (struct arpc_msg *)rsp_msg->user_context; + LOG_THEN_RETURN_IF_VAL_TRUE((!msg), "msg null, exit."); + pri_msg = (struct arpc_msg_data *)msg->handle; + if (IS_SET(pri_msg->flag, XIO_RSP_IOV_ALLOC_BUF)){ + ARPC_MEM_FREE(msg->receive.vec, NULL); + msg->receive.vec =NULL; + msg->receive.vec_num =0; + msg->receive.total_data =0; + CLR_FLAG(pri_msg->flag, XIO_RSP_IOV_ALLOC_BUF); + } + CLR_FLAG(pri_msg->flag, XIO_MSG_RSP); + return; +} + +static int _alloc_buf_to_rsp_msg(struct xio_msg *rsp) +{ + struct arpc_msg *msg = (struct arpc_msg *)rsp->user_context; + struct arpc_msg_data *pri_msg = NULL; + struct xio_iovec *sglist = NULL; + uint32_t nents = 0; + uint32_t i; + int ret = -1; + + LOG_THEN_GOTO_TAG_IF_VAL_TRUE((!msg), error, "msg invalid."); + pri_msg = (struct arpc_msg_data *)msg->handle; + + // 分配内存 + if (!pri_msg->alloc_cb || !pri_msg->free_cb || IS_SET(pri_msg->flag ,XIO_MSG_CANCEL)){ + goto error; + } + nents = (rsp->in.total_data_len / pri_msg->iov_max_len + 1); + sglist = (struct xio_iovec* )ARPC_MEM_ALLOC(nents * sizeof(struct xio_iovec), NULL); + if (!sglist) { + goto error; + } + + // 分配资源 + for (i = 0; i < nents -1; i++) { + sglist[i].iov_len = pri_msg->iov_max_len; + sglist[i].iov_base = pri_msg->alloc_cb(sglist[i].iov_len, pri_msg->usr_ctx); + LOG_THEN_GOTO_TAG_IF_VAL_TRUE((!sglist[i].iov_base), error_1, "calloc fail."); + }; + sglist[i].iov_len = (rsp->in.total_data_len % pri_msg->iov_max_len); + sglist[i].iov_base = pri_msg->alloc_cb(sglist[i].iov_len, pri_msg->usr_ctx); + + rsp->in.sgl_type = XIO_SGL_TYPE_IOV_PTR; + rsp->in.data_tbl.sglist = sglist; + vmsg_sglist_set_nents(&rsp->in, nents); + rsp->in.data_tbl.max_nents = nents; + SET_FLAG(pri_msg->flag, XIO_MSG_ALLOC_BUF); + + return 0; +error_1: + for (i = 0; i < nents; i++) { + if (sglist[i].iov_base) + pri_msg->free_cb(sglist[i].iov_base, pri_msg->usr_ctx); + sglist[i].iov_base =NULL; + } +error: + if (sglist) { + ARPC_MEM_FREE(sglist, NULL); + sglist = NULL; + } + /* receive 默认方式*/ + rsp->in.sgl_type = XIO_SGL_TYPE_IOV; + vmsg_sglist_set_nents(&rsp->in, XIO_IOVLEN); + rsp->in.data_iov.max_nents = XIO_IOVLEN; + CLR_FLAG(pri_msg->flag ,XIO_MSG_ALLOC_BUF); + return -1; +} + +static int 
_free_buf_on_rsp_msg(struct xio_msg *rsp) +{ + struct arpc_msg *msg = (struct arpc_msg *)rsp->user_context; + struct arpc_msg_data *pri_msg = NULL; + struct xio_iovec *sglist = NULL; + uint32_t nents = 0; + uint32_t i; + + LOG_THEN_GOTO_TAG_IF_VAL_TRUE((!rsp), end, "msg null."); + pri_msg = (struct arpc_msg_data *)msg->handle; + if (!IS_SET(pri_msg->flag, XIO_MSG_ALLOC_BUF)) { + goto end; + } + LOG_THEN_GOTO_TAG_IF_VAL_TRUE((!pri_msg->free_cb), end, "alloc free is null."); + + // 释放内存 + nents = vmsg_sglist_nents(&rsp->in); + sglist = vmsg_base_sglist(&rsp->in); + LOG_THEN_GOTO_TAG_IF_VAL_TRUE((!sglist), end, "rsp[%p],sglist is null.", rsp); + for (i = 0; i < nents; i++) { + if (sglist[i].iov_base) + pri_msg->free_cb(sglist[i].iov_base, pri_msg->usr_ctx); + } + if (sglist) + ARPC_MEM_FREE(sglist, NULL); + + // 出参 + rsp->in.sgl_type = XIO_SGL_TYPE_IOV; + vmsg_sglist_set_nents(&rsp->in, XIO_IOVLEN); + rsp->in.data_iov.max_nents = XIO_IOVLEN; + CLR_FLAG(pri_msg->flag ,XIO_MSG_ALLOC_BUF); +end: + return 0; +} + + +#endif diff --git a/src/session/arpc_message.c b/src/session/arpc_message.c new file mode 100644 index 0000000..94ad8a2 --- /dev/null +++ b/src/session/arpc_message.c @@ -0,0 +1,126 @@ +/* + * Copyright(C) 2020 Ruijie Network. All rights reserved. + */ + +/*! +* \file xxx.x +* \brief xxx +* +* 包含.. +* +* \copyright 2020 Ruijie Network. All rights reserved. +* \author hongchunhua@ruijie.com.cn +* \version v1.0.0 +* \date 2020.08.05 +* \note none +*/ + +#include +#include +#include +#include +#include +#include + +#include "arpc_com.h" + + +struct arpc_msg *arpc_new_msg(const struct arpc_msg_param *p) +{ + struct arpc_msg *ret_msg = NULL; + struct arpc_msg_data *pri_msg = NULL; + + ret_msg = (struct arpc_msg*)ARPC_MEM_ALLOC(sizeof(struct arpc_msg) + sizeof(struct arpc_msg_data),NULL); + if (!ret_msg) { + ARPC_LOG_ERROR( "calloc fail."); + return NULL; + } + memset(ret_msg, 0, sizeof(struct arpc_msg) + sizeof(struct arpc_msg_data)); + pri_msg = (struct arpc_msg_data *)ret_msg->handle; + if (!pri_msg) { + ARPC_LOG_ERROR( "calloc handle fail."); + goto error; + } + + /* 变量操作 */ + pthread_mutex_init(&pri_msg->lock, NULL); /* 初始化互斥锁 */ + pthread_cond_init(&pri_msg->cond, NULL); /* 初始化条件变量 */ + + pri_msg->flag = 0; // 暂时不使用用户的flag + pri_msg->x_msg.user_context = (void *)ret_msg; + if (p){ + pri_msg->alloc_cb = p->alloc_cb; + pri_msg->free_cb = p->free_cb; + pri_msg->usr_ctx = p->usr_context; + } + pri_msg->iov_max_len = IOV_DEFAULT_MAX_LEN; + + return ret_msg; +error: + pthread_cond_destroy(&pri_msg->cond); + pthread_mutex_destroy(&pri_msg->lock); + if (ret_msg) + ARPC_MEM_FREE(ret_msg, NULL); + return NULL; +} + + +int arpc_delete_msg(struct arpc_msg **msg) +{ + struct arpc_msg_data *pri_msg = NULL; + if (!msg && !(*msg)) { + ARPC_LOG_ERROR( "msg null, fail."); + return ARPC_ERROR; + } + pri_msg = (struct arpc_msg_data*)(*msg)->handle; + pthread_mutex_lock(&pri_msg->lock); + if(IS_SET(pri_msg->flag, XIO_MSG_REQ) || IS_SET(pri_msg->flag, XIO_MSG_RSP)){ + pthread_mutex_unlock(&pri_msg->lock); + return ARPC_ERROR; + } + SET_FLAG(pri_msg->flag, XIO_MSG_CANCEL); + ARPC_LOG_DEBUG("message do cancel."); + pthread_cond_broadcast(&pri_msg->cond); // 释放信号,让等待回复消息线程退出 + pthread_mutex_unlock(&pri_msg->lock); + + pthread_mutex_lock(&pri_msg->lock); + pthread_cond_destroy(&pri_msg->cond); + pthread_mutex_unlock(&pri_msg->lock); + + pthread_mutex_destroy(&pri_msg->lock); + if(*msg) + ARPC_MEM_FREE(*msg, NULL); + *msg = NULL; + return 0; +} + +int arpc_msg_reset(struct arpc_msg *msg) +{ + struct 
arpc_msg_data *pri_msg = NULL; + if (!msg) { + ARPC_LOG_ERROR( "msg null, fail."); + return ARPC_ERROR; + } + pri_msg = (struct arpc_msg_data*)msg->handle; + pthread_mutex_lock(&pri_msg->lock); + if(IS_SET(pri_msg->flag, XIO_MSG_CANCEL)){ + ARPC_LOG_ERROR("message is canceling."); + goto end; + } + + if (IS_SET(pri_msg->flag, XIO_MSG_REQ)) { + ARPC_LOG_ERROR("message do reuqest."); + goto end; + } + + if (IS_SET(pri_msg->flag, XIO_MSG_RSP)) { + ARPC_LOG_DEBUG("message get rsp, need to release."); + _release_rsp_msg(msg); // 释放回复资源 + } + CLR_FLAG(pri_msg->flag, XIO_MSG_RSP); + pri_msg->flag = 0; + +end: + pthread_mutex_unlock(&pri_msg->lock); + return 0; +} diff --git a/src/session/arpc_process_oneway.c b/src/session/arpc_process_oneway.c new file mode 100644 index 0000000..3ec900c --- /dev/null +++ b/src/session/arpc_process_oneway.c @@ -0,0 +1,86 @@ +/* + * Copyright(C) 2020 Ruijie Network. All rights reserved. + */ + +/*! +* \file xxx.x +* \brief xxx +* +* 包含.. +* +* \copyright 2020 Ruijie Network. All rights reserved. +* \author hongchunhua@ruijie.com.cn +* \version v1.0.0 +* \date 2020.08.05 +* \note none +*/ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "arpc_com.h" +#include "threadpool.h" + + +int _process_oneway_header(struct xio_msg *msg, struct oneway_ops *ops, uint64_t iov_max_len, void *usr_ctx) +{ + struct _proc_header_func head_ops; + head_ops.alloc_cb = ops->alloc_cb; + head_ops.free_cb = ops->free_cb; + head_ops.proc_head_cb = ops->proc_head_cb; + return _create_header_source(msg, &head_ops, iov_max_len, usr_ctx); +} + +int _process_oneway_data(struct xio_msg *req, + struct oneway_ops *ops, + int last_in_rxq, + void *usr_ctx) +{ + struct xio_iovec *sglist = vmsg_base_sglist(&req->in); + uint32_t nents = vmsg_sglist_nents(&req->in); + uint32_t i; + int ret; + struct arpc_vmsg rev_iov; + struct _async_proc_ops async_ops; + + LOG_THEN_RETURN_VAL_IF_TRUE((!req), ARPC_ERROR, "req null."); + if (IS_SET(req->usr_flags, FLAG_MSG_ERROR_DISCARD_DATA)) { + goto free_user_buf; + } + memset(&rev_iov, 0, sizeof(struct arpc_vmsg)); + rev_iov.head = req->in.header.iov_base; + rev_iov.head_len = req->in.header.iov_len; + rev_iov.vec_num = nents; + rev_iov.vec = (struct arpc_iov *)sglist; + rev_iov.total_data = req->in.total_data_len; + + if (ops->proc_data_cb) { + ret = ops->proc_data_cb(&rev_iov, NULL, usr_ctx); + LOG_ERROR_IF_VAL_TRUE((ret != ARPC_SUCCESS), "proc_data_cb fail."); + } + + if(IS_SET(req->usr_flags, METHOD_ALLOC_DATA_BUF) && + IS_SET(req->usr_flags, METHOD_PROCESS_ASYNC) && + ops->free_cb && ops->proc_async_cb){ + async_ops.alloc_cb = ops->alloc_cb; + async_ops.free_cb = ops->free_cb; + async_ops.proc_async_cb = ops->proc_async_cb; + xio_release_msg(req); + ret = _post_iov_to_async_thread(&rev_iov, req, &async_ops, usr_ctx); + if(ret != ARPC_SUCCESS) { + ARPC_LOG_ERROR("_post_iov_to_async_thread fail."); + goto free_user_buf; + } + return 0; + } +free_user_buf: + _clean_header_source(req, ops->free_cb, usr_ctx); + xio_release_msg(req); + return 0; +} diff --git a/src/session/arpc_process_request.c b/src/session/arpc_process_request.c new file mode 100644 index 0000000..13589c7 --- /dev/null +++ b/src/session/arpc_process_request.c @@ -0,0 +1,157 @@ +/* + * Copyright(C) 2020 Ruijie Network. All rights reserved. + */ + +/*! +* \file xxx.x +* \brief xxx +* +* 包含.. +* +* \copyright 2020 Ruijie Network. All rights reserved. 
+* \author hongchunhua@ruijie.com.cn +* \version v1.0.0 +* \date 2020.08.05 +* \note none +*/ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "arpc_com.h" +#include "threadpool.h" + + +static const char SERVER_DEFAULT[] = "rsp-header:undefine"; + +int _process_request_header(struct xio_msg *msg, struct request_ops *ops, uint64_t iov_max_len, void *usr_ctx) +{ + struct _proc_header_func head_ops; + head_ops.alloc_cb = ops->alloc_cb; + head_ops.free_cb = ops->free_cb; + head_ops.proc_head_cb = ops->proc_head_cb; + return _create_header_source(msg, &head_ops, iov_max_len, usr_ctx); +} + +static int _do_respone(struct arpc_vmsg *rsp_iov, struct xio_msg *req) +{ + struct xio_msg *rsp_msg; + uint32_t i; + + rsp_msg = (struct xio_msg*)ARPC_MEM_ALLOC(sizeof(struct xio_msg), NULL); // todo queue + memset(rsp_msg, 0, sizeof(struct xio_msg)); + if(rsp_iov && rsp_iov->head){ + ARPC_LOG_DEBUG("rsp header."); + rsp_msg->out.header.iov_base = rsp_iov->head; + rsp_msg->out.header.iov_len = rsp_iov->head_len; + }else{ + rsp_msg->out.header.iov_base = (char*)SERVER_DEFAULT; + rsp_msg->out.header.iov_len = sizeof(SERVER_DEFAULT); + rsp_msg->out.sgl_type = XIO_SGL_TYPE_IOV_PTR; + vmsg_sglist_set_nents(&rsp_msg->out, 0); + } + + if (rsp_iov && rsp_iov->vec && rsp_iov->total_data && rsp_iov->vec_num) { + ARPC_LOG_DEBUG("rsp data."); + rsp_msg->out.total_data_len = rsp_iov->total_data; + rsp_msg->out.sgl_type = XIO_SGL_TYPE_IOV_PTR; + rsp_msg->out.pdata_iov.sglist = ARPC_MEM_ALLOC(rsp_iov->vec_num * sizeof(struct xio_iovec_ex), NULL); + for (i = 0; i < rsp_iov->vec_num; i++){ + rsp_msg->out.pdata_iov.sglist[i].iov_base = rsp_iov->vec[i].data; + rsp_msg->out.pdata_iov.sglist[i].iov_len = rsp_iov->vec[i].len; + rsp_msg->out.pdata_iov.sglist[i].mr = NULL; + rsp_msg->out.pdata_iov.sglist[i].user_context = NULL; + } + rsp_msg->out.pdata_iov.max_nents = rsp_iov->vec_num; + vmsg_sglist_set_nents(&rsp_msg->out, rsp_iov->vec_num); + rsp_msg->user_context = (void*)rsp_iov->vec; + SET_FLAG(rsp_msg->usr_flags, FLAG_RSP_USER_DATA); + } + + rsp_msg->request = req; + xio_send_response(rsp_msg); + return 0; +} + +int _process_request_data(struct xio_msg *req, + struct request_ops *ops, + int last_in_rxq, + void *usr_ctx) +{ + struct xio_iovec *sglist = vmsg_base_sglist(&req->in); + uint32_t nents = vmsg_sglist_nents(&req->in); + uint32_t i; + struct arpc_vmsg rev_iov; + struct arpc_vmsg rsp_iov; + struct _async_proc_ops async_ops; + int ret; + + LOG_THEN_RETURN_VAL_IF_TRUE((!req), ARPC_ERROR, "req null."); + + memset(&rev_iov, 0, sizeof(struct arpc_vmsg)); + rev_iov.head = req->in.header.iov_base; + rev_iov.head_len = req->in.header.iov_len; + rev_iov.vec_num = nents; + rev_iov.vec = (struct arpc_iov *)sglist; + rev_iov.total_data = req->in.total_data_len; + + // 数据处理,并获得回复消息 + memset(&rsp_iov, 0, sizeof(struct arpc_vmsg)); + if(ops->proc_data_cb){ + ret = ops->proc_data_cb(&rev_iov, &rsp_iov, usr_ctx); + LOG_ERROR_IF_VAL_TRUE(ret, "proc_data_cb that define for user is error."); + } + + // 异步处理 + if(IS_SET(req->usr_flags, METHOD_PROCESS_ASYNC) && + IS_SET(req->usr_flags, METHOD_ALLOC_DATA_BUF) && + ops->free_cb && ops->proc_async_cb){ + async_ops.alloc_cb = ops->alloc_cb; + async_ops.free_cb = ops->free_cb; + async_ops.proc_async_cb = ops->proc_async_cb; + ret = _post_iov_to_async_thread(&rev_iov, NULL, &async_ops, usr_ctx); + if(ret != ARPC_SUCCESS) { + ARPC_LOG_ERROR("_post_iov_to_async_thread fail."); + goto free_user_buf; + }else{ + goto do_respone; + } + } + +free_user_buf: + 
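+    /* Synchronous path, or the async hand-off failed: release any data buffers
+     * obtained through the user's alloc_cb before sending the response. */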
_clean_header_source(req, ops->free_cb, usr_ctx); + +do_respone: + /* attach request to response */ + return _do_respone(&rsp_iov, req); +} + +int _process_send_rsp_complete(struct xio_msg *rsp, struct request_ops *ops, void *usr_ctx) +{ + struct arpc_vmsg rsp_iov; + LOG_THEN_RETURN_VAL_IF_TRUE((!rsp || !ops), ARPC_ERROR, "rsp or ops null."); + ARPC_LOG_DEBUG("rsp_send_complete, rsp:%p.", rsp); + if(IS_SET(rsp->usr_flags, FLAG_RSP_USER_DATA) && ops->release_rsp_cb){ + rsp_iov.head = rsp->out.header.iov_base; + rsp_iov.head_len = rsp->out.header.iov_len; + rsp_iov.total_data = rsp->out.total_data_len; + rsp_iov.vec = (struct arpc_iov*)rsp->user_context; + rsp_iov.vec_num = rsp->out.data_tbl.nents; + ops->release_rsp_cb(&rsp_iov, usr_ctx); //释放用户申请的资源 + } + // 释放内部IOV申请的资源 + if (rsp->out.pdata_iov.sglist) { + ARPC_MEM_FREE(rsp->out.pdata_iov.sglist, NULL); // todo + rsp->out.pdata_iov.sglist = NULL; + } + ARPC_MEM_FREE(rsp, NULL); // todo + + return 0; +} + diff --git a/src/session/arpc_process_rsp.c b/src/session/arpc_process_rsp.c new file mode 100644 index 0000000..c15b973 --- /dev/null +++ b/src/session/arpc_process_rsp.c @@ -0,0 +1,34 @@ +/* + * Copyright(C) 2020 Ruijie Network. All rights reserved. + */ + +/*! +* \file xxx.x +* \brief xxx +* +* 包含.. +* +* \copyright 2020 Ruijie Network. All rights reserved. +* \author hongchunhua@ruijie.com.cn +* \version v1.0.0 +* \date 2020.08.05 +* \note none +*/ + +#include +#include +#include + +#include "arpc_com.h" + + +int _process_rsp_header(struct xio_msg *rsp, void *usr_ctx) +{ + return _arpc_rev_request_head(rsp); +} + +int _process_rsp_data(struct xio_msg *rsp, int last_in_rxq) +{ + return _arpc_rev_request_rsp(rsp); +} + diff --git a/src/session/arpc_server.c b/src/session/arpc_server.c new file mode 100644 index 0000000..adeddec --- /dev/null +++ b/src/session/arpc_server.c @@ -0,0 +1,399 @@ +/* + * Copyright(C) 2020 Ruijie Network. All rights reserved. + */ + +/*! +* \file xxx.x +* \brief xxx +* +* 包含.. +* +* \copyright 2020 Ruijie Network. All rights reserved. 
+* \author hongchunhua@ruijie.com.cn +* \version v1.0.0 +* \date 2020.08.05 +* \note none +*/ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "arpc_com.h" +#include "threadpool.h" + +#ifdef _DEF_SESSION_SERVER + + +#define GET_SERVER_SESSION_DATA(server_data, head, fd) \ +struct arpc_server_session_data *server_data = NULL;\ +struct arpc_handle_ex *head = (struct arpc_handle_ex *)fd;\ +do{\ + if(head && head->handle_ex)\ + server_data = (struct arpc_server_session_data *)head->handle_ex;\ + else\ + return -1;\ +}while(0); + +#define GET_NEW_SESSION_FD(new, head, ctx) \ +struct _arpc_new_client_session_data *new = NULL;\ +struct arpc_handle_ex *head = (struct arpc_handle_ex *)ctx;\ +do{\ + if(head && head->handle_ex)\ + new = (struct _arpc_new_client_session_data *)head->handle_ex;\ + else\ + return -1;\ +}while(0); + +/* ################## enum ###################*/ + +struct arpc_server_session_data { + struct xio_server *server; + int (*new_session_start)(const struct arpc_new_session_req *, struct arpc_new_session_rsp *, void*); + int (*new_session_end)(arpc_session_handle_t, struct arpc_new_session_rsp *, void*); + struct arpc_session_ops default_ops; + uint32_t iov_max_len; +}; + +struct _arpc_new_client_session_data{ + struct arpc_session_ops ops; +}; + +static int _msg_error(struct xio_session *session, + enum xio_status error, + enum xio_msg_direction dir, + struct xio_msg *rsp, + void *cb_user_context) +{ + ARPC_LOG_NOTICE("msg_error message. "); + return 0; +} +static int _server_session_event(struct xio_session *session, + struct xio_session_event_data *event_data, + void *cb_user_context) +{ + struct arpc_handle_ex *head = (struct arpc_handle_ex *)cb_user_context; + struct xio_connection_attr attr; + ARPC_LOG_DEBUG("################### event:%d ,%s. 
reason: %s.",event_data->event, + xio_session_event_str(event_data->event), + xio_strerror(event_data->reason)); + switch (event_data->event) { + case XIO_SESSION_NEW_CONNECTION_EVENT: //新的链路 + if(head){ + head->active_conn = event_data->conn; + attr.user_context = cb_user_context; + xio_modify_connection(head->active_conn, &attr, XIO_CONNECTION_ATTR_USER_CTX); + } + break; + case XIO_SESSION_CONNECTION_TEARDOWN_EVENT: + if (event_data->conn){ + xio_connection_destroy(event_data->conn); + head->active_conn =NULL; + } + break; + case XIO_SESSION_TEARDOWN_EVENT: + if (session) + xio_session_destroy(session); + if (head && head->type == SESSION_SERVER_CHILD) + ARPC_MEM_FREE(head, NULL); + break; + case XIO_SESSION_ERROR_EVENT: + break; + default: + break; + }; + + return 0; +} + +static int _on_new_session(struct xio_session *session, + struct xio_new_session_req *req, + void *cb_user_context) +{ + struct arpc_new_session_req client; + struct arpc_con_info *ipv4; + struct arpc_new_session_rsp param; + struct arpc_handle_ex *client_fd = NULL; + struct _arpc_new_client_session_data *fd_ex = NULL; + struct xio_session_attr attr; + int ret; + GET_SERVER_SESSION_DATA(server, head, cb_user_context); + + LOG_THEN_RETURN_VAL_IF_TRUE((!session || !req || !cb_user_context), -1, "invalid input."); + + memset(&client, 0, sizeof(struct arpc_new_session_req)); + memset(¶m, 0, sizeof(param)); + + param.rsp_data = NULL; + param.rsp_data_len = 0; + param.ret_status = ARPC_E_STATUS_OK; + + LOG_THEN_GOTO_TAG_IF_VAL_TRUE((!(server->new_session_start && server->new_session_end)), reject, "new_session callback null."); + + ipv4 = &client.client_con_info; + LOG_THEN_GOTO_TAG_IF_VAL_TRUE((req->proto != XIO_PROTO_TCP), reject, "no tcp con, fail."); + ipv4->type =ARPC_E_TRANS_TCP; + _arpc_get_ipv4_addr(&req->src_addr, ipv4->ipv4.ip, IPV4_MAX_LEN, &ipv4->ipv4.port); + + client.client_data.data = req->private_data; + client.client_data.len = req->private_data_len; + + ret = server->new_session_start(&client, ¶m, head->usr_context); + LOG_THEN_GOTO_TAG_IF_VAL_TRUE((ret != ARPC_SUCCESS), reject, "new_session return fail."); + + LOG_THEN_GOTO_TAG_IF_VAL_TRUE((param.ret_status != ARPC_E_STATUS_OK), reject, "ops null."); + + client_fd = ARPC_MEM_ALLOC(sizeof(struct arpc_handle_ex) + sizeof(struct _arpc_new_client_session_data), NULL); + memset(client_fd, 0, sizeof(struct _arpc_new_client_session_data) + sizeof(struct _arpc_new_client_session_data)); + fd_ex = (struct _arpc_new_client_session_data *)client_fd->handle_ex; + if (param.ops) { + fd_ex->ops = *(param.ops); + }else{ + fd_ex->ops = server->default_ops; + } + + if (param.ops_new_ctx) + client_fd->usr_context = param.ops_new_ctx; + else + client_fd->usr_context = head->usr_context; + + client_fd->type = SESSION_SERVER_CHILD; + + attr.ses_ops = NULL; + attr.uri = NULL; + attr.user_context = (void*)client_fd; + + ret = xio_modify_session(session, &attr, XIO_SESSION_ATTR_USER_CTX); + LOG_THEN_GOTO_TAG_IF_VAL_TRUE((ret != ARPC_SUCCESS), reject, "xio_modify_session fail."); + + server->new_session_end((arpc_session_handle_t)client_fd, ¶m, head->usr_context); + xio_accept(session, NULL, 0, param.rsp_data, param.rsp_data_len); + return 0; +reject: + if(client_fd) + ARPC_MEM_FREE(client_fd, NULL); + client_fd = NULL; + server->new_session_end((arpc_session_handle_t)client_fd, ¶m, head->usr_context); + xio_reject(session, XIO_E_SESSION_ABORTED, NULL, 0); // 拒绝session请求 + return -1; +} + +static int _rsp_send_complete(struct xio_session *session, + struct xio_msg *rsp, + void 
*conn_user_context) +{ + GET_NEW_SESSION_FD(new_fd, head, conn_user_context); + return _process_send_rsp_complete(rsp, &new_fd->ops.req_ops, head->usr_context); +} + +static int _server_msg_header_dispatch(struct xio_session *session, + struct xio_msg *msg, + void *cb_user_context) +{ + int ret = 0; + GET_NEW_SESSION_FD(fd, head, cb_user_context); + ARPC_LOG_DEBUG("header message type:%d", msg->type); + switch(msg->type) { + case XIO_MSG_TYPE_REQ: + ret = _process_request_header(msg, &fd->ops.req_ops, IOV_DEFAULT_MAX_LEN, head->usr_context); + break; + case XIO_MSG_TYPE_RSP: + ret = _process_rsp_header(msg, head->usr_context); + break; + case XIO_MSG_TYPE_ONE_WAY: + ret = _process_oneway_header(msg, &fd->ops.oneway_ops, IOV_DEFAULT_MAX_LEN, head->usr_context); + break; + default: + break; + } + return ret; +} + +static int _server_msg_data_dispatch(struct xio_session *session, + struct xio_msg *rsp, + int last_in_rxq, + void *cb_user_context) +{ + int ret = 0; + GET_NEW_SESSION_FD(fd, head, cb_user_context); + + ARPC_LOG_DEBUG("message data type:%d", rsp->type); + switch(rsp->type) { + case XIO_MSG_TYPE_REQ: + ret = _process_request_data(rsp, &fd->ops.req_ops, last_in_rxq, head->usr_context); + break; + case XIO_MSG_TYPE_RSP: + ret = _process_rsp_data(rsp, last_in_rxq); + break; + case XIO_MSG_TYPE_ONE_WAY: + ret = _process_oneway_data(rsp, &fd->ops.oneway_ops, last_in_rxq, head->usr_context); + break; + default: + break; + } + + return ret; +} +static struct xio_session_ops x_server_ops = { + .on_session_event = &_server_session_event, + .on_new_session = &_on_new_session, + .rev_msg_data_alloc_buf = &_server_msg_header_dispatch, + .on_msg = &_server_msg_data_dispatch, + .on_msg_send_complete = &_rsp_send_complete, + .on_msg_error = &_msg_error +}; + +arpc_server_t arpc_server_create(const struct arpc_server_param *param) +{ + int ret = 0; + struct arpc_server_session_data *x_data = NULL; + struct arpc_handle_ex *fd = NULL; + struct request_ops *req_ops; + /* handle*/ + fd = (struct arpc_handle_ex *)ARPC_MEM_ALLOC(sizeof(struct arpc_handle_ex) + + sizeof(struct arpc_server_session_data), + NULL); + if (!fd) { + ARPC_LOG_ERROR( "malloc error, exit "); + return NULL; + } + fd->type = SESSION_SERVER; + x_data = (struct arpc_server_session_data *)fd->handle_ex; + + x_data->new_session_start = param->new_session_start; + x_data->new_session_end = param->new_session_end; + LOG_THEN_GOTO_TAG_IF_VAL_TRUE((!x_data->new_session_start || !x_data->new_session_end) ,error_1, "new_session is null."); + + x_data->default_ops = param->default_ops; + req_ops = &x_data->default_ops.req_ops; + LOG_THEN_GOTO_TAG_IF_VAL_TRUE((!req_ops->proc_head_cb || !req_ops->proc_data_cb), error_1, "proc_data_cb is null."); + + x_data->iov_max_len = (param->iov_max_len > 512)?param->iov_max_len:IOV_DEFAULT_MAX_LEN; + + pthread_mutex_init(&fd->lock, NULL); /* 初始化互斥锁 */ + pthread_cond_init(&fd->cond, NULL); /* 初始化条件变量 */ + + ret = get_uri(¶m->con, fd->uri, URI_MAX_LEN); + if ( ret < 0) { + ARPC_LOG_ERROR( "get_uri error, exit "); + goto error_2; + } + + /* context */ + fd->ctx = xio_context_create(NULL, 0, fd->affinity); + if (fd->ctx == NULL){ + ARPC_LOG_ERROR( "xio_context_create error, exit "); + goto error_3; + } + + x_data->server = xio_bind(fd->ctx, &x_server_ops, fd->uri, NULL, 0, fd); + if (!x_data->server){ + ARPC_LOG_ERROR( "xio_bind error, exit "); + goto error_4; + } + + fd->usr_context = param->default_ops_usr_ctx; + fd->active_conn = NULL; + fd->status = SESSION_STA_INIT; + + ARPC_LOG_NOTICE("Create server 
success."); + + return (arpc_server_t)fd; + +error_4: + if (x_data->server) + xio_unbind(x_data->server); + x_data->server = NULL; +error_3: + if (fd->ctx) + xio_context_destroy(fd->ctx); + fd->ctx = NULL; + +error_2: + pthread_cond_destroy(&fd->cond); + pthread_mutex_destroy(&fd->lock); +error_1: + if (fd) + free(fd); + ARPC_LOG_NOTICE( "destroy session success, exit."); + return NULL; +} + +int arpc_server_destroy(arpc_server_t *fd) +{ + struct arpc_server_session_data *x_data = NULL; + struct arpc_handle_ex *_fd = NULL; + if(!fd){ + return ARPC_ERROR; + } + _fd = (struct arpc_handle_ex *)*fd; + if(!_fd){ + return ARPC_ERROR; + } + x_data = (struct arpc_server_session_data *)_fd->handle_ex; + pthread_mutex_lock(&_fd->lock); + if (x_data && x_data->server) + xio_unbind(x_data->server); + x_data->server = NULL; + + if (_fd->ctx) + xio_context_destroy(_fd->ctx); + _fd->ctx = NULL; + + pthread_cond_destroy(&_fd->cond); + + pthread_mutex_unlock(&_fd->lock); + pthread_mutex_destroy(&_fd->lock); + + if (_fd) + ARPC_MEM_FREE(_fd, NULL); + + ARPC_LOG_NOTICE( "destroy session success, exit."); + *fd = NULL; + return ARPC_SUCCESS; +} + +int arpc_server_loop(arpc_server_t fd, int32_t timeout_ms) +{ + struct arpc_handle_ex *_fd = (struct arpc_handle_ex *)fd; + int32_t sleep_time = 0; + if (!_fd){ + ARPC_LOG_ERROR( "fd null, exit."); + return ARPC_ERROR; + } + + ARPC_LOG_NOTICE("session run on the thread[%lu].", pthread_self()); + pthread_mutex_lock(&_fd->lock); + _fd->status = SESSION_STA_RUN; + pthread_mutex_unlock(&_fd->lock); + + for(;;){ + if (xio_context_run_loop(_fd->ctx, timeout_ms) < 0) + ARPC_LOG_NOTICE("xio error msg: %s.", xio_strerror(xio_errno())); + + ARPC_LOG_NOTICE("xio context run loop pause..."); + pthread_mutex_lock(&_fd->lock); + sleep_time = _fd->reconn_interval_s; + if(_fd->status == SESSION_STA_CLEANUP || timeout_ms > 0){ + ARPC_LOG_NOTICE("session ctx[%p] thread is stop loop, exit.", _fd->ctx); + _fd->status = SESSION_STA_INIT; // 状态恢复 + pthread_cond_broadcast(&_fd->cond); // 释放信号,让等待信号的线程退出 + pthread_mutex_unlock(&_fd->lock); + break; + } + pthread_mutex_unlock(&_fd->lock); + + if (sleep_time > 0) + sleep(sleep_time); // 恢复周期 + } + + ARPC_LOG_NOTICE("exit signaled."); + return ARPC_SUCCESS; +} + +#endif