diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 00000000000..e762f67dff6 --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,15 @@ +# To get started with Dependabot version updates, you'll need to specify which +# package ecosystems to update and where the package manifests are located. +# Please see the documentation for all configuration options: +# https://docs.github.com/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file + +version: 2 +updates: + - package-ecosystem: "github-actions" + directory: "/" # Location of package manifests + schedule: + # Check for updates to GitHub Actions weekly on Monday + interval: "weekly" + time: "09:00" + timezone: "America/Los_Angeles" + diff --git a/.github/workflows/clang-format-check.yml b/.github/workflows/clang-format-check.yml index aa91ec9dd7f..6e5916c4737 100644 --- a/.github/workflows/clang-format-check.yml +++ b/.github/workflows/clang-format-check.yml @@ -1,5 +1,7 @@ name: clang-format Check on: [push, pull_request] +permissions: + contents: read jobs: formatting-check: name: Formatting Check @@ -11,7 +13,7 @@ jobs: steps: - uses: actions/checkout@v4 - name: Run clang-format style check for C/C++/Protobuf programs. - uses: jidicula/clang-format-action@v4.10.2 + uses: jidicula/clang-format-action@v4.12.0 with: clang-format-version: '15' check-path: ${{ matrix.path }} diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml new file mode 100644 index 00000000000..29504b7eaff --- /dev/null +++ b/.github/workflows/codeql.yml @@ -0,0 +1,84 @@ +# For most projects, this workflow file will not need changing; you simply need +# to commit it to your repository. +# +# You may wish to alter this file to override the set of languages analyzed, +# or to provide custom queries or build logic. +# +# ******** NOTE ******** +# We have attempted to detect the languages in your repository. Please check +# the `language` matrix defined below to confirm you have the correct set of +# supported CodeQL languages. +# +name: "CodeQL" + +on: + push: + branches: [ "main" ] + pull_request: + branches: [ "main" ] + schedule: + - cron: '0 23 * * *' + +jobs: + analyze: + name: Analyze + # Runner size impacts CodeQL analysis time. To learn more, please see: + # - https://gh.io/recommended-hardware-resources-for-running-codeql + # - https://gh.io/supported-runners-and-hardware-resources + # - https://gh.io/using-larger-runners + # Consider using larger runners for possible analysis time improvements. + runs-on: ${{ (matrix.language == 'swift' && 'macos-latest') || 'ubuntu-latest' }} + timeout-minutes: ${{ (matrix.language == 'swift' && 120) || 360 }} + permissions: + # required for all workflows + security-events: write + + # only required for workflows in private repositories + actions: read + contents: read + + strategy: + fail-fast: false + matrix: + language: [ 'c-cpp', 'python', 'ruby' ] + # CodeQL supports [ 'c-cpp', 'csharp', 'go', 'java-kotlin', 'javascript-typescript', 'python', 'ruby', 'swift' ] + # Use only 'java-kotlin' to analyze code written in Java, Kotlin or both + # Use only 'javascript-typescript' to analyze code written in JavaScript, TypeScript or both + # Learn more about CodeQL language support at https://aka.ms/codeql-docs/language-support + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + # Initializes the CodeQL tools for scanning. 
+ - name: Initialize CodeQL + uses: github/codeql-action/init@v3 + with: + languages: ${{ matrix.language }} + # If you wish to specify custom queries, you can do so here or in a config file. + # By default, queries listed here will override any specified in a config file. + # Prefix the list here with "+" to use these queries and those in the config file. + + # For more details on CodeQL's query packs, refer to: https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs + # queries: security-extended,security-and-quality + + + # Autobuild attempts to build any compiled languages (C/C++, C#, Go, Java, or Swift). + # If this step fails, then you should remove it and run the build manually (see below) + - name: Autobuild + uses: github/codeql-action/autobuild@v3 + + # ℹ️ Command-line programs to run using the OS shell. + # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun + + # If the Autobuild fails above, remove it and uncomment the following three lines. + # modify them (or add more) to build your code if your project, please refer to the EXAMPLE below for guidance. + + # - run: | + # echo "Run, Build Application using script" + # ./location_of_script_within_repo/buildscript.sh + + - name: Perform CodeQL Analysis + uses: github/codeql-action/analyze@v3 + with: + category: "/language:${{matrix.language}}" diff --git a/.github/workflows/coverity.yml b/.github/workflows/coverity.yml index fac6e075cfd..204af0aa07f 100644 --- a/.github/workflows/coverity.yml +++ b/.github/workflows/coverity.yml @@ -1,8 +1,12 @@ name: Nightly Coverity Scan on: schedule: - # Every night, at midnight - - cron: '0 0 * * *' + # UTC time, 'minute hour day-of-month month day-of-week' + - cron: '0 5 * * *' + +permissions: + contents: read + env: APT_PACKAGES: >- abi-compliance-checker @@ -34,6 +38,7 @@ env: --enable-shm --enable-tcp --enable-udp + --enable-usnic --enable-verbs=rdma-core/build --enable-sm2 RDMA_CORE_PATH: 'rdma-core/build' @@ -57,7 +62,7 @@ jobs: set -x git clone --depth 1 -b ${{ env.RDMA_CORE_VERSION }} https://github.com/linux-rdma/rdma-core.git pushd rdma-core; bash build.sh; popd - export LD_LIBRARY_PATH="${{ env.RDMA_CORE_PATH }}/lib:$LD_LIBRARY_PATH" + export LD_LIBRARY_PATH="$PWD/${{ env.RDMA_CORE_PATH }}/lib:$LD_LIBRARY_PATH" # We use a compiler extension that supports broader complex type # definitions than what's defined in C99 standard (which only defines @@ -70,8 +75,13 @@ jobs: ./autogen.sh ./configure --prefix=$PWD/install ${{ env.OFI_PROVIDER_FLAGS }} CC=clang - cov-build --dir cov-int make make install + make clean + pushd fabtests + ./autogen.sh + ./configure --with-libfabric=$PWD/../install CC=clang + popd + cov-build --dir cov-int bash -c "make && make -C fabtests LDFLAGS=\"-L$PWD/${{ env.RDMA_CORE_PATH }}/lib -L$PWD/install/lib\"" - name: Submit results run: | tar czvf libfabric.tgz cov-int diff --git a/.github/workflows/gh-man.yaml b/.github/workflows/gh-man.yaml index b84a2bee9d6..f548a2627e7 100644 --- a/.github/workflows/gh-man.yaml +++ b/.github/workflows/gh-man.yaml @@ -13,6 +13,9 @@ jobs: gh-man-update: name: GH Man Page Updater runs-on: ubuntu-latest + permissions: + contents: write + pull-requests: write steps: - name: Debug information env: diff --git a/.github/workflows/nroff-elves.yaml b/.github/workflows/nroff-elves.yaml index 344150b2661..4803d6f8054 100644 --- a/.github/workflows/nroff-elves.yaml 
+++ b/.github/workflows/nroff-elves.yaml @@ -11,6 +11,9 @@ jobs: nroff-elves-scheduled: name: The Nroff Elves runs-on: ubuntu-latest + permissions: + contents: write + pull-requests: write steps: - name: Debug information env: diff --git a/.github/workflows/pr-ci.yml b/.github/workflows/pr-ci.yml index 7de5545549d..fe044b7b07f 100644 --- a/.github/workflows/pr-ci.yml +++ b/.github/workflows/pr-ci.yml @@ -1,5 +1,8 @@ name: Build Checks on: [push, pull_request] +permissions: + contents: read + pull-requests: read env: APT_PACKAGES: >- abi-compliance-checker @@ -23,8 +26,6 @@ env: sparse valgrind wget - APT_REPOS: >- - ppa:lttng/ppa OFI_PROVIDER_FLAGS: >- --enable-efa=$PWD/rdma-core/build --enable-mrail @@ -34,6 +35,7 @@ env: --enable-shm --enable-tcp --enable-udp + --enable-usnic --enable-verbs=$PWD/rdma-core/build RDMA_CORE_PATH: '$PWD/rdma-core/build' RDMA_CORE_VERSION: v34.1 @@ -43,7 +45,6 @@ jobs: strategy: matrix: os: - - ubuntu-20.04 - ubuntu-22.04 cc: - gcc @@ -52,7 +53,6 @@ jobs: steps: - name: Install dependencies (Linux) run: | - sudo apt-add-repository ${{ env.APT_REPOS }} sudo apt-get update sudo apt-get install -y ${{ env.APT_PACKAGES }} - uses: actions/checkout@v4 @@ -74,11 +74,10 @@ jobs: name: ${{ matrix.os }}-${{ matrix.cc }}-config.log path: config.log hmem: - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 steps: - name: Install dependencies (Linux) run: | - sudo apt-add-repository ${{ env.APT_REPOS }} sudo apt-get update sudo apt-get install -y ${{ env.APT_PACKAGES }} - name: Install CUDA diff --git a/.github/workflows/scorecard.yml b/.github/workflows/scorecard.yml new file mode 100644 index 00000000000..828f9681613 --- /dev/null +++ b/.github/workflows/scorecard.yml @@ -0,0 +1,73 @@ +# This workflow uses actions that are not certified by GitHub. They are provided +# by a third-party and are governed by separate terms of service, privacy +# policy, and support documentation. + +name: Scorecard supply-chain security +on: + # For Branch-Protection check. Only the default branch is supported. See + # https://github.com/ossf/scorecard/blob/main/docs/checks.md#branch-protection + branch_protection_rule: + # To guarantee Maintained check is occasionally updated. See + # https://github.com/ossf/scorecard/blob/main/docs/checks.md#maintained + schedule: + # Runs everyday at 23:00 UTC/16:00 PST. + - cron: '0 23 * * *' + push: + branches: [ "main" ] + +# Declare default permissions as read only. +permissions: read-all + +jobs: + analysis: + name: Scorecard analysis + runs-on: ubuntu-latest + permissions: + # Needed to upload the results to code-scanning dashboard. + security-events: write + # Needed to publish results and get a badge (see publish_results below). + id-token: write + # Uncomment the permissions below if installing in a private repository. + # contents: read + # actions: read + + steps: + - name: "Checkout code" + uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 + with: + persist-credentials: false + + - name: "Run analysis" + uses: ossf/scorecard-action@dc50aa9510b46c811795eb24b2f1ba02a914e534 # v2.3.3 + with: + results_file: results.sarif + results_format: sarif + # (Optional) "write" PAT token. Uncomment the `repo_token` line below if: + # - you want to enable the Branch-Protection check on a *public* repository, or + # - you are installing Scorecard on a *private* repository + # To create the PAT, follow the steps in https://github.com/ossf/scorecard-action#authentication-with-pat. 
+ # repo_token: ${{ secrets.SCORECARD_TOKEN }} + + # Public repositories: + # - Publish results to OpenSSF REST API for easy access by consumers + # - Allows the repository to include the Scorecard badge. + # - See https://github.com/ossf/scorecard-action#publishing-results. + # For private repositories: + # - `publish_results` will always be set to `false`, regardless + # of the value entered here. + publish_results: true + + # Upload the results as artifacts (optional). Commenting out will disable uploads of run results in SARIF + # format to the repository Actions tab. + - name: "Upload artifact" + uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3 # v4.3.1 + with: + name: SARIF file + path: results.sarif + retention-days: 5 + + # Upload the results to GitHub's code scanning dashboard. + - name: "Upload to code-scanning" + uses: github/codeql-action/upload-sarif@8a470fddafa5cbb6266ee11b37ef4d8aae19c571 # v3.24.6 + with: + sarif_file: results.sarif diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml index e3a631e87ec..e74e7cf30af 100644 --- a/.github/workflows/stale.yml +++ b/.github/workflows/stale.yml @@ -18,7 +18,7 @@ jobs: pull-requests: write steps: - - uses: actions/stale@v8 + - uses: actions/stale@v9 with: repo-token: ${{ secrets.GITHUB_TOKEN }} days-before-stale: 360 diff --git a/.travis.yml b/.travis.yml index 5a4f669a5be..dfed1bf1486 100644 --- a/.travis.yml +++ b/.travis.yml @@ -39,7 +39,7 @@ addons: name: "ofiwg/libfabric" description: "Libfabric project coverity scans" notification_email: sean.hefty@intel.com - build_command_prepend: "./autogen.sh; ./configure --enable-efa=$RDMA_CORE_PATH --enable-psm2 --enable-psm3=$RDMA_CORE_PATH --enable-verbs=$RDMA_CORE_PATH" + build_command_prepend: "./autogen.sh; ./configure --enable-efa=$RDMA_CORE_PATH --enable-psm2 --enable-psm3=$RDMA_CORE_PATH --enable-usnic --enable-verbs=$RDMA_CORE_PATH" build_command: "make -j2" branch_pattern: main @@ -75,7 +75,7 @@ install: git clone --depth 1 -b $RDMA_CORE_BRANCH https://github.com/linux-rdma/rdma-core.git && cd rdma-core && bash build.sh && cd -; RDMA_CORE_PATH=$PWD/rdma-core/build ; export LD_LIBRARY_PATH="$RDMA_CORE_PATH/lib:$LD_LIBRARY_PATH" ; - LIBFABRIC_CONFIGURE_ARGS="$LIBFABRIC_CONFIGURE_ARGS + LIBFABRIC_CONFIGURE_ARGS="$LIBFABRIC_CONFIGURE_ARGS --enable-usnic --enable-psm3=$RDMA_CORE_PATH --enable-verbs=$RDMA_CORE_PATH --enable-efa=$RDMA_CORE_PATH"; @@ -99,6 +99,7 @@ install: --disable-shm --disable-tcp --disable-udp + --disable-usnic --disable-verbs - make -j2 $MAKE_FLAGS - make install @@ -120,7 +121,7 @@ install: make dist; config_options="--enable-efa=$RDMA_CORE_PATH --enable-psm3=$RDMA_CORE_PATH - --enable-verbs=$RDMA_CORE_PATH; + --enable-verbs=$RDMA_CORE_PATH --enable-usnic"; LDFLAGS=-Wl,--build-id rpmbuild -ta --define "configopts $config_options" libfabric-*.tar.bz2; fi diff --git a/AUTHORS b/AUTHORS index 0928a5e4d49..e5ef4f08911 100644 --- a/AUTHORS +++ b/AUTHORS @@ -8,6 +8,7 @@ Alex McKinley Alex McKinley Alexia Ingerson alexia.ingerson +Amir Shehata Amir Shehata Amith Abraham Ana Guerrero López @@ -17,6 +18,7 @@ Andrew Friedley Andrey Lobanov Anthony Zinger Ao Li +Archana Venkatesha Arun C Ilango arun ilango Arun Ilango @@ -28,6 +30,7 @@ AWS ParallelCluster user AWS ParallelCluster user aws-ceenugal <123417666+aws-ceenugal@users.noreply.github.com> Ben Lynam +Ben Lynam Ben Lynam Ben Menadue Ben Turrubiates @@ -43,6 +46,7 @@ Brian J. 
Murrell Brian Li bwilsoncn Casey Carter +chadkoster-hpe Chang Hyun Park Charles J Archer Charles King @@ -52,10 +56,13 @@ Chen Zhao Chenwei Zhang Chien Tin Tung Chris Dolan +Chris Taylor Chuck Fossen +Cody Mann Coni Gehler ct-clmsn Dardo D Kleiner +dariuszsciebura <93722774+dariuszsciebura@users.noreply.github.com> Darryl Abbate Dave Goodell David Noel @@ -66,6 +73,8 @@ Dmitry Durnov Dmitry Gladkov Doug Oucharek Edgar Gabriel +Elias Kozah +Elias Kozah Eric Raut Erik Paulson Erik Paulson @@ -76,6 +85,7 @@ Evgeny Leksikov Ezra Kissel Firas Jahjah Frank Zago +Franz Pöschel fullerdj Gal Pressman Gengbin Zheng @@ -110,7 +120,9 @@ Jeff Squyres Jerome Berryhill Jerome Boyd Berryhill Jerome Soumagne +Jessie Yang Jiakun Yan +Jianshui Yu Jianxin Xiong jianxin.xiong Jie Zhang @@ -118,6 +130,7 @@ Jim Snow Jingyin Tang Jithin Jose Joe Doyle +Joe Nemeth Johannes Ziegenbalg John Biddiscombe John Byrne @@ -139,6 +152,7 @@ kseager Kyle Gerheiser Latchesar Ionkov Leena Radeke +Lindsay Reiser Lisanna Dettwyler Lisanna Dettwyler Lukasz Dorau @@ -163,6 +177,7 @@ mmubarak Mohan Gandhi muttormark Neil Spruit +Nicholas Sielicki Nicolas Morey-Chaisemartin Nikhil Nanal nikhilnanal @@ -204,6 +219,7 @@ Robert Wespetal Rohit Zambre Ryan Hankins Ryan Hankins +Rémi Dehenne Sai Sunku Sannikov, Alexander Sayantan Sur @@ -237,7 +253,9 @@ Thomas Gillis Thomas Huber Thomas Huber Thomas Smith +thomasgillis Thorsten Schütt +Tim Hu Tim Thompson <80290075+timothom64@users.noreply.github.com> Tim Thompson Todd Rimmer diff --git a/Makefile.am b/Makefile.am index e38497b3811..8571a583c18 100644 --- a/Makefile.am +++ b/Makefile.am @@ -220,7 +220,7 @@ src_libfabric_la_LIBADD = src_libfabric_la_DEPENDENCIES = libfabric.map if !EMBEDDED -src_libfabric_la_LDFLAGS += -version-info 24:0:23 +src_libfabric_la_LDFLAGS += -version-info 25:0:24 endif src_libfabric_la_LDFLAGS += -export-dynamic \ $(libfabric_version_script) @@ -456,6 +456,7 @@ include prov/sockets/Makefile.include include prov/udp/Makefile.include include prov/verbs/Makefile.include include prov/efa/Makefile.include +include prov/usnic/Makefile.include include prov/psm2/Makefile.include include prov/psm3/Makefile.include include prov/cxi/Makefile.include diff --git a/NEWS.md b/NEWS.md index 152c685de3e..7848b17a410 100644 --- a/NEWS.md +++ b/NEWS.md @@ -6,35 +6,579 @@ bug fixes (and other actions) for each version of Libfabric since version 1.0. New major releases include all fixes from minor releases with earlier release dates. -v1.20.0, Fri Nov 17, 2023 +v1.21.0, Fri Mar 22, 2024 ======================== ## Core +## BGQ + +Removed. + +## CXI + +New provider supporting Cray's Slingshot network. + ## EFA +## GNI + +Removed. + ## Hooks +# NETDIR + +Removed. The functionality is intergrated into the verbs provider. + ## OPX ## Peer ## PSM3 +## RSTREAM + +Removed. 
+ +## RXM + +## SHM + +## TCP + +## UCX + +## Util + +## Verbs + + +## Fabtests + + +v1.20.1, Mon Jan 22, 2024 +========================= + +## Core + +- hmem/ze: Change the library name passed to dlopen +- hmem/ze: map device id to physical device +- hmem/ze: skip duplicate initialization +- hmem/ze: dynamically allocate device resources based on number of devices +- hmem/ze: fix hmem_ze_copy_engine variable look up +- hmem/ze: Increase ZE_MAX_DEVICES to 32 +- man: Fix typo in fi_getinfo man page +- Fix compiler warning when compiling with ICX +- man: Fix fi_rxm.7 and fi_collective.3 man pages +- man: Update EFA docs for FI_EFA_INTER_MIN_READ_WRITE_SIZE + +## EFA + +- efa_rdm_ep_record_tx_op_submitted() rm peer lookup +- Remove peer lookup from efa_rdm_pke_sendv() +- Make handshake response use txe +- test: Only close SHM if SHM peer is Created +- Handshake code allocs txe via efa util +- Initialize txe.rma_iov_count to 0 +- Switch fi_addr to efa_rdm_peer in trigger_handshake +- Downgrade EFA Endpoint Creation WARN to INFO +- Init srx_ctx before use +- Clean up generic_send path +- Pass in efa_rdm_ep to efa_rdm_msg_generic_recv() +- Make recv path slightly more efficient +- re-org rma write to avoid duplicate checks +- Add missing sync_memops call to writedata +- use peer pointer from txe in read, write and send +- Pass in peer pointer to txe +- Get rid of noop instruction from empty #define +- Remove noop memset +- Fix the ibv cq error handling. +- Don't do handshake for local read +- Fix a typo in configure.m4 +- Make runt_size aligned + +## NetDir + +- Add missing unlock in error path of nd_send_ack() + +## OPX + +- Initialize cq error data size + +## RXM + +- Fix data error with FI_OFI_RXM_USE_RNDV_WRITE=1 + +## SHM + +- Fix coverity issue about resource leak +- Adjust the order of smr_region fields. +- Allocate peer device fds dynamically + +## Util + +- Fix coverity issue about missing lock +- Implement timeout in util_wait_yield_run() +- Fix bug in util_cq startup error case +- util_mem_hooks: add missing parantheses + +## Verbs + +- Windows: Resolve regression in user data retrieval + +## Fabtests + +- efa: Close ibv device after use +- efa: Get device MR limit from ibv_query_device +- efa: Add simple unexpected test to MR exhaustion test +- pytest: add a new ssh connection error pattern + + +v1.19.1, Mon Jan 22, 2024 +========================= + +## Core + +- hmem/ze: Change the library name passed to dlopen +- hmem/ze: map device id to physical device +- hmem/ze: skip duplicate initialization +- hmem/ze: dynamically allocate device resources based on number of devices +- hmem/ze: fix hmem_ze_copy_engine variable look up +- hmem/ze: Increase ZE_MAX_DEVICES to 32 +- man: Fix typo in fi_getinfo man page +- Fix compiler warning when compiling with ICX +- man: Fix fi_rxm.7 and fi_collective.3 man pages +- man: Fix the fi_provider.7 man page for the man page converter +- hmem/synapseai: Refine the error handling and warning +- configure.ac Fix `--with-lttng` causing `yes/` to populate {CPP,LD}FLAGS +- hmem: Only initalize synapseai if device exists +- hmem/ze: fix incorrect device id in copy function +- configure.ac: Fix `with_synaposeai` typo + +## EFA + +- Fix the ibv cq error handling. 
+- Don't do handshake for local read +- Don't do handshake for local fi_write +- Make runt_size aligned +- Add pingpong test after exhausting MRs +- Introduce utilities to exhaust MRs on EFA device +- Add read nack protocol docs +- Receiver send NACK if runt read fails with ENOMR +- Sender switch to long CTS protocol if runt read fails with ENOMR +- Receiver send NACK if long read fails with ENOMR +- Update efa_rdm_rxe_map_remove to accept msg_id and addr +- Sender switch to long CTS protocol if long read fails with ENOMR +- Introduce new READ_NACK feature +- Do not abort on all deprecated env vars +- Allocate pke_vec, recv_wr_vec, sge_vec from heap +- Close shm resource when it is disabled in ep +- Disable RUNTING for Neuron +- Move cuda-sync-memops from MR to EP +- Do not insert shm av inside efa progress engine +- Fix coverity warning in efa_mr_reg_impl +- Fix typos in packet macros +- Adjust posted receive size to pkt_size +- RDMA write with immediate data completion bugfix +- Do not create SHM peer when SHM is disabled +- Use correct threading model for shm +- Restrict RDMA read to compatible EFA devices +- Add EFA device version to handshake +- Cleanup/fix some unit test code +- Touch up RDM protocol header, doc +- Fix efa device name matching +- Add missing locks in efa_cntr_wait. +- Fix the efa_env_initialize() call sequence. +- Fix a compilation warning +- Handle RNRs from RDMA writedata +- Add writedata RNR fabtest +- Correct typo in RMA context type + +## NetDir + +- Add missing unlock in error path of nd_send_ack() + +## RXM + +- Fix data error with FI_OFI_RXM_USE_RNDV_WRITE=1 + +## SHM + +- Fix coverity issue about resource leak +- Allocate peer device fds dynamically +- Add memory barrier before updating resp for atomic +- Use peer cntr inc ops in smr_progress_cmd +- Only increment tx cntr when inject rma succeeded. + +## TCP + +- Pass through rdm_ep flags to msg eps. 
+- Derive cq flags from op and msg flags +- Set FI_MULTI_RECV for last completed RX slice + +## UCX + +- Initialize ep_flush to 1 + +## Util + +- Fix coverity issue about missing lock +- Implement timeout in util_wait_yield_run() +- memhooks: Fix a bug when calculating mprotect region + +## Verbs + +- Windows: Resolve regression in user data retrieval +- Windows: Check error code from GetPrivateData +- Bug fix for matching domain name with device name + +## Fabtests + +- efa: Close ibv device after use +- efa: Get device MR limit from ibv_query_device +- efa: Add simple unexpected test to MR exhaustion test +- pytest: Add a new ssh connection error pattern +- Make ft_force_progress non-static +- memcopy-xe: Fix data verification error for device buffer +- dmabuf: Increase the number of NICs that can be tested +- cq_data: Relax CQ data validation to cq_data_size +- dmabuf: Handle partial read scenario for fi_xe_rdmabw test +- pytest/efa: Add cuda memory marker + + +v1.18.3, Mon Jan 22, 2024 +========================= + +## Core + +- hmem/ze: Change the library name passed to dlopen +- hmem/ze: map device id to physical device +- hmem/ze: skip duplicate initialization +- hmem/ze: dynamically allocate device resources based on number of devices +- hmem/ze: fix hmem_ze_copy_engine variable look up +- hmem/ze: Increase ZE_MAX_DEVICES to 32 +- man: Fix typo in fi_getinfo man page +- man: Fix fi_rxm.7 and fi_collective.3 man pages +- man: Fix the fi_provider.7 man page for the man page converter +- configure.ac Fix `--with-lttng` causing `yes/` to populate {CPP,LD}FLAGS +- hmem/ze: fix incorrect device id in copy function +- configure.ac: Fix `with_synaposeai` typo + +## EFA + +- Fix efa device name matching +- Add writedata RNR fabtest +- Handle RNRs from RDMA writedata + +## NetDir + +- Add missing unlock in error path of nd_send_ack() +- Release lock prior to returning from nd_send_ack + +## RXM + +- Fix data error with FI_OFI_RXM_USE_RNDV_WRITE=1 + +## SHM + +- Fix coverity issue about resource leak +- Allocate peer device fds dynamically + +## TCP + +- Pass through rdm_ep flags to msg eps. +- Derive cq flags from op and msg flags +- Set FI_MULTI_RECV for last completed RX slice + +## UCX + +- Initialize ep_flush to 1 + +## Util + +- Fix coverity issue about missing lock +- Implement timeout in util_wait_yield_run() +- memhooks: Fix a bug when calculating mprotect region + +## Verbs + +- Windows: Resolve regression in user data retrieval +- Windows: Check error code from GetPrivateData +- Bug fix for matching domain name with device name + +## Fabtests + +- rdm_tagged_peek: Fix race condition synchronization +- Make rdm_tagged_peek test more general +- Split cq_read and cq_readerr in ft_spin_for_comp +- sock_test: Do not use epoll if not available +- Use dummy ft_pin_core on macOS +- Avoid using memset function name +- Fix some header includes +- memcopy-xe: Fix data verification error for device buffer +- dmabuf: Increase the number of NICs that can be tested +- dmabuf: Handle partial read scenario for fi_xe_rdmabw test +- pytest/efa: add cuda memory marker + + +v1.20.0, Fri Nov 17, 2023 +========================= + +## Core + +- General bug fixes and code clean-up +- configure.ac: add extra check for 128 bit atomic support +- hmem/synapseai: Refine the error handling and warning +- Introduce FI_ENOMR +- hmem/cuda: fix a bug when calculating aligned size. +- Handle dmabuf for ofi_mr_cache* functions. +- Handle dmabuf flag in ofi_mr_attr_update +- Handle dmabuf for mr_map insert. 
+- man: Fix the description of virtual address when FI_MR_DMABUF is set +- man: Clarify the defition of FI_OPT_MIN_MULTI_RECV +- hmem/cuda: Add dmabuf fd ops functions +- include/ofi_atomic_queue: Properly align atomic values +- Define fi_av_set_user_id +- Support multiple auth keys per EP +- Simplify restricted-dl feature +- hmem: Only initalize synapseai if device exists +- Add "--enable-profile" option +- windows: Updated config.h +- Add environment variable for selective HMEM initialization +- Add restricted dlopen flag to configure options +- hmem: generalize the use of OFI_HMEM_DATA to non-cuda iface +- hmem: fail cuda_dev_register if gdrcopy is not enabled +- Add 1.7 ABI compat +- Define fi_domain_attr::max_ep_auth_key +- hmem: Add new op to hmem_ops for getting dmabuf fd +- hmem/cuda: Update cuda_gdrcopy_dev_register's signature +- mr_cache: Define ofi_mr_info::flags +- Add ABI compat for fi_cq_err_entry::src_addr +- Define fi_cq_err_entry::src_addr +- Add base_addr to fi_mr_dmabuf +- hmem: Set FI_HMEM_HOST_ALLOC for ze addr valid +- hmem: Support dev reg with FI_HMEM_ZE +- tostr: Added fi_tostr() for data type struct fi_cq_err_entry. +- hmem_ze: fix incorrect device id in copy function +- Introduce new profiling interface for low-level statistics +- hmem: Support dev reg with FI_HMEM_CUDA +- hmem: Support dev reg with FI_HMEM_ROCR +- hmem: Support dev reg with FI_HMEM_SYSTEM +- hmem: Define optimized HMEM memcpy APIs +- Implement memhooks atfork child handler +- hmem: Support ofi_hmem_get_base_addr with sys mem +- hmem: Add length field to ofi_hmem_get_base_addr +- mr_cache: Improve cache hit rate +- mr_cache: Purge dead regions in find +- mr_cache: Update find to remove invalid MR entries +- mr_cache: Update find with MM valid check +- Add direct support for dma-buf memory registration +- man/fi_tagged: Remove the peek for data ability +- indexer: Add byte idx abstraction +- Add missing FI_REMOTE_CQ_DATA for fi_inject_writedata +- Add configure flags for more sanitizers +- Fix fi_peer man page inconsistency +- include/fi_peer: Add cq_data to rx_entry, allow peer to modify on unexp +- Add XPMEM support + +## EFA + +- General bug fix and code clean-up +- Do not abort on all deprecated env vars +- Onboard fi_mr_dmabuf API in mem reg ops. +- Try registering cuda memory via dmabuf when checking p2p +- Introduce HAVE_EFA_DMABUF_MR macro in configure +- Add read nack protocol docs +- Receiver send NACK if runt read fails with ENOMR +- Sender switch to long CTS protocol if runt read fails with ENOMR +- Receiver send NACK if long read fails with ENOMR +- Update efa_rdm_rxe_map_remove to accept msg_id and addr +- Sender switch to long CTS protocol if long read fails with ENOMR +- Introduce new READ_NACK feature +- Use SHM's full inject size +- Add testing for small messages without inject +- Enable inject rdma write +- Use bounce buffer for 0 byte writes +- Onboard ofi_hmem_dev_register API +- Update cuda_gdrcopy_dev_register's signature +- Allocate pke_vec, recv_wr_vec, sge_vec from heap +- Close shm resource when it is disabled in ep +- Disable RUNTING for Neuron +- Move cuda-sync-memops from MR to EP +- Do not insert shm av inside efa progress engine +- Enable shm when FI_HMEM and FI_ATOMIC are requested +- Adjust posted receive size to pkt_size +- Do not create SHM peer when SHM is disabled +- Use correct threading model for shm +- Restrict RDMA read to compatible EFA devices +- Add EFA device version to handshake +- Add missing locks in efa_cntr_wait. 
+- Add writedata RNR fabtest +- Handle RNRs from RDMA writedata +- Check opt_len in efa_rdm_ep_getopt +- Use correct tx/rx op_flags for shm + +## Hooks + +- dmabuf: Initialize fd to supress compiler warning +- trace: Add log on FI_VAR_UNEXP_MSG_CNT when enabled. +- trace: Fixed trace log format on some attributes. + +## OPX + +- Fix compiler warnings + +## PSM3 + +- Fix compiler warnings +- Update provider to sync with IEFS 11.5.1.1.1 + ## RXM +- Remove unused function +- Use gdrcopy in rma when emulating injection +- Use gdrcopy in eager send/recv +- Add hmem gdrcopy functions +- Remove unused dynamic rbuf support + ## SHM +- General bug fixes and cleanup +- Add ofi_buf_alloc error handling +- Only copy header + msg on unexpected path +- Add FI_HMEM atomic support +- Add memory barrier before updating resp for atomic +- Add more error output +- Reduce atomic locking with ofi_mr_map_verify +- Only increment tx cntr when inject rma succeeded. +- Use peer cntr inc ops in smr_progress_cmd +- Allow for inject protocol to buffer more unexpected messages +- Change pending fs to bufpool to allow it to grow +- Add unexpected SAR buffering +- Use generic acronym for shm cap +- Move CMA to use the p2p infrastructure +- Add p2p abstraction +- Load DSA dependency dynamically +- Replace tx_lock with ep_lock +- Calculate comp vars when writing completion +- Move progress_sar above progress_cmd +- Rename SAR status enum to be more clear +- Make SAR protocol handle 0 byte transfer. +- Move selection logic to smr_select_proto() + +## Sockets + +- Fix compiler warnings +- Fix provider name and api version in returned fi_info struct + ## TCP +- Add profiling interface support +- Pass through rdm_ep flags to msg eps +- Derive cq flags from op and msg flags +- Do not progress ep that is disconnected +- Set FI_MULTI_RECV for last completed RX slice +- Return an error if invalid sequence number received +- xnet_progress_rx() must only be called when connected +- Reset ep->rx_avail to 0 after RX queue is flushed +- Disable the EP if an error is detected for zero-copy +- Add debug tracking of transfer entries +- Negotiate support for rendezvous +- Add rendezvous protocol option +- Generalize xnet_send_ack +- Flatten protocol header definitions +- Remove unused dynamic rbuf support +- Define tcp specific protocol ops +- Remove unneeded and incorrect rx_entry init code + ## UCX +- Add FI_HMEM support +- Initialize ep_flush to 1 + ## Util +- General bug fixes +- memhooks: Fix a bug when calculating mprotect region +- Check the return value of ofi_genlock_init() +- Update checks for FI_AV_AUTH_KEY +- Define domain primary and secondary caps +- Add profiling util functions +- Update util_cq to support err_data +- Update ofi_cq_readerr to use new memcpy +- Update ofi_cq_err_memcpy to handle err_data +- Zero util cancel err entry +- Move FI_REMOTE/LOCAL_COMM to secondary caps +- Alter domain max_ep_auth_key +- Add domain checks for max_ep_auth_key +- Revert util_cntr->ep_list_lock to ofi_mutex +- Add NIC FID functions to ofi.h +- Add EP and domain auth key checking +- Add bounds checks to ibuf get +- Define dlist_first_entry_or_null +- Update util_getinfo to dup auth_key +- Revert util_av, util_cq and util_cntr to mutex +- Add missing calls to (de)initialize monitor's mutexes +- Avoid attempting to cleanup an uninitialized MR cache +- Rename ofi_mr_info fields +- Add rv64g support to memory hooks + ## Verbs +- Windows: Check error code from GetPrivateData +- Add missing lock to protect SRX +- Add synapseai dmabuf mr 
support +- Bug fix for matching domain name with device name +- Windows: Fetch rejected connection data +- Add support for DMA-buf memory registration +- Windows: Fix use-after-free in case of failure in fi_listen +- Windows: Map ND request type to ibverbs opcode +- Fix memory leak when creating EQ with unsupported wait object +- Track ep state to prevent duplicate shutdown events + ## Fabtests +- Update man page +- pytests/efa: onboard dmabuf argument for test_mr +- pytest: make do_dmabuf_reg_for_hmem an cmdline argument +- Bump Libfabric API version. +- mr_test: Add dmabuf support +- Introduce ft_get_dmabuf_from_iov +- unexpected_msg: Use ft_reg_mr to register memory +- pytest: Allow registering mr with dmabuf +- Add dmabuf support to ft_reg_mr +- Add dmabuf ops for cuda. +- Test max inject size +- Add FI_HMEM support to fi_rdm_rma_event and fi_rdm tests +- memcopy-xe: Fix data verification error for device buffer +- dmabuf-rdma: Increase the number of NICs that can be tested +- dmabuf-rdma: Remove redundant libze_ops definition +- fi-mr-reg-xe: Skip native dmabuf reg test for system memory +- Check if fi_info is returned correctly in case of FI_CONNREQ +- cq_data: relax CQ data validation to cq_data_size +- Add ZE host alloc function +- Use common device host buffer for check_buf +- hmem_ze: allocate one cq and cl on init +- fi-mr-reg-xe: Add testing for dmabuf registration +- scripts: use yaml safe_load +- macos: Fix build error with clang +- multinode: Use FI_DELIVERY_COMPLETE for 'barrier' +- Handle partial read scenario for fi_xe_rdmabw test For cross node tests +- pytest/efa: add cuda memory marker +- pytest/efa: Skip some configuration for unexp msg test on neuron. +- runfabtests.py: ignore error due to no tests are collected. +- pytest/efa: extend unexpected msg test range +- pytest/shm: extend unexpected msg test range +- pytest: Allow running shm fabtests in parallel +- unexpected_msg.c: Allow running the test with FI_DELIVERY_COMPLETE +- runfabtests.sh: run fi_unexpected_msg with data validation +- pytest/shm: Extend test_unexpected_message +- unexpected_msg: Make tx/rx_size large enough +- pytest/shm: Extend shm's rma bw test +- Update shm.exclude + + v1.19.0, Fri Sep 1, 2023 ======================== diff --git a/README.md b/README.md index 0f39b5bfc3a..b8b9d3f8286 100644 --- a/README.md +++ b/README.md @@ -131,22 +131,6 @@ A more comprehensive test package is available via the fabtests package. ## Providers -### gni - -*** - -The `gni` provider runs on Cray XC (TM) systems utilizing the user-space -Generic Network Interface (`uGNI`), which provides low-level access to -the Aries interconnect. The Aries interconnect is designed for -low-latency one-sided messaging and also includes direct hardware -support for common atomic operations and optimized collectives. - -See the `fi_gni(7)` man page for more details. - -#### Dependencies - -- The `gni` provider requires `gcc` version 4.9 or higher. - ### opx *** @@ -241,6 +225,38 @@ libfabric features over any hardware. See the `fi_udp(7)` man page for more details. +### usnic + +*** + +The `usnic` provider is designed to run over the Cisco VIC (virtualized NIC) +hardware on Cisco UCS servers. It utilizes the Cisco usnic (userspace NIC) +capabilities of the VIC to enable ultra low latency and other offload +capabilities on Ethernet networks. + +See the `fi_usnic(7)` man page for more details. 
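As a rough illustration of how the usnic-related configure flags introduced in this patch fit together, the sketch below shows a from-source build that enables the provider. The `/usr` libnl prefix and the install prefix are assumptions for the example only, not values taken from this change; the `--with-libnl` option itself is described under the configure options that follow.

```
# Illustrative only: enable the usnic provider and point configure at an
# assumed libnl 3 installation under /usr; adjust both paths for your system.
./autogen.sh
./configure --prefix=$PWD/install --enable-usnic --with-libnl=/usr
make -j
make install
```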
+ +#### Dependencies + +- The `usnic` provider depends on library files from either `libnl` version 1 + (sometimes known as `libnl` or `libnl1`) or version 3 (sometimes known as + `libnl3`). If you are compiling libfabric from source and want to enable + usNIC support, you will also need the matching `libnl` header files (e.g., + if you are building with `libnl` version 3, you need both the header and + library files from version 3). + +#### Configure options + +``` +--with-libnl=<directory> +``` + +If specified, look for libnl support. If it is not found, the `usnic` +provider will not be built. If `<directory>` is specified, then check in the +directory and check for `libnl` version 3. If version 3 is not found, then +check for version 1. If no `<directory>` argument is specified, then this +option is redundant with `--with-usnic`. + ### verbs *** @@ -251,6 +267,9 @@ transport and translates OFI calls to appropriate verbs API calls. It uses librdmacm for communication management and libibverbs for other control and data transfer operations. +The verbs provider can also be built on Windows using the Microsoft Network +Direct SPI for network transport. + See the `fi_verbs(7)` man page for more details. #### Dependencies @@ -261,29 +280,8 @@ See the `fi_verbs(7)` man page for more details. If the libraries and header files are not in default paths, specify them in CFLAGS, LDFLAGS and LD_LIBRARY_PATH environment variables. -### Network Direct - -*** - -The Network Direct provider enables applications using OFI to be run over -any verbs hardware (Infiniband, iWarp, and RoCE). It uses the Microsoft Network -Direct SPI for network transport and provides a translation of OFI calls to -appropriate Network Direct API calls. -The Network Direct providers enables OFI-based applications to utilize -zero-copy data transfers between applications, kernel-bypass I/O generation and -one-sided data transfer operations on Microsoft Windows OS. -An application can use OFI with the Network Direct provider enabled on -Windows OS to expose the capabilities of the networking devices if the hardware -vendors of the devices implemented the Network Direct service provider interface -(SPI) for their hardware. - -See the `fi_netdir(7)` man page for more details. - -#### Dependencies - -- The Network Direct provider requires Network Direct SPI. If you are compiling - libfabric from source and want to enable Network Direct support, you will also - need the matching header files for the Network Direct SPI. +- The Windows build requires the Network Direct SPI. If you are compiling libfabric from + source, you will also need the matching header files for the Network Direct SPI. If the libraries and header files are not in default paths, specify them in the configuration properties of the VS project. @@ -346,3 +344,26 @@ It is possible to compile and link libfabric with windows applications. - choose C/C++ > General and add `\include` to "Additional include Directories" - choose Linker > Input and add `\x64\\libfabric.lib` to "Additional Dependencies" - depending on what you are building you may also need to copy `libfabric.dll` into the target folder of your own project. + +### cxi + +The CXI provider enables libfabric on Cray's Slingshot network. Slingshot is +comprised of the Rosetta switch and Cassini NIC. Slingshot is an +Ethernet-compliant network. However, the provider takes advantage of proprietary +extensions to support HPC applications. + +The CXI provider supports reliable, connection-less endpoint semantics. 
It +supports two-sided messaging interfaces with message matching offloaded by the +Cassini NIC. It also supports one-sided RMA and AMO interfaces, light-weight +counting events, triggered operations (via the deferred work API), and +fabric-accelerated small reductions. + +See the `fi_cxi(7)` man page for more details. + +#### Dependencies + +- The CXI Provider requires Cassini's optimized HPC protocol which is only + supported in combination with the Rosetta switch. + +- The provider uses the libCXI library for control operations and a set of + Cassini-specific header files to enable direct hardware access in the data path. diff --git a/SECURITY.md b/SECURITY.md new file mode 100644 index 00000000000..4a50965d370 --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,10 @@ +# Reporting Security Issues + +The libfabric team and community take security bugs in libfabric seriously. We appreciate your efforts to responsibly disclose your findings, and will make every effort to acknowledge your contributions. + +To report a security issue, please use the GitHub Security Advisory ["Report a Vulnerability"](https://github.com/ofiwg/libfabric/security) tab. + +The libfabric team will send a response indicating the next steps in handling your report. After the initial reply to your report, the security team will keep you informed of the progress towards a fix, and may ask for additional information or guidance. + +Report security bugs in third-party modules to the team maintaining the module. + diff --git a/config/cron-make-nightly-tarball.pl b/config/cron-make-nightly-tarball.pl index c8c22ecea2a..2db6d241480 100755 --- a/config/cron-make-nightly-tarball.pl +++ b/config/cron-make-nightly-tarball.pl @@ -279,7 +279,7 @@ sub submit_to_coverity { # Run the coverity script if requested if (defined($libfabric_coverity_token_arg) && $rebuilt_libfabric) { submit_to_coverity("ofiwg%2Flibfabric", $libfabric_version, - "--enable-sockets --enable-udp --enable-verbs", + "--enable-sockets --enable-udp --enable-verbs --enable-usnic", $libfabric_coverity_token_arg); } if (defined($fabtests_coverity_token_arg) && $rebuilt_fabtests) { diff --git a/configure.ac b/configure.ac index db61225cced..a55891b483b 100644 --- a/configure.ac +++ b/configure.ac @@ -9,7 +9,7 @@ dnl dnl Process this file with autoconf to produce a configure script. AC_PREREQ([2.60]) -AC_INIT([libfabric], [1.21.0a1], [ofiwg@lists.openfabrics.org]) +AC_INIT([libfabric], [1.22.0a1], [ofiwg@lists.openfabrics.org]) AC_CONFIG_SRCDIR([src/fabric.c]) AC_CONFIG_AUX_DIR(config) AC_CONFIG_MACRO_DIR(config) @@ -59,7 +59,7 @@ AC_DEFINE_UNQUOTED([BUILD_ID],["$with_build_id"], # Override autoconf default CFLAG settings (e.g. 
"-g -O2") while still # allowing the user to explicitly set CFLAGS="" -: ${CFLAGS="-fvisibility=hidden ${base_c_warn_flags}"} +: ${CFLAGS="-pipe -fvisibility=hidden ${base_c_warn_flags}"} # AM_PROG_AS would set CFLAGS="-g -O2" by default if not set already so it # should not be called earlier @@ -426,14 +426,16 @@ AC_ARG_ENABLE([xpmem], PATH: enable xpmem and use xpmem installed under PATH)])], ) -FI_CHECK_PACKAGE([xpmem], - [xpmem.h], - [xpmem], - [xpmem_make], - [], - [$enable_xpmem], - [$enable_xpmem/lib64], - [xpmem_happy=1]) +AS_IF([test x"$enable_xpmem" != x"no"], + [FI_CHECK_PACKAGE([xpmem], + [xpmem.h], + [xpmem], + [xpmem_make], + [], + [$enable_xpmem], + [], + [xpmem_happy=1]) + ]) AS_IF([test x"$enable_xpmem" != x"no" && test -n "$enable_xpmem" && test "$xpmem_happy" = "0" ], [AC_MSG_ERROR([XPMEM support requested but XPMEM runtime not available.])]) @@ -1005,6 +1007,9 @@ FI_PROVIDER_SETUP([sockets]) FI_PROVIDER_SETUP([verbs]) FI_PROVIDER_SETUP([efa]) FI_PROVIDER_SETUP([cxi]) +dnl The usnic provider must be setup after the verbs provider. See +dnl prov/usnic/configure.m4 for details. +FI_PROVIDER_SETUP([usnic]) FI_PROVIDER_SETUP([udp]) FI_PROVIDER_SETUP([tcp]) FI_PROVIDER_SETUP([rxm]) diff --git a/contrib/aws/Jenkinsfile b/contrib/aws/Jenkinsfile new file mode 100644 index 00000000000..cf7e5429339 --- /dev/null +++ b/contrib/aws/Jenkinsfile @@ -0,0 +1,129 @@ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */ +// Use milestones to abort old builds when the user force pushes +def buildNumber = env.BUILD_NUMBER as int +if (buildNumber > 1) milestone(buildNumber - 1) +milestone(buildNumber) + +pipeline { + agent { + ecs { + inheritFrom 'fargate-large' + } + } + options { + buildDiscarder(logRotator(daysToKeepStr: "90")) + timeout(time: 8, unit: 'HOURS') + } + environment { + // AWS region where the cluster is created + REGION="us-west-2" + } + stages { + // Cleanup workspace before job start. 
+ stage("Clean up workspace") { + steps{ + deleteDir() + } + } + stage("Checkout SCM repo") { + steps { + checkout scm + } + } + stage("Download and extract PortaFiducia") { + steps { + script { + sh 'printenv' + def common = load "contrib/aws/common.groovy" + common.download_and_extract_portafiducia('PortaFiducia') + } + } + } + stage("Install PortaFiducia") { + steps { + script { + def common = load "contrib/aws/common.groovy" + common.install_porta_fiducia() + } + + } + } + stage("Test EFA provider") { + steps { + script { + def common = load "contrib/aws/common.groovy" + def stages = [:] + // This needs the extra space at the end + def addl_args_pr = "--test-libfabric-pr $env.CHANGE_ID " + + // Single Node Tests - EFA + stages["1_g4dn_alinux2-efa"] = common.get_test_stage("1_g4dn_alinux2_efa", env.BUILD_TAG, "alinux2", "g4dn.8xlarge", 1, "us-east-1", "libfabric_pr_test.yaml", addl_args_pr) + stages["1_g4dn_ubuntu2004-efa"] = common.get_test_stage("1_g4dn_ubuntu2004_efa", env.BUILD_TAG, "ubuntu2004", "g4dn.8xlarge", 1, "us-east-1", "libfabric_pr_test.yaml", addl_args_pr) + stages["1_g4dn_rhel8-efa"] = common.get_test_stage("1_g4dn_rhel8_efa", env.BUILD_TAG, "rhel8", "g4dn.8xlarge", 1, "us-east-1", "libfabric_pr_test.yaml", addl_args_pr) + stages["1_g4dn_centos7-efa"] = common.get_test_stage("1_g4dn_centos7_efa", env.BUILD_TAG, "centos7", "g4dn.8xlarge", 1, "us-east-1", "libfabric_pr_test.yaml", addl_args_pr) + + // Single Node Tests - SHM + stages["1_g4dn_alinux2_shm"] = common.get_test_stage("1_g4dn_alinux2_shm", env.BUILD_TAG, "alinux2", "g4dn.8xlarge", 1, "us-east-1", "libfabric_pr_test.yaml", addl_args_pr + "--test-libfabric-provider shm") + stages["1_g4dn_ubuntu2004_shm"] = common.get_test_stage("1_g4dn_ubuntu2004_shm", env.BUILD_TAG, "ubuntu2004", "g4dn.8xlarge", 1, "us-east-1", "libfabric_pr_test.yaml", addl_args_pr + "--test-libfabric-provider shm") + stages["1_g4dn_rhel8_shm"] = common.get_test_stage("1_g4dn_rhel8_shm", env.BUILD_TAG, "rhel8", "g4dn.8xlarge", 1, "us-east-1", "libfabric_pr_test.yaml", addl_args_pr + "--test-libfabric-provider shm") + stages["1_g4dn_centos7_shm"] = common.get_test_stage("1_g4dn_centos7_shm", env.BUILD_TAG, "centos7", "g4dn.8xlarge", 1, "us-east-1", "libfabric_pr_test.yaml", addl_args_pr + "--test-libfabric-provider shm") + stages["1_g4dn_ubuntu2004_shm_disable-cma"] = common.get_test_stage("1_g4dn_ubuntu2004_shm_disable-cma", env.BUILD_TAG, "ubuntu2004", "g4dn.8xlarge", 1, "us-east-1", "libfabric_pr_test.yaml", addl_args_pr + "--test-libfabric-provider shm --enable-cma false") + + // Single Node Windows Test + stages["EFA_Windows_Test"] = common.get_single_node_windows_test_stage("EFA_Windows_Test") + + // Multi Node Tests - EFA + stages["2_hpc6a_alinux2_efa"] = common.get_test_stage("2_hpc6a_alinux2_efa", env.BUILD_TAG, "alinux2", "hpc6a.48xlarge", 2, "eu-north-1", "libfabric_pr_test.yaml", addl_args_pr) + stages["2_hpc6a_ubuntu2004_efa"] = common.get_test_stage("2_hpc6a_ubuntu2004_efa", env.BUILD_TAG, "ubuntu2004", "hpc6a.48xlarge", 2, "eu-north-1", "libfabric_pr_test.yaml", addl_args_pr) + stages["2_hpc6a_rhel8_efa"] = common.get_test_stage("2_hpc6a_rhel8_efa", env.BUILD_TAG, "rhel8", "hpc6a.48xlarge", 2, "eu-north-1", "libfabric_pr_test.yaml", addl_args_pr) + + // Multi Node Tests - TCP + stages["2_hpc6a_alinux2_tcp"] = common.get_test_stage("2_hpc6a_alinux2_tcp", env.BUILD_TAG, "alinux2", "hpc6a.48xlarge", 2, "eu-north-1", "libfabric_pr_test.yaml", addl_args_pr + "--test-libfabric-provider tcp") + stages["2_hpc6a_ubuntu2004_tcp"] = 
common.get_test_stage("2_hpc6a_ubuntu2004_tcp", env.BUILD_TAG, "ubuntu2004", "hpc6a.48xlarge", 2, "eu-north-1", "libfabric_pr_test.yaml", addl_args_pr + "--test-libfabric-provider tcp") + stages["2_hpc6a_rhel8_tcp"] = common.get_test_stage("2_hpc6a_rhel8_tcp", env.BUILD_TAG, "rhel8", "hpc6a.48xlarge", 2, "eu-north-1", "libfabric_pr_test.yaml", addl_args_pr + "--test-libfabric-provider tcp") + + // Multi Node Tests - SOCKETS + stages["2_hpc6a_alinux2_sockets"] = common.get_test_stage("2_hpc6a_alinux2_sockets", env.BUILD_TAG, "alinux2", "hpc6a.48xlarge", 2, "eu-north-1", "libfabric_pr_test.yaml", addl_args_pr + "--test-libfabric-provider sockets") + stages["2_hpc6a_ubuntu2004_sockets"] = common.get_test_stage("2_hpc6a_ubuntu2004_sockets", env.BUILD_TAG, "ubuntu2004", "hpc6a.48xlarge", 2, "eu-north-1", "libfabric_pr_test.yaml", addl_args_pr + "--test-libfabric-provider sockets") + stages["2_hpc6a_rhel8_sockets"] = common.get_test_stage("2_hpc6a_rhel8_sockets", env.BUILD_TAG, "rhel8", "hpc6a.48xlarge", 2, "eu-north-1", "libfabric_pr_test.yaml", addl_args_pr + "--test-libfabric-provider sockets") + + parallel stages + } + } + } + stage('check build_ok') { + steps { + script { + def common = load "contrib/aws/common.groovy" + if (common.build_ok) { + currentBuild.result = "SUCCESS" + } + else { + currentBuild.result = "FAILURE" + } + } + } + } + } + post { + always { + sh 'find PortaFiducia/tests/outputs -name "*.xml" | xargs du -shc' + junit testResults: 'PortaFiducia/tests/outputs/**/*.xml', keepLongStdio: false + archiveArtifacts artifacts: 'PortaFiducia/tests/outputs/**/*.*' + } + failure { + sh ''' + . venv/bin/activate + ./PortaFiducia/scripts/delete_manual_cluster.py --cluster-name WindowsLibfabricCi_${env.CHANGE_ID}_* + ''' + } + aborted { + sh '. venv/bin/activate; ./PortaFiducia/scripts/delete_manual_cluster.py --cluster-name "$BUILD_TAG"\'*\' --region $REGION' + } + // Cleanup workspace after job completes. + cleanup { + deleteDir() + } + } +} diff --git a/contrib/aws/common.groovy b/contrib/aws/common.groovy new file mode 100644 index 00000000000..d9c6db36e4a --- /dev/null +++ b/contrib/aws/common.groovy @@ -0,0 +1,142 @@ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */ +/* This file contains variables and functions that can be shared across different jobs */ +import groovy.transform.Field +@Field boolean build_ok = true + +def get_portafiducia_download_path() { + /* Stable Portafiducia tarball */ + def AWS_ACCOUNT_ID = sh ( + script: "aws sts get-caller-identity --query Account --output text | tr -dc 0-9", + returnStdout: true + ) + return "s3://libfabric-ci-$AWS_ACCOUNT_ID-us-west-2/portafiducia/portafiducia.tar.gz" +} + +def download_and_extract_portafiducia(outputDir) { + /* Download PortaFiducia tarball from S3 and extract to outputDir */ + def tempPath = "/tmp/portafiducia.tar.gz" + def downloadPath = this.get_portafiducia_download_path() + + def ret = sh ( + script: "mkdir -p ${outputDir} && aws s3 cp ${downloadPath} ${tempPath} && " + + "tar xf ${tempPath} -C ${outputDir}", + returnStatus: true, + ) + + if (ret != 0) { + unstable('Failed to download and extract PortaFiducia') + } +} + +def install_porta_fiducia() { + /* + * Install PortaFiducia in a (new) virtual environment. + */ + sh ''' + python3 -m venv venv + . 
venv/bin/activate + pip install --upgrade pip + pip install --upgrade awscli + pip install -e PortaFiducia + ''' +} + +def run_test_orchestrator_once(run_name, build_tag, os, instance_type, instance_count, region, test_config_file, addl_args) { + /* + * Run PortaFiducia/tests/test_orchestrator.py with given command line arguments + * param@ args: str, the command line arguments + */ + def cluster_name = get_cluster_name(build_tag, os, instance_type) + def args = "--config configs/${test_config_file} --os ${os} --instance-type ${instance_type} --instance-count ${instance_count} --region ${region} --cluster-name ${cluster_name} ${addl_args} --junit-xml outputs/${cluster_name}.xml" + def ret = sh ( + script: ". venv/bin/activate; cd PortaFiducia/tests && ./test_orchestrator.py ${args}", + returnStatus: true + ) + if (ret == 65) + unstable('Scripts exited with status 65') + else if (ret != 0) + build_ok = false + catchError(buildResult: 'SUCCESS', stageResult: 'FAILURE') { + sh "exit ${ret}" + } +} + +def get_random_string(len) { + def s = sh ( + script: "cat /dev/urandom | LC_ALL=C tr -dc A-Za-z0-9 | head -c ${len}", + returnStdout: true + ) + return s +} + +def get_cluster_name(build_tag, os, instance_type) { + /* + * Compose the cluster name. Pcluster requires a cluster name under 60 characters. + * cluster name cannot have ".". + * Jenkins does not allow groovy to use the replace() method + * of string. Therefore we used shell command sed to replace "." with "" + */ + build_tag = sh( + script: "echo ${build_tag} | sed \"s/^jenkins-//g\" | sed \"s/ //g\"", + returnStdout: true + ) + + def cluster_name = sh( + script: "echo '${build_tag.take(28)}-${os.take(10)}-${instance_type.take(10)}-'${get_random_string(8)} | tr -d '.\\n'", + returnStdout: true + ) + + return cluster_name +} + +def get_single_node_windows_test_stage(stage_name) { + /* + * Get Windows Stage + */ + return { + stage("${stage_name}") { + def ret = sh ( + script: """ + . venv/bin/activate; + cd PortaFiducia/scripts; + export PULL_REQUEST_ID=${env.CHANGE_ID}; + env AWS_DEFAULT_REGION=us-west-2 ./test_orchestrator_windows.py --ci public --s3-bucket-name libfabric-ci-windows-prod-test-output --pull-request-id ${env.CHANGE_ID}; + """, + returnStatus: true + ) + if (ret == 65) + unstable('Scripts exited with status 65') + else if (ret != 0) + build_ok = false + catchError(buildResult: 'SUCCESS', stageResult: 'FAILURE') { + sh "exit ${ret}" + } + } + } + +} + +def get_test_stage(stage_name, build_tag, os, instance_type, instance_count, region, test_config, addl_args) { + /* + * Generate a single test stage that run test_orchestrator.py with the given parameters. + * param@ stage_name: the name of the stage + * param@ build_tag: the BUILD_TAG env generated by Jenkins + * param@ os: the operating system for the test stage. + * param@ instance_type: the instance type for the test stage. + * param@ instance_count: number of intances to use + * param@ region: the (default) aws region where the tests are run. + * param@ test_config: the name of test config file in PortaFiducia/tests/configs/ + * param@ addl_args: additional arguments passed to test_orchestrator.py + * return@: the test stage. 
+ */ + return { + stage("${stage_name}") { + this.run_test_orchestrator_once(stage_name, build_tag, os, instance_type, instance_count, region, test_config, addl_args) + } + } +} + + + +return this diff --git a/contrib/buildrpm/README b/contrib/buildrpm/README index 01fb28d35a2..40db242278e 100644 --- a/contrib/buildrpm/README +++ b/contrib/buildrpm/README @@ -87,5 +87,5 @@ General parameters: Print usage message and exit. Example usages of the script: - buildrpmLibfabric.sh -omsv -e sockets -e verbs -e psm3 libfabric-1.4.1.tar.bz2 + buildrpmLibfabric.sh -omsv -i usnic -e sockets -e verbs -e psm3 libfabric-1.4.1.tar.bz2 buildrpmLibfabric.sh -omsv -c "--disable-silent-rules" libfabric-1.4.1.tar.bz2 diff --git a/contrib/intel/jenkins/Jenkinsfile b/contrib/intel/jenkins/Jenkinsfile index d8364d01c3d..4a37008ebb4 100644 --- a/contrib/intel/jenkins/Jenkinsfile +++ b/contrib/intel/jenkins/Jenkinsfile @@ -3,7 +3,6 @@ import groovy.transform.Field properties([disableConcurrentBuilds(abortPrevious: true)]) @Field def DO_RUN=true @Field def TARGET="main" -@Field def SCRIPT_LOCATION="upstream/libfabric/contrib/intel/jenkins" @Field def RELEASE=false @Field def BUILD_MODES=["reg", "dbg", "dl"] @Field def PYTHON_VERSION="3.9" @@ -59,7 +58,7 @@ def run_fabtests(stage_name, hw, partition, node_num, prov, util=null, } def run_middleware(providers, stage_name, test, hw, partition, node_num, - mpi=null, imb_grp=null) { + mpi=null, imb_grp=null, user_env=null) { def base_cmd = "python3.9 ${RUN_LOCATION}/runtests.py --test=${test} --build_hw=${hw}" def opts = "" def prefix = "${env.LOG_DIR}/${stage_name}_" @@ -74,7 +73,10 @@ def run_middleware(providers, stage_name, test, hw, partition, node_num, if (env.WEEKLY.toBoolean()) base_cmd = "${base_cmd} --weekly=${env.WEEKLY}" - + + if (user_env) + base_cmd = "${base_cmd} --user_env ${user_env}" + for (prov in providers) { if (prov[1]) { echo "Running ${prov[0]}-${prov[1]} ${stage_name}" @@ -91,6 +93,14 @@ def run_middleware(providers, stage_name, test, hw, partition, node_num, } } +def run_ci(stage_name, config_name) { + sh """source ${CI_LOCATION}/${env.CI_MODULE}/venv/bin/activate;\ + python run.py \ + --output=${env.LOG_DIR}/${stage_name} \ + --job=${config_name} + """ +} + def gather_logs(cluster, key, dest, source) { def address = "${env.USER}@${cluster}" @@ -157,9 +167,22 @@ def checkout_ci_resources() { """ } +def checkout_ci() { + sh """ + if [[ ! -d ${env.WORKSPACE}/ci ]]; then + mkdir ${env.WORKSPACE}/ci + else + rm -rf ${env.WORKSPACE}/ci && mkdir ${env.WORKSPACE}/ci + fi + + git clone --recurse-submodules ${env.CI} ${env.WORKSPACE}/ci + """ +} + def checkout_external_resources() { checkout_ci_resources() checkout_upstream() + checkout_ci() } def generate_diff(def branch_name, def output_loc) { @@ -238,6 +261,10 @@ def build(item, mode=null, hw=null, additional_args=null) { run_python(PYTHON_VERSION, cmd) } +def build_ci() { + sh "${CI_LOCATION}/${env.CI_MODULE}/bootstrap.sh" +} + def check_target() { echo "CHANGE_TARGET = ${env.CHANGE_TARGET}" if (changeRequest()) { @@ -289,7 +316,7 @@ def skip() { } echo "Changeset is: ${changeStrings.toArray()}" - if (changeStrings.toArray().every { it =~ /(?:fabtests\/pytests|man|prov\/efa|prov\/opx).*$/ }) { + if (changeStrings.toArray().every { it =~ /(?:fabtests\/pytests|man|prov\/efa|prov\/opx|contrib\/aws).*$/ }) { echo "DONT RUN!" 
return true } @@ -317,8 +344,10 @@ pipeline { environment { JOB_CADENCE = 'PR' WITH_ENV="'PATH+EXTRA=/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/bin:$PYTHONPATH'" - RUN_LOCATION="${env.WORKSPACE}/${SCRIPT_LOCATION}/" CUSTOM_WORKSPACE="${CB_HOME}/workspace/${JOB_NAME}/${env.BUILD_NUMBER}" + DELETE_LOCATION="${env.CUSTOM_WORKSPACE}/middlewares" + RUN_LOCATION="${env.CUSTOM_WORKSPACE}/ci_resources/legacy_pipeline_scripts/" + CI_LOCATION="${env.CUSTOM_WORKSPACE}/ci" LOG_DIR = "${env.CUSTOM_WORKSPACE}/log_dir" } stages { @@ -340,6 +369,9 @@ pipeline { dir ("${CUSTOM_WORKSPACE}/ucx/libfabric") { checkout scm } + dir ("${CUSTOM_WORKSPACE}/cuda/libfabric") { + checkout scm + } dir ("${CUSTOM_WORKSPACE}/iouring/libfabric") { checkout scm } @@ -388,7 +420,17 @@ pipeline { stage ('parallel-builds') { when { equals expected: true, actual: DO_RUN } parallel { + stage ('build-ci') { + steps { + script { + build_ci() + } + } + } stage ('build-water') { + environment { + build_type = "water" + } steps { script { slurm_build(BUILD_MODES, "water", "water", "water") @@ -397,10 +439,18 @@ pipeline { """python$PYTHON_VERSION ${RUN_LOCATION}/build.py \ --build_item=mpich --build_hw=water""" ) + slurm_batch("water", "1", + "${env.LOG_DIR}/build_shmem_water_log", + """python$PYTHON_VERSION ${RUN_LOCATION}/build.py \ + --build_item=shmem --build_hw=water""" + ) } } } stage ('build-grass') { + environment { + build_type = "grass" + } steps { script { slurm_build(BUILD_MODES, "grass", "grass", "grass") @@ -409,10 +459,18 @@ pipeline { """python$PYTHON_VERSION ${RUN_LOCATION}/build.py \ --build_item=mpich --build_hw=grass""" ) + slurm_batch("grass", "1", + "${env.LOG_DIR}/build_shmem_grass_log", + """python$PYTHON_VERSION ${RUN_LOCATION}/build.py \ + --build_item=shmem --build_hw=grass""" + ) } } } stage ('build-electric') { + environment { + build_type = "electric" + } steps { script { slurm_build(BUILD_MODES, "electric", "electric", "electric") @@ -426,6 +484,16 @@ pipeline { } } } + stage ('build-cuda') { + steps { + script { + slurm_build(BUILD_MODES, "cyndaquil", "cuda", "cyndaquil", + "--cuda") + slurm_build(BUILD_MODES, "quilava", "cuda", "quilava", + "--cuda") + } + } + } stage ('build-iouring') { steps { script { @@ -479,48 +547,50 @@ pipeline { stage('parallel-tests') { when { equals expected: true, actual: DO_RUN } parallel { - stage('MPI_verbs-rxm_IMB') { + stage ('CI_MPI_verbs-rxm_IMB') { + environment { + build_type = "water" + } steps { script { - dir (RUN_LOCATION) { - def providers = [["verbs", "rxm"]] - for (def mpi in ["impi"]) { - for (imb_grp = 1; imb_grp < 4; imb_grp++) { - run_middleware(providers, "MPI", "IMB", "water", - "squirtle,totodile", "2", "${mpi}", - "${imb_grp}") - } - } + dir (CI_LOCATION) { + run_ci("CI_MPI_verbs-rxm_IMB", "pr_imb_water.json") } } } } - stage('MPI_verbs-rxm_OSU') { + stage ('CI_MPI_verbs-rxm_OSU') { + environment { + build_type = "water" + } steps { script { - dir (RUN_LOCATION) { - def providers = [["verbs", "rxm"]] - for (def mpi in ["impi", "mpich"]) { - run_middleware(providers, "MPI", "osu", "water", - "squirtle,totodile", "2", "${mpi}") - } + dir (CI_LOCATION) { + run_ci("CI_MPI_verbs-rxm_OSU", "pr_osu_water.json") } } } } - stage('MPI_tcp') { + stage ('CI_MPI_tcp_IMB') { + environment { + build_type = "grass" + } steps { script { - dir (RUN_LOCATION) { - def providers = [["tcp", null]] - for (imb_grp = 1; imb_grp < 4; imb_grp++) { - run_middleware(providers, "MPI", "IMB", "grass", - "bulbasaur", "2", "impi", "${imb_grp}") - } - for (def mpi in ["impi", 
"mpich"]) { - run_middleware(providers, "MPI", "osu", "grass", "bulbasaur", - "2", "${mpi}") - } + dir (CI_LOCATION) { + run_ci("CI_MPI_tcp_IMB", "pr_imb_grass.json") + } + } + } + } + stage ('CI_MPI_tcp_OSU') { + environment { + build_type = "grass" + } + steps { + script { + dir (CI_LOCATION) { + run_ci("CI_MPI_tcp_OSU", "pr_osu_grass.json") } } } @@ -637,13 +707,22 @@ pipeline { } } } - stage('SHMEM') { + stage('SHMEM_grass') { + steps { + script { + dir (RUN_LOCATION) { + run_middleware([["tcp", null]], "SHMEM", "shmem", + "grass", "grass", "2") + } + } + } + } + stage('SHMEM_water') { steps { script { dir (RUN_LOCATION) { - run_middleware([["verbs", null], ["tcp", null], - ["sockets", null]], "SHMEM", "shmem", - "water", "squirtle,totodile", "2") + run_middleware([["verbs", "rxm"], ["sockets", null]], "SHMEM", + "shmem", "water", "squirtle,totodile", "2") } } } @@ -658,12 +737,22 @@ pipeline { } } } - stage ('oneCCL') { + stage ('oneCCL') { steps { script { dir (RUN_LOCATION) { - run_middleware([["tcp", "rxm"]/*, ["psm3", null]*/], "oneCCL", - "oneccl", "grass", "bulbasaur", "2") + run_middleware([["verbs", null]], "oneCCL", + "oneccl", "water", "squirtle,totodile", "2") + run_middleware([["shm", null]], "oneCCL", + "oneccl", "grass", "bulbasaur", "1") + run_middleware([["psm3", null]], "oneCCL", + "oneccl", "water", "squirtle", "2") + run_middleware([["tcp", null]], "oneCCL", + "oneccl", "grass", "bulbasaur", "2") + run_middleware([["shm", null]], "oneCCL_DSA", + "oneccl", "electric", "pikachu", "1", null, null, + """CCL_ATL_SHM=1 FI_SHM_DISABLE_CMA=1 \ + FI_SHM_USE_DSA_SAR=1 FI_LOG_LEVEL=warn""") } } } @@ -674,9 +763,16 @@ pipeline { steps { script { dir (RUN_LOCATION) { - run_middleware([["verbs", "rxm"]], "oneCCL-GPU-v3", "onecclgpu", - "gpu", "fabrics-ci", "2") - } + run_middleware([["tcp", null]], "oneCCL-GPU-v3", "onecclgpu", + "gpu", "fabrics-ci", "2", null, null, + "FI_HMEM_DISABLE_P2P=1") + run_middleware([["psm3", null]], "oneCCL-GPU-v3", "onecclgpu", + "gpu", "fabrics-ci", "2", null, null, + "FI_HMEM_DISABLE_P2P=1") + run_middleware([["verbs", null]], "oneCCL-GPU-v3", "onecclgpu", + "gpu", "fabrics-ci", "2", null, null, + "FI_HMEM_DISABLE_P2P=1") + } } } } @@ -713,7 +809,7 @@ pipeline { options { skipDefaultCheckout() } steps { script { - dir ("${env.WORKSPACE}/${SCRIPT_LOCATION}/") { + dir (RUN_LOCATION) { dmabuf_output = "${LOG_DIR}/DMABUF-Tests_verbs-rxm_dmabuf" cmd = """ python3.9 runtests.py --test=dmabuf \ --prov=verbs --util=rxm --build_hw=gpu""" @@ -725,6 +821,34 @@ pipeline { } } } + stage ('cuda-shm-v1') { + steps { + script { + dir (RUN_LOCATION) { + run_fabtests("cuda_v1_shm", "cyndaquil", "cyndaquil", "1", + "shm", null, null, "h2d") + run_fabtests("cuda_v1_shm", "cyndaquil", "cyndaquil", "1", + "shm", null, null, "d2d") + // run_fabtests("cuda_v1_shm", "cyndaquil", "cyndaquil", "1", + // "shm", null, null, "xd2d") + } + } + } + } + stage ('cuda-shm-v2') { + steps { + script { + dir (RUN_LOCATION) { + run_fabtests("cuda_v2_shm", "quilava", "quilava", "1", "shm", + null, null, "h2d") + run_fabtests("cuda_v2_shm", "quilava", "quilava", "1", "shm", + null, null, "d2d") + // run_fabtests("cuda_v2_shm", "quilava ", "quilava", "1", "shm", + // null, null, "xd2d") + } + } + } + } stage ('ze-shm-v3') { agent { node { label 'ze' } } options { skipDefaultCheckout() } @@ -732,11 +856,11 @@ pipeline { script { dir (RUN_LOCATION) { run_fabtests("ze_v3_shm", "gpu", "fabrics-ci", "1", "shm", null, - null, "h2d") + "FI_HMEM_DISABLE_P2P=1", "h2d") run_fabtests("ze_v3_shm", 
"gpu", "fabrics-ci", "1", "shm", null, - null, "d2d") + "FI_HMEM_DISABLE_P2P=1", "d2d") run_fabtests("ze_v3_shm", "gpu", "fabrics-ci", "1", "shm", null, - null, "xd2d") + "FI_HMEM_DISABLE_P2P=1", "xd2d") } } } @@ -808,4 +932,4 @@ pipeline { dir("${env.WORKSPACE}@tmp") { deleteDir() } } } -} \ No newline at end of file +} diff --git a/contrib/intel/jenkins/README b/contrib/intel/jenkins/README index 01fef9f964b..a3334f72021 100644 --- a/contrib/intel/jenkins/README +++ b/contrib/intel/jenkins/README @@ -1,8 +1,11 @@ Introduction ============ -Jenkins is a CI/CD (Continuous Integration/Continuous Development) Pipelining tool that Intel uses to test code changes to libfabric. It follows the Jenkinsfile pipeline stages to build, test, and cleanup resources. +Jenkins is a CI/CD (Continuous Integration/Continuous Development) Pipelining +tool that Intel uses to test code changes to libfabric. It follows the +Jenkinsfile pipeline stages to build, test, and cleanup resources. -The runtime flow generally follows, Jenkinsfile -> build.py -> runtests.py -> run.py -> tests.py. +The scripts that this pipeline uses are stored in an internal Intel CI +repository. Tests, Middlewares and Libraries supported by Intel CI/CD are: Fabtests diff --git a/contrib/intel/jenkins/build.py b/contrib/intel/jenkins/build.py deleted file mode 100755 index b163a47c6e3..00000000000 --- a/contrib/intel/jenkins/build.py +++ /dev/null @@ -1,207 +0,0 @@ -import os -import sys - -# add jenkins config location to PATH -sys.path.append(f"{os.environ['WORKSPACE']}/ci_resources/configs/{os.environ['CLUSTER']}") -import cloudbees_config - -import argparse -import subprocess -import shlex -import common -import re -import shutil - -def build_libfabric(libfab_install_path, mode, hw_type, gpu=False): - - if (os.path.exists(libfab_install_path) != True): - os.makedirs(libfab_install_path) - - config_cmd = ['./configure', f'--prefix={libfab_install_path}'] - enable_prov_val = 'yes' - - if (mode == 'dbg'): - config_cmd.append('--enable-debug') - elif (mode == 'dl'): - enable_prov_val = 'dl' - - for prov in common.providers[hw_type]['enable']: - config_cmd.append(f'--enable-{prov}={enable_prov_val}') - - for prov in common.providers[hw_type]['disable']: - config_cmd.append(f'--enable-{prov}=no') - - for op in common.common_disable_list: - config_cmd.append(f'--enable-{op}=no') - - if (gpu): - config_cmd.append('--enable-ze-dlopen') - - common.run_command(['./autogen.sh']) - common.run_command(shlex.split(" ".join(config_cmd))) - common.run_command(['make','clean']) - common.run_command(['make', '-j32']) - common.run_command(['make','install']) - - -def build_fabtests(libfab_install_path, mode): - if (mode == 'dbg'): - config_cmd = ['./configure', '--enable-debug', - f'--prefix={libfab_install_path}', - f'--with-libfabric={libfab_install_path}'] - else: - config_cmd = ['./configure', f'--prefix={libfab_install_path}', - f'--with-libfabric={libfab_install_path}'] - - common.run_command(['./autogen.sh']) - common.run_command(config_cmd) - common.run_command(['make','clean']) - common.run_command(['make', '-j32']) - common.run_command(['make', 'install']) - -def build_mpich(install_path, libfab_installpath, hw_type): - mpich_build_dir = f'{install_path}/middlewares/mpich_{hw_type}/mpich' - cwd = os.getcwd() - if (os.path.exists(mpich_build_dir)): - print("configure mpich") - os.chdir(mpich_build_dir) - configure_cmd = f"./configure " - configure_cmd += f"--prefix={install_path}/middlewares/mpich_{hw_type} " - configure_cmd += 
f"--with-libfabric={libfab_installpath} " - configure_cmd += "--disable-oshmem " - configure_cmd += "--disable-fortran " - configure_cmd += "--without-ch4-shmmods " - configure_cmd += "--with-device=ch4:ofi " - configure_cmd += "--without-ze " - print(configure_cmd) - common.run_command(['./autogen.sh']) - common.run_command(shlex.split(configure_cmd)) - common.run_command(['make','-j']) - common.run_command(['make','install']) - os.chdir(cwd) - -def build_mpich_osu(install_path, libfab_installpath, hw_type): - mpich_build = f'{install_path}/middlewares/mpich_{hw_type}' - osu_build_dir = f'{install_path}/middlewares/mpich_{hw_type}/osu_source' - cwd = os.getcwd() - if (os.path.exists(osu_build_dir)): - os.chdir(osu_build_dir) - if 'LD_LIBRARY_PATH' in dict(os.environ).keys(): - ld_library_path = os.environ['LD_LIBRARY_PATH'] - else: - ld_library_path = '' - - if 'PATH' in dict(os.environ).keys(): - path = os.environ['PATH'] - else: - path = '' - - os.environ['CC']=f'{mpich_build}/bin/mpicc' - os.environ['CXX']=f'{mpich_build}/bin/mpicxx' - os.environ['CFLAGS']=f'-I{osu_build_dir}/util' - os.environ['PATH']=f'{libfab_installpath}/bin:{mpich_build}/bin/:{path}' - os.environ['LD_LIBRARY_PATH']=f'{libfab_installpath}/lib:'\ - f'{mpich_build}/bin/lib:{ld_library_path}' - configure_cmd = f"./configure " - configure_cmd += f"--prefix={mpich_build}/osu " - print(f"Building OSU Tests: {configure_cmd}") - common.run_command(shlex.split(configure_cmd)) - common.run_command(shlex.split("make -j install")) - os.chdir(cwd) - os.environ['PATH'] = path - os.environ['LD_LIBRARY_PATH'] = ld_library_path - - -def copy_build_dir(install_path): - middlewares_path = f'{install_path}/middlewares' - if (os.path.exists(middlewares_path) != True): - os.makedirs(f'{install_path}/middlewares') - - shutil.copytree(f'{cloudbees_config.build_dir}/shmem', - f'{middlewares_path}/shmem') - shutil.copytree(f'{cloudbees_config.build_dir}/oneccl', - f'{middlewares_path}/oneccl') - shutil.copytree(f'{cloudbees_config.build_dir}/mpich_water', - f'{middlewares_path}/mpich_water') - shutil.copytree(f'{cloudbees_config.build_dir}/mpich_grass', - f'{middlewares_path}/mpich_grass') - - os.symlink(f'{cloudbees_config.build_dir}/impi', - f'{middlewares_path}/impi') - os.symlink(f'{cloudbees_config.build_dir}/ompi', - f'{middlewares_path}/ompi') - os.symlink(f'{cloudbees_config.build_dir}/oneccl_gpu', - f'{middlewares_path}/oneccl_gpu') - -def copy_file(file_name): - if (os.path.exists(f'{workspace}/{file_name}')): - shutil.copyfile(f'{workspace}/{file_name}', - f'{install_path}/log_dir/{file_name}') - -def log_dir(install_path, release=False): - if (os.path.exists(f'{install_path}/log_dir') != True): - os.makedirs(f'{install_path}/log_dir') - - if (release): - copy_file('Makefile.am.diff') - copy_file('configure.ac.diff') - copy_file('release_num.txt') - -if __name__ == "__main__": -#read Jenkins environment variables - # In Jenkins, JOB_NAME = 'ofi_libfabric/master' vs BRANCH_NAME = 'master' - # job name is better to use to distinguish between builds of different - # jobs but with same branch name. 
- jobname = os.environ['JOB_NAME'] - buildno = os.environ['BUILD_NUMBER'] - workspace = os.environ['WORKSPACE'] - custom_workspace = os.environ['CUSTOM_WORKSPACE'] - - parser = argparse.ArgumentParser() - parser.add_argument('--build_item', help="build libfabric or fabtests", \ - choices=['libfabric', 'fabtests', 'builddir', 'logdir',\ - 'mpich']) - parser.add_argument('--build_hw', help="HW type for build", - choices=['water', 'grass', 'fire', 'electric', 'ucx', - 'daos', 'gpu', 'ivysaur']) - parser.add_argument('--ofi_build_mode', help="select buildmode libfabric "\ - "build mode", choices=['reg', 'dbg', 'dl']) - parser.add_argument('--build_loc', help="build location for libfabric "\ - "and fabtests", type=str, default='./') - parser.add_argument('--release', help="This job is likely testing a "\ - "release and will be checked into a git tree.", - action='store_true') - parser.add_argument('--gpu', help="Enable ZE dlopen", action='store_true') - - args = parser.parse_args() - build_item = args.build_item - build_hw = args.build_hw - build_loc = args.build_loc - release = args.release - gpu = args.gpu - - if (args.ofi_build_mode): - ofi_build_mode = args.ofi_build_mode - else: - ofi_build_mode = 'reg' - - libfab_install_path = f'{custom_workspace}/{build_hw}/{ofi_build_mode}' - - p = re.compile('mpi*') - - curr_dir = os.getcwd() - os.chdir(build_loc) - - if (build_item == 'libfabric'): - build_libfabric(libfab_install_path, ofi_build_mode, build_hw, gpu) - elif (build_item == 'fabtests'): - build_fabtests(libfab_install_path, ofi_build_mode) - elif (build_item == 'builddir'): - copy_build_dir(custom_workspace) - elif (build_item == 'logdir'): - log_dir(custom_workspace, release) - elif(build_item == 'mpich'): - build_mpich(custom_workspace, libfab_install_path, build_hw) - build_mpich_osu(custom_workspace, libfab_install_path, build_hw) - - os.chdir(curr_dir) diff --git a/contrib/intel/jenkins/common.py b/contrib/intel/jenkins/common.py deleted file mode 100755 index 5cadf06e41d..00000000000 --- a/contrib/intel/jenkins/common.py +++ /dev/null @@ -1,160 +0,0 @@ -import collections -import subprocess -import sys -import os -from subprocess import Popen, TimeoutExpired -from time import sleep - -def get_node_name(host, interface): - return '%s-%s' % (host, interface) - -def run_command(command): - print(" ".join(command)) - p = subprocess.Popen(command, stdout=subprocess.PIPE, text=True) - print(p.returncode) - while True: - out = p.stdout.read(1) - if (out == '' and p.poll() != None): - break - if (out != ''): - sys.stdout.write(out) - sys.stdout.flush() - - print(f"Return code is {p.returncode}") - if (p.returncode != 0): - print("exiting with " + str(p.poll())) - sys.exit(p.returncode) - -def run_logging_command(command, log_file): - print("filename: ".format(log_file)) - f = open(log_file, 'a') - print(" ".join(command)) - p = subprocess.Popen(command, stdout=subprocess.PIPE, text=True) - print(p.returncode) - f.write(" ".join(command) + '\n') - while True: - out = p.stdout.read(1) - f.write(out) - if (out == '' and p.poll() != None): - break - if (out != ''): - sys.stdout.write(out) - sys.stdout.flush() - - print(f"Return code is {p.returncode}") - if (p.returncode != 0): - print("exiting with " + str(p.poll())) - f.close() - sys.exit(p.returncode) - f.close() - -def read_file(file_name): - with open(file_name) as file_out: - output = file_out.read() - return output - -class ClientServerTest: - def __init__(self, server_cmd, client_cmd, server_log, client_log, - timeout=None): - 
self.server_cmd = server_cmd - self.client_cmd = client_cmd - self.server_log = server_log - self.client_log = client_log - self._timeout = timeout - - def run(self): - server_process = Popen( - f"{self.server_cmd} > {self.server_log} 2>&1", - shell=True, close_fds=True - ) - sleep(1) - client_process = Popen( - f"{self.client_cmd} > {self.client_log} 2>&1", - shell=True, close_fds=True - ) - - try: - server_process.wait(timeout=self._timeout) - except TimeoutExpired: - server_process.terminate() - - try: - client_process.wait(timeout=self._timeout) - except TimeoutExpired: - client_process.terminate() - - server_output = read_file(self.server_log) - client_output = read_file(self.client_log) - - print("") - print(f"server_command: {self.server_cmd}") - print('server_stdout:') - print(server_output) - print(f"client_command: {self.client_cmd}") - print('client_stdout:') - print(client_output) - - return (server_process.returncode, client_process.returncode) - -Prov = collections.namedtuple('Prov', 'core util') -prov_list = [ - Prov('psm3', None), - Prov('verbs', None), - Prov('verbs', 'rxd'), - Prov('verbs', 'rxm'), - Prov('sockets', None), - Prov('tcp', None), - Prov('udp', None), - Prov('udp', 'rxd'), - Prov('shm', None), - Prov('ucx', None) -] - -providers = { - 'daos' : { - 'enable' : ['verbs', 'tcp'], - 'disable' : [] - }, - 'gpu' : { - 'enable' : ['verbs', 'shm'], - 'disable' : ['psm3'] - }, - 'dsa' : { - 'enable' : ['shm'], - 'disable' : [] - }, - 'ucx' : { - 'enable' : ['ucx'], - 'disable' : [] - }, - 'water' : { - 'enable' : ['tcp', 'verbs', 'psm3', 'sockets'], - 'disable' : [] - }, - 'grass' : { - 'enable' : ['tcp', 'sockets', 'udp', 'shm'], - 'disable' : [] - }, - 'fire' : { - 'enable' : ['shm'], - 'disable' : [] - }, - 'ivysaur': { - 'enable' : ['tcp'], - 'disable' : [] - }, - 'electric' : { - 'enable' : ['shm'], - 'disable' : [] - } -} - -common_disable_list = [ - 'efa', - 'perf', - 'hook_debug', - 'mrail', - 'opx' -] - -cloudbees_log_start_string = "Begin Cloudbees Test Output" diff --git a/contrib/intel/jenkins/run.py b/contrib/intel/jenkins/run.py deleted file mode 100755 index 75bde9f693e..00000000000 --- a/contrib/intel/jenkins/run.py +++ /dev/null @@ -1,231 +0,0 @@ -import tests -import subprocess -import sys -import argparse -import os -import common - -sys.path.append(f"{os.environ['WORKSPACE']}/ci_resources/configs/{os.environ['CLUSTER']}") -import cloudbees_config - -# read Jenkins environment variables -# In Jenkins, JOB_NAME = 'ofi_libfabric/master' vs BRANCH_NAME = 'master' -# job name is better to use to distinguish between builds of different -# jobs but with the same branch name. 
-fab = os.environ['FABRIC'] -if 'slurm' in fab: - fab = cloudbees_config.fabric_map[f"{os.environ['SLURM_JOB_PARTITION']}"] - -jbname = os.environ['JOB_NAME']#args.jobname -bno = os.environ['BUILD_NUMBER']#args.buildno - -def fi_info_test(hw, core, hosts, mode, user_env, log_file, util): - - fi_info_test = tests.FiInfoTest(jobname=jbname,buildno=bno, - testname='fi_info', hw=hw, core_prov=core, - fabric=fab, hosts=hosts, - ofi_build_mode=mode, user_env=user_env, - log_file=log_file, util_prov=util) - print('-------------------------------------------------------------------') - print(f"Running fi_info test for {core}-{util}-{fab}") - fi_info_test.execute_cmd() - print('-------------------------------------------------------------------') - -def fabtests(hw, core, hosts, mode, user_env, log_file, util, way): - - runfabtest = tests.Fabtest(jobname=jbname,buildno=bno, - testname='runfabtests', hw=hw, core_prov=core, - fabric=fab, hosts=hosts, ofi_build_mode=mode, - user_env=user_env, log_file=log_file, - util_prov=util, way=way) - - print('-------------------------------------------------------------------') - if (runfabtest.execute_condn): - print(f"Running Fabtests for {core}-{util}-{fab}") - runfabtest.execute_cmd() - else: - print(f"Skipping {core} {runfabtest.testname} as exec condn fails") - print('-------------------------------------------------------------------') - -def shmemtest(hw, core, hosts, mode, user_env, log_file, util): - - runshmemtest = tests.ShmemTest(jobname=jbname,buildno=bno, - testname="shmem test", hw=hw, core_prov=core, - fabric=fab, hosts=hosts, - ofi_build_mode=mode, user_env=user_env, - log_file=log_file, util_prov=util) - - print('-------------------------------------------------------------------') - if (runshmemtest.execute_condn): -# skip unit because it is failing shmem_team_split_2d -# print(f"Running shmem unit test for {core}-{util}-{fab}") -# runshmemtest.execute_cmd("unit") - print(f"Running shmem PRK test for {core}-{util}-{fab}") - runshmemtest.execute_cmd("prk") - - print('--------------------------------------------------------------') - print(f"Running shmem ISx test for {core}-{util}-{fab}") - runshmemtest.execute_cmd("isx") - - print('---------------------------------------------------------------') - print(f"Running shmem uh test for {core}-{util}-{fab}") - runshmemtest.execute_cmd("uh") - else: - print(f"Skipping {core} {runshmemtest.testname} as exec condn fails") - print('-------------------------------------------------------------------') - -def multinodetest(hw, core, hosts, mode, user_env, log_file, util): - - runmultinodetest = tests.MultinodeTests(jobname=jbname,buildno=bno, - testname="multinode performance test", - hw=hw, core_prov=core, fabric=fab, - hosts=hosts, ofi_build_mode=mode, - user_env=user_env, log_file=log_file, - util_prov=util) - - print("-------------------------------------------------------------------") - if (runmultinodetest.execute_condn): - print("Running multinode performance test for {}-{}-{}" \ - .format(core, util, fab)) - runmultinodetest.execute_cmd() - - print("---------------------------------------------------------------") - else: - print("Skipping {} as execute condition fails" \ - .format(runmultinodetest.testname)) - print("-------------------------------------------------------------------") - -def intel_mpi_benchmark(hw, core, hosts, mpi, mode, group, user_env, log_file, - util): - - imb = tests.IMBtests(jobname=jbname, buildno=bno, - testname='IntelMPIbenchmark', core_prov=core, hw=hw, - 
fabric=fab, hosts=hosts, mpitype=mpi, - ofi_build_mode=mode, user_env=user_env, - log_file=log_file, test_group=group, util_prov=util) - - print('-------------------------------------------------------------------') - if (imb.execute_condn == True): - print(f"Running IMB-tests for {core}-{util}-{fab}-{mpi}") - imb.execute_cmd() - else: - print(f"Skipping {mpi.upper} {imb.testname} as execute condition fails") - print('-------------------------------------------------------------------') - -def mpich_test_suite(hw, core, hosts, mpi, mode, user_env, log_file, util, - weekly=None): - - mpich_tests = tests.MpichTestSuite(jobname=jbname,buildno=bno, - testname="MpichTestSuite",core_prov=core, - hw=hw, fabric=fab, mpitype=mpi, - hosts=hosts, ofi_build_mode=mode, - user_env=user_env, log_file=log_file, - util_prov=util, weekly=weekly) - - print('-------------------------------------------------------------------') - if (mpich_tests.execute_condn == True): - print(f"Running mpichtestsuite for {core}-{util}-{fab}-{mpi}") - mpich_tests.execute_cmd() - else: - print(f"Skipping {mpi.upper()} {mpich_tests.testname} exec condn fails") - print('-------------------------------------------------------------------') - -def osu_benchmark(hw, core, hosts, mpi, mode, user_env, log_file, util): - - osu_test = tests.OSUtests(jobname=jbname, buildno=bno, - testname='osu-benchmarks', core_prov=core, - hw=hw, fabric=fab, mpitype=mpi, hosts=hosts, - ofi_build_mode=mode, user_env=user_env, - log_file=log_file, util_prov=util) - - print('-------------------------------------------------------------------') - if (osu_test.execute_condn == True): - print(f"Running OSU-Test for {core}-{util}-{fab}-{mpi}") - osu_test.execute_cmd() - else: - print(f"Skipping {mpi.upper()} {osu_test.testname} as exec condn fails") - print('-------------------------------------------------------------------') - -def oneccltest(hw, core, hosts, mode, user_env, log_file, util): - - runoneccltest = tests.OneCCLTests(jobname=jbname,buildno=bno, - testname="oneccl test", core_prov=core, - hw=hw, fabric=fab, hosts=hosts, - ofi_build_mode=mode, user_env=user_env, - log_file=log_file, util_prov=util) - - print('-------------------------------------------------------------------') - if (runoneccltest.execute_condn): - print(f"Running oneCCL cpu tests for {core}-{util}-{fab}") - runoneccltest.execute_cmd() - else: - print(f"Skipping {runoneccltest.testname} as execute condition fails") - print('-------------------------------------------------------------------') - -def oneccltestgpu(hw, core, hosts, mode, user_env, log_file, util): - - runoneccltestgpu = tests.OneCCLTestsGPU(jobname=jbname,buildno=bno, - testname="oneccl GPU test", - core_prov=core, hw=hw, fabric=fab, - hosts=hosts, ofi_build_mode=mode, - user_env=user_env, log_file=log_file, - util_prov=util) - - print('-------------------------------------------------------------------') - if (runoneccltestgpu.execute_condn): - print(f"Running oneCCL GPU examples test for {core}-{util}-{fab}") - runoneccltestgpu.execute_cmd('examples') - - print('---------------------------------------------------------------') - print(f"Running oneCCL GPU functional test for {core}-{util}-{fab}") - runoneccltestgpu.execute_cmd('functional') - else: - print(f"Skipping {runoneccltestgpu.testname} as execute condition fails") - print('-------------------------------------------------------------------') - -def daos_cart_tests(hw, core, hosts, mode, user_env, log_file, util): - - runcarttests = 
tests.DaosCartTest(jobname=jbname, buildno=bno, - testname="Daos Cart Test", core_prov=core, - hw=hw, fabric=fab, hosts=hosts, - ofi_build_mode=mode, user_env=user_env, - log_file=log_file, util_prov=util) - - print('-------------------------------------------------------------------') - if (runcarttests.execute_condn): - print(f"Running cart test for {core}-{util}-{fab}") - runcarttests.execute_cmd() - print('-------------------------------------------------------------------') - -def dmabuftests(hw, core, hosts, mode, user_env, log_file, util): - - rundmabuftests = tests.DMABUFTest(jobname=jbname,buildno=bno, - testname="DMABUF Tests", core_prov=core, - hw=hw, fabric=fab, hosts=hosts, - ofi_build_mode=mode, user_env=user_env, - log_file=log_file, util_prov=util) - - print('-------------------------------------------------------------------') - if (rundmabuftests.execute_condn): - print(f"Running dmabuf H->H tests for {core}-{util}-{fab}") - rundmabuftests.execute_cmd('H2H') - - print('---------------------------------------------------------------') - print(f"Running dmabuf H->D tests for {core}-{util}-{fab}") - rundmabuftests.execute_cmd('H2D') - - print('---------------------------------------------------------------') - print(f"Running dmabuf D->H tests for {core}-{util}-{fab}") - rundmabuftests.execute_cmd('D2H') - - print('---------------------------------------------------------------') - print(f"Running dmabuf D->D tests for {core}-{util}-{fab}") - rundmabuftests.execute_cmd('D2D') - - print('---------------------------------------------------------------') - else: - print(f"Skipping {rundmabuftests.testname} as execute condition fails") - print('-------------------------------------------------------------------') - -if __name__ == "__main__": - pass diff --git a/contrib/intel/jenkins/runtests.py b/contrib/intel/jenkins/runtests.py deleted file mode 100755 index 4122cd51c48..00000000000 --- a/contrib/intel/jenkins/runtests.py +++ /dev/null @@ -1,154 +0,0 @@ -import argparse -import os -import sys -sys.path.append(f"{os.environ['WORKSPACE']}/ci_resources/configs/{os.environ['CLUSTER']}") -import cloudbees_config -import subprocess -import run -import common -import shlex - -class ParseDict(argparse.Action): - def __call__(self, parser, namespace, values, option_string=None): - setattr(namespace, self.dest, dict()) - for value in values: - key, value = value.split('=') - getattr(namespace, self.dest)[key] = value - -parser = argparse.ArgumentParser() -parser.add_argument('--build_hw', help="HW type for build", - choices=['water', 'grass', 'fire', 'electric', 'daos',\ - 'gpu', 'ucx', 'ivysaur']) -parser.add_argument('--prov', help="core provider", choices=['verbs', \ - 'tcp', 'udp', 'sockets', 'shm', 'psm3', 'ucx']) -parser.add_argument('--util', help="utility provider", choices=['rxd', 'rxm']) -parser.add_argument('--ofi_build_mode', help="specify the build configuration",\ - choices = ['reg', 'dbg', 'dl'], default='reg') -parser.add_argument('--test', help="specify test to execute", \ - choices = ['all', 'shmem', 'IMB', 'osu', 'oneccl', \ - 'mpichtestsuite', 'fabtests', 'onecclgpu', \ - 'fi_info', 'daos', 'multinode', 'dmabuf']) - -parser.add_argument('--imb_grp', help="IMB test group 1:[MPI1, P2P], \ - 2:[EXT, IO], 3:[NBC, RMA, MT]", choices=['1', '2', '3']) -parser.add_argument('--way', help="direction to run with device option", - choices=['h2d', 'd2d', 'xd2d'], default=None) -parser.add_argument('--user_env', help="Run with additional environment " \ - "variables", nargs='*', 
action=ParseDict, default={}) -parser.add_argument('--mpi', help="Select mpi to use for middlewares", - choices=['impi', 'mpich', 'ompi'], default='impi') -parser.add_argument('--log_file', help="Full path to log file", - default=os.environ['DEFAULT_LOG_LOCATION'], type=str) -parser.add_argument('--weekly', help="run weekly", default=False, type=bool) - -args = parser.parse_args() -build_hw = args.build_hw -args_core = args.prov -args_util = args.util -user_env = args.user_env -log_file = args.log_file -weekly = args.weekly - -if (args.ofi_build_mode): - ofi_build_mode = args.ofi_build_mode -else: - ofi_build_mode='reg' - -if (args.test): - run_test = args.test -else: - run_test = 'all' - -if (args.imb_grp): - imb_group = args.imb_grp -else: - imb_group = '1' - -mpi = args.mpi -way = args.way - -hosts = [] -if 'slurm' in os.environ['FABRIC']: - slurm_nodes = os.environ['SLURM_JOB_NODELIST'] # example cb[1-4,11] - common.run_command(shlex.split(f"sinfo --Format=Features -n {slurm_nodes}")) - if int(os.environ['SLURM_NNODES']) == 1: - hosts.append(slurm_nodes) - else: - prefix = slurm_nodes[0:slurm_nodes.find('[')] - nodes = slurm_nodes[slurm_nodes.find('[') + 1 : - slurm_nodes.find(']')].split(',') # ['1-4', '11'] - for item in nodes: # ['1-4', '11'] -> ['cb1', 'cb2', 'cb3', 'cb4', 'cb11'] - if '-' in item: - rng = item.split('-') - node_list = list(range(int(rng[0]), int(rng[1]) + 1)) - for node in node_list: - hosts.append(f'{prefix}{node}') - else: - hosts.append(f'{prefix}{item}') -else: - node = (os.environ['NODE_NAME']).split('_')[0] - hosts = [node] - for host in cloudbees_config.node_map[node]: - hosts.append(host) - print(f"hosts = {hosts}") - -print(common.cloudbees_log_start_string) - -#this script is executed from /tmp -#this is done since some mpi tests -#look for a valid location before running -# the test on the secondary host(client) -# but jenkins only creates a valid path on -# the primary host (server/test node) - -os.chdir('/tmp/') - -if(args_core): - if (run_test == 'all' or run_test == 'fi_info'): - run.fi_info_test(build_hw, args_core, hosts, ofi_build_mode, - user_env, log_file, util=args.util) - - if (run_test == 'all' or run_test == 'fabtests'): - run.fabtests(build_hw, args_core, hosts, ofi_build_mode, user_env, - log_file, args_util, way) - - if (run_test == 'all' or run_test == 'shmem'): - run.shmemtest(build_hw, args_core, hosts, ofi_build_mode, user_env, - log_file, args_util) - - if (run_test == 'all' or run_test == 'oneccl'): - run.oneccltest(build_hw, args_core, hosts, ofi_build_mode, user_env, - log_file, args_util) - - if (run_test == 'all' or run_test == 'onecclgpu'): - run.oneccltestgpu(build_hw, args_core, hosts, ofi_build_mode, - user_env, log_file, args_util) - - if (run_test == 'all' or run_test == 'daos'): - run.daos_cart_tests(build_hw, args_core, hosts, ofi_build_mode, - user_env, log_file, args_util) - - if (run_test == 'all' or run_test == 'multinode'): - run.multinodetest(build_hw, args_core, hosts, ofi_build_mode, - user_env, log_file, args_util) - - if (run_test == 'all' or run_test == 'mpichtestsuite'): - run.mpich_test_suite(build_hw, args_core, hosts, mpi, - ofi_build_mode, user_env, log_file, - args_util, weekly) - - if (run_test == 'all' or run_test == 'IMB'): - run.intel_mpi_benchmark(build_hw, args_core, hosts, mpi, - ofi_build_mode, imb_group, - user_env, log_file, args_util) - - if (run_test == 'all' or run_test == 'osu'): - run.osu_benchmark(build_hw, args_core, hosts, mpi, - ofi_build_mode, user_env, log_file, - args_util) - - if 
(run_test == 'all' or run_test == 'dmabuf'): - run.dmabuftests(build_hw, args_core, hosts, ofi_build_mode, - user_env, log_file, args_util) -else: - print("Error : Specify a core provider to run tests") diff --git a/contrib/intel/jenkins/summary.py b/contrib/intel/jenkins/summary.py deleted file mode 100755 index bbef9f003ba..00000000000 --- a/contrib/intel/jenkins/summary.py +++ /dev/null @@ -1,995 +0,0 @@ -from abc import ABC, abstractmethod -import shutil -from datetime import datetime -from typing import Tuple -import os -from pickle import FALSE -import sys -import smtplib -from email.mime.multipart import MIMEMultipart -from email.mime.text import MIMEText -from email.mime.base import MIMEBase -from email import encoders - -# add jenkins config location to PATH -sys.path.append(f"{os.environ['CUSTOM_WORKSPACE']}/ci_resources/configs/{os.environ['CLUSTER']}") - -import cloudbees_config -import argparse -import common - -verbose = False - -class SendEmail: - def __init__(self, sender=None, receivers=None, attachment=None): - self.sender = sender if sender is not None else os.environ['SENDER'] - self.receivers = (receivers if receivers is not None else \ - f"{os.environ['RECEIVER']}").split(',') - self.attachment = attachment - self.work_week = datetime.today().isocalendar()[1] - self.msg = MIMEMultipart() - - def __add_attachments(self): - print(f"Attachment is {self.attachment}") - if self.attachment is None: - return - - attachment = MIMEBase('application', 'octet-stream') - attachment.set_payload(open(self.attachment, 'rb').read()) - encoders.encode_base64(attachment) - name = f"Jenkins_Summary_ww{self.work_week}" - if (verbose): - name = f"{name}_all" - attachment.add_header('Content-Disposition', - f"attachment; filename={name}") - self.msg.attach(attachment) - - def __write_msg(self): - self.msg['Subject'] = f"Cloudbees Summary {os.environ['JOB_NAME']}" - self.msg['From'] = self.sender - self.msg['To'] = ", ".join(self.receivers) - self.msg.attach(MIMEText(f"WW{self.work_week} Summary for Libfabric "\ - "From Cloudbees")) - - def send_mail(self): - self.__write_msg() - self.__add_attachments() - server = smtplib.SMTP(os.environ['SMTP_SERVER'], - os.environ['SMTP_PORT']) - server.sendmail(self.sender, self.receivers, self.msg.as_string()) - server.quit() - -class Release: - def __init__(self, log_dir, output_file, logger, release_num): - self.log_dir = log_dir - self.output_file = output_file - self.logger = logger - self.release_num = release_num - - def __log_entire_file(self, file_name): - with open(file_name) as f: - for line in f: - self.logger.log(line, end_delimiter = '') - - def __append_release_changes(self, file_name): - if os.path.exists(file_name): - self.__log_entire_file(file_name) - - def add_release_changes(self): - self.logger.log(F"Release number: {self.release_num}") - self.__append_release_changes(f'{self.log_dir}/Makefile.am.diff') - self.__append_release_changes(f'{self.log_dir}/configure.ac.diff') - -class Logger: - def __init__(self, output_file, release): - self.output_file = output_file - self.release = release - self.padding = '\t' - - def log(self, line, end_delimiter='\n', lpad=0, ljust=0): - print(f'{self.padding * lpad}{line}'.ljust(ljust), end = end_delimiter) - self.output_file.write(f'{self.padding * lpad}{line}{end_delimiter}') - -class Summarizer(ABC): - @classmethod - def __subclasshook__(cls, subclass): - return ( - hasattr(subclass, "print_results") - and callable(subclass.print_results) - and hasattr(subclass, "check_features") - and 
callable(subclass.check_features) - and hasattr(subclass, "check_node") - and callable(subclass.check_node) - and hasattr(subclass, "check_name") - and callable(subclass.check_name) - and hasattr(subclass, "check_pass") - and callable(subclass.check_pass) - and hasattr(subclass, "check_fail") - and callable(subclass.check_fail) - and hasattr(subclass, "check_exclude") - and callable(subclass.check_exclude) - and hasattr(subclass, "fast_forward") - and callable(subclass.fast_forward) - and hasattr(subclass, "read_file") - and callable(subclass.read_file) - and hasattr(subclass, "run") - and callable(subclass.run) - or NotImplemented - ) - - @abstractmethod - def __init__(self, logger, log_dir, prov, file_name, stage_name): - self.logger = logger - self.log_dir = log_dir - self.prov = prov - self.file_name = file_name - self.stage_name = stage_name - self.file_path = os.path.join(self.log_dir, self.file_name) - self.exists = os.path.exists(self.file_path) - self.log = None - self.passes = 0 - self.passed_tests = [] - self.fails = 0 - self.failed_tests = [] - self.excludes = 0 - self.excluded_tests = [] - self.error = 0 - self.errored_tests = [] - self.test_name ='no_test' - self.name = 'no_name' - self.features = "no_features_found" - self.node = "no_node_found" - - def print_results(self): - total = self.passes + self.fails - # log was empty or not valid - if not total: - return - - percent = self.passes/total * 100 - if (verbose): - self.logger.log( - f"<>{self.stage_name} : ", lpad=1, ljust=50, end_delimiter = '' - ) - else: - self.logger.log( - f"{self.stage_name} : ", - lpad=1, ljust=50, end_delimiter = '' - ) - self.logger.log( - f"{self.node} : ", - lpad=1, ljust=20, end_delimiter = '' - ) - self.logger.log( - f"[{self.features}] : ", - lpad=1, ljust=30, end_delimiter = '' - ) - self.logger.log(f"{self.passes}:{total} ", ljust=10, end_delimiter = '') - self.logger.log(f": {percent:.2f}% : ", ljust=12, end_delimiter = '') - self.logger.log("Pass", end_delimiter = '') - if (self.excludes > 0): - self.logger.log(f" : {self.excludes:3.0f} : Excluded/Notrun") - else: - self.logger.log("") - - if (verbose and self.passes): - self.logger.log(f"Passed tests: {self.passes}", lpad=2) - for test in self.passed_tests: - self.logger.log(f'{test}', lpad=3) - if self.fails: - self.logger.log(f"Failed tests: {self.fails}", lpad=2) - for test in self.failed_tests: - self.logger.log(f'{test}', lpad=3) - if (verbose): - if self.excludes: - self.logger.log( - f"Excluded/Notrun tests: {self.excludes} ", lpad=2 - ) - for test in self.excluded_tests: - self.logger.log(f'{test}', lpad=3) - - if self.error: - self.logger.log( - "Errored, Interrupt, or Canceled Tests: "\ - f"{self.excludes} ", lpad=2 - ) - for test in self.errored_tests: - self.logger.log(f'{test}', lpad=3) - - def check_features(self, previous, line): - if ('avail_features') in previous: - self.features = line.strip() - - def check_node(self, line): - if ('slurm_nodelist' in line): - self.node = line.strip().split('=')[1] - - def check_name(self, line): - return - - def check_pass(self, line): - return - - def check_fail(self, line): - if "exiting with" in line: - self.fails += 1 - - def check_exclude(self, line): - return - - def check_line(self, line): - self.check_name(line) - self.check_pass(line) - self.check_fail(line) - self.check_exclude(line) - - def fast_forward(self, log_file): - previous = "" - line = log_file.readline().lower() - while line != "": - self.check_node(line) - self.check_features(previous, line) - if 
common.cloudbees_log_start_string.lower() in line: - break - - previous = line - line = log_file.readline().lower() - - def read_file(self): - with open(self.file_path, 'r') as log_file: - self.fast_forward(log_file) - for line in log_file: - self.check_line(line.lower()) - - def summarize(self): - if not self.exists: - return 0 - - self.read_file() - self.print_results() - return int(self.fails) - -class FiInfoSummarizer(Summarizer): - def __init__(self, logger, log_dir, prov, file_name, stage_name): - super().__init__(logger, log_dir, prov, file_name, stage_name) - - def check_fail(self, line): - if "exiting with" in line: - self.fails += 1 - self.failed_tests.append(f"fi_info {self.prov}") - - def read_file(self): - super().read_file() - - if not self.fails: - self.passes += 1 - self.passed_tests.append(f"fi_info {self.prov}") - -class FabtestsSummarizer(Summarizer): - def __init__(self, logger, log_dir, prov, file_name, stage_name): - super().__init__(logger, log_dir, prov, file_name, stage_name) - self.trace = False - - def check_name(self, line): - # don't double count ubertest output and don't count fi_ubertest's - # invocation - if 'ubertest' in line and 'client_cmd:' in line: - self.test_name = 'no_test' - if 'name:' not in line: # skip past client output in ubertest - return - - test_name = line.split("name:") - if len(test_name) > 1: - self.test_name = test_name[-1].lower().strip() - - def get_result_line(self, line) -> Tuple[str,str]: - result = line.split("result:") - if len(result) > 1: - return (result[-1].lower().strip(), line.split()) - return None, None - - def check_pass(self, line): - result, result_line = self.get_result_line(line) - if result == 'pass' or result == 'success' or result == 'passed': - self.passes += 1 - if 'ubertest' in self.test_name: - idx = (result_line.index('result:') - 1) - try: - int((result_line[idx].split(',')[0])) - except: - return - - ubertest_number = int((result_line[idx].split(',')[0])) - self.passed_tests.append(f"{self.test_name}: "\ - f"{ubertest_number}") - else: - self.passed_tests.append(self.test_name) - - def check_fail(self, line): - result, result_line = self.get_result_line(line) - if result == 'fail': - self.fails += 1 - if 'ubertest' in self.test_name: - idx = (result_line.index('result:') - 1) - try: - int((result_line[idx].split(',')[0])) - except: - return - ubertest_number = int((result_line[idx].split(',')[0])) - self.failed_tests.append(f"{self.test_name}: " \ - f"{ubertest_number}") - else: - self.failed_tests.append(self.test_name) - - if "exiting with" in line: - self.fails += 1 - self.failed_tests.append(self.test_name) - - def check_exclude(self, line): - result, _ = self.get_result_line(line) - if result == 'excluded' or result == 'notrun': - self.excludes += 1 - self.excluded_tests.append(self.test_name) - - def check_trace(self, line): - if not self.trace: - cmd_count = 0 - faults_count = 0 - if ("user to sar buffer" in line): - tokens = line.split(' ') - for i in range(0, len(tokens)): - if 'cmd' in tokens[i]: - cmd_count += int(tokens[i + 1]) - if 'faults' in tokens[i]: - faults_count += int(tokens[i + 1]) - - if (cmd_count > 0 or faults_count > 0): - self.trace = True - - def check_line(self, line): - self.check_name(line) - if (self.test_name != 'no_test'): - self.check_pass(line) - self.check_fail(line) - self.check_exclude(line) - if ('dsa' in self.file_name): - self.check_trace(line) - - def summarize(self): - if not self.exists: - return 0 - - self.read_file() - self.print_results() - if ('dsa' in 
self.file_name and not self.trace): - exit("Expected: DSA to run. Actual: DSA Not Run") - - return int(self.fails) - -class MultinodePerformanceSummarizer(Summarizer): - def __init__(self, logger, log_dir, prov, file_name, stage_name): - super().__init__(logger, log_dir, prov, file_name, stage_name) - - def check_name(self, line): - #name lines look like "starting ... " - if 'starting' in line and '...' in line: - self.test_name = line.split()[1].split('.')[0] - - def check_pass(self, line): - if 'pass' in line: - self.passes += 1 - self.passed_tests.append(self.test_name) - - def check_fail(self, line): - if 'fail' in line: - self.fails += 1 - self.failed_tests.append(self.test_name) - - if "exiting with" in line: - self.fails += 1 - self.failed_tests.append(self.test_name) - -class OnecclSummarizer(Summarizer): - def __init__(self, logger, log_dir, prov, file_name, stage_name): - super().__init__(logger, log_dir, prov, file_name, stage_name) - self.file_path = os.path.join(self.log_dir, self.file_name) - self.exists = os.path.exists(self.file_path) - self.name = 'no_test' - - def check_name(self, line): - #lines look like path/run_oneccl.sh ..... -test examples ..... test_name - if " -test" in line: - tokens = line.split() - self.name = f"{tokens[tokens.index('-test') + 1]} " \ - f"{tokens[len(tokens) - 1]}" - - def check_pass(self, line): - if 'passed' in line or "all done" in line: - self.passes += 1 - self.passed_tests.append(self.name) - - def check_fail(self, line): - if 'failed' in line or "exiting with" in line: - self.fails += 1 - self.failed_tests.append(self.name) - -class ShmemSummarizer(Summarizer): - def __init__(self, logger, log_dir, prov, file_name, stage_name): - super().__init__(logger, log_dir, prov, file_name, stage_name) - self.shmem_type = { - 'uh' : { 'func' : self.check_uh, - 'keyphrase' : 'summary', - 'passes' : 0, - 'fails' : 0 - }, - 'isx' : { 'func' : self.check_isx, - 'keyphrase' : 'scaling', - 'passes' : 0, - 'fails' : 0 - }, - 'prk' : { 'func' : self.check_prk, - 'keyphrase' : 'solution', - 'passes' : 0, - 'fails' : 0 - } - } - self.test_type = 'prk' - self.keyphrase = self.shmem_type[self.test_type]['keyphrase'] - self.name = 'no_test' - - def check_uh(self, line, log_file): - # (test_002) Running test_shmem_atomics.x: Test all atomics... OK - # (test_003) Running test_shmem_barrier.x: Tests barrier ... Failed - if "running test_" in line: - tokens = line.split() - for token in tokens: - if 'test_' in token: - self.name = token - if tokens[len(tokens) - 1] == 'ok': - self.shmem_type[self.test_type]['passes'] += 1 - self.passed_tests.append(self.name) - else: - self.shmem_type[self.test_type]['fails'] += 1 - self.failed_tests.append(self.name) - # Summary - # x/z Passed. - # y/z Failed. 
- if self.keyphrase in line: #double check - passed = log_file.readline().lower() - failed = log_file.readline().lower() - token = int(passed.split()[1].split('/')[0]) - if self.shmem_type[self.test_type]['passes'] != token: - self.logger.log( - f"passes {self.shmem_type[self.test_type]['passes']} do " \ - f"not match log reported passes {token}" - ) - token = int(failed.split()[1].split('/')[0]) - if self.shmem_type[self.test_type]['fails'] != int(token): - self.logger.log( - f"fails {self.shmem_type[self.test_type]['fails']} does "\ - f"not match log fails {token}" - ) - - def check_prk(self, line, log_file=None): - if self.keyphrase in line: - self.shmem_type[self.test_type]['passes'] += 1 - if 'error:' in line or "exiting with" in line: - self.shmem_type[self.test_type]['fails'] += 1 - p = self.shmem_type[self.test_type]['passes'] - f = self.shmem_type[self.test_type]['fails'] - self.failed_tests.append(f"{self.prov} {p + f}") - if 'test(s)' in line: - token = line.split()[0] - if self.fails != int(token): - self.logger.log( - f"fails {self.fails} does not match log reported fails " \ - f"{token}" - ) - - def check_isx(self, line, log_file=None): - if self.keyphrase in line: - self.shmem_type[self.test_type]['passes'] += 1 - if ('failed' in line and 'test(s)' not in line) or \ - "exiting with" in line: - self.shmem_type[self.test_type]['fails'] += 1 - p = self.shmem_type[self.test_type]['passes'] - f = self.shmem_type[self.test_type]['fails'] - self.failed_tests.append(f"{self.prov} {p + f}") - if 'test(s)' in line: - token = line.split()[0] - if int(token) != self.shmem_type[self.test_type]['fails']: - self.logger.log( - f"fails {self.shmem_type[self.test_type]['fails']} does " \ - f"not match log reported fails {int(token)}" - ) - - def check_fails(self, line): - if "exiting with" in line: - self.shmem_type[self.test_type]['fails'] += 1 - p = self.shmem_type[self.test_type]['passes'] - f = self.shmem_type[self.test_type]['fails'] - self.failed_tests.append(f"{self.prov} {p + f}") - - def check_test_type(self, line): - if "running shmem" in line: - self.test_type = line.split(' ')[2].lower() - self.keyphrase = self.shmem_type[self.test_type]['keyphrase'] - - def check_line(self, line, log_file): - self.check_test_type(line) - if self.test_type is not None: - self.shmem_type[self.test_type]['func'](line, log_file) - self.check_fails(line) - - def read_file(self): - with open(self.file_path, 'r') as log_file: - super().fast_forward(log_file) - for line in log_file: - self.check_line(line.lower(), log_file) - - for key in self.shmem_type.keys(): - self.passes += self.shmem_type[key]['passes'] - self.fails += self.shmem_type[key]['fails'] - -class MpichTestSuiteSummarizer(Summarizer): - def __init__(self, logger, log_dir, prov, mpi, file_name, stage_name): - super().__init__(logger, log_dir, prov, file_name, stage_name) - - self.mpi = mpi - self.run = 'mpiexec' - - def read_file(self): - with open(self.file_path,'r') as log_file: - super().fast_forward(log_file) - for line in log_file: - super().check_line(line.lower().strip()) - - def check_exclude(self, line): - if line.startswith('excluding:'): - test = line.split(':')[-1] - self.excludes += 1 - self.excluded_tests.append(test) - - def check_name(self, line): - if (line.startswith('ok') or - line.startswith('not ok')): - self.name = line.split('-')[1].split('#')[0].strip() - - def check_pass(self, line): - if (line.startswith('ok') and not - line.split('#')[1].strip().startswith('skip')): - self.passes += 1 - 
self.passed_tests.append(self.name) - - def check_fail(self, line): - if (line.startswith('not ok') and not - line.split('#')[1].strip().startswith('skip')): - self.fails += 1 - self.failed_tests.append(self.name) - - -class ImbSummarizer(Summarizer): - def __init__(self, logger, log_dir, prov, mpi, file_name, stage_name): - super().__init__(logger, log_dir, prov, file_name, stage_name) - - self.mpi = mpi - if self.mpi == 'impi': - self.run = 'mpiexec' - else: - self.run = 'mpirun' - self.test_type = '' - - def check_type(self, line): - if 'part' in line: - self.test_type = line.split()[len(line.split()) - 2] - - def check_name(self, line): - if "benchmarking" in line: - self.name = line.split()[len(line.split()) - 1] - - def check_pass(self, line): - if "benchmarking" in line: - self.passes += 1 - self.passed_tests.append(self.name) - - def check_fail(self, line): - if "exiting with" in line: - self.fails += 1 - self.failed_tests.append(f"{self.test_type} {self.name}") - self.passes -= 1 - - def check_line(self, line): - self.check_type(line) - self.check_name(line) - self.check_pass(line) - self.check_fail(line) - super().check_exclude(line) - -class OsuSummarizer(Summarizer): - def __init__(self, logger, log_dir, prov, mpi, file_name, stage_name): - super().__init__(logger, log_dir, prov, file_name, stage_name) - self.mpi = mpi - if self.mpi == 'impi': - self.run = 'mpiexec' - else: - self.run = 'mpirun' - - self.type = '' - self.tokens = [] - - def get_tokens(self, line): - if "# osu" in line: - self.tokens = line.split() - else: - self.tokens = [] - - def check_name(self, line): - if 'osu' in self.tokens: - self.name = " ".join(self.tokens[self.tokens.index('osu') + \ - 1:self.tokens.index('test')]) - - def check_type(self): - if self.tokens: - self.test_type = self.tokens[1] - - def check_pass(self, line): - if 'osu' in self.tokens: - # Assume pass - self.passes += 1 - self.passed_tests.append(self.name) - - def check_fail(self, line): - if "exiting with" in line: - self.fails += 1 - self.failed_tests.append(f"{self.test_type} {self.name}") - # Remove assumed pass - self.passes -= 1 - - def check_line(self, line): - self.get_tokens(line) - self.check_name(line) - self.check_type() - self.check_pass(line) - self.check_fail(line) - super().check_exclude(line) - -class DaosSummarizer(Summarizer): - def __init__(self, logger, log_dir, prov, file_name, stage_name): - super().__init__(logger, log_dir, prov, file_name, stage_name) - - if (self.exists): - if ('verbs' in file_name): - self.node = cloudbees_config.daos_prov_node_map['verbs'] - if ('tcp' in file_name): - self.node = cloudbees_config.daos_prov_node_map['tcp'] - - self.features = cloudbees_config.daos_node_features - - def check_name(self, line): - if "reading ." 
in line: - self.test_name = line.split('/')[len(line.split('/')) - 1] \ - .rstrip('.yaml\n') - - def check_pass(self, line): - res_string = line.lstrip("results :").rstrip() - res_list = res_string.split(' | ') - for elem in res_list: - if 'pass' in elem: - self.passes += [int(s) for s in elem.split() if s.isdigit()][0] - display_testname = self.test_name.ljust(20) - self.passed_tests.append(f"{display_testname} : {res_string}") - - def check_fail(self, line): - res_list = line.lstrip("results :").rstrip().split('|') - for elem in res_list: - total = [int(s) for s in elem.split() if s.isdigit()][0] - if total != 0: - if 'fail' in elem: - self.fails += total - self.failed_tests.append(f'{self.test_name}') - if 'error' in elem: - self.error += total - self.errored_tests.append(f'error: {self.test_name}') - if 'interrupt' in elem: - self.error += total - self.errored_tests.append(f'interrupt: {self.test_name}') - if 'cancel' in elem: - self.error += total - self.errored_tests.append(f'cancel: {self.test_name}') - - def check_exclude(self, line): - res_list = line.lstrip("results :").rstrip().split('|') - for elem in res_list: - total = [int(s) for s in elem.split() if s.isdigit()][0] - if total != 0: - if 'skip' in elem: - self.excludes += total - self.excluded_tests.append(f'skip: {self.test_name}') - if 'warn' in elem: - self.excludes += total - self.excluded_tests.append(f'warn: {self.test_name}') - - def check_line(self, line): - self.check_name(line) - if "results :" in line: - self.check_pass(line) - self.check_fail(line) - self.check_exclude(line) - -class DmabufSummarizer(Summarizer): - def __init__(self, logger, log_dir, prov, file_name, stage_name): - super().__init__(logger, log_dir, prov, file_name, stage_name) - - self.test_type = '' - - def check_type(self, line): - if "Running" in line: - self.test_type = line.split()[2] - - def check_num_node(self, line): - if "SLURM_NNODES" in line: - self.num_nodes = line.split("=")[-1].strip() - self.num_nodes = ' '.join([self.num_nodes, 'node']) - - def check_name(self, line): - if "client_command" in line: - name_list = line.split()[-2:] - name_list.insert(0, str(self.num_nodes)) - name_list.insert(1, str(self.test_type)) - self.test_name = name_list - - def check_pass(self, line): - if "TEST COMPLETED" in line: - self.passes += 1 - self.passed_tests.append(self.test_name) - - def check_fail(self, line): - if "TEST FAILED" in line: - self.fails += 1 - self.failed_tests.append(self.test_name) - - def fast_forward(self, log_file): - previous = "" - line = log_file.readline() - while line != "": - self.check_num_node(line) - self.check_node(line.lower()) - self.check_features(previous.lower(), line.lower()) - if common.cloudbees_log_start_string.lower() in line.lower(): - break - - previous = line - line = log_file.readline() - - def read_file(self): - with open(self.file_path, 'r') as log_file: - self.fast_forward(log_file) - for line in log_file: - self.check_type(line) - self.check_line(line) - -def get_release_num(): - file_name = f'{os.environ["CUSTOM_WORKSPACE"]}/source/libfabric/'\ - 'release_num.txt' - if os.path.exists(file_name): - with open(file_name) as f: - num = f.readline() - - return num.strip() - - raise Exception("No release num") - -def summarize_items(summary_item, logger, log_dir, mode): - err = 0 - mpi_list = ['impi', 'mpich', 'ompi'] - logger.log(f"Summarizing {mode} build mode:") - provs = common.prov_list + [('tcp-iouring', None)] - if summary_item == 'fabtests' or summary_item == 'all': - for prov,util in provs: - 
if util: - prov = f'{prov}-{util}' - ret = FabtestsSummarizer( - logger, log_dir, prov, - f'{prov}_fabtests_{mode}', - f"{prov} fabtests {mode}" - ).summarize() - err += ret if ret else 0 - ret = FiInfoSummarizer( - logger, log_dir, prov, - f'{prov}_fi_info_{mode}', - f"{prov} fi_info {mode}" - ).summarize() - err += ret if ret else 0 - - if ((summary_item == 'daos' or summary_item == 'all') - and mode == 'reg'): - for prov in ['tcp-rxm', 'verbs-rxm']: - ret = DaosSummarizer( - logger, log_dir, prov, - f'daos_{prov}_{mode}', - f"{prov} daos {mode}" - ).summarize() - err += ret if ret else 0 - - if summary_item == 'imb' or summary_item == 'all': - for mpi in mpi_list: - for item in ['tcp-rxm', 'verbs-rxm', 'tcp']: - ret = ImbSummarizer( - logger, log_dir, item, mpi, - f'MPI_{item}_{mpi}_IMB_{mode}', - f"{item} {mpi} IMB {mode}" - ).summarize() - err += ret if ret else 0 - - if summary_item == 'osu' or summary_item == 'all': - for mpi in mpi_list: - for item in ['tcp-rxm', 'verbs-rxm', 'tcp']: - ret = OsuSummarizer( - logger, log_dir, item, mpi, - f'MPI_{item}_{mpi}_osu_{mode}', - f"{item} {mpi} OSU {mode}" - ).summarize() - err += ret if ret else 0 - - if summary_item == 'mpichtestsuite' or summary_item == 'all': - for mpi in mpi_list: - for item in ['tcp', 'verbs-rxm']: - ret = MpichTestSuiteSummarizer( - logger, log_dir, item, mpi, - f'mpichtestsuite_{item}_{mpi}_'\ - f'mpichtestsuite_{mode}', - f"{item} {mpi} mpichtestsuite {mode}" - ).summarize() - err += ret if ret else 0 - if summary_item == 'multinode' or summary_item == 'all': - for prov,util in common.prov_list: - if util: - prov = f'{prov}-{util}' - - ret = MultinodePerformanceSummarizer( - logger, log_dir, prov, - f'multinode_performance_{prov}_multinode_{mode}', - f"multinode performance {prov} {mode}" - ).summarize() - err += ret if ret else 0 - - if summary_item == 'oneccl' or summary_item == 'all': - for prov in ['tcp-rxm', 'verbs-rxm']: - ret = OnecclSummarizer( - logger, log_dir, 'oneCCL', - f'oneCCL_{prov}_oneccl_{mode}', - f'oneCCL {prov} {mode}' - ).summarize() - err += ret if ret else 0 - ret = OnecclSummarizer( - logger, log_dir, 'oneCCL-GPU', - f'oneCCL-GPU_{prov}_onecclgpu_{mode}', - f'oneCCL-GPU {prov} {mode}' - ).summarize() - err += ret if ret else 0 - - if summary_item == 'shmem' or summary_item == 'all': - for prov in ['tcp', 'verbs', 'sockets']: - ret= ShmemSummarizer( - logger, log_dir, prov, - f'SHMEM_{prov}_shmem_{mode}', - f'shmem {prov} {mode}' - ).summarize() - err += ret if ret else 0 - - if summary_item == 'v3' or summary_item == 'all': - test_types = ['h2d', 'd2d', 'xd2d'] - for t in test_types: - ret = FabtestsSummarizer( - logger, log_dir, 'shm', - f'ze_v3_shm_{t}_fabtests_{mode}', - f"ze v3 shm {t} fabtests {mode}" - ).summarize() - err += ret if ret else 0 - - ret = OnecclSummarizer( - logger, log_dir, 'oneCCL-GPU', - f'oneCCL-GPU-v3_verbs-rxm_onecclgpu_{mode}', - f'oneCCL-GPU-v3 verbs-rxm {mode}' - ).summarize() - err += ret if ret else 0 - - if summary_item == 'dsa' or summary_item == 'all': - for prov in ['shm']: - ret = FabtestsSummarizer( - logger, log_dir, 'shm', - f'{prov}_dsa_fabtests_{mode}', - f"{prov} dsa fabtests {mode}" - ).summarize() - err += ret if ret else 0 - - if summary_item == 'dmabuf' or summary_item == 'all': - for prov in ['verbs-rxm']: - for num_nodes in range(1,3): - ret = DmabufSummarizer( - logger, log_dir, 'verbs-rxm', - f'DMABUF-Tests_{prov}_dmabuf_{num_nodes}_{mode}', - f"DMABUF-Tests {prov} dmabuf {num_nodes} node {mode}" - ).summarize() - err += ret if ret else 0 - 
- return err - -if __name__ == "__main__": -#read Jenkins environment variables - # In Jenkins, JOB_NAME = 'ofi_libfabric/master' vs BRANCH_NAME = 'master' - # job name is better to use to distinguish between builds of different - # jobs but with same branch name. - jobname = os.environ['JOB_NAME'] - buildno = os.environ['BUILD_NUMBER'] - workspace = os.environ['WORKSPACE'] - custom_workspace = os.environ['CUSTOM_WORKSPACE'] - - parser = argparse.ArgumentParser() - parser.add_argument('--summary_item', help="functional test to summarize", - choices=['fabtests', 'imb', 'osu', 'mpichtestsuite', - 'oneccl', 'shmem', 'multinode', 'daos', 'v3', - 'dsa', 'dmabuf', 'all']) - parser.add_argument('--ofi_build_mode', help="select buildmode debug or dl", - choices=['dbg', 'dl', 'reg'], default='all') - parser.add_argument('-v', help="Verbose mode. Print all tests", \ - action='store_true') - parser.add_argument('--release', help="This job is testing a release."\ - "It will be saved and checked into a git tree.", - action='store_true') - parser.add_argument('--send_mail', help="Email mailing list with summary "\ - "results", action='store_true') - - args = parser.parse_args() - verbose = args.v - summary_item = args.summary_item - release = args.release - ofi_build_mode = args.ofi_build_mode - send_mail = args.send_mail - - mpi_list = ['impi', 'mpich', 'ompi'] - log_dir = f'{custom_workspace}/log_dir' - if (not os.path.exists(log_dir)): - os.makedirs(log_dir) - - job_name = os.environ['JOB_NAME'].replace('/', '_') - - print(f"Files to be summarized: {os.listdir(log_dir)}") - - if (release): - release_num = get_release_num() - date = datetime.now().strftime("%Y%m%d%H%M%S") - output_name = f'summary_{release_num}_{job_name}_{date}.log' - else: - output_name = f'summary_{job_name}.log' - - full_file_name = f'{log_dir}/{output_name}' - - with open(full_file_name, 'a') as output_file: - if (ofi_build_mode == 'all'): - output_file.truncate(0) - - logger = Logger(output_file, release) - if (release): - Release( - log_dir, output_file, logger, release_num - ).add_release_changes() - - err = 0 - build_modes = ['reg', 'dbg', 'dl'] - for mode in build_modes: - if ofi_build_mode != 'all' and mode != ofi_build_mode: - continue - - err += summarize_items(summary_item, logger, log_dir, mode) - - if (release): - shutil.copyfile(f'{full_file_name}', f'{custom_workspace}/{output_name}') - - if (send_mail): - SendEmail(sender = os.environ['SENDER'], - receivers = os.environ['mailrecipients'], - attachment = full_file_name - ).send_mail() - - exit(err) diff --git a/contrib/intel/jenkins/tests.py b/contrib/intel/jenkins/tests.py deleted file mode 100755 index 71617933992..00000000000 --- a/contrib/intel/jenkins/tests.py +++ /dev/null @@ -1,1138 +0,0 @@ -import sys -import os -import io - -sys.path.append(f"{os.environ['WORKSPACE']}/ci_resources/configs/{os.environ['CLUSTER']}") - -import subprocess -import re -import cloudbees_config -import common -import shlex -import time - -# A Jenkins env variable for job name is composed of the name of the jenkins job and the branch name -# it is building for. for e.g. 
in our case jobname = 'ofi_libfabric/master' -class Test: - - def __init__ (self, jobname, buildno, testname, hw, core_prov, fabric, - hosts, ofi_build_mode, user_env, log_file, mpitype=None, - util_prov=None, way=None): - self.jobname = jobname - self.buildno = buildno - self.testname = testname - self.hw = hw - self.core_prov = core_prov - self.util_prov = f'ofi_{util_prov}' if util_prov != None else '' - self.fabric = fabric - self.hosts = hosts - self.log_file = log_file - self.mpi_type = mpitype - self.ofi_build_mode = ofi_build_mode - if (len(hosts) == 1): - self.server = hosts[0] - self.client = hosts[0] - elif (len(hosts) == 2): - self.server = hosts[0] - self.client = hosts[1] - - self.nw_interface = cloudbees_config.interface_map[self.fabric] - self.custom_workspace = os.environ['CUSTOM_WORKSPACE'] - self.libfab_installpath = f'{self.custom_workspace}/'\ - f'{self.hw}/{self.ofi_build_mode}' - - self.middlewares_path = f'{self.custom_workspace}/middlewares' - self.ci_logdir_path = f'{self.custom_workspace}/log_dir' - self.env = user_env - self.way = way - - self.mpi = '' - if (self.mpi_type == 'impi'): - self.mpi = IMPI(self.core_prov, self.hosts, - self.libfab_installpath, self.nw_interface, - self.server, self.client, self.env, - self.middlewares_path, self.util_prov) - elif (self.mpi_type == 'ompi'): - self.mpi = OMPI(self.core_prov, self.hosts, - self.libfab_installpath, self.nw_interface, - self.server, self.client, self.env, - self.middlewares_path, self.util_prov) - elif (self.mpi_type == 'mpich'): - self.mpi = MPICH(self.hw, self.core_prov, self.hosts, - self.libfab_installpath, self.nw_interface, - self.server, self.client, self.env, - self.middlewares_path, self.util_prov) - - -class FiInfoTest(Test): - - def __init__(self, jobname, buildno, testname, hw, core_prov, fabric, - hosts, ofi_build_mode, user_env, log_file, util_prov=None): - - super().__init__(jobname, buildno, testname, hw, core_prov, fabric, - hosts, ofi_build_mode, user_env, log_file, None, util_prov) - - self.fi_info_testpath = f'{self.libfab_installpath}/bin' - - @property - def cmd(self): - return f"{self.fi_info_testpath}/fi_info " - - @property - def options(self): - if (self.util_prov): - opts = f"-f {self.fabric} -p {self.core_prov};{self.util_prov}" - elif (self.core_prov == 'psm3'): - opts = f"-p {self.core_prov}" - else: - opts = f"-f {self.fabric} -p {self.core_prov}" - - return opts - - def execute_cmd(self): - command = self.cmd + self.options - outputcmd = shlex.split(command) - common.run_command(outputcmd) - - -class Fabtest(Test): - - def __init__(self, jobname, buildno, testname, hw, core_prov, fabric, - hosts, ofi_build_mode, user_env, log_file, util_prov=None, - way=None): - - super().__init__(jobname, buildno, testname, hw, core_prov, fabric, - hosts, ofi_build_mode, user_env, log_file, None, - util_prov, way) - self.fabtestpath = f'{self.libfab_installpath}/bin' - self.fabtestconfigpath = f'{self.libfab_installpath}/share/fabtests' - - def get_exclude_file(self): - path = self.libfab_installpath - efile_path = f'{path}/share/fabtests/test_configs' - - if self.hw == 'ivysaur': - efile = f'{efile_path}/{self.core_prov}/io_uring.exclude' - else: - prov = self.util_prov if self.util_prov else self.core_prov - efile_old = f'{efile_path}/{prov}/{prov}.exclude' - - if self.util_prov: - efile = f'{efile_path}/{self.util_prov}/{self.core_prov}/exclude' - else: - efile = f'{efile_path}/{self.core_prov}/exclude' - - if os.path.isfile(efile): - return efile - elif os.path.isfile(efile_old): - 
return efile_old - else: - print(f"Exclude file: {efile} not found!") - return None - - @property - def cmd(self): - return f"{self.fabtestpath}/runfabtests.sh " - - @property - def options(self): - opts = f"-T 300 -vvv -p {self.fabtestpath} -S " - if (self.core_prov != 'shm' and self.nw_interface): - opts += f"-s {common.get_node_name(self.server, self.nw_interface)} " - opts += f"-c {common.get_node_name(self.client, self.nw_interface)} " - - if (self.core_prov == 'shm'): - opts += f"-s {self.server} " - opts += f"-c {self.client} " - opts += "-N " - - if (self.core_prov == 'ucx'): - opts += "-b " - - if (self.ofi_build_mode == 'dl'): - opts += "-t short " - else: - opts += "-t all " - - if (self.way == 'h2d'): - opts += "-C \"-H\" -L \"-D ze\" " - elif (self.way == 'd2d'): - opts += "-C \"-D ze\" -L \"-D ze\" " - elif (self.way == 'xd2d'): - opts += "-C \"-D ze\" -L \"-D ze -i 1\" " - - if (self.core_prov == 'sockets' and self.ofi_build_mode == 'reg'): - complex_test_file = f'{self.libfab_installpath}/share/fabtests/'\ - f'test_configs/{self.core_prov}/quick.test' - if (os.path.isfile(complex_test_file)): - opts += "-u {complex_test_file} " - else: - print(f"{self.core_prov} Complex test file not found") - - if (self.ofi_build_mode != 'reg' or self.core_prov == 'udp'): - opts += "-e \'ubertest,multinode\' " - - efile = self.get_exclude_file() - if efile: - opts += "-R " - opts += f"-f {efile} " - - for key in self.env: - opts += f"-E {key}={self.env[key]} " - - if self.util_prov: - opts += f"{self.core_prov};{self.util_prov} " - else: - opts += f"{self.core_prov} " - - if (self.core_prov == 'shm'): - opts += f"{self.server} {self.server} " - else: - opts += f"{self.server} {self.client} " - - return opts - - @property - def execute_condn(self): - return True - - def execute_cmd(self): - curdir = os.getcwd() - os.chdir(self.fabtestconfigpath) - command = self.cmd + self.options - outputcmd = shlex.split(command) - common.run_command(outputcmd) - os.chdir(curdir) - - -class ShmemTest(Test): - - def __init__(self, jobname, buildno, testname, hw, core_prov, fabric, - hosts, ofi_build_mode, user_env, log_file, util_prov=None): - - super().__init__(jobname, buildno, testname, hw, core_prov, fabric, - hosts, ofi_build_mode, user_env, log_file, None, - util_prov) - - self.n = 4 - self.ppn = 2 - self.shmem_dir = f'{self.middlewares_path}/shmem' - self.hydra = f'{cloudbees_config.hydra}' - self.shmem_testname = '' - self.threshold = '1' - self.isx_shmem_total_size = 33554432 - self.isx_shmem_kernel_max = 134217728 - self.prk_iterations = 10 - self.prk_first_arr_dim = 1000 - self.prk_second_arr_dim = 1000 - if self.util_prov: - self.prov = f'{self.core_prov};{self.util_prov}' - else: - self.prov = self.core_prov - - self.test_dir = { - 'unit' : 'SOS', - 'uh' : 'tests-uh', - 'isx' : 'ISx/SHMEM', - 'prk' : 'PRK/SHMEM' - } - - self.make = { - 'unit' : 'make VERBOSE=1', - 'uh' : 'make C_feature_tests-run', - 'isx' : '', - 'prk' : '' - } - - self.shmem_environ = { - 'SHMEM_OFI_USE_PROVIDER': self.prov, - 'OSHRUN_LAUNCHER' : self.hydra, - 'PATH' : f'{self.shmem_dir}/bin:$PATH', - 'LD_LIBRARY_PATH' : f'{self.shmem_dir}/lib:'\ - f'{self.libfab_installpath}/lib', - 'SHMEM_SYMMETRIC_SIZE' : '4G', - 'LD_PRELOAD' : f'{self.libfab_installpath}'\ - '/lib/libfabric.so', - 'threshold' : self.threshold - } - - def export_env(self): - environ = '' - if self.shmem_testname == 'isx' or self.shmem_testname == 'prk': - self.threshold = '0' - - for key,val in self.shmem_environ.items(): - environ += f"export 
{key}={val}; " - return environ - - def cmd(self): - cmd = '' - if self.shmem_testname == 'unit': - cmd += f"{self.make[self.shmem_testname]} " - cmd += "mpiexec.hydra " - cmd += f"-n {self.n} " - cmd += f"-np {self.ppn} " - cmd += 'check' - elif self.shmem_testname == 'uh': - cmd += f'{self.make[self.shmem_testname]}' - elif self.shmem_testname == 'isx': - cmd += f"oshrun -np 4 ./bin/isx.strong {self.isx_shmem_kernel_max}"\ - " output_strong; " - cmd += f"oshrun -np 4 ./bin/isx.weak {self.isx_shmem_total_size} "\ - "output_weak; " - cmd += f"oshrun -np 4 ./bin/isx.weak_iso "\ - f"{self.isx_shmem_total_size} output_weak_iso " - elif self.shmem_testname == 'prk': - cmd += f"oshrun -np 4 ./Stencil/stencil {self.prk_iterations} "\ - f"{self.prk_first_arr_dim}; " - cmd += f"oshrun -np 4 ./Synch_p2p/p2p {self.prk_iterations} "\ - f"{self.prk_first_arr_dim} {self.prk_second_arr_dim}; " - cmd += f"oshrun -np 4 ./Transpose/transpose {self.prk_iterations} "\ - f"{self.prk_first_arr_dim} " - - return cmd - - - @property - def execute_condn(self): - #make always true when verbs and sockets are passing - return True if (self.core_prov == 'tcp') \ - else False - - def execute_cmd(self, shmem_testname): - self.shmem_testname = shmem_testname - cwd = os.getcwd() - os.chdir(f'{self.shmem_dir}/{self.test_dir[self.shmem_testname]}') - print("Changed directory to "\ - f'{self.shmem_dir}/{self.test_dir[self.shmem_testname]}') - command = f"bash -c \'{self.export_env()} {self.cmd()}\'" - outputcmd = shlex.split(command) - common.run_command(outputcmd) - os.chdir(cwd) - -class MultinodeTests(Test): - - def __init__(self, jobname, buildno, testname, hw, core_prov, fabric, - hosts, ofi_build_mode, user_env, log_file, util_prov=None): - - super().__init__(jobname, buildno, testname, hw, core_prov, fabric, - hosts, ofi_build_mode, user_env, log_file, None, util_prov) - self.fabtestpath = f'{self.libfab_installpath}/bin' - self.fabtestconfigpath = f'{self.libfab_installpath}/share/fabtests' - self.n = 2 - self.ppn = 64 - self.iterations = 1 - self.method = 'msg' - self.pattern = "full_mesh" - - @property - def cmd(self): - return f"{self.fabtestpath}/runmultinode.sh " - - @property - def options(self): - opts = f"-h {common.get_node_name(self.server, self.nw_interface)}" - opts += f",{common.get_node_name(self.client, self.nw_interface)}" - opts += f" -n {self.ppn}" - opts += f" -I {self.iterations}" - opts += f" -z {self.pattern}" - opts += f" -C {self.method}" - if self.util_prov: - opts += f" -p {self.core_prov};{self.util_prov}" - else: - opts += f" -p {self.core_prov}" - opts += f" --ci {self.fabtestpath}/" #enable ci mode to disable tput - - return opts - - @property - def execute_condn(self): - return True - - def execute_cmd(self): - if self.util_prov: - prov = f"{self.core_prov}-{self.util_prov} " - else: - prov = self.core_prov - curdir = os.getcwd() - os.chdir(self.fabtestconfigpath) - command = self.cmd + self.options - outputcmd = shlex.split(command) - common.run_command(outputcmd) - os.chdir(curdir) - -class OMPI: - def __init__(self, core_prov, hosts, libfab_installpath, nw_interface, - server, client, environ, middlewares_path, util_prov=None): - - self.ompi_src = f'{middlewares_path}/ompi' - self.core_prov = core_prov - self.hosts = hosts - self.util_prov = util_prov - self.libfab_installpath = libfab_installpath - self.nw_interface = nw_interface - self.server = server - self.client = client - self.environ = environ - self.n = 4 - self.ppn = 2 - - @property - def env(self): - cmd = "bash -c \'" - 
if (self.util_prov): - cmd += f"export FI_PROVIDER={self.core_prov}\\;{self.util_prov}; " - else: - cmd += f"export FI_PROVIDER={self.core_prov}; " - cmd += "export I_MPI_FABRICS=ofi; " - cmd += f"export LD_LIBRARY_PATH={self.ompi_src}/lib:$LD_LIBRARY_PATH; " - cmd += f"export LD_LIBRARY_PATH={self.libfab_installpath}/lib/:"\ - "$LD_LIBRARY_PATH; " - cmd += f"export PATH={self.ompi_src}/bin:$PATH; " - cmd += f"export PATH={self.libfab_installpath}/bin:$PATH; " - return cmd - - @property - def options(self): - opts = f"-np {self.n} " - hosts = '\',\''.join([':'.join([common.get_node_name(host, \ - self.nw_interface), str(self.ppn)]) \ - for host in self.hosts]) - opts += f"--host \'{hosts}\' " - if self.util_prov: - opts += f"--mca mtl_ofi_provider_include {self.core_prov}\\;"\ - f"{self.util_prov} " - opts += f"--mca btl_ofi_provider_include {self.core_prov}\\;"\ - f"{self.util_prov} " - else: - opts += f"--mca mtl_ofi_provider_include {self.core_prov} " - opts += f"--mca btl_ofi_provider_include {self.core_prov} " - opts += "--mca orte_base_help_aggregate 0 " - # This is necessary to prevent verbs from printing warning messages - # The test still uses libfabric verbs even when enabled. - # if (self.core_prov == 'verbs'): - # opts += "--mca btl_openib_allow_ib 1 " - opts += "--mca mtl ofi " - opts += "--mca pml cm -tag-output " - for key in self.environ: - opts += f"-x {key}={self.environ[key]} " - - return opts - - @property - def cmd(self): - return f"{self.ompi_src}/bin/mpirun {self.options}" - -class MPICH: - def __init__(self, hw, core_prov, hosts, libfab_installpath, nw_interface, - server, client, environ, middlewares_path, util_prov=None): - - self.mpich_dir = f'{middlewares_path}/mpich_{hw}' - self.mpichpath = f'{self.mpich_dir}/mpich' - self.core_prov = core_prov - self.hosts = hosts - self.util_prov = util_prov - self.libfab_installpath = libfab_installpath - self.nw_interface = nw_interface - self.server = server - self.client = client - self.environ = environ - self.n = 4 - self.ppn = 1 - - @property - def env(self): - cmd = "bash -c \'" - if (self.util_prov): - cmd += f"export FI_PROVIDER={self.core_prov}\\;{self.util_prov}; " - else: - cmd += f"export FI_PROVIDER={self.core_prov}; " - cmd += "export I_MPI_FABRICS=ofi; " - cmd += "export HYDRA_LAUNCHER=fork;" - cmd += "export MPIR_CVAR_CH4_OFI_ENABLE_ATOMICS=0; " - cmd += "export MPIR_CVAR_CH4_OFI_CAPABILITY_SETS_DEBUG=0; " - cmd += f"export LD_LIBRARY_PATH={self.mpich_dir}/lib:$LD_LIBRARY_PATH; " - cmd += f"export LD_LIBRARY_PATH={self.libfab_installpath}/lib/:"\ - "$LD_LIBRARY_PATH; " - cmd += f"export PATH={self.mpich_dir}/bin:$PATH; " - cmd += f"export PATH={self.libfab_installpath}/bin:$PATH; " - return cmd - - @property - def options(self): - opts = f"-n {self.n} " - opts += f"-ppn {self.ppn} " - opts += "-launcher ssh " - # Removed because sbatch does this for us whenwe use mpirun - # opts += f"-hosts {common.get_node_name(self.server, self.nw_interface)},"\ - # f"{common.get_node_name(self.client, self.nw_interface)} " - for key in self.environ: - opts += f"-genv {key} {self.environ[key]} " - - return opts - - @property - def cmd(self): - return f"{self.mpich_dir}/bin/mpirun {self.options}" - -class IMPI: - def __init__(self, core_prov, hosts, libfab_installpath, nw_interface, - server, client, environ, middlewares_path, util_prov=None): - - self.impi_src = f'{cloudbees_config.impi_root}' - self.mpichpath = f'{middlewares_path}/impi/mpichsuite/' - self.core_prov = core_prov - self.hosts = hosts - self.util_prov = 
util_prov - self.libfab_installpath = libfab_installpath - self.nw_interface = nw_interface - self.server = server - self.client = client - self.environ = environ - self.n = 4 - self.ppn = 1 - - @property - def env(self): - cmd = f"bash -c \'source {self.impi_src}/env/vars.sh "\ - "-i_mpi_ofi_internal=0; " - cmd += f"source {cloudbees_config.intel_compiler_root}/env/vars.sh; " - if (self.util_prov): - cmd += f"export FI_PROVIDER={self.core_prov}\\;{self.util_prov}; " - else: - cmd += f"export FI_PROVIDER={self.core_prov}; " - if (self.core_prov == 'tcp'): - cmd += "export FI_IFACE=eth0; " - elif (self.core_prov == 'verbs'): - cmd += "export FI_IFACE=ib0; " - cmd += "export I_MPI_FABRICS=ofi; " - cmd += f"export LD_LIBRARY_PATH={self.impi_src}/lib:$LD_LIBRARY_PATH; " - cmd += f"export LD_LIBRARY_PATH={self.impi_src}/lib/release:"\ - "$LD_LIBRARY_PATH; " - cmd += f"export LD_LIBRARY_PATH={self.libfab_installpath}/lib/:"\ - "$LD_LIBRARY_PATH; " - cmd += f"export PATH={self.libfab_installpath}/bin:$PATH; " - return cmd - - @property - def options(self): - opts = f"-n {self.n} " - opts += f"-ppn {self.ppn} " - opts += f"-hosts {common.get_node_name(self.server, self.nw_interface)},"\ - f"{common.get_node_name(self.client, self.nw_interface)} " - for key in self.environ: - opts += f"-genv {key} {self.environ[key]} " - - return opts - - @property - def cmd(self): - return f"{self.impi_src}/bin/mpiexec {self.options}" - - -class IMBtests(Test): - def __init__(self, jobname, buildno, testname, hw, core_prov, fabric, - hosts, mpitype, ofi_build_mode, user_env, log_file, test_group, - util_prov=None): - - super().__init__(jobname, buildno, testname, hw, core_prov, - fabric, hosts, ofi_build_mode, user_env, log_file, mpitype, - util_prov) - - self.test_group = test_group - self.mpi_type = mpitype - self.imb_src = '' - self.imb_tests = { - '1' :[ - 'MPI1', - 'P2P' - ], - '2' :[ - 'EXT', - 'IO' - ], - '3' :[ - 'NBC', - 'RMA', - 'MT' - ] - } - self.iter = 100 - self.include = { - 'MPI1':[ - 'Biband', - 'Uniband', - 'PingPongAnySource', - 'PingPingAnySource', - 'PingPongSpecificSource', - 'PingPingSpecificSource' - ], - 'P2P':[], - 'EXT':[], - 'IO':[], - 'NBC':[], - 'RMA':[], - 'MT':[] - } - self.exclude = { - 'MPI1':[], - 'P2P':[], - 'EXT':[ - 'Accumulate' - ], - 'IO':[], - 'NBC':[], - 'RMA':[ - 'Accumulate', - 'Get_accumulate', - 'Fetch_and_op', - 'Compare_and_swap', - 'All_put_all', - 'All_get_all' - ], - 'MT':[] - } - self.imb_src = f'{self.middlewares_path}/{self.mpi_type}/imb' - - @property - def execute_condn(self): - # Mpich and ompi are excluded to save time. 
Run manually if needed - return (self.mpi_type == 'impi') - - def imb_cmd(self, imb_test): - print(f"Running IMB-{imb_test}") - cmd = f"{self.imb_src}/IMB-{imb_test} " - if (imb_test != 'MT'): - cmd += f"-iter {self.iter} " - - if (len(self.include[imb_test]) > 0): - cmd += f"-include {','.join(self.include[imb_test])}" - - if (len(self.exclude[imb_test]) > 0): - cmd += f"-exclude {','.join(self.exclude[imb_test])}" - - return cmd - - def execute_cmd(self): - for test_type in self.imb_tests[self.test_group]: - outputcmd = shlex.split(self.mpi.env + self.mpi.cmd + \ - self.imb_cmd(test_type) + '\'') - common.run_command(outputcmd) - - -class OSUtests(Test): - - def __init__(self, jobname, buildno, testname, hw, core_prov, fabric, - hosts, mpitype, ofi_build_mode, user_env, log_file, util_prov=None): - - super().__init__(jobname, buildno, testname, hw, core_prov, - fabric, hosts, ofi_build_mode, user_env, log_file, mpitype, - util_prov) - - self.n_ppn = { - 'pt2pt': (2, 1), - 'collective': (4, 2), - 'one-sided': (2, 1), - 'startup': (2, 1) - } - if mpitype == 'mpich' and hw in ['water', 'grass']: - self.mpitype = f'{mpitype}_{hw}' - else: - self.mpitype = mpitype - - self.osu_src = f'{self.middlewares_path}/{self.mpitype}/osu/libexec/'\ - 'osu-micro-benchmarks/mpi/' - - @property - def execute_condn(self): - # mpich-tcp, ompi are the only osu test combinations failing - return False if ((self.mpi_type == 'mpich' and self.core_prov == 'tcp') or \ - self.mpi_type == 'ompi') \ - else True - - def osu_cmd(self, test_type, test): - print(f"Running OSU-{test_type}-{test}") - cmd = f'{self.osu_src}/{test_type}/{test} ' - return cmd - - def execute_cmd(self): - assert(self.osu_src) - p = re.compile('osu_put*') - for root, dirs, tests in os.walk(self.osu_src): - for test in tests: - self.mpi.n = self.n_ppn[os.path.basename(root)][0] - self.mpi.ppn = self.n_ppn[os.path.basename(root)][1] - - if (test == 'osu_latency_mp' and self.core_prov == 'verbs'): - self.env['IBV_FORK_SAFE'] = '1' - - if(p.search(test) == None): - osu_command = self.osu_cmd(os.path.basename(root), test) - outputcmd = shlex.split(self.mpi.env + self.mpi.cmd + \ - osu_command + '\'') - common.run_command(outputcmd) - - if (test == 'osu_latency_mp' and self.core_prov == 'verbs'): - self.env.pop('IBV_FORK_SAFE') - - -class MpichTestSuite(Test): - - def __init__(self, jobname, buildno, testname, hw, core_prov, fabric, - hosts, mpitype, ofi_build_mode, user_env, log_file, util_prov=None, weekly=None): - - super().__init__(jobname, buildno, testname, hw, core_prov, - fabric, hosts, ofi_build_mode, user_env, log_file, mpitype, - util_prov) - self.mpi_type = mpitype - if (mpitype != 'ompi'): - self.mpichsuitepath = f'{self.mpi.mpichpath}/test/mpi/' - self.pwd = os.getcwd() - self.weekly = weekly - self.mpichtests_exclude = { - 'tcp' : { 'rma' : [('win_shared_put_flush_load 3', 'test')], - 'threads/comm' : [('idup_nb 4','test')] - }, - 'verbs' : { 'threads/comm' : [('idup_nb 4','test')], - 'spawn' : [('concurrent_spawns 1', 'test')], - 'pt2pt' : [('sendrecv3 2','test'), - ('sendrecv3 2 arg=-isendrecv','test')], - 'threads/pt2pt': [(f"mt_improbe_sendrecv_huge 2 " - f"arg=-iter=64 arg=-count=4194304 " - f"env=MPIR_CVAR_CH4_OFI_EAGER_MAX_MSG_SIZE" - f"=16384", 'test')] - } - } - - def create_hostfile(self, file, hostlist): - with open(file, "w") as f: - for host in hostlist: - f.write(f"{host}\n") - - def update_testlists(self, filename, category): - with open(filename, 'r') as file: - lines = file.read().splitlines() - for line in lines: - if 
(line == category): - lines[lines.index(line)] = f'#{line}' - else: - continue - with open(filename, 'w') as file: - file.write('\n'.join(lines)) - - def exclude_tests(self, test_root, provider): - for path,exclude_list in self.mpichtests_exclude[f'{provider}'].items(): - for item in exclude_list: - self.update_testlists(f'{test_root}/{path}/testlist', item[0]) - if (item[1] == 'dir'): - filename = f'{test_root}/{path}/{item[0]}/testlist' - with open(filename,'r') as file: - for line in file: - line = line.strip() - if (not line.startswith('#')): - print(f'excluding:{path}/{item[0]}:{line}') - else: #item[1]=test - print(f'excluding:{path}/{item[0]}') - - @property - def execute_condn(self): - return ((self.mpi_type == 'impi' and self.weekly) or \ - self.mpi_type == 'mpich') - - def execute_cmd(self): - if (self.mpi_type == 'mpich'): - configure_cmd = f"./configure --with-mpi={self.mpi.mpich_dir} " - if (self.weekly): - print(f'Weekly {self.mpi_type} mpichsuite tests') - os.chdir(self.mpichsuitepath) - common.run_command(shlex.split(self.mpi.env + - configure_cmd + '\'')) - self.exclude_tests(self.mpichsuitepath, self.core_prov) - testcmd = 'make testing' - outputcmd = shlex.split(self.mpi.env + testcmd + '\'') - common.run_command(outputcmd) - common.run_command(shlex.split(f"cat {self.mpichsuitepath}/" \ - f"summary.tap")) - os.chdir(self.pwd) - else: - print(f"PR {self.mpi_type} mpichsuite tests") - os.chdir(self.mpichsuitepath) - common.run_command(shlex.split(self.mpi.env + - configure_cmd + '\'')) - common.run_command(['make', '-j']) - self.exclude_tests(self.mpichsuitepath, self.core_prov) - testcmd = "./runtests -tests=testlist " - testcmd += f" -xmlfile=summary.xml -tapfile=summary.tap " \ - f"-junitfile=summary.junit.xml " - common.run_command(shlex.split(self.mpi.env + testcmd + '\'')) - common.run_command(shlex.split(f"cat {self.mpichsuitepath}/" \ - f"summary.tap")) - os.chdir(self.pwd) - if (self.mpi_type == 'impi' and self.weekly == True): - print (f'Weekly {self.mpi_type} mpichsuite tests') - os.chdir(self.mpi.mpichpath) - print(self.hosts) - self.create_hostfile(f'{self.mpi.mpichpath}/hostfile', - self.hosts) - os.environ["I_MPI_HYDRA_HOST_FILE"] = \ - f'{self.mpi.mpichpath}/hostfile' - test_cmd = f"export I_MPI_HYDRA_HOST_FILE=" \ - f"{self.mpi.mpichpath}/hostfile; " - test_cmd += f"./test.sh --exclude lin,{self.core_prov},*,*,*,*; " - common.run_command(shlex.split(self.mpi.env + test_cmd + '\'')) - common.run_command(shlex.split(f"cat {self.mpichsuitepath}/" \ - f"summary.tap")) - os.chdir(self.pwd) - -class OneCCLTests(Test): - - def __init__(self, jobname, buildno, testname, hw, core_prov, fabric, - hosts, ofi_build_mode, user_env, log_file, util_prov=None): - super().__init__(jobname, buildno, testname, hw, core_prov, fabric, - hosts, ofi_build_mode, user_env, log_file, None, util_prov) - - self.oneccl_path = f'{self.middlewares_path}/oneccl/' - self.test_dir = f'{self.middlewares_path}/oneccl/ci_tests' - if self.util_prov: - self.prov = f"{self.core_prov}\;{self.util_prov}" - else: - self.prov = self.core_prov - self.oneccl_environ = { - 'FI_PROVIDER' : f"\"{self.prov}\"", - 'CCL_ATL_TRANSPORT' : 'ofi', - 'CCL_ATL_TRANSPORT_LIST' : 'ofi' - } - - self.ld_library = [ - f'{self.libfab_installpath}/lib', - f'{self.oneccl_path}/build/_install/lib' - ] - - def export_env(self): - environ = f"source {cloudbees_config.oneapi_root}/setvars.sh; " - environ += f"source {self.oneccl_path}/build/_install/env/vars.sh; " - if self.core_prov == 'psm3': - 
self.oneccl_environ['PSM3_MULTI_EP'] = '1' - - for key, val in self.oneccl_environ.items(): - environ += f"export {key}={val}; " - - ld_library_path = 'LD_LIBRARY_PATH=' - for item in self.ld_library: - ld_library_path += f'{item}:' - - environ += f"export {ld_library_path}$LD_LIBRARY_PATH; " - return environ - - def cmd(self): - return './run.sh ' - - def options(self): - opts = "--mode cpu " - return opts - - @property - def execute_condn(self): - return True - - @property - def execute_condn(self): - return True - - def execute_cmd(self): - curr_dir = os.getcwd() - os.chdir(self.test_dir) - command = f"bash -c \'{self.export_env()} {self.cmd()} "\ - f"{self.options()}\'" - outputcmd = shlex.split(command) - common.run_command(outputcmd) - os.chdir(curr_dir) - -class OneCCLTestsGPU(Test): - - def __init__(self, jobname, buildno, testname, hw, core_prov, fabric, - hosts, ofi_build_mode, user_env, log_file, util_prov=None): - super().__init__(jobname, buildno, testname, hw, core_prov, fabric, - hosts, ofi_build_mode, user_env, log_file, None, util_prov) - - self.n = 2 - self.ppn = 1 - self.oneccl_path = f'{self.middlewares_path}/oneccl_gpu/build' - if self.util_prov: - self.prov = f"{self.core_prov}\;{self.util_prov}" - else: - self.prov = self.core_prov - - self.onecclgpu_environ = { - 'FI_PROVIDER' : self.prov, - # 'LD_PRELOAD' : f"{self.libfab_installpath}/lib/libfabric.so", - 'CCL_ATL_TRANSPORT' : 'ofi', - 'CCL_ROOT' : f"{self.oneccl_path}/_install" - } - - self.ld_library = [ - f'{self.libfab_installpath}/lib', - '$LD_LIBRARY_PATH', - f'{self.oneccl_path}/_install/lib' - ] - - self.tests = { - 'examples' : [ - 'sycl_allgatherv_custom_usm_test', - 'sycl_allgatherv_inplace_test', - 'sycl_allgatherv_inplace_usm_test', - 'sycl_allgatherv_test', - 'sycl_allgatherv_usm_test', - 'sycl_allreduce_inplace_usm_test', - 'sycl_allreduce_test', - 'sycl_allreduce_usm_test', - 'sycl_alltoall_test', - 'sycl_alltoall_usm_test', - 'sycl_alltoallv_test', - 'sycl_alltoallv_usm_test', - 'sycl_broadcast_test', - 'sycl_broadcast_usm_test', - 'sycl_reduce_inplace_usm_test', - 'sycl_reduce_scatter_test', - 'sycl_reduce_scatter_usm_test', - 'sycl_reduce_test', - 'sycl_reduce_usm_test' - ], - 'functional' : [ - 'allgatherv_test', - 'alltoall_test', - 'alltoallv_test', - 'bcast_test', - 'reduce_scatter_test', - 'reduce_test' - ] - } - - def export_env(self): - environ = f"source {cloudbees_config.impi_root}/env/vars.sh "\ - "-i_mpi_internal=0; " - environ += f"source {cloudbees_config.intel_compiler_root}/env/vars.sh; " - for key, val in self.onecclgpu_environ.items(): - environ += f"export {key}={val}; " - - ld_library_path = 'LD_LIBRARY_PATH=' - for item in self.ld_library: - ld_library_path += f'{item}:' - - environ += f"export {ld_library_path}$LD_LIBRARY_PATH; " - return environ - - def cmd(self): - return f"{self.oneccl_path}/_install/bin/mpiexec " - - def options(self): - opts = "-l " - opts += f"-n {self.n} " - opts += f"-ppn {self.ppn} " - opts += f"-hosts {self.server},{self.client} " - return opts - - @property - def execute_condn(self): - return True - - - def execute_cmd(self, oneccl_test_gpu): - curr_dir = os.getcwd() - if 'examples' in oneccl_test_gpu: - os.chdir(f"{self.oneccl_path}/_install/examples/sycl") - else: - os.chdir(f"{self.oneccl_path}/tests/functional") - - for test in self.tests[oneccl_test_gpu]: - if '_usm_' in test: - gpu_selector = 'device' - else: - gpu_selector = 'default' - - command = f"bash -c \'{self.export_env()} {self.cmd()} "\ - f"{self.options()} ./{test} " - if 
'examples' in oneccl_test_gpu: - command += f"gpu {gpu_selector}" - command += "\'" - - outputcmd = shlex.split(command) - common.run_command(outputcmd) - os.chdir(curr_dir) - -class DaosCartTest(Test): - - def __init__(self, jobname, buildno, testname, hw, core_prov, fabric, - hosts, ofi_build_mode, user_env, log_file, util_prov=None): - super().__init__(jobname, buildno, testname, hw, core_prov, fabric, - hosts, ofi_build_mode, user_env, log_file, None, util_prov) - - - self.set_paths(core_prov) - print(core_prov) - self.daos_nodes = cloudbees_config.prov_node_map[core_prov] - print(self.daos_nodes) - self.launch_node = self.daos_nodes[0] - - self.cart_tests = { - 'corpc_one_node' : {'tags' :'cart,corpc,one_node', 'numservers':1, 'numclients':0}, - 'corpc_two_node' : {'tags' :'cart,corpc,two_node', 'numservers':2, 'numclients':0}, - 'ctl_one_node' : {'tags' :'cart,ctl,one_node', 'numservers':1, 'numclients':1}, - 'ghost_rank_rpc_one_node' : {'tags' :'cart,ghost_rank_rpc,one_node', 'numservers':1, 'numclients':0}, - 'group_test' : {'tags' :'cart,group_test,one_node', 'numservers':1, 'numclients':0}, - 'iv_one_node' : {'tags' :'cart,iv,one_node', 'numservers':1, 'numclients':1}, - 'iv_two_node' : {'tags' :'cart,iv,two_node', 'numservers':2, 'numclients':1}, - 'launcher_one_node' : {'tags' :'cart,no_pmix_launcher,one_node','numservers':1, 'numclients':1}, - 'multictx_one_node' : {'tags' :'cart,no_pmix,one_node', 'numservers':1, 'numclients':0}, - 'rpc_one_node' : {'tags' :'cart,rpc,one_node', 'numservers':1, 'numclients':1}, - 'rpc_two_node' : {'tags' :'cart,rpc,two_node','numservers':2, 'numclients':1}, - 'swim_notification' : {'tags' :'cart,rpc,swim_rank_eviction,one_node', 'numservers':1, 'numclients':1} - } - - - def set_paths(self, core_prov): - self.ci_middlewares_path = f'{cloudbees_config.build_dir}/{core_prov}' - self.daos_install_root = f'{self.ci_middlewares_path}/daos/install' - self.cart_test_scripts = f'{self.daos_install_root}/lib/daos/TESTING/ftest' - self.mpipath = f'{cloudbees_config.daos_mpi}/bin' - self.pathlist = [f'{self.daos_install_root}/bin/', self.cart_test_scripts, self.mpipath, \ - f'{self.daos_install_root}/lib/daos/TESTING/tests'] - self.daos_prereq = f'{self.daos_install_root}/prereq' - common.run_command(['rm', '-rf', f'{self.ci_middlewares_path}/daos_logs/*']) - common.run_command(['rm','-rf', f'{self.daos_prereq}/debug/ofi']) - common.run_command(['ln', '-sfn', self.libfab_installpath, f'{self.daos_prereq}/debug/ofi']) - - @property - def cmd(self): - return f"env; echo {common.cloudbees_log_start_string}; "\ - "python3.6 launch.py " - - def remote_launch_cmd(self, testname): - -# The following env variables must be set appropriately prior -# to running the daos/cart tests OFI_DOMAIN, OFI_INTERFACE, -# CRT_PHY_ADDR_STR, PATH, DAOS_TEST_SHARED_DIR DAOS_TEST_LOG_DIR, -# LD_LIBRARY_PATH in the script being sourced below. 
- launch_cmd = f"ssh {self.launch_node} \"source {self.ci_middlewares_path}/daos_ci_env_setup.sh && \ - cd {self.cart_test_scripts} &&\" " - return launch_cmd - - def options(self, testname): - opts = "-s " - opts += f"{self.cart_tests[testname]['tags']} " - - if (self.cart_tests[testname]['numservers'] != 0): - servers = ",".join(self.daos_nodes[:self.cart_tests[testname]['numservers']]) - opts += f"--test_servers={servers} " - if (self.cart_tests[testname]['numclients'] != 0): - clients = ",".join(self.daos_nodes[:self.cart_tests[testname]['numclients']]) - opts += f"--test_clients={clients}" - return opts - - @property - def execute_condn(self): - return True - def execute_cmd(self): - sys.path.append(f'{self.daos_install_root}/lib64/python3.6/site-packages') - os.environ['PYTHONPATH']=f'{self.daos_install_root}/lib64/python3.6/site-packages' - - test_dir=self.cart_test_scripts - curdir=os.getcwd() - os.chdir(test_dir) - for test in self.cart_tests: - print(test) - command = self.remote_launch_cmd(test) + self.cmd + self.options(test) - outputcmd = shlex.split(command) - common.run_logging_command(outputcmd, self.log_file) - print("--------------------TEST COMPLETED----------------------") - os.chdir(curdir) - -class DMABUFTest(Test): - - def __init__(self, jobname, buildno, testname, hw, core_prov, fabric, - hosts, ofi_build_mode, user_env, log_file, util_prov=None): - - super().__init__(jobname, buildno, testname, hw, core_prov, fabric, - hosts, ofi_build_mode, user_env, log_file, - None, util_prov) - self.DMABUFtestpath = f'{self.libfab_installpath}/bin' - self.timeout = 300 - self.n = os.environ['SLURM_NNODES'] if 'SLURM_NNODES' \ - in os.environ.keys() \ - else 0 - - if util_prov: - self.prov = f'{self.core_prov}\;{self.util_prov}' - else: - self.prov = self.core_prov - - self.dmabuf_environ = { - 'ZEX_NUMBER_OF_CCS' : '0:4,1:4', - 'NEOReadDebugKeys' : '1', - 'EnableImplicitScaling' : '0', - 'MLX5_SCATTER_TO_CQE' : '0' - } - - self.tests = { - 'H2H' : [ - 'write', - 'read', - 'send' - ], - 'H2D' : [ - 'write', - 'read', - 'send' - ], - 'D2H' : [ - 'write', - 'read', - 'send' - ], - 'D2D' : [ - 'write', - 'read', - 'send' - ] - } - - @property - def execute_condn(self): - return True if (self.core_prov == 'verbs') \ - else False - - @property - def cmd(self): - return f"{self.DMABUFtestpath}/fi_xe_rdmabw" - - def dmabuf_env(self): - return ' '.join([f"{key}={self.dmabuf_environ[key]}" \ - for key in self.dmabuf_environ]) - - def execute_cmd(self, test_type): - os.chdir(self.DMABUFtestpath) - base_cmd = '' - log_prefix = f"{os.environ['LOG_DIR']}/dmabuf_{self.n}" - if 'H2H' in test_type or 'D2H' in test_type: - base_cmd = f"{self.cmd} -m malloc -p {self.core_prov}" - else: - base_cmd = f"{self.cmd} -m device -d 0 -p {self.core_prov}" - - for test in self.tests[test_type]: - client_command = f"{base_cmd} -t {test} {self.server}" - if 'send' in test: - server_command = f"{base_cmd} -t {test} " - else: - server_command = f"{base_cmd} " - RC = common.ClientServerTest( - f"ssh {self.server} {self.dmabuf_env()} {server_command}", - f"ssh {self.client} {self.dmabuf_env()} {client_command}", - f"{log_prefix}_server.log", f"{log_prefix}_client.log", - self.timeout - ).run() - - if RC == (0, 0): - print("------------------ TEST COMPLETED -------------------") - else: - print("------------------ TEST FAILED -------------------") - sys.exit(f"Exiting with returncode: {RC}") diff --git a/fabtests/Makefile.am b/fabtests/Makefile.am index 03f4be35e5e..4c643bdc1d3 100644 --- a/fabtests/Makefile.am 
+++ b/fabtests/Makefile.am @@ -15,7 +15,6 @@ endif bin_PROGRAMS = \ functional/fi_av_xfer \ functional/fi_msg \ - functional/fi_stream_msg \ functional/fi_msg_sockets \ functional/fi_rdm \ functional/fi_rdm_rma_event \ @@ -112,6 +111,8 @@ nobase_dist_config_DATA = \ test_configs/verbs/all.test \ test_configs/verbs/quick.test \ test_configs/verbs/verbs.exclude \ + test_configs/usnic/all.test \ + test_configs/usnic/quick.test \ test_configs/psm2/all.test \ test_configs/psm2/verify.test \ test_configs/psm2/psm2.exclude \ @@ -126,6 +127,7 @@ nobase_dist_config_DATA = \ test_configs/ofi_rxd/ofi_rxd.exclude \ test_configs/shm/all.test \ test_configs/shm/shm.exclude \ + test_configs/shm/cuda.exclude \ test_configs/shm/quick.test \ test_configs/shm/verify.test \ test_configs/sm2/quick.test \ @@ -182,6 +184,7 @@ nobase_dist_config_DATA = \ pytest/efa/test_mr.py \ pytest/efa/test_efa_shm_addr.py \ pytest/efa/test_env.py \ + pytest/efa/test_multi_ep.py \ pytest/shm/conftest.py \ pytest/shm/shm_common.py \ pytest/shm/test_av.py \ @@ -255,10 +258,6 @@ functional_fi_msg_SOURCES = \ functional/msg.c functional_fi_msg_LDADD = libfabtests.la -functional_fi_stream_msg_SOURCES = \ - functional/stream_msg.c -functional_fi_stream_msg_LDADD = libfabtests.la - functional_fi_rdm_SOURCES = \ functional/rdm.c functional_fi_rdm_LDADD = libfabtests.la diff --git a/fabtests/common/hmem_cuda.c b/fabtests/common/hmem_cuda.c index 704afc7c919..2f02b6f474c 100644 --- a/fabtests/common/hmem_cuda.c +++ b/fabtests/common/hmem_cuda.c @@ -188,7 +188,7 @@ int ft_cuda_init(void) } cuda_ops.cudaFreeHost = dlsym(cudart_handle, STRINGIFY(cudaFreeHost)); - if (!cuda_ops.cudaFree) { + if (!cuda_ops.cudaFreeHost) { FT_ERR("Failed to find cudaFreeHost"); goto err_dlclose_cuda; } @@ -242,21 +242,21 @@ int ft_cuda_init(void) #if HAVE_CUDA_DMABUF cuda_ops.cuMemGetHandleForAddressRange = dlsym(cuda_handle, STRINGIFY(cuMemGetHandleForAddressRange)); - if (!cuda_ops.cuPointerSetAttribute) { + if (!cuda_ops.cuMemGetHandleForAddressRange) { FT_ERR("Failed to find cuMemGetHandleForAddressRange\n"); goto err_dlclose_cuda; } #endif cuda_ops.cuDeviceGetAttribute = dlsym(cuda_handle, STRINGIFY(cuDeviceGetAttribute)); - if (!cuda_ops.cuPointerSetAttribute) { + if (!cuda_ops.cuDeviceGetAttribute) { FT_ERR("Failed to find cuPointerSetAttribute\n"); goto err_dlclose_cuda; } cuda_ops.cuDeviceGet = dlsym(cuda_handle, STRINGIFY(cuDeviceGet)); - if (!cuda_ops.cuPointerSetAttribute) { + if (!cuda_ops.cuDeviceGet) { FT_ERR("Failed to find cuDeviceGet\n"); goto err_dlclose_cuda; } diff --git a/fabtests/common/shared.c b/fabtests/common/shared.c index b5ac7332eb5..a8bf16ddcc0 100644 --- a/fabtests/common/shared.c +++ b/fabtests/common/shared.c @@ -517,7 +517,7 @@ static int ft_alloc_ctx_array(struct ft_context **mr_array, char ***mr_bufs, if (opts.options & FT_OPT_ALLOC_MULT_MR) { *mr_bufs = calloc(opts.window_size, sizeof(**mr_bufs)); - if (!mr_bufs) + if (!*mr_bufs) return -FI_ENOMEM; } @@ -749,7 +749,8 @@ int ft_open_fabric_res(void) int ft_alloc_ep_res(struct fi_info *fi, struct fid_cq **new_txcq, struct fid_cq **new_rxcq, struct fid_cntr **new_txcntr, struct fid_cntr **new_rxcntr, - struct fid_cntr **new_rma_cntr) + struct fid_cntr **new_rma_cntr, + struct fid_av **new_av) { int ret; @@ -834,7 +835,7 @@ int ft_alloc_ep_res(struct fi_info *fi, struct fid_cq **new_txcq, } } - if (!av && (fi->ep_attr->type == FI_EP_RDM || fi->ep_attr->type == FI_EP_DGRAM)) { + if (!*new_av && (fi->ep_attr->type == FI_EP_RDM || fi->ep_attr->type == FI_EP_DGRAM)) { if 
(fi->domain_attr->av_type != FI_AV_UNSPEC) av_attr.type = fi->domain_attr->av_type; @@ -842,7 +843,7 @@ int ft_alloc_ep_res(struct fi_info *fi, struct fid_cq **new_txcq, av_attr.name = opts.av_name; } av_attr.count = opts.av_size; - ret = fi_av_open(domain, &av_attr, &av, NULL); + ret = fi_av_open(domain, &av_attr, new_av, NULL); if (ret) { FT_PRINTERR("fi_av_open", ret); return ret; @@ -854,7 +855,7 @@ int ft_alloc_ep_res(struct fi_info *fi, struct fid_cq **new_txcq, int ft_alloc_active_res(struct fi_info *fi) { int ret; - ret = ft_alloc_ep_res(fi, &txcq, &rxcq, &txcntr, &rxcntr, &rma_cntr); + ret = ft_alloc_ep_res(fi, &txcq, &rxcq, &txcntr, &rxcntr, &rma_cntr, &av); if (ret) return ret; @@ -2064,9 +2065,9 @@ void init_test(struct ft_opts *opts, char *test_name, size_t test_name_len) void ft_force_progress(void) { if (txcq) - fi_cq_read(txcq, NULL, 0); + (void) fi_cq_read(txcq, NULL, 0); if (rxcq) - fi_cq_read(rxcq, NULL, 0); + (void) fi_cq_read(rxcq, NULL, 0); } int ft_progress(struct fid_cq *cq, uint64_t total, uint64_t *cq_cntr) @@ -2764,16 +2765,33 @@ int ft_get_tx_comp(uint64_t total) return ret; } +int ft_tx_msg(struct fid_ep *ep, fi_addr_t fi_addr, size_t size, void *ctx, uint64_t flags) +{ + int ret; + + if (ft_check_opts(FT_OPT_VERIFY_DATA | FT_OPT_ACTIVE)) { + ret = ft_fill_buf((char *) tx_buf + ft_tx_prefix_size(), size); + if (ret) + return ret; + } + + ret = ft_sendmsg(ep, fi_addr, size, ctx, flags); + if (ret) + return ret; + + ret = ft_get_tx_comp(tx_seq); + return ret; +} + int ft_sendmsg(struct fid_ep *ep, fi_addr_t fi_addr, size_t size, void *ctx, int flags) { - int ret; struct fi_msg msg; struct fi_msg_tagged tagged_msg; struct iovec msg_iov; msg_iov.iov_base = tx_buf; - msg_iov.iov_len = size; + msg_iov.iov_len = size + ft_tx_prefix_size(); if (hints->caps & FI_TAGGED) { tagged_msg.msg_iov = &msg_iov; @@ -2785,11 +2803,9 @@ int ft_sendmsg(struct fid_ep *ep, fi_addr_t fi_addr, tagged_msg.tag = ft_tag ? ft_tag : tx_seq; tagged_msg.ignore = 0; - ret = fi_tsendmsg(ep, &tagged_msg, flags); - if (ret) { - FT_PRINTERR("fi_tsendmsg", ret); - return ret; - } + FT_POST(fi_tsendmsg, ft_progress, txcq, tx_seq, + &tx_cq_cntr, "tsendmsg", ep, &tagged_msg, + flags); } else { msg.msg_iov = &msg_iov; msg.desc = &mr_desc; @@ -2798,20 +2814,18 @@ int ft_sendmsg(struct fid_ep *ep, fi_addr_t fi_addr, msg.data = NO_CQ_DATA; msg.context = ctx; - ret = fi_sendmsg(ep, &msg, flags); - if (ret) { - FT_PRINTERR("fi_sendmsg", ret); - return ret; - } + FT_POST(fi_sendmsg, ft_progress, txcq, tx_seq, + &tx_cq_cntr, "sendmsg", ep, &msg, + flags); } return 0; } + int ft_recvmsg(struct fid_ep *ep, fi_addr_t fi_addr, size_t size, void *ctx, int flags) { - int ret; struct fi_msg msg; struct fi_msg_tagged tagged_msg; struct iovec msg_iov; @@ -2829,11 +2843,9 @@ int ft_recvmsg(struct fid_ep *ep, fi_addr_t fi_addr, tagged_msg.tag = ft_tag ? 
ft_tag : tx_seq; tagged_msg.ignore = 0; - ret = fi_trecvmsg(ep, &tagged_msg, flags); - if (ret) { - FT_PRINTERR("fi_trecvmsg", ret); - return ret; - } + FT_POST(fi_trecvmsg, ft_progress, rxcq, rx_seq, + &rx_cq_cntr, "trecvmsg", ep, &tagged_msg, + flags); } else { msg.msg_iov = &msg_iov; msg.desc = &mr_desc; @@ -2842,11 +2854,9 @@ int ft_recvmsg(struct fid_ep *ep, fi_addr_t fi_addr, msg.data = NO_CQ_DATA; msg.context = ctx; - ret = fi_recvmsg(ep, &msg, flags); - if (ret) { - FT_PRINTERR("fi_recvmsg", ret); - return ret; - } + FT_POST(fi_recvmsg, ft_progress, rxcq, rx_seq, + &rx_cq_cntr, "recvmsg", ep, &msg, + flags); } return 0; @@ -2913,12 +2923,12 @@ void eq_readerr(struct fid_eq *eq, const char *eq_str) int ft_sync() { - char buf; + char buf = 'a'; int ret; if (opts.dst_addr) { if (!(opts.options & FT_OPT_OOB_SYNC)) { - ret = ft_tx(ep, remote_fi_addr, 1, &tx_ctx); + ret = ft_tx_msg(ep, remote_fi_addr, 1, &tx_ctx, FI_DELIVERY_COMPLETE); if (ret) return ret; @@ -2938,7 +2948,9 @@ int ft_sync() if (ret) return ret; - ret = ft_tx(ep, remote_fi_addr, 1, &tx_ctx); + ret = ft_tx_msg(ep, remote_fi_addr, 1, &tx_ctx, FI_DELIVERY_COMPLETE); + if (ret) + return ret; } else { ret = ft_sock_recv(oob_sock, &buf, 1); if (ret) @@ -3052,42 +3064,12 @@ int ft_wait_child(void) int ft_finalize_ep(struct fid_ep *ep) { - struct iovec iov; int ret; struct fi_context ctx; - iov.iov_base = tx_buf; - iov.iov_len = 4 + ft_tx_prefix_size(); - - if (hints->caps & FI_TAGGED) { - struct fi_msg_tagged tmsg; - - memset(&tmsg, 0, sizeof tmsg); - tmsg.msg_iov = &iov; - tmsg.desc = &mr_desc; - tmsg.iov_count = 1; - tmsg.addr = remote_fi_addr; - tmsg.tag = tx_seq; - tmsg.ignore = 0; - tmsg.context = &ctx; - - FT_POST(fi_tsendmsg, ft_progress, txcq, tx_seq, - &tx_cq_cntr, "tsendmsg", ep, &tmsg, - FI_TRANSMIT_COMPLETE); - } else { - struct fi_msg msg; - - memset(&msg, 0, sizeof msg); - msg.msg_iov = &iov; - msg.desc = &mr_desc; - msg.iov_count = 1; - msg.addr = remote_fi_addr; - msg.context = &ctx; - - FT_POST(fi_sendmsg, ft_progress, txcq, tx_seq, - &tx_cq_cntr, "sendmsg", ep, &msg, - FI_TRANSMIT_COMPLETE); - } + ret = ft_sendmsg(ep, remote_fi_addr, 4, &ctx, FI_TRANSMIT_COMPLETE); + if (ret) + return ret; ret = ft_get_tx_comp(tx_seq); if (ret) @@ -3464,6 +3446,7 @@ void ft_parse_opts_list(char* optarg) { int i, ret; char *token; + char *saveptr; optarg += 2; // remove 'l:' test_cnt = 1; @@ -3476,7 +3459,7 @@ void ft_parse_opts_list(char* optarg) exit(EXIT_FAILURE); } - token = strtok(optarg, ","); + token = strtok_r(optarg, ",", &saveptr); test_cnt = 0; while (token != NULL) { ret = sscanf(token, "%zu", &user_test_sizes[test_cnt].size); @@ -3485,7 +3468,7 @@ void ft_parse_opts_list(char* optarg) exit(EXIT_FAILURE); } test_cnt++; - token = strtok(NULL, ","); + token = strtok_r(NULL, ",", &saveptr); } test_size = user_test_sizes; } @@ -3821,7 +3804,7 @@ int ft_sock_listen(char *node, char *service) goto out; } - ret = listen(listen_sock, 0); + ret = listen(listen_sock, 511); if (ret) perror("listen"); @@ -3924,13 +3907,32 @@ int ft_sock_recv(int fd, void *msg, size_t len) int ft_sock_sync(int value) { int result = -FI_EOTHER; + int ret; if (listen_sock < 0) { - ft_sock_send(sock, &value, sizeof value); - ft_sock_recv(sock, &result, sizeof result); + ret = ft_sock_send(sock, &value, sizeof value); + if (ret) { + FT_PRINTERR("ft_sock_send", ret); + return ret; + } + + ret = ft_sock_recv(sock, &result, sizeof result); + if (ret) { + FT_PRINTERR("ft_sock_recv", ret); + return ret; + } } else { - ft_sock_recv(sock, &result, sizeof 
result); - ft_sock_send(sock, &value, sizeof value); + ret = ft_sock_recv(sock, &result, sizeof result); + if (ret) { + FT_PRINTERR("ft_sock_recv", ret); + return ret; + } + + ret = ft_sock_send(sock, &value, sizeof value); + if (ret) { + FT_PRINTERR("ft_sock_send", ret); + return ret; + } } return result; @@ -4135,6 +4137,10 @@ void ft_longopts_usage() FT_PRINT_OPTS_USAGE("--debug-assert", "Replace asserts with while loops to force process to\n" "spin until a debugger can be attached."); + FT_PRINT_OPTS_USAGE("--data-progress ", + "manual, or auto"); + FT_PRINT_OPTS_USAGE("--control-progress ", + "manual, auto, or unified"); } int debug_assert; @@ -4144,9 +4150,25 @@ struct option long_opts[] = { {"pin-core", required_argument, NULL, LONG_OPT_PIN_CORE}, {"timeout", required_argument, NULL, LONG_OPT_TIMEOUT}, {"debug-assert", no_argument, &debug_assert, LONG_OPT_DEBUG_ASSERT}, + {"data-progress", required_argument, NULL, LONG_OPT_DATA_PROGRESS}, + {"control-progress", required_argument, NULL, LONG_OPT_CONTROL_PROGRESS}, {NULL, 0, NULL, 0}, }; +int ft_parse_progress_model_string(char* progress_str) +{ + int ret = -1; + + if (!strcasecmp("manual", progress_str)) + ret = FI_PROGRESS_MANUAL; + else if (!strcasecmp("auto", progress_str)) + ret = FI_PROGRESS_AUTO; + else if (!strcasecmp("unified", progress_str)) + ret = FI_PROGRESS_CONTROL_UNIFIED; + + return ret; +} + int ft_parse_long_opts(int op, char *optarg) { switch (op) { @@ -4157,6 +4179,16 @@ int ft_parse_long_opts(int op, char *optarg) return 0; case LONG_OPT_DEBUG_ASSERT: return 0; + case LONG_OPT_DATA_PROGRESS: + hints->domain_attr->data_progress = ft_parse_progress_model_string(optarg); + if (hints->domain_attr->data_progress == -1) + return EXIT_FAILURE; + return 0; + case LONG_OPT_CONTROL_PROGRESS: + hints->domain_attr->control_progress = ft_parse_progress_model_string(optarg); + if (hints->domain_attr->control_progress == -1) + return EXIT_FAILURE; + return 0; default: return EXIT_FAILURE; } diff --git a/fabtests/component/dmabuf-rdma/fi-rdmabw-xe.c b/fabtests/component/dmabuf-rdma/fi-rdmabw-xe.c index 8bcd42dfa8b..4ad6e009950 100644 --- a/fabtests/component/dmabuf-rdma/fi-rdmabw-xe.c +++ b/fabtests/component/dmabuf-rdma/fi-rdmabw-xe.c @@ -515,19 +515,20 @@ static void init_ofi(int sockfd, char *server_name, int port, int test_type) int err; size_t len; char *domain_name; + char *saveptr; EXIT_ON_NULL((context_pool = init_context_pool(TX_DEPTH + 1))); num_nics = 0; if (domain_names) { - domain_name = strtok(domain_names, ","); + domain_name = strtok_r(domain_names, ",", &saveptr); while (domain_name && num_nics < MAX_NICS) { err = init_nic(num_nics, domain_name, server_name, port, test_type); if (err) return; num_nics++; - domain_name = strtok(NULL, ","); + domain_name = strtok_r(NULL, ",", &saveptr); } } else { err = init_nic(num_nics, NULL, server_name, port, test_type); @@ -1104,11 +1105,12 @@ static inline int string_to_location(char *s, int default_loc) void parse_buf_location(char *string, int *loc1, int *loc2, int default_loc) { char *s; + char *saveptr; - s = strtok(string, ":"); + s = strtok_r(string, ":", &saveptr); if (s) { *loc1 = string_to_location(s, default_loc); - s = strtok(NULL, ":"); + s = strtok_r(NULL, ":", &saveptr); if (s) *loc2 = string_to_location(s, default_loc); else diff --git a/fabtests/component/dmabuf-rdma/rdmabw-xe.c b/fabtests/component/dmabuf-rdma/rdmabw-xe.c index 88759f35b00..c412576451f 100644 --- a/fabtests/component/dmabuf-rdma/rdmabw-xe.c +++ b/fabtests/component/dmabuf-rdma/rdmabw-xe.c @@ 
-408,6 +408,7 @@ static void init_ib(char *ibdev_names, int sockfd) int ib_port = 1; char *ibdev_name; int i, j; + char *saveptr; dev_list = ibv_get_device_list(NULL); if (!dev_list) { @@ -422,11 +423,11 @@ static void init_ib(char *ibdev_names, int sockfd) num_nics = 0; if (ibdev_names) { - ibdev_name = strtok(ibdev_names, ","); + ibdev_name = strtok_r(ibdev_names, ",", &saveptr); while (ibdev_name && num_nics < MAX_NICS) { EXIT_ON_ERROR(init_nic(num_nics, ibdev_name, ib_port)); num_nics++; - ibdev_name = strtok(NULL, ","); + ibdev_name = strtok_r(NULL, ",", &saveptr); } } else { EXIT_ON_ERROR(init_nic(0, NULL, ib_port)); diff --git a/fabtests/component/dmabuf-rdma/xe.c b/fabtests/component/dmabuf-rdma/xe.c index 9068534131e..5416690fb60 100644 --- a/fabtests/component/dmabuf-rdma/xe.c +++ b/fabtests/component/dmabuf-rdma/xe.c @@ -137,6 +137,7 @@ int xe_init(char *gpu_dev_nums, int enable_multi_gpu) char *gpu_dev_num; char *s; int dev_num, subdev_num; + char *saveptr; EXIT_ON_ERROR(init_libze_ops()); EXIT_ON_ERROR(libze_ops.zeInit(ZE_INIT_FLAG_GPU_ONLY)); @@ -150,7 +151,7 @@ int xe_init(char *gpu_dev_nums, int enable_multi_gpu) num_gpus = 0; if (gpu_dev_nums) { - gpu_dev_num = strtok(gpu_dev_nums, ","); + gpu_dev_num = strtok_r(gpu_dev_nums, ",", &saveptr); while (gpu_dev_num && num_gpus < MAX_GPUS) { dev_num = 0; subdev_num = -1; @@ -158,7 +159,7 @@ int xe_init(char *gpu_dev_nums, int enable_multi_gpu) s = strchr(gpu_dev_num, '.'); if (s) subdev_num = atoi(s + 1); - gpu_dev_num = strtok(NULL, ","); + gpu_dev_num = strtok_r(NULL, ",", &saveptr); if (init_gpu(num_gpus, dev_num, subdev_num) < 0) continue; diff --git a/fabtests/component/sock_test.c b/fabtests/component/sock_test.c index 9d5365cbaf4..4a7ed4b9435 100755 --- a/fabtests/component/sock_test.c +++ b/fabtests/component/sock_test.c @@ -113,7 +113,7 @@ static int start_server(void) goto close; } - ret = listen(listen_sock, 0); + ret = listen(listen_sock, 511); if (ret) { FT_PRINTERR("listen", -errno); goto close; @@ -470,7 +470,9 @@ static int run_server(void) if (ret) return ret; - send(fds[0], &c, 1, 0); + ret = send(fds[0], &c, 1, 0); + if (ret < 0) + return -errno; return 0; } diff --git a/fabtests/configure.ac b/fabtests/configure.ac index c8426bb87ce..1ce4b9ed915 100644 --- a/fabtests/configure.ac +++ b/fabtests/configure.ac @@ -5,7 +5,7 @@ dnl dnl Process this file with autoconf to produce a configure script. 
AC_PREREQ(2.57) -AC_INIT([fabtests], [1.21.0a1], [ofiwg@lists.openfabrics.org]) +AC_INIT([fabtests], [1.22.0a1], [ofiwg@lists.openfabrics.org]) AC_CONFIG_AUX_DIR(config) AC_CONFIG_MACRO_DIR(config) AC_CONFIG_HEADERS(config.h) diff --git a/fabtests/functional/inject_test.c b/fabtests/functional/inject_test.c index 01c4b2658bb..b70e36e7885 100644 --- a/fabtests/functional/inject_test.c +++ b/fabtests/functional/inject_test.c @@ -52,23 +52,9 @@ static int send_msg(int sendmsg, size_t size) } if (sendmsg) { - while(1) { - ret = ft_sendmsg(ep, remote_fi_addr, size, - &tx_ctx, flag); - if (!ret) - break; - - if (ret != -FI_EAGAIN) { - FT_PRINTERR("ft_sendmsg", ret); - return ret; - } - - ret = ft_progress(txcq, tx_seq, &tx_cq_cntr); - if (ret && ret != -FI_EAGAIN) { - FT_ERR("Failed to get send completion"); - return ret; - } - } + ret = ft_sendmsg(ep, remote_fi_addr, size, &tx_ctx, flag); + if (ret) + return ret; } else { ret = ft_post_tx(ep, remote_fi_addr, size, NO_CQ_DATA, &tx_ctx); if (ret) { diff --git a/fabtests/functional/multi_ep.c b/fabtests/functional/multi_ep.c index b73e9e1850f..4e9b25edc76 100644 --- a/fabtests/functional/multi_ep.c +++ b/fabtests/functional/multi_ep.c @@ -54,11 +54,19 @@ static char **recv_bufs; static struct fi_context *recv_ctx; static struct fi_context *send_ctx; static struct fid_cq **txcqs, **rxcqs; +static struct fid_av **avs; static struct fid_mr *data_mr = NULL; static void *data_desc = NULL; static fi_addr_t *remote_addr; +static bool shared_cq = false; +static bool shared_av = false; int num_eps = 3; +enum { + LONG_OPT_SHARED_AV, + LONG_OPT_SHARED_CQ, +}; + static void free_ep_res() { int i; @@ -66,8 +74,12 @@ static void free_ep_res() FT_CLOSE_FID(data_mr); for (i = 0; i < num_eps; i++) { FT_CLOSE_FID(eps[i]); + } + + for (i = 0; i < num_eps; i++) { FT_CLOSE_FID(txcqs[i]); FT_CLOSE_FID(rxcqs[i]); + FT_CLOSE_FID(avs[i]); } free(txcqs); @@ -79,6 +91,7 @@ static void free_ep_res() free(recv_ctx); free(remote_addr); free(eps); + free(avs); } static int alloc_multi_ep_res() @@ -95,6 +108,7 @@ static int alloc_multi_ep_res() data_bufs = calloc(num_eps * 2, opts.transfer_size); txcqs = calloc(num_eps, sizeof(*txcqs)); rxcqs = calloc(num_eps, sizeof(*rxcqs)); + avs = calloc(num_eps, sizeof(*avs)); if (!eps || !remote_addr || !send_bufs || !recv_bufs || !send_ctx || !recv_ctx || !data_bufs || !txcqs || !rxcqs) @@ -119,13 +133,16 @@ static int alloc_multi_ep_res() static int ep_post_rx(int idx) { - int ret; + int ret, cq_read_idx = idx; + + if (shared_cq) + cq_read_idx = 0; do { ret = fi_recv(eps[idx], recv_bufs[idx], opts.transfer_size, data_desc, FI_ADDR_UNSPEC, &recv_ctx[idx]); if (ret == -FI_EAGAIN) - (void) fi_cq_read(rxcqs[idx], NULL, 0); + (void) fi_cq_read(rxcqs[cq_read_idx], NULL, 0); } while (ret == -FI_EAGAIN); @@ -134,16 +151,22 @@ static int ep_post_rx(int idx) static int ep_post_tx(int idx) { - int ret; + int ret, cq_read_idx = idx; - if (ft_check_opts(FT_OPT_VERIFY_DATA)) - ft_fill_buf(send_bufs[idx], opts.transfer_size); + if (shared_cq) + cq_read_idx = 0; + + if (ft_check_opts(FT_OPT_VERIFY_DATA)) { + ret = ft_fill_buf(send_bufs[idx], opts.transfer_size); + if (ret) + return ret; + } do { ret = fi_send(eps[idx], send_bufs[idx], opts.transfer_size, data_desc, remote_addr[idx], &send_ctx[idx]); if (ret == -FI_EAGAIN) - (void) fi_cq_read(txcqs[idx], NULL, 0); + (void) fi_cq_read(txcqs[cq_read_idx], NULL, 0); } while (ret == -FI_EAGAIN); @@ -152,7 +175,7 @@ static int ep_post_tx(int idx) static int do_transfers(void) { - int i, ret; + int i, ret, 
cq_read_idx; uint64_t cur; for (i = 0; i < num_eps; i++) { @@ -174,13 +197,17 @@ static int do_transfers(void) printf("Wait for all messages from peer\n"); for (i = 0; i < num_eps; i++) { + if (shared_cq) + cq_read_idx = 0; + else + cq_read_idx = i; cur = 0; - ret = ft_get_cq_comp(txcqs[i], &cur, 1, -1); + ret = ft_get_cq_comp(txcqs[cq_read_idx], &cur, 1, -1); if (ret < 0) return ret; cur = 0; - ret = ft_get_cq_comp(rxcqs[i], &cur, 1, -1); + ret = ft_get_cq_comp(rxcqs[cq_read_idx], &cur, 1, -1); if (ret < 0) return ret; } @@ -204,7 +231,13 @@ static int do_transfers(void) static int setup_client_ep(int idx) { - int ret; + int ret, av_bind_idx = idx, cq_bind_idx = idx; + + if (shared_cq) + cq_bind_idx = 0; + + if (shared_av) + av_bind_idx = 0; ret = fi_endpoint(domain, fi, &eps[idx], NULL); if (ret) { @@ -212,11 +245,11 @@ static int setup_client_ep(int idx) return ret; } - ret = ft_alloc_ep_res(fi, &txcqs[idx], &rxcqs[idx], NULL, NULL, NULL); + ret = ft_alloc_ep_res(fi, &txcqs[idx], &rxcqs[idx], NULL, NULL, NULL, &avs[idx]); if (ret) return ret; - ret = ft_enable_ep(eps[idx], eq, av, txcqs[idx], rxcqs[idx], + ret = ft_enable_ep(eps[idx], eq, avs[av_bind_idx], txcqs[cq_bind_idx], rxcqs[cq_bind_idx], NULL, NULL, NULL); if (ret) return ret; @@ -230,7 +263,13 @@ static int setup_client_ep(int idx) static int setup_server_ep(int idx) { - int ret; + int ret, av_bind_idx = idx, cq_bind_idx = idx; + + if (shared_cq) + cq_bind_idx = 0; + + if (shared_av) + av_bind_idx = 0; ret = ft_retrieve_conn_req(eq, &fi); if (ret) @@ -242,11 +281,11 @@ static int setup_server_ep(int idx) goto failed_accept; } - ret = ft_alloc_ep_res(fi, &txcqs[idx], &rxcqs[idx], NULL, NULL, NULL); + ret = ft_alloc_ep_res(fi, &txcqs[idx], &rxcqs[idx], NULL, NULL, NULL, &avs[idx]); if (ret) return ret; - ret = ft_enable_ep(eps[idx], eq, av, txcqs[idx], rxcqs[idx], + ret = ft_enable_ep(eps[idx], eq, avs[av_bind_idx], txcqs[cq_bind_idx], rxcqs[cq_bind_idx], NULL, NULL, NULL); if (ret) goto failed_accept; @@ -270,6 +309,7 @@ static int setup_av_ep(int idx) hints = fi_dupinfo(fi); fi_freeinfo(fi); + free(hints->src_addr); hints->src_addr = NULL; hints->src_addrlen = 0; @@ -285,7 +325,7 @@ static int setup_av_ep(int idx) return ret; } - ret = ft_alloc_ep_res(fi, &txcqs[idx], &rxcqs[idx], NULL, NULL, NULL); + ret = ft_alloc_ep_res(fi, &txcqs[idx], &rxcqs[idx], NULL, NULL, NULL, &avs[idx]); if (ret) return ret; @@ -294,14 +334,20 @@ static int setup_av_ep(int idx) static int enable_ep(int idx) { - int ret; + int ret, av_bind_idx = idx, cq_bind_idx = idx; + + if (shared_cq) + cq_bind_idx = 0; + + if (shared_av) + av_bind_idx = 0; - ret = ft_enable_ep(eps[idx], eq, av, txcqs[idx], rxcqs[idx], + ret = ft_enable_ep(eps[idx], eq, avs[av_bind_idx], txcqs[cq_bind_idx], rxcqs[cq_bind_idx], NULL, NULL, NULL); if (ret) return ret; - ret = ft_init_av_addr(av, eps[idx], &remote_addr[idx]); + ret = ft_init_av_addr(avs[av_bind_idx], eps[idx], &remote_addr[idx]); if (ret) return ret; @@ -375,7 +421,15 @@ int main(int argc, char **argv) if (!hints) return EXIT_FAILURE; - while ((op = getopt(argc, argv, "c:vh" ADDR_OPTS INFO_OPTS)) != -1) { + int lopt_idx = 0; + struct option long_opts[] = { + {"shared-av", no_argument, NULL, LONG_OPT_SHARED_AV}, + {"shared-cq", no_argument, NULL, LONG_OPT_SHARED_CQ}, + {0, 0, 0, 0} + }; + + while ((op = getopt_long(argc, argv, "c:vh" ADDR_OPTS INFO_OPTS, + long_opts, &lopt_idx)) != -1) { switch (op) { default: ft_parse_addr_opts(op, optarg, &opts); @@ -387,12 +441,24 @@ int main(int argc, char **argv) case 'v': 
opts.options |= FT_OPT_VERIFY_DATA; break; + case LONG_OPT_SHARED_AV: + shared_av = true; + break; + case LONG_OPT_SHARED_CQ: + shared_cq = true; + break; case '?': case 'h': ft_usage(argv[0], "Multi endpoint test"); FT_PRINT_OPTS_USAGE("-c ", "number of endpoints to create and test (def 3)"); FT_PRINT_OPTS_USAGE("-v", "Enable data verification"); + FT_PRINT_OPTS_USAGE("--shared-cq", + "Share tx/rx cq among endpoints. \n" + "By default each ep has its own tx/rx cq"); + FT_PRINT_OPTS_USAGE("--shared-av", + "Share the av among endpoints. \n" + "By default each ep has its own av"); return EXIT_FAILURE; } } diff --git a/fabtests/functional/rdm_deferred_wq.c b/fabtests/functional/rdm_deferred_wq.c index a250a17d99b..7526c709861 100644 --- a/fabtests/functional/rdm_deferred_wq.c +++ b/fabtests/functional/rdm_deferred_wq.c @@ -74,6 +74,7 @@ static void format_simple_msg_tagged(struct fi_msg_tagged *msg, struct iovec *io msg->addr = remote_fi_addr; msg->data = 0; msg->tag = tag; + msg->ignore = 0; iov->iov_base = src; iov->iov_len = size; @@ -89,6 +90,7 @@ static void format_simple_msg_rma(struct fi_msg_rma *msg, struct iovec *iov, msg->iov_count = 1; msg->addr = remote_fi_addr; msg->rma_iov_count = 1; + msg->data = 0; iov->iov_base = src; iov->iov_len = size; diff --git a/fabtests/functional/rdm_stress.c b/fabtests/functional/rdm_stress.c index 7a212e3b337..8f035650b1a 100644 --- a/fabtests/functional/rdm_stress.c +++ b/fabtests/functional/rdm_stress.c @@ -187,7 +187,7 @@ static int rpc_inject(struct rpc_hdr *hdr, fi_addr_t addr) start = ft_gettime_ms(); do { - fi_cq_read(txcq, NULL, 0); + (void) fi_cq_read(txcq, NULL, 0); ret = (int) fi_inject(ep, hdr, sizeof(*hdr), addr); } while ((ret == -FI_EAGAIN) && (ft_gettime_ms() - start < rpc_timeout)); @@ -205,7 +205,7 @@ static int rpc_send(struct rpc_hdr *hdr, size_t size, fi_addr_t addr) start = ft_gettime_ms(); do { - fi_cq_read(txcq, NULL, 0); + (void) fi_cq_read(txcq, NULL, 0); ret = (int) fi_send(ep, hdr, size, NULL, addr, hdr); } while ((ret == -FI_EAGAIN) && (ft_gettime_ms() - start < rpc_timeout)); @@ -236,7 +236,7 @@ static int rpc_deliver(struct rpc_hdr *hdr, size_t size, fi_addr_t addr) start = ft_gettime_ms(); do { - fi_cq_read(txcq, NULL, 0); + (void) fi_cq_read(txcq, NULL, 0); ret = (int) fi_sendmsg(ep, &msg, FI_DELIVERY_COMPLETE); } while ((ret == -FI_EAGAIN) && (ft_gettime_ms() - start < rpc_timeout)); @@ -455,7 +455,7 @@ static int rpc_reg_buf(struct rpc_ctrl *ctrl, size_t size, uint64_t access) return FI_SUCCESS; close: - fi_close(&ctrl->mr->fid); + FT_CLOSE_FID(ctrl->mr); return ret; } @@ -470,7 +470,9 @@ static int rpc_read_req(struct rpc_ctrl *ctrl) if (!ctrl->buf) return -FI_ENOMEM; - ft_fill_buf(&ctrl->buf[ctrl->offset], ctrl->size); + ret = ft_fill_buf(&ctrl->buf[ctrl->offset], ctrl->size); + if (ret) + goto free; ret = rpc_reg_buf(ctrl, size, FI_REMOTE_READ); if (ret) @@ -492,7 +494,7 @@ static int rpc_read_req(struct rpc_ctrl *ctrl) return 0; close: - fi_close(&ctrl->mr->fid); + FT_CLOSE_FID(ctrl->mr); free: free(ctrl->buf); return ret; @@ -513,7 +515,7 @@ static int rpc_read_resp(struct rpc_ctrl *ctrl) ret = ft_check_buf(&req->buf[req->offset], req->size); close: - fi_close(&req->mr->fid); + FT_CLOSE_FID(req->mr); free(req->buf); return ret; } @@ -549,7 +551,7 @@ static int rpc_write_req(struct rpc_ctrl *ctrl) return 0; close: - fi_close(&ctrl->mr->fid); + FT_CLOSE_FID(ctrl->mr); free: free(ctrl->buf); return ret; @@ -570,7 +572,7 @@ static int rpc_write_resp(struct rpc_ctrl *ctrl) ret = ft_check_buf(&req->buf[req->offset], 
req->size); close: - fi_close(&req->mr->fid); + FT_CLOSE_FID(req->mr); free(req->buf); return ret; } @@ -810,7 +812,8 @@ static int init_ctrls(const char *ctrlfile) if (stat(ctrlfile, &sb)) { FT_PRINTERR("stat", -errno); - return -errno; + ret = -errno; + goto no_mem_out; } js = malloc(sb.st_size + 1); @@ -987,7 +990,8 @@ static void complete_rpc(struct rpc_resp *resp) } if (resp->mr) - fi_close(&resp->mr->fid); + FT_CLOSE_FID(resp->mr); + (void) ft_check_buf(resp + 1, resp->hdr.size); free(resp); } @@ -1109,7 +1113,11 @@ static void start_rpc(struct rpc_hdr *req) goto free; resp->hdr = *req; - ft_fill_buf(resp + 1, resp->hdr.size); + ret = ft_fill_buf(resp + 1, resp->hdr.size); + if (ret) { + free(resp); + goto free; + } start = ft_gettime_ms(); do { diff --git a/fabtests/functional/rdm_tagged_peek.c b/fabtests/functional/rdm_tagged_peek.c index a8d95dab9c2..c583d37013b 100644 --- a/fabtests/functional/rdm_tagged_peek.c +++ b/fabtests/functional/rdm_tagged_peek.c @@ -352,6 +352,7 @@ int main(int argc, char **argv) hints->domain_attr->resource_mgmt = FI_RM_ENABLED; hints->tx_attr->msg_order = FI_ORDER_SAS; + hints->rx_attr->msg_order = FI_ORDER_SAS; hints->ep_attr->type = FI_EP_RDM; hints->caps = FI_TAGGED; hints->mode = FI_CONTEXT; diff --git a/fabtests/functional/scalable_ep.c b/fabtests/functional/scalable_ep.c index bed3cd94143..1b8a396ff0a 100644 --- a/fabtests/functional/scalable_ep.c +++ b/fabtests/functional/scalable_ep.c @@ -80,7 +80,7 @@ static int alloc_ep_res(struct fid_ep *sep) av_attr.rx_ctx_bits = rx_ctx_bits; - ret = ft_alloc_ep_res(fi, &txcq, &rxcq, &txcntr, &rxcntr, NULL); + ret = ft_alloc_ep_res(fi, &txcq, &rxcq, &txcntr, &rxcntr, NULL, &av); if (ret) return ret; diff --git a/fabtests/functional/shared_ctx.c b/fabtests/functional/shared_ctx.c index 43093944804..016a56e87fc 100644 --- a/fabtests/functional/shared_ctx.c +++ b/fabtests/functional/shared_ctx.c @@ -295,7 +295,7 @@ static int init_fabric(void) av_attr.count = ep_cnt; - ret = ft_alloc_ep_res(fi, &txcq, &rxcq, &txcntr, &rxcntr, NULL); + ret = ft_alloc_ep_res(fi, &txcq, &rxcq, &txcntr, &rxcntr, NULL, &av); if (ret) return ret; @@ -350,7 +350,7 @@ static int client_connect(void) if (ret) return ret; - ret = ft_alloc_ep_res(fi, &txcq, &rxcq, &txcntr, &rxcntr, NULL); + ret = ft_alloc_ep_res(fi, &txcq, &rxcq, &txcntr, &rxcntr, NULL, &av); if (ret) return ret; @@ -436,7 +436,7 @@ static int server_connect(void) goto err; ret = ft_alloc_ep_res(fi, &txcq, &rxcq, &txcntr, - &rxcntr, NULL); + &rxcntr, NULL, &av); if (ret) goto err; } diff --git a/fabtests/functional/stream_msg.c b/fabtests/functional/stream_msg.c deleted file mode 100644 index ee6de1eab2f..00000000000 --- a/fabtests/functional/stream_msg.c +++ /dev/null @@ -1,277 +0,0 @@ -/* - * Copyright (c) 2018 Intel Corporation. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. 
- * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include -#include -#include -#include "shared.h" -#include -#include -#include -#include - -const char *msg = "hello stream!"; - - -int send_stream(struct fid_ep *ep, const char *msg, size_t msg_len) -{ - int offset, ret; - - for (offset = 0; offset < msg_len; ) { - ret = fi_send(ep, (msg + offset), (msg_len - offset), NULL, 0, NULL); - if (ret < 0 && ret != -FI_EAGAIN) { - fprintf(stderr, "%s error %s\n", __func__, fi_strerror(-ret)); - return ret; - } - - if (ret > 0) - offset += ret; - } - - return offset; -} - -int recv_stream(struct fid_ep *ep, char *msg, size_t msg_len) -{ - int offset, ret; - - for (offset = 0; offset < msg_len; ) { - ret = fi_recv(ep, (msg + offset), (msg_len - offset), NULL, 0, NULL); - if (ret < 0 && ret != -FI_EAGAIN) { - fprintf(stderr, "%s error %s\n", __func__, fi_strerror(-ret)); - return ret; - } - if (ret > 0) - offset += ret; - } - - return offset; -} - -static int send_greeting(struct fid_ep *ep) -{ - const size_t msg_len = strlen(msg); - char buffer[msg_len]; - int ret; - - ret = send_stream(ep, msg, msg_len); - if (ret < 0) - return ret; - - ret = recv_stream(ep, buffer, msg_len); - if (ret < 0) - return ret; - - if (strncmp(buffer, msg, msg_len) != 0) { - printf("error recv: %s\n", buffer); - return -FI_EIO; - } - - return 0; -} - -static int recv_greeting(struct fid_ep *ep) -{ - const size_t msg_len = strlen(msg); - char buffer[msg_len]; - int ret; - - ret = recv_stream(ep, buffer, msg_len); - if (ret < 0) - return ret; - - if (strncmp(buffer, msg, msg_len) != 0) { - printf("error recv: %s\n", buffer); - return -FI_EIO; - } - - ret = send_stream(ep, msg, msg_len); - if (ret < 0) - return ret; - - return 0; -} - -static int send_recv_greeting(struct fid_ep *ep) -{ - return opts.dst_addr ? 
recv_greeting(ep) : send_greeting(ep); -} - -int stream_init_ep() -{ - int ret = fi_endpoint(domain, fi, &ep, NULL); - if (ret) { - FT_PRINTERR("fi_endpoint", ret); - return ret; - } - - FT_EP_BIND(ep, eq, 0); - - ret = fi_enable(ep); - if (ret) { - FT_PRINTERR("fi_enable", ret); - return ret; - } - return 0; -} - -void print_address(struct sockaddr_in *addr) -{ - printf(" accepted IPv4: %s port: %u\n", inet_ntoa(addr->sin_addr), - ntohs(addr->sin_port)); -} - -int stream_server_connect(void) -{ - int ret; - struct sockaddr_in peer_addr; - size_t addrlen = sizeof(struct sockaddr_in); - - ret = ft_retrieve_conn_req(eq, &fi); - if (ret) - goto err; - - ret = fi_domain(fabric, fi, &domain, NULL); - if (ret) { - FT_PRINTERR("fi_domain", ret); - goto err; - } - - ret = stream_init_ep(); - if (ret) - goto err; - - ret = ft_accept_connection(ep, eq); - if (ret) - goto err; - - ret = fi_getpeer(ep, &peer_addr, &addrlen); - print_address(&peer_addr); - return 0; - -err: - return ret; -} - - -int stream_client_connect() -{ - int ret; - - ret = ft_getinfo(hints, &fi); - if (ret) - return ret; - - ret = ft_open_fabric_res(); - if (ret) - return ret; - - ret = stream_init_ep(); - if (ret) - return ret; - - ret = ft_connect_ep(ep, eq, fi->dest_addr); - if (ret) - return ret; - - return 0; -} - -void set_stream_hints(void) { - hints->ep_attr->type = FI_EP_SOCK_STREAM; - hints->caps = FI_MSG; - hints->domain_attr->mr_mode = 0; - hints->addr_format = FI_SOCKADDR; - hints->domain_attr->threading = FI_THREAD_SAFE; - hints->domain_attr->data_progress = FI_PROGRESS_MANUAL; - hints->domain_attr->control_progress = FI_PROGRESS_AUTO; - hints->tx_attr->msg_order = FI_ORDER_SAS; - hints->rx_attr->msg_order = FI_ORDER_SAS; -} -static int stream_run(void) -{ - int ret; - - if (!opts.dst_addr) { - ret = ft_start_server(); - if (ret) - return ret; - } - - ret = opts.dst_addr ? 
stream_client_connect() : stream_server_connect(); - if (ret) { - return ret; - } - - ret = send_recv_greeting(ep); - if (ret < 0) - return ret; - - fi_shutdown(ep, 0); - return ret; -} - -int main(int argc, char **argv) -{ - int op, ret; - - opts = INIT_OPTS; - /* remove CQ usage on ep */ - opts.options = FT_OPT_SIZE; - - hints = fi_allocinfo(); - if (!hints) - return EXIT_FAILURE; - - while ((op = getopt(argc, argv, "h" ADDR_OPTS INFO_OPTS)) != -1) { - switch (op) { - default: - ft_parse_addr_opts(op, optarg, &opts); - ft_parseinfo(op, optarg, hints, &opts); - break; - case '?': - case 'h': - ft_usage(argv[0], "A simple MSG client-sever example."); - return EXIT_FAILURE; - } - } - - if (optind < argc) - opts.dst_addr = argv[optind]; - - set_stream_hints(); - - ret = stream_run(); - - ft_free_res(); - return ft_exit_code(ret); -} diff --git a/fabtests/include/ft_osd.h b/fabtests/include/ft_osd.h index 6ea377941cd..b1a714eb8cb 100644 --- a/fabtests/include/ft_osd.h +++ b/fabtests/include/ft_osd.h @@ -34,15 +34,15 @@ #define _FT_OSD_H_ #ifdef __APPLE__ -#include -#include +#include "osx/osd.h" +#include "unix/osd.h" #elif defined __FreeBSD__ -#include -#include +#include "freebsd/osd.h" +#include "unix/osd.h" #elif defined _WIN32 -#include +#include "windows/osd.h" #else -#include +#include "unix/osd.h" #endif #define OFI_DATATYPE_CNT (FI_UINT128 + 1) diff --git a/fabtests/include/shared.h b/fabtests/include/shared.h index 252a5273517..d3c86bcea56 100644 --- a/fabtests/include/shared.h +++ b/fabtests/include/shared.h @@ -432,7 +432,8 @@ int ft_connect_ep(struct fid_ep *ep, int ft_alloc_ep_res(struct fi_info *fi, struct fid_cq **new_txcq, struct fid_cq **new_rxcq, struct fid_cntr **new_txcntr, struct fid_cntr **new_rxcntr, - struct fid_cntr **new_rma_cntr); + struct fid_cntr **new_rma_cntr, + struct fid_av **new_av); int ft_alloc_msgs(void); int ft_alloc_host_tx_buf(size_t size); void ft_free_host_tx_buf(void); @@ -599,6 +600,8 @@ int ft_recvmsg(struct fid_ep *ep, fi_addr_t fi_addr, size_t size, void *ctx, int flags); int ft_sendmsg(struct fid_ep *ep, fi_addr_t fi_addr, size_t size, void *ctx, int flags); +int ft_tx_msg(struct fid_ep *ep, fi_addr_t fi_addr, + size_t size, void *ctx, uint64_t flags); int ft_cq_read_verify(struct fid_cq *cq, void *op_context); void eq_readerr(struct fid_eq *eq, const char *eq_str); @@ -632,6 +635,8 @@ enum { LONG_OPT_PIN_CORE = 1, LONG_OPT_TIMEOUT, LONG_OPT_DEBUG_ASSERT, + LONG_OPT_DATA_PROGRESS, + LONG_OPT_CONTROL_PROGRESS, }; extern int debug_assert; diff --git a/fabtests/include/windows/osd.h b/fabtests/include/windows/osd.h index e4df796cbb8..564f4453c16 100644 --- a/fabtests/include/windows/osd.h +++ b/fabtests/include/windows/osd.h @@ -617,6 +617,11 @@ static inline char* strsep(char **stringp, const char *delim) return ptr; } +static inline char *strtok_r(char *str, const char *delimiters, char **saveptr) +{ + return strtok_s(str, delimiters, saveptr); +} + #define _SC_PAGESIZE 30 static long int sysconf(int name) diff --git a/fabtests/multinode/src/core.c b/fabtests/multinode/src/core.c index 943fb61798d..3fec8b79f54 100644 --- a/fabtests/multinode/src/core.c +++ b/fabtests/multinode/src/core.c @@ -403,7 +403,7 @@ static int multi_barrier(void) if (pm_job.my_rank == 0) { for (i = 1; i < pm_job.num_ranks; i++) { do { - fi_cq_read(txcq, NULL, 0); + (void) fi_cq_read(txcq, NULL, 0); msg.addr = pm_job.fi_addrs[i]; ret = fi_sendmsg(ep, &msg, FI_DELIVERY_COMPLETE); } while (ret == -FI_EAGAIN); diff --git a/fabtests/multinode/src/core_coll.c 
b/fabtests/multinode/src/core_coll.c index a8341ca9f05..96147820775 100644 --- a/fabtests/multinode/src/core_coll.c +++ b/fabtests/multinode/src/core_coll.c @@ -185,7 +185,7 @@ static int coll_teardown(void) ret = fi_close(&coll_mc->fid); if (ret) - fi_close(&av_set->fid); + FT_CLOSE_FID(av_set); else ret = fi_close(&av_set->fid); return ret; diff --git a/fabtests/multinode/src/harness.c b/fabtests/multinode/src/harness.c index 77f7c003067..e56a55918ae 100644 --- a/fabtests/multinode/src/harness.c +++ b/fabtests/multinode/src/harness.c @@ -149,7 +149,7 @@ int pm_allgather(void *my_item, void *items, int item_size) void pm_barrier(void) { - char ch; + char ch = 'a'; char *chs = alloca(pm_job.num_ranks); pm_allgather(&ch, chs, 1); diff --git a/fabtests/multinode/src/timing.c b/fabtests/multinode/src/timing.c index 8b3cd9314b4..eee541adf51 100644 --- a/fabtests/multinode/src/timing.c +++ b/fabtests/multinode/src/timing.c @@ -160,8 +160,10 @@ int multi_timer_gather(struct multi_timer *all_timer, recv_timer = calloc(timer_count, sizeof(*recv_timer)); ret = socket_recv(pm_job.clients[i-1], recv_timer, sizeof(*recv_timer) * timer_count, 0); - if (ret < 0) + if (ret < 0) { + free(recv_timer); return ret; + } for (j = 0; j < timer_count; j++) all_timer[i * timer_count + j] = recv_timer[j]; @@ -172,7 +174,6 @@ int multi_timer_gather(struct multi_timer *all_timer, sizeof(*recv_timer) * timer_count, 0); } - return ret; } diff --git a/fabtests/prov/efa/src/efa_exhaust_mr_reg_common.c b/fabtests/prov/efa/src/efa_exhaust_mr_reg_common.c index e8a680ea7a4..29090e7602b 100644 --- a/fabtests/prov/efa/src/efa_exhaust_mr_reg_common.c +++ b/fabtests/prov/efa/src/efa_exhaust_mr_reg_common.c @@ -96,16 +96,29 @@ int ft_efa_deregister_mr_reg(struct ibv_mr **mr_reg_vec, size_t count) return err; } -int ft_efa_alloc_bufs(void **buffers, size_t buf_size, size_t count) { +int ft_efa_alloc_bufs(void **buffers, size_t buf_size, size_t count, size_t *alloced) { int i; + int ret = FI_SUCCESS; + for (i = 0; i < count; i++) { buffers[i] = malloc(buf_size); if (!buffers[i]) { FT_ERR("malloc failed!\n"); - return EXIT_FAILURE; + ret = EXIT_FAILURE; + goto out; } } - return FI_SUCCESS; + +out: + *alloced = i; + return ret; +} + +void ft_efa_free_bufs(void **buffers, size_t count) { + int i; + + for (i = 0; i < count; i++) + free(buffers[i]); } int ft_efa_unexpected_pingpong(void) diff --git a/fabtests/prov/efa/src/efa_exhaust_mr_reg_common.h b/fabtests/prov/efa/src/efa_exhaust_mr_reg_common.h index 867d4c36dbe..68a33ddb5aa 100644 --- a/fabtests/prov/efa/src/efa_exhaust_mr_reg_common.h +++ b/fabtests/prov/efa/src/efa_exhaust_mr_reg_common.h @@ -16,7 +16,8 @@ void ft_efa_destroy_ibv_pd(struct ibv_pd *pd); int ft_efa_register_mr_reg(struct ibv_pd *pd, void **buffers, size_t buf_size, struct ibv_mr **mr_reg_vec, size_t count, size_t *registered); int ft_efa_deregister_mr_reg(struct ibv_mr **mr_reg_vec, size_t count); -int ft_efa_alloc_bufs(void **buffers, size_t buf_size, size_t count); +int ft_efa_alloc_bufs(void **buffers, size_t buf_size, size_t count, size_t *alloced); +void ft_efa_free_bufs(void **buffers, size_t count); int ft_efa_unexpected_pingpong(void); #endif /* _EFA_EXHAUST_MR_REG_COMMON_H */ diff --git a/fabtests/prov/efa/src/efa_exhaust_mr_reg_rdm_pingpong.c b/fabtests/prov/efa/src/efa_exhaust_mr_reg_rdm_pingpong.c index fe49df3b36e..9cde8bc43a3 100644 --- a/fabtests/prov/efa/src/efa_exhaust_mr_reg_rdm_pingpong.c +++ b/fabtests/prov/efa/src/efa_exhaust_mr_reg_rdm_pingpong.c @@ -38,6 +38,7 @@ int main(int argc, char 
**argv) { int op, ret, err, mr_reg_limit; size_t registered; + size_t alloced; void **buffers = NULL; struct ibv_mr **mr_reg_vec = NULL; struct ibv_context *ibv_ctx = NULL; @@ -108,7 +109,7 @@ int main(int argc, char **argv) printf("Exhausting MRs on client\n"); err = ft_efa_alloc_bufs(buffers, EFA_MR_REG_BUF_SIZE, - mr_reg_limit); + mr_reg_limit, &alloced); if (err) FT_PRINTERR("alloc bufs", -err); @@ -134,13 +135,13 @@ int main(int argc, char **argv) err = ft_efa_deregister_mr_reg(mr_reg_vec, registered); if (err) FT_PRINTERR("ibv mr dereg", -err); + ft_efa_free_bufs(buffers, alloced); + free(buffers); + free(mr_reg_vec); ft_efa_destroy_ibv_pd(pd); ft_efa_close_ibv_device(ibv_ctx); } - free(buffers); - free(mr_reg_vec); - ft_finalize(); ft_free_res(); diff --git a/fabtests/prov/efa/src/efa_info_test.c b/fabtests/prov/efa/src/efa_info_test.c index f0cbe5553f2..7b9c8e41fe4 100644 --- a/fabtests/prov/efa/src/efa_info_test.c +++ b/fabtests/prov/efa/src/efa_info_test.c @@ -38,6 +38,8 @@ int main(int argc, char **argv) { int ret; + struct fi_info *info; + hints = fi_allocinfo(); if (!hints) return EXIT_FAILURE; @@ -52,12 +54,14 @@ int main(int argc, char **argv) FT_PRINTERR("ft_getinfo", -ret); goto out; } - while (NULL != fi) { - if (0 != strcmp(fi->fabric_attr->name, "efa")) { + + info = fi; + while (NULL != info) { + if (0 != strcmp(info->fabric_attr->name, "efa")) { ret = EXIT_FAILURE; goto out; } - fi = fi->next; + info = info->next; } out: diff --git a/fabtests/prov/efa/src/rdm_rnr_queue_resend.c b/fabtests/prov/efa/src/rdm_rnr_queue_resend.c index a4169efabbc..9a8889ca4cf 100644 --- a/fabtests/prov/efa/src/rdm_rnr_queue_resend.c +++ b/fabtests/prov/efa/src/rdm_rnr_queue_resend.c @@ -77,37 +77,42 @@ int global_expected_rnr_error = 1; -static int alloc_atomic_res(struct fi_info *fi, void **result, void **compare, - struct fid_mr **mr_result, struct fid_mr **mr_compare) +/* global atomic resources */ +void *result = NULL; +void *compare = NULL; +struct fid_mr *mr_result = NULL; +struct fid_mr *mr_compare = NULL; + +static int alloc_atomic_res() { int ret; int mr_local = !!(fi->domain_attr->mr_mode & FI_MR_LOCAL); - *result = malloc(buf_size); + result = malloc(buf_size); if (!result) { perror("malloc"); return -1; } - *compare = malloc(buf_size); + compare = malloc(buf_size); if (!compare) { perror("malloc"); return -1; } // registers local data buffer that stores results - ret = fi_mr_reg(domain, *result, buf_size, + ret = fi_mr_reg(domain, result, buf_size, (mr_local ? FI_READ : 0) | FI_REMOTE_WRITE, 0, - 0, 0, mr_result, NULL); + 0, 0, &mr_result, NULL); if (ret) { FT_PRINTERR("fi_mr_reg", -ret); return ret; } // registers local data buffer that contains comparison data - ret = fi_mr_reg(domain, *compare, buf_size, + ret = fi_mr_reg(domain, compare, buf_size, (mr_local ? 
FI_WRITE : 0) | FI_REMOTE_READ, 0, - 0, 0, mr_compare, NULL); + 0, 0, &mr_compare, NULL); if (ret) { FT_PRINTERR("fi_mr_reg", ret); return ret; @@ -116,11 +121,17 @@ static int alloc_atomic_res(struct fi_info *fi, void **result, void **compare, return 0; } -static void free_atomic_res(void *result, void *compare, - struct fid_mr *mr_result, struct fid_mr *mr_compare) +static void free_atomic_res() { - FT_CLOSE_FID(mr_result); - FT_CLOSE_FID(mr_compare); + if (mr_result) { + FT_CLOSE_FID(mr_result); + mr_result = NULL; + } + if (mr_compare) { + FT_CLOSE_FID(mr_compare); + mr_compare = NULL; + } + if (result) { free(result); result = NULL; @@ -194,23 +205,12 @@ static int trigger_rnr_queue_resend(enum fi_op atomic_op, void *result, void *co static int rnr_queue_resend_test(int req_pkt, enum fi_op atomic_op) { int ret, i; - void *result = NULL; - void *compare = NULL; - struct fid_mr *mr_result = NULL; - struct fid_mr *mr_compare = NULL; /* * The handshake procedure between server and client will happen in * either ft_sync() or ft_exchange_key(), which is before the real * RNR triggering procedure. */ - if (atomic_op) { - ret = alloc_atomic_res(fi, &result, &compare, &mr_result, &mr_compare); - if (ret) { - FT_PRINTERR("alloc_ep_res_atomic()", -ret); - return ret; - } - } if (opts.rma_op || atomic_op) { ret = ft_exchange_keys(&remote); if (ret) { @@ -259,7 +259,7 @@ static int rnr_queue_resend_test(int req_pkt, enum fi_op atomic_op) if (req_pkt) { ret = trigger_rnr_queue_resend(atomic_op, result, compare, mr_result, mr_compare); if (ret) - goto out; + return ret; } else if (!opts.rma_op && !atomic_op) { for (i = 0; i < global_expected_rnr_error; i++) { ret = ft_rx(ep, opts.transfer_size); @@ -286,7 +286,7 @@ static int rnr_queue_resend_test(int req_pkt, enum fi_op atomic_op) sleep(3); ret = trigger_rnr_queue_resend(atomic_op, result, compare, mr_result, mr_compare); if (ret) - goto out; + return ret; } printf("Sleeping 3 seconds to trigger RNR on the client side\n"); sleep(3); @@ -314,10 +314,6 @@ static int rnr_queue_resend_test(int req_pkt, enum fi_op atomic_op) return ret; } -out: - if (atomic_op) - free_atomic_res(result, compare, mr_result, mr_compare); - return ret; } @@ -329,23 +325,31 @@ static int run(int req_pkt, enum fi_op atomic_op) ret = ft_efa_rnr_init_fabric(); if (ret) { FT_PRINTERR("ft_efa_rnr_init_fabric", -ret); - return ret; + goto out; + } + + ret = alloc_atomic_res(); + if (ret) { + FT_PRINTERR("alloc_atomic_res()", -ret); + goto out; } ret = rnr_queue_resend_test(req_pkt, atomic_op); if (ret) { FT_PRINTERR("rnr_queue_resend_test", -ret); - return ret; + goto out; } ret = ft_close_oob(); if (ret) { FT_PRINTERR("ft_close_oob", -ret); - return ret; + goto out; } - ft_free_res(); - return 0; +out: + free_atomic_res(); + ft_free_res(); + return ret; } static void print_opts_usage(char *name, char *desc) diff --git a/fabtests/prov/efa/src/rdm_rnr_read_cq_error.c b/fabtests/prov/efa/src/rdm_rnr_read_cq_error.c index b1b35afe7ba..d67d3d89b31 100644 --- a/fabtests/prov/efa/src/rdm_rnr_read_cq_error.c +++ b/fabtests/prov/efa/src/rdm_rnr_read_cq_error.c @@ -41,8 +41,6 @@ static int rnr_read_cq_error(void) { - struct fi_cq_data_entry comp; - struct fi_cq_err_entry comp_err = {0}; int total_send, expected_rnr_error; int ret, i, cnt, rnr_flag; const char *prov_errmsg; @@ -60,7 +58,12 @@ static int rnr_read_cq_error(void) for (i = 0; i < total_send; i++) { do { ret = fi_send(ep, tx_buf, 32, mr_desc, remote_fi_addr, &tx_ctx); - if (ret < 0 && ret != -FI_EAGAIN) { + if (ret == 
-FI_EAGAIN) { + (void) fi_cq_read(txcq, NULL, 0); + continue; + } + + if (ret < 0) { FT_PRINTERR("fi_send", -ret); return ret; } @@ -69,6 +72,9 @@ static int rnr_read_cq_error(void) cnt = total_send; do { + struct fi_cq_data_entry comp = {0}; + struct fi_cq_err_entry comp_err = {0}; + ret = fi_cq_read(txcq, &comp, 1); if (ret == 1) { cnt--; diff --git a/fabtests/pytest/common.py b/fabtests/pytest/common.py index be1b54620c6..23fb3b98837 100644 --- a/fabtests/pytest/common.py +++ b/fabtests/pytest/common.py @@ -13,8 +13,12 @@ from retrying import retry import pytest + +perf_progress_model_cli = "--data-progress manual --control-progress unified" SERVER_RESTART_DELAY_MS = 10_1000 CLIENT_RETRY_INTERVAL_MS = 1_000 + + class SshConnectionError(Exception): def __init__(self): diff --git a/fabtests/pytest/efa/test_cq.py b/fabtests/pytest/efa/test_cq.py index f9520f560b0..e45a9d99237 100644 --- a/fabtests/pytest/efa/test_cq.py +++ b/fabtests/pytest/efa/test_cq.py @@ -1,5 +1,8 @@ import pytest +# this test must be run in serial mode because it will open the maximal number +# of cq that efa device can support +@pytest.mark.serial @pytest.mark.unit def test_cq(cmdline_args): from common import UnitTest diff --git a/fabtests/pytest/efa/test_multi_ep.py b/fabtests/pytest/efa/test_multi_ep.py new file mode 100644 index 00000000000..561919f1446 --- /dev/null +++ b/fabtests/pytest/efa/test_multi_ep.py @@ -0,0 +1,11 @@ +import pytest + +@pytest.mark.functional +@pytest.mark.parametrize("shared_cq", [True, False]) +def test_multi_ep(cmdline_args, shared_cq): + from common import ClientServerTest + cmd = "fi_multi_ep -e rdm" + if shared_cq: + cmd += " --shared-cq" + test = ClientServerTest(cmdline_args, cmd) + test.run() diff --git a/fabtests/pytest/efa/test_rdm.py b/fabtests/pytest/efa/test_rdm.py index 9586c0950f2..ab7d1bb8d80 100644 --- a/fabtests/pytest/efa/test_rdm.py +++ b/fabtests/pytest/efa/test_rdm.py @@ -1,5 +1,6 @@ from default.test_rdm import test_rdm, test_rdm_bw_functional from efa.efa_common import efa_run_client_server_test +from common import perf_progress_model_cli import pytest import copy @@ -9,7 +10,8 @@ [pytest.param("short", marks=pytest.mark.short), pytest.param("standard", marks=pytest.mark.standard)]) def test_rdm_pingpong(cmdline_args, iteration_type, completion_semantic, memory_type, completion_type): - efa_run_client_server_test(cmdline_args, "fi_rdm_pingpong", iteration_type, + command = "fi_rdm_pingpong" + " " + perf_progress_model_cli + efa_run_client_server_test(cmdline_args, command, iteration_type, completion_semantic, memory_type, "all", completion_type=completion_type) @pytest.mark.functional @@ -32,7 +34,8 @@ def test_rdm_pingpong_no_inject_range(cmdline_args, completion_semantic, inject_ [pytest.param("short", marks=pytest.mark.short), pytest.param("standard", marks=pytest.mark.standard)]) def test_rdm_tagged_pingpong(cmdline_args, iteration_type, completion_semantic, memory_type, completion_type): - efa_run_client_server_test(cmdline_args, "fi_rdm_tagged_pingpong", iteration_type, + command = "fi_rdm_tagged_pingpong" + " " + perf_progress_model_cli + efa_run_client_server_test(cmdline_args, command, iteration_type, completion_semantic, memory_type, "all", completion_type=completion_type) @pytest.mark.functional @@ -44,7 +47,8 @@ def test_rdm_tagged_pingpong_range(cmdline_args, completion_semantic, memory_typ [pytest.param("short", marks=pytest.mark.short), pytest.param("standard", marks=pytest.mark.standard)]) def test_rdm_tagged_bw(cmdline_args, iteration_type, 
completion_semantic, memory_type, completion_type): - efa_run_client_server_test(cmdline_args, "fi_rdm_tagged_bw", iteration_type, + command = "fi_rdm_tagged_bw" + " " + perf_progress_model_cli + efa_run_client_server_test(cmdline_args, command, iteration_type, completion_semantic, memory_type, "all", completion_type=completion_type) @pytest.mark.functional @@ -58,10 +62,12 @@ def test_rdm_tagged_bw_no_inject_range(cmdline_args, completion_semantic, inject completion_semantic, "host_to_host", inject_message_size) @pytest.mark.functional -def test_rdm_tagged_bw_small_tx(cmdline_args, completion_semantic, memory_type, completion_type): +@pytest.mark.parametrize("env_vars", [["FI_EFA_TX_SIZE=64"], ["FI_EFA_RX_SIZE=64"], ["FI_EFA_TX_SIZE=64", "FI_EFA_RX_SIZE=64"]]) +def test_rdm_tagged_bw_small_tx_rx(cmdline_args, completion_semantic, memory_type, completion_type, env_vars): cmdline_args_copy = copy.copy(cmdline_args) - cmdline_args_copy.append_environ("FI_EFA_TX_SIZE=64") - # Use a window size larger than tx size + for env_var in env_vars: + cmdline_args_copy.append_environ(env_var) + # Use a window size larger than tx/rx size efa_run_client_server_test(cmdline_args_copy, "fi_rdm_tagged_bw -W 128", "short", completion_semantic, memory_type, "all", completion_type=completion_type) @@ -80,6 +86,7 @@ def test_rdm_atomic(cmdline_args, iteration_type, completion_semantic, memory_ty # the issue is tracked in: https://github.com/ofiwg/libfabric/issues/7002 # to mitigate the issue, set the maximum timeout of fi_rdm_atomic to 1800 seconds. cmdline_args_copy = copy(cmdline_args) + command = "fi_rdm_atomic" + " " + perf_progress_model_cli test = ClientServerTest(cmdline_args_copy, "fi_rdm_atomic", iteration_type, completion_semantic, memory_type=memory_type, timeout=1800) test.run() diff --git a/fabtests/pytest/efa/test_rma_bw.py b/fabtests/pytest/efa/test_rma_bw.py index 52ff76c3e1b..69a2388bc32 100644 --- a/fabtests/pytest/efa/test_rma_bw.py +++ b/fabtests/pytest/efa/test_rma_bw.py @@ -1,4 +1,5 @@ from efa.efa_common import efa_run_client_server_test +from common import perf_progress_model_cli import pytest import copy @@ -9,18 +10,20 @@ pytest.param("standard", marks=pytest.mark.standard)]) def test_rma_bw(cmdline_args, iteration_type, operation_type, completion_semantic, memory_type): command = "fi_rma_bw -e rdm" - command = command + " -o " + operation_type + command = command + " -o " + operation_type + " " + perf_progress_model_cli # rma_bw test with data verification takes longer to finish timeout = max(540, cmdline_args.timeout) efa_run_client_server_test(cmdline_args, command, iteration_type, completion_semantic, memory_type, "all", timeout=timeout) @pytest.mark.parametrize("operation_type", ["read", "writedata", "write"]) -def test_rma_bw_small_tx(cmdline_args, operation_type, completion_semantic, memory_type): +@pytest.mark.parametrize("env_vars", [["FI_EFA_TX_SIZE=64"], ["FI_EFA_RX_SIZE=64"], ["FI_EFA_TX_SIZE=64", "FI_EFA_RX_SIZE=64"]]) +def test_rma_bw_small_tx_rx(cmdline_args, operation_type, completion_semantic, memory_type, env_vars): cmdline_args_copy = copy.copy(cmdline_args) - cmdline_args_copy.append_environ("FI_EFA_TX_SIZE=64") - # Use a window size larger than tx size + for env_var in env_vars: + cmdline_args_copy.append_environ(env_var) + # Use a window size larger than tx/rx size command = "fi_rma_bw -e rdm -W 128" - command = command + " -o " + operation_type + command = command + " -o " + operation_type + " " + perf_progress_model_cli # rma_bw test with data verification takes 
longer to finish timeout = max(540, cmdline_args_copy.timeout) efa_run_client_server_test(cmdline_args_copy, command, "short", completion_semantic, memory_type, "all", timeout=timeout) diff --git a/fabtests/pytest/efa/test_rma_pingpong.py b/fabtests/pytest/efa/test_rma_pingpong.py index 608e88fdd0c..b14c0d88891 100644 --- a/fabtests/pytest/efa/test_rma_pingpong.py +++ b/fabtests/pytest/efa/test_rma_pingpong.py @@ -1,4 +1,5 @@ from efa.efa_common import efa_run_client_server_test +from common import perf_progress_model_cli import pytest @@ -10,7 +11,7 @@ def test_rma_pingpong(cmdline_args, iteration_type, operation_type, completion_s if memory_type != "host_to_host" and operation_type == "write": pytest.skip("no hmem memory support for pingpong_rma write test") command = "fi_rma_pingpong -e rdm" - command = command + " -o " + operation_type + command = command + " -o " + operation_type + " " + perf_progress_model_cli # rma_pingpong test with data verification takes longer to finish timeout = max(540, cmdline_args.timeout) efa_run_client_server_test(cmdline_args, command, iteration_type, completion_semantic, memory_type, "all", timeout=timeout) diff --git a/fabtests/pytest/shm/test_rdm.py b/fabtests/pytest/shm/test_rdm.py index 54657e9e504..9be0bdafd52 100644 --- a/fabtests/pytest/shm/test_rdm.py +++ b/fabtests/pytest/shm/test_rdm.py @@ -1,13 +1,15 @@ import pytest from default.test_rdm import test_rdm, \ test_rdm_bw_functional, test_rdm_atomic +from common import perf_progress_model_cli from shm.shm_common import shm_run_client_server_test @pytest.mark.parametrize("iteration_type", [pytest.param("short", marks=pytest.mark.short), pytest.param("standard", marks=pytest.mark.standard)]) def test_rdm_pingpong(cmdline_args, iteration_type, completion_semantic, memory_type, completion_type): - shm_run_client_server_test(cmdline_args, "fi_rdm_pingpong", iteration_type, + command = "fi_rdm_pingpong" + " " + perf_progress_model_cli + shm_run_client_server_test(cmdline_args, command, iteration_type, completion_semantic, memory_type, completion_type=completion_type) @@ -15,7 +17,8 @@ def test_rdm_pingpong(cmdline_args, iteration_type, completion_semantic, memory_ [pytest.param("short", marks=pytest.mark.short), pytest.param("standard", marks=pytest.mark.standard)]) def test_rdm_tagged_pingpong(cmdline_args, iteration_type, completion_semantic, memory_type, completion_type): - shm_run_client_server_test(cmdline_args, "fi_rdm_tagged_pingpong", iteration_type, + command = "fi_rdm_tagged_pingpong" + " " + perf_progress_model_cli + shm_run_client_server_test(cmdline_args, command, iteration_type, completion_semantic, memory_type, completion_type=completion_type) @@ -23,7 +26,8 @@ def test_rdm_tagged_pingpong(cmdline_args, iteration_type, completion_semantic, [pytest.param("short", marks=pytest.mark.short), pytest.param("standard", marks=pytest.mark.standard)]) def test_rdm_tagged_bw(cmdline_args, iteration_type, completion_semantic, memory_type, completion_type): - shm_run_client_server_test(cmdline_args, "fi_rdm_tagged_bw", iteration_type, + command = "fi_rdm_tagged_bw" + " " + perf_progress_model_cli + shm_run_client_server_test(cmdline_args, command, iteration_type, completion_semantic, memory_type, completion_type=completion_type) @pytest.mark.functional diff --git a/fabtests/pytest/shm/test_rma_bw.py b/fabtests/pytest/shm/test_rma_bw.py index 56721d67565..833b5ea690c 100644 --- a/fabtests/pytest/shm/test_rma_bw.py +++ b/fabtests/pytest/shm/test_rma_bw.py @@ -1,5 +1,6 @@ import pytest from 
shm.shm_common import shm_run_client_server_test +from common import perf_progress_model_cli @pytest.mark.parametrize("operation_type", ["read", "writedata", "write"]) @@ -8,7 +9,7 @@ pytest.param("standard", marks=pytest.mark.standard)]) def test_rma_bw(cmdline_args, iteration_type, operation_type, completion_semantic, memory_type): command = "fi_rma_bw -e rdm" - command = command + " -o " + operation_type + command = command + " -o " + operation_type + " " + perf_progress_model_cli # rma_bw test with data verification takes longer to finish timeout = max(540, cmdline_args.timeout) shm_run_client_server_test(cmdline_args, command, iteration_type, completion_semantic, memory_type, "all", timeout=timeout) diff --git a/fabtests/pytest/shm/test_rma_pingpong.py b/fabtests/pytest/shm/test_rma_pingpong.py index 3147f7d2296..3ff780a073d 100644 --- a/fabtests/pytest/shm/test_rma_pingpong.py +++ b/fabtests/pytest/shm/test_rma_pingpong.py @@ -1,5 +1,6 @@ import pytest from shm.shm_common import shm_run_client_server_test +from common import perf_progress_model_cli @pytest.mark.parametrize("operation_type", ["writedata", "write"]) @@ -10,7 +11,7 @@ def test_rma_pingpong(cmdline_args, iteration_type, operation_type, completion_s if memory_type != "host_to_host" and operation_type == "write": pytest.skip("no hmem memory support for pingpong_rma write test") command = "fi_rma_pingpong -e rdm" - command = command + " -o " + operation_type + command = command + " -o " + operation_type + " " + perf_progress_model_cli # rma_pingpong test with data verification takes longer to finish timeout = max(540, cmdline_args.timeout) shm_run_client_server_test(cmdline_args, command, iteration_type, completion_semantic, memory_type, "all", timeout=timeout) diff --git a/fabtests/scripts/runfabtests.py b/fabtests/scripts/runfabtests.py index e17c0f34422..e099e8f6e1d 100755 --- a/fabtests/scripts/runfabtests.py +++ b/fabtests/scripts/runfabtests.py @@ -1,35 +1,6 @@ #!/usr/bin/env python3 -# -# Copyright (c) 2021-2022 Amazon.com, Inc. or its affiliates. All rights reserved. -# -# This software is available to you under a choice of one of two -# licenses. You may choose to be licensed under the terms of the GNU -# General Public License (GPL) Version 2, available from the file -# COPYING in the main directory of this source tree, or the -# BSD license below: -# -# Redistribution and use in source and binary forms, with or -# without modification, are permitted provided that the following -# conditions are met: -# -# - Redistributions of source code must retain the above -# copyright notice, this list of conditions and the following -# disclaimer. -# -# - Redistributions in binary form must reproduce the above -# copyright notice, this list of conditions and the following -# disclaimer in the documentation and/or other materials -# provided with the distribution. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS -# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. -# +# SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only +# SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. 
All rights reserved import argparse import builtins @@ -39,6 +10,7 @@ import yaml import pytest +from junitparser import JUnitXml from pytest import ExitCode @@ -355,11 +327,8 @@ def main(): serial_status = run(fabtests_args, shared_options, "serial") if fabtests_args.junit_xml: - os.system("junitparser merge {}.parallel {}.serial {}".format( - fabtests_args.junit_xml, - fabtests_args.junit_xml, - fabtests_args.junit_xml) - ) + merged_xml = JUnitXml.fromfile(f'{fabtests_args.junit_xml}.parallel') + JUnitXml.fromfile(f'{fabtests_args.junit_xml}.serial') + merged_xml.write(f'{fabtests_args.junit_xml}') os.unlink(fabtests_args.junit_xml + ".parallel") os.unlink(fabtests_args.junit_xml + ".serial") diff --git a/fabtests/scripts/runfabtests.sh b/fabtests/scripts/runfabtests.sh index 3c8b54a3191..c37a4279690 100755 --- a/fabtests/scripts/runfabtests.sh +++ b/fabtests/scripts/runfabtests.sh @@ -132,8 +132,8 @@ functional_tests=( "fi_rdm_shared_av" "fi_multi_mr -e msg -V" "fi_multi_mr -e rdm -V" - "fi_multi_ep -e msg -v" - "fi_multi_ep -e rdm -v" + "fi_multi_ep -e msg -v --shared-av" + "fi_multi_ep -e rdm -v --shared-av" "fi_recv_cancel -e rdm -V" "fi_unexpected_msg -e msg -I 10 -v" "fi_unexpected_msg -e rdm -I 10 -v" diff --git a/fabtests/test_configs/shm/cuda.exclude b/fabtests/test_configs/shm/cuda.exclude new file mode 100644 index 00000000000..b3d94d7c430 --- /dev/null +++ b/fabtests/test_configs/shm/cuda.exclude @@ -0,0 +1,28 @@ +# Regex patterns of tests to exclude in runfabtests.sh + +inject_test +^fi_msg +-e msg +^fi_dgram +-e dgram +rdm_tagged_peek +multi_ep +av_xfer +unexpected_msg +multi_recv + +# Exclude tests that use sread/polling +rdm_cntr_pingpong +poll + +# Exclude tests with unsupported capabilities +-k +cm_data +trigger +shared_ctx +scalable_ep +shared_av +multi_mr +av_test + +multinode diff --git a/fabtests/test_configs/tcp/io_uring.exclude b/fabtests/test_configs/tcp/io_uring.exclude index e9407cffebc..db1a61ff621 100644 --- a/fabtests/test_configs/tcp/io_uring.exclude +++ b/fabtests/test_configs/tcp/io_uring.exclude @@ -74,10 +74,18 @@ fi_unexpected_msg -e rdm # fi_eq_sread(): common/shared.c:1165, ret=-4 (Interrupted system call) fi_bw -e msg +# fi_bw fails by hanging +# This is a suspected race condition +fi_bw + # fi_msg_pingpong fails with # fi_eq_sread(): common/shared.c:1127, ret=-4 (Interrupted system call) fi_msg_pingpong +# fi_rdm_cntr_pingpong passes but reports errors of +# fi_cntr_wait(): common/shared.c:2708, ret=-4 (Interrupted system call) +fi_rdm_cntr_pingpong + # fi_msg_bw fails with # fi_eq_sread(): common/shared.c:1127, ret=-4 (Interrupted system call) fi_msg_bw diff --git a/fabtests/test_configs/usnic/all.test b/fabtests/test_configs/usnic/all.test new file mode 100644 index 00000000000..7c93a0ce53a --- /dev/null +++ b/fabtests/test_configs/usnic/all.test @@ -0,0 +1,40 @@ +#: "Suite of tests for the usnic provider" +{ + prov_name: usnic, + test_type: [ + FT_TEST_LATENCY, + FT_TEST_BANDWIDTH, + ], + class_function: [ + FT_FUNC_SEND, + FT_FUNC_SENDV, + FT_FUNC_SENDMSG, + FT_FUNC_INJECT, + ], + ep_type: [ + FI_EP_DGRAM, + FI_EP_RDM, + FI_EP_MSG + ], + av_type: [ + FI_AV_MAP, + ], + comp_type: [ + FT_COMP_QUEUE, + ], + eq_wait_obj: [ + FI_WAIT_NONE, + FI_WAIT_UNSPEC, + FI_WAIT_FD, + ], + cq_wait_obj: [ + FI_WAIT_NONE, + FI_WAIT_UNSPEC, + ], + mode: [ + FI_CONTEXT, FI_RX_CQ_DATA, + ], + test_class: [ + FT_CAP_MSG, + ], +}, diff --git a/fabtests/test_configs/usnic/quick.test b/fabtests/test_configs/usnic/quick.test new file mode 100644 index 
00000000000..225d4cd77a5 --- /dev/null +++ b/fabtests/test_configs/usnic/quick.test @@ -0,0 +1,41 @@ +#: "Suite of tests for the usnic provider" +{ + prov_name: usnic, + test_type: [ + FT_TEST_LATENCY, + FT_TEST_BANDWIDTH, + ], + class_function: [ + FT_FUNC_SEND, + FT_FUNC_SENDV, + FT_FUNC_SENDMSG, + FT_FUNC_INJECT, + ], + ep_type: [ + FI_EP_DGRAM, + FI_EP_RDM, + FI_EP_MSG, + ], + av_type: [ + FI_AV_MAP, + ], + comp_type: [ + FT_COMP_QUEUE, + ], + eq_wait_obj: [ + FI_WAIT_NONE, + FI_WAIT_UNSPEC, + FI_WAIT_FD, + ], + cq_wait_obj: [ + FI_WAIT_NONE, + FI_WAIT_UNSPEC, + ], + mode: [ + FI_CONTEXT, FI_RX_CQ_DATA, + ], + test_class: [ + FT_CAP_MSG, + ], + test_flags: FT_FLAG_QUICKTEST +}, diff --git a/fabtests/ubertest/test_ctrl.c b/fabtests/ubertest/test_ctrl.c index 38996f56a39..e829b8ff7a8 100644 --- a/fabtests/ubertest/test_ctrl.c +++ b/fabtests/ubertest/test_ctrl.c @@ -34,6 +34,38 @@ #include "fabtest.h" +int ft_random_fd = -1; + +static inline void ft_init_random(void) +{ + ft_random_fd = open("/dev/urandom", O_RDONLY); + if (ft_random_fd < 0) + FT_PRINTERR("ft_open_control", ft_random_fd); +} + +static inline void ft_cleanup_random(void) +{ + if (ft_random_fd >= 0) { + close(ft_random_fd); + ft_random_fd = -1; + } +} + +static int get_random_data(void) +{ + ssize_t ret; + static int rdata = 1; + + if (ft_random_fd < 0) + return rdata; + + ret = read(ft_random_fd, &rdata, sizeof(rdata)); + if (ret < 0) + ft_record_error(ret); + + return rdata; +} + void ft_record_error(int error) { if (!ft_ctrl.error) { @@ -247,7 +279,7 @@ static void ft_format_iov_random(struct iovec *iov, size_t cnt, char *buf, * the remaining IOV count. This is so we can reserve at * least a length of 1 for every IOV. */ - weight = (rand() % (len - (cnt - i) + 1)) + 1; + weight = (get_random_data() % (len - (cnt - i) + 1)) + 1; } len -= weight; @@ -272,7 +304,7 @@ void ft_format_iov(struct iovec *iov, size_t cnt, char *buf, size_t len) ft_format_iov_random }; - choice = rand() % ARRAY_SIZE(options); + choice = get_random_data() % ARRAY_SIZE(options); options[choice](iov, cnt, buf, len); } @@ -989,6 +1021,7 @@ void ft_cleanup(void) ft_free_host_tx_buf(); ft_cleanup_mr_control(&ft_mr_ctrl); ft_cleanup_atomic_control(&ft_atom_ctrl); + ft_cleanup_random(); ft_free_res(); memset(&ft_ctrl, 0, sizeof ft_ctrl); } @@ -1074,6 +1107,8 @@ int ft_open_res() FT_PRINTERR("ft_open_control", ret); goto cleanup; } + + ft_init_random(); if (test_info.ep_type == FI_EP_MSG && listen_sock >= 0) { ret = ft_open_passive(); if (ret) { diff --git a/fabtests/ubertest/verify.c b/fabtests/ubertest/verify.c index ce578da8f76..89c1fb4fd5e 100644 --- a/fabtests/ubertest/verify.c +++ b/fabtests/ubertest/verify.c @@ -119,7 +119,11 @@ int ft_sync_fill_bufs(size_t size) if (ret) return ret; - ft_hmem_copy_from(opts.iface, opts.device, ft_tx_ctrl.cpy_buf, ft_tx_ctrl.buf, size); + ret = ft_hmem_copy_from(opts.iface, opts.device, + ft_tx_ctrl.cpy_buf, + ft_tx_ctrl.buf, size); + if (ret) + return ret; } ft_sock_sync(0); diff --git a/fabtests/unit/cq_test.c b/fabtests/unit/cq_test.c index 19bd108cf41..a80fd16a415 100644 --- a/fabtests/unit/cq_test.c +++ b/fabtests/unit/cq_test.c @@ -79,13 +79,15 @@ static int cq_open_close_simultaneous(void) return -FI_ENOMEM; ret = 0; - for (opened = 0; opened < count && !ret; opened++) { + for (opened = 0; opened < count; opened++) { ret = create_cq(&cq_array[opened], 0, 0, FI_CQ_FORMAT_UNSPEC, FI_WAIT_UNSPEC); if (ret) { ret = create_cq(&cq_array[opened], 0, 0, FI_CQ_FORMAT_UNSPEC, FI_WAIT_NONE); } + if (ret) + break; } if 
(ret) { FT_WARN("fi_cq_open failed after %d (cq_cnt: %zu): %s", diff --git a/fabtests/unit/eq_test.c b/fabtests/unit/eq_test.c index 96ce477ed66..f7a03a0e7d5 100644 --- a/fabtests/unit/eq_test.c +++ b/fabtests/unit/eq_test.c @@ -610,9 +610,7 @@ int main(int argc, char **argv) } hints->mode = FI_CONTEXT | FI_CONTEXT2 | FI_MSG_PREFIX | FI_ASYNC_IOV | - FI_RX_CQ_DATA | FI_NOTIFY_FLAGS_ONLY | FI_RESTRICTED_COMP | - FI_BUFFERED_RECV; - hints->domain_attr->mode = FI_RESTRICTED_COMP; + FI_RX_CQ_DATA | FI_BUFFERED_RECV; hints->domain_attr->mr_mode = ~(FI_MR_BASIC | FI_MR_SCALABLE); ret = fi_getinfo(FT_FIVERSION, NULL, 0, 0, hints, &fi); diff --git a/fabtests/unit/getinfo_test.c b/fabtests/unit/getinfo_test.c index a21e268aff3..57653b8bd67 100644 --- a/fabtests/unit/getinfo_test.c +++ b/fabtests/unit/getinfo_test.c @@ -214,8 +214,7 @@ static int init_caps(struct fi_info *hints, uint64_t bits) FI_MULTICAST | FI_NAMED_RX_CTX | FI_HMEM | \ FI_COLLECTIVE) #define PRIMARY_RX_CAPS (FI_MSG | FI_RMA | FI_TAGGED | FI_ATOMIC | \ - FI_DIRECTED_RECV | FI_VARIABLE_MSG | \ - FI_HMEM | FI_COLLECTIVE) + FI_DIRECTED_RECV | FI_HMEM | FI_COLLECTIVE) #define PRIMARY_CAPS (PRIMARY_TX_CAPS | PRIMARY_RX_CAPS) #define DOMAIN_CAPS (FI_LOCAL_COMM | FI_REMOTE_COMM | FI_SHARED_AV) @@ -695,11 +694,16 @@ static int test_caps_regression(char *node, char *service, uint64_t flags, struct fi_info *fi; int ret; + if (!hints) { + printf("invalid test case: hints may not be null"); + return -FI_EINVAL; + } + ret = fi_getinfo(FT_FIVERSION, node, service, flags, NULL, info); if (ret) return ret; - if (!hints || !hints->fabric_attr || !hints->fabric_attr->prov_name) { + if (!hints->fabric_attr || !hints->fabric_attr->prov_name) { fi = *info; } else { for (fi = *info; fi; fi = fi->next) { @@ -810,7 +814,7 @@ static int getinfo_unit_test(char *node, char *service, uint64_t flags, * At the moment, only invalid_dom does this and the domain name * is the only application owned memory. Free the application owned * memory so that fi_freeinfo only frees memory that it owns. 
*/ - if (init) { + if (init && !!test_hints) { free(test_hints->domain_attr->name); test_hints->domain_attr->name = NULL; } diff --git a/fabtests/unit/mr_cache_evict.c b/fabtests/unit/mr_cache_evict.c index 11435c7ebfc..a12f9c31372 100644 --- a/fabtests/unit/mr_cache_evict.c +++ b/fabtests/unit/mr_cache_evict.c @@ -636,19 +636,19 @@ static int mr_cache_test(enum alloc_type type) cleanup: if (realloc_mr) - fi_close(&realloc_mr->fid); + FT_CLOSE_FID(realloc_mr); if (cached_mr) - fi_close(&cached_mr->fid); + FT_CLOSE_FID(cached_mr); if (mr) - fi_close(&mr->fid); + FT_CLOSE_FID(mr); if (buf) mem_free(buf, type); if (prime_mr) - fi_close(&prime_mr->fid); + FT_CLOSE_FID(prime_mr); if (prime_buf) { switch (iface) { diff --git a/include/ofi.h b/include/ofi.h index 71203c3822e..ae7eb38b751 100644 --- a/include/ofi.h +++ b/include/ofi.h @@ -97,8 +97,7 @@ extern "C" { #define OFI_PRIMARY_RX_CAPS \ (FI_MSG | FI_RMA | FI_TAGGED | FI_ATOMIC | \ FI_REMOTE_READ | FI_REMOTE_WRITE | FI_RECV | \ - FI_DIRECTED_RECV | FI_VARIABLE_MSG | \ - FI_COLLECTIVE | FI_HMEM) + FI_DIRECTED_RECV | FI_COLLECTIVE | FI_HMEM) #define OFI_SECONDARY_RX_CAPS \ (FI_MULTI_RECV | FI_TRIGGER | FI_RMA_PMEM | FI_SOURCE | \ @@ -122,8 +121,7 @@ extern "C" { #define OFI_IGNORED_TX_CAPS /* older Rx caps not applicable to Tx */ \ (FI_REMOTE_READ | FI_REMOTE_WRITE | FI_RECV | FI_DIRECTED_RECV | \ - FI_VARIABLE_MSG | FI_MULTI_RECV | FI_SOURCE | FI_RMA_EVENT | \ - FI_SOURCE_ERR) + FI_MULTI_RECV | FI_SOURCE | FI_RMA_EVENT | FI_SOURCE_ERR) #define OFI_IGNORED_RX_CAPS /* Older Tx caps not applicable to Rx */ \ (FI_READ | FI_WRITE | FI_SEND | FI_FENCE | FI_MULTICAST | \ FI_NAMED_RX_CTX) diff --git a/include/ofi_hmem.h b/include/ofi_hmem.h index fcb7e44c7dc..60a4939e54f 100644 --- a/include/ofi_hmem.h +++ b/include/ofi_hmem.h @@ -62,6 +62,9 @@ CUresult ofi_cuGetErrorName(CUresult error, const char** pStr); CUresult ofi_cuGetErrorString(CUresult error, const char** pStr); CUresult ofi_cuPointerGetAttribute(void *data, CUpointer_attribute attribute, CUdeviceptr ptr); +CUresult ofi_cuPointerGetAttributes(unsigned int num_attributes, + CUpointer_attribute *attributes, + void **data, CUdeviceptr ptr); CUresult ofi_cuDeviceCanAccessPeer(int *canAccessPeer, CUdevice srcDevice, CUdevice dstDevice); cudaError_t ofi_cudaHostRegister(void *ptr, size_t size, unsigned int flags); diff --git a/include/ofi_prov.h b/include/ofi_prov.h index aabce7fc283..08c8efe059e 100644 --- a/include/ofi_prov.h +++ b/include/ofi_prov.h @@ -123,6 +123,17 @@ SOCKETS_INI ; # define SOCKETS_INIT NULL #endif +#if (HAVE_USNIC) && (HAVE_USNIC_DL) +# define USNIC_INI FI_EXT_INI +# define USNIC_INIT NULL +#elif (HAVE_USNIC) +# define USNIC_INI INI_SIG(fi_usnic_ini) +# define USNIC_INIT fi_usnic_ini() +USNIC_INI ; +#else +# define USNIC_INIT NULL +#endif + #if (HAVE_UDP) && (HAVE_UDP_DL) # define UDP_INI FI_EXT_INI # define UDP_INIT NULL diff --git a/include/ofi_util.h b/include/ofi_util.h index d6edfa6abd2..0d357d3136a 100644 --- a/include/ofi_util.h +++ b/include/ofi_util.h @@ -214,6 +214,7 @@ struct util_domain { struct ofi_mr_map mr_map; enum fi_threading threading; enum fi_progress data_progress; + enum fi_progress control_progress; }; int ofi_domain_init(struct fid_fabric *fabric_fid, const struct fi_info *info, @@ -1247,6 +1248,7 @@ void *ofi_ns_resolve_name(struct util_ns *ns, const char *server, * the core by calling add_credits. 
*/ #define OFI_OPS_FLOW_CTRL "ofix_flow_ctrl_v1" +#define OFI_PRIORITY (1ULL << 62) struct ofi_ops_flow_ctrl { size_t size; @@ -1369,6 +1371,13 @@ static inline void ofi_cq_err_memcpy(uint32_t api_version, } } +static inline enum ofi_lock_type +ofi_progress_lock_type(enum fi_threading threading, enum fi_progress control) +{ + return (threading == FI_THREAD_DOMAIN || threading == FI_THREAD_COMPLETION) && + control == FI_PROGRESS_CONTROL_UNIFIED ? OFI_LOCK_NOOP : OFI_LOCK_MUTEX; +} + #ifdef __cplusplus } #endif diff --git a/include/rdma/fabric.h b/include/rdma/fabric.h index a705ba6e200..b0c8252da9c 100644 --- a/include/rdma/fabric.h +++ b/include/rdma/fabric.h @@ -84,7 +84,7 @@ extern "C" { #endif #define FI_MAJOR_VERSION 1 -#define FI_MINOR_VERSION 20 +#define FI_MINOR_VERSION 21 #define FI_REVISION_VERSION 0 enum { @@ -118,11 +118,6 @@ struct fid_nic; typedef struct fid *fid_t; -/* - * Provider specific values are indicated by setting the high-order bit. - */ -#define FI_PROV_SPECIFIC (1U << 31) - /* * Flags * The 64-bit flag field is used as follows: @@ -153,7 +148,7 @@ typedef struct fid *fid_t; #define FI_PEEK (1ULL << 19) #define FI_TRIGGER (1ULL << 20) #define FI_FENCE (1ULL << 21) -#define FI_PRIORITY (1ULL << 22) +/* #define FI_PRIORITY (1ULL << 22) */ #define FI_COMPLETION (1ULL << 24) #define FI_EVENT FI_COMPLETION @@ -169,11 +164,11 @@ typedef struct fid *fid_t; #define FI_MR_DMABUF (1ULL << 40) #define FI_AV_USER_ID (1ULL << 41) #define FI_PEER (1ULL << 43) -#define FI_XPU_TRIGGER (1ULL << 44) +/* #define FI_XPU_TRIGGER (1ULL << 44) */ #define FI_HMEM_HOST_ALLOC (1ULL << 45) #define FI_HMEM_DEVICE_ONLY (1ULL << 46) #define FI_HMEM (1ULL << 47) -#define FI_VARIABLE_MSG (1ULL << 48) +/* #define FI_VARIABLE_MSG (1ULL << 48) */ #define FI_RMA_PMEM (1ULL << 49) #define FI_SOURCE_ERR (1ULL << 50) #define FI_LOCAL_COMM (1ULL << 51) @@ -256,7 +251,8 @@ enum fi_mr_mode { enum fi_progress { FI_PROGRESS_UNSPEC, FI_PROGRESS_AUTO, - FI_PROGRESS_MANUAL + FI_PROGRESS_MANUAL, + FI_PROGRESS_CONTROL_UNIFIED, }; enum fi_threading { @@ -302,8 +298,8 @@ enum fi_ep_type { FI_EP_MSG, FI_EP_DGRAM, FI_EP_RDM, - FI_EP_SOCK_STREAM, - FI_EP_SOCK_DGRAM, + /* FI_EP_SOCK_STREAM, */ + /* FI_EP_SOCK_DGRAM, */ }; /* Endpoint protocol @@ -346,6 +342,7 @@ enum { FI_PROTO_COLL, FI_PROTO_UCX, FI_PROTO_SM2, + FI_PROTO_CXI_RNR, }; enum { @@ -376,8 +373,8 @@ static inline uint8_t fi_tc_dscp_get(uint32_t tclass) #define FI_ASYNC_IOV (1ULL << 57) #define FI_RX_CQ_DATA (1ULL << 56) #define FI_LOCAL_MR (1ULL << 55) -#define FI_NOTIFY_FLAGS_ONLY (1ULL << 54) -#define FI_RESTRICTED_COMP (1ULL << 53) +/* #define FI_NOTIFY_FLAGS_ONLY (1ULL << 54) */ +/* #define FI_RESTRICTED_COMP (1ULL << 53) */ #define FI_CONTEXT2 (1ULL << 52) #define FI_BUFFERED_RECV (1ULL << 51) /* #define FI_PEER_TRANSFER (1ULL << 36) */ @@ -697,11 +694,7 @@ static inline int fi_alias(struct fid *fid, struct fid **alias_fid, uint64_t fla return fi_control(fid, FI_ALIAS, &alias); } -/* fid value names */ -/* - * Currently no common name is defined. Provider specific names should - * have the FI_PROV_SPECIFIC bit set. - */ +/* Provider specific names should set the uppermost bit. */ static inline int fi_get_val(struct fid *fid, int name, void *val) { @@ -792,6 +785,12 @@ struct fi_param { int fi_getparams(struct fi_param **params, int *count); void fi_freeparams(struct fi_param *params); +/* Dummy definitions for removed flags/caps/types. 
For compiling old fabtests */ +#define FI_VARIABLE_MSG 0ULL +#define FI_NOTIFY_FLAGS_ONLY 0ULL +#define FI_RESTRICTED_COMP 0ULL +#define FI_EP_SOCK_STREAM FI_EP_UNSPEC + #ifdef FABRIC_DIRECT #include #endif /* FABRIC_DIRECT */ diff --git a/include/rdma/fi_endpoint.h b/include/rdma/fi_endpoint.h index 3b3b9d49545..ad2a9387dfe 100644 --- a/include/rdma/fi_endpoint.h +++ b/include/rdma/fi_endpoint.h @@ -67,7 +67,7 @@ enum { FI_OPT_TX_SIZE, FI_OPT_RX_SIZE, FI_OPT_FI_HMEM_P2P, /* int */ - FI_OPT_XPU_TRIGGER, /* struct fi_trigger_xpu */ + FI_OPT_XPU_TRIGGER, /* reserved for compatibility */ FI_OPT_CUDA_API_PERMITTED, /* bool */ FI_OPT_SHARED_MEMORY_PERMITTED, /* bool */ }; diff --git a/include/rdma/fi_eq.h b/include/rdma/fi_eq.h index 6664ed48dcf..155c6b65dba 100644 --- a/include/rdma/fi_eq.h +++ b/include/rdma/fi_eq.h @@ -285,7 +285,8 @@ struct fid_cq { */ enum fi_cntr_events { - FI_CNTR_EVENTS_COMP + FI_CNTR_EVENTS_COMP, + FI_CNTR_EVENTS_BYTES /* count bytes not completeion events */ }; struct fi_cntr_attr { diff --git a/include/windows/config.h b/include/windows/config.h index 70f939d8926..02664853f7a 100644 --- a/include/windows/config.h +++ b/include/windows/config.h @@ -256,7 +256,7 @@ #define PACKAGE_TARNAME PACKAGE /* Define to the version of this package. */ -#define PACKAGE_VERSION "1.21.0a1" +#define PACKAGE_VERSION "1.22.0a1" /* Define to the full name and version of this package. */ #define PACKAGE_STRING PACKAGE_NAME " " PACKAGE_VERSION diff --git a/include/windows/osd.h b/include/windows/osd.h index efcad930baf..dcd6ed14c63 100644 --- a/include/windows/osd.h +++ b/include/windows/osd.h @@ -205,6 +205,9 @@ extern "C" { #ifndef ESTALE # define ESTALE 246 /* Stale NFS file handle */ #endif +#ifndef EREMOTEIO +# define EREMOTEIO 247 /* Remote I/O error */ +#endif /* MSG_NOSIGNAL doesn't exist on Windows */ #ifndef MSG_NOSIGNAL @@ -281,6 +284,9 @@ do \ #define be64toh ntohll #define strncasecmp _strnicmp +#define access(path, mode) _access(path, mode) +#define F_OK 0 + typedef int pid_t; #define getpid (int)GetCurrentProcessId @@ -924,6 +930,11 @@ static inline char *strcasestr(const char *haystack, const char *needle) return pos; } +static inline char *strtok_r(char *str, const char *delimiters, char **saveptr) +{ + return strtok_s(str, delimiters, saveptr); +} + #ifndef _SC_PAGESIZE #define _SC_PAGESIZE 0 #endif diff --git a/man/fabric.7.md b/man/fabric.7.md index d9b7ee42eb0..789dd1ef2f7 100644 --- a/man/fabric.7.md +++ b/man/fabric.7.md @@ -251,7 +251,9 @@ Users can enable or disable available providers through build configuration options. See 'configure --help' for details. In general, a specific provider can be controlled using the configure option '--enable-'. For example, '--enable-udp' (or '--enable-udp=yes') will add the udp provider to the -build. To disable the provider, '--enable-udp=no' can be used. +build. To disable the provider, '--enable-udp=no' can be used. To build the +provider as a stand-alone dynamically loadable library (i.e. DL provider), +'--enable-udp=dl' can be used. Providers can also be enable or disabled at run time using the FI_PROVIDER environment variable. The FI_PROVIDER variable is set to a comma separated @@ -259,7 +261,55 @@ list of providers to include. If the list begins with the '^' symbol, then the list will be negated. 
Example: To enable the udp and tcp providers only, set: - FI_PROVIDER="udp,tcp" + `FI_PROVIDER="udp,tcp"` + +When libfabric is installed, DL providers are put under the *default provider path*, +which is determined by how libfabric is built and installed. Usually the +default provider path is `/lib/libfabric` or +`/lib64/libfabric`. By default, libfabric tries to +find DL providers in the following order: + + 1. Use 'dlopen' to load provider libraries named `lib-fi.so` for + all providers enabled at build time. The search path of 'ld.so' is used + to locate the files. This step is skipped if libfabric is configured with + the option '--enable-restricted-dl'. + + 2. Try to load every file under the default provider path as a DL provider. + +The FI_PROVIDER_PATH variable can be used to change the location to search +for DL providers and how to resolve conflicts if multiple providers with the +same name are found. Setting FI_PROVIDER_PATH to any value, even if empty, +would cause step 1 be skipped, and may change the search directory used in +step 2. + +In the simplest form, the FI_PROVIDER_PATH variable is set to a colon +separated list of directories. These directories replace the default provider +path used in step 2. For example: + + FI_PROVIDER_PATH=/opt/libfabric:/opt/libfabric2 + +By default, if multiple providers (including the built-in providers) with the +same name are found, the first one with the highest version is active and all +the others are hidden. This can be changed by setting the FI_PROVIDER_PATH +variable to start with '@', which force the first one to be active regardless +of the version. For example: + + FI_PROVIDER_PATH=@/opt/libfabric:/opt/libfabric2 + +The FI_PROVIDER_PATH variable can also specify preferred providers by supplying +full paths to libraries instead of directories to search under. A preferred +provider takes precedence over other providers with the same name. The +specification of a preferred provider must be prefixed with '+'. For example: + + FI_PROVIDER_PATH=+/opt/libfabric2/libtcp-fi.so:/opt/libfabric:+/opt/libfabric2/libudp-fi.so + +If FI_PROVIDER_PATH is set, but no directory is supplied, the default +provider path is used. Some examples: + + FI_PROVIDER_PATH= + FI_PROVIDER_PATH=@ + FI_PROVIDER_PATH=+/opt/libfabric/libtcp-fi.so + FI_PROVIDER_PATH=@+/opt/libfabric/libtcp-fi.so The fi_info utility, which is included as part of the libfabric package, can be used to retrieve information about which providers are available in the diff --git a/man/fi_cntr.3.md b/man/fi_cntr.3.md index cc87f5ab909..a5d1ea3e5cc 100644 --- a/man/fi_cntr.3.md +++ b/man/fi_cntr.3.md @@ -123,6 +123,26 @@ struct fi_cntr_attr { on all successful completions, separately from whether the operation generates an entry in an event queue. +- *FI_CNTR_EVENTS_BYTES* +: The counter is incremented by the number of user bytes, + excluding any CQ data, transferred in a transport message + upon reaching the specified completion semantic. + For initiator side counters, the count reflects the size of the + requested transfer and is updated after the message reaches + the desired completion level (FI_INJECT_COMPLETE, + FI_TRANSMIT_COMPLETE, etc.). For send and write operations, + the count reflects the number of bytes transferred to the peer. + For read operations, the count reflects the number of bytes + returned in a read response. Operations which may both + write and read data, such as atomics, behave as read operations + at the initiator, but writes at the target. 
For target side + counters, the count reflects the size of received user data + and is incremented subject to target side completion semantics. + In most cases, this indicates FI_DELIVERY_COMPLETE, but may + differ when accessing device memory (HMEM). On error, the + tranfer size is not applied to the error field, that field is + increment by 1. The FI_COLLECTIVE transfer type is not supported. + *wait_obj* : Counters may be associated with a specific wait object. Wait objects allow applications to block until the wait object is diff --git a/man/fi_cq.3.md b/man/fi_cq.3.md index 15ac8091b2a..ab3684a01e0 100644 --- a/man/fi_cq.3.md +++ b/man/fi_cq.3.md @@ -985,12 +985,6 @@ A completion queue must be bound to at least one enabled endpoint before any operation such as fi_cq_read, fi_cq_readfrom, fi_cq_sread, fi_cq_sreadfrom etc. can be called on it. -Completion flags may be suppressed if the FI_NOTIFY_FLAGS_ONLY mode bit -has been set. When enabled, only the following flags are guaranteed to -be set in completion data when they are valid: FI_REMOTE_READ and -FI_REMOTE_WRITE (when FI_RMA_EVENT capability bit has been set), -FI_REMOTE_CQ_DATA, and FI_MULTI_RECV. - If a completion queue has been overrun, it will be placed into an 'overrun' state. Read operations will continue to return any valid, non-corrupted completions, if available. After all valid completions have been retrieved, diff --git a/man/fi_cxi.7.md b/man/fi_cxi.7.md index 7cc3d288675..0a32850cee8 100644 --- a/man/fi_cxi.7.md +++ b/man/fi_cxi.7.md @@ -229,6 +229,18 @@ CXI integrated launcher and CXI authorization key aware libfabric user: 7. Application processes select from the list of available service IDs and VNIs to form an authorization key to use for Endpoint allocation. +## Endpoint Protocols + +The provider supports multiple endpoint protocols. The default protocol is +FI_PROTO_CXI and fully supports the messaging requirements of parallel +applicaitons. + +The FI_PROTO_CXI_RNR endpoint protocol is an optional protocol that targets +client/server environments where send-after-send ordering is not required and +messaging is generally to pre-posted buffers; FI_MULTI_RECV is recommended. +It utilizes a receiver-not-ready implementation where +*FI_CXI_RNR_MAX_TIMEOUT_US* can be tuned to control the maximum retry duration. + ## Address Vectors The CXI provider supports both *FI_AV_TABLE* and *FI_AV_MAP* with the same @@ -433,6 +445,15 @@ faults but requires all buffers to be backed by physical memory. Copy-on-write semantics are broken when using pinned memory. See the Fork section for more information. +The CXI provider supports DMABUF for device memory registration. If the ROCR +and CUDA libraries support it, the CXI provider will default to use DMA-buf. +There may be situations with CUDA that may double the BAR consumption. +Until this is fixed in the CUDA stack, the environment variable +*FI_CXI_DISABLE_DMABUF_CUDA* can be used to fall back to the nvidia +peer-memory interface. +Also, *FI_CXI_DISABLE_DMABUF_ROCR* can be used to fall back to the amdgpu +peer-memory interface. + ## Translation Cache Mapping a buffer for use by the NIC is an expensive operation. To avoid this @@ -1077,6 +1098,12 @@ The CXI provider checks for the following environment variables: *FI_CXI_DEFAULT_VNI* : Default VNI value used only for service IDs where the VNI is not restricted. 
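As an editorial illustration of the optional endpoint protocol described in the fi_cxi.7 changes above (this sketch is not part of the patch), an application can request FI_PROTO_CXI_RNR through fi_getinfo hints. The provider name string "cxi", the FI_VERSION(1, 21) value, and the capability choices are assumptions; only the FI_PROTO_CXI_RNR value itself comes from this patch.

```c
/* Hedged sketch: opt into the RNR protocol via fi_getinfo() hints.
 * Assumes libfabric 1.21 headers and a node with the cxi provider. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <rdma/fabric.h>
#include <rdma/fi_errno.h>

int main(void)
{
	struct fi_info *hints, *info;
	int ret;

	hints = fi_allocinfo();
	if (!hints)
		return EXIT_FAILURE;

	hints->fabric_attr->prov_name = strdup("cxi");
	hints->ep_attr->type = FI_EP_RDM;
	hints->ep_attr->protocol = FI_PROTO_CXI_RNR; /* request the RNR protocol */
	hints->caps = FI_MSG | FI_MULTI_RECV;        /* pre-posted buffers recommended */
	hints->tx_attr->msg_order = 0;               /* FI_ORDER_SAS not required */

	ret = fi_getinfo(FI_VERSION(1, 21), NULL, NULL, 0, hints, &info);
	if (ret) {
		fprintf(stderr, "fi_getinfo: %s\n", fi_strerror(-ret));
		fi_freeinfo(hints);
		return EXIT_FAILURE;
	}

	printf("provider %s, endpoint protocol %u\n",
	       info->fabric_attr->prov_name, info->ep_attr->protocol);
	fi_freeinfo(info);
	fi_freeinfo(hints);
	return EXIT_SUCCESS;
}
```

If the provider accepts the hint, the returned fi_info reports FI_PROTO_CXI_RNR in ep_attr->protocol, and the retry window is then tuned with the environment variable documented next.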
+*FI_CXI_RNR_MAX_TIMEOUT_US* +: When using the endpoint FI_PROTO_CXI_RNR protocol, this setting is used to + control the maximum time from the original posting of the message that the + message should be retried. A value of 0 will return an error completion + on the first RNR ack status. + *FI_CXI_EQ_ACK_BATCH_SIZE* : Number of EQ events to process before writing an acknowledgement to HW. Batching ACKs amortizes the cost of event acknowledgement over multiple diff --git a/man/fi_domain.3.md b/man/fi_domain.3.md index e98b96d2c8c..e9f3000f981 100644 --- a/man/fi_domain.3.md +++ b/man/fi_domain.3.md @@ -238,24 +238,25 @@ The name of the access domain. The threading model specifies the level of serialization required of an application when using the libfabric data transfer interfaces. -Control interfaces are always considered thread safe, and may be -accessed by multiple threads. Applications which can guarantee -serialization in their access of provider allocated resources and -interfaces enables a provider to eliminate lower-level locks. +Control interfaces are always considered thread safe unless the +control progress model is FI_PROGRESS_CONTROL_UNIFIED. A thread safe +control interface allows multiple threads to progress the control +interface, and (depending on threading model selected) one or more +threads to progress the data interfaces at the same time. Applications +which can guarantee serialization in their access of provider allocated +resources and interfaces enable a provider to eliminate lower-level locks. *FI_THREAD_COMPLETION* -: The completion threading model is intended for providers that make use - of manual progress. Applications must serialize access to all objects - that are associated through the use of having a shared completion - structure. This includes endpoint, transmit context, receive context, - completion queue, counter, wait set, and poll set objects. +: The completion threading model is best suited for multi-threaded applications + using scalable endpoints which desire lockless operation. Applications must + serialize access to all objects that are associated by a common completion + mechanism (for example, endpoints bound to the same CQ or counter). It is + recommended that providers which support scalable endpoints also support this + threading model. - For example, threads must serialize access to an endpoint and its - bound completion queue(s) and/or counters. Access to endpoints that - share the same completion queue must also be serialized. - - The use of FI_THREAD_COMPLETION can increase parallelism over - FI_THREAD_SAFE, but requires the use of isolated resources. + Applications wanting to leverage FI_THREAD_COMPLETION should allocate + transmit contexts, receive contexts, and completion queues and counters to + individual threads. *FI_THREAD_DOMAIN* : A domain serialization model requires applications to serialize @@ -331,7 +332,7 @@ Progress frequently requires action being taken at both the transmitting and receiving sides of an operation. This is often a requirement for reliable transfers, as a result of retry and acknowledgement processing. -To balance between performance and ease of use, two progress models +To balance between performance and ease of use, the following progress models are defined. *FI_PROGRESS_AUTO* @@ -375,6 +376,14 @@ are defined. manual progress may still need application assistance to process received operations. 
+*FI_PROGRESS_CONTROL_UNIFIED* +: This progress model indicates that the user will synchronize progressing the + data and control operations themselves (i.e. this allows the control interface + to NOT be thread safe). It is only valid for control progress (not data progress). + Setting control=FI_PROGRESS_CONTROL_UNIFIED, data=FI_PROGRESS_MANUAL, and + threading=FI_THREAD_DOMAIN/FI_THREAD_COMPLETION allows Libfabric to remove all + locking in the critical data progress path. + *FI_PROGRESS_UNSPEC* : This value indicates that no progress model has been defined. It may be used on input hints to the fi_getinfo call. @@ -741,15 +750,6 @@ The following are supported secondary capabilities: See [`fi_getinfo`(3)](fi_getinfo.3.html) for a discussion on primary versus secondary capabilities. -## mode - -The operational mode bit related to using the domain. - -*FI_RESTRICTED_COMP* -: This bit indicates that the domain limits completion queues and counters - to only be used with endpoints, transmit contexts, and receive contexts that - have the same set of capability flags. - ## Default authorization key (auth_key) The default authorization key to associate with endpoint and memory diff --git a/man/fi_endpoint.3.md b/man/fi_endpoint.3.md index 819e00d350a..78ce6947e89 100644 --- a/man/fi_endpoint.3.md +++ b/man/fi_endpoint.3.md @@ -551,22 +551,6 @@ The following option levels and option names and parameters are defined. : The FI_HMEM_DISABLE_P2P environment variable discussed in [`fi_mr`(3)](fi_mr.3.html) takes precedence over this setopt option. -- *FI_OPT_XPU_TRIGGER - struct fi_trigger_xpu \** -: This option only applies to the fi_getopt() call. It is used to query - the maximum number of variables required to support XPU - triggered operations, along with the size of each variable. - - The user provides a filled out struct fi_trigger_xpu on input. The iface - and device fields should reference an HMEM domain. If the provider does not - support XPU triggered operations from the given device, fi_getopt() will - return -FI_EOPNOTSUPP. On input, var should reference an array of - struct fi_trigger_var data structures, with count set to the size of the - referenced array. If count is 0, the var field will be ignored, and the - provider will return the number of fi_trigger_var structures needed. If - count is > 0, the provider will set count to the needed value, and for - each fi_trigger_var available, set the datatype and count of the variable - used for the trigger. - - *FI_OPT_CUDA_API_PERMITTED - bool \** : This option only applies to the fi_setopt call. It is used to control endpoint's behavior in making calls to CUDA API. By default, an endpoint @@ -662,20 +646,6 @@ desired. Supported types are: transfer service with flow control that maintains message boundaries. -*FI_EP_SOCK_DGRAM* -: A connectionless, unreliable datagram endpoint with UDP socket-like - semantics. FI_EP_SOCK_DGRAM is most useful for applications designed - around using UDP sockets. See the SOCKET ENDPOINT section for additional - details and restrictions that apply to datagram socket endpoints. - -*FI_EP_SOCK_STREAM* -: Data streaming endpoint with TCP socket-like semantics. Provides - a reliable, connection-oriented data transfer service that does - not maintain message boundaries. FI_EP_SOCK_STREAM is most useful for - applications designed around using TCP sockets. See the SOCKET - ENDPOINT section for additional details and restrictions that apply - to stream endpoints. 
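Stepping back to the FI_PROGRESS_CONTROL_UNIFIED model added in the fi_domain.3 changes above, the following hedged sketch (an editorial aside, not text from the patch) shows the hint combination that description calls out: FI_THREAD_DOMAIN threading, manual data progress, and unified control progress. The capability and endpoint-type choices are assumptions.

```c
/* Hedged sketch: ask for the lock-free configuration enabled by
 * FI_PROGRESS_CONTROL_UNIFIED.  The application promises to serialize
 * control and data progress itself.  Assumes libfabric 1.21 headers. */
#include <stdio.h>
#include <rdma/fabric.h>
#include <rdma/fi_errno.h>

static struct fi_info *get_lockless_info(void)
{
	struct fi_info *hints, *info = NULL;
	int ret;

	hints = fi_allocinfo();
	if (!hints)
		return NULL;

	hints->caps = FI_MSG;
	hints->ep_attr->type = FI_EP_RDM;
	hints->domain_attr->threading = FI_THREAD_DOMAIN;
	hints->domain_attr->data_progress = FI_PROGRESS_MANUAL;
	hints->domain_attr->control_progress = FI_PROGRESS_CONTROL_UNIFIED;

	ret = fi_getinfo(FI_VERSION(1, 21), NULL, NULL, 0, hints, &info);
	if (ret)
		fprintf(stderr, "fi_getinfo: %s\n", fi_strerror(-ret));

	fi_freeinfo(hints);
	return info; /* NULL on failure; caller frees with fi_freeinfo() */
}
```

With this combination, providers built on the common util code can select OFI_LOCK_NOOP through the ofi_progress_lock_type() helper added to include/ofi_util.h, removing mutexes from the critical progress path.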
- *FI_EP_UNSPEC* : The type of endpoint is not specified. This is usually provided as input, with other attributes of the endpoint or the provider @@ -753,6 +723,15 @@ protocol value set to one. : Protocol for intra-node communication using shared memory segments used by the sm2 provider +*FI_PROTO_CXI* +: Reliable-datagram protocol optimized for HPC applications + used by cxi provider. + +*FI_PROTO_CXI_RNR* +: A version of the FI_PROTO_CXI protocol that implements an RNR + protocol which can be used when messaging is primarily expected + and FI_ORDER_SAS ordering is not required. + *FI_PROTO_UNSPEC* : The protocol is not specified. This is usually provided as input, with other attributes of the socket or the provider selecting the @@ -1269,7 +1248,7 @@ capability bits from the fi_info structure will be used. The following capabilities apply to the receive attributes: FI_MSG, FI_RMA, FI_TAGGED, FI_ATOMIC, FI_REMOTE_READ, FI_REMOTE_WRITE, FI_RECV, -FI_HMEM, FI_TRIGGER, FI_RMA_PMEM, FI_DIRECTED_RECV, FI_VARIABLE_MSG, +FI_HMEM, FI_TRIGGER, FI_RMA_PMEM, FI_DIRECTED_RECV, FI_MULTI_RECV, FI_SOURCE, FI_RMA_EVENT, FI_SOURCE_ERR, FI_COLLECTIVE, and FI_XPU. @@ -1653,7 +1632,7 @@ Fabric errno values are defined in `rdma/fi_errno.h`. made to bind multiple domains. *-FI_ENOCQ* -: The endpoint has not been configured with necessary event queue. +: The endpoint has not been configured with necessary completion queue. *-FI_EOPBADSTATE* : The endpoint's state does not permit the requested operation. diff --git a/man/fi_errno.3.md b/man/fi_errno.3.md index 4e3ca7dcd88..476fbce6c89 100644 --- a/man/fi_errno.3.md +++ b/man/fi_errno.3.md @@ -61,12 +61,18 @@ const char *fi_strerror(int errno); *FI_ENOSYS* : Function not implemented +*FI_EWOULDBLOCK* +: Operation would block + *FI_ENOMSG* : No message of desired type *FI_ENODATA* : No data available +*FI_EOVERFLOW* +: Value too large for defined data type + *FI_EMSGSIZE* : Message too long @@ -94,6 +100,9 @@ const char *fi_strerror(int errno); *FI_ECONNRESET* : Connection reset by peer +*FI_ENOBUFS* +: No buffer space available + *FI_EISCONN* : Transport endpoint is already connected @@ -109,6 +118,9 @@ const char *fi_strerror(int errno); *FI_ECONNREFUSED* : Connection refused +*FI_EHOSTDOWN* +: Host is down + *FI_EHOSTUNREACH* : No route to host @@ -154,6 +166,27 @@ const char *fi_strerror(int errno); *FI_ENOCQ* : Missing or unavailable completion queue +*FI_ECRC* +: CRC error + +*FI_ETRUNC* +: Truncation error + +*FI_ENOKEY* +: Required key not available + +*FI_ENOAV* +: Missing or unavailable address vector + +*FI_EOVERRUN* +: Queue has been overrun + +*FI_ENORX* +: Receiver not ready, no receive buffers available + +*FI_ENOMR* +: Memory registration limit exceeded + # SEE ALSO [`fabric`(7)](fabric.7.html) diff --git a/man/fi_getinfo.3.md b/man/fi_getinfo.3.md index e97bbb5a42b..0464dd0904f 100644 --- a/man/fi_getinfo.3.md +++ b/man/fi_getinfo.3.md @@ -275,7 +275,7 @@ additional optimizations. : Requests that the provider support the association of a user specified identifier with each address vector (AV) address. User identifiers are returned with completion data in place of the AV address. See - [`fi_domain`(3)](fi_domain.3.html) and [`fi_av`(3)] (fi_av.3.html) for + [`fi_domain`(3)](fi_domain.3.html) and [`fi_av`(3)](fi_av.3.html) for more details. *FI_COLLECTIVE* @@ -429,16 +429,6 @@ additional optimizations. Endpoints support this capability must meet the usage model as described by [`fi_trigger`(3)](fi_trigger.3.html). 
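As a small, hedged illustration of the error codes newly documented in the fi_errno.3 changes above (an editorial addition, not part of the patch), fi_strerror() maps each code to the string shown in that list; API calls return the negated value, so callers normally pass -ret.

```c
/* Hedged sketch: print the strings behind a few fabric errno values that
 * fi_errno.3 now documents.  Illustration only. */
#include <stdio.h>
#include <rdma/fi_errno.h>

int main(void)
{
	int codes[] = { FI_ENORX, FI_ENOMR, FI_EOVERRUN, FI_ENOKEY, FI_ENOAV };
	size_t i;

	for (i = 0; i < sizeof(codes) / sizeof(codes[0]); i++)
		printf("%d: %s\n", codes[i], fi_strerror(codes[i]));
	return 0;
}
```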
-*FI_VARIABLE_MSG* - -: Requests that the provider must notify a receiver when a variable - length message is ready to be received prior to attempting to place - the data. Such notification will include the size of the message and - any associated message tag (for FI_TAGGED). See 'Variable Length - Messages' in fi_msg.3 for full details. Variable length messages - are any messages larger than an endpoint configurable size. This - flag requires that FI_MSG and/or FI_TAGGED be set. - *FI_WRITE* : Indicates that the user requires an endpoint capable of initiating writes against remote memory regions. This flag requires that FI_RMA @@ -466,8 +456,7 @@ may optionally report non-selected secondary capabilities if doing so would not compromise performance or security. Primary capabilities: FI_MSG, FI_RMA, FI_TAGGED, FI_ATOMIC, FI_MULTICAST, -FI_NAMED_RX_CTX, FI_DIRECTED_RECV, FI_VARIABLE_MSG, FI_HMEM, FI_COLLECTIVE, -FI_XPU, FI_AV_USER_ID +FI_NAMED_RX_CTX, FI_DIRECTED_RECV, FI_HMEM, FI_COLLECTIVE, FI_XPU, FI_AV_USER_ID Primary modifiers: FI_READ, FI_WRITE, FI_RECV, FI_SEND, FI_REMOTE_READ, FI_REMOTE_WRITE @@ -593,20 +582,6 @@ supported set of modes will be returned in the info structure(s). must be a contiguous region, though it may or may not be directly adjacent to the payload portion of the buffer. -*FI_NOTIFY_FLAGS_ONLY* -: This bit indicates that general completion flags may not be set by - the provider, and are not needed by the application. If specified, - completion flags which simply report the type of operation that - completed (e.g. send or receive) may not be set. However, - completion flags that are used for remote notifications will still - be set when applicable. See [`fi_cq`(3)](fi_cq.3.html) for details on - which completion flags are valid when this mode bit is enabled. - -*FI_RESTRICTED_COMP* -: This bit indicates that the application will only share completion queues - and counters among endpoints, transmit contexts, and receive contexts that - have the same set of capability flags. - *FI_RX_CQ_DATA* : This mode bit only applies to data transfers that set FI_REMOTE_CQ_DATA. When set, a data transfer that carries remote CQ data will consume a diff --git a/man/fi_info.1.md b/man/fi_info.1.md index 437ee1db1e5..e27fc726214 100644 --- a/man/fi_info.1.md +++ b/man/fi_info.1.md @@ -73,7 +73,7 @@ providers, see the `--list` option. : List libfabric related environment variables which can be used to enable extra configuration or tuning. -*-g [filter] +*-g [filter]* : Same as -e option, with output limited to environment variables containing filter as a substring. diff --git a/man/fi_mlx.7.md b/man/fi_mlx.7.md deleted file mode 100644 index 0c382356f88..00000000000 --- a/man/fi_mlx.7.md +++ /dev/null @@ -1,20 +0,0 @@ ---- -layout: page -title: fi_mlx(7) -tagline: Libfabric Programmer's Manual ---- -{% include JB/setup %} - -# NAME - -fi_mlx \- The MLX Fabric Provider - -# OVERVIEW - -The mlx provider was deprecated and removed in libfabric 1.9 -due to a lack of a maintainer. - -# SEE ALSO - -[`fabric`(7)](fabric.7.html), -[`fi_provider`(7)](fi_provider.7.html), diff --git a/man/fi_mr.3.md b/man/fi_mr.3.md index d9233d50400..b2a44ded2f5 100644 --- a/man/fi_mr.3.md +++ b/man/fi_mr.3.md @@ -422,7 +422,7 @@ this call. Returns the remote protection key associated with a MR. The memory registration must have completed successfully before invoking this. The returned key may be used in data transfer operations at a peer. 
If the -FI_RAW_MR mode bit has been set for the domain, then the memory key must +FI_MR_RAW mode bit has been set for the domain, then the memory key must be obtained using the fi_mr_raw_key function instead. A return value of FI_KEY_NOTAVAIL will be returned if the registration has not completed or a raw memory key is required. @@ -431,7 +431,7 @@ or a raw memory key is required. Returns the raw, remote protection key and base address associated with a MR. The memory registration must have completed successfully before invoking -this routine. Use of this call is required if the FI_RAW_MR mode bit has +this routine. Use of this call is required if the FI_MR_RAW mode bit has been set by the provider; however, it is safe to use this call with any memory region. @@ -452,7 +452,7 @@ can be used for data transfer operations. The mapping is done by the peer that initiates the RMA or atomic operation. The mapping function takes as input the raw key and its size, and returns the mapped key. Use of the fi_mr_map_raw function is required if the peer has the -FI_RAW_MR mode bit set, but this routine may be called on any valid +FI_MR_RAW mode bit set, but this routine may be called on any valid key. All mapped keys must be freed by calling fi_mr_unmap_key when access to the peer memory region is no longer necessary. @@ -766,7 +766,7 @@ the memory region. The application is responsible for transferring this key to the peer. If FI_MR_RAW mode has been set, the key must be retrieved using the fi_mr_raw_attr function. -FI_RAW_MR allows support for providers that require more than 8-bytes for +FI_MR_RAW allows support for providers that require more than 8-bytes for their protection keys or need additional setup before a key can be used for transfers. After a raw key has been retrieved, it must be exchanged with the remote peer. The peer must use fi_mr_map_raw to convert @@ -823,7 +823,7 @@ The follow flag may be specified to any memory registration call. fi_mr_attr structure. This flag is only usable for domains opened with FI_HMEM capability support. -- *FI_AUTH_KEY* +*FI_AUTH_KEY* : Only valid with domains configured with FI_AV_AUTH_KEY. When used with fi_mr_regattr, this flag denotes that the fi_mr_auth_key::src_addr field contains an authorization key fi_addr_t (i.e. fi_addr_t returned from diff --git a/man/fi_msg.3.md b/man/fi_msg.3.md index ec3764bc162..9e54027a132 100644 --- a/man/fi_msg.3.md +++ b/man/fi_msg.3.md @@ -226,7 +226,7 @@ fi_sendmsg. *FI_CLAIM* : Applies to posted receive operations for endpoints configured - for FI_BUFFERED_RECV or FI_VARIABLE_MSG. This flag is used to + for FI_BUFFERED_RECV. This flag is used to retrieve a message that was buffered by the provider. See the Buffered Receives section for details. @@ -238,7 +238,7 @@ fi_sendmsg. *FI_DISCARD* : Applies to posted receive operations for endpoints configured - for FI_BUFFERED_RECV or FI_VARIABLE_MSG. This flag is used to + for FI_BUFFERED_RECV. This flag is used to free a message that was buffered by the provider. See the Buffered Receives section for details. @@ -399,30 +399,6 @@ restrictions assigned to an endpoint. For example, completions may indicate the order in which received messages arrived at the receiver based on the endpoint attributes. -# Variable Length Messages - -Variable length messages, or simply variable messages, are transfers -where the size of the message is unknown to the receiver prior to the -message being sent. 
It indicates that the recipient of a message does -not know the amount of data to expect prior to the message arriving. -It is most commonly used when the size of message transfers varies -greatly, with very large messages interspersed with much smaller -messages, making receive side message buffering difficult to manage. -Variable messages are not subject to max message length -restrictions (i.e. struct fi_ep_attr::max_msg_size limits), and may -be up to the maximum value of size_t (e.g. SIZE_MAX) in length. - -Variable length messages support requests that the provider allocate and -manage the network message buffers. As a result, the application -requirements and provider behavior is identical as those defined -for supporting the FI_BUFFERED_RECV mode bit. See the Buffered -Receive section above for details. The main difference is that buffered -receives are limited by the fi_ep_attr::max_msg_size threshold, whereas -variable length messages are not. - -Support for variable messages is indicated through the FI_VARIABLE_MSG -capability bit. - # NOTES If an endpoint has been configured with FI_MSG_PREFIX, the application diff --git a/man/fi_netdir.7.md b/man/fi_netdir.7.md deleted file mode 100644 index dccf4c72ec3..00000000000 --- a/man/fi_netdir.7.md +++ /dev/null @@ -1,116 +0,0 @@ ---- -layout: page -title: fi_netdir(7) -tagline: Libfabric Programmer's Manual ---- -{% include JB/setup %} - -# NAME - -fi_netdir \- The Network Direct Fabric Provider - -# OVERVIEW - -The Network Direct provider enables applications using OFI to be run over -any verbs hardware (Infiniband, iWarp and etc). It uses the Microsoft Network -Direct SPI for network transport and provides a translation of OFI calls to -appropriate Network Direct API calls. -The Network Direct providers allows to OFI-based applications utilize -zero-copy data transfers between applications, kernel-bypass I/O generation and -one-sided data transfer operations on Microsoft Windows OS. -An application is able to use OFI with Network Direct provider enabled on -Windows OS to expose the capabilities of the networking devices if the hardware -vendors of the devices implemented the Network Direct service provider interface -(SPI) for their hardware. - -# SUPPORTED FEATURES - -The Network Direct provider support the following features defined for the -libfabric API: - -*Endpoint types* -: The provider support the FI_EP_MSG endpoint types. - -*Memory registration modes* -: The provider implements the *FI_MR_BASIC* memory registration mode. - -*Data transfer operations* -: The following data transfer interfaces are supported for the following - endpoint types: *FI_MSG*, *FI_RMA*. See DATA TRANSFER OPERATIONS below - for more details. - -*Modes* -: The Network Direct provider requires applications to support - the following modes: - * FI_LOCAL_MR for all applications. - -*Addressing Formats* -: Supported addressing formats include FI_SOCKADDR, FI_SOCKADDR_IN, FI_SOCKADDR_IN6 - -*Progress* -: The Network Direct provider supports FI_PROGRESS_AUTO: Asynchronous operations - make forward progress automatically. - -*Operation flags* -: The provider supports FI_INJECT, FI_COMPLETION, FI_TRANSMIT_COMPLETE, - FI_INJECT_COMPLETE, FI_DELIVERY_COMPLETE, FI_SELECTIVE_COMPLETION - -*Completion ordering* -: RX/TX contexts: FI_ORDER_STRICT - -*Other supported features* -: Multiple input/output vector (IOV) is supported for FI_RMA read/write and - FI_MSG receive/transmit operations. - -# LIMITATIONS - -*Memory Regions* -: Only FI_MR_BASIC mode is supported. 
Adding regions via s/g list is - supported only up to a s/g list size of 1. No support for binding memory - regions to a counter. - -*Wait objects* -: Wait object and wait sets are not supported. - -*Resource Management* -: Application has to make sure CQs are not overrun as this cannot be detected - by the provider. - -*Unsupported Endpoint types* -: FI_EP_DGRAM, FI_EP_RDM - -*Other unsupported features* -: Scalable endpoints, FABRIC_DIRECT - -*Unsupported features specific to MSG endpoints* -: FI_SOURCE, FI_TAGGED, FI_CLAIM, fi_ep_alias, shared TX context, operations. - -# RUNTIME PARAMETERS - -The Network Direct provider checks for the following environment variables. - -### Variables specific to RDM endpoints - -*FI_NETDIR_INLINETHR* -: The size of the (default: 8 Kbyte): - * Transmitted data that can be inlined - * Preposted data for the unexpected receive queue - -*FI_NETDIR_PREPOSTCNT* -: The number of pre-registered buffers between the endpoints that are not require - internal ACK messages, must be a power of 2 (default: 8). - -*FI_NETDIR_PREPOSTBUFCNT* -: The number of preposted arrays of buffers, must be a power of 2 (default: 1). - -### Environment variables notes -The fi_info utility would give the up-to-date information on environment variables: -fi_info -p netdir -e - -# SEE ALSO - -[`fabric`(7)](fabric.7.html), -[`fi_open_ops`(3)](fi_open_ops.3.html), -[`fi_provider`(7)](fi_provider.7.html), -[`fi_getinfo`(3)](fi_getinfo.3.html) -[`fi_atomic`(3)](fi_atomic.3.html) diff --git a/man/fi_opx.7.md b/man/fi_opx.7.md index 54d241fa21f..882f884632d 100644 --- a/man/fi_opx.7.md +++ b/man/fi_opx.7.md @@ -192,8 +192,12 @@ OPX is not compatible with Open MPI 4.1.x PML/BTL. - `FI_OPX_HFI_SELECT=default,core:1:0` all callers will use default HFI selection logic. *FI_OPX_DELIVERY_COMPLETION_THRESHOLD* -: Integer. The minimum message length in bytes to force delivery completion. - Value must be between 16385 and 2147483646. Defaults to 16385. +: Integer. Will be deprecated. Please use FI_OPX_SDMA_BOUNCE_BUF_THRESHOLD. + +*FI_OPX_SDMA_BOUNCE_BUF_THRESHOLD* +: Integer. The maximum message length in bytes that will be copied to the SDMA bounce buffer. + For messages larger than this threshold, the send will not be completed until receiver + has ACKed. Value must be between 16385 and 2147483646. Defaults to 16385. *FI_OPX_SDMA_DISABLE* : Integer. Disables SDMA offload hardware. Default is 0 diff --git a/man/fi_pingpong.1.md b/man/fi_pingpong.1.md index ab59d45e028..f39dae502c6 100644 --- a/man/fi_pingpong.1.md +++ b/man/fi_pingpong.1.md @@ -72,7 +72,7 @@ given domains cannot communicate, then the application will fail. ## Fabric Filtering *-p \* -: The name of the underlying fabric provider (e.g., sockets, psm3, etc.). +: The name of the underlying fabric provider (e.g., sockets, psm3, usnic, etc.). If a provider is not specified via the -p switch, the test will pick one from the list of available providers (as returned by fi_getinfo(3)). @@ -84,6 +84,9 @@ given domains cannot communicate, then the application will fail. *-d \* : The name of the specific domain to be used. +*-s \* +: Address to corresponding domain. Required in multi-adapter environment. + ## Test Options *-I \* @@ -119,15 +122,15 @@ given domains cannot communicate, then the application will fail. 
## An example with various options ### Server: -`server$ fi_pingpong -p tcp -I 1000 -S 1024` +`server$ fi_pingpong -p usnic -I 1000 -S 1024` ### Client: -`client$ fi_pingpong -p tcp -I 1000 -S 1024 192.168.0.123` +`client$ fi_pingpong -p usnic -I 1000 -S 1024 192.168.0.123` Specifically, this will run a pingpong test with: -- tcp provider +- usNIC provider - 1000 iterations - 1024 bytes message size - server node as 192.168.0.123 @@ -135,10 +138,10 @@ Specifically, this will run a pingpong test with: ## A longer test ### Server: -`server$ fi_pingpong -p tcp -I 10000 -S all` +`server$ fi_pingpong -p usnic -I 10000 -S all` ### Client: -`client$ fi_pingpong -p tcp -I 10000 -S all 192.168.0.123` +`client$ fi_pingpong -p usnic -I 10000 -S all 192.168.0.123` # DEFAULTS diff --git a/man/fi_provider.7.md b/man/fi_provider.7.md index 46e20cbf431..ba820906a12 100644 --- a/man/fi_provider.7.md +++ b/man/fi_provider.7.md @@ -67,6 +67,10 @@ The following core providers are built into libfabric by default, assuming all build pre-requisites are met. That is, necessary libraries are installed, operating system support is available, etc. This list is not exhaustive. +*CXI* +: Provider for Cray's Slingshot network. See + [`fi_cxi`(7)](fi_cxi.7.html) for more information. + *EFA* : A provider for the [Amazon EC2 Elastic Fabric Adapter (EFA)](https://aws.amazon.com/hpc/efa/), a custom-built OS bypass diff --git a/man/fi_rxm.7.md b/man/fi_rxm.7.md index f5ae80716c5..cc163f08cbb 100644 --- a/man/fi_rxm.7.md +++ b/man/fi_rxm.7.md @@ -175,6 +175,12 @@ with (default: 256). consecutively read across progress calls without checking to see if the CM progress interval has been reached (default: 128) +*FI_OFI_RXM_DETECT_HMEM_IFACE* +: Set this to 1 to allow automatic detection of HMEM iface of user buffers + when such information is not supplied. This feature allows such buffers be + copied or registered (e.g. in Rendezvous) internally by RxM. Note that no + extra memory registration is performed with this option. (default: false) + # Tuning ## Bandwidth diff --git a/man/fi_setup.7.md b/man/fi_setup.7.md index 430aa95c493..096dd5f5115 100644 --- a/man/fi_setup.7.md +++ b/man/fi_setup.7.md @@ -947,8 +947,12 @@ struct fi_cq_err_entry { /* Sample error handling */ struct fi_cq_msg_entry entry; struct fi_cq_err_entry err_entry; +char err_data[256]; int ret; +err_entry.err_data = err_data; +err_entry.err_data_size = 256; + ret = fi_cq_read(cq, &entry, 1); if (ret == -FI_EAVAIL) ret = fi_cq_readerr(cq, &err_entry, 0); diff --git a/man/fi_tagged.3.md b/man/fi_tagged.3.md index 146bb306a11..49d8b4179f5 100644 --- a/man/fi_tagged.3.md +++ b/man/fi_tagged.3.md @@ -329,8 +329,8 @@ The following flags may be used with fi_trecvmsg. fi_context structure used for an FI_PEEK + FI_CLAIM operation must be used by the paired FI_CLAIM request. - This flag also applies to endpoints configured for FI_BUFFERED_RECV or - FI_VARIABLE_MSG. When set, it is used to retrieve a tagged message that + This flag also applies to endpoints configured for FI_BUFFERED_RECV. + When set, it is used to retrieve a tagged message that was buffered by the provider. See Buffered Tagged Receives section for details. @@ -343,8 +343,8 @@ The following flags may be used with fi_trecvmsg. FI_CLAIM in order to discard a message previously claimed using an FI_PEEK + FI_CLAIM request. - This flag also applies to endpoints configured for FI_BUFFERED_RECV or - FI_VARIABLE_MSG. 
When set, it indicates that the provider should free + This flag also applies to endpoints configured for FI_BUFFERED_RECV. + When set, it indicates that the provider should free a buffered messages. See Buffered Tagged Receives section for details. If this flag is set, the input buffer(s) and length parameters are ignored. @@ -385,12 +385,6 @@ After being notified that a buffered receive has arrived, applications must either claim or discard the message as described in [`fi_msg`(3)](fi_msg.3.html). -# Variable Length Tagged Messages - -Variable length messages are defined in [`fi_msg`(3)](fi_msg.3.html). -The requirements for handling variable length tagged messages is identical -to those defined above for buffered tagged receives. - # RETURN VALUE The tagged send and receive calls return 0 on success. On error, a diff --git a/man/fi_trigger.3.md b/man/fi_trigger.3.md index 9e4e2036a25..98671803e87 100644 --- a/man/fi_trigger.3.md +++ b/man/fi_trigger.3.md @@ -103,124 +103,6 @@ struct fi_trigger_threshold { they will be triggered in the order in which they were submitted to the endpoint. -# XPU TRIGGERS - -XPU based triggers work in conjunction with heterogenous memory (FI_HMEM -capability). XPU triggers define a split execution model for specifying -a data transfer separately from initiating the transfer. Unlike completion -triggers, the user controls the timing of when the transfer starts by -writing data into a trigger variable location. - -XPU transfers allow the requesting and triggering to occur on separate -computational domains. For example, a process running on the host CPU can -setup a data transfer, with a compute kernel running on a GPU signaling -the start of the transfer. XPU refers to a CPU, GPU, FPGA, or other -acceleration device with some level of computational ability. - -Endpoints must be created with both the FI_TRIGGER and FI_XPU capabilities -to use XPU triggers. XPU triggered enabled endpoints only support XPU -triggered operations. The behavior of mixing XPU triggered operations with -normal data transfers or non-XPU triggered operations is not defined by -the API and subject to provider support and implementation. - -The use of XPU triggers requires coordination between the fabric provider, -application, and submitting XPU. The result is that hardware -implementation details need to be conveyed across the computational domains. -The XPU trigger API abstracts those details. When submitting a XPU trigger -operation, the user identifies the XPU where the triggering will -occur. The triggering XPU must match with the location of the local memory -regions. For example, if triggering will be done by a GPU kernel, the -type of GPU and its local identifier are given. As output, the fabric -provider will return a list of variables and corresponding values. -The XPU signals that the data transfer is safe to initiate by writing -the given values to the specified variable locations. The number of -variables and their sizes are provider specific. - -XPU trigger operations are submitted using the FI_TRIGGER flag with -struct fi_triggered_context or struct fi_triggered_context2, as -required by the provider. The trigger event_type is: - -*FI_TRIGGER_XPU* -: Indicates that the data transfer operation will be deferred until - the user writes provider specified data to provider indicated - memory locations. The user indicates which device will initiate - the write. The struct fi_trigger_xpu is used to convey both - input and output data regarding the signaling of the trigger. 
- -```c -struct fi_trigger_var { - enum fi_datatype datatype; - int count; - void *addr; - union { - uint8_t val8; - uint16_t val16; - uint32_t val32; - uint64_t val64; - uint8_t *data; - } value; -}; - -struct fi_trigger_xpu { - int count; - enum fi_hmem_iface iface; - union { - uint64_t reserved; - int cuda; - int ze; - } device; - struct fi_trigger_var *var; -}; -``` - -On input to a triggered operation, the iface field indicates the software -interface that will be used to write the variables. The device union -specifies the device identifier. For valid iface and device values, see -[`fi_mr`(3)](fi_mr.3.html). The iface and device must match with the -iface and device of any local HMEM memory regions. Count should be set -to the number of fi_trigger_var structures available, with the var field -pointing to an array of struct fi_trigger_var. The user is responsible for -ensuring that there are sufficient fi_trigger_var structures available and of -an appropriate size. The count and size of fi_trigger_var structures -can be obtained by calling fi_getopt() on the endpoint with the -FI_OPT_XPU_TRIGGER option. See [`fi_endpoint`(3)](fi_endpoint.3.html) -for details. - -Each fi_trigger_var structure referenced should have the datatype -and count fields initialized to the number of values referenced by the -struct fi_trigger_val. If the count is 1, one of the val fields will be used -to return the necessary data (val8, val16, etc.). If count > 1, the data -field will return all necessary data used to signal the trigger. The data -field must reference a buffer large enough to hold the returned bytes. - -On output, the provider will set the fi_trigger_xpu count to the number of -fi_trigger_var variables that must be signaled. Count will be less than or -equal to the input value. The provider will initialize each valid -fi_trigger_var entry with information needed to signal the trigger. The -datatype indicates the size of the data that must be written. Valid datatype -values are FI_UINT8, FI_UINT16, FI_UINT32, and FI_UINT64. For signal -variables <= 64 bits, the count field will be 1. If a trigger requires writing -more than 64-bits, the datatype field will be set to FI_UINT8, with count set -to the number of bytes that must be written. The data that must be written -to signal the start of an operation is returned through either the value -union val fields or data array. - -Users signal the start of a transfer by writing the returned data to the -given memory address. The write must occur from the specified input XPU -location (based on the iface and device fields). If a transfer cannot -be initiated for some reason, such as an error occurring before the -transfer can start, the triggered operation should -be canceled to release any allocated resources. If multiple variables are -specified, they must be updated in order. - -Note that the provider will not modify the fi_trigger_xpu or fi_trigger_var -structures after returning from the data transfer call. - -In order to support multiple provider implementations, users should trigger -data transfer operations in the same order that they are queued and should -serialize the writing of triggers that reference the same endpoint. Providers -may return the same trigger variable for multiple data transfer requests. 
- # DEFERRED WORK QUEUES The following feature and description are enhancements to triggered diff --git a/man/fi_usnic.7.md b/man/fi_usnic.7.md new file mode 100644 index 00000000000..88855fc35fb --- /dev/null +++ b/man/fi_usnic.7.md @@ -0,0 +1,330 @@ +--- +layout: page +title: fi_usnic(7) +tagline: Libfabric Programmer's Manual +--- +{% include JB/setup %} + +# NAME + +fi_usnic \- The usNIC Fabric Provider + +# OVERVIEW + +The *usnic* provider is designed to run over the Cisco VIC +(virtualized NIC) hardware on Cisco UCS servers. It utilizes the +Cisco usNIC (userspace NIC) capabilities of the VIC to enable ultra +low latency and other offload capabilities on Ethernet networks. + +# RELEASE NOTES + +* The *usnic* libfabric provider requires the use of the "libnl" + library. + - There are two versions of libnl generally available: v1 and v3; + the usnic provider can use either version. + - If you are building libfabric/the usnic provider from source, you + will need to have the libnl header files available (e.g., if you + are installing libnl from RPM or other packaging system, install + the "-devel" versions of the package). + - If you have libnl (either v1 or v3) installed in a non-standard + location (e.g., not in /usr/lib or /usr/lib64), you may need to + tell libfabric's configure where to find libnl via the + `--with-libnl=DIR` command line option (where DIR is the + installation prefix of the libnl package). +* The most common way to use the libfabric usnic provider is via an + MPI implementation that uses libfabric (and the usnic provider) as a + lower layer transport. MPI applications do not need to know + anything about libfabric or usnic in this use case -- the MPI + implementation hides all these details from the application. +* If you are writing applications directly to the libfabric API: + - *FI_EP_DGRAM* endpoints are the best supported method of utilizing + the usNIC interface. Specifically, the *FI_EP_DGRAM* endpoint + type has been extensively tested as the underlying layer for Open + MPI's *usnic* BTL. + - *FI_EP_MSG* and *FI_EP_RDM* endpoints are implemented, but are + only lightly tested. It is likely that there are still some bugs + in these endpoint types. In particular, there are known bugs in RDM + support in the presence of congestion or packet loss (issue 1621). + RMA is not yet supported. + - [`fi_provider`(7)](fi_provider.7.html) lists requirements for all + providers. The following limitations exist in the *usnic* + provider: + * multicast operations are not supported on *FI_EP_DGRAM* and + *FI_EP_RDM* endpoints. + * *FI_EP_MSG* endpoints only support connect, accept, and getname + CM operations. + * Passive endpoints only support listen, setname, and getname CM + operations. + * *FI_EP_DGRAM* endpoints support `fi_sendmsg()` and + `fi_recvmsg()`, but some flags are ignored. `fi_sendmsg()` + supports `FI_INJECT` and `FI_COMPLETION`. `fi_recvmsg()` + supports `FI_MORE`. + * Address vectors only support `FI_AV_MAP`. + * No counters are supported. + * The tag matching interface is not supported. + * *FI_MSG_PREFIX* is only supported on *FI_EP_DGRAM* and usage + is limited to releases 1.1 and beyond. + * fi_control with FI_GETWAIT may only be used on CQs that have been + bound to an endpoint. If fi_control is used on an unbound CQ, it will + return -FI_EOPBADSTATE. + * There is limited support for data returned as part of an erroneous + asynchronous operation. EQs will return error data for CM operations, + CQs do not support returning error data. 
+ * As of 1.5, usNIC supports fi_mr_regv, and fi_mr_regattr. Support is + limited to a single iov. + * Atomic operations are not supported. + - Resource management is not supported. The application is responsible for + resource protection. + - The usnic libfabric provider supports extensions that provide + information and functionality beyond the standard libfabric + interface. See the "USNIC EXTENSIONS" section, below. + +# USNIC EXTENSIONS + +The usnic libfabric provider exports extensions for additional VIC, +usNIC, and Ethernet capabilities not provided by the standard +libfabric interface. + +These extensions are available via the "fi_ext_usnic.h" header file. + +## Fabric Extension: getinfo + +Version 2 of the "fabric getinfo" extension was introduced in Libfabric release +v1.3.0 and can be used to retrieve IP and SR-IOV information about a usNIC +device obtained from the [`fi_getinfo`(3)](fi_getinfo.3.html) function. + +The "fabric getinfo" extension is obtained by calling `fi_open_ops` and +requesting `FI_USNIC_FABRIC_OPS_1` to get the usNIC fabric extension +operations. The `getinfo` function accepts a version parameter that can be +used to select different versions of the extension. The information returned by +the "fabric getinfo" extension is accessible through a `fi_usnic_info` struct +that uses a version tagged union. The accessed union member must correspond +with the requested version. It is recommended that applications explicitly +request a version rather than using the header provided +`FI_EXT_USNIC_INFO_VERSION`. Although there is a version 1 of the extension, +its use is discouraged, and it may not be available in future releases. + +### Compatibility issues + +The addition of version 2 of the extension caused an alignment issue that +could lead to invalid data in the v1 portion of the structure. This means that +the alignment difference manifests when an application using v1 of the +extension is compiled with Libfabric v1.1.x or v1.2.x, but then runs with +Libfabric.so that is v1.3.x or higher (and vice versa). + +The v1.4.0 release of Libfabric introduced a padding field to explicitly +maintain compatibility with the v1.3.0 release. If the issue is encountered, +then it is recommended that you upgrade to a release containing version 2 of +the extension, or recompile with a patched version of an older release. + + +```c +#include + +struct fi_usnic_info { + uint32_t ui_version; + uint8_t ui_pad0[4]; + union { + struct fi_usnic_info_v1 v1; + struct fi_usnic_info_v2 v2; + } ui; +} __attribute__((packed)); + +int getinfo(uint32_t version, struct fid_fabric *fabric, + struct fi_usnic_info *info); +``` + +*version* +: Version of getinfo to be used + +*fabric* +: Fabric descriptor + +*info* +: Upon successful return, this parameter will contain information about the +fabric. 
+ +- Version 2 + +```c +struct fi_usnic_cap { + const char *uc_capability; + int uc_present; +} __attribute__((packed)); + +struct fi_usnic_info_v2 { + uint32_t ui_link_speed; + uint32_t ui_netmask_be; + char ui_ifname[IFNAMSIZ]; + unsigned ui_num_vf; + unsigned ui_qp_per_vf; + unsigned ui_cq_per_vf; + + char ui_devname[FI_EXT_USNIC_MAX_DEVNAME]; + uint8_t ui_mac_addr[6]; + + uint8_t ui_pad0[2]; + + uint32_t ui_ipaddr_be; + uint32_t ui_prefixlen; + uint32_t ui_mtu; + uint8_t ui_link_up; + + uint8_t ui_pad1[3]; + + uint32_t ui_vendor_id; + uint32_t ui_vendor_part_id; + uint32_t ui_device_id; + char ui_firmware[64]; + + unsigned ui_intr_per_vf; + unsigned ui_max_cq; + unsigned ui_max_qp; + + unsigned ui_max_cqe; + unsigned ui_max_send_credits; + unsigned ui_max_recv_credits; + + const char *ui_nicname; + const char *ui_pid; + + struct fi_usnic_cap **ui_caps; +} __attribute__((packed)); +``` + +- Version 1 + +```c +struct fi_usnic_info_v1 { + uint32_t ui_link_speed; + uint32_t ui_netmask_be; + char ui_ifname[IFNAMSIZ]; + + uint32_t ui_num_vf; + uint32_t ui_qp_per_vf; + uint32_t ui_cq_per_vf; +} __attribute__((packed)); +``` + +Version 1 of the "fabric getinfo" extension can be used by explicitly +requesting it in the call to `getinfo` and accessing the `v1` portion of the +`fi_usnic_info.ui` union. Use of version 1 is not recommended and it may be +removed from future releases. + + +The following is an example of how to utilize version 2 of the usnic "fabric +getinfo" extension. + +```c +#include +#include + +/* The usNIC extensions are all in the + rdma/fi_ext_usnic.h header */ +#include + +int main(int argc, char *argv[]) { + struct fi_info *info; + struct fi_info *info_list; + struct fi_info hints = {0}; + struct fi_ep_attr ep_attr = {0}; + struct fi_fabric_attr fabric_attr = {0}; + + fabric_attr.prov_name = "usnic"; + ep_attr.type = FI_EP_DGRAM; + + hints.caps = FI_MSG; + hints.mode = FI_LOCAL_MR | FI_MSG_PREFIX; + hints.addr_format = FI_SOCKADDR; + hints.ep_attr = &ep_attr; + hints.fabric_attr = &fabric_attr; + + /* Find all usnic providers */ + fi_getinfo(FI_VERSION(1, 0), NULL, 0, 0, &hints, &info_list); + + for (info = info_list; NULL != info; info = info->next) { + /* Open the fabric on the interface */ + struct fid_fabric *fabric; + fi_fabric(info->fabric_attr, &fabric, NULL); + + /* Pass FI_USNIC_FABRIC_OPS_1 to get usnic ops + on the fabric */ + struct fi_usnic_ops_fabric *usnic_fabric_ops; + fi_open_ops(&fabric->fid, FI_USNIC_FABRIC_OPS_1, 0, + (void **) &usnic_fabric_ops, NULL); + + /* Now use the returned usnic ops structure to call + usnic extensions. The following extension queries + some IP and SR-IOV characteristics about the + usNIC device. */ + struct fi_usnic_info usnic_info; + + /* Explicitly request version 2. */ + usnic_fabric_ops->getinfo(2, fabric, &usnic_info); + + printf("Fabric interface %s is %s:\n" + "\tNetmask: 0x%08x\n\tLink speed: %d\n" + "\tSR-IOV VFs: %d\n\tQPs per SR-IOV VF: %d\n" + "\tCQs per SR-IOV VF: %d\n", + info->fabric_attr->name, + usnic_info.ui.v2.ui_ifname, + usnic_info.ui.v2.ui_netmask_be, + usnic_info.ui.v2.ui_link_speed, + usnic_info.ui.v2.ui_num_vf, + usnic_info.ui.v2.ui_qp_per_vf, + usnic_info.ui.v2.ui_cq_per_vf); + + fi_close(&fabric->fid); + } + + fi_freeinfo(info_list); + return 0; +} +``` + +## Adress Vector Extension: get_distance + +The "address vector get_distance" extension was introduced in Libfabric release +v1.0.0 and can be used to retrieve the network distance of an address. 
+ +The "get_distance" extension is obtained by calling `fi_open_ops` and +requesting `FI_USNIC_AV_OPS_1` to get the usNIC address vector extension +operations. + +```c +int get_distance(struct fid_av *av, void *addr, int *metric); +``` + +*av* +: Address vector + +*addr* +: Destination address + +*metric* +: On output this will contain `-1` if the destination host is unreachable, `0` +is the destination host is locally connected, and `1` otherwise. + +See fi_ext_usnic.h for more details. + +# VERSION DIFFERENCES + +## New naming convention for fabric/domain starting with libfabric v1.4 + +The release of libfabric v1.4 introduced a new naming convention for fabric and domain. However the usNIC provider +remains backward compatible with applications supporting the old scheme and decides which one to use based on +the version passed to `fi_getinfo`: + +* When `FI_VERSION(1,4)` or higher is used: + - fabric name is the network address with the CIDR notation (i.e., `a.b.c.d/e`) + - domain name is the usNIC Linux interface name (i.e., `usnic_X`) + +* When a lower version number is used, like `FI_VERSION(1, 3)`, it follows the same behavior the usNIC provider exhibited in libfabric <= v1.3: + - fabric name is the usNIC Linux interface name (i.e., `usnic_X`) + - domain name is `NULL` + +# SEE ALSO + +[`fabric`(7)](fabric.7.html), +[`fi_open_ops`(3)](fi_open_ops.3.html), +[`fi_provider`(7)](fi_provider.7.html), diff --git a/man/man1/fi_info.1 b/man/man1/fi_info.1 index 13711997be4..7ca43b6566b 100644 --- a/man/man1/fi_info.1 +++ b/man/man1/fi_info.1 @@ -1,6 +1,6 @@ .\" Automatically generated by Pandoc 2.9.2.1 .\" -.TH "fi_info" "1" "2022\-12\-09" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.TH "fi_info" "1" "2024\-04\-01" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -69,7 +69,7 @@ Filter interfaces to only those with the given fabric name. List libfabric related environment variables which can be used to enable extra configuration or tuning. .TP -*-g [filter] +\f[I]-g [filter]\f[R] Same as -e option, with output limited to environment variables containing filter as a substring. .TP diff --git a/man/man1/fi_pingpong.1 b/man/man1/fi_pingpong.1 index f924dd77e34..eced1397e6c 100644 --- a/man/man1/fi_pingpong.1 +++ b/man/man1/fi_pingpong.1 @@ -1,6 +1,6 @@ .\" Automatically generated by Pandoc 2.9.2.1 .\" -.TH "fi_pingpong" "1" "2023\-12\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.TH "fi_pingpong" "1" "2024\-04\-04" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -72,7 +72,8 @@ The server ignores this option. .SS Fabric Filtering .TP \f[I]-p \f[R] -The name of the underlying fabric provider (e.g., sockets, psm3, etc.). +The name of the underlying fabric provider (e.g., sockets, psm3, usnic, +etc.). If a provider is not specified via the -p switch, the test will pick one from the list of available providers (as returned by fi_getinfo(3)). .TP @@ -84,6 +85,10 @@ For more information on endpoint types, see fi_endpoint(3). .TP \f[I]-d \f[R] The name of the specific domain to be used. +.TP +\f[I]-s \f[R] +Address to corresponding domain. +Required in multi-adapter environment. .SS Test Options .TP \f[I]-I \f[R] @@ -114,14 +119,14 @@ Displays help output for the pingpong test. 
 .SS An example with various options
 .SS Server:
 .PP
-\f[C]server$ fi_pingpong -p tcp -I 1000 -S 1024\f[R]
+\f[C]server$ fi_pingpong -p usnic -I 1000 -S 1024\f[R]
 .SS Client:
 .PP
-\f[C]client$ fi_pingpong -p tcp -I 1000 -S 1024 192.168.0.123\f[R]
+\f[C]client$ fi_pingpong -p usnic -I 1000 -S 1024 192.168.0.123\f[R]
 .PP
 Specifically, this will run a pingpong test with:
 .IP \[bu] 2
-tcp provider
+usNIC provider
 .IP \[bu] 2
 1000 iterations
 .IP \[bu] 2
@@ -131,10 +136,10 @@ server node as 192.168.0.123
 .SS A longer test
 .SS Server:
 .PP
-\f[C]server$ fi_pingpong -p tcp -I 10000 -S all\f[R]
+\f[C]server$ fi_pingpong -p usnic -I 10000 -S all\f[R]
 .SS Client:
 .PP
-\f[C]client$ fi_pingpong -p tcp -I 10000 -S all 192.168.0.123\f[R]
+\f[C]client$ fi_pingpong -p usnic -I 10000 -S all 192.168.0.123\f[R]
 .SH DEFAULTS
 .PP
 There is no default provider; if a provider is not specified via the
diff --git a/man/man3/fi_cntr.3 b/man/man3/fi_cntr.3
index e825df7bffa..4e88579f150 100644
--- a/man/man3/fi_cntr.3
+++ b/man/man3/fi_cntr.3
@@ -1,6 +1,6 @@
 .\" Automatically generated by Pandoc 2.9.2.1
 .\"
-.TH "fi_cntr" "3" "2022\-12\-09" "Libfabric Programmer\[cq]s Manual" "#VERSION#"
+.TH "fi_cntr" "3" "2024\-03\-07" "Libfabric Programmer\[cq]s Manual" "#VERSION#"
 .hy
 .SH NAME
 .PP
@@ -124,6 +124,27 @@ and the endpoint.
 Counters increment on all successful completions, separately from
 whether the operation generates an entry in an event queue.
 .TP
+\f[I]FI_CNTR_EVENTS_BYTES\f[R]
+The counter is incremented by the number of user bytes, excluding any CQ
+data, transferred in a transport message upon reaching the specified
+completion semantic.
+For initiator side counters, the count reflects the size of the
+requested transfer and is updated after the message reaches the desired
+completion level (FI_INJECT_COMPLETE, FI_TRANSMIT_COMPLETE, etc.).
+For send and write operations, the count reflects the number of bytes
+transferred to the peer.
+For read operations, the count reflects the number of bytes returned in
+a read response.
+Operations which may both write and read data, such as atomics, behave
+as read operations at the initiator, but writes at the target.
+For target side counters, the count reflects the size of received user
+data and is incremented subject to target side completion semantics.
+In most cases, this indicates FI_DELIVERY_COMPLETE, but may differ when
+accessing device memory (HMEM).
+On error, the transfer size is not applied to the error field; that field
+is incremented by 1.
+The FI_COLLECTIVE transfer type is not supported.
+.TP
 \f[I]wait_obj\f[R]
 Counters may be associated with a specific wait object.
 Wait objects allow applications to block until the wait object is
diff --git a/man/man3/fi_cq.3 b/man/man3/fi_cq.3
index 77b192cc311..a1c36a632d4 100644
--- a/man/man3/fi_cq.3
+++ b/man/man3/fi_cq.3
@@ -1,6 +1,6 @@
 .\" Automatically generated by Pandoc 2.9.2.1
 .\"
-.TH "fi_cq" "3" "2023\-10\-31" "Libfabric Programmer\[cq]s Manual" "#VERSION#"
+.TH "fi_cq" "3" "2024\-03\-07" "Libfabric Programmer\[cq]s Manual" "#VERSION#"
 .hy
 .SH NAME
 .PP
@@ -1091,13 +1091,6 @@ any operation such as fi_cq_read, fi_cq_readfrom, fi_cq_sread,
 fi_cq_sreadfrom etc.
 can be called on it.
 .PP
-Completion flags may be suppressed if the FI_NOTIFY_FLAGS_ONLY mode bit
-has been set.
-When enabled, only the following flags are guaranteed to be set in
-completion data when they are valid: FI_REMOTE_READ and FI_REMOTE_WRITE
-(when FI_RMA_EVENT capability bit has been set), FI_REMOTE_CQ_DATA, and
-FI_MULTI_RECV.
-.PP If a completion queue has been overrun, it will be placed into an `overrun' state. Read operations will continue to return any valid, non-corrupted diff --git a/man/man3/fi_domain.3 b/man/man3/fi_domain.3 index b059de3eda1..5781d51d3fb 100644 --- a/man/man3/fi_domain.3 +++ b/man/man3/fi_domain.3 @@ -1,7 +1,7 @@ .\"t .\" Automatically generated by Pandoc 2.9.2.1 .\" -.TH "fi_domain" "3" "2023\-10\-31" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.TH "fi_domain" "3" "2024\-03\-07" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -237,27 +237,27 @@ The name of the access domain. .PP The threading model specifies the level of serialization required of an application when using the libfabric data transfer interfaces. -Control interfaces are always considered thread safe, and may be -accessed by multiple threads. +Control interfaces are always considered thread safe unless the control +progress model is FI_PROGRESS_CONTROL_UNIFIED. +A thread safe control interface allows multiple threads to progress the +control interface, and (depending on threading model selected) one or +more threads to progress the data interfaces at the same time. Applications which can guarantee serialization in their access of -provider allocated resources and interfaces enables a provider to +provider allocated resources and interfaces enable a provider to eliminate lower-level locks. .TP \f[I]FI_THREAD_COMPLETION\f[R] -The completion threading model is intended for providers that make use -of manual progress. -Applications must serialize access to all objects that are associated -through the use of having a shared completion structure. -This includes endpoint, transmit context, receive context, completion -queue, counter, wait set, and poll set objects. -.PP -For example, threads must serialize access to an endpoint and its bound -completion queue(s) and/or counters. -Access to endpoints that share the same completion queue must also be -serialized. -.PP -The use of FI_THREAD_COMPLETION can increase parallelism over -FI_THREAD_SAFE, but requires the use of isolated resources. +The completion threading model is best suited for multi-threaded +applications using scalable endpoints which desire lockless operation. +Applications must serialize access to all objects that are associated by +a common completion mechanism (for example, endpoints bound to the same +CQ or counter). +It is recommended that providers which support scalable endpoints also +support this threading model. +.PP +Applications wanting to leverage FI_THREAD_COMPLETION should allocate +transmit contexts, receive contexts, and completion queues and counters +to individual threads. .TP \f[I]FI_THREAD_DOMAIN\f[R] A domain serialization model requires applications to serialize access @@ -337,8 +337,8 @@ and receiving sides of an operation. This is often a requirement for reliable transfers, as a result of retry and acknowledgement processing. .PP -To balance between performance and ease of use, two progress models are -defined. +To balance between performance and ease of use, the following progress +models are defined. .TP \f[I]FI_PROGRESS_AUTO\f[R] This progress model indicates that the provider will make forward @@ -385,6 +385,15 @@ For example, an endpoint that acts purely as the target of RMA or atomic operations that uses manual progress may still need application assistance to process received operations. 
.TP +\f[I]FI_PROGRESS_CONTROL_UNIFIED\f[R] +This progress model indicates that the user will synchronize progressing +the data and control operations themselves (i.e.\ this allows the +control interface to NOT be thread safe). +It is only valid for control progress (not data progress). +Setting control=FI_PROGRESS_CONTROL_UNIFIED, data=FI_PROGRESS_MANUAL, +and threading=FI_THREAD_DOMAIN/FI_THREAD_COMPLETION allows Libfabric to +remove all locking in the critical data progress path. +.TP \f[I]FI_PROGRESS_UNSPEC\f[R] This value indicates that no progress model has been defined. It may be used on input hints to the fi_getinfo call. @@ -878,14 +887,6 @@ among multiple processes using the named address vector feature. .PP See \f[C]fi_getinfo\f[R](3) for a discussion on primary versus secondary capabilities. -.SS mode -.PP -The operational mode bit related to using the domain. -.TP -\f[I]FI_RESTRICTED_COMP\f[R] -This bit indicates that the domain limits completion queues and counters -to only be used with endpoints, transmit contexts, and receive contexts -that have the same set of capability flags. .SS Default authorization key (auth_key) .PP The default authorization key to associate with endpoint and memory diff --git a/man/man3/fi_endpoint.3 b/man/man3/fi_endpoint.3 index ee6c65c16ac..5b10304c79c 100644 --- a/man/man3/fi_endpoint.3 +++ b/man/man3/fi_endpoint.3 @@ -1,6 +1,6 @@ .\" Automatically generated by Pandoc 2.9.2.1 .\" -.TH "fi_endpoint" "3" "2024\-01\-23" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.TH "fi_endpoint" "3" "2024\-03\-20" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -612,25 +612,6 @@ The FI_HMEM_DISABLE_P2P environment variable discussed in .RE \[bu] .RS 2 .TP -\f[I]FI_OPT_XPU_TRIGGER - struct fi_trigger_xpu *\f[R] -This option only applies to the fi_getopt() call. -It is used to query the maximum number of variables required to support -XPU triggered operations, along with the size of each variable. -.PP -The user provides a filled out struct fi_trigger_xpu on input. -The iface and device fields should reference an HMEM domain. -If the provider does not support XPU triggered operations from the given -device, fi_getopt() will return -FI_EOPNOTSUPP. -On input, var should reference an array of struct fi_trigger_var data -structures, with count set to the size of the referenced array. -If count is 0, the var field will be ignored, and the provider will -return the number of fi_trigger_var structures needed. -If count is > 0, the provider will set count to the needed value, and -for each fi_trigger_var available, set the datatype and count of the -variable used for the trigger. -.RE -\[bu] .RS 2 -.TP \f[I]FI_OPT_CUDA_API_PERMITTED - bool *\f[R] This option only applies to the fi_setopt call. It is used to control endpoint\[cq]s behavior in making calls to CUDA @@ -736,23 +717,6 @@ Reliable datagram message. Provides a reliable, connectionless data transfer service with flow control that maintains message boundaries. .TP -\f[I]FI_EP_SOCK_DGRAM\f[R] -A connectionless, unreliable datagram endpoint with UDP socket-like -semantics. -FI_EP_SOCK_DGRAM is most useful for applications designed around using -UDP sockets. -See the SOCKET ENDPOINT section for additional details and restrictions -that apply to datagram socket endpoints. -.TP -\f[I]FI_EP_SOCK_STREAM\f[R] -Data streaming endpoint with TCP socket-like semantics. -Provides a reliable, connection-oriented data transfer service that does -not maintain message boundaries. 
-FI_EP_SOCK_STREAM is most useful for applications designed around using -TCP sockets. -See the SOCKET ENDPOINT section for additional details and restrictions -that apply to stream endpoints. -.TP \f[I]FI_EP_UNSPEC\f[R] The type of endpoint is not specified. This is usually provided as input, with other attributes of the endpoint @@ -832,6 +796,15 @@ by the shm provider Protocol for intra-node communication using shared memory segments used by the sm2 provider .TP +\f[I]FI_PROTO_CXI\f[R] +Reliable-datagram protocol optimized for HPC applications used by cxi +provider. +.TP +\f[I]FI_PROTO_CXI_RNR\f[R] +A version of the FI_PROTO_CXI protocol that implements an RNR protocol +which can be used when messaging is primarily expected and FI_ORDER_SAS +ordering is not required. +.TP \f[I]FI_PROTO_UNSPEC\f[R] The protocol is not specified. This is usually provided as input, with other attributes of the socket @@ -1392,9 +1365,8 @@ capability bits from the fi_info structure will be used. .PP The following capabilities apply to the receive attributes: FI_MSG, FI_RMA, FI_TAGGED, FI_ATOMIC, FI_REMOTE_READ, FI_REMOTE_WRITE, FI_RECV, -FI_HMEM, FI_TRIGGER, FI_RMA_PMEM, FI_DIRECTED_RECV, FI_VARIABLE_MSG, -FI_MULTI_RECV, FI_SOURCE, FI_RMA_EVENT, FI_SOURCE_ERR, FI_COLLECTIVE, -and FI_XPU. +FI_HMEM, FI_TRIGGER, FI_RMA_PMEM, FI_DIRECTED_RECV, FI_MULTI_RECV, +FI_SOURCE, FI_RMA_EVENT, FI_SOURCE_ERR, FI_COLLECTIVE, and FI_XPU. .PP Many applications will be able to ignore this field and rely solely on the fi_info::caps field. @@ -1807,7 +1779,7 @@ A resource domain was not bound to the endpoint or an attempt was made to bind multiple domains. .TP \f[I]-FI_ENOCQ\f[R] -The endpoint has not been configured with necessary event queue. +The endpoint has not been configured with necessary completion queue. .TP \f[I]-FI_EOPBADSTATE\f[R] The endpoint\[cq]s state does not permit the requested operation. 
diff --git a/man/man3/fi_errno.3 b/man/man3/fi_errno.3 index 92b8f3ebfc1..dcac687918e 100644 --- a/man/man3/fi_errno.3 +++ b/man/man3/fi_errno.3 @@ -1,6 +1,6 @@ .\" Automatically generated by Pandoc 2.9.2.1 .\" -.TH "fi_errno" "3" "2022\-12\-09" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.TH "fi_errno" "3" "2024\-03\-20" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -57,12 +57,18 @@ No space left on device \f[I]FI_ENOSYS\f[R] Function not implemented .TP +\f[I]FI_EWOULDBLOCK\f[R] +Operation would block +.TP \f[I]FI_ENOMSG\f[R] No message of desired type .TP \f[I]FI_ENODATA\f[R] No data available .TP +\f[I]FI_EOVERFLOW\f[R] +Value too large for defined data type +.TP \f[I]FI_EMSGSIZE\f[R] Message too long .TP @@ -90,6 +96,9 @@ Software caused connection abort \f[I]FI_ECONNRESET\f[R] Connection reset by peer .TP +\f[I]FI_ENOBUFS\f[R] +No buffer space available +.TP \f[I]FI_EISCONN\f[R] Transport endpoint is already connected .TP @@ -105,6 +114,9 @@ Operation timed out \f[I]FI_ECONNREFUSED\f[R] Connection refused .TP +\f[I]FI_EHOSTDOWN\f[R] +Host is down +.TP \f[I]FI_EHOSTUNREACH\f[R] No route to host .TP @@ -149,6 +161,27 @@ Invalid resource domain .TP \f[I]FI_ENOCQ\f[R] Missing or unavailable completion queue +.TP +\f[I]FI_ECRC\f[R] +CRC error +.TP +\f[I]FI_ETRUNC\f[R] +Truncation error +.TP +\f[I]FI_ENOKEY\f[R] +Required key not available +.TP +\f[I]FI_ENOAV\f[R] +Missing or unavailable address vector +.TP +\f[I]FI_EOVERRUN\f[R] +Queue has been overrun +.TP +\f[I]FI_ENORX\f[R] +Receiver not ready, no receive buffers available +.TP +\f[I]FI_ENOMR\f[R] +Memory registration limit exceeded .SH SEE ALSO .PP \f[C]fabric\f[R](7) diff --git a/man/man3/fi_getinfo.3 b/man/man3/fi_getinfo.3 index 638ec80032f..d79af2fc56d 100644 --- a/man/man3/fi_getinfo.3 +++ b/man/man3/fi_getinfo.3 @@ -1,6 +1,6 @@ .\" Automatically generated by Pandoc 2.9.2.1 .\" -.TH "fi_getinfo" "3" "2024\-01\-10" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.TH "fi_getinfo" "3" "2024\-05\-07" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -308,8 +308,7 @@ Requests that the provider support the association of a user specified identifier with each address vector (AV) address. User identifiers are returned with completion data in place of the AV address. -See \f[C]fi_domain\f[R](3) and [\f[C]fi_av\f[R](3)] (fi_av.3.html) for -more details. +See \f[C]fi_domain\f[R](3) and \f[C]fi_av\f[R](3) for more details. .TP \f[I]FI_COLLECTIVE\f[R] Requests support for collective operations. @@ -477,16 +476,6 @@ Indicates that the endpoint should support triggered operations. Endpoints support this capability must meet the usage model as described by \f[C]fi_trigger\f[R](3). .TP -\f[I]FI_VARIABLE_MSG\f[R] -Requests that the provider must notify a receiver when a variable length -message is ready to be received prior to attempting to place the data. -Such notification will include the size of the message and any -associated message tag (for FI_TAGGED). -See `Variable Length Messages' in fi_msg.3 for full details. -Variable length messages are any messages larger than an endpoint -configurable size. -This flag requires that FI_MSG and/or FI_TAGGED be set. -.TP \f[I]FI_WRITE\f[R] Indicates that the user requires an endpoint capable of initiating writes against remote memory regions. @@ -516,8 +505,8 @@ A provider may optionally report non-selected secondary capabilities if doing so would not compromise performance or security. 
.PP Primary capabilities: FI_MSG, FI_RMA, FI_TAGGED, FI_ATOMIC, -FI_MULTICAST, FI_NAMED_RX_CTX, FI_DIRECTED_RECV, FI_VARIABLE_MSG, -FI_HMEM, FI_COLLECTIVE, FI_XPU, FI_AV_USER_ID +FI_MULTICAST, FI_NAMED_RX_CTX, FI_DIRECTED_RECV, FI_HMEM, FI_COLLECTIVE, +FI_XPU, FI_AV_USER_ID .PP Primary modifiers: FI_READ, FI_WRITE, FI_RECV, FI_SEND, FI_REMOTE_READ, FI_REMOTE_WRITE @@ -653,21 +642,6 @@ For scatter-gather send/recv operations, the prefix buffer must be a contiguous region, though it may or may not be directly adjacent to the payload portion of the buffer. .TP -\f[I]FI_NOTIFY_FLAGS_ONLY\f[R] -This bit indicates that general completion flags may not be set by the -provider, and are not needed by the application. -If specified, completion flags which simply report the type of operation -that completed (e.g.\ send or receive) may not be set. -However, completion flags that are used for remote notifications will -still be set when applicable. -See \f[C]fi_cq\f[R](3) for details on which completion flags are valid -when this mode bit is enabled. -.TP -\f[I]FI_RESTRICTED_COMP\f[R] -This bit indicates that the application will only share completion -queues and counters among endpoints, transmit contexts, and receive -contexts that have the same set of capability flags. -.TP \f[I]FI_RX_CQ_DATA\f[R] This mode bit only applies to data transfers that set FI_REMOTE_CQ_DATA. When set, a data transfer that carries remote CQ data will consume a diff --git a/man/man3/fi_mr.3 b/man/man3/fi_mr.3 index b00c20117b1..cd9a479b0ad 100644 --- a/man/man3/fi_mr.3 +++ b/man/man3/fi_mr.3 @@ -1,6 +1,6 @@ .\" Automatically generated by Pandoc 2.9.2.1 .\" -.TH "fi_mr" "3" "2023\-11\-03" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.TH "fi_mr" "3" "2024\-05\-01" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -448,7 +448,7 @@ Returns the remote protection key associated with a MR. The memory registration must have completed successfully before invoking this. The returned key may be used in data transfer operations at a peer. -If the FI_RAW_MR mode bit has been set for the domain, then the memory +If the FI_MR_RAW mode bit has been set for the domain, then the memory key must be obtained using the fi_mr_raw_key function instead. A return value of FI_KEY_NOTAVAIL will be returned if the registration has not completed or a raw memory key is required. @@ -458,7 +458,7 @@ Returns the raw, remote protection key and base address associated with a MR. The memory registration must have completed successfully before invoking this routine. -Use of this call is required if the FI_RAW_MR mode bit has been set by +Use of this call is required if the FI_MR_RAW mode bit has been set by the provider; however, it is safe to use this call with any memory region. .PP @@ -483,7 +483,7 @@ operation. The mapping function takes as input the raw key and its size, and returns the mapped key. Use of the fi_mr_map_raw function is required if the peer has the -FI_RAW_MR mode bit set, but this routine may be called on any valid key. +FI_MR_RAW mode bit set, but this routine may be called on any valid key. All mapped keys must be freed by calling fi_mr_unmap_key when access to the peer memory region is no longer necessary. .SS fi_mr_unmap_key @@ -810,7 +810,7 @@ The application is responsible for transferring this key to the peer. If FI_MR_RAW mode has been set, the key must be retrieved using the fi_mr_raw_attr function. 
.PP -FI_RAW_MR allows support for providers that require more than 8-bytes +FI_MR_RAW allows support for providers that require more than 8-bytes for their protection keys or need additional setup before a key can be used for transfers. After a raw key has been retrieved, it must be exchanged with the remote @@ -875,7 +875,7 @@ fi_mr_attr structure. This flag is only usable for domains opened with FI_HMEM capability support. .TP -- \f[I]FI_AUTH_KEY\f[R] +\f[I]FI_AUTH_KEY\f[R] Only valid with domains configured with FI_AV_AUTH_KEY. When used with fi_mr_regattr, this flag denotes that the fi_mr_auth_key::src_addr field contains an authorization key fi_addr_t diff --git a/man/man3/fi_msg.3 b/man/man3/fi_msg.3 index f04b430bb3b..a8f13b7b7f6 100644 --- a/man/man3/fi_msg.3 +++ b/man/man3/fi_msg.3 @@ -1,6 +1,6 @@ .\" Automatically generated by Pandoc 2.9.2.1 .\" -.TH "fi_msg" "3" "2023\-11\-30" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.TH "fi_msg" "3" "2024\-03\-07" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -222,7 +222,7 @@ This flag is implicitly set for fi_senddata and fi_injectdata. .TP \f[I]FI_CLAIM\f[R] Applies to posted receive operations for endpoints configured for -FI_BUFFERED_RECV or FI_VARIABLE_MSG. +FI_BUFFERED_RECV. This flag is used to retrieve a message that was buffered by the provider. See the Buffered Receives section for details. @@ -236,7 +236,7 @@ this flag is ignored. .TP \f[I]FI_DISCARD\f[R] Applies to posted receive operations for endpoints configured for -FI_BUFFERED_RECV or FI_VARIABLE_MSG. +FI_BUFFERED_RECV. This flag is used to free a message that was buffered by the provider. See the Buffered Receives section for details. .TP @@ -415,31 +415,6 @@ The handling of buffered receives follows all message ordering restrictions assigned to an endpoint. For example, completions may indicate the order in which received messages arrived at the receiver based on the endpoint attributes. -.SH Variable Length Messages -.PP -Variable length messages, or simply variable messages, are transfers -where the size of the message is unknown to the receiver prior to the -message being sent. -It indicates that the recipient of a message does not know the amount of -data to expect prior to the message arriving. -It is most commonly used when the size of message transfers varies -greatly, with very large messages interspersed with much smaller -messages, making receive side message buffering difficult to manage. -Variable messages are not subject to max message length restrictions -(i.e.\ struct fi_ep_attr::max_msg_size limits), and may be up to the -maximum value of size_t (e.g.\ SIZE_MAX) in length. -.PP -Variable length messages support requests that the provider allocate and -manage the network message buffers. -As a result, the application requirements and provider behavior is -identical as those defined for supporting the FI_BUFFERED_RECV mode bit. -See the Buffered Receive section above for details. -The main difference is that buffered receives are limited by the -fi_ep_attr::max_msg_size threshold, whereas variable length messages are -not. -.PP -Support for variable messages is indicated through the FI_VARIABLE_MSG -capability bit. 
.SH NOTES .PP If an endpoint has been configured with FI_MSG_PREFIX, the application diff --git a/man/man3/fi_tagged.3 b/man/man3/fi_tagged.3 index 32392e4bb2a..037e3902bfa 100644 --- a/man/man3/fi_tagged.3 +++ b/man/man3/fi_tagged.3 @@ -1,6 +1,6 @@ .\" Automatically generated by Pandoc 2.9.2.1 .\" -.TH "fi_tagged" "3" "2023\-11\-30" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.TH "fi_tagged" "3" "2024\-03\-07" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -343,8 +343,7 @@ struct fi_recv_context in the case of buffered receives. The same fi_context structure used for an FI_PEEK + FI_CLAIM operation must be used by the paired FI_CLAIM request. .PP -This flag also applies to endpoints configured for FI_BUFFERED_RECV or -FI_VARIABLE_MSG. +This flag also applies to endpoints configured for FI_BUFFERED_RECV. When set, it is used to retrieve a tagged message that was buffered by the provider. See Buffered Tagged Receives section for details. @@ -359,8 +358,7 @@ This flag may also be used in conjunction with FI_CLAIM in order to discard a message previously claimed using an FI_PEEK + FI_CLAIM request. .PP -This flag also applies to endpoints configured for FI_BUFFERED_RECV or -FI_VARIABLE_MSG. +This flag also applies to endpoints configured for FI_BUFFERED_RECV. When set, it indicates that the provider should free a buffered messages. See Buffered Tagged Receives section for details. @@ -408,11 +406,6 @@ Other fields are set as defined in \f[C]fi_msg\f[R](3). After being notified that a buffered receive has arrived, applications must either claim or discard the message as described in \f[C]fi_msg\f[R](3). -.SH Variable Length Tagged Messages -.PP -Variable length messages are defined in \f[C]fi_msg\f[R](3). -The requirements for handling variable length tagged messages is -identical to those defined above for buffered tagged receives. .SH RETURN VALUE .PP The tagged send and receive calls return 0 on success. diff --git a/man/man3/fi_trigger.3 b/man/man3/fi_trigger.3 index dc1f38ae829..68586cd026a 100644 --- a/man/man3/fi_trigger.3 +++ b/man/man3/fi_trigger.3 @@ -1,6 +1,6 @@ .\" Automatically generated by Pandoc 2.9.2.1 .\" -.TH "fi_trigger" "3" "2022\-12\-09" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.TH "fi_trigger" "3" "2024\-03\-07" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -103,145 +103,6 @@ Threshold operations are triggered in the order of the threshold values. This is true even if the counter increments by a value greater than 1. If two triggered operations have the same threshold, they will be triggered in the order in which they were submitted to the endpoint. -.SH XPU TRIGGERS -.PP -XPU based triggers work in conjunction with heterogenous memory (FI_HMEM -capability). -XPU triggers define a split execution model for specifying a data -transfer separately from initiating the transfer. -Unlike completion triggers, the user controls the timing of when the -transfer starts by writing data into a trigger variable location. -.PP -XPU transfers allow the requesting and triggering to occur on separate -computational domains. -For example, a process running on the host CPU can setup a data -transfer, with a compute kernel running on a GPU signaling the start of -the transfer. -XPU refers to a CPU, GPU, FPGA, or other acceleration device with some -level of computational ability. -.PP -Endpoints must be created with both the FI_TRIGGER and FI_XPU -capabilities to use XPU triggers. 
-XPU triggered enabled endpoints only support XPU triggered operations. -The behavior of mixing XPU triggered operations with normal data -transfers or non-XPU triggered operations is not defined by the API and -subject to provider support and implementation. -.PP -The use of XPU triggers requires coordination between the fabric -provider, application, and submitting XPU. -The result is that hardware implementation details need to be conveyed -across the computational domains. -The XPU trigger API abstracts those details. -When submitting a XPU trigger operation, the user identifies the XPU -where the triggering will occur. -The triggering XPU must match with the location of the local memory -regions. -For example, if triggering will be done by a GPU kernel, the type of GPU -and its local identifier are given. -As output, the fabric provider will return a list of variables and -corresponding values. -The XPU signals that the data transfer is safe to initiate by writing -the given values to the specified variable locations. -The number of variables and their sizes are provider specific. -.PP -XPU trigger operations are submitted using the FI_TRIGGER flag with -struct fi_triggered_context or struct fi_triggered_context2, as required -by the provider. -The trigger event_type is: -.TP -\f[I]FI_TRIGGER_XPU\f[R] -Indicates that the data transfer operation will be deferred until the -user writes provider specified data to provider indicated memory -locations. -The user indicates which device will initiate the write. -The struct fi_trigger_xpu is used to convey both input and output data -regarding the signaling of the trigger. -.IP -.nf -\f[C] -struct fi_trigger_var { - enum fi_datatype datatype; - int count; - void *addr; - union { - uint8_t val8; - uint16_t val16; - uint32_t val32; - uint64_t val64; - uint8_t *data; - } value; -}; - -struct fi_trigger_xpu { - int count; - enum fi_hmem_iface iface; - union { - uint64_t reserved; - int cuda; - int ze; - } device; - struct fi_trigger_var *var; -}; -\f[R] -.fi -.PP -On input to a triggered operation, the iface field indicates the -software interface that will be used to write the variables. -The device union specifies the device identifier. -For valid iface and device values, see \f[C]fi_mr\f[R](3). -The iface and device must match with the iface and device of any local -HMEM memory regions. -Count should be set to the number of fi_trigger_var structures -available, with the var field pointing to an array of struct -fi_trigger_var. -The user is responsible for ensuring that there are sufficient -fi_trigger_var structures available and of an appropriate size. -The count and size of fi_trigger_var structures can be obtained by -calling fi_getopt() on the endpoint with the FI_OPT_XPU_TRIGGER option. -See \f[C]fi_endpoint\f[R](3) for details. -.PP -Each fi_trigger_var structure referenced should have the datatype and -count fields initialized to the number of values referenced by the -struct fi_trigger_val. -If the count is 1, one of the val fields will be used to return the -necessary data (val8, val16, etc.). -If count > 1, the data field will return all necessary data used to -signal the trigger. -The data field must reference a buffer large enough to hold the returned -bytes. -.PP -On output, the provider will set the fi_trigger_xpu count to the number -of fi_trigger_var variables that must be signaled. -Count will be less than or equal to the input value. 
-The provider will initialize each valid fi_trigger_var entry with -information needed to signal the trigger. -The datatype indicates the size of the data that must be written. -Valid datatype values are FI_UINT8, FI_UINT16, FI_UINT32, and FI_UINT64. -For signal variables <= 64 bits, the count field will be 1. -If a trigger requires writing more than 64-bits, the datatype field will -be set to FI_UINT8, with count set to the number of bytes that must be -written. -The data that must be written to signal the start of an operation is -returned through either the value union val fields or data array. -.PP -Users signal the start of a transfer by writing the returned data to the -given memory address. -The write must occur from the specified input XPU location (based on the -iface and device fields). -If a transfer cannot be initiated for some reason, such as an error -occurring before the transfer can start, the triggered operation should -be canceled to release any allocated resources. -If multiple variables are specified, they must be updated in order. -.PP -Note that the provider will not modify the fi_trigger_xpu or -fi_trigger_var structures after returning from the data transfer call. -.PP -In order to support multiple provider implementations, users should -trigger data transfer operations in the same order that they are queued -and should serialize the writing of triggers that reference the same -endpoint. -Providers may return the same trigger variable for multiple data -transfer requests. .SH DEFERRED WORK QUEUES .PP The following feature and description are enhancements to triggered diff --git a/man/man7/fabric.7 b/man/man7/fabric.7 index 1d867999cf5..0d8f5686769 100644 --- a/man/man7/fabric.7 +++ b/man/man7/fabric.7 @@ -1,6 +1,6 @@ .\" Automatically generated by Pandoc 2.9.2.1 .\" -.TH "fabric" "7" "2023\-10\-27" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.TH "fabric" "7" "2024\-03\-15" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .SH NAME .PP @@ -271,6 +271,8 @@ option `\[en]enable-'. For example, `\[en]enable-udp' (or `\[en]enable-udp=yes') will add the udp provider to the build. To disable the provider, `\[en]enable-udp=no' can be used. +To build the provider as a stand-alone dynamically loadable library +(i.e.\ DL provider), `\[en]enable-udp=dl' can be used. .PP Providers can also be enable or disabled at run time using the FI_PROVIDER environment variable. @@ -280,7 +282,81 @@ If the list begins with the `\[ha]' symbol, then the list will be negated. .PP Example: To enable the udp and tcp providers only, set: -FI_PROVIDER=\[lq]udp,tcp\[rq] +\f[C]FI_PROVIDER=\[dq]udp,tcp\[dq]\f[R] +.PP +When libfabric is installed, DL providers are put under the \f[I]default +provider path\f[R], which is determined by how libfabric is built and +installed. +Usually the default provider path is +\f[C]/lib/libfabric\f[R] or +\f[C]/lib64/libfabric\f[R]. +By default, libfabric tries to find DL providers in the following order: +.IP "1." 3 +Use `dlopen' to load provider libraries named +\f[C]lib-fi.so\f[R] for all providers enabled at build time. +The search path of `ld.so' is used to locate the files. +This step is skipped if libfabric is configured with the option +`\[en]enable-restricted-dl'. +.IP "2." 3 +Try to load every file under the default provider path as a DL provider. +.PP +The FI_PROVIDER_PATH variable can be used to change the location to +search for DL providers and how to resolve conflicts if multiple +providers with the same name are found. 
+Setting FI_PROVIDER_PATH to any value, even if empty, would cause step 1
+to be skipped, and may change the search directory used in step 2.
+.PP
+In the simplest form, the FI_PROVIDER_PATH variable is set to a colon
+separated list of directories.
+These directories replace the default provider path used in step 2.
+For example:
+.IP
+.nf
+\f[C]
+FI_PROVIDER_PATH=/opt/libfabric:/opt/libfabric2
+\f[R]
+.fi
+.PP
+By default, if multiple providers (including the built-in providers)
+with the same name are found, the first one with the highest version is
+active and all the others are hidden.
+This can be changed by setting the FI_PROVIDER_PATH variable to start
+with `\[at]', which forces the first one to be active regardless of the
+version.
+For example:
+.IP
+.nf
+\f[C]
+FI_PROVIDER_PATH=\[at]/opt/libfabric:/opt/libfabric2
+\f[R]
+.fi
+.PP
+The FI_PROVIDER_PATH variable can also specify preferred providers by
+supplying full paths to libraries instead of directories to search
+under.
+A preferred provider takes precedence over other providers with the same
+name.
+The specification of a preferred provider must be prefixed with `+'.
+For example:
+.IP
+.nf
+\f[C]
+FI_PROVIDER_PATH=+/opt/libfabric2/libtcp-fi.so:/opt/libfabric:+/opt/libfabric2/libudp-fi.so
+\f[R]
+.fi
+.PP
+If FI_PROVIDER_PATH is set, but no directory is supplied, the default
+provider path is used.
+Some examples:
+.IP
+.nf
+\f[C]
+FI_PROVIDER_PATH=
+FI_PROVIDER_PATH=\[at]
+FI_PROVIDER_PATH=+/opt/libfabric/libtcp-fi.so
+FI_PROVIDER_PATH=\[at]+/opt/libfabric/libtcp-fi.so
+\f[R]
+.fi
 .PP
 The fi_info utility, which is included as part of the libfabric package,
 can be used to retrieve information about which providers are available
diff --git a/man/man7/fi_cxi.7 b/man/man7/fi_cxi.7
index 14b838824cd..c0ad9d32a0d 100644
--- a/man/man7/fi_cxi.7
+++ b/man/man7/fi_cxi.7
@@ -1,7 +1,7 @@
 .\"t
 .\" Automatically generated by Pandoc 2.9.2.1
 .\"
-.TH "fi_cxi" "7" "2024\-02\-01" "Libfabric Programmer\[cq]s Manual" "#VERSION#"
+.TH "fi_cxi" "7" "2024\-03\-21" "Libfabric Programmer\[cq]s Manual" "#VERSION#"
 .hy
 .SH NAME
 .PP
@@ -269,6 +269,19 @@ application processes.
 .IP "7." 3
 Application processes select from the list of available service IDs and
 VNIs to form an authorization key to use for Endpoint allocation.
+.SS Endpoint Protocols
+.PP
+The provider supports multiple endpoint protocols.
+The default protocol is FI_PROTO_CXI and fully supports the messaging
+requirements of parallel applications.
+.PP
+The FI_PROTO_CXI_RNR endpoint protocol is an optional protocol that
+targets client/server environments where send-after-send ordering is not
+required and messaging is generally to pre-posted buffers; FI_MULTI_RECV
+is recommended.
+It utilizes a receiver-not-ready implementation where
+\f[I]FI_CXI_RNR_MAX_TIMEOUT_US\f[R] can be tuned to control the maximum
+retry duration.
 .SS Address Vectors
 .PP
 The CXI provider supports both \f[I]FI_AV_TABLE\f[R] and
@@ -514,6 +527,16 @@ Using Pinned mode avoids any overhead due to network page faults but
 requires all buffers to be backed by physical memory.
 Copy-on-write semantics are broken when using pinned memory.
 See the Fork section for more information.
+.PP
+The CXI provider supports DMABUF for device memory registration.
+If the ROCR and CUDA libraries support it, the CXI provider will default
+to using DMA-buf.
+There may be situations with CUDA that double the BAR consumption.
+Until this is fixed in the CUDA stack, the environment variable +\f[I]FI_CXI_DISABLE_DMABUF_CUDA\f[R] can be used to fall back to the +nvidia peer-memory interface. +Also, \f[I]FI_CXI_DISABLE_DMABUF_ROCR\f[R] can be used to fall back to +the amdgpu peer-memory interface. .SS Translation Cache .PP Mapping a buffer for use by the NIC is an expensive operation. @@ -1300,6 +1323,13 @@ queue becomes empty. Default VNI value used only for service IDs where the VNI is not restricted. .TP +\f[I]FI_CXI_RNR_MAX_TIMEOUT_US\f[R] +When using the endpoint FI_PROTO_CXI_RNR protocol, this setting is used +to control the maximum time from the original posting of the message +that the message should be retried. +A value of 0 will return an error completion on the first RNR ack +status. +.TP \f[I]FI_CXI_EQ_ACK_BATCH_SIZE\f[R] Number of EQ events to process before writing an acknowledgement to HW. Batching ACKs amortizes the cost of event acknowledgement over multiple diff --git a/man/man7/fi_mlx.7 b/man/man7/fi_mlx.7 deleted file mode 100644 index b87ae42baec..00000000000 --- a/man/man7/fi_mlx.7 +++ /dev/null @@ -1,16 +0,0 @@ -.\" Automatically generated by Pandoc 2.9.2.1 -.\" -.TH "fi_mlx" "7" "2022\-12\-09" "Libfabric Programmer\[cq]s Manual" "#VERSION#" -.hy -.SH NAME -.PP -fi_mlx - The MLX Fabric Provider -.SH OVERVIEW -.PP -The mlx provider was deprecated and removed in libfabric 1.9 due to a -lack of a maintainer. -.SH SEE ALSO -.PP -\f[C]fabric\f[R](7), \f[C]fi_provider\f[R](7), -.SH AUTHORS -OpenFabrics. diff --git a/man/man7/fi_netdir.7 b/man/man7/fi_netdir.7 deleted file mode 100644 index d442436c2af..00000000000 --- a/man/man7/fi_netdir.7 +++ /dev/null @@ -1,112 +0,0 @@ -.\" Automatically generated by Pandoc 2.9.2.1 -.\" -.TH "fi_netdir" "7" "2022\-12\-09" "Libfabric Programmer\[cq]s Manual" "#VERSION#" -.hy -.SH NAME -.PP -fi_netdir - The Network Direct Fabric Provider -.SH OVERVIEW -.PP -The Network Direct provider enables applications using OFI to be run -over any verbs hardware (Infiniband, iWarp and etc). -It uses the Microsoft Network Direct SPI for network transport and -provides a translation of OFI calls to appropriate Network Direct API -calls. -The Network Direct providers allows to OFI-based applications utilize -zero-copy data transfers between applications, kernel-bypass I/O -generation and one-sided data transfer operations on Microsoft Windows -OS. -An application is able to use OFI with Network Direct provider enabled -on Windows OS to expose the capabilities of the networking devices if -the hardware vendors of the devices implemented the Network Direct -service provider interface (SPI) for their hardware. -.SH SUPPORTED FEATURES -.PP -The Network Direct provider support the following features defined for -the libfabric API: -.TP -\f[I]Endpoint types\f[R] -The provider support the FI_EP_MSG endpoint types. -.TP -\f[I]Memory registration modes\f[R] -The provider implements the \f[I]FI_MR_BASIC\f[R] memory registration -mode. -.TP -\f[I]Data transfer operations\f[R] -The following data transfer interfaces are supported for the following -endpoint types: \f[I]FI_MSG\f[R], \f[I]FI_RMA\f[R]. -See DATA TRANSFER OPERATIONS below for more details. -.TP -\f[I]Modes\f[R] -The Network Direct provider requires applications to support the -following modes: * FI_LOCAL_MR for all applications. 
-.TP -\f[I]Addressing Formats\f[R] -Supported addressing formats include FI_SOCKADDR, FI_SOCKADDR_IN, -FI_SOCKADDR_IN6 -.TP -\f[I]Progress\f[R] -The Network Direct provider supports FI_PROGRESS_AUTO: Asynchronous -operations make forward progress automatically. -.TP -\f[I]Operation flags\f[R] -The provider supports FI_INJECT, FI_COMPLETION, FI_TRANSMIT_COMPLETE, -FI_INJECT_COMPLETE, FI_DELIVERY_COMPLETE, FI_SELECTIVE_COMPLETION -.TP -\f[I]Completion ordering\f[R] -RX/TX contexts: FI_ORDER_STRICT -.TP -\f[I]Other supported features\f[R] -Multiple input/output vector (IOV) is supported for FI_RMA read/write -and FI_MSG receive/transmit operations. -.SH LIMITATIONS -.TP -\f[I]Memory Regions\f[R] -Only FI_MR_BASIC mode is supported. -Adding regions via s/g list is supported only up to a s/g list size of -1. -No support for binding memory regions to a counter. -.TP -\f[I]Wait objects\f[R] -Wait object and wait sets are not supported. -.TP -\f[I]Resource Management\f[R] -Application has to make sure CQs are not overrun as this cannot be -detected by the provider. -.TP -\f[I]Unsupported Endpoint types\f[R] -FI_EP_DGRAM, FI_EP_RDM -.TP -\f[I]Other unsupported features\f[R] -Scalable endpoints, FABRIC_DIRECT -.TP -\f[I]Unsupported features specific to MSG endpoints\f[R] -FI_SOURCE, FI_TAGGED, FI_CLAIM, fi_ep_alias, shared TX context, -operations. -.SH RUNTIME PARAMETERS -.PP -The Network Direct provider checks for the following environment -variables. -.SS Variables specific to RDM endpoints -.TP -\f[I]FI_NETDIR_INLINETHR\f[R] -The size of the (default: 8 Kbyte): * Transmitted data that can be -inlined * Preposted data for the unexpected receive queue -.TP -\f[I]FI_NETDIR_PREPOSTCNT\f[R] -The number of pre-registered buffers between the endpoints that are not -require internal ACK messages, must be a power of 2 (default: 8). -.TP -\f[I]FI_NETDIR_PREPOSTBUFCNT\f[R] -The number of preposted arrays of buffers, must be a power of 2 -(default: 1). -.SS Environment variables notes -.PP -The fi_info utility would give the up-to-date information on environment -variables: fi_info -p netdir -e -.SH SEE ALSO -.PP -\f[C]fabric\f[R](7), \f[C]fi_open_ops\f[R](3), \f[C]fi_provider\f[R](7), -\f[C]fi_getinfo\f[R](3) \f[C]fi_atomic\f[R](3) -.SH AUTHORS -OpenFabrics. diff --git a/man/man7/fi_opx.7 b/man/man7/fi_opx.7 index f71561ba23b..5344fd65730 100644 --- a/man/man7/fi_opx.7 +++ b/man/man7/fi_opx.7 @@ -1,6 +1,6 @@ .\" Automatically generated by Pandoc 2.9.2.1 .\" -.TH "fi_opx" "7" "2023\-12\-16" "Libfabric Programmer\[cq]s Manual" "#VERSION#" +.TH "fi_opx" "7" "2024\-03\-14" "Libfabric Programmer\[cq]s Manual" "#VERSION#" .hy .PP {%include JB/setup %} @@ -230,7 +230,15 @@ default HFI selection logic. .TP \f[I]FI_OPX_DELIVERY_COMPLETION_THRESHOLD\f[R] Integer. -The minimum message length in bytes to force delivery completion. +Will be deprecated. +Please use FI_OPX_SDMA_BOUNCE_BUF_THRESHOLD. +.TP +\f[I]FI_OPX_SDMA_BOUNCE_BUF_THRESHOLD\f[R] +Integer. +The maximum message length in bytes that will be copied to the SDMA +bounce buffer. +For messages larger than this threshold, the send will not be completed +until receiver has ACKed. Value must be between 16385 and 2147483646. Defaults to 16385. 
 .TP
diff --git a/man/man7/fi_provider.7 b/man/man7/fi_provider.7
index 1295a557518..fb5f2541d83 100644
--- a/man/man7/fi_provider.7
+++ b/man/man7/fi_provider.7
@@ -1,6 +1,6 @@
 .\" Automatically generated by Pandoc 2.9.2.1
 .\"
-.TH "fi_provider" "7" "2023\-12\-17" "Libfabric Programmer\[cq]s Manual" "#VERSION#"
+.TH "fi_provider" "7" "2024\-03\-18" "Libfabric Programmer\[cq]s Manual" "#VERSION#"
 .hy
 .SH NAME
 .PP
@@ -72,6 +72,10 @@ That is, necessary libraries are installed, operating system support is
 available, etc.
 This list is not exhaustive.
 .TP
+\f[I]CXI\f[R]
+Provider for Cray\[cq]s Slingshot network.
+See \f[C]fi_cxi\f[R](7) for more information.
+.TP
 \f[I]EFA\f[R]
 A provider for the Amazon EC2 Elastic Fabric Adapter (EFA)
 (https://aws.amazon.com/hpc/efa/), a custom-built OS bypass
diff --git a/man/man7/fi_rxm.7 b/man/man7/fi_rxm.7
index 0f97754b937..037fc74d926 100644
--- a/man/man7/fi_rxm.7
+++ b/man/man7/fi_rxm.7
@@ -1,6 +1,6 @@
 .\" Automatically generated by Pandoc 2.9.2.1
 .\"
-.TH "fi_rxm" "7" "2024\-01\-08" "Libfabric Programmer\[cq]s Manual" "#VERSION#"
+.TH "fi_rxm" "7" "2024\-03\-13" "Libfabric Programmer\[cq]s Manual" "#VERSION#"
 .hy
 .SH NAME
 .PP
@@ -184,6 +184,14 @@ but may increase connection setup time (default: 10000)
 Defines the maximum number of message provider CQ entries that can be
 consecutively read across progress calls without checking to see if the
 CM progress interval has been reached (default: 128)
+.TP
+\f[I]FI_OFI_RXM_DETECT_HMEM_IFACE\f[R]
+Set this to 1 to allow automatic detection of HMEM iface of user buffers
+when such information is not supplied.
+This feature allows such buffers to be copied or registered (e.g.\ in
+Rendezvous) internally by RxM.
+Note that no extra memory registration is performed with this option.
+(default: false)
 .SH Tuning
 .SS Bandwidth
 .PP
diff --git a/man/man7/fi_setup.7 b/man/man7/fi_setup.7
index faa80c4916e..47fab7e874b 100644
--- a/man/man7/fi_setup.7
+++ b/man/man7/fi_setup.7
@@ -1,6 +1,6 @@
 .\" Automatically generated by Pandoc 2.9.2.1
 .\"
-.TH "fi_setup" "7" "2023\-01\-02" "Libfabric Programmer\[cq]s Manual" "#VERSION#"
+.TH "fi_setup" "7" "2024\-03\-20" "Libfabric Programmer\[cq]s Manual" "#VERSION#"
 .hy
 .SH NAME
 .PP
@@ -1118,8 +1118,12 @@ struct fi_cq_err_entry {
 /* Sample error handling */
 struct fi_cq_msg_entry entry;
 struct fi_cq_err_entry err_entry;
+char err_data[256];
 int ret;
 
+err_entry.err_data = err_data;
+err_entry.err_data_size = 256;
+
 ret = fi_cq_read(cq, &entry, 1);
 if (ret == -FI_EAVAIL)
     ret = fi_cq_readerr(cq, &err_entry, 0);
diff --git a/man/man7/fi_usnic.7 b/man/man7/fi_usnic.7
new file mode 100644
index 00000000000..cf03f28a0f7
--- /dev/null
+++ b/man/man7/fi_usnic.7
@@ -0,0 +1,382 @@
+.\" Automatically generated by Pandoc 2.9.2.1
+.\"
+.TH "fi_usnic" "7" "2022\-12\-09" "Libfabric Programmer\[cq]s Manual" "#VERSION#"
+.hy
+.SH NAME
+.PP
+fi_usnic - The usNIC Fabric Provider
+.SH OVERVIEW
+.PP
+The \f[I]usnic\f[R] provider is designed to run over the Cisco VIC
+(virtualized NIC) hardware on Cisco UCS servers.
+It utilizes the Cisco usNIC (userspace NIC) capabilities of the VIC to
+enable ultra low latency and other offload capabilities on Ethernet
+networks.
+.SH RELEASE NOTES
+.IP \[bu] 2
+The \f[I]usnic\f[R] libfabric provider requires the use of the
+\[lq]libnl\[rq] library.
+.RS 2
+.IP \[bu] 2
+There are two versions of libnl generally available: v1 and v3; the
+usnic provider can use either version.
+.IP \[bu] 2 +If you are building libfabric/the usnic provider from source, you will +need to have the libnl header files available (e.g., if you are +installing libnl from RPM or other packaging system, install the +\[lq]-devel\[rq] versions of the package). +.IP \[bu] 2 +If you have libnl (either v1 or v3) installed in a non-standard location +(e.g., not in /usr/lib or /usr/lib64), you may need to tell +libfabric\[cq]s configure where to find libnl via the +\f[C]--with-libnl=DIR\f[R] command line option (where DIR is the +installation prefix of the libnl package). +.RE +.IP \[bu] 2 +The most common way to use the libfabric usnic provider is via an MPI +implementation that uses libfabric (and the usnic provider) as a lower +layer transport. +MPI applications do not need to know anything about libfabric or usnic +in this use case \[en] the MPI implementation hides all these details +from the application. +.IP \[bu] 2 +If you are writing applications directly to the libfabric API: +.RS 2 +.IP \[bu] 2 +\f[I]FI_EP_DGRAM\f[R] endpoints are the best supported method of +utilizing the usNIC interface. +Specifically, the \f[I]FI_EP_DGRAM\f[R] endpoint type has been +extensively tested as the underlying layer for Open MPI\[cq]s +\f[I]usnic\f[R] BTL. +.IP \[bu] 2 +\f[I]FI_EP_MSG\f[R] and \f[I]FI_EP_RDM\f[R] endpoints are implemented, +but are only lightly tested. +It is likely that there are still some bugs in these endpoint types. +In particular, there are known bugs in RDM support in the presence of +congestion or packet loss (issue 1621). +RMA is not yet supported. +.IP \[bu] 2 +\f[C]fi_provider\f[R](7) lists requirements for all providers. +The following limitations exist in the \f[I]usnic\f[R] provider: +.RS 2 +.IP \[bu] 2 +multicast operations are not supported on \f[I]FI_EP_DGRAM\f[R] and +\f[I]FI_EP_RDM\f[R] endpoints. +.IP \[bu] 2 +\f[I]FI_EP_MSG\f[R] endpoints only support connect, accept, and getname +CM operations. +.IP \[bu] 2 +Passive endpoints only support listen, setname, and getname CM +operations. +.IP \[bu] 2 +\f[I]FI_EP_DGRAM\f[R] endpoints support \f[C]fi_sendmsg()\f[R] and +\f[C]fi_recvmsg()\f[R], but some flags are ignored. +\f[C]fi_sendmsg()\f[R] supports \f[C]FI_INJECT\f[R] and +\f[C]FI_COMPLETION\f[R]. +\f[C]fi_recvmsg()\f[R] supports \f[C]FI_MORE\f[R]. +.IP \[bu] 2 +Address vectors only support \f[C]FI_AV_MAP\f[R]. +.IP \[bu] 2 +No counters are supported. +.IP \[bu] 2 +The tag matching interface is not supported. +.IP \[bu] 2 +\f[I]FI_MSG_PREFIX\f[R] is only supported on \f[I]FI_EP_DGRAM\f[R] and +usage is limited to releases 1.1 and beyond. +.IP \[bu] 2 +fi_control with FI_GETWAIT may only be used on CQs that have been bound +to an endpoint. +If fi_control is used on an unbound CQ, it will return -FI_EOPBADSTATE. +.IP \[bu] 2 +There is limited support for data returned as part of an erroneous +asynchronous operation. +EQs will return error data for CM operations, CQs do not support +returning error data. +.IP \[bu] 2 +As of 1.5, usNIC supports fi_mr_regv, and fi_mr_regattr. +Support is limited to a single iov. +.IP \[bu] 2 +Atomic operations are not supported. +.RE +.IP \[bu] 2 +Resource management is not supported. +The application is responsible for resource protection. +.IP \[bu] 2 +The usnic libfabric provider supports extensions that provide +information and functionality beyond the standard libfabric interface. +See the \[lq]USNIC EXTENSIONS\[rq] section, below. 
+.RE
+.SH USNIC EXTENSIONS
+.PP
+The usnic libfabric provider exports extensions for additional VIC,
+usNIC, and Ethernet capabilities not provided by the standard libfabric
+interface.
+.PP
+These extensions are available via the \[lq]fi_ext_usnic.h\[rq] header
+file.
+.SS Fabric Extension: getinfo
+.PP
+Version 2 of the \[lq]fabric getinfo\[rq] extension was introduced in
+Libfabric release v1.3.0 and can be used to retrieve IP and SR-IOV
+information about a usNIC device obtained from the
+\f[C]fi_getinfo\f[R](3) function.
+.PP
+The \[lq]fabric getinfo\[rq] extension is obtained by calling
+\f[C]fi_open_ops\f[R] and requesting \f[C]FI_USNIC_FABRIC_OPS_1\f[R] to
+get the usNIC fabric extension operations.
+The \f[C]getinfo\f[R] function accepts a version parameter that can be
+used to select different versions of the extension.
+The information returned by the \[lq]fabric getinfo\[rq] extension is
+accessible through a \f[C]fi_usnic_info\f[R] struct that uses a version
+tagged union.
+The accessed union member must correspond with the requested version.
+It is recommended that applications explicitly request a version rather
+than using the header provided \f[C]FI_EXT_USNIC_INFO_VERSION\f[R].
+Although there is a version 1 of the extension, its use is discouraged,
+and it may not be available in future releases.
+.SS Compatibility issues
+.PP
+The addition of version 2 of the extension caused an alignment issue
+that could lead to invalid data in the v1 portion of the structure.
+This means that the alignment difference manifests when an application
+using v1 of the extension is compiled with Libfabric v1.1.x or v1.2.x,
+but then runs with Libfabric.so that is v1.3.x or higher (and vice
+versa).
+.PP
+The v1.4.0 release of Libfabric introduced a padding field to explicitly
+maintain compatibility with the v1.3.0 release.
+If the issue is encountered, then it is recommended that you upgrade to
+a release containing version 2 of the extension, or recompile with a
+patched version of an older release.
+.IP
+.nf
+\f[C]
+#include <rdma/fi_ext_usnic.h>
+
+struct fi_usnic_info {
+    uint32_t ui_version;
+    uint8_t ui_pad0[4];
+    union {
+        struct fi_usnic_info_v1 v1;
+        struct fi_usnic_info_v2 v2;
+    } ui;
+} __attribute__((packed));
+
+int getinfo(uint32_t version, struct fid_fabric *fabric,
+        struct fi_usnic_info *info);
+\f[R]
+.fi
+.TP
+\f[I]version\f[R]
+Version of getinfo to be used
+.TP
+\f[I]fabric\f[R]
+Fabric descriptor
+.TP
+\f[I]info\f[R]
+Upon successful return, this parameter will contain information about
+the fabric.
+.IP \[bu] 2
+Version 2
+.IP
+.nf
+\f[C]
+struct fi_usnic_cap {
+    const char *uc_capability;
+    int uc_present;
+} __attribute__((packed));
+
+struct fi_usnic_info_v2 {
+    uint32_t ui_link_speed;
+    uint32_t ui_netmask_be;
+    char ui_ifname[IFNAMSIZ];
+    unsigned ui_num_vf;
+    unsigned ui_qp_per_vf;
+    unsigned ui_cq_per_vf;
+
+    char ui_devname[FI_EXT_USNIC_MAX_DEVNAME];
+    uint8_t ui_mac_addr[6];
+
+    uint8_t ui_pad0[2];
+
+    uint32_t ui_ipaddr_be;
+    uint32_t ui_prefixlen;
+    uint32_t ui_mtu;
+    uint8_t ui_link_up;
+
+    uint8_t ui_pad1[3];
+
+    uint32_t ui_vendor_id;
+    uint32_t ui_vendor_part_id;
+    uint32_t ui_device_id;
+    char ui_firmware[64];
+
+    unsigned ui_intr_per_vf;
+    unsigned ui_max_cq;
+    unsigned ui_max_qp;
+
+    unsigned ui_max_cqe;
+    unsigned ui_max_send_credits;
+    unsigned ui_max_recv_credits;
+
+    const char *ui_nicname;
+    const char *ui_pid;
+
+    struct fi_usnic_cap **ui_caps;
+} __attribute__((packed));
+\f[R]
+.fi
+.IP \[bu] 2
+Version 1
+.IP
+.nf
+\f[C]
+struct fi_usnic_info_v1 {
+    uint32_t ui_link_speed;
+    uint32_t ui_netmask_be;
+    char ui_ifname[IFNAMSIZ];
+
+    uint32_t ui_num_vf;
+    uint32_t ui_qp_per_vf;
+    uint32_t ui_cq_per_vf;
+} __attribute__((packed));
+\f[R]
+.fi
+.PP
+Version 1 of the \[lq]fabric getinfo\[rq] extension can be used by
+explicitly requesting it in the call to \f[C]getinfo\f[R] and accessing
+the \f[C]v1\f[R] portion of the \f[C]fi_usnic_info.ui\f[R] union.
+Use of version 1 is not recommended and it may be removed from future
+releases.
+.PP
+The following is an example of how to utilize version 2 of the usnic
+\[lq]fabric getinfo\[rq] extension.
+.IP
+.nf
+\f[C]
+#include <stdio.h>
+#include <rdma/fabric.h>
+
+/* The usNIC extensions are all in the
+   rdma/fi_ext_usnic.h header */
+#include <rdma/fi_ext_usnic.h>
+
+int main(int argc, char *argv[]) {
+    struct fi_info *info;
+    struct fi_info *info_list;
+    struct fi_info hints = {0};
+    struct fi_ep_attr ep_attr = {0};
+    struct fi_fabric_attr fabric_attr = {0};
+
+    fabric_attr.prov_name = \[dq]usnic\[dq];
+    ep_attr.type = FI_EP_DGRAM;
+
+    hints.caps = FI_MSG;
+    hints.mode = FI_LOCAL_MR | FI_MSG_PREFIX;
+    hints.addr_format = FI_SOCKADDR;
+    hints.ep_attr = &ep_attr;
+    hints.fabric_attr = &fabric_attr;
+
+    /* Find all usnic providers */
+    fi_getinfo(FI_VERSION(1, 0), NULL, 0, 0, &hints, &info_list);
+
+    for (info = info_list; NULL != info; info = info->next) {
+        /* Open the fabric on the interface */
+        struct fid_fabric *fabric;
+        fi_fabric(info->fabric_attr, &fabric, NULL);
+
+        /* Pass FI_USNIC_FABRIC_OPS_1 to get usnic ops
+           on the fabric */
+        struct fi_usnic_ops_fabric *usnic_fabric_ops;
+        fi_open_ops(&fabric->fid, FI_USNIC_FABRIC_OPS_1, 0,
+                (void **) &usnic_fabric_ops, NULL);
+
+        /* Now use the returned usnic ops structure to call
+           usnic extensions.  The following extension queries
+           some IP and SR-IOV characteristics about the
+           usNIC device. */
+        struct fi_usnic_info usnic_info;
+
+        /* Explicitly request version 2. */
+        usnic_fabric_ops->getinfo(2, fabric, &usnic_info);
+
+        printf(\[dq]Fabric interface %s is %s:\[rs]n\[dq]
+               \[dq]\[rs]tNetmask: 0x%08x\[rs]n\[rs]tLink speed: %d\[rs]n\[dq]
+               \[dq]\[rs]tSR-IOV VFs: %d\[rs]n\[rs]tQPs per SR-IOV VF: %d\[rs]n\[dq]
+               \[dq]\[rs]tCQs per SR-IOV VF: %d\[rs]n\[dq],
+               info->fabric_attr->name,
+               usnic_info.ui.v2.ui_ifname,
+               usnic_info.ui.v2.ui_netmask_be,
+               usnic_info.ui.v2.ui_link_speed,
+               usnic_info.ui.v2.ui_num_vf,
+               usnic_info.ui.v2.ui_qp_per_vf,
+               usnic_info.ui.v2.ui_cq_per_vf);
+
+        fi_close(&fabric->fid);
+    }
+
+    fi_freeinfo(info_list);
+    return 0;
+}
+\f[R]
+.fi
+.SS Address Vector Extension: get_distance
+.PP
+The \[lq]address vector get_distance\[rq] extension was introduced in
+Libfabric release v1.0.0 and can be used to retrieve the network
+distance of an address.
+.PP
+The \[lq]get_distance\[rq] extension is obtained by calling
+\f[C]fi_open_ops\f[R] and requesting \f[C]FI_USNIC_AV_OPS_1\f[R] to get
+the usNIC address vector extension operations.
+.IP
+.nf
+\f[C]
+int get_distance(struct fid_av *av, void *addr, int *metric);
+\f[R]
+.fi
+.TP
+\f[I]av\f[R]
+Address vector
+.TP
+\f[I]addr\f[R]
+Destination address
+.TP
+\f[I]metric\f[R]
+On output this will contain \f[C]-1\f[R] if the destination host is
+unreachable, \f[C]0\f[R] if the destination host is locally connected,
+and \f[C]1\f[R] otherwise.
+.PP
+See fi_ext_usnic.h for more details.
+.SH VERSION DIFFERENCES
+.SS New naming convention for fabric/domain starting with libfabric v1.4
+.PP
+The release of libfabric v1.4 introduced a new naming convention for
+fabric and domain.
+However, the usNIC provider remains backward compatible with applications
+supporting the old scheme and decides which one to use based on the
+version passed to \f[C]fi_getinfo\f[R]:
+.IP \[bu] 2
+When \f[C]FI_VERSION(1,4)\f[R] or higher is used:
+.RS 2
+.IP \[bu] 2
+fabric name is the network address with the CIDR notation (i.e.,
+\f[C]a.b.c.d/e\f[R])
+.IP \[bu] 2
+domain name is the usNIC Linux interface name (i.e., \f[C]usnic_X\f[R])
+.RE
+.IP \[bu] 2
+When a lower version number is used, like \f[C]FI_VERSION(1, 3)\f[R], it
+follows the same behavior the usNIC provider exhibited in libfabric <=
+v1.3:
+.RS 2
+.IP \[bu] 2
+fabric name is the usNIC Linux interface name (i.e., \f[C]usnic_X\f[R])
+.IP \[bu] 2
+domain name is \f[C]NULL\f[R]
+.RE
+.SH SEE ALSO
+.PP
+\f[C]fabric\f[R](7), \f[C]fi_open_ops\f[R](3), \f[C]fi_provider\f[R](7),
+.SH AUTHORS
+OpenFabrics.
diff --git a/prov/coll/src/coll_coll.c b/prov/coll/src/coll_coll.c index 158f7f47eaf..63fa5b5d7b7 100644 --- a/prov/coll/src/coll_coll.c +++ b/prov/coll/src/coll_coll.c @@ -931,8 +931,10 @@ int coll_join_collective(struct fid_ep *ep, const void *addr, av_set = container_of(set, struct util_av_set, av_set_fid); if (coll_addr == FI_ADDR_NOTAVAIL) { + ofi_mutex_lock(&av_set->av->lock); assert(av_set->av->av_set); coll_mc = &av_set->av->av_set->coll_mc; + ofi_mutex_unlock(&av_set->av->lock); } else { coll_mc = (struct util_coll_mc*) ((uintptr_t) coll_addr); } diff --git a/prov/cxi/Makefile.include b/prov/cxi/Makefile.include index b2619dd2ec6..b529f942ce7 100644 --- a/prov/cxi/Makefile.include +++ b/prov/cxi/Makefile.include @@ -26,6 +26,8 @@ _cxi_files = \ prov/cxi/src/cxip_rma.c \ prov/cxi/src/cxip_mr.c \ prov/cxi/src/cxip_msg.c \ + prov/cxi/src/cxip_msg_rnr.c \ + prov/cxi/src/cxip_msg_hpc.c \ prov/cxi/src/cxip_atomic.c \ prov/cxi/src/cxip_iomm.c \ prov/cxi/src/cxip_faults.c \ @@ -33,7 +35,7 @@ _cxi_files = \ prov/cxi/src/cxip_ctrl.c \ prov/cxi/src/cxip_req_buf.c \ prov/cxi/src/cxip_rdzv_pte.c \ - prov/cxi/src/cxip_trace.c \ + prov/cxi/src/cxip_coll_trace.c \ prov/cxi/src/cxip_telemetry.c \ prov/cxi/src/cxip_ptelist_buf.c \ prov/cxi/src/cxip_evtq.c \ diff --git a/prov/cxi/configure.m4 b/prov/cxi/configure.m4 index 1a3b0b835c2..50f0eca5236 100644 --- a/prov/cxi/configure.m4 +++ b/prov/cxi/configure.m4 @@ -123,7 +123,6 @@ AC_DEFUN([FI_CXI_CONFIGURE],[ cxitest_LDFLAGS="-L$with_criterion/lib64 -Wl,-rpath=$(realpath $with_criterion/lib64)" cxitest_LIBS="-lcriterion" have_criterion=true]) - AM_CONDITIONAL([HAVE_CRITERION], [test "x$have_criterion" = "xtrue"]) AS_IF([test "$have_ze" = "1" && test "$with_ze" != "" && test x"$with_ze" != x"yes"], [cxitest_CPPFLAGS="$cxitest_CPPFLAGS -I$with_ze/include" @@ -141,5 +140,6 @@ AC_DEFUN([FI_CXI_CONFIGURE],[ ], [cxi_happy=0]) + AM_CONDITIONAL([HAVE_CRITERION], [test "x$have_criterion" = "xtrue"]) AS_IF([test $cxi_happy -eq 1], [$1], [$2]) ]) diff --git a/prov/cxi/include/cxip.h b/prov/cxi/include/cxip.h index 0a441e3bc2c..a7f8d4486dd 100644 --- a/prov/cxi/include/cxip.h +++ b/prov/cxi/include/cxip.h @@ -172,7 +172,7 @@ #define CXIP_MINOR_VERSION 1 #define CXIP_PROV_VERSION FI_VERSION(CXIP_MAJOR_VERSION, \ CXIP_MINOR_VERSION) -#define CXIP_FI_VERSION FI_VERSION(1, 20) +#define CXIP_FI_VERSION FI_VERSION(1, 21) #define CXIP_WIRE_PROTO_VERSION 1 #define CXIP_COLL_MAX_CONCUR 8 @@ -241,18 +241,22 @@ struct cxip_environment { int force_odp; int ats; int iotlb; + int disable_dmabuf_cuda; + int disable_dmabuf_rocr; enum cxip_ats_mlock_mode ats_mlock_mode; /* Messaging */ int fork_safe_requested; enum cxip_ep_ptle_mode rx_match_mode; int msg_offload; + int trunc_ok; int hybrid_preemptive; int hybrid_recv_preemptive; size_t rdzv_threshold; size_t rdzv_get_min; size_t rdzv_eager_size; int rdzv_aligned_sw_rget; + int rnr_max_timeout_us; int disable_non_inject_msg_idc; int disable_host_register; size_t oflow_buf_size; @@ -336,6 +340,7 @@ struct cxip_addr { }; #define CXIP_ADDR_EQUAL(a, b) ((a).nic == (b).nic && (a).pid == (b).pid) +#define CXIP_ADDR_VNI_EQUAL(a, b) (CXIP_ADDR_EQUAL(a, b) && (a).vni == (b).vni) /* * A PID contains "pid_granule" logical endpoints. The PID granule is set per @@ -369,6 +374,7 @@ struct cxip_addr { * be within the logical endpoint range 128 - 255. 
*/ #define CXIP_PTL_IDX_RXQ 0 +#define CXIP_PTL_IDX_RNR_RXQ 1 #define CXIP_PTL_IDX_WRITE_MR_OPT_BASE 17 #define CXIP_PTL_IDX_READ_MR_OPT_BASE 128 #define CXIP_PTL_IDX_MR_OPT_CNT 100 @@ -539,6 +545,10 @@ bool cxip_generic_is_valid_mr_key(uint64_t key); CXIP_RDZV_ID_HIGH_WIDTH) #define CXIP_TAG_MASK ((1UL << CXIP_TAG_WIDTH) - 1) +#define CXIP_CS_TAG_WIDTH 40 +#define CXIP_VNI_WIDTH 16 +#define CXIP_CS_TAG_MASK ((1UL << CXIP_CS_TAG_WIDTH) - 1) + /* Define several types of LEs */ enum cxip_le_type { CXIP_LE_TYPE_RX = 0, /* RX data LE */ @@ -587,6 +597,14 @@ union cxip_match_bits { struct { uint64_t rdzv_id_lo : CXIP_RDZV_ID_CMD_WIDTH; }; + /* Client/Server messaging match bits */ + struct { + uint64_t rnr_tag : CXIP_CS_TAG_WIDTH; /* User tag value */ + uint64_t rnr_rsvd : 6; /* Unused, set to 0 */ + uint64_t rnr_cq_data : 1; /* Header data valid */ + uint64_t rnr_tagged : 1; /* Tagged API */ + uint64_t rnr_vni : CXIP_VNI_WIDTH; /* Source VNI */ + }; /* Control LE match bit format for notify/resume */ struct { uint64_t txc_id : 8; @@ -754,6 +772,10 @@ int cxip_cmdq_emic_idc_amo(struct cxip_cmdq *cmdq, bool fetching, bool flush); int cxip_cmdq_emit_dma_amo(struct cxip_cmdq *cmdq, struct c_dma_amo_cmd *amo, uint64_t flags, bool fetching, bool flush); +int cxip_cmdq_emit_idc_msg(struct cxip_cmdq *cmdq, + const struct c_cstate_cmd *c_state, + const struct c_idc_msg_hdr *msg, const void *buf, + size_t len, uint64_t flags); /* OFI Provider Structures */ @@ -812,6 +834,11 @@ struct cxip_telemetry_entry { unsigned long value; }; +struct cxip_domain_cmdq { + struct dlist_entry entry; + struct cxip_cmdq *cmdq; +}; + /* * CXI Provider Domain object */ @@ -921,8 +948,39 @@ struct cxip_domain { /* Domain has been configured with FI_AV_USER_ID. */ bool av_user_id; + + /* Domain level TX command queues used when number of authorization + * keys exceeds LCID limit. 
+ */ + struct dlist_entry cmdq_list; + unsigned int cmdq_cnt; + struct ofi_genlock cmdq_lock; + size_t tx_size; }; +int cxip_domain_emit_idc_put(struct cxip_domain *dom, uint16_t vni, + enum cxi_traffic_class tc, + const struct c_cstate_cmd *c_state, + const struct c_idc_put_cmd *put, const void *buf, + size_t len, uint64_t flags); +int cxip_domain_emit_dma(struct cxip_domain *dom, uint16_t vni, + enum cxi_traffic_class tc, struct c_full_dma_cmd *dma, + uint64_t flags); +int cxip_domain_emit_idc_amo(struct cxip_domain *dom, uint16_t vni, + enum cxi_traffic_class tc, + const struct c_cstate_cmd *c_state, + const struct c_idc_amo_cmd *amo, uint64_t flags, + bool fetching, bool flush); +int cxip_domain_emit_dma_amo(struct cxip_domain *dom, uint16_t vni, + enum cxi_traffic_class tc, + struct c_dma_amo_cmd *amo, uint64_t flags, + bool fetching, bool flush); +int cxip_domain_emit_idc_msg(struct cxip_domain *dom, uint16_t vni, + enum cxi_traffic_class tc, + const struct c_cstate_cmd *c_state, + const struct c_idc_msg_hdr *msg, const void *buf, + size_t len, uint64_t flags); + static inline bool cxip_domain_mr_cache_enabled(struct cxip_domain *dom) { return dom->iomm.domain == &dom->util_domain; @@ -1034,10 +1092,17 @@ struct cxip_ux_dump_state { struct cxip_req_recv { /* Receive parameters */ struct dlist_entry rxc_entry; - struct cxip_rxc *rxc; // receive context + union { + struct cxip_rxc *rxc; + struct cxip_rxc_hpc *rxc_hpc; + struct cxip_rxc_rnr *rxc_rnr; + }; + struct cxip_cntr *cntr; void *recv_buf; // local receive buffer struct cxip_md *recv_md; // local receive MD + bool hybrid_md; // True if MD was provided + bool success_disable; uint32_t ulen; // User buffer length bool tagged; uint64_t tag; @@ -1092,7 +1157,11 @@ struct cxip_req_recv { struct cxip_req_send { /* Send parameters */ - struct cxip_txc *txc; + union { + struct cxip_txc *txc; + struct cxip_txc_hpc *txc_hpc; + struct cxip_txc_rnr *txc_rnr; + }; struct cxip_cntr *cntr; const void *buf; // local send buffer size_t len; // request length @@ -1100,6 +1169,8 @@ struct cxip_req_send { struct cxip_addr caddr; fi_addr_t dest_addr; bool tagged; + bool hybrid_md; + bool success_disable; uint32_t tclass; uint64_t tag; uint64_t data; @@ -1115,6 +1186,11 @@ struct cxip_req_send { }; int rc; // DMA return code int rdzv_send_events; // Processed event count + uint64_t max_rnr_time; + uint64_t retry_rnr_time; + struct dlist_entry rnr_entry; + int retries; + bool canceled; }; struct cxip_req_rdzv_src { @@ -1125,7 +1201,7 @@ struct cxip_req_rdzv_src { }; struct cxip_req_search { - struct cxip_rxc *rxc; + struct cxip_rxc_hpc *rxc; bool complete; int puts_pending; }; @@ -1219,6 +1295,7 @@ struct cxip_ctrl_req_mr { struct cxip_ctrl_send { uint32_t nic_addr; uint32_t pid; + uint16_t vni; union cxip_match_bits mb; }; @@ -1245,7 +1322,7 @@ struct cxip_mr_lac_cache { struct cxip_fc_peer { struct dlist_entry txc_entry; - struct cxip_txc *txc; + struct cxip_txc_hpc *txc; struct cxip_ctrl_req req; struct cxip_addr caddr; struct dlist_entry msg_queue; @@ -1258,10 +1335,11 @@ struct cxip_fc_peer { struct cxip_fc_drops { struct dlist_entry rxc_entry; - struct cxip_rxc *rxc; + struct cxip_rxc_hpc *rxc; struct cxip_ctrl_req req; uint32_t nic_addr; uint32_t pid; + uint16_t vni; uint16_t drops; unsigned int retry_count; }; @@ -1711,64 +1789,121 @@ cxip_msg_counters_msg_record(struct cxip_msg_counters *cntrs, * cannot be delivered due to EQ full, delay before retrying. 
*/ #define CXIP_DONE_NOTIFY_RETRY_DELAY_US 100 + +#define RXC_RESERVED_FC_SLOTS 1 + +/* RXC specialization API support */ +struct cxip_rxc_ops { + ssize_t (*recv_common)(struct cxip_rxc *rxc, void *buf, size_t len, + void *desc, fi_addr_t src_add, uint64_t tag, + uint64_t ignore, void *context, uint64_t flags, + bool tagged, struct cxip_cntr *comp_cntr); + void (*progress)(struct cxip_rxc *rxc); + void (*recv_req_tgt_event)(struct cxip_req *req, + const union c_event *event); + int (*cancel_msg_recv)(struct cxip_req *req); + int (*ctrl_msg_cb)(struct cxip_ctrl_req *req, + const union c_event *event); + void (*init_struct)(struct cxip_rxc *rxc, struct cxip_ep_obj *ep_obj); + void (*fini_struct)(struct cxip_rxc *rxc); + void (*cleanup)(struct cxip_rxc *rxc); + int (*msg_init)(struct cxip_rxc *rxc); + int (*msg_fini)(struct cxip_rxc *rxc); +}; + /* - * Endpoint object receive context + * Receive context base object */ struct cxip_rxc { void *context; - struct cxip_cq *recv_cq; - struct cxip_cntr *recv_cntr; - - struct cxip_ep_obj *ep_obj; // parent EP object - struct cxip_domain *domain; // parent domain - uint8_t pid_bits; - - struct dlist_entry ep_list; // contains EPs using shared context + uint32_t protocol; struct fi_rx_attr attr; bool selective_completion; + bool hmem; + bool trunc_ok; bool sw_ep_only; + bool msg_offload; + uint8_t pid_bits; // Zero without SEP + uint8_t recv_ptl_idx; + + enum cxip_rxc_state state; + + /* Reverse link to EP object that owns this context */ + struct cxip_ep_obj *ep_obj; + + struct cxip_cq *recv_cq; + struct cxip_cntr *recv_cntr; + struct cxip_rxc_ops ops; + + struct cxip_domain *domain; + + /* RXC receive portal table, event queue and hardware + * command queue. + */ struct cxip_evtq rx_evtq; - struct cxip_pte *rx_pte; // HW RX Queue - struct cxip_cmdq *rx_cmdq; // RX CMDQ for posting receive buffers - struct cxip_cmdq *tx_cmdq; // TX CMDQ for Message Gets + struct cxip_pte *rx_pte; + struct cxip_cmdq *rx_cmdq; + ofi_atomic32_t orx_reqs; - /* Number of unexpected list entries in HW. */ - ofi_atomic32_t orx_hw_ule_cnt; - ofi_atomic32_t orx_reqs; // outstanding receive requests - ofi_atomic32_t orx_tx_reqs; // outstanding RX initiated TX requests + /* If FI_MULTI_RECV is supported, minimum receive size required + * for buffers posted. + */ + size_t min_multi_recv; + + /* If TX events are required by specialization, the maximum + * credits that can be used. + */ int32_t max_tx; unsigned int recv_appends; + struct cxip_msg_counters cntrs; +}; + +/* Receive context specialization for supporting HPC messaging + * that requires SAS implemented in a Portals environment. + */ +struct cxip_rxc_hpc { + /* Must be first */ + struct cxip_rxc base; + + int max_eager_size; + uint64_t rget_align_mask; + /* Window when FI_CLAIM mutual exclusive access is required */ bool hw_claim_in_progress; - size_t min_multi_recv; - int max_eager_size; + int sw_ux_list_len; + int sw_pending_ux_list_len; - /* Flow control/software state change metrics */ - int num_fc_eq_full; - int num_fc_no_match; - int num_fc_unexp; - int num_fc_append_fail; - int num_fc_req_full; - int num_sc_nic_hw2sw_append_fail; - int num_sc_nic_hw2sw_unexp; + /* Number of unexpected list entries in HW. */ + ofi_atomic32_t orx_hw_ule_cnt; + + /* RX context transmit queue, required for rendezvous + * gets. + */ + struct cxip_cmdq *tx_cmdq; + ofi_atomic32_t orx_tx_reqs; + + /* Software receive queue. User posted requests are queued here instead + * of on hardware if the RXC is in software endpoint mode. 
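The cxip_rxc_ops table and the "Must be first" base members introduced above (and the matching cxip_txc_ops/cxip_txc_hpc/cxip_txc_rnr changes later in this header) follow a common C specialization pattern: the base struct is embedded as the first member so a pointer to the base can be converted back to the derived type, and protocol-specific behavior is dispatched through the ops table. The following standalone sketch illustrates the pattern with made-up names; it is not the provider's code.

#include <stdint.h>
#include <stdio.h>

struct rxc;                                /* base type */

struct rxc_ops {
    void (*progress)(struct rxc *rxc);     /* per-protocol dispatch */
};

struct rxc {
    struct rxc_ops ops;
    uint32_t protocol;
};

struct rxc_hpc {
    struct rxc base;                       /* must be first */
    int sw_ux_list_len;                    /* specialization-only state */
};

static void hpc_progress(struct rxc *rxc)
{
    /* Valid because base is the first member of the specialization. */
    struct rxc_hpc *hpc = (struct rxc_hpc *) rxc;

    printf("HPC progress: %d unexpected sends queued\n",
           hpc->sw_ux_list_len);
}

int main(void)
{
    struct rxc_hpc hpc = {
        .base = { .ops = { .progress = hpc_progress }, .protocol = 1 },
        .sw_ux_list_len = 3,
    };
    struct rxc *rxc = &hpc.base;

    rxc->ops.progress(rxc);                /* virtual-style dispatch */
    return 0;
}

The TXC_BASE()/RXC_BASE() logging macros later in this diff rely on the same first-member guarantee when they cast a specialized context back to the base type.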
+ */ + struct dlist_entry sw_recv_queue; + + /* Defer events to wait for both put and put overflow */ + struct def_event_ht deferred_events; /* Unexpected message handling */ struct cxip_ptelist_bufpool *req_list_bufpool; struct cxip_ptelist_bufpool *oflow_list_bufpool; - /* Defer events to wait for both put and put overflow */ - struct def_event_ht deferred_events; + enum cxip_rxc_state prev_state; + enum cxip_rxc_state new_state; + enum c_sc_reason fc_reason; - struct dlist_entry fc_drops; - struct dlist_entry replay_queue; - struct dlist_entry sw_ux_list; - struct dlist_entry sw_pending_ux_list; - int sw_ux_list_len; - int sw_pending_ux_list_len; + /* RXC drop count used for FC accounting. */ + int drop_count; /* Array of 8-byte of unexpected headers remote offsets. */ uint64_t *ule_offsets; @@ -1779,24 +1914,32 @@ struct cxip_rxc { */ unsigned int cur_ule_offsets; - /* Software receive queue. User posted requests are queued here instead - * of on hardware if the RXC is in software endpoint mode. - */ - struct dlist_entry sw_recv_queue; - - enum cxip_rxc_state state; - enum cxip_rxc_state prev_state; - enum cxip_rxc_state new_state; - enum c_sc_reason fc_reason; + struct dlist_entry fc_drops; + struct dlist_entry replay_queue; + struct dlist_entry sw_ux_list; + struct dlist_entry sw_pending_ux_list; - bool msg_offload; - uint64_t rget_align_mask; + /* Flow control/software state change metrics */ + int num_fc_eq_full; + int num_fc_no_match; + int num_fc_unexp; + int num_fc_append_fail; + int num_fc_req_full; + int num_sc_nic_hw2sw_append_fail; + int num_sc_nic_hw2sw_unexp; +}; - /* RXC drop count used for FC accounting. */ - int drop_count; - bool hmem; +/* Receive context specialization for supporting client/server + * messaging. + */ +struct cxip_rxc_rnr { + /* Must be first */ + struct cxip_rxc base; - struct cxip_msg_counters cntrs; + bool hybrid_mr_desc; + /* Used when success events are not required */ + struct cxip_req *req_selective_comp_msg; + struct cxip_req *req_selective_comp_tag; }; static inline void cxip_copy_to_md(struct cxip_md *md, void *dest, @@ -1865,7 +2008,7 @@ struct cxip_ptelist_bufpool_attr { struct cxip_ptelist_bufpool { struct cxip_ptelist_bufpool_attr attr; - struct cxip_rxc *rxc; + struct cxip_rxc_hpc *rxc; size_t buf_alignment; /* Ordered list of buffers emitted to hardware */ @@ -1900,7 +2043,7 @@ struct cxip_ptelist_buf { struct cxip_ptelist_bufpool *pool; /* RX context the request buffer is posted on. */ - struct cxip_rxc *rxc; + struct cxip_rxc_hpc *rxc; enum cxip_le_type le_type; struct dlist_entry buf_entry; struct cxip_req *req; @@ -1931,7 +2074,7 @@ struct cxip_ptelist_buf { char *data; }; -int cxip_ptelist_bufpool_init(struct cxip_rxc *rxc, +int cxip_ptelist_bufpool_init(struct cxip_rxc_hpc *rxc, struct cxip_ptelist_bufpool **pool, struct cxip_ptelist_bufpool_attr *attr); void cxip_ptelist_bufpool_fini(struct cxip_ptelist_bufpool *pool); @@ -1948,15 +2091,15 @@ void cxip_ptelist_buf_consumed(struct cxip_ptelist_buf *buf); * cxip_req_bufpool_init() - Initialize PtlTE request list buffer management * object. */ -int cxip_req_bufpool_init(struct cxip_rxc *rxc); -void cxip_req_bufpool_fini(struct cxip_rxc *rxc); +int cxip_req_bufpool_init(struct cxip_rxc_hpc *rxc); +void cxip_req_bufpool_fini(struct cxip_rxc_hpc *rxc); /* * cxip_oflow_bufpool_init() - Initialize PtlTE overflow list buffer management * object. 
*/ -int cxip_oflow_bufpool_init(struct cxip_rxc *rxc); -void cxip_oflow_bufpool_fini(struct cxip_rxc *rxc); +int cxip_oflow_bufpool_init(struct cxip_rxc_hpc *rxc); +void cxip_oflow_bufpool_fini(struct cxip_rxc_hpc *rxc); void _cxip_req_buf_ux_free(struct cxip_ux_send *ux, bool repost); void cxip_req_buf_ux_free(struct cxip_ux_send *ux); @@ -1971,7 +2114,7 @@ void cxip_req_buf_ux_free(struct cxip_ux_send *ux); /* Base rendezvous PtlTE object */ struct cxip_rdzv_pte { - struct cxip_txc *txc; + struct cxip_txc_hpc *txc; struct cxip_pte *pte; /* Count of the number of buffers successfully linked on this PtlTE. */ @@ -2011,26 +2154,47 @@ struct cxip_rdzv_nomatch_pte { #define CXIP_TXC_FORCE_ERR_ALT_READ_PROTO_ALLOC (1 << 0) #endif +/* TXC specialization API support */ +struct cxip_txc_ops { + ssize_t (*send_common)(struct cxip_txc *txc, uint32_t tclass, + const void *buf, size_t len, void *desc, + uint64_t data, fi_addr_t dest_addr, uint64_t tag, + void *context, uint64_t flags, bool tagged, + bool triggered, uint64_t trig_thresh, + struct cxip_cntr *trig_cntr, + struct cxip_cntr *comp_cntr); + void (*progress)(struct cxip_txc *txc); + int (*cancel_msg_send)(struct cxip_req *req); + void (*init_struct)(struct cxip_txc *txc, struct cxip_ep_obj *ep_obj); + void (*fini_struct)(struct cxip_txc *txc); + void (*cleanup)(struct cxip_txc *txc); + int (*msg_init)(struct cxip_txc *txc); + int (*msg_fini)(struct cxip_txc *txc); +}; + /* * Endpoint object transmit context */ struct cxip_txc { void *context; + + uint32_t protocol; bool enabled; bool hrp_war_req; // Non-fetching 32-bit HRP - bool hmem; + bool trunc_ok; struct cxip_cq *send_cq; struct cxip_cntr *send_cntr; struct cxip_cntr *read_cntr; struct cxip_cntr *write_cntr; + struct cxip_txc_ops ops; + struct cxip_ep_obj *ep_obj; // parent EP object struct cxip_domain *domain; // parent domain uint8_t pid_bits; - - struct dlist_entry ep_list; // contains EPs using shared context + uint8_t recv_ptl_idx; struct fi_tx_attr attr; // attributes bool selective_completion; @@ -2045,33 +2209,73 @@ struct cxip_txc { struct cxip_cmdq *tx_cmdq; // added during cxip_txc_enable() ofi_atomic32_t otx_reqs; // outstanding transmit requests + /* Queue of TX messages in flight for the context */ + struct dlist_entry msg_queue; + struct cxip_req *rma_write_selective_completion_req; struct cxip_req *rma_read_selective_completion_req; struct cxip_req *amo_selective_completion_req; struct cxip_req *amo_fetch_selective_completion_req; - /* Rendezvous related structures */ + struct dlist_entry dom_entry; +}; + +/* Default HPC SAS TXC specialization */ +struct cxip_txc_hpc { + /* Must remain first */ + struct cxip_txc base; + + int max_eager_size; + int rdzv_eager_size; + + /* Rendezvous messaging support */ struct cxip_rdzv_match_pte *rdzv_pte; struct cxip_rdzv_nomatch_pte *rdzv_nomatch_pte[RDZV_NO_MATCH_PTES]; struct indexer rdzv_ids; struct indexer msg_rdzv_ids; enum cxip_rdzv_proto rdzv_proto; - /* Match complete IDs */ - struct indexer tx_ids; - - int max_eager_size; - int rdzv_eager_size; struct cxip_cmdq *rx_cmdq; // Target cmdq for Rendezvous buffers #if ENABLE_DEBUG uint64_t force_err; #endif /* Flow Control recovery */ - struct dlist_entry msg_queue; struct dlist_entry fc_peers; - struct dlist_entry dom_entry; + /* Match complete IDs */ + struct indexer tx_ids; + +}; + +/* Client/server derived TXC, does not support SAS ordering + * or remotely buffered unexpected messages. 
+ */ +#define CXIP_RNR_TIMEOUT_US 500000 +#define CXIP_NUM_RNR_WAIT_QUEUE 5 + +struct cxip_txc_rnr { + /* Must remain first */ + struct cxip_txc base; + + uint64_t max_retry_wait_us; /* Maximum time to retry any request */ + ofi_atomic32_t time_wait_reqs; /* Number of RNR time wait reqs */ + uint64_t next_retry_wait_us; /* Time of next retry in all queues */ + uint64_t total_retries; + uint64_t total_rnr_nacks; + bool hybrid_mr_desc; + + /* Used when success events are not required */ + struct cxip_req *req_selective_comp_msg; + struct cxip_req *req_selective_comp_tag; + + /* There are CXIP_NUM_RNR_WAIT_QUEUE queues where each queue has + * a specified time wait value and where the last queue is has the + * maximum time wait value before retrying (and is used for all + * subsequent retries). This implementation allows each queue to + * be maintained in retry order with a simple append of the request. + */ + struct dlist_entry time_wait_queue[CXIP_NUM_RNR_WAIT_QUEUE]; }; int cxip_txc_emit_idc_put(struct cxip_txc *txc, uint16_t vni, @@ -2097,9 +2301,56 @@ int cxip_txc_emit_dma_amo(struct cxip_txc *txc, uint16_t vni, struct cxip_cntr *trig_cntr, size_t trig_thresh, struct c_dma_amo_cmd *amo, uint64_t flags, bool fetching, bool flush); +int cxip_txc_emit_idc_msg(struct cxip_txc *txc, uint16_t vni, + enum cxi_traffic_class tc, + enum cxi_traffic_class_type tc_type, + const struct c_cstate_cmd *c_state, + const struct c_idc_msg_hdr *msg, const void *buf, + size_t len, uint64_t flags); void cxip_txc_flush_msg_trig_reqs(struct cxip_txc *txc); +/* + * Endpoint Control Object + * + * Groups control MR and messaging structures that can be exclusively used + * for a standard EP or globally shared in a SEP by all RX/TX context. + */ +struct cxip_ctrl { + /* wait object is required to wake up CQ waiters + * when control progress is required. + */ + struct cxil_wait_obj *wait; + + struct cxi_eq *tgt_evtq; + struct cxi_eq *tx_evtq; + + /* TX command queue is used to initiate side-band messaging + * and is TX credit based. + */ + struct cxip_cmdq *txq; + unsigned int tx_credits; + + /* Target command queue is used for appending RX side-band + * messaging control LE and managing standard MR LE. + */ + struct cxip_cmdq *tgq; + struct cxip_pte *pte; + struct cxip_ctrl_req msg_req; + + /* FI_MR_PROV_KEY caching, protected with ep_obj->lock */ + struct cxip_mr_lac_cache std_mr_cache[CXIP_NUM_CACHED_KEY_LE]; + struct cxip_mr_lac_cache opt_mr_cache[CXIP_NUM_CACHED_KEY_LE]; + + struct dlist_entry mr_list; + + /* Event queue buffers */ + void *tgt_evtq_buf; + struct cxi_md *tgt_evtq_buf_md; + void *tx_evtq_buf; + struct cxi_md *tx_evtq_buf_md; +}; + /* * Base Endpoint Object * @@ -2124,20 +2375,26 @@ struct cxip_ep_obj { uint16_t *vnis; size_t vni_count; - bool enabled; - - struct cxil_wait_obj *ctrl_wait; - struct cxi_eq *ctrl_tgt_evtq; - struct cxi_eq *ctrl_tx_evtq; - struct cxip_addr src_addr; fi_addr_t fi_addr; + bool enabled; + + /* Endpoint protocol implementations. + * FI_PROTO_CXI - Portals SAS protocol + */ + uint32_t protocol; + struct cxip_txc *txc; + struct cxip_rxc *rxc; + /* ASIC version associated with EP/Domain */ enum cassini_version asic_ver; - struct cxip_txc txc; - struct cxip_rxc rxc; + /* Information that might be owned by an EP (or a SEP + * when implemented). Should ultimately be a pointer + * to a base/specialization. + */ + struct cxip_ctrl ctrl; /* Command queues. Each EP has 1 transmit and 1 target * command queue that can be shared. 
An optional 2nd transmit @@ -2149,15 +2406,6 @@ struct cxip_ep_obj { ofi_atomic32_t tgq_ref; struct cxip_cmdq *rx_txq; - /* Portals flow-control recovery messaging uses a credit - * scheme to avoid over-running the associated event queue. - */ - struct cxip_cmdq *ctrl_txq; - struct cxip_cmdq *ctrl_tgq; - unsigned int ctrl_tx_credits; - struct cxip_pte *ctrl_pte; - struct cxip_ctrl_req ctrl_msg_req; - /* Libfabric software EQ resource */ struct cxip_eq *eq; struct dlist_entry eq_link; @@ -2172,17 +2420,6 @@ struct cxip_ep_obj { struct cxip_ep_coll_obj coll; struct cxip_ep_zbcoll_obj zbcoll; - /* Flow control recovery event queue buffers */ - void *ctrl_tgt_evtq_buf; - struct cxi_md *ctrl_tgt_evtq_buf_md; - void *ctrl_tx_evtq_buf; - struct cxi_md *ctrl_tx_evtq_buf_md; - - /* FI_MR_PROV_KEY caching, protected with ep_obj->lock */ - struct cxip_mr_lac_cache std_mr_cache[CXIP_NUM_CACHED_KEY_LE]; - struct cxip_mr_lac_cache opt_mr_cache[CXIP_NUM_CACHED_KEY_LE]; - struct dlist_entry mr_list; - size_t txq_size; size_t tgq_size; ofi_atomic32_t ref; @@ -2573,7 +2810,9 @@ struct cxip_coll_mc { int next_red_id; // next available red_id int max_red_id; // limit total concurrency int seqno; // rolling seqno for packets + bool is_multicast; // true if multicast address bool arm_disable; // arm-disable for testing + bool retry_disable; // retry-disable for testing bool is_joined; // true if joined bool rx_discard; // true to discard RX events enum cxi_traffic_class tc; // traffic class @@ -2680,9 +2919,9 @@ struct cxip_fid_list { struct fid *fid; }; -int cxip_rdzv_match_pte_alloc(struct cxip_txc *txc, +int cxip_rdzv_match_pte_alloc(struct cxip_txc_hpc *txc, struct cxip_rdzv_match_pte **rdzv_pte); -int cxip_rdzv_nomatch_pte_alloc(struct cxip_txc *txc, int lac, +int cxip_rdzv_nomatch_pte_alloc(struct cxip_txc_hpc *txc, int lac, struct cxip_rdzv_nomatch_pte **rdzv_pte); int cxip_rdzv_pte_src_req_alloc(struct cxip_rdzv_match_pte *pte, int lac); void cxip_rdzv_match_pte_free(struct cxip_rdzv_match_pte *pte); @@ -2701,9 +2940,9 @@ int cxip_alloc_lni(struct cxip_if *iface, uint32_t svc_id, void cxip_free_lni(struct cxip_lni *lni); const char *cxi_tc_str(enum cxi_traffic_class tc); enum cxi_traffic_class cxip_ofi_to_cxi_tc(uint32_t ofi_tclass); -int cxip_txq_cp_set(struct cxip_cmdq *cmdq, uint16_t vni, - enum cxi_traffic_class tc, - enum cxi_traffic_class_type tc_type); +int cxip_cmdq_cp_set(struct cxip_cmdq *cmdq, uint16_t vni, + enum cxi_traffic_class tc, + enum cxi_traffic_class_type tc_type); void cxip_if_init(void); void cxip_if_fini(void); @@ -2745,6 +2984,19 @@ void cxip_cmdq_free(struct cxip_cmdq *cmdq); int cxip_cmdq_emit_c_state(struct cxip_cmdq *cmdq, const struct c_cstate_cmd *cmd); +static inline bool cxip_cmdq_empty(struct cxip_cmdq *cmdq) +{ + return cxi_cq_empty(cmdq->dev_cmdq); +} + +static inline bool cxip_cmdq_match(struct cxip_cmdq *cmdq, uint16_t vni, + enum cxi_traffic_class tc, + enum cxi_traffic_class_type tc_type) +{ + return (cmdq->cur_cp->vni == vni) && (cmdq->cur_cp->tc == tc) && + (cmdq->cur_cp->tc_type == tc_type); +} + int cxip_evtq_init(struct cxip_evtq *evtq, struct cxip_cq *cq, size_t num_events, size_t num_fc_events); void cxip_evtq_fini(struct cxip_evtq *eq); @@ -2758,12 +3010,12 @@ int cxip_fabric(struct fi_fabric_attr *attr, struct fid_fabric **fabric, int cxip_endpoint(struct fid_domain *domain, struct fi_info *info, struct fid_ep **ep, void *context); -int cxip_tx_id_alloc(struct cxip_txc *txc, void *ctx); -int cxip_tx_id_free(struct cxip_txc *txc, int id); -void 
*cxip_tx_id_lookup(struct cxip_txc *txc, int id); -int cxip_rdzv_id_alloc(struct cxip_txc *txc, struct cxip_req *req); -int cxip_rdzv_id_free(struct cxip_txc *txc, int id); -void *cxip_rdzv_id_lookup(struct cxip_txc *txc, int id); +int cxip_tx_id_alloc(struct cxip_txc_hpc *txc, void *ctx); +int cxip_tx_id_free(struct cxip_txc_hpc *txc, int id); +void *cxip_tx_id_lookup(struct cxip_txc_hpc *txc, int id); +int cxip_rdzv_id_alloc(struct cxip_txc_hpc *txc, struct cxip_req *req); +int cxip_rdzv_id_free(struct cxip_txc_hpc *txc, int id); +void *cxip_rdzv_id_lookup(struct cxip_txc_hpc *txc, int id); int cxip_ep_cmdq(struct cxip_ep_obj *ep_obj, bool transmit, uint32_t tclass, struct cxi_eq *evtq, struct cxip_cmdq **cmdq); void cxip_ep_cmdq_put(struct cxip_ep_obj *ep_obj, bool transmit); @@ -2772,23 +3024,41 @@ int cxip_recv_ux_sw_matcher(struct cxip_ux_send *ux); int cxip_recv_req_sw_matcher(struct cxip_req *req); int cxip_recv_cancel(struct cxip_req *req); int cxip_fc_process_drops(struct cxip_ep_obj *ep_obj, uint32_t nic_addr, - uint32_t pid, uint16_t drops); + uint32_t pid, uint16_t vni, uint16_t drops); void cxip_recv_pte_cb(struct cxip_pte *pte, const union c_event *event); void cxip_rxc_req_fini(struct cxip_rxc *rxc); int cxip_rxc_oflow_init(struct cxip_rxc *rxc); void cxip_rxc_oflow_fini(struct cxip_rxc *rxc); -int cxip_fc_resume(struct cxip_ep_obj *ep_obj, uint32_t nic_addr, uint32_t pid); +int cxip_fc_resume(struct cxip_ep_obj *ep_obj, uint32_t nic_addr, uint32_t pid, + uint16_t vni); void cxip_txc_struct_init(struct cxip_txc *txc, const struct fi_tx_attr *attr, void *context); +struct cxip_txc *cxip_txc_calloc(struct cxip_ep_obj *ep_obj, void *context); +void cxip_txc_free(struct cxip_txc *txc); int cxip_txc_enable(struct cxip_txc *txc); void cxip_txc_disable(struct cxip_txc *txc); struct cxip_txc *cxip_stx_alloc(const struct fi_tx_attr *attr, void *context); -int cxip_rxc_msg_enable(struct cxip_rxc *rxc, uint32_t drop_count); +int cxip_rxc_msg_enable(struct cxip_rxc_hpc *rxc, uint32_t drop_count); + +struct cxip_rxc *cxip_rxc_calloc(struct cxip_ep_obj *ep_obj, void *context); +void cxip_rxc_free(struct cxip_rxc *rxc); int cxip_rxc_enable(struct cxip_rxc *rxc); void cxip_rxc_disable(struct cxip_rxc *rxc); void cxip_rxc_struct_init(struct cxip_rxc *rxc, const struct fi_rx_attr *attr, void *context); +void cxip_rxc_recv_req_cleanup(struct cxip_rxc *rxc); + +int cxip_rxc_emit_dma(struct cxip_rxc_hpc *rxc, uint16_t vni, + enum cxi_traffic_class tc, + enum cxi_traffic_class_type tc_type, + struct c_full_dma_cmd *dma, uint64_t flags); +int cxip_rxc_emit_idc_msg(struct cxip_rxc_hpc *rxc, uint16_t vni, + enum cxi_traffic_class tc, + enum cxi_traffic_class_type tc_type, + const struct c_cstate_cmd *c_state, + const struct c_idc_msg_hdr *msg, const void *buf, + size_t len, uint64_t flags); int cxip_eq_open(struct fid_fabric *fabric, struct fi_eq_attr *attr, struct fid_eq **eq, void *context); @@ -2936,19 +3206,6 @@ static inline void cxip_txq_ring(struct cxip_cmdq *cmdq, bool more, } } -ssize_t cxip_send_common(struct cxip_txc *txc, uint32_t tclass, - const void *buf, size_t len, - void *desc, uint64_t data, fi_addr_t dest_addr, - uint64_t tag, void *context, uint64_t flags, - bool tagged, bool triggered, uint64_t trig_thresh, - struct cxip_cntr *trig_cntr, - struct cxip_cntr *comp_cntr); - -ssize_t cxip_recv_common(struct cxip_rxc *rxc, void *buf, size_t len, - void *desc, fi_addr_t src_addr, uint64_t tag, - uint64_t ignore, void *context, uint64_t flags, - bool tagged, struct cxip_cntr 
*comp_cntr); - ssize_t cxip_rma_common(enum fi_op_type op, struct cxip_txc *txc, const void *buf, size_t len, void *desc, fi_addr_t tgt_addr, uint64_t addr, @@ -3086,64 +3343,53 @@ static inline bool is_netsim(struct cxip_ep_obj *ep_obj) } /* debugging TRACE functions */ -#define cxip_trace_attr __attribute__((format(__printf__, 1, 2))) -typedef int (*cxip_trace_t)(const char *fmt, ...); -extern cxip_trace_t cxip_trace_attr cxip_trace_fn; - -typedef void (*cxip_trace_flush_t)(void); -extern cxip_trace_flush_t cxip_trace_flush_fn; - -typedef void (*cxip_trace_close_t)(void); -extern cxip_trace_close_t cxip_trace_close_fn; - -typedef bool (*cxip_trace_enable_t)(bool enable); -extern cxip_trace_enable_t cxip_trace_enable_fn; - -extern bool cxip_trace_enabled; // true if tracing is enabled -extern bool cxip_trace_append; // append open for trace file -extern bool cxip_trace_linebuf; // set line buffering for trace -extern int cxip_trace_rank; // tracing rank -extern int cxip_trace_numranks; // tracing number of ranks -extern FILE *cxip_trace_fid; // trace output file descriptor - -int cxip_trace_attr cxip_trace(const char *fmt, ...); -void cxip_trace_flush(void); -void cxip_trace_close(void); -bool cxip_trace_enable(bool enable); +#define cxip_coll_trace_attr __attribute__((format(__printf__, 1, 2))) +extern bool cxip_coll_trace_muted; // suppress output if true +extern bool cxip_coll_trace_append; // append open for trace file +extern bool cxip_coll_trace_linebuf; // set line buffering for trace +extern int cxip_coll_trace_rank; // tracing rank +extern int cxip_coll_trace_numranks; // tracing number of ranks +extern FILE *cxip_coll_trace_fid; // trace output file descriptor + +int cxip_coll_trace_attr cxip_coll_trace(const char *fmt, ...); +void cxip_coll_trace_flush(void); +void cxip_coll_trace_close(void); +void cxip_coll_trace_init(void); /* debugging TRACE filtering control */ -enum cxip_trace_module { +enum cxip_coll_trace_module { CXIP_TRC_CTRL, CXIP_TRC_ZBCOLL, - CXIP_TRC_CURL, + CXIP_TRC_COLL_CURL, CXIP_TRC_COLL_PKT, CXIP_TRC_COLL_JOIN, CXIP_TRC_COLL_DEBUG, CXIP_TRC_TEST_CODE, CXIP_TRC_MAX }; -extern uint64_t cxip_trace_mask; +extern uint64_t cxip_coll_trace_mask; -static inline void cxip_trace_set(int mod) +static inline void cxip_coll_trace_set(int mod) { - cxip_trace_mask |= (1L << mod); + cxip_coll_trace_mask |= (1L << mod); } -static inline void cxip_trace_clr(int mod) +static inline void cxip_coll_trace_clr(int mod) { - cxip_trace_mask &= ~(1L << mod); + cxip_coll_trace_mask &= ~(1L << mod); } -static inline bool cxip_trace_true(int mod) +static inline bool cxip_coll_trace_true(int mod) { - return cxip_trace_enabled && (cxip_trace_mask & (1L << mod)); + return (!cxip_coll_trace_muted) && (cxip_coll_trace_mask & (1L << mod)); } #if ENABLE_DEBUG -#define CXIP_TRACE(mod, fmt, ...) \ - do {if (cxip_trace_true(mod)) cxip_trace_fn(fmt, ##__VA_ARGS__);} while (0) +#define CXIP_COLL_TRACE(mod, fmt, ...) \ + do {if (cxip_coll_trace_true(mod)) \ + cxip_coll_trace(fmt, ##__VA_ARGS__);} while (0) #else -#define CXIP_TRACE(mod, fmt, ...) do {} while (0) +#define CXIP_COLL_TRACE(mod, fmt, ...) do {} while (0) #endif /* fabric logging implementation functions */ @@ -3170,41 +3416,52 @@ static inline bool cxip_trace_true(int mod) abort(); \ } while (0) +#define TXC_BASE(txc) ((struct cxip_txc *)(void *)(txc)) #define TXC_DBG(txc, fmt, ...) 
\ _CXIP_DBG(FI_LOG_EP_DATA, "TXC (%#x:%u): " fmt "", \ - (txc)->ep_obj->src_addr.nic, (txc)->ep_obj->src_addr.pid, \ - ##__VA_ARGS__) + TXC_BASE(txc)->ep_obj->src_addr.nic, \ + TXC_BASE(txc)->ep_obj->src_addr.pid, ##__VA_ARGS__) +#define TXC_INFO(txc, fmt, ...) \ + _CXIP_INFO(FI_LOG_EP_DATA, "TXC (%#x:%u): " fmt "", \ + TXC_BASE(txc)->ep_obj->src_addr.nic, \ + TXC_BASE(txc)->ep_obj->src_addr.pid, ##__VA_ARGS__) #define TXC_WARN(txc, fmt, ...) \ _CXIP_WARN(FI_LOG_EP_DATA, "TXC (%#x:%u): " fmt "", \ - (txc)->ep_obj->src_addr.nic, (txc)->ep_obj->src_addr.pid, \ - ##__VA_ARGS__) + TXC_BASE(txc)->ep_obj->src_addr.nic, \ + TXC_BASE(txc)->ep_obj->src_addr.pid, ##__VA_ARGS__) #define TXC_WARN_RET(txc, ret, fmt, ...) \ TXC_WARN(txc, "%d:%s: " fmt "", ret, fi_strerror(-ret), ##__VA_ARGS__) #define TXC_FATAL(txc, fmt, ...) \ - CXIP_FATAL("TXC (%#x:%u):: " fmt "", (txc)->ep_obj->src_addr.nic, \ - (txc)->ep_obj->src_addr.pid, ##__VA_ARGS__) + CXIP_FATAL("TXC (%#x:%u):: " fmt "", \ + TXC_BASE(txc)->ep_obj->src_addr.nic, \ + TXC_BASE(txc)->ep_obj->src_addr.pid, ##__VA_ARGS__) +#define RXC_BASE(rxc) ((struct cxip_rxc *)(void *)(rxc)) #define RXC_DBG(rxc, fmt, ...) \ _CXIP_DBG(FI_LOG_EP_DATA, "RXC (%#x:%u) PtlTE %u: " fmt "", \ - (rxc)->ep_obj->src_addr.nic, (rxc)->ep_obj->src_addr.pid, \ - (rxc)->rx_pte->pte->ptn, ##__VA_ARGS__) + RXC_BASE(rxc)->ep_obj->src_addr.nic, \ + RXC_BASE(rxc)->ep_obj->src_addr.pid, \ + RXC_BASE(rxc)->rx_pte->pte->ptn, ##__VA_ARGS__) #define RXC_INFO(rxc, fmt, ...) \ _CXIP_INFO(FI_LOG_EP_DATA, "RXC (%#x:%u) PtlTE %u: " fmt "", \ - (rxc)->ep_obj->src_addr.nic, (rxc)->ep_obj->src_addr.pid, \ - (rxc)->rx_pte->pte->ptn, ##__VA_ARGS__) + RXC_BASE(rxc)->ep_obj->src_addr.nic, \ + RXC_BASE(rxc)->ep_obj->src_addr.pid, \ + RXC_BASE(rxc)->rx_pte->pte->ptn, ##__VA_ARGS__) #define RXC_WARN(rxc, fmt, ...) \ _CXIP_WARN(FI_LOG_EP_DATA, "RXC (%#x:%u) PtlTE %u: " fmt "", \ - (rxc)->ep_obj->src_addr.nic, (rxc)->ep_obj->src_addr.pid, \ - (rxc)->rx_pte->pte->ptn, ##__VA_ARGS__) + RXC_BASE(rxc)->ep_obj->src_addr.nic, \ + RXC_BASE(rxc)->ep_obj->src_addr.pid, \ + RXC_BASE(rxc)->rx_pte->pte->ptn, ##__VA_ARGS__) #define RXC_WARN_ONCE(rxc, fmt, ...) \ _CXIP_WARN_ONCE(FI_LOG_EP_DATA, "RXC (%#x:%u) PtlTE %u: " fmt "", \ - (rxc)->ep_obj->src_addr.nic, (rxc)->ep_obj->src_addr.pid, \ - (rxc)->rx_pte->pte->ptn, ##__VA_ARGS__) + RXC_BASE(rxc)->ep_obj->src_addr.nic, \ + RXC_BASE(rxc)->ep_obj->src_addr.pid, \ + RXC_BASE(rxc)->rx_pte->pte->ptn, ##__VA_ARGS__) #define RXC_FATAL(rxc, fmt, ...) \ CXIP_FATAL("RXC (%#x:%u) PtlTE %u:[Fatal] " fmt "", \ - (rxc)->ep_obj->src_addr.nic, \ - (rxc)->ep_obj->src_addr.pid, \ - (rxc)->rx_pte->pte->ptn, ##__VA_ARGS__) + RXC_BASE(rxc)->ep_obj->src_addr.nic, \ + RXC_BASE(rxc)->ep_obj->src_addr.pid, \ + RXC_BASE(rxc)->rx_pte->pte->ptn, ##__VA_ARGS__) #define DOM_INFO(dom, fmt, ...) \ _CXIP_INFO(FI_LOG_DOMAIN, "DOM (cxi%u:%u:%u:%u:%#x): " fmt "", \ @@ -3327,6 +3584,82 @@ cxip_txc_copy_from_hmem(struct cxip_txc *txc, struct cxip_md *hmem_md, return FI_SUCCESS; } +static inline +int cxip_set_recv_match_id(struct cxip_rxc *rxc, fi_addr_t src_addr, + bool auth_key, uint32_t *match_id, uint16_t *vni) +{ + struct cxip_addr caddr; + int ret; + + /* If FI_DIRECTED_RECV and a src_addr is specified, encode the address + * in the LE for matching. If application AVs are symmetric, use + * logical FI address for matching. Otherwise, use physical address. 
+ */ + if (rxc->attr.caps & FI_DIRECTED_RECV && + src_addr != FI_ADDR_UNSPEC) { + if (rxc->ep_obj->av->symmetric) { + /* PID is not used for matching */ + *match_id = CXI_MATCH_ID(rxc->pid_bits, + C_PID_ANY, src_addr); + *vni = rxc->ep_obj->auth_key.vni; + } else { + ret = cxip_av_lookup_addr(rxc->ep_obj->av, src_addr, + &caddr); + if (ret != FI_SUCCESS) { + RXC_WARN(rxc, "Failed to look up FI addr: %d\n", + ret); + return -FI_EINVAL; + } + + *match_id = CXI_MATCH_ID(rxc->pid_bits, caddr.pid, + caddr.nic); + if (auth_key) + *vni = caddr.vni; + else + *vni = rxc->ep_obj->auth_key.vni; + } + } else { + *match_id = CXI_MATCH_ID_ANY; + *vni = 0; + } + + return FI_SUCCESS; +} + +fi_addr_t cxip_recv_req_src_addr(struct cxip_req *req); +int cxip_recv_req_alloc(struct cxip_rxc *rxc, void *buf, size_t len, + struct cxip_md *md, struct cxip_req **cxip_req, + int (*recv_cb)(struct cxip_req *req, + const union c_event *event)); +void cxip_recv_req_free(struct cxip_req *req); +void cxip_recv_req_report(struct cxip_req *req); +void cxip_recv_req_peek_complete(struct cxip_req *req, + struct cxip_ux_send *ux_send); +struct cxip_req *cxip_mrecv_req_dup(struct cxip_req *mrecv_req); +int cxip_complete_put(struct cxip_req *req, const union c_event *event); +/* XXXX TODO: Remove */ +/* Defines the posted receive interval for checking LE allocation if + * in hybrid RX match mode and preemptive transitions to software + * managed EP are requested. + */ +#define CXIP_HYBRID_RECV_CHECK_INTERVAL (64-1) +#define FC_SW_LE_MSG_FATAL "LE exhaustion during flow control, "\ + "FI_CXI_RX_MATCH_MODE=[hybrid|software] is required\n" +int cxip_recv_pending_ptlte_disable(struct cxip_rxc *rxc, bool check_fc); +int cxip_flush_appends(struct cxip_rxc_hpc *rxc, + int (*flush_cb)(struct cxip_req *req, + const union c_event *event)); +int cxip_recv_req_dropped(struct cxip_req *req); +void cxip_rxc_record_req_stat(struct cxip_rxc *rxc, enum c_ptl_list list, + size_t rlength, struct cxip_req *req); +bool tag_match(uint64_t init_mb, uint64_t mb, uint64_t ib); +bool init_match(struct cxip_rxc *rxc, uint32_t init, uint32_t match_id); +uint32_t cxip_msg_match_id(struct cxip_txc *txc); +void cxip_report_send_completion(struct cxip_req *req, bool sw_cntr); +bool cxip_send_eager_idc(struct cxip_req *req); +void cxip_send_buf_fini(struct cxip_req *req); +int cxip_send_buf_init(struct cxip_req *req); + size_t cxip_ep_get_unexp_msgs(struct fid_ep *fid_ep, struct fi_cq_tagged_entry *entry, size_t count, fi_addr_t *src_addr, size_t *ux_count); diff --git a/prov/cxi/include/fi_cxi_ext.h b/prov/cxi/include/fi_cxi_ext.h index bee868450e1..a2775cbc253 100644 --- a/prov/cxi/include/fi_cxi_ext.h +++ b/prov/cxi/include/fi_cxi_ext.h @@ -56,8 +56,24 @@ enum { FI_OPT_CXI_GET_OPTIMIZED_MRS, /* bool */ FI_OPT_CXI_SET_PROV_KEY_CACHE, /* bool */ FI_OPT_CXI_GET_PROV_KEY_CACHE, /* bool */ + FI_OPT_CXI_SET_RNR_MAX_RETRY_TIME, /* uint64_t */ }; +/* + * Defines for compatibility between main branch features and release + * branches back porting of features required for use. Any value + * included here should map exactly to the value established in the + * main branch (enum or define) and this CXI equivalent will exist forever. + */ +#define FI_CXI_CNTR_EVENTS_BYTES 1 /* FI_CNTR_EVENTS_BYTES */ + +/* + * TODO: Set this to the upstream value prior to releasing software. + * This flag returned in a completion and indicates that the message was + * truncated and that the length indicates the truncated message length. 
+ */ +#define FI_CXI_TRUNC (1ULL << 56) + /* * Execute a given libfabric atomic memory operation as a PCIe operation as * compared to a NIC operation. diff --git a/prov/cxi/src/cxip_atomic.c b/prov/cxi/src/cxip_atomic.c index c71a762f02a..fd16a97084a 100644 --- a/prov/cxi/src/cxip_atomic.c +++ b/prov/cxi/src/cxip_atomic.c @@ -1371,7 +1371,7 @@ static ssize_t cxip_ep_atomic_write(struct fid_ep *fid_ep, const void *buf, .context = context }; - return cxip_amo_common(CXIP_RQ_AMO, &ep->ep_obj->txc, + return cxip_amo_common(CXIP_RQ_AMO, ep->ep_obj->txc, ep->tx_attr.tclass, &msg, NULL, NULL, 0, NULL, NULL, 0, ep->tx_attr.op_flags, false, 0, NULL, NULL); @@ -1402,7 +1402,7 @@ static ssize_t cxip_ep_atomic_writev(struct fid_ep *fid_ep, .context = context }; - return cxip_amo_common(CXIP_RQ_AMO, &ep->ep_obj->txc, + return cxip_amo_common(CXIP_RQ_AMO, ep->ep_obj->txc, ep->tx_attr.tclass, &msg, NULL, NULL, 0, NULL, NULL, 0, ep->tx_attr.op_flags, false, 0, NULL, NULL); @@ -1413,7 +1413,7 @@ static ssize_t cxip_ep_atomic_writemsg(struct fid_ep *fid_ep, uint64_t flags) { struct cxip_ep *ep = container_of(fid_ep, struct cxip_ep, ep); - struct cxip_txc *txc = &ep->ep_obj->txc; + struct cxip_txc *txc = ep->ep_obj->txc; if (flags & ~(CXIP_WRITEMSG_ALLOWED_FLAGS | FI_CXI_UNRELIABLE | @@ -1461,7 +1461,7 @@ static ssize_t cxip_ep_atomic_inject(struct fid_ep *fid_ep, const void *buf, .context = NULL }; - return cxip_amo_common(CXIP_RQ_AMO, &ep->ep_obj->txc, + return cxip_amo_common(CXIP_RQ_AMO, ep->ep_obj->txc, ep->tx_attr.tclass, &msg, NULL, NULL, 0, NULL, NULL, 0, FI_INJECT, false, 0, NULL, NULL); } @@ -1499,7 +1499,7 @@ static ssize_t cxip_ep_atomic_readwrite(struct fid_ep *fid_ep, const void *buf, .context = context }; - return cxip_amo_common(CXIP_RQ_AMO_FETCH, &ep->ep_obj->txc, + return cxip_amo_common(CXIP_RQ_AMO_FETCH, ep->ep_obj->txc, ep->tx_attr.tclass, &msg, NULL, NULL, 0, &resultv, &result_desc, 1, ep->tx_attr.op_flags, false, 0, NULL, NULL); @@ -1534,7 +1534,7 @@ static ssize_t cxip_ep_atomic_readwritev(struct fid_ep *fid_ep, .context = context }; - return cxip_amo_common(CXIP_RQ_AMO_FETCH, &ep->ep_obj->txc, + return cxip_amo_common(CXIP_RQ_AMO_FETCH, ep->ep_obj->txc, ep->tx_attr.tclass, &msg, NULL, NULL, 0, resultv, result_desc, result_count, ep->tx_attr.op_flags, false, 0, NULL, NULL); @@ -1547,7 +1547,7 @@ static ssize_t cxip_ep_atomic_readwritemsg(struct fid_ep *fid_ep, size_t result_count, uint64_t flags) { struct cxip_ep *ep = container_of(fid_ep, struct cxip_ep, ep); - struct cxip_txc *txc = &ep->ep_obj->txc; + struct cxip_txc *txc = ep->ep_obj->txc; enum cxip_amo_req_type req_type; if (flags & ~(CXIP_WRITEMSG_ALLOWED_FLAGS | @@ -1612,7 +1612,7 @@ static ssize_t cxip_ep_atomic_compwrite(struct fid_ep *fid_ep, const void *buf, .context = context }; - return cxip_amo_common(CXIP_RQ_AMO_SWAP, &ep->ep_obj->txc, + return cxip_amo_common(CXIP_RQ_AMO_SWAP, ep->ep_obj->txc, ep->tx_attr.tclass, &msg, &comparev, &result_desc, 1, &resultv, &result_desc, 1, ep->tx_attr.op_flags, false, 0, NULL, NULL); @@ -1650,7 +1650,7 @@ static ssize_t cxip_ep_atomic_compwritev(struct fid_ep *fid_ep, .context = context }; - return cxip_amo_common(CXIP_RQ_AMO_SWAP, &ep->ep_obj->txc, + return cxip_amo_common(CXIP_RQ_AMO_SWAP, ep->ep_obj->txc, ep->tx_attr.tclass, &msg, comparev, compare_desc, compare_count, resultv, result_desc, result_count, ep->tx_attr.op_flags, false, 0, @@ -1666,7 +1666,7 @@ cxip_ep_atomic_compwritemsg(struct fid_ep *fid_ep, uint64_t flags) { struct cxip_ep *ep = container_of(fid_ep, struct cxip_ep, 
ep); - struct cxip_txc *txc = &ep->ep_obj->txc; + struct cxip_txc *txc = ep->ep_obj->txc; if (flags & ~(CXIP_WRITEMSG_ALLOWED_FLAGS | FI_CXI_UNRELIABLE | FI_CXI_WEAK_FENCE)) diff --git a/prov/cxi/src/cxip_cmdq.c b/prov/cxi/src/cxip_cmdq.c index 6d4b28efddf..d2fae71c92b 100644 --- a/prov/cxi/src/cxip_cmdq.c +++ b/prov/cxi/src/cxip_cmdq.c @@ -125,15 +125,14 @@ static int cxip_cp_get(struct cxip_lni *lni, uint16_t vni, return ret; } -int cxip_txq_cp_set(struct cxip_cmdq *cmdq, uint16_t vni, - enum cxi_traffic_class tc, - enum cxi_traffic_class_type tc_type) +int cxip_cmdq_cp_set(struct cxip_cmdq *cmdq, uint16_t vni, + enum cxi_traffic_class tc, + enum cxi_traffic_class_type tc_type) { struct cxi_cp *cp; int ret; - if (cmdq->cur_cp->vni == vni && cmdq->cur_cp->tc == tc && - cmdq->cur_cp->tc_type == tc_type) + if (cxip_cmdq_match(cmdq, vni, tc, tc_type)) return FI_SUCCESS; ret = cxip_cp_get(cmdq->lni, vni, tc, tc_type, &cp); @@ -423,3 +422,36 @@ int cxip_cmdq_emit_dma_amo(struct cxip_cmdq *cmdq, struct c_dma_amo_cmd *amo, return FI_SUCCESS; } + +int cxip_cmdq_emit_idc_msg(struct cxip_cmdq *cmdq, + const struct c_cstate_cmd *c_state, + const struct c_idc_msg_hdr *msg, const void *buf, + size_t len, uint64_t flags) +{ + int ret; + + if (flags & (FI_FENCE | FI_CXI_WEAK_FENCE)) { + ret = cxi_cq_emit_cq_cmd(cmdq->dev_cmdq, C_CMD_CQ_FENCE); + if (ret) { + CXIP_WARN("Failed to issue fence command: %d:%s\n", ret, + fi_strerror(-ret)); + return -FI_EAGAIN; + } + } + + ret = cxip_cmdq_emit_c_state(cmdq, c_state); + if (ret) { + CXIP_WARN("Failed to emit c_state command: %d:%s\n", ret, + fi_strerror(-ret)); + return ret; + } + + ret = cxi_cq_emit_idc_msg(cmdq->dev_cmdq, msg, buf, len); + if (ret) { + CXIP_WARN("Failed to emit idc_msg command: %d:%s\n", ret, + fi_strerror(-ret)); + return -FI_EAGAIN; + } + + return FI_SUCCESS; +} diff --git a/prov/cxi/src/cxip_cntr.c b/prov/cxi/src/cxip_cntr.c index 2f7354330ac..8a0989b479e 100644 --- a/prov/cxi/src/cxip_cntr.c +++ b/prov/cxi/src/cxip_cntr.c @@ -548,12 +548,22 @@ static int cxip_cntr_wait(struct fid_cntr *fid_cntr, uint64_t threshold, uint64_t success = 0; int ret; uint64_t endtime; - + uint64_t start_error = 0; + uint64_t error = 0; if (cntr->attr.wait_obj == FI_WAIT_NONE || threshold > FI_CXI_CNTR_SUCCESS_MAX) return -FI_EINVAL; + /* Determine existing value of error count, if it increments + * the function should return before threshold has been met. + */ + ret = cxip_cntr_get_ct_error(cntr, &start_error); + if (ret) { + CXIP_WARN("Failed to read counter error: %d\n", ret); + return ret; + } + endtime = ofi_timeout_time(timeout); /* Use a triggered list entry setup to fire at the user's threshold. 
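The cxip_cntr_wait() hunks around this point sample the counter's error count up front and (in the following hunk) return -FI_EAVAIL as soon as it advances, rather than waiting out the timeout. A sketch of how a caller might consume that behavior through the generic counter API; the threshold and timeout values here are illustrative.

#include <stdio.h>
#include <rdma/fi_errno.h>
#include <rdma/fi_domain.h>

static int wait_for_completions(struct fid_cntr *cntr, uint64_t threshold)
{
    int ret = fi_cntr_wait(cntr, threshold, 1000 /* ms */);

    if (ret == -FI_EAVAIL) {
        /* At least one operation failed while waiting. */
        fprintf(stderr, "counter reported %lu errors\n",
                (unsigned long) fi_cntr_readerr(cntr));
        return ret;
    }
    if (ret == -FI_ETIMEDOUT)
        fprintf(stderr, "threshold %lu not reached in time\n",
                (unsigned long) threshold);

    return ret;
}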
@@ -579,6 +589,14 @@ static int cxip_cntr_wait(struct fid_cntr *fid_cntr, uint64_t threshold, if (success >= threshold) return FI_SUCCESS; + ret = cxip_cntr_get_ct_error(cntr, &error); + if (ret) { + CXIP_WARN("Failed to read counter error: %d\n", ret); + return ret; + } + if (error != start_error) + return -FI_EAVAIL; + if (ofi_adjust_timeout(endtime, &timeout)) return -FI_ETIMEDOUT; @@ -794,7 +812,8 @@ static int cxip_cntr_verify_attr(struct fi_cntr_attr *attr) if (!attr) return FI_SUCCESS; - if (attr->events != FI_CNTR_EVENTS_COMP) + if (attr->events != FI_CNTR_EVENTS_COMP && + attr->events != FI_CXI_CNTR_EVENTS_BYTES) return -FI_ENOSYS; switch (attr->wait_obj) { diff --git a/prov/cxi/src/cxip_coll.c b/prov/cxi/src/cxip_coll.c index b58983bf62c..3b4bbc0791a 100644 --- a/prov/cxi/src/cxip_coll.c +++ b/prov/cxi/src/cxip_coll.c @@ -29,11 +29,13 @@ #define _MM_GET_FLUSH_ZERO_MODE() ({0;}) #endif -#define TRACE_PKT(fmt, ...) CXIP_TRACE(CXIP_TRC_COLL_PKT, fmt, \ +#define TRACE_PKT(fmt, ...) CXIP_COLL_TRACE(CXIP_TRC_COLL_PKT, fmt, \ ##__VA_ARGS__) -#define TRACE_JOIN(fmt, ...) CXIP_TRACE(CXIP_TRC_COLL_JOIN, fmt, \ +#define TRACE_CURL(fmt, ...) CXIP_COLL_TRACE(CXIP_TRC_COLL_CURL, fmt, \ ##__VA_ARGS__) -#define TRACE_DEBUG(fmt, ...) CXIP_TRACE(CXIP_TRC_COLL_DEBUG, fmt, \ +#define TRACE_JOIN(fmt, ...) CXIP_COLL_TRACE(CXIP_TRC_COLL_JOIN, fmt, \ + ##__VA_ARGS__) +#define TRACE_DEBUG(fmt, ...) CXIP_COLL_TRACE(CXIP_TRC_COLL_DEBUG, fmt, \ ##__VA_ARGS__) // TODO regularize usage of these @@ -99,21 +101,21 @@ struct cxip_coll_cookie { uint32_t mcast_id:13; uint32_t red_id:3; - uint32_t magic: 15; - uint32_t retry: 1; + uint32_t magic: 16; } __attribute__((__packed__)); /* size 4b */ /* Packed header bits and cookie from above */ struct cxip_coll_hdr { - uint64_t seqno:10; - uint64_t arm:1; - uint64_t op:6; - uint64_t redcnt:20; - uint64_t resno:10; - uint64_t red_rc:4; - uint64_t repsum_m:8; - uint64_t repsum_ovflid:2; - uint64_t pad:3; + uint64_t seqno:10; + uint64_t arm:1; + uint64_t op:6; + uint64_t redcnt:20; + uint64_t resno:10; + uint64_t red_rc:4; + uint64_t repsum_m:8; + uint64_t repsum_ovflid:2; + uint64_t retry:1; + uint64_t pad:2; struct cxip_coll_cookie cookie; } __attribute__((__packed__)); /* size 12b */ @@ -260,6 +262,7 @@ void _dump_red_pkt(struct red_pkt *pkt, char *dir) TRACE_PKT("---------------\n"); TRACE_PKT("Reduction packet (%s):\n", dir); TRACE_PKT(" seqno = %d\n", pkt->hdr.seqno); + TRACE_PKT(" retry = %d\n", pkt->hdr.retry); TRACE_PKT(" arm = %d\n", pkt->hdr.arm); TRACE_PKT(" op = %d\n", pkt->hdr.op); TRACE_PKT(" redcnt = %d\n", pkt->hdr.redcnt); @@ -271,7 +274,6 @@ void _dump_red_pkt(struct red_pkt *pkt, char *dir) TRACE_PKT(" .mcast_id = %08x\n", pkt->hdr.cookie.mcast_id); TRACE_PKT(" .red_id = %08x\n", pkt->hdr.cookie.red_id); TRACE_PKT(" .magic = %08x\n", pkt->hdr.cookie.magic); - TRACE_PKT(" .retry = %08x\n", pkt->hdr.cookie.retry); for (i = 0; i < 4; i++) TRACE_PKT(" ival[%d] = %016lx\n", i, data[i]); TRACE_PKT("---------------\n"); @@ -535,6 +537,7 @@ static int _gen_tx_dfa(struct cxip_coll_reduction *reduction, int av_set_idx, union c_fab_addr *dfa, uint8_t *index_ext, bool *is_mcast) { + struct cxip_coll_mc *mc; struct cxip_ep_obj *ep_obj; struct cxip_av_set *av_set_obj; struct cxip_addr dest_caddr; @@ -543,15 +546,21 @@ static int _gen_tx_dfa(struct cxip_coll_reduction *reduction, int idx_ext; int ret; - ep_obj = reduction->mc_obj->ep_obj; - av_set_obj = reduction->mc_obj->av_set_obj; + /* cxi_build_mcast_dfa() found in: + 
cassini-headers/src/csrdef/cassini_user_defs.h + cassini-headers/include/cxi_prov_hw.h + */ + mc = reduction->mc_obj; + ep_obj = mc->ep_obj; + av_set_obj = mc->av_set_obj; /* Send address */ switch (av_set_obj->comm_key.keytype) { + case COMM_KEY_NONE: case COMM_KEY_MULTICAST: /* - destination == multicast ID * - idx_ext == 0 - * - dfa == multicast destination + * - dfa == multicast address * - index_ext == 0 */ if (is_netsim(ep_obj)) { @@ -559,16 +568,18 @@ static int _gen_tx_dfa(struct cxip_coll_reduction *reduction, return -FI_EINVAL; } idx_ext = 0; - cxi_build_mcast_dfa(av_set_obj->comm_key.mcast.mcast_addr, - reduction->red_id, idx_ext, - dfa, index_ext); + cxi_build_mcast_dfa(mc->mcast_addr, // mcast_id + reduction->red_id, // red_id + idx_ext, // idx_ext + dfa, // return dfa + index_ext); // return idx_ext *is_mcast = true; break; case COMM_KEY_UNICAST: /* - destination == remote node in av_set_obj * - idx_ext == CXIP_PTL_IDX_COLL * - dfa = remote nic - * - index_ext == CXIP_PTL_IDX_COLL + * - index_ext == idx_ext */ if (av_set_idx >= av_set_obj->fi_addr_cnt) { CXIP_WARN("av_set_idx out-of-range\n"); @@ -578,9 +589,14 @@ static int _gen_tx_dfa(struct cxip_coll_reduction *reduction, ret = cxip_av_lookup_addr(ep_obj->av, dest_addr, &dest_caddr); if (ret != FI_SUCCESS) return ret; + idx_ext = CXIP_PTL_IDX_COLL; pid_bits = ep_obj->domain->iface->dev->info.pid_bits; - cxi_build_dfa(dest_caddr.nic, dest_caddr.pid, pid_bits, - CXIP_PTL_IDX_COLL, dfa, index_ext); + cxi_build_dfa(dest_caddr.nic, // dest NIC + dest_caddr.pid, // dest PID + pid_bits, // pid width + idx_ext, // idx_ext + dfa, // return dfa + index_ext); // return idx_ext *is_mcast = false; break; case COMM_KEY_RANK: @@ -596,8 +612,12 @@ static int _gen_tx_dfa(struct cxip_coll_reduction *reduction, dest_caddr = ep_obj->src_addr; pid_bits = ep_obj->domain->iface->dev->info.pid_bits; idx_ext = CXIP_PTL_IDX_COLL + av_set_idx; - cxi_build_dfa(dest_caddr.nic, dest_caddr.pid, pid_bits, - idx_ext, dfa, index_ext); + cxi_build_dfa(dest_caddr.nic, // dest NIC + dest_caddr.pid, // dest PID + pid_bits, // pid width + idx_ext, // idx_ext + dfa, // return dfa + index_ext); // return idx_ext *is_mcast = false; break; default: @@ -668,8 +688,8 @@ int cxip_coll_send(struct cxip_coll_reduction *reduction, cmd.full_dma.request_len = buflen; /* this uses cached values, returns -FI_EAGAIN if queue full */ - ret = cxip_txq_cp_set(cmdq, ep_obj->auth_key.vni, - mc_obj->tc, mc_obj->tc_type); + ret = cxip_cmdq_cp_set(cmdq, ep_obj->auth_key.vni, + mc_obj->tc, mc_obj->tc_type); if (ret) goto err; @@ -686,8 +706,8 @@ int cxip_coll_send(struct cxip_coll_reduction *reduction, ep_obj->src_addr.pid, ep_obj->src_addr.nic); /* this uses cached values, returns -FI_EAGAIN if queue full */ - ret = cxip_txq_cp_set(cmdq, ep_obj->auth_key.vni, - mc_obj->tc, mc_obj->tc_type); + ret = cxip_cmdq_cp_set(cmdq, ep_obj->auth_key.vni, + mc_obj->tc, mc_obj->tc_type); if (ret) goto err; @@ -1637,7 +1657,7 @@ bool is_hw_root(struct cxip_coll_mc *mc_obj) /* Simulated unicast send of multiple packets as root node to leaf nodes */ static inline -ssize_t _send_pkt_as_root(struct cxip_coll_reduction *reduction, bool retry) +ssize_t _send_pkt_as_root(struct cxip_coll_reduction *reduction) { int i, ret, err; @@ -1661,7 +1681,7 @@ ssize_t _send_pkt_as_root(struct cxip_coll_reduction *reduction, bool retry) /* Simulated unicast send of single packet as leaf node to root node */ static inline -ssize_t _send_pkt_as_leaf(struct cxip_coll_reduction *reduction, bool retry) +ssize_t 
_send_pkt_as_leaf(struct cxip_coll_reduction *reduction) { int ret; @@ -1674,27 +1694,29 @@ ssize_t _send_pkt_as_leaf(struct cxip_coll_reduction *reduction, bool retry) /* Multicast send of single packet from root or leaf node */ static inline -ssize_t _send_pkt_mc(struct cxip_coll_reduction *reduction, bool retry) +ssize_t _send_pkt_mc(struct cxip_coll_reduction *reduction) { - return cxip_coll_send(reduction, 0, - reduction->tx_msg, - sizeof(struct red_pkt), - reduction->mc_obj->reduction_md); + int ret; + + ret = cxip_coll_send(reduction, 0, reduction->tx_msg, + sizeof(struct red_pkt), + reduction->mc_obj->reduction_md); + TRACE_DEBUG("mcast: send=%d ret=%d\n", 1, ret); + return ret; } /* Send packet from root or leaf node as appropriate */ static inline -ssize_t _send_pkt(struct cxip_coll_reduction *reduction, bool retry) +ssize_t _send_pkt(struct cxip_coll_reduction *reduction) { int ret; - if (reduction->mc_obj->av_set_obj->comm_key.keytype == - COMM_KEY_MULTICAST) { - ret = _send_pkt_mc(reduction, retry); + if (reduction->mc_obj->is_multicast) { + ret = _send_pkt_mc(reduction); } else if (is_hw_root(reduction->mc_obj)) { - ret = _send_pkt_as_root(reduction, retry); + ret = _send_pkt_as_root(reduction); } else { - ret = _send_pkt_as_leaf(reduction, retry); + ret = _send_pkt_as_leaf(reduction); } return ret; } @@ -1711,11 +1733,11 @@ int cxip_coll_send_red_pkt(struct cxip_coll_reduction *reduction, memset(&pkt->hdr, 0, sizeof(pkt->hdr)); pkt->hdr.arm = arm; + pkt->hdr.retry = retry; pkt->hdr.seqno = reduction->seqno; pkt->hdr.resno = reduction->resno; pkt->hdr.cookie.mcast_id = reduction->mc_obj->mcast_addr; pkt->hdr.cookie.red_id = reduction->red_id; - pkt->hdr.cookie.retry = retry; pkt->hdr.cookie.magic = MAGIC; if (coll_data) { @@ -1741,7 +1763,7 @@ int cxip_coll_send_red_pkt(struct cxip_coll_reduction *reduction, /* -FI_EAGAIN means HW queue is full, should self-clear */ do { - ret = _send_pkt(reduction, retry); + ret = _send_pkt(reduction); } while (ret == -FI_EAGAIN); /* any other error is a serious config/hardware issue */ if (ret) @@ -1897,6 +1919,8 @@ bool _is_red_timed_out(struct cxip_coll_reduction *reduction) { struct timespec tsnow; + if (reduction->mc_obj->retry_disable) + return false; if (_is_red_first_time(reduction)) { TRACE_DEBUG("=== root first time, retry\n"); return true; @@ -2020,7 +2044,7 @@ static void _progress_leaf(struct cxip_coll_reduction *reduction, _tsset(reduction); reduction->seqno = pkt->hdr.seqno; reduction->resno = pkt->hdr.seqno; - if (pkt->hdr.cookie.retry) + if (pkt->hdr.retry) reduction->pktsent = false; } @@ -2424,7 +2448,6 @@ struct cxip_join_state { int mynode_idx; // index within the fi_addr[] list int mynode_fiaddr; // fi_addr of this node int simrank; // simulated rank of NIC - int pid_idx; // pid_idx used by ptl_te int prov_errno; // collective provider error int sched_state; // scheduled operation int join_idx; // unique join index for diagnostics @@ -2493,7 +2516,8 @@ static void _close_pte(struct cxip_coll_pte *coll_pte) } /* pid_idx == CXIP_PTL_IDX_COLL+rank for NETSIM - * pid_idx == CXIP_PTL_IDX_COLL for all other cases + * pid_idx == CXIP_PTL_IDX_COLL for UNICAST + * pid_idx == multicast for MULTICAST */ static int _acquire_pte(struct cxip_ep_obj *ep_obj, int pid_idx, bool is_mcast, struct cxip_coll_pte **coll_pte_ret) @@ -2596,24 +2620,42 @@ static struct fi_ops mc_ops = { * Utility routine to set up the collective framework in response to calls to * fi_join_collective(). 
* - * If jstate->is_rank is true, this is a NETSIM model, which opens a PTE for - * each call to fi_join_collective() that is bound to the multicast object - * created by that call. This allows simulated multicast traffic through the - * NETSIM loopback port by using different pte_idx values for each PTE to - * disambiguate traffic intended for different simulated hardware endpoints. - * This model does not support multiple MC objects at an endpoint: there is - * exactly one MC address. Progressing the single endpoint will progress all - * of the simulated MC objects. Extending this model to support multiple MC - * objects is not a priority at this time. - * - * If jstate->is_rank is false, this is a multinode model. The first call to - * fi_join_collective() creates a single PTE which is bound to the EP, and - * creates the first multicast object for that endpoint. Every subsequent - * join will create an additional multicast object that shares the PTE for - * that endpoint. Multiple NICs on the node are represented by separate EP - * objects, which are functionally distinct: all endpoints must be progressed - * independently, and if any endpoint is not progressed, it will stall the - * collective. + * This currently supports three different collectives transport models. + * + * If jstate->is_rank is true, this is a NETSIM model. This is an early + * testing model, and is retained for regression testing of the code for code + * merge. The model requires a PTE for each simulated endpoint in the tree, + * since the endpoint can only send to itself: there is a single domain (and + * simulated NIC) under NETSIM. The pid_index is used to simulate multiple + * "multicast" target endpoints. Setup creates multiple PTEs, one for each + * simulated endpoint, each using a different pid_index. The NETSIM tests run + * in isolated test processes, so pid_index values should not conflict with + * other traffic. + * + * If jstate->is_rank is false, and jstate->is_mcast is also false, this is the + * UNICAST model. This is a test model developed to parallelize development + * during a period of time when fabric multicast was unavailable, to allow a + * full multi-node simulation of collectives, and may be deprecated as multicast + * capability matures. This model requires only a single PTE per domain (NIC). + * Sends are serialized through each endpoint, but receives can race and become + * disordered as they pass through the fabric, as will occur in production. The + * pid_index is set to the reserved value of CXIP_PTL_IDX_COLL, which should not + * be used by any other traffic on a given NIC, allowing this model to be used + * concurrently with other traffic. + * + * If jstate->is_rank is false and jstate->is_mcast is true, this is the + * production MULTICAST model. This supports multiple multicast trees, and + * requires a PTE for each tree, since the pid_index is used to encode the + * multicast address. + * + * Normal PTE setup populates the address portion of the PTE from the domains + * that have been defined, each domain representing a NIC, and the pid_index + * sets only the lower pid_width bits of the PTE address to differentiate + * different traffic streams. However, when a PTE is created with is_mcast=true, + * the driver code sets the entire PTE address. This calling code must encode + * the multicast address by bit-shifting it out of the pid_width range. The + * lower bits are arbitrary, since this PTE cannot receive any other traffic, + * and are set to zero. 
* * Caller must hold ep_obj->lock. */ @@ -2625,6 +2667,8 @@ static int _initialize_mc(void *ptr) struct cxip_coll_mc *mc_obj; struct cxip_coll_pte *coll_pte; struct cxip_cmdq *cmdq; + union cxi_pte_map_offset pid_mcast; + int pid_idx; int red_id; int ret; @@ -2634,24 +2678,32 @@ static int _initialize_mc(void *ptr) if (!mc_obj) return -FI_ENOMEM; - /* COMM_KEY_RANK model needs a distinct PTE for every MC object. - * All other models share a single PTE for all MCs using an EP. - */ - coll_pte = ep_obj->coll.coll_pte; - if (!coll_pte) { - TRACE_DEBUG("acqiring PTE\n"); - ret = _acquire_pte(ep_obj, jstate->pid_idx, jstate->is_mcast, - &coll_pte); - if (ret) { - TRACE_DEBUG("acquiring PTE failed %d\n", ret); - free(mc_obj); - return ret; - } - if (!jstate->is_rank) { - TRACE_DEBUG("assigned PTE to ep_obj\n"); - ep_obj->coll.coll_pte = coll_pte; - } - /* else leave ep_obj->coll.coll_pte == NULL */ + TRACE_DEBUG("acquiring PTE\n"); + if (jstate->is_rank) { + // NETSIM + // pid_idx = simulated collective rank + pid_idx = CXIP_PTL_IDX_COLL + jstate->simrank; + ret = _acquire_pte(ep_obj, pid_idx, false, &coll_pte); + // suppress attempt to set multiple times in idm + coll_pte->mc_obj = mc_obj; + } else if (!jstate->is_mcast) { + // UNICAST + // pid_idx = simulated collective tree + pid_idx = CXIP_PTL_IDX_COLL; + ret = _acquire_pte(ep_obj, pid_idx, false, &coll_pte); + } else { + // MULTICAST + // pid_idx = bit-shifted multicast address + memset(&pid_mcast, 0, sizeof(pid_mcast)); + pid_mcast.mcast_id = jstate->bcast_data.mcast_addr; + pid_mcast.mcast_pte_index = 0; + pid_idx = *((int *)&pid_mcast); + ret = _acquire_pte(ep_obj, pid_idx, true, &coll_pte); + } + if (ret) { + TRACE_DEBUG("acquiring PTE failed %d\n", ret); + free(mc_obj); + return ret; } /* copy coll_pte to mc_obj */ mc_obj->coll_pte = coll_pte; @@ -2667,6 +2719,17 @@ static int _initialize_mc(void *ptr) av_set_obj->mc_obj = mc_obj; mc_obj->av_set_obj = av_set_obj; + /* define whether this is multicast */ + switch (av_set_obj->comm_key.keytype) { + case COMM_KEY_NONE: + case COMM_KEY_MULTICAST: + mc_obj->is_multicast = true; + break; + default: + mc_obj->is_multicast = false; + break; + } + /* initialize remainder of mc_obj */ mc_obj->mc_fid.fid.fclass = FI_CLASS_MC; mc_obj->mc_fid.fid.context = mc_obj; @@ -2714,7 +2777,6 @@ static int _initialize_mc(void *ptr) } /* define the traffic class */ - // TODO revisit for LOW_LATENCY if (is_netsim(ep_obj)) { /* NETSIM RANK model */ mc_obj->tc = CXI_TC_BEST_EFFORT; @@ -2725,7 +2787,7 @@ static int _initialize_mc(void *ptr) mc_obj->tc_type = CXI_TC_TYPE_DEFAULT; } else if (is_hw_root(mc_obj)) { /* MULTICAST model, hw_root */ - mc_obj->tc = CXI_TC_BEST_EFFORT; + mc_obj->tc = CXI_TC_LOW_LATENCY; mc_obj->tc_type = CXI_TC_TYPE_DEFAULT; } else { /* MULTICAST model, leaves */ @@ -2734,8 +2796,8 @@ static int _initialize_mc(void *ptr) } /* Set this now to instantiate cmdq CP */ cmdq = ep_obj->coll.tx_cmdq; - ret = cxip_txq_cp_set(cmdq, ep_obj->auth_key.vni, - mc_obj->tc, mc_obj->tc_type); + ret = cxip_cmdq_cp_set(cmdq, ep_obj->auth_key.vni, + mc_obj->tc, mc_obj->tc_type); if (ret) { TRACE_JOIN("%s: cxip_txq_cp_set() = %d\n", __func__, ret); goto fail; @@ -2794,18 +2856,18 @@ static void _cxip_create_mcast_cb(struct cxip_curl_handle *handle) struct cxip_addr caddr; const char *hwrootstr; int mcaddr, hwroot; - uint32_t b2, b1, b0, n; + uint32_t octet[6], n; int i, ret; /* Creation process is done */ - TRACE_JOIN("CURL COMPLETED!\n"); + TRACE_CURL("CURL COMPLETED!\n"); jstate->finished_mcast = true; switch 
(handle->status) { case 200: case 201: /* CURL succeeded, parse response */ - TRACE_JOIN("CURL PARSE RESPONSE:\n%s\n", handle->response); + TRACE_CURL("CURL PARSE RESPONSE:\n%s\n", handle->response); if (!(json_obj = json_tokener_parse(handle->response))) break; if (cxip_json_int("mcastID", json_obj, &mcaddr)) @@ -2813,12 +2875,19 @@ static void _cxip_create_mcast_cb(struct cxip_curl_handle *handle) if (cxip_json_string("hwRoot", json_obj, &hwrootstr)) break; - n = sscanf(hwrootstr, "%x:%x:%x", &b2, &b1, &b0); - if (n < 3 || b2 > 0xf || b1 > 0xff || b2 > 0xff) + memset(octet, 0, sizeof(octet)); + hwroot = 0; + n = sscanf(hwrootstr, "%x:%x:%x:%x:%x:%x", + &octet[5], &octet[4], &octet[3], + &octet[2], &octet[1], &octet[0]); + if (n < 3) { + TRACE_CURL("bad hwroot address = %s\n", hwrootstr); break; - hwroot = (b2 << 16) + (b1 << 8) + b0; + } + for (i = 0; i < n; i++) + hwroot |= octet[i] << (8*i); - TRACE_JOIN("mcastID=%d hwRoot='%s'=%x\n", mcaddr, hwrootstr, + TRACE_CURL("mcastID=%d hwRoot='%s'=%x\n", mcaddr, hwrootstr, hwroot); for (i = 0; i < jstate->av_set_obj->fi_addr_cnt; i++) { ret = cxip_av_lookup_addr( @@ -2831,9 +2900,9 @@ static void _cxip_create_mcast_cb(struct cxip_curl_handle *handle) if (hwroot == caddr.nic) break; } - TRACE_JOIN("final index=%d\n", i); + TRACE_CURL("final index=%d\n", i); if (i >= jstate->av_set_obj->fi_addr_cnt) { - TRACE_JOIN("multicast HWroot not found in av_set\n"); + TRACE_CURL("multicast HWroot not found in av_set\n"); jstate->prov_errno = CXIP_PROV_ERRNO_HWROOT_INVALID; break; } @@ -2843,21 +2912,21 @@ static void _cxip_create_mcast_cb(struct cxip_curl_handle *handle) jstate->bcast_data.mcast_addr = (uint32_t)mcaddr; jstate->is_mcast = true; /* This succeeded */ - TRACE_JOIN("curl: mcaddr =%08x\n", + TRACE_CURL("curl: mcaddr =%08x\n", jstate->bcast_data.mcast_addr); - TRACE_JOIN("curl: hwrootidx=%d\n", + TRACE_CURL("curl: hwrootidx=%d\n", jstate->bcast_data.hwroot_idx); break; default: - TRACE_JOIN("ERRMSK SET CURL error %ld!\n", handle->status); + TRACE_CURL("ERRMSK SET CURL error %ld!\n", handle->status); if (handle->response) - TRACE_JOIN("ERROR RESPONSE:\n%s\n", handle->response); + TRACE_CURL("ERROR RESPONSE:\n%s\n", handle->response); // TODO finer error differentiation from CURL errors jstate->prov_errno = CXIP_PROV_ERRNO_CURL; break; } free(curl_usrptr); - TRACE_JOIN("CURL COMPLETED!\n"); + TRACE_CURL("CURL COMPLETED!\n"); jstate->finished_mcast = true; } @@ -2868,8 +2937,6 @@ static void _start_curl(void *ptr) { struct cxip_curl_mcast_usrptr *curl_usrptr; struct cxip_join_state *jstate = ptr; - static const char *json_fmt = - "{'macs':[%s],'jobID':'%s','jobStepID':'%s','timeout':%ld}"; struct cxip_addr caddr; char *jsonreq, *mac, *url, *p; int i, ret; @@ -2881,14 +2948,14 @@ static void _start_curl(void *ptr) url = NULL; /* acquire the environment variables needed */ - TRACE_JOIN("jobid = %s\n", cxip_env.coll_job_id); - TRACE_JOIN("stepid = %s\n", cxip_env.coll_job_step_id); - TRACE_JOIN("fmurl = %s\n", cxip_env.coll_fabric_mgr_url); - TRACE_JOIN("token = %s\n", cxip_env.coll_mcast_token); - TRACE_JOIN("maxadrs = %ld\n", cxip_env.hwcoll_addrs_per_job); - TRACE_JOIN("minnodes= %ld\n", cxip_env.hwcoll_min_nodes); - TRACE_JOIN("retry = %ld\n", cxip_env.coll_retry_usec); - TRACE_JOIN("tmout = %ld\n", cxip_env.coll_timeout_usec); + TRACE_CURL("jobid = %s\n", cxip_env.coll_job_id); + TRACE_CURL("stepid = %s\n", cxip_env.coll_job_step_id); + TRACE_CURL("fmurl = %s\n", cxip_env.coll_fabric_mgr_url); + TRACE_CURL("token = %s\n", 
cxip_env.coll_mcast_token); + TRACE_CURL("maxadrs = %ld\n", cxip_env.hwcoll_addrs_per_job); + TRACE_CURL("minnodes= %ld\n", cxip_env.hwcoll_min_nodes); + TRACE_CURL("retry = %ld\n", cxip_env.coll_retry_usec); + TRACE_CURL("tmout = %ld\n", cxip_env.coll_timeout_usec); /* Generic error for any preliminary failures */ jstate->prov_errno = CXIP_PROV_ERRNO_CURL; @@ -2900,8 +2967,7 @@ static void _start_curl(void *ptr) goto quit; } - ret = asprintf(&url, "%s/fabric/collectives/multicast", - cxip_env.coll_fabric_mgr_url); + ret = asprintf(&url, "%s", cxip_env.coll_fabric_mgr_url); if (ret < 0) { TRACE_JOIN("Failed to construct CURL address\n"); ret = -FI_ENOMEM; @@ -2933,10 +2999,14 @@ static void _start_curl(void *ptr) *(--p) = 0; /* generate the CURL JSON request */ - ret = asprintf(&jsonreq, json_fmt, mac, + ret = asprintf(&jsonreq, + "{'macs':[%s],'vni': %d,'timeout':%ld,'jobID':'%s'," + "'jobStepID':'%s'}", + mac, + jstate->ep_obj->auth_key.vni, + cxip_env.coll_timeout_usec, cxip_env.coll_job_id, - cxip_env.coll_job_step_id, - cxip_env.coll_timeout_usec); + cxip_env.coll_job_step_id); if (ret < 0) { TRACE_JOIN("Creating JSON request = %d\n", ret); ret = -FI_ENOMEM; @@ -3020,6 +3090,9 @@ static void _start_curl(void *ptr) * If the return code is anything else, the join operation fails. */ +/* suppress repeated BUSY log messages */ +static long suppress_busy_log; + /* append a jstate to the zbcoll scheduler */ static void _append_sched(struct cxip_zbcoll_obj *zb, void *usrptr) { @@ -3070,23 +3143,29 @@ static void _start_bcast(void *ptr) struct cxip_join_state *jstate = ptr; struct cxip_zbcoll_obj *zb = jstate->zb; - TRACE_JOIN("%s: entry\n", __func__); + if (!suppress_busy_log) + TRACE_JOIN("%s: entry\n", __func__); /* error will indicate that the multicast request fails */ jstate->prov_errno = C_RC_INVALID_DFA_FORMAT; /* rank 0 always does the work here */ if (jstate->mynode_idx == 0) { + if (!suppress_busy_log) + TRACE_JOIN("%s: rank 0\n", __func__); if (jstate->create_mcast) { /* first call (only) initiates CURL request */ if (!jstate->creating_mcast) { + TRACE_JOIN("%s create mcast\n", __func__); jstate->creating_mcast = true; _start_curl(jstate); } /* every retry call checks to see if CURL is complete */ if (!jstate->finished_mcast) { zb->error = -FI_EAGAIN; + suppress_busy_log++; goto quit; } + suppress_busy_log = 0; /* bcast_data.valid is set by curl callback */ } else { /* static bcast data is presumed correct */ @@ -3294,10 +3373,6 @@ static void _progress_sched(struct cxip_join_state *jstate) struct cxip_zbcoll_obj *zb = jstate->zb; enum state_code *codes; - TRACE_JOIN("entry jstate[%d,%d]=%s, error=%d\n", - jstate->join_idx, jstate->mynode_idx, - state_name[jstate->sched_state], zb->error); - /* acquire the success/again/fail state codes for current state */ codes = progress_state[jstate->sched_state]; switch (zb->error) { @@ -3305,12 +3380,15 @@ static void _progress_sched(struct cxip_join_state *jstate) /* last operation succeeded */ TRACE_JOIN("%s: success\n", __func__); jstate->sched_state = codes[0]; + suppress_busy_log = 0; break; case -FI_EBUSY: case -FI_EAGAIN: /* last operation needs a retry */ - TRACE_JOIN("%s: busy retry\n", __func__); + if (!suppress_busy_log) + TRACE_JOIN("%s: busy retry\n", __func__); jstate->sched_state = codes[1]; + suppress_busy_log++; break; default: /* last operation failed */ @@ -3318,9 +3396,10 @@ static void _progress_sched(struct cxip_join_state *jstate) jstate->sched_state = codes[2]; break; } - TRACE_JOIN("----> jstate[%d,%d]=%s\n", - 
jstate->join_idx, jstate->mynode_idx, - state_name[jstate->sched_state]); + if (!suppress_busy_log) + TRACE_JOIN("----> jstate[%d,%d]=%s\n", + jstate->join_idx, jstate->mynode_idx, + state_name[jstate->sched_state]); /* execute the new state function */ state_func[jstate->sched_state](jstate); @@ -3504,7 +3583,6 @@ int cxip_join_collective(struct fid_ep *ep, fi_addr_t coll_addr, jstate->mynode_fiaddr = av_set_obj->fi_addr_ary[jstate->mynode_idx]; jstate->simrank = ZB_NOSIM; - jstate->pid_idx = CXIP_PTL_IDX_COLL; jstate->bcast_data.hwroot_idx = 0; jstate->bcast_data.mcast_addr = 0; jstate->bcast_data.valid = false; @@ -3526,7 +3604,6 @@ int cxip_join_collective(struct fid_ep *ep, fi_addr_t coll_addr, jstate->mynode_fiaddr = av_set_obj->fi_addr_ary[jstate->mynode_idx]; jstate->simrank = ZB_NOSIM; - jstate->pid_idx = CXIP_PTL_IDX_COLL; jstate->bcast_data.hwroot_idx = av_set_obj->comm_key.mcast.hwroot_idx; jstate->bcast_data.mcast_addr = @@ -3550,7 +3627,6 @@ int cxip_join_collective(struct fid_ep *ep, fi_addr_t coll_addr, jstate->mynode_fiaddr = av_set_obj->fi_addr_ary[jstate->mynode_idx]; jstate->simrank = ZB_NOSIM; - jstate->pid_idx = CXIP_PTL_IDX_COLL; jstate->bcast_data.hwroot_idx = av_set_obj->comm_key.ucast.hwroot_idx; jstate->bcast_data.mcast_addr = @@ -3568,7 +3644,6 @@ int cxip_join_collective(struct fid_ep *ep, fi_addr_t coll_addr, jstate->mynode_idx = av_set_obj->comm_key.rank.rank; jstate->mynode_fiaddr = (fi_addr_t)jstate->mynode_idx; jstate->simrank = jstate->mynode_idx; - jstate->pid_idx = CXIP_PTL_IDX_COLL + jstate->simrank; jstate->bcast_data.hwroot_idx = 0; jstate->bcast_data.mcast_addr = ep_obj->src_addr.nic; jstate->bcast_data.valid = true; @@ -3764,8 +3839,8 @@ int cxip_coll_enable(struct cxip_ep *ep) } /* A read-only or write-only endpoint is legal */ - if (!(ofi_recv_allowed(ep_obj->rxc.attr.caps) && - ofi_send_allowed(ep_obj->txc.attr.caps))) { + if (!(ofi_recv_allowed(ep_obj->rxc->attr.caps) && + ofi_send_allowed(ep_obj->txc->attr.caps))) { CXIP_INFO("EP not recv/send, collectives not enabled\n"); return FI_SUCCESS; } @@ -3781,17 +3856,18 @@ int cxip_coll_enable(struct cxip_ep *ep) return -FI_EINVAL; /* Bind all STD EP objects to the coll object */ - ep_obj->coll.rx_cmdq = ep_obj->rxc.rx_cmdq; - ep_obj->coll.tx_cmdq = ep_obj->txc.tx_cmdq; - ep_obj->coll.rx_cntr = ep_obj->rxc.recv_cntr; - ep_obj->coll.tx_cntr = ep_obj->txc.send_cntr; - ep_obj->coll.rx_evtq = &ep_obj->rxc.rx_evtq; - ep_obj->coll.tx_evtq = &ep_obj->txc.tx_evtq; + ep_obj->coll.rx_cmdq = ep_obj->rxc->rx_cmdq; + ep_obj->coll.tx_cmdq = ep_obj->txc->tx_cmdq; + ep_obj->coll.rx_cntr = ep_obj->rxc->recv_cntr; + ep_obj->coll.tx_cntr = ep_obj->txc->send_cntr; + ep_obj->coll.rx_evtq = &ep_obj->rxc->rx_evtq; + ep_obj->coll.tx_evtq = &ep_obj->txc->tx_evtq; ep_obj->coll.eq = ep_obj->eq; ep->ep.collective = &cxip_collective_ops; ep_obj->coll.enabled = true; + cxip_coll_trace_init(); return FI_SUCCESS; } @@ -3801,6 +3877,7 @@ int cxip_coll_disable(struct cxip_ep_obj *ep_obj) if (!ep_obj->coll.enabled) return FI_SUCCESS; + cxip_coll_trace_close(); ep_obj->coll.enabled = false; ep_obj->coll.rx_cmdq = NULL; ep_obj->coll.tx_cmdq = NULL; diff --git a/prov/cxi/src/cxip_coll_trace.c b/prov/cxi/src/cxip_coll_trace.c new file mode 100644 index 00000000000..276fa83498e --- /dev/null +++ b/prov/cxi/src/cxip_coll_trace.c @@ -0,0 +1,220 @@ +/* + * Copyright (c) 2021-2024 Hewlett Packard Enterprise Development LP + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + */ + +/** + * @brief TRACE function for producing runtime 
debugging logs + * + * This creates log files on each running node of a multinode job, which + * contain trace information produced on that node. SLURM nodes inherit + * stdout/stderr from the launch node. + * + * ENABLE_DEBUG is the libfabric logging flag that enables debug message + * logging, and is required to use TRACE calls, which produce more output than + * DEBUG, and are specific to collectives. If ENABLE_DEBUG is unset, or + * evaluates to false at compile time, CXIP_COLL_TRACE is a syntactically robust + * NOOP which results in no code being generated, ensuring that these trace + * calls do not affect performance in production. ENABLE_DEBUG is normally true + * for development builds, and false for production builds. + * + * This module is initialized on the first call to cxip_ep_enable(), which + * initializes its ep.coll structure using cxip_coll_enable(). Multiple + * endpoints can be enabled, and each associated ep.coll object will be + * initialized, but this module will only be initialized once, and is not + * dependent on the ep or the ep.coll object. This module preserves a count + * of initializations, and is closed when the last ep is closed. + * + * Unlike normal libfabric output, the CXIP_COLL_TRACE() output opens a trace + * file on the node it is running on, with a unique name that identifies the + * node so that a shared file system can be used among the nodes. Since the + * collectives code is running in parallel on multiple nodes, mixing the output + * through a single file descriptor (like stdout) scrambles the traces, and + * makes it difficult to follow the flow. Using separate output files keeps the + * sequences intact without needing to post-process the data. + * + * In addition, this allows character buffered output, which generally runs + * faster, but can interleave lines in ways that makes post-processing + * impossible. The normal solution is to line-buffer the output, but this + * slows operations considerably and can mask timing-related problems. + * + * The output-selection flags are of the form CXIP_TRC_COLL_*, and are specified + * in the CXIP_COLL_TRACE() macros to differentiate the kind of output desired. + * These are activated by defining environment variables of the same name. + * + * To use this feature, create a .bashrc file in the runtime directory of each + * node in the job (or use a shared FS for all nodes in the job), containing one + * or more of the CXIP_TRC_* environment variables. + * + * Note that the environment settings for each node in a multi-node job can be + * set differently to capture different traces. + * + * Example: + * export CXIP_TRC_PATHNAME = "/mypath/myfile" + * export CXIP_TRC_LINEBUF=1 + * export CXIP_TRC_COLL_JOIN=1 + * export CXIP_TRC_COLL_PKT=1 + * + * All instances of the following in the code will result in output: + * CXIP_COLL_TRACE(CXIP_TRC_COLL_JOIN, fmt, ...); + * CXIP_COLL_TRACE(CXIP_TRC_COLL_PKT, fmt, ...); + * All instances of other CXIP_TRC_* values will be silent. + * + * Environment variables used in setup: + * CXIP_TRC_PATHNAME defines the output path name, which defaults to "trace". + * CXIP_TRC_LINEBUF sets or clears line buffering out output, and defaults to 0. + * CXIP_TRC_APPEND sets or clears open append mode, and defaults to 0. + * + * CXIP_TRC_APPEND is needed for NETSIM tests under Criterion, since each + * test is run in a separate process and closes all files at completion of + * each test. If CXIP_TRC_APPEND is not set, you will see only the tracie of + * the last test run. 
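+ *
+ * For illustration only (the real macro is defined in the provider headers
+ * and may differ), the compile-time gating described above can be shaped as:
+ *
+ *   #if ENABLE_DEBUG
+ *   #define CXIP_COLL_TRACE(mask, fmt, ...) \
+ *           do { \
+ *                   if (cxip_coll_trace_mask & (mask)) \
+ *                           cxip_coll_trace(fmt, ##__VA_ARGS__); \
+ *           } while (0)
+ *   #else
+ *   #define CXIP_COLL_TRACE(mask, fmt, ...) do {} while (0)
+ *   #endif
+ *
+ * so tracing compiles away entirely in production builds, while a debug build
+ * still filters at runtime on the CXIP_TRC_* mask bits set below.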
+ * + * Specifying PMI_RANK as a rank value will apply a prefix to the trace lines + * that identifies the rank of the trace. Note that this is normally exported + * by the SLURM environment, or the multinode test framework. + * + * Specifying PMI_SIZE will expand the prefix to show the number of ranks. + * Note that this is normally exported by the SLURM environment, or the + * multinode test framework. + * + * cxip_coll_trace_flush() forces all output be flushed AND written to disk, but + * leaves the file open for more writing. + * + * cxip_coll_trace_close() flushes all output and closes the file. + * + * cxip_coll_trace() is used to generate trace messages, and is normally called + * through the CXIP_COLL_TRACE() macro. + */ +#include "config.h" + +#include +#include +#include +#include +#include + +#include "cxip.h" + +bool cxip_coll_trace_initialized; +bool cxip_coll_trace_muted; +bool cxip_coll_trace_append; +bool cxip_coll_trace_linebuf; +int cxip_coll_trace_rank; +int cxip_coll_trace_numranks; +char *cxip_coll_trace_pathname; +FILE *cxip_coll_trace_fid; +uint64_t cxip_coll_trace_mask; + +/* Get environment variable as string representation of int */ +static int getenv_int(const char *name) +{ + char *env; + int value; + + value = -1; + env = getenv(name); + if (env) + sscanf(env, "%d", &value); + return value; +} + +void cxip_coll_trace_init(void) +{ + const char *fpath; + int ret; + + /* can only initialize once */ + if (cxip_coll_trace_initialized) + return; + + cxip_coll_trace_mask = 0L; + cxip_coll_trace_fid = NULL; + cxip_coll_trace_pathname = NULL; + + cxip_coll_trace_append = !!getenv("CXIP_TRC_APPEND"); + cxip_coll_trace_linebuf = !!getenv("CXIP_TRC_LINEBUF"); + cxip_coll_trace_rank = getenv_int("PMI_RANK"); + cxip_coll_trace_numranks = getenv_int("PMI_SIZE"); + fpath = getenv("CXIP_TRC_PATHNAME"); + + /* set bits in cxip_coll_trace_mask */ + if (getenv("CXIP_TRC_CTRL")) + cxip_coll_trace_set(CXIP_TRC_CTRL); + if (getenv("CXIP_TRC_ZBCOLL")) + cxip_coll_trace_set(CXIP_TRC_ZBCOLL); + if (getenv("CXIP_TRC_COLL_CURL")) + cxip_coll_trace_set(CXIP_TRC_COLL_CURL); + if (getenv("CXIP_TRC_COLL_PKT")) + cxip_coll_trace_set(CXIP_TRC_COLL_PKT); + if (getenv("CXIP_TRC_COLL_JOIN")) + cxip_coll_trace_set(CXIP_TRC_COLL_JOIN); + if (getenv("CXIP_TRC_COLL_DEBUG")) + cxip_coll_trace_set(CXIP_TRC_COLL_DEBUG); + if (getenv("CXIP_TRC_TEST_CODE")) + cxip_coll_trace_set(CXIP_TRC_TEST_CODE); + + /* if no trace masks set, do nothing */ + if (!cxip_coll_trace_mask) + return; + + if (!fpath || !*fpath) + fpath = "trace"; + ret = asprintf(&cxip_coll_trace_pathname, "%s%d", fpath, cxip_coll_trace_rank); + if (ret <= 0) { + fprintf(stderr, "asprintf() failed = %s\n", strerror(ret)); + return; + } + cxip_coll_trace_fid = + fopen(cxip_coll_trace_pathname, cxip_coll_trace_append ? 
"a" : "w"); + if (!cxip_coll_trace_fid) { + fprintf(stderr, "open('%s') failed: %s\n", cxip_coll_trace_pathname, + strerror(errno)); + free(cxip_coll_trace_pathname); + cxip_coll_trace_pathname = NULL; + return; + } + if (cxip_coll_trace_linebuf && cxip_coll_trace_fid) + setlinebuf(cxip_coll_trace_fid); + + cxip_coll_trace_initialized = true; +} + +void cxip_coll_trace_flush(void) +{ + if (cxip_coll_trace_fid) { + fflush(cxip_coll_trace_fid); + fsync(fileno(cxip_coll_trace_fid)); + } +} + +void cxip_coll_trace_close(void) +{ + if (cxip_coll_trace_fid) { + cxip_coll_trace_flush(); + fclose(cxip_coll_trace_fid); + cxip_coll_trace_fid = NULL; + if (cxip_coll_trace_pathname) { + free(cxip_coll_trace_pathname); + cxip_coll_trace_pathname = NULL; + } + } + cxip_coll_trace_initialized = false; +} + +int cxip_coll_trace_attr cxip_coll_trace(const char *fmt, ...) +{ + va_list args; + char *str; + int len; + + va_start(args, fmt); + len = vasprintf(&str, fmt, args); + va_end(args); + if (len >= 0) { + len = fprintf(cxip_coll_trace_fid, "[%2d|%2d] %s", cxip_coll_trace_rank, + cxip_coll_trace_numranks, str); + free(str); + } + return len; +} diff --git a/prov/cxi/src/cxip_ctrl.c b/prov/cxi/src/cxip_ctrl.c index 3d484dcdccd..b60858742b7 100644 --- a/prov/cxi/src/cxip_ctrl.c +++ b/prov/cxi/src/cxip_ctrl.c @@ -29,7 +29,15 @@ int cxip_ctrl_msg_cb(struct cxip_ctrl_req *req, const union c_event *event) .raw = event->tgt_long.match_bits, }; uint32_t init = event->tgt_long.initiator.initiator.process; - int ret __attribute__((unused)); + int ret; + + /* Check to see if event processing is implemented or overridden + * int the protocol. A return of -FI_ENOSYS indicated the event + * was not consumed. + */ + ret = req->ep_obj->rxc->ops.ctrl_msg_cb(req, event); + if (ret != -FI_ENOSYS) + goto done; switch (event->hdr.event_type) { case C_EVENT_MATCH: @@ -40,28 +48,15 @@ int cxip_ctrl_msg_cb(struct cxip_ctrl_req *req, const union c_event *event) nic_addr = CXI_MATCH_ID_EP(pid_bits, init); pid = CXI_MATCH_ID_PID(pid_bits, init); - switch (mb.ctrl_msg_type) { - case CXIP_CTRL_MSG_FC_NOTIFY: - ret = cxip_fc_process_drops(req->ep_obj, nic_addr, pid, - mb.drops); - assert(ret == FI_SUCCESS); - - break; - case CXIP_CTRL_MSG_FC_RESUME: - ret = cxip_fc_resume(req->ep_obj, nic_addr, pid); - assert(ret == FI_SUCCESS); - - break; - case CXIP_CTRL_MSG_ZB_DATA: + /* Messages not handled by the protocol */ + if (mb.ctrl_msg_type == CXIP_CTRL_MSG_ZB_DATA) { ret = cxip_zbcoll_recv_cb(req->ep_obj, nic_addr, pid, mb.raw); assert(ret == FI_SUCCESS); - break; - default: + } else { CXIP_FATAL("Unexpected msg type: %d\n", mb.ctrl_msg_type); } - break; default: CXIP_FATAL(CXIP_UNEXPECTED_EVENT, @@ -69,6 +64,7 @@ int cxip_ctrl_msg_cb(struct cxip_ctrl_req *req, const union c_event *event) cxi_rc_to_str(cxi_event_rc(event))); } +done: CXIP_DBG("got event: %s rc: %s (req: %p)\n", cxi_event_to_str(event), cxi_rc_to_str(cxi_event_rc(event)), @@ -88,58 +84,67 @@ int cxip_ctrl_msg_send(struct cxip_ctrl_req *req) union c_fab_addr dfa; uint8_t idx_ext; uint32_t pid_bits; - union c_cmdu cmd = {}; + struct c_cstate_cmd c_state = {}; + struct c_idc_msg_hdr idc_msg = {}; uint32_t match_id; int ret; - txq = req->ep_obj->ctrl_txq; + if (!req->ep_obj->ctrl.tx_credits) { + CXIP_WARN("Control TX credits exhausted\n"); + return -FI_EAGAIN; + } + + req->ep_obj->ctrl.tx_credits--; + + txq = req->ep_obj->ctrl.txq; pid_bits = req->ep_obj->domain->iface->dev->info.pid_bits; cxi_build_dfa(req->send.nic_addr, req->send.pid, pid_bits, CXIP_PTL_IDX_CTRL, &dfa, 
&idx_ext); match_id = CXI_MATCH_ID(pid_bits, req->ep_obj->src_addr.pid, req->ep_obj->src_addr.nic); - cmd.c_state.event_send_disable = 1; - cmd.c_state.index_ext = idx_ext; - cmd.c_state.eq = req->ep_obj->ctrl_tx_evtq->eqn; - cmd.c_state.initiator = match_id; + c_state.event_send_disable = 1; + c_state.index_ext = idx_ext; + c_state.eq = req->ep_obj->ctrl.tx_evtq->eqn; + c_state.initiator = match_id; - if (!req->ep_obj->ctrl_tx_credits) { - CXIP_WARN("Control TX credits exhausted\n"); - return -FI_EAGAIN; - } - - req->ep_obj->ctrl_tx_credits--; + idc_msg.dfa = dfa; + idc_msg.match_bits = req->send.mb.raw; + idc_msg.user_ptr = (uint64_t)req; - ret = cxip_cmdq_emit_c_state(txq, &cmd.c_state); - if (ret) { - CXIP_DBG("Failed to issue C_STATE command: %d\n", ret); - goto err_return_credit; - } - - memset(&cmd.idc_msg, 0, sizeof(cmd.idc_msg)); - cmd.idc_msg.dfa = dfa; - cmd.idc_msg.match_bits = req->send.mb.raw; - cmd.idc_msg.user_ptr = (uint64_t)req; + if (req->ep_obj->av_auth_key) { + ret = cxip_domain_emit_idc_msg(req->ep_obj->domain, + req->send.vni, + CXI_TC_BEST_EFFORT, &c_state, + &idc_msg, NULL, 0, 0); + if (ret) { + CXIP_DBG("Failed to write domain IDC: %d\n", ret); + goto err_return_credit; + } + } else { + ret = cxip_cmdq_cp_set(txq, req->send.vni, CXI_TC_BEST_EFFORT, + CXI_TC_TYPE_DEFAULT); + if (ret) { + CXIP_DBG("Failed to set cp: %d\n", ret); + goto err_return_credit; + } - ret = cxi_cq_emit_idc_msg(txq->dev_cmdq, &cmd.idc_msg, NULL, 0); - if (ret) { - CXIP_DBG("Failed to write IDC: %d\n", ret); + ret = cxip_cmdq_emit_idc_msg(txq, &c_state, &idc_msg, NULL, 0, + 0); + if (ret) { + CXIP_DBG("Failed to write IDC: %d\n", ret); + goto err_return_credit; + } - /* Return error according to Domain Resource Management - */ - ret = -FI_EAGAIN; - goto err_return_credit; + cxi_cq_ring(txq->dev_cmdq); } - cxi_cq_ring(txq->dev_cmdq); - CXIP_DBG("Queued control message: %p\n", req); return FI_SUCCESS; err_return_credit: - req->ep_obj->ctrl_tx_credits++; + req->ep_obj->ctrl.tx_credits++; return ret; } @@ -161,40 +166,40 @@ int cxip_ctrl_msg_init(struct cxip_ep_obj *ep_obj) .raw = ~0, }; - ret = cxip_domain_ctrl_id_alloc(ep_obj->domain, &ep_obj->ctrl_msg_req); + ret = cxip_domain_ctrl_id_alloc(ep_obj->domain, &ep_obj->ctrl.msg_req); if (ret) { CXIP_WARN("Failed to allocate MR buffer ID: %d\n", ret); return -FI_ENOSPC; } - ep_obj->ctrl_msg_req.ep_obj = ep_obj; - ep_obj->ctrl_msg_req.cb = cxip_ctrl_msg_cb; + ep_obj->ctrl.msg_req.ep_obj = ep_obj; + ep_obj->ctrl.msg_req.cb = cxip_ctrl_msg_cb; le_flags = C_LE_UNRESTRICTED_BODY_RO | C_LE_UNRESTRICTED_END_RO | C_LE_OP_PUT; ib.ctrl_le_type = 0; - ret = cxip_pte_append(ep_obj->ctrl_pte, 0, 0, 0, - C_PTL_LIST_PRIORITY, ep_obj->ctrl_msg_req.req_id, + ret = cxip_pte_append(ep_obj->ctrl.pte, 0, 0, 0, + C_PTL_LIST_PRIORITY, ep_obj->ctrl.msg_req.req_id, mb.raw, ib.raw, CXI_MATCH_ID_ANY, 0, le_flags, - NULL, ep_obj->ctrl_tgq, true); + NULL, ep_obj->ctrl.tgq, true); if (ret) { CXIP_DBG("Failed to write Append command: %d\n", ret); goto err_free_id; } /* Wait for link EQ event */ - while (!(event = cxi_eq_get_event(ep_obj->ctrl_tgt_evtq))) + while (!(event = cxi_eq_get_event(ep_obj->ctrl.tgt_evtq))) sched_yield(); if (event->hdr.event_type != C_EVENT_LINK || - event->tgt_long.buffer_id != ep_obj->ctrl_msg_req.req_id) { + event->tgt_long.buffer_id != ep_obj->ctrl.msg_req.req_id) { /* This is a device malfunction */ CXIP_WARN("Invalid Link EQE %u %u %u %u\n", event->hdr.event_type, event->tgt_long.return_code, event->tgt_long.buffer_id, - 
ep_obj->ctrl_msg_req.req_id); + ep_obj->ctrl.msg_req.req_id); ret = -FI_EIO; goto err_free_id; } @@ -206,14 +211,14 @@ int cxip_ctrl_msg_init(struct cxip_ep_obj *ep_obj) goto err_free_id; } - cxi_eq_ack_events(ep_obj->ctrl_tgt_evtq); + cxi_eq_ack_events(ep_obj->ctrl.tgt_evtq); CXIP_DBG("Control messaging initialized: %p\n", ep_obj); return FI_SUCCESS; err_free_id: - cxip_domain_ctrl_id_free(ep_obj->domain, &ep_obj->ctrl_msg_req); + cxip_domain_ctrl_id_free(ep_obj->domain, &ep_obj->ctrl.msg_req); return ret; } @@ -225,7 +230,7 @@ int cxip_ctrl_msg_init(struct cxip_ep_obj *ep_obj) */ void cxip_ctrl_msg_fini(struct cxip_ep_obj *ep_obj) { - cxip_domain_ctrl_id_free(ep_obj->domain, &ep_obj->ctrl_msg_req); + cxip_domain_ctrl_id_free(ep_obj->domain, &ep_obj->ctrl.msg_req); CXIP_DBG("Control messaging finalized: %p\n", ep_obj); } @@ -313,7 +318,7 @@ static struct cxip_ctrl_req *cxip_ep_ctrl_event_req(struct cxip_ep_obj *ep_obj, static void cxip_ep_return_ctrl_tx_credits(struct cxip_ep_obj *ep_obj, unsigned int credits) { - ep_obj->ctrl_tx_credits += credits; + ep_obj->ctrl.tx_credits += credits; } void cxip_ep_ctrl_eq_progress(struct cxip_ep_obj *ep_obj, @@ -358,12 +363,12 @@ void cxip_ep_ctrl_eq_progress(struct cxip_ep_obj *ep_obj, void cxip_ep_tx_ctrl_progress(struct cxip_ep_obj *ep_obj) { - cxip_ep_ctrl_eq_progress(ep_obj, ep_obj->ctrl_tx_evtq, true, false); + cxip_ep_ctrl_eq_progress(ep_obj, ep_obj->ctrl.tx_evtq, true, false); } void cxip_ep_tx_ctrl_progress_locked(struct cxip_ep_obj *ep_obj) { - cxip_ep_ctrl_eq_progress(ep_obj, ep_obj->ctrl_tx_evtq, true, true); + cxip_ep_ctrl_eq_progress(ep_obj, ep_obj->ctrl.tx_evtq, true, true); } /* @@ -371,7 +376,7 @@ void cxip_ep_tx_ctrl_progress_locked(struct cxip_ep_obj *ep_obj) */ void cxip_ep_ctrl_progress(struct cxip_ep_obj *ep_obj) { - cxip_ep_ctrl_eq_progress(ep_obj, ep_obj->ctrl_tgt_evtq, false, false); + cxip_ep_ctrl_eq_progress(ep_obj, ep_obj->ctrl.tgt_evtq, false, false); cxip_ep_tx_ctrl_progress(ep_obj); } @@ -380,7 +385,7 @@ void cxip_ep_ctrl_progress(struct cxip_ep_obj *ep_obj) */ void cxip_ep_ctrl_progress_locked(struct cxip_ep_obj *ep_obj) { - cxip_ep_ctrl_eq_progress(ep_obj, ep_obj->ctrl_tgt_evtq, false, true); + cxip_ep_ctrl_eq_progress(ep_obj, ep_obj->ctrl.tgt_evtq, false, true); cxip_ep_tx_ctrl_progress_locked(ep_obj); } @@ -389,7 +394,7 @@ void cxip_ep_ctrl_progress_locked(struct cxip_ep_obj *ep_obj) */ void cxip_ep_tgt_ctrl_progress(struct cxip_ep_obj *ep_obj) { - cxip_ep_ctrl_eq_progress(ep_obj, ep_obj->ctrl_tgt_evtq, false, false); + cxip_ep_ctrl_eq_progress(ep_obj, ep_obj->ctrl.tgt_evtq, false, false); } /* @@ -398,7 +403,7 @@ void cxip_ep_tgt_ctrl_progress(struct cxip_ep_obj *ep_obj) */ void cxip_ep_tgt_ctrl_progress_locked(struct cxip_ep_obj *ep_obj) { - cxip_ep_ctrl_eq_progress(ep_obj, ep_obj->ctrl_tgt_evtq, false, true); + cxip_ep_ctrl_eq_progress(ep_obj, ep_obj->ctrl.tgt_evtq, false, true); } /* @@ -408,20 +413,20 @@ int cxip_ep_ctrl_trywait(void *arg) { struct cxip_ep_obj *ep_obj = (struct cxip_ep_obj *)arg; - if (!ep_obj->ctrl_wait) { + if (!ep_obj->ctrl.wait) { CXIP_WARN("No CXI ep_obj wait object\n"); return -FI_EINVAL; } - if (cxi_eq_peek_event(ep_obj->ctrl_tgt_evtq) || - cxi_eq_peek_event(ep_obj->ctrl_tx_evtq)) + if (cxi_eq_peek_event(ep_obj->ctrl.tgt_evtq) || + cxi_eq_peek_event(ep_obj->ctrl.tx_evtq)) return -FI_EAGAIN; ofi_genlock_lock(&ep_obj->lock); - cxil_clear_wait_obj(ep_obj->ctrl_wait); + cxil_clear_wait_obj(ep_obj->ctrl.wait); - if (cxi_eq_peek_event(ep_obj->ctrl_tgt_evtq) || - 
cxi_eq_peek_event(ep_obj->ctrl_tx_evtq)) { + if (cxi_eq_peek_event(ep_obj->ctrl.tgt_evtq) || + cxi_eq_peek_event(ep_obj->ctrl.tx_evtq)) { ofi_genlock_unlock(&ep_obj->lock); return -FI_EAGAIN; @@ -477,9 +482,9 @@ static int cxip_ep_ctrl_eq_alloc(struct cxip_ep_obj *ep_obj, size_t len, eq_attr.queue = *eq_buf; eq_attr.queue_len = len; - /* ep_obj->ctrl_wait will be NULL if not required */ + /* ep_obj->ctrl.wait will be NULL if not required */ ret = cxil_alloc_evtq(ep_obj->domain->lni->lni, *eq_md, &eq_attr, - ep_obj->ctrl_wait, NULL, eq); + ep_obj->ctrl.wait, NULL, eq); if (ret) goto err_free_eq_md; @@ -500,10 +505,10 @@ static int cxip_ep_ctrl_eq_alloc(struct cxip_ep_obj *ep_obj, size_t len, */ static bool cxip_ctrl_wait_required(struct cxip_ep_obj *ep_obj) { - if (ep_obj->rxc.recv_cq && ep_obj->rxc.recv_cq->priv_wait) + if (ep_obj->rxc->recv_cq && ep_obj->rxc->recv_cq->priv_wait) return true; - if (ep_obj->txc.send_cq && ep_obj->txc.send_cq->priv_wait) + if (ep_obj->txc->send_cq && ep_obj->txc->send_cq->priv_wait) return true; return false; @@ -516,18 +521,19 @@ void cxip_ep_ctrl_del_wait(struct cxip_ep_obj *ep_obj) { int wait_fd; - wait_fd = cxil_get_wait_obj_fd(ep_obj->ctrl_wait); + wait_fd = cxil_get_wait_obj_fd(ep_obj->ctrl.wait); - if (ep_obj->txc.send_cq) { - ofi_wait_del_fd(ep_obj->txc.send_cq->util_cq.wait, wait_fd); + if (ep_obj->txc->send_cq) { + ofi_wait_del_fd(ep_obj->txc->send_cq->util_cq.wait, wait_fd); CXIP_DBG("Deleted control HW EQ FD: %d from CQ: %p\n", - wait_fd, ep_obj->txc.send_cq); + wait_fd, ep_obj->txc->send_cq); } - if (ep_obj->rxc.recv_cq && ep_obj->rxc.recv_cq != ep_obj->txc.send_cq) { - ofi_wait_del_fd(ep_obj->rxc.recv_cq->util_cq.wait, wait_fd); + if (ep_obj->rxc->recv_cq && + ep_obj->rxc->recv_cq != ep_obj->txc->send_cq) { + ofi_wait_del_fd(ep_obj->rxc->recv_cq->util_cq.wait, wait_fd); CXIP_DBG("Deleted control HW EQ FD: %d from CQ %p\n", - wait_fd, ep_obj->rxc.recv_cq); + wait_fd, ep_obj->rxc->recv_cq); } } @@ -541,13 +547,13 @@ int cxip_ep_ctrl_add_wait(struct cxip_ep_obj *ep_obj) int ret; ret = cxil_alloc_wait_obj(ep_obj->domain->lni->lni, - &ep_obj->ctrl_wait); + &ep_obj->ctrl.wait); if (ret) { CXIP_WARN("Control wait object allocation failed: %d\n", ret); return -FI_ENOMEM; } - wait_fd = cxil_get_wait_obj_fd(ep_obj->ctrl_wait); + wait_fd = cxil_get_wait_obj_fd(ep_obj->ctrl.wait); ret = fi_fd_nonblock(wait_fd); if (ret) { CXIP_WARN("Unable to set control wait non-blocking: %d, %s\n", @@ -555,7 +561,7 @@ int cxip_ep_ctrl_add_wait(struct cxip_ep_obj *ep_obj) goto err; } - cq = ep_obj->txc.send_cq; + cq = ep_obj->txc->send_cq; if (cq) { ret = ofi_wait_add_fd(cq->util_cq.wait, wait_fd, POLLIN, cxip_ep_ctrl_trywait, ep_obj, @@ -567,8 +573,8 @@ int cxip_ep_ctrl_add_wait(struct cxip_ep_obj *ep_obj) } } - if (ep_obj->rxc.recv_cq && ep_obj->rxc.recv_cq != cq) { - cq = ep_obj->rxc.recv_cq; + if (ep_obj->rxc->recv_cq && ep_obj->rxc->recv_cq != cq) { + cq = ep_obj->rxc->recv_cq; ret = ofi_wait_add_fd(cq->util_cq.wait, wait_fd, POLLIN, cxip_ep_ctrl_trywait, ep_obj, @@ -586,11 +592,11 @@ int cxip_ep_ctrl_add_wait(struct cxip_ep_obj *ep_obj) return FI_SUCCESS; err_add_fd: - if (ep_obj->txc.send_cq) - ofi_wait_del_fd(ep_obj->txc.send_cq->util_cq.wait, wait_fd); + if (ep_obj->txc->send_cq) + ofi_wait_del_fd(ep_obj->txc->send_cq->util_cq.wait, wait_fd); err: - cxil_destroy_wait_obj(ep_obj->ctrl_wait); - ep_obj->ctrl_wait = NULL; + cxil_destroy_wait_obj(ep_obj->ctrl.wait); + ep_obj->ctrl.wait = NULL; return ret; } @@ -624,7 +630,7 @@ int cxip_ep_ctrl_init(struct cxip_ep_obj 
*ep_obj) */ if (cxip_ctrl_wait_required(ep_obj)) { ret = cxil_alloc_wait_obj(ep_obj->domain->lni->lni, - &ep_obj->ctrl_wait); + &ep_obj->ctrl.wait); if (ret) { CXIP_WARN("EP ctrl wait object alloc failed: %d\n", ret); @@ -633,18 +639,18 @@ int cxip_ep_ctrl_init(struct cxip_ep_obj *ep_obj) } ret = cxip_ep_ctrl_eq_alloc(ep_obj, 4 * s_page_size, - &ep_obj->ctrl_tx_evtq_buf, - &ep_obj->ctrl_tx_evtq_buf_md, - &ep_obj->ctrl_tx_evtq); + &ep_obj->ctrl.tx_evtq_buf, + &ep_obj->ctrl.tx_evtq_buf_md, + &ep_obj->ctrl.tx_evtq); if (ret != FI_SUCCESS) { CXIP_WARN("Failed to allocate TX EQ resources, ret: %d\n", ret); goto err; } ret = cxip_ep_ctrl_eq_alloc(ep_obj, rx_eq_size, - &ep_obj->ctrl_tgt_evtq_buf, - &ep_obj->ctrl_tgt_evtq_buf_md, - &ep_obj->ctrl_tgt_evtq); + &ep_obj->ctrl.tgt_evtq_buf, + &ep_obj->ctrl.tgt_evtq_buf_md, + &ep_obj->ctrl.tgt_evtq); if (ret != FI_SUCCESS) { CXIP_WARN("Failed to allocate TGT EQ resources, ret: %d\n", ret); @@ -652,7 +658,7 @@ int cxip_ep_ctrl_init(struct cxip_ep_obj *ep_obj) } ret = cxip_ep_cmdq(ep_obj, true, ep_obj->domain->tclass, - ep_obj->ctrl_tx_evtq, &ep_obj->ctrl_txq); + ep_obj->ctrl.tx_evtq, &ep_obj->ctrl.txq); if (ret != FI_SUCCESS) { CXIP_WARN("Failed to allocate control TXQ, ret: %d\n", ret); ret = -FI_EDOMAIN; @@ -660,34 +666,34 @@ int cxip_ep_ctrl_init(struct cxip_ep_obj *ep_obj) } ret = cxip_ep_cmdq(ep_obj, false, ep_obj->domain->tclass, - ep_obj->ctrl_tgt_evtq, &ep_obj->ctrl_tgq); + ep_obj->ctrl.tgt_evtq, &ep_obj->ctrl.tgq); if (ret != FI_SUCCESS) { CXIP_WARN("Failed to allocate control TGQ, ret: %d\n", ret); ret = -FI_EDOMAIN; goto free_txq; } - ret = cxip_pte_alloc_nomap(ep_obj->ptable, ep_obj->ctrl_tgt_evtq, - &pt_opts, NULL, NULL, &ep_obj->ctrl_pte); + ret = cxip_pte_alloc_nomap(ep_obj->ptable, ep_obj->ctrl.tgt_evtq, + &pt_opts, NULL, NULL, &ep_obj->ctrl.pte); if (ret != FI_SUCCESS) { CXIP_WARN("Failed to allocate control PTE: %d\n", ret); goto free_tgq; } /* CXIP_PTL_IDX_WRITE_MR_STD is shared with CXIP_PTL_IDX_CTRL. */ - ret = cxip_pte_map(ep_obj->ctrl_pte, CXIP_PTL_IDX_WRITE_MR_STD, false); + ret = cxip_pte_map(ep_obj->ctrl.pte, CXIP_PTL_IDX_WRITE_MR_STD, false); if (ret != FI_SUCCESS) { CXIP_WARN("Failed to map write PTE: %d\n", ret); goto free_pte; } - ret = cxip_pte_map(ep_obj->ctrl_pte, CXIP_PTL_IDX_READ_MR_STD, false); + ret = cxip_pte_map(ep_obj->ctrl.pte, CXIP_PTL_IDX_READ_MR_STD, false); if (ret != FI_SUCCESS) { CXIP_WARN("Failed to map read PTE: %d\n", ret); goto free_pte; } - ret = cxip_pte_set_state(ep_obj->ctrl_pte, ep_obj->ctrl_tgq, + ret = cxip_pte_set_state(ep_obj->ctrl.pte, ep_obj->ctrl.tgq, C_PTLTE_ENABLED, 0); if (ret) { /* This is a bug, we have exclusive access to this CMDQ. 
*/ @@ -696,7 +702,7 @@ int cxip_ep_ctrl_init(struct cxip_ep_obj *ep_obj) } /* Wait for Enable event */ - while (!(event = cxi_eq_get_event(ep_obj->ctrl_tgt_evtq))) + while (!(event = cxi_eq_get_event(ep_obj->ctrl.tgt_evtq))) sched_yield(); switch (event->hdr.event_type) { @@ -704,7 +710,7 @@ int cxip_ep_ctrl_init(struct cxip_ep_obj *ep_obj) if (event->tgt_long.return_code != C_RC_OK || event->tgt_long.initiator.state_change.ptlte_state != C_PTLTE_ENABLED || - event->tgt_long.ptlte_index != ep_obj->ctrl_pte->pte->ptn) + event->tgt_long.ptlte_index != ep_obj->ctrl.pte->pte->ptn) CXIP_FATAL("Invalid PtlTE enable event\n"); break; case C_EVENT_COMMAND_FAILURE: @@ -718,7 +724,7 @@ int cxip_ep_ctrl_init(struct cxip_ep_obj *ep_obj) CXIP_FATAL("Invalid event type: %d\n", event->hdr.event_type); } - cxi_eq_ack_events(ep_obj->ctrl_tgt_evtq); + cxi_eq_ack_events(ep_obj->ctrl.tgt_evtq); ret = cxip_ctrl_msg_init(ep_obj); if (ret != FI_SUCCESS) @@ -730,30 +736,30 @@ int cxip_ep_ctrl_init(struct cxip_ep_obj *ep_obj) * 3. One slot for EQ overrun detection. * 4. TODO: Determine why an additional slot needs to be reserved. */ - ep_obj->ctrl_tx_credits = - ep_obj->ctrl_tx_evtq->byte_size / C_EE_CFG_ECB_SIZE - 4; + ep_obj->ctrl.tx_credits = + ep_obj->ctrl.tx_evtq->byte_size / C_EE_CFG_ECB_SIZE - 4; CXIP_DBG("EP control initialized: %p\n", ep_obj); return FI_SUCCESS; free_pte: - cxip_pte_free(ep_obj->ctrl_pte); + cxip_pte_free(ep_obj->ctrl.pte); free_tgq: cxip_ep_cmdq_put(ep_obj, false); free_txq: cxip_ep_cmdq_put(ep_obj, true); free_tgt_evtq: - cxip_eq_ctrl_eq_free(ep_obj->ctrl_tgt_evtq_buf, - ep_obj->ctrl_tgt_evtq_buf_md, - ep_obj->ctrl_tgt_evtq); + cxip_eq_ctrl_eq_free(ep_obj->ctrl.tgt_evtq_buf, + ep_obj->ctrl.tgt_evtq_buf_md, + ep_obj->ctrl.tgt_evtq); free_tx_evtq: - cxip_eq_ctrl_eq_free(ep_obj->ctrl_tx_evtq_buf, - ep_obj->ctrl_tx_evtq_buf_md, ep_obj->ctrl_tx_evtq); + cxip_eq_ctrl_eq_free(ep_obj->ctrl.tx_evtq_buf, + ep_obj->ctrl.tx_evtq_buf_md, ep_obj->ctrl.tx_evtq); err: - if (ep_obj->ctrl_wait) { - cxil_destroy_wait_obj(ep_obj->ctrl_wait); - ep_obj->ctrl_wait = NULL; + if (ep_obj->ctrl.wait) { + cxil_destroy_wait_obj(ep_obj->ctrl.wait); + ep_obj->ctrl.wait = NULL; } return ret; @@ -768,19 +774,19 @@ void cxip_ep_ctrl_fini(struct cxip_ep_obj *ep_obj) { cxip_ctrl_mr_cache_flush(ep_obj); cxip_ctrl_msg_fini(ep_obj); - cxip_pte_free(ep_obj->ctrl_pte); + cxip_pte_free(ep_obj->ctrl.pte); cxip_ep_cmdq_put(ep_obj, false); cxip_ep_cmdq_put(ep_obj, true); - cxip_eq_ctrl_eq_free(ep_obj->ctrl_tgt_evtq_buf, - ep_obj->ctrl_tgt_evtq_buf_md, - ep_obj->ctrl_tgt_evtq); - cxip_eq_ctrl_eq_free(ep_obj->ctrl_tx_evtq_buf, - ep_obj->ctrl_tx_evtq_buf_md, ep_obj->ctrl_tx_evtq); + cxip_eq_ctrl_eq_free(ep_obj->ctrl.tgt_evtq_buf, + ep_obj->ctrl.tgt_evtq_buf_md, + ep_obj->ctrl.tgt_evtq); + cxip_eq_ctrl_eq_free(ep_obj->ctrl.tx_evtq_buf, + ep_obj->ctrl.tx_evtq_buf_md, ep_obj->ctrl.tx_evtq); - if (ep_obj->ctrl_wait) { - cxil_destroy_wait_obj(ep_obj->ctrl_wait); - ep_obj->ctrl_wait = NULL; + if (ep_obj->ctrl.wait) { + cxil_destroy_wait_obj(ep_obj->ctrl.wait); + ep_obj->ctrl.wait = NULL; CXIP_DBG("Deleted control EQ wait object\n"); } diff --git a/prov/cxi/src/cxip_curl.c b/prov/cxi/src/cxip_curl.c index 225512dcaa8..97fcfcfac10 100644 --- a/prov/cxi/src/cxip_curl.c +++ b/prov/cxi/src/cxip_curl.c @@ -15,7 +15,8 @@ #include "cxip.h" -#define TRACE(fmt, ...) CXIP_TRACE(CXIP_TRC_CURL, fmt, ##__VA_ARGS__) +#define TRACE_CURL(fmt, ...) CXIP_COLL_TRACE(CXIP_TRC_COLL_CURL, fmt, \ + ##__VA_ARGS__) #define CXIP_DBG(...) 
_CXIP_DBG(FI_LOG_FABRIC, __VA_ARGS__) #define CXIP_WARN(...) _CXIP_WARN(FI_LOG_FABRIC, __VA_ARGS__) @@ -229,17 +230,17 @@ int cxip_curl_perform(const char *endpoint, const char *request, struct cxip_curl_handle *handle; struct curl_slist *headers; char *token; + char *verify_peer_str; + int verify_peer; CURLMcode mres; CURL *curl; int running; int ret; - TRACE("%s: usrptr=%p\n", __func__, usrptr); ret = -FI_ENOMEM; handle = calloc(1, sizeof(*handle)); if (!handle) goto fail; - TRACE("%s: handle=%p\n", __func__, handle); /* libcurl is fussy about NULL requests */ handle->endpoint = strdup(endpoint); @@ -255,8 +256,6 @@ int cxip_curl_perform(const char *endpoint, const char *request, /* add user completion function and pointer */ handle->usrfunc = usrfunc; handle->usrptr = usrptr; - TRACE("%s: handle->usrfnc=%p\n", __func__, handle->usrfunc); - TRACE("%s: handle->usrptr=%p\n", __func__, handle->usrptr); ret = -FI_EACCES; curl = curl_easy_init(); @@ -273,9 +272,10 @@ int cxip_curl_perform(const char *endpoint, const char *request, headers = curl_slist_append(headers, "charset: utf-8"); token = NULL; if (sessionToken) { - ret = asprintf(&token, "x-xenon-auth-token: %s", sessionToken); + ret = asprintf(&token, "Authorization: Bearer %s", + sessionToken); if (ret < 0) { - CXIP_WARN("x-xenon-auth-token create failed\n"); + CXIP_WARN("token string create failed\n"); goto fail; } headers = curl_slist_append(headers, token); @@ -291,6 +291,7 @@ int cxip_curl_perform(const char *endpoint, const char *request, curl_easy_setopt(curl, CURLOPT_POSTFIELDSIZE, strlen(handle->request)); } + curl_easy_setopt(curl, CURLOPT_STDERR, stderr); curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers); curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_callback); curl_easy_setopt(curl, CURLOPT_WRITEDATA, handle->recv); @@ -298,6 +299,13 @@ int cxip_curl_perform(const char *endpoint, const char *request, curl_easy_setopt(curl, CURLOPT_VERBOSE, (long)verbose); curl_easy_setopt(curl, CURLOPT_CUSTOMREQUEST, cxip_curl_opname(op)); + verify_peer_str = getenv("CURLOPT_SSL_VERIFYPEER"); + if (verify_peer_str) + verify_peer = atoi(verify_peer_str); + else + verify_peer = 0; + curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, verify_peer); + curl_multi_add_handle(cxip_curlm, curl); mres = curl_multi_perform(cxip_curlm, &running); if (mres != CURLM_OK) { @@ -367,7 +375,6 @@ int cxip_curl_progress(struct cxip_curl_handle **handleptr) long status; struct curl_buffer *recv; - /* This needs to be quick if nothing is pending */ if (!cxip_curl_count) return -FI_ENODATA; @@ -388,27 +395,35 @@ int cxip_curl_progress(struct cxip_curl_handle **handleptr) return (running) ? 
-FI_EAGAIN : -FI_ENODATA; } + if (msg->data.result >= CURL_LAST) { + CXIP_WARN("CURL unknown result %d\n", msg->data.result); + } + else if (msg->data.result > CURLE_OK) { + CXIP_WARN("CURL error '%s'\n", + curl_easy_strerror(msg->data.result)); + } /* retrieve our handle from the private pointer */ res = curl_easy_getinfo(msg->easy_handle, CURLINFO_PRIVATE, (char **)&handle); if (res != CURLE_OK) { + TRACE_CURL("curl_easy_getinfo(%s) failed: %s\n", + "CURLINFO_PRIVATE", curl_easy_strerror(res)); CXIP_WARN("curl_easy_getinfo(%s) failed: %s\n", - "CURLINFO_PRIVATE", - curl_easy_strerror(res)); + "CURLINFO_PRIVATE", curl_easy_strerror(res)); return -FI_EOTHER; } /* handle is now valid, must eventually be freed */ - TRACE("%s: handle=%p\n", __func__, handle); - /* retrieve the status code, should not fail */ res = curl_easy_getinfo(msg->easy_handle, CURLINFO_RESPONSE_CODE, &status); if (res != CURLE_OK) { + TRACE_CURL("curl_easy_getinfo(%s) failed: %s\n", + "CURLINFO_RESPONSE_CODE", curl_easy_strerror(res)); CXIP_WARN("curl_easy_getinfo(%s) failed: %s\n", - "CURLINFO_RESPONSE_CODE", - curl_easy_strerror(res)); + "CURLINFO_RESPONSE_CODE", curl_easy_strerror(res)); /* continue, handle->status should show zero */ } + TRACE_CURL("curl_easy_getinfo() success\n"); /* we can recover resources now */ curl_slist_free_all((struct curl_slist *)handle->headers); @@ -422,11 +437,8 @@ int cxip_curl_progress(struct cxip_curl_handle **handleptr) handle->status = status; /* call the user function */ - TRACE("%s: handle->usrfnc=%p\n", __func__, handle->usrfunc); - TRACE("%s: handle->usrptr=%p\n", __func__, handle->usrptr); if (handle->usrfunc) handle->usrfunc(handle); - TRACE("%s: returned from usrfnc\n", __func__); /* return the handle, or free it */ if (handleptr) { diff --git a/prov/cxi/src/cxip_dom.c b/prov/cxi/src/cxip_dom.c index 8aa6831b0c8..3c82da3607f 100644 --- a/prov/cxi/src/cxip_dom.c +++ b/prov/cxi/src/cxip_dom.c @@ -20,6 +20,309 @@ extern struct fi_ops_mr cxip_dom_mr_ops; +static void cxip_domain_cmdq_free(struct cxip_domain *dom) +{ + struct cxip_domain_cmdq *cmdq; + + while ((cmdq = dlist_first_entry_or_null(&dom->cmdq_list, + struct cxip_domain_cmdq, + entry))) { + + cxip_cmdq_free(cmdq->cmdq); + dlist_remove(&cmdq->entry); + dom->cmdq_cnt--; + free(cmdq); + } +} + +static int cxip_domain_cmdq_alloc(struct cxip_domain *dom, + uint16_t vni, + enum cxi_traffic_class tc, + struct cxip_domain_cmdq **dom_cmdq) +{ + struct cxip_domain_cmdq *cmdq; + struct cxi_cq_alloc_opts cq_opts = { + .flags = CXI_CQ_IS_TX, + }; + int ret; + + cmdq = calloc(1, sizeof(*cmdq)); + if (!cmdq) { + CXIP_WARN("Failed to allocate cmdq memory\n"); + return -FI_ENOMEM; + } + + /* Domain managed transmit command queues require being updated on + * empty to be able to safely change communication profile VNI. + */ + cq_opts.policy = CXI_CQ_UPDATE_HIGH_FREQ_EMPTY; + + /* An IDC command can use up to 4x 64 byte slots. */ + cq_opts.count = 4 * dom->tx_size; + + ret = cxip_cmdq_alloc(dom->lni, NULL, &cq_opts, vni, tc, + CXI_TC_TYPE_DEFAULT, &cmdq->cmdq); + if (ret) { + CXIP_WARN("Failed to allocate cmdq: %d\n", ret); + goto err_free_mem; + } + + dlist_insert_head(&cmdq->entry, &dom->cmdq_list); + dom->cmdq_cnt++; + + *dom_cmdq = cmdq; + + return FI_SUCCESS; + +err_free_mem: + free(cmdq); + + return ret; +} + +/* Hardware only allows for 16 different command profiles per RGID. Since each + * domain maps to a single RGID, this means effectively limits the number of + * TX command queue per domain to be 16. 
Since one TX command queue is + * reserved for triggered commands, real number is 15. + */ +#define MAX_DOM_TX_CMDQ 15U + +static int cxip_domain_find_cmdq(struct cxip_domain *dom, + uint16_t vni, + enum cxi_traffic_class tc, + struct cxip_domain_cmdq **dom_cmdq) +{ + struct cxip_domain_cmdq *cmdq; + int ret; + + /* Prefer existing command queues. */ + dlist_foreach_container(&dom->cmdq_list, struct cxip_domain_cmdq, cmdq, + entry) { + if (cxip_cmdq_match(cmdq->cmdq, vni, tc, + CXI_TC_TYPE_DEFAULT)) { + *dom_cmdq = cmdq; + return FI_SUCCESS; + } + } + + /* Prefer reusing an empty command queue instead of allocating a new + * one. + */ + dlist_foreach_container(&dom->cmdq_list, struct cxip_domain_cmdq, cmdq, + entry) { + if (cxip_cmdq_empty(cmdq->cmdq)) { + + /* TODO: This needs to use new direct CP profile feature + * which disables sharing of communication profile + * across TX command queues. + */ + ret = cxip_cmdq_cp_set(cmdq->cmdq, vni, tc, + CXI_TC_TYPE_DEFAULT); + if (ret) { + CXIP_WARN("Failed to change communication profile: %d\n", + ret); + return ret; + } + + *dom_cmdq = cmdq; + return FI_SUCCESS; + } + } + + /* Last resort is allocating a new transmit command queue. If limit has + * been reached, only option is to change communication profile for + * existing TX cmdq. + */ + if (dom->cmdq_cnt == MAX_DOM_TX_CMDQ) { + CXIP_WARN("At domain command queue max\n"); + return -FI_EAGAIN; + } + + ret = cxip_domain_cmdq_alloc(dom, vni, tc, &cmdq); + if (ret) { + CXIP_WARN("Failed to allocate domain command queue: %d\n", ret); + return ret; + } + + *dom_cmdq = cmdq; + + return FI_SUCCESS; +} + +int cxip_domain_emit_idc_put(struct cxip_domain *dom, uint16_t vni, + enum cxi_traffic_class tc, + const struct c_cstate_cmd *c_state, + const struct c_idc_put_cmd *put, const void *buf, + size_t len, uint64_t flags) +{ + int ret; + struct cxip_domain_cmdq *cmdq; + + ofi_genlock_lock(&dom->cmdq_lock); + + ret = cxip_domain_find_cmdq(dom, vni, tc, &cmdq); + if (ret) { + CXIP_WARN("Failed to find command queue: %d\n", ret); + goto out_unlock; + } + + ret = cxip_cmdq_emit_idc_put(cmdq->cmdq, c_state, put, buf, len, flags); + if (ret) { + CXIP_WARN("Failed to emit idc_put: %d\n", ret); + goto out_unlock; + } + + cxi_cq_ring(cmdq->cmdq->dev_cmdq); + + ofi_genlock_unlock(&dom->cmdq_lock); + + return FI_SUCCESS; + +out_unlock: + ofi_genlock_unlock(&dom->cmdq_lock); + + return ret; +} + +int cxip_domain_emit_dma(struct cxip_domain *dom, uint16_t vni, + enum cxi_traffic_class tc, struct c_full_dma_cmd *dma, + uint64_t flags) +{ + int ret; + struct cxip_domain_cmdq *cmdq; + + ofi_genlock_lock(&dom->cmdq_lock); + + ret = cxip_domain_find_cmdq(dom, vni, tc, &cmdq); + if (ret) { + CXIP_WARN("Failed to find command queue: %d\n", ret); + goto out_unlock; + } + + ret = cxip_cmdq_emit_dma(cmdq->cmdq, dma, flags); + if (ret) { + CXIP_WARN("Failed to emit dma: %d\n", ret); + goto out_unlock; + } + + cxi_cq_ring(cmdq->cmdq->dev_cmdq); + + ofi_genlock_unlock(&dom->cmdq_lock); + + return FI_SUCCESS; + +out_unlock: + ofi_genlock_unlock(&dom->cmdq_lock); + + return ret; +} + +int cxip_domain_emit_idc_amo(struct cxip_domain *dom, uint16_t vni, + enum cxi_traffic_class tc, + const struct c_cstate_cmd *c_state, + const struct c_idc_amo_cmd *amo, uint64_t flags, + bool fetching, bool flush) +{ + int ret; + struct cxip_domain_cmdq *cmdq; + + ofi_genlock_lock(&dom->cmdq_lock); + + ret = cxip_domain_find_cmdq(dom, vni, tc, &cmdq); + if (ret) { + CXIP_WARN("Failed to find command queue: %d\n", ret); + goto out_unlock; + } + + ret 
= cxip_cmdq_emic_idc_amo(cmdq->cmdq, c_state, amo, flags, + fetching, flush); + if (ret) { + CXIP_WARN("Failed to emit idc_amo: %d\n", ret); + goto out_unlock; + } + + cxi_cq_ring(cmdq->cmdq->dev_cmdq); + + ofi_genlock_unlock(&dom->cmdq_lock); + + return FI_SUCCESS; + +out_unlock: + ofi_genlock_unlock(&dom->cmdq_lock); + + return ret; +} + +int cxip_domain_emit_dma_amo(struct cxip_domain *dom, uint16_t vni, + enum cxi_traffic_class tc, + struct c_dma_amo_cmd *amo, uint64_t flags, + bool fetching, bool flush) +{ + int ret; + struct cxip_domain_cmdq *cmdq; + + ofi_genlock_lock(&dom->cmdq_lock); + + ret = cxip_domain_find_cmdq(dom, vni, tc, &cmdq); + if (ret) { + CXIP_WARN("Failed to find command queue: %d\n", ret); + goto out_unlock; + } + + ret = cxip_cmdq_emit_dma_amo(cmdq->cmdq, amo, flags, fetching, flush); + if (ret) { + CXIP_WARN("Failed to emit amo: %d\n", ret); + goto out_unlock; + } + + cxi_cq_ring(cmdq->cmdq->dev_cmdq); + + ofi_genlock_unlock(&dom->cmdq_lock); + + return FI_SUCCESS; + +out_unlock: + ofi_genlock_unlock(&dom->cmdq_lock); + + return ret; +} + +int cxip_domain_emit_idc_msg(struct cxip_domain *dom, uint16_t vni, + enum cxi_traffic_class tc, + const struct c_cstate_cmd *c_state, + const struct c_idc_msg_hdr *msg, const void *buf, + size_t len, uint64_t flags) +{ + int ret; + struct cxip_domain_cmdq *cmdq; + + ofi_genlock_lock(&dom->cmdq_lock); + + ret = cxip_domain_find_cmdq(dom, vni, tc, &cmdq); + if (ret) { + CXIP_WARN("Failed to find command queue: %d\n", ret); + goto out_unlock; + } + + ret = cxip_cmdq_emit_idc_msg(cmdq->cmdq, c_state, msg, buf, len, + flags); + if (ret) { + CXIP_WARN("Failed to emit idc msg: %d\n", ret); + goto out_unlock; + } + + cxi_cq_ring(cmdq->cmdq->dev_cmdq); + + ofi_genlock_unlock(&dom->cmdq_lock); + + return FI_SUCCESS; + +out_unlock: + ofi_genlock_unlock(&dom->cmdq_lock); + + return ret; +} + /* * cxip_domain_req_alloc() - Allocate a domain control buffer ID */ @@ -261,13 +564,18 @@ static int cxip_dom_close(struct fid *fid) cxip_telemetry_free(dom->telemetry); } + cxip_domain_cmdq_free(dom); cxip_domain_disable(dom); + assert(dlist_empty(&dom->cmdq_list)); + assert(dom->cmdq_cnt == 0); + ofi_spin_destroy(&dom->lock); ofi_spin_destroy(&dom->ctrl_id_lock); ofi_idx_reset(&dom->req_ids); ofi_idx_reset(&dom->mr_ids); ofi_domain_close(&dom->util_domain); + ofi_genlock_destroy(&dom->cmdq_lock); free(dom); return 0; @@ -300,6 +608,7 @@ static int cxip_dom_dwq_op_send(struct cxip_domain *dom, struct fi_op_msg *msg, uint64_t trig_thresh) { struct cxip_ep *ep = container_of(msg->ep, struct cxip_ep, ep); + struct cxip_txc *txc = ep->ep_obj->txc; const void *buf; size_t len; int ret; @@ -321,10 +630,10 @@ static int cxip_dom_dwq_op_send(struct cxip_domain *dom, struct fi_op_msg *msg, buf = msg->msg.iov_count ? msg->msg.msg_iov[0].iov_base : NULL; len = msg->msg.iov_count ? 
msg->msg.msg_iov[0].iov_len : 0; - ret = cxip_send_common(&ep->ep_obj->txc, ep->tx_attr.tclass, buf, len, - NULL, msg->msg.data, msg->msg.addr, 0, - msg->msg.context, msg->flags, false, true, - trig_thresh, trig_cntr, comp_cntr); + ret = txc->ops.send_common(txc, ep->tx_attr.tclass, buf, len, NULL, + msg->msg.data, msg->msg.addr, 0, + msg->msg.context, msg->flags, false, true, + trig_thresh, trig_cntr, comp_cntr); if (ret) CXIP_DBG("Failed to emit message triggered op, ret=%d\n", ret); else @@ -341,6 +650,7 @@ static int cxip_dom_dwq_op_tsend(struct cxip_domain *dom, uint64_t trig_thresh) { struct cxip_ep *ep = container_of(tagged->ep, struct cxip_ep, ep); + struct cxip_txc *txc = ep->ep_obj->txc; const void *buf; size_t len; int ret; @@ -362,11 +672,11 @@ static int cxip_dom_dwq_op_tsend(struct cxip_domain *dom, buf = tagged->msg.iov_count ? tagged->msg.msg_iov[0].iov_base : NULL; len = tagged->msg.iov_count ? tagged->msg.msg_iov[0].iov_len : 0; - ret = cxip_send_common(&ep->ep_obj->txc, ep->tx_attr.tclass, buf, len, - NULL, tagged->msg.data, tagged->msg.addr, - tagged->msg.tag, tagged->msg.context, - tagged->flags, true, true, trig_thresh, - trig_cntr, comp_cntr); + ret = txc->ops.send_common(txc, ep->tx_attr.tclass, buf, len, NULL, + tagged->msg.data, tagged->msg.addr, + tagged->msg.tag, tagged->msg.context, + tagged->flags, true, true, trig_thresh, + trig_cntr, comp_cntr); if (ret) CXIP_DBG("Failed to emit tagged msg triggered op, ret=%d\n", ret); @@ -399,7 +709,7 @@ static int cxip_dom_dwq_op_rma(struct cxip_domain *dom, struct fi_op_rma *rma, buf = rma->msg.iov_count ? rma->msg.msg_iov[0].iov_base : NULL; len = rma->msg.iov_count ? rma->msg.msg_iov[0].iov_len : 0; - ret = cxip_rma_common(op, &ep->ep_obj->txc, buf, len, NULL, + ret = cxip_rma_common(op, ep->ep_obj->txc, buf, len, NULL, rma->msg.addr, rma->msg.rma_iov[0].addr, rma->msg.rma_iov[0].key, rma->msg.data, rma->flags, ep->tx_attr.tclass, @@ -421,7 +731,7 @@ static int cxip_dom_dwq_op_atomic(struct cxip_domain *dom, uint64_t trig_thresh) { struct cxip_ep *ep = container_of(amo->ep, struct cxip_ep, ep); - struct cxip_txc *txc = &ep->ep_obj->txc; + struct cxip_txc *txc = ep->ep_obj->txc; int ret; if (!amo) @@ -451,7 +761,7 @@ static int cxip_dom_dwq_op_fetch_atomic(struct cxip_domain *dom, uint64_t trig_thresh) { struct cxip_ep *ep = container_of(fetch_amo->ep, struct cxip_ep, ep); - struct cxip_txc *txc = &ep->ep_obj->txc; + struct cxip_txc *txc = ep->ep_obj->txc; int ret; if (!fetch_amo) @@ -484,7 +794,7 @@ static int cxip_dom_dwq_op_comp_atomic(struct cxip_domain *dom, uint64_t trig_thresh) { struct cxip_ep *ep = container_of(comp_amo->ep, struct cxip_ep, ep); - struct cxip_txc *txc = &ep->ep_obj->txc; + struct cxip_txc *txc = ep->ep_obj->txc; int ret; if (!comp_amo) @@ -562,6 +872,7 @@ static int cxip_dom_dwq_op_recv(struct cxip_domain *dom, struct fi_op_msg *msg, uint64_t trig_thresh) { struct cxip_ep *ep = container_of(msg->ep, struct cxip_ep, ep); + struct cxip_rxc *rxc = ep->ep_obj->rxc; void *buf; size_t len; @@ -572,9 +883,9 @@ static int cxip_dom_dwq_op_recv(struct cxip_domain *dom, struct fi_op_msg *msg, buf = msg->msg.iov_count ? msg->msg.msg_iov[0].iov_base : NULL; len = msg->msg.iov_count ? 
msg->msg.msg_iov[0].iov_len : 0; - return cxip_recv_common(&ep->ep_obj->rxc, buf, len, NULL, msg->msg.addr, - 0, 0, msg->msg.context, msg->flags, false, - comp_cntr); + return rxc->ops.recv_common(rxc, buf, len, NULL, msg->msg.addr, 0, 0, + msg->msg.context, msg->flags, false, + comp_cntr); } static int cxip_dom_dwq_op_trecv(struct cxip_domain *dom, @@ -584,6 +895,7 @@ static int cxip_dom_dwq_op_trecv(struct cxip_domain *dom, uint64_t trig_thresh) { struct cxip_ep *ep = container_of(tagged->ep, struct cxip_ep, ep); + struct cxip_rxc *rxc = ep->ep_obj->rxc; void *buf; size_t len; @@ -594,10 +906,10 @@ static int cxip_dom_dwq_op_trecv(struct cxip_domain *dom, buf = tagged->msg.iov_count ? tagged->msg.msg_iov[0].iov_base : NULL; len = tagged->msg.iov_count ? tagged->msg.msg_iov[0].iov_len : 0; - return cxip_recv_common(&ep->ep_obj->rxc, buf, len, tagged->msg.desc, - tagged->msg.addr, tagged->msg.tag, - tagged->msg.ignore, tagged->msg.context, - tagged->flags, true, comp_cntr); + return rxc->ops.recv_common(rxc, buf, len, tagged->msg.desc, + tagged->msg.addr, tagged->msg.tag, + tagged->msg.ignore, tagged->msg.context, + tagged->flags, true, comp_cntr); } /* Must hold domain lock. */ @@ -1485,6 +1797,27 @@ int cxip_domain(struct fid_fabric *fabric, struct fi_info *info, cxi_domain->util_domain.domain_fid.ops = &cxip_dom_ops; cxi_domain->util_domain.domain_fid.mr = &cxip_dom_mr_ops; + dlist_init(&cxi_domain->cmdq_list); + cxi_domain->cmdq_cnt = 0; + + /* Align domain TX command size based on EP TX size attribute. In + * addition, support ENV vars to override size. + */ + cxi_domain->tx_size = 0; + if (info->tx_attr) + cxi_domain->tx_size = info->tx_attr->size; + + if (!info->tx_attr) { + cxi_domain->tx_size = cxip_env.default_tx_size; + cxi_domain->tx_size = + MAX(cxip_env.default_cq_size, cxi_domain->tx_size); + } + + if (cxi_domain->util_domain.threading == FI_THREAD_DOMAIN) + ofi_genlock_init(&cxi_domain->cmdq_lock, OFI_LOCK_NONE); + else + ofi_genlock_init(&cxi_domain->cmdq_lock, OFI_LOCK_MUTEX); + dlist_init(&cxi_domain->txc_list); dlist_init(&cxi_domain->cntr_list); dlist_init(&cxi_domain->cq_list); diff --git a/prov/cxi/src/cxip_ep.c b/prov/cxi/src/cxip_ep.c index 4b579662002..fabdea22be3 100644 --- a/prov/cxi/src/cxip_ep.c +++ b/prov/cxi/src/cxip_ep.c @@ -180,8 +180,8 @@ void cxip_ep_progress(struct fid *fid) if (ep_obj->enabled) { ofi_genlock_lock(&ep_obj->lock); - cxip_evtq_progress(&ep_obj->rxc.rx_evtq); - cxip_evtq_progress(&ep_obj->txc.tx_evtq); + ep_obj->rxc->ops.progress(ep_obj->rxc); + ep_obj->txc->ops.progress(ep_obj->txc); cxip_ep_ctrl_progress_locked(ep_obj); ofi_genlock_unlock(&ep_obj->lock); } @@ -197,9 +197,11 @@ int cxip_ep_peek(struct fid *fid) struct cxip_ep *ep = container_of(fid, struct cxip_ep, ep.fid); struct cxip_ep_obj *ep_obj = ep->ep_obj; - if (ep_obj->txc.tx_evtq.eq && cxi_eq_peek_event(ep_obj->txc.tx_evtq.eq)) + if (ep_obj->txc->tx_evtq.eq && + cxi_eq_peek_event(ep_obj->txc->tx_evtq.eq)) return -FI_EAGAIN; - if (ep_obj->rxc.rx_evtq.eq && cxi_eq_peek_event(ep_obj->rxc.rx_evtq.eq)) + if (ep_obj->rxc->rx_evtq.eq && + cxi_eq_peek_event(ep_obj->rxc->rx_evtq.eq)) return -FI_EAGAIN; return FI_SUCCESS; @@ -222,7 +224,7 @@ size_t cxip_ep_get_unexp_msgs(struct fid_ep *fid_ep, if (!ux_count) return -FI_EINVAL; - if (ep->ep_obj->rxc.state == RXC_DISABLED) + if (ep->ep_obj->rxc->state == RXC_DISABLED) return -FI_EOPBADSTATE; if (!ofi_recv_allowed(ep->rx_attr.caps)) { @@ -233,15 +235,15 @@ size_t cxip_ep_get_unexp_msgs(struct fid_ep *fid_ep, /* If in flow control, let that 
complete since * on-loading could be in progress. */ - if (ep->ep_obj->rxc.state != RXC_ENABLED && - ep->ep_obj->rxc.state != RXC_ENABLED_SOFTWARE) { - cxip_cq_progress(ep->ep_obj->rxc.recv_cq); + if (ep->ep_obj->rxc->state != RXC_ENABLED && + ep->ep_obj->rxc->state != RXC_ENABLED_SOFTWARE) { + cxip_cq_progress(ep->ep_obj->rxc->recv_cq); return -FI_EAGAIN; } ofi_genlock_lock(&ep->ep_obj->lock); - if (cxip_evtq_saturated(&ep->ep_obj->rxc.rx_evtq)) { - RXC_DBG(&ep->ep_obj->rxc, "Target HW EQ saturated\n"); + if (cxip_evtq_saturated(&ep->ep_obj->rxc->rx_evtq)) { + RXC_DBG(ep->ep_obj->rxc, "Target HW EQ saturated\n"); ofi_genlock_unlock(&ep->ep_obj->lock); return -FI_EAGAIN; @@ -261,7 +263,7 @@ size_t cxip_ep_get_unexp_msgs(struct fid_ep *fid_ep, void cxip_ep_flush_trig_reqs(struct cxip_ep_obj *ep_obj) { ofi_genlock_lock(&ep_obj->lock); - cxip_evtq_flush_trig_reqs(&ep_obj->txc.tx_evtq); + cxip_evtq_flush_trig_reqs(&ep_obj->txc->tx_evtq); ofi_genlock_unlock(&ep_obj->lock); } @@ -270,7 +272,7 @@ void cxip_ep_flush_trig_reqs(struct cxip_ep_obj *ep_obj) */ void cxip_txc_close(struct cxip_ep *ep) { - struct cxip_txc *txc = &ep->ep_obj->txc; + struct cxip_txc *txc = ep->ep_obj->txc; if (txc->send_cq) { ofi_genlock_lock(&txc->send_cq->ep_list_lock); @@ -313,7 +315,7 @@ void cxip_txc_close(struct cxip_ep *ep) */ void cxip_rxc_close(struct cxip_ep *ep) { - struct cxip_rxc *rxc = &ep->ep_obj->rxc; + struct cxip_rxc *rxc = ep->ep_obj->rxc; if (rxc->recv_cq) { /* EP FID may not be found in the list if recv_cq == send_cq, @@ -422,12 +424,49 @@ ssize_t cxip_rxc_cancel(struct cxip_rxc *rxc, void *context) return cxip_evtq_req_cancel(&rxc->rx_evtq, rxc, context, true); } +/** + * Cancel TX operation + * + * Supports TX/RX context cancel(). + * + * Searches the TX queue for a pending async operation with the specified + * 'context', and requests that it be canceled. + * + * @param txc : TX context to search + * @param context : user context pointer to search for + * + * @return ssize_t : 0 on success, -errno on failure + */ +ssize_t cxip_txc_cancel(struct cxip_txc *txc, void *context) +{ + struct cxip_req *req; + struct dlist_entry *tmp; + + if (!txc->enabled) + return -FI_EOPBADSTATE; + + if (!context) + return -FI_ENOENT; + + /* Only messaging may be canceled at this time */ + dlist_foreach_container_safe(&txc->msg_queue, struct cxip_req, req, + send.txc_entry, tmp) { + if (req->type != CXIP_REQ_SEND || + req->context != (uint64_t)context) + continue; + return txc->ops.cancel_msg_send(req); + } + + return -FI_ENOENT; +} + /* * cxip_ep_cancel() - Cancel TX/RX operation for EP. */ ssize_t cxip_ep_cancel(fid_t fid, void *context) { struct cxip_ep *ep = container_of(fid, struct cxip_ep, ep.fid); + ssize_t ret; /* TODO: Remove this since it requires malicious programming to * create this condition.
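/*
 * Minimal usage sketch for the split cancel paths above, assuming an
 * application posted a send or receive with user context 'op_ctx' on this
 * endpoint. fi_cancel() on the EP fid is expected to reach cxip_ep_cancel(),
 * which tries the RX queue first and falls back to cxip_txc_cancel() when no
 * receive matches. The helper name below is hypothetical.
 */
#include <stdio.h>
#include <sys/types.h>
#include <rdma/fabric.h>
#include <rdma/fi_endpoint.h>
#include <rdma/fi_errno.h>

static void cancel_pending_msg(struct fid_ep *ep, void *op_ctx)
{
	/* op_ctx must be the same context pointer passed when the message
	 * operation was posted (e.g. to fi_tsend()/fi_trecv()).
	 */
	ssize_t rc = fi_cancel(&ep->fid, op_ctx);

	if (rc == -FI_ENOENT)
		fprintf(stderr, "no pending TX/RX operation matched %p\n",
			op_ctx);
}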
@@ -438,7 +477,11 @@ ssize_t cxip_ep_cancel(fid_t fid, void *context) if (!ofi_recv_allowed(ep->ep_obj->caps)) return -FI_ENOENT; - return cxip_rxc_cancel(&ep->ep_obj->rxc, context); + ret = cxip_rxc_cancel(ep->ep_obj->rxc, context); + if (ret != -FI_ENOENT) + return ret; + + return cxip_txc_cancel(ep->ep_obj->txc, context); } /* @@ -514,13 +557,13 @@ static int cxip_ep_enable(struct fid_ep *fid_ep) ep_obj->auth_key.vni, ep_obj->src_addr.pid); - ret = cxip_txc_enable(&ep_obj->txc); + ret = cxip_txc_enable(ep_obj->txc); if (ret != FI_SUCCESS) { CXIP_WARN("cxip_txc_enable returned: %d\n", ret); goto unlock; } - ret = cxip_rxc_enable(&ep_obj->rxc); + ret = cxip_rxc_enable(ep_obj->rxc); if (ret != FI_SUCCESS) { CXIP_WARN("cxip_rxc_enable returned: %d\n", ret); goto unlock; @@ -642,6 +685,9 @@ int cxip_free_endpoint(struct cxip_ep *ep) ofi_atomic_dec32(&ep_obj->domain->ref); ofi_genlock_destroy(&ep_obj->lock); + + cxip_txc_free(ep_obj->txc); + cxip_rxc_free(ep_obj->rxc); free(ep_obj); ep->ep_obj = NULL; @@ -703,7 +749,7 @@ static int cxip_ep_bind_cq(struct cxip_ep *ep, struct cxip_cq *cq, } if (flags & FI_TRANSMIT) { - txc = &ep->ep_obj->txc; + txc = ep->ep_obj->txc; if (txc->send_cq) { CXIP_WARN("SEND CQ previously bound\n"); return -FI_EINVAL; @@ -734,7 +780,7 @@ static int cxip_ep_bind_cq(struct cxip_ep *ep, struct cxip_cq *cq, } if (flags & FI_RECV) { - rxc = &ep->ep_obj->rxc; + rxc = ep->ep_obj->rxc; if (rxc->recv_cq) { CXIP_WARN("RECV CQ previously bound\n"); return -FI_EINVAL; @@ -782,10 +828,10 @@ static int cxip_ep_bind_cntr(struct cxip_ep *ep, struct cxip_cntr *cntr, if (!(flags & CXIP_EP_CNTR_FLAGS)) return FI_SUCCESS; - if ((flags & FI_SEND && ep->ep_obj->txc.send_cntr) || - (flags & FI_READ && ep->ep_obj->txc.read_cntr) || - (flags & FI_WRITE && ep->ep_obj->txc.write_cntr) || - (flags & FI_RECV && ep->ep_obj->rxc.recv_cntr)) { + if ((flags & FI_SEND && ep->ep_obj->txc->send_cntr) || + (flags & FI_READ && ep->ep_obj->txc->read_cntr) || + (flags & FI_WRITE && ep->ep_obj->txc->write_cntr) || + (flags & FI_RECV && ep->ep_obj->rxc->recv_cntr)) { CXIP_WARN("EP previously bound to counter\n"); return -FI_EINVAL; } @@ -798,19 +844,19 @@ static int cxip_ep_bind_cntr(struct cxip_ep *ep, struct cxip_cntr *cntr, } if (flags & FI_SEND) { - ep->ep_obj->txc.send_cntr = cntr; + ep->ep_obj->txc->send_cntr = cntr; ofi_atomic_inc32(&cntr->ref); } if (flags & FI_READ) { - ep->ep_obj->txc.read_cntr = cntr; + ep->ep_obj->txc->read_cntr = cntr; ofi_atomic_inc32(&cntr->ref); } if (flags & FI_WRITE) { - ep->ep_obj->txc.write_cntr = cntr; + ep->ep_obj->txc->write_cntr = cntr; ofi_atomic_inc32(&cntr->ref); } if (flags & FI_RECV) { - ep->ep_obj->rxc.recv_cntr = cntr; + ep->ep_obj->rxc->recv_cntr = cntr; ofi_atomic_inc32(&cntr->ref); } @@ -913,8 +959,10 @@ int cxip_set_tclass(uint32_t desired_tc, uint32_t default_tc, uint32_t *new_tc) static inline int cxip_ep_set_val(struct cxip_ep *cxi_ep, struct fi_fid_var *val) { - uint32_t *req_tclass; + struct cxip_txc_rnr *txc_rnr; uint64_t *req_order; + uint64_t *req_rnr_max_time; + uint32_t *req_tclass; uint32_t new_tclass; if (!val->val) @@ -941,6 +989,20 @@ static inline int cxip_ep_set_val(struct cxip_ep *cxi_ep, cxi_ep->tx_attr.msg_order = *req_order; break; + case FI_OPT_CXI_SET_RNR_MAX_RETRY_TIME: + req_rnr_max_time = (uint64_t *) val->val; + + if (cxi_ep->ep_obj->protocol != FI_PROTO_CXI_RNR) { + CXIP_WARN("Not FI_PROTO_CXI_RNR EP\n"); + return -FI_EINVAL; + } + + txc_rnr = container_of(cxi_ep->ep_obj->txc, struct cxip_txc_rnr, + base); + 
txc_rnr->max_retry_wait_us = *req_rnr_max_time; + CXIP_DBG("RNR maximum timeout set to %ld usec\n", + txc_rnr->max_retry_wait_us); + break; default: return -FI_EINVAL; } @@ -1041,7 +1103,7 @@ int cxip_ep_getopt_priv(struct cxip_ep *ep, int level, int optname, if (*optlen < sizeof(size_t)) return -FI_ETOOSMALL; - *(size_t *)optval = ep->ep_obj->rxc.min_multi_recv; + *(size_t *)optval = ep->ep_obj->rxc->min_multi_recv; *optlen = sizeof(size_t); break; @@ -1083,7 +1145,7 @@ int cxip_ep_setopt_priv(struct cxip_ep *ep, int level, int optname, CXIP_EP_MAX_MULTI_RECV); return -FI_EINVAL; } - ep->ep_obj->rxc.min_multi_recv = min_multi_recv; + ep->ep_obj->rxc->min_multi_recv = min_multi_recv; break; default: @@ -1123,8 +1185,7 @@ int cxip_alloc_endpoint(struct cxip_domain *cxip_dom, struct fi_info *hints, { int ret; struct cxip_ep_obj *ep_obj; - struct cxip_txc *txc; - struct cxip_rxc *rxc; + uint32_t txc_tclass; uint32_t nic; uint32_t pid; int i; @@ -1163,13 +1224,8 @@ int cxip_alloc_endpoint(struct cxip_domain *cxip_dom, struct fi_info *hints, if (!ep_obj) return -FI_ENOMEM; - txc = &ep_obj->txc; - rxc = &ep_obj->rxc; - - /* For faster access */ - ep_obj->asic_ver = cxip_dom->iface->info->cassini_version; - /* Save EP attributes from hints */ + ep_obj->protocol = hints->ep_attr->protocol; ep_obj->caps = hints->caps; ep_obj->ep_attr = *hints->ep_attr; ep_obj->txq_size = hints->tx_attr->size; @@ -1177,6 +1233,34 @@ int cxip_alloc_endpoint(struct cxip_domain *cxip_dom, struct fi_info *hints, ep_obj->tx_attr = *hints->tx_attr; ep_obj->rx_attr = *hints->rx_attr; + ep_obj->asic_ver = cxip_dom->iface->info->cassini_version; + + ofi_atomic_initialize32(&ep_obj->ref, 0); + + /* Allow FI_THREAD_DOMAIN optimization */ + if (cxip_dom->util_domain.threading == FI_THREAD_DOMAIN || + cxip_dom->util_domain.threading == FI_THREAD_COMPLETION) + ofi_genlock_init(&ep_obj->lock, OFI_LOCK_NONE); + else + ofi_genlock_init(&ep_obj->lock, OFI_LOCK_SPINLOCK); + + ep_obj->domain = cxip_dom; + ep_obj->src_addr.nic = nic; + ep_obj->src_addr.pid = pid; + ep_obj->fi_addr = FI_ADDR_NOTAVAIL; + + ofi_atomic_initialize32(&ep_obj->txq_ref, 0); + ofi_atomic_initialize32(&ep_obj->tgq_ref, 0); + + for (i = 0; i < CXIP_NUM_CACHED_KEY_LE; i++) { + ofi_atomic_initialize32(&ep_obj->ctrl.std_mr_cache[i].ref, 0); + ofi_atomic_initialize32(&ep_obj->ctrl.opt_mr_cache[i].ref, 0); + } + + dlist_init(&ep_obj->ctrl.mr_list); + ep_obj->ep_attr.tx_ctx_cnt = 1; + ep_obj->ep_attr.rx_ctx_cnt = 1; + if (hints->ep_attr->auth_key) { /* Auth key size is verified in ofi_prov_check_info().
*/ assert(hints->ep_attr->auth_key_size == @@ -1203,57 +1287,35 @@ int cxip_alloc_endpoint(struct cxip_domain *cxip_dom, struct fi_info *hints, } if (cxip_set_tclass(ep_obj->tx_attr.tclass, - cxip_dom->tclass, &ep_obj->txc.tclass)) { + cxip_dom->tclass, &txc_tclass)) { CXIP_WARN("Invalid tclass\n"); ret = -FI_EINVAL; goto err; } - ep_obj->tx_attr.tclass = ep_obj->txc.tclass; - - /* Initialize object */ - ofi_atomic_initialize32(&ep_obj->ref, 0); - - /* Allow FI_THREAD_DOMAIN optimizaiton */ - if (cxip_dom->util_domain.threading == FI_THREAD_DOMAIN || - cxip_dom->util_domain.threading == FI_THREAD_COMPLETION) - ofi_genlock_init(&ep_obj->lock, OFI_LOCK_NONE); - else - ofi_genlock_init(&ep_obj->lock, OFI_LOCK_SPINLOCK); - ep_obj->domain = cxip_dom; - ep_obj->src_addr.nic = nic; - ep_obj->src_addr.pid = pid; - ep_obj->fi_addr = FI_ADDR_NOTAVAIL; + ep_obj->tx_attr.tclass = txc_tclass; - ofi_atomic_initialize32(&ep_obj->txq_ref, 0); - ofi_atomic_initialize32(&ep_obj->tgq_ref, 0); - - for (i = 0; i < CXIP_NUM_CACHED_KEY_LE; i++) { - ofi_atomic_initialize32(&ep_obj->std_mr_cache[i].ref, 0); - ofi_atomic_initialize32(&ep_obj->opt_mr_cache[i].ref, 0); + ep_obj->txc = cxip_txc_calloc(ep_obj, context); + if (!ep_obj->txc) { + ret = -FI_ENOMEM; + goto err; } - dlist_init(&ep_obj->mr_list); - ep_obj->ep_attr.tx_ctx_cnt = 1; - ep_obj->ep_attr.rx_ctx_cnt = 1; - txc->ep_obj = ep_obj; - rxc->ep_obj = ep_obj; - - cxip_txc_struct_init(txc, &ep_obj->tx_attr, context); - cxip_rxc_struct_init(rxc, &ep_obj->rx_attr, context); - - txc->domain = cxip_dom; - txc->hrp_war_req = txc->ep_obj->asic_ver < CASSINI_2_0; + ep_obj->rxc = cxip_rxc_calloc(ep_obj, context); + if (!ep_obj->rxc) { + ret = -FI_ENOMEM; + goto err; + } - rxc->domain = cxip_dom; - rxc->min_multi_recv = CXIP_EP_MIN_MULTI_RECV; ofi_atomic_inc32(&cxip_dom->ref); - *ep_base_obj = ep_obj; return FI_SUCCESS; err: + /* handles null check */ + cxip_txc_free(ep_obj->txc); + cxip_rxc_free(ep_obj->rxc); free(ep_obj); return ret; @@ -1305,7 +1367,7 @@ int cxip_endpoint(struct fid_domain *domain, struct fi_info *info, *fid_ep = &ep->ep; cxip_coll_init(ep->ep_obj); - cxip_domain_add_txc(ep->ep_obj->domain, &ep->ep_obj->txc); + cxip_domain_add_txc(ep->ep_obj->domain, ep->ep_obj->txc); return FI_SUCCESS; } diff --git a/prov/cxi/src/cxip_evtq.c b/prov/cxi/src/cxip_evtq.c index 68b3a99d165..c40dd7e7c2f 100644 --- a/prov/cxi/src/cxip_evtq.c +++ b/prov/cxi/src/cxip_evtq.c @@ -76,7 +76,7 @@ int cxip_evtq_req_cancel(struct cxip_evtq *evtq, void *req_ctx, !req->recv.canceled && !req->recv.parent && (!match || (void *)req->context == op_ctx)) { - ret = cxip_recv_cancel(req); + ret = req->recv.rxc->ops.cancel_msg_recv(req); break; } } @@ -289,6 +289,15 @@ static struct cxip_req *cxip_evtq_event_req(struct cxip_evtq *evtq, case C_EVENT_PUT_OVERFLOW: case C_EVENT_RENDEZVOUS: case C_EVENT_SEARCH: + /* RNR C_RC_ENTRY_NOT_FOUND generates an event on the + * target, this can be safely ignored as the source + * handles the retry logic. 
+ */ + if (event->tgt_long.buffer_id == 0) { + req = NULL; + break; + } + req = cxip_evtq_req_find(evtq, event->tgt_long.buffer_id); if (req) break; diff --git a/prov/cxi/src/cxip_info.c b/prov/cxi/src/cxip_info.c index 25c392fa6b2..a48fbcbb305 100644 --- a/prov/cxi/src/cxip_info.c +++ b/prov/cxi/src/cxip_info.c @@ -478,7 +478,7 @@ static int cxip_info_init(void) for (ndx = 0; ndx < ARRAY_SIZE(cxip_infos); ndx++) { ret = cxip_info_alloc(nic_if, ndx, &fi); if (ret == -FI_ENODATA) - continue;; + continue; if (ret != FI_SUCCESS) { cxip_put_if(nic_if); goto free_info; @@ -489,6 +489,40 @@ static int cxip_info_init(void) *fi_list = fi; fi_list = &(fi->next); } + + /* Initialize the RNR protocol equivalents here, just + * modifying the default entries to be suitable for + * RNR. NOTE: FI_PROTO_CXI_RNR protocol does not exist + * when only old compatibility constants are used. + */ + for (ndx = 0; ndx < ARRAY_SIZE(cxip_infos); ndx++) { + ret = cxip_info_alloc(nic_if, ndx, &fi); + if (ret == -FI_ENODATA) + continue; + if (ret != FI_SUCCESS) { + cxip_put_if(nic_if); + goto free_info; + } + + fi->caps |= FI_DIRECTED_RECV; + fi->ep_attr->protocol = FI_PROTO_CXI_RNR; + fi->ep_attr->mem_tag_format = FI_TAG_GENERIC >> + (64 - CXIP_CS_TAG_WIDTH); + fi->tx_attr->msg_order = CXIP_MSG_ORDER & ~FI_ORDER_SAS; + fi->tx_attr->caps |= FI_DIRECTED_RECV; + /* Support IDC but not FI_INJECT */ + fi->tx_attr->inject_size = 0; + fi->rx_attr->msg_order = CXIP_MSG_ORDER & ~FI_ORDER_SAS; + fi->rx_attr->caps |= FI_DIRECTED_RECV; + fi->rx_attr->total_buffered_recv = 0; + + CXIP_DBG("%s RNR info created\n", + nic_if->info->device_name); + *fi_list = fi; + fi_list = &(fi->next); + } + + cxip_put_if(nic_if); } return FI_SUCCESS; @@ -578,6 +612,8 @@ struct cxip_environment cxip_env = { .force_odp = false, .ats = false, .iotlb = true, + .disable_dmabuf_cuda = false, + .disable_dmabuf_rocr = false, .ats_mlock_mode = CXIP_ATS_MLOCK_ALL, .fork_safe_requested = false, .rx_match_mode = CXIP_PTLTE_DEFAULT_MODE, @@ -585,6 +621,7 @@ struct cxip_environment cxip_env = { .rdzv_get_min = 2049, /* Avoid single packet Gets */ .rdzv_eager_size = CXIP_RDZV_THRESHOLD, .rdzv_aligned_sw_rget = 1, + .rnr_max_timeout_us = CXIP_RNR_TIMEOUT_US, .disable_non_inject_msg_idc = 0, .disable_host_register = 0, .oflow_buf_size = CXIP_OFLOW_BUF_SIZE, @@ -602,6 +639,7 @@ struct cxip_environment cxip_env = { .req_buf_min_posted = CXIP_REQ_BUF_MIN_POSTED, .req_buf_max_cached = CXIP_REQ_BUF_MAX_CACHED, .msg_offload = 1, + .trunc_ok = false, .msg_lossless = 0, .sw_rx_tx_init_max = CXIP_SW_RX_TX_INIT_MAX_DEFAULT, .hybrid_preemptive = 0, @@ -671,6 +709,17 @@ static void cxip_env_init(void) fi_param_get_bool(&cxip_prov, "rdzv_aligned_sw_rget", &cxip_env.rdzv_aligned_sw_rget); + fi_param_define(&cxip_prov, "rnr_max_timeout_us", FI_PARAM_INT, + "Maximum RNR time micro-seconds (default: %d).", + cxip_env.rnr_max_timeout_us); + fi_param_get_int(&cxip_prov, "rnr_max_timeout_us", + &cxip_env.rnr_max_timeout_us); + if (cxip_env.rnr_max_timeout_us < 0) { + cxip_env.rnr_max_timeout_us = CXIP_RNR_TIMEOUT_US; + CXIP_INFO("Invalid RNR timeout, using (%d us)\n", + cxip_env.rnr_max_timeout_us); + } + fi_param_define(&cxip_prov, "enable_trig_op_limit", FI_PARAM_BOOL, "Enable enforcement of triggered operation limit. 
" "Doing this can result in degrade " @@ -720,6 +769,18 @@ static void cxip_env_init(void) "Enables the NIC IOTLB (default %d).", cxip_env.iotlb); fi_param_get_bool(&cxip_prov, "iotlb", &cxip_env.iotlb); + fi_param_define(&cxip_prov, "disable_dmabuf_cuda", FI_PARAM_BOOL, + "Disables the DMABUF interface for CUDA (default %d).", + cxip_env.disable_dmabuf_cuda); + fi_param_get_bool(&cxip_prov, "disable_dmabuf_cuda", + &cxip_env.disable_dmabuf_cuda); + + fi_param_define(&cxip_prov, "disable_dmabuf_rocr", FI_PARAM_BOOL, + "Disables the DMABUF interface for ROCR (default %d).", + cxip_env.disable_dmabuf_rocr); + fi_param_get_bool(&cxip_prov, "disable_dmabuf_rocr", + &cxip_env.disable_dmabuf_rocr); + fi_param_define(&cxip_prov, "ats_mlock_mode", FI_PARAM_STRING, "Sets ATS mlock mode (off | all)."); fi_param_get_str(&cxip_prov, "ats_mlock_mode", ¶m_str); @@ -1238,6 +1299,11 @@ static void cxip_env_init(void) param_str = NULL; } + fi_param_define(&cxip_prov, "trunc_ok", FI_PARAM_BOOL, + "Enables experimental truncation as a success (%d).", + cxip_env.trunc_ok); + fi_param_get_bool(&cxip_prov, "trunc_ok", &cxip_env.trunc_ok); + fi_param_define(&cxip_prov, "rdzv_proto", FI_PARAM_STRING, "Sets preferred rendezvous protocol [default | alt_read] (default %s).", cxip_rdzv_proto_to_str(cxip_env.rdzv_proto)); @@ -1553,6 +1619,7 @@ cxip_getinfo(uint32_t version, const char *node, const char *service, struct cxip_if *iface; bool copy_dest = NULL; struct fi_info *temp_hints = NULL; + uint32_t proto; if (flags & FI_SOURCE) { if (!node && !service) { @@ -1621,9 +1688,9 @@ cxip_getinfo(uint32_t version, const char *node, const char *service, if (ret) return ret; - /* Remove any info that did match based on mr_mode requirements. - * Note that mr_mode FI_MR_ENDPOINT is only required if target - * RMA/ATOMIC access is required. + /* Remove any info that did not match based on EP protocol or mr_mode + * requirements. Note that mr_mode FI_MR_ENDPOINT is only required + * if target RMA/ATOMIC access is required. */ if (hints) { fi_ptr = *info; @@ -1631,8 +1698,20 @@ cxip_getinfo(uint32_t version, const char *node, const char *service, fi_prev_ptr = NULL; while (fi_ptr) { - if (fi_ptr->caps & (FI_ATOMIC | FI_RMA) && - !fi_ptr->domain_attr->mr_mode) { + /* If hints protocol is not specified, default to use + * protocol FI_PROTO_CXI. This + * requires that FI_PROTO_CXI_RNR be explicitly + * requested if hints are passed to be used. + */ + if (!hints->ep_attr->protocol) { + proto = FI_PROTO_CXI; + } else { + proto = hints->ep_attr->protocol; + } + + if ((fi_ptr->caps & (FI_ATOMIC | FI_RMA) && + !fi_ptr->domain_attr->mr_mode) || + proto != fi_ptr->ep_attr->protocol) { /* discard entry */ if (fi_prev_ptr) fi_prev_ptr->next = fi_ptr->next; diff --git a/prov/cxi/src/cxip_iomm.c b/prov/cxi/src/cxip_iomm.c index f6116f39b68..14f4d955978 100644 --- a/prov/cxi/src/cxip_iomm.c +++ b/prov/cxi/src/cxip_iomm.c @@ -13,6 +13,51 @@ #define MAP_FAIL_MSG "cxil_map lni: %d base: 0x%p len: %ld " \ "map_flags: 0x%0X failure: %d, %s\n" +static int cxip_dmabuf_hints(enum fi_hmem_iface iface, void *iov_base, + struct cxip_md *md, struct cxi_md_hints *hints, + size_t len) +{ + int ret; + int dmabuf_fd; + size_t size; + uint64_t offset; + uintptr_t base; + + if (iface == FI_HMEM_ZE && !cxip_env.ze_hmem_supported) { + CXIP_WARN("ZE device memory not supported. 
Try disabling implicit scaling (EnableImplicitScaling=0 NEOReadDebugKeys=1).\n"); + return -FI_ENOSYS; + } + + if (iface == FI_HMEM_CUDA && cxip_env.disable_dmabuf_cuda) + return FI_SUCCESS; + + if (iface == FI_HMEM_ROCR && cxip_env.disable_dmabuf_rocr) + return FI_SUCCESS; + + ret = ofi_hmem_get_base_addr(iface, iov_base, len, (void*)&base, &size); + if (ret) + return ret; + + ret = ofi_hmem_get_dmabuf_fd(iface, (void*)base, size, &dmabuf_fd, + &offset); + if (!ret) { + hints->dmabuf_fd = dmabuf_fd; + hints->dmabuf_offset = offset; + hints->dmabuf_valid = true; + + return FI_SUCCESS; + } + + /* If ROCm or cuda version do not support dmabuf, fall back + * to p2p interface. hints will not be filled in. + */ + if (iface != FI_HMEM_ZE && + (ret == -FI_EOPNOTSUPP || ret == -FI_ENOSYS)) + return FI_SUCCESS; + + return ret; +} + /** * cxip_do_map() - IO map a buffer. */ @@ -23,9 +68,6 @@ static int cxip_do_map(struct ofi_mr_cache *cache, struct ofi_mr_entry *entry) struct cxip_domain *dom; uint32_t map_flags = CXI_MAP_READ | CXI_MAP_WRITE; struct cxi_md_hints hints; - void *ze_handle; - void *ze_base_addr; - size_t ze_base_size; uint64_t hmem_flags = entry->info.flags; dom = container_of(cache, struct cxip_domain, iomm); @@ -52,43 +94,13 @@ static int cxip_do_map(struct ofi_mr_cache *cache, struct ofi_mr_entry *entry) if (!dom->odp) map_flags |= CXI_MAP_PIN; } else { - /* TODO: Remove PIN when DMA buf move_notify is supported. */ map_flags |= CXI_MAP_DEVICE | CXI_MAP_PIN; - /* ZE support requires the use of the DMA buf FD and offset - * hints fields. - */ - if (entry->info.iface == FI_HMEM_ZE) { - if (!cxip_env.ze_hmem_supported) { - CXIP_WARN("ZE device memory not supported. Try disabling implicit scaling (EnableImplicitScaling=0 NEOReadDebugKeys=1).\n"); - return -FI_ENOSYS; - } - - ret = ze_hmem_get_handle(entry->info.iov.iov_base, - entry->info.iov.iov_len, - &ze_handle); - if (ret) { - CXIP_WARN("ze_hmem_get_handle failed: %d:%s\n", - ret, fi_strerror(-ret)); - goto err; - } - - ret = ze_hmem_get_base_addr(entry->info.iov.iov_base, - entry->info.iov.iov_len, - &ze_base_addr, - &ze_base_size); - if (ret) { - CXIP_WARN("ze_hmem_get_base_addr failed: %d:%s\n", - ret, fi_strerror(-ret)); - goto err; - } - - hints.dmabuf_fd = (int)(uintptr_t)ze_handle; - hints.dmabuf_offset = - (uintptr_t)entry->info.iov.iov_base - - (uintptr_t)ze_base_addr; - hints.dmabuf_valid = true; - } + ret = cxip_dmabuf_hints(entry->info.iface, + entry->info.iov.iov_base, + md, &hints, entry->info.iov.iov_len); + if (ret) + goto err; } if (!cxip_env.iotlb) @@ -351,7 +363,7 @@ static int cxip_map_cache(struct cxip_domain *dom, struct ofi_mr_info *info, ret = ofi_mr_cache_search(&dom->iomm, info, &entry); if (ret) { CXIP_WARN("Failed to acquire mapping (%p, %lu): %d\n", - info->iov.iov_base, info->iov.iov_len, ret); + info->iov.iov_base, info->iov.iov_len, ret); return ret; } @@ -364,12 +376,9 @@ static int cxip_map_nocache(struct cxip_domain *dom, struct fi_mr_attr *attr, uint64_t hmem_flags, struct cxip_md **md) { struct cxip_md *uncached_md; - uint32_t map_flags; + uint32_t map_flags = CXI_MAP_READ | CXI_MAP_WRITE; int ret; struct cxi_md_hints hints; - void *ze_handle; - void *ze_base_addr; - size_t ze_base_size; /* Prefer the ATS (scalable MD) whenever possible * @@ -387,7 +396,6 @@ static int cxip_map_nocache(struct cxip_domain *dom, struct fi_mr_attr *attr, if (!uncached_md) return -FI_ENOMEM; - map_flags = CXI_MAP_READ | CXI_MAP_WRITE; if (attr->iface == FI_HMEM_SYSTEM) { if (dom->ats) map_flags |= CXI_MAP_ATS; @@ 
-395,44 +403,14 @@ static int cxip_map_nocache(struct cxip_domain *dom, struct fi_mr_attr *attr, if (!dom->odp) map_flags |= CXI_MAP_PIN; } else { - /* TODO: Remove PIN when DMA buf move_notify is supported. */ map_flags |= CXI_MAP_DEVICE | CXI_MAP_PIN; - /* ZE support requires the use of the DMA buf FD and offset - * hints fields. - */ - if (attr->iface == FI_HMEM_ZE) { - if (!cxip_env.ze_hmem_supported) { - CXIP_WARN("ZE device memory not supported. Try disabling implicit scaling (EnableImplicitScaling=0 NEOReadDebugKeys=1).\n"); - ret = -FI_ENOSYS; - goto err_free_uncached_md; - } - - ret = ze_hmem_get_handle(attr->mr_iov->iov_base, - attr->mr_iov->iov_len, - &ze_handle); - if (ret) { - CXIP_WARN("ze_hmem_get_handle failed: %d:%s\n", - ret, fi_strerror(-ret)); - goto err_free_uncached_md; - } - - ret = ze_hmem_get_base_addr(attr->mr_iov->iov_base, - attr->mr_iov->iov_len, - &ze_base_addr, - &ze_base_size); - if (ret) { - CXIP_WARN("ze_hmem_get_base_addr failed: %d:%s\n", - ret, fi_strerror(-ret)); - goto err_free_uncached_md; - } - - hints.dmabuf_fd = (int)(uintptr_t)ze_handle; - hints.dmabuf_offset = - (uintptr_t)attr->mr_iov->iov_base - - (uintptr_t)ze_base_addr; - hints.dmabuf_valid = true; - } + ret = cxip_dmabuf_hints(attr->iface, + attr->mr_iov->iov_base, + uncached_md, &hints, + attr->mr_iov->iov_len); + if (ret) + goto err_free_uncached_md; } if (!cxip_env.iotlb) @@ -503,7 +481,7 @@ static void cxip_map_get_mem_region_size(const void *buf, unsigned long len, *out_len = len; } - CXIP_DBG("%s: User addr=%p User len=%lu Region addr=%p Region len=%lu\n", + CXIP_DBG("%s: User addr=%p User len=%lu Region addr=%p Region len=0x%lx\n", fi_tostr(&iface, FI_TYPE_HMEM_IFACE), buf, len, *out_buf, *out_len); } diff --git a/prov/cxi/src/cxip_mr.c b/prov/cxi/src/cxip_mr.c index 4ff81d5a448..6d088e21262 100644 --- a/prov/cxi/src/cxip_mr.c +++ b/prov/cxi/src/cxip_mr.c @@ -54,7 +54,7 @@ void cxip_mr_domain_init(struct cxip_mr_domain *mr_domain) */ static void cxip_ep_mr_insert(struct cxip_ep_obj *ep_obj, struct cxip_mr *mr) { - dlist_insert_tail(&mr->ep_entry, &ep_obj->mr_list); + dlist_insert_tail(&mr->ep_entry, &ep_obj->ctrl.mr_list); } /* @@ -176,12 +176,12 @@ static int cxip_mr_enable_std(struct cxip_mr *mr) if (!mr->count_events) le_flags |= C_LE_EVENT_SUCCESS_DISABLE; - ret = cxip_pte_append(ep_obj->ctrl_pte, + ret = cxip_pte_append(ep_obj->ctrl.pte, mr->len ? CXI_VA_TO_IOVA(mr->md->md, mr->buf) : 0, mr->len, mr->len ? mr->md->md->lac : 0, C_PTL_LIST_PRIORITY, mr->req.req_id, key.key, 0, CXI_MATCH_ID_ANY, - 0, le_flags, mr->cntr, ep_obj->ctrl_tgq, true); + 0, le_flags, mr->cntr, ep_obj->ctrl.tgq, true); if (ret != FI_SUCCESS) { CXIP_WARN("Failed to write Append command: %d\n", ret); return ret; @@ -209,8 +209,8 @@ static int cxip_mr_disable_std(struct cxip_mr *mr) struct cxip_ep_obj *ep_obj = mr->ep->ep_obj; /* TODO: Handle -FI_EAGAIN. 
*/ - ret = cxip_pte_unlink(ep_obj->ctrl_pte, C_PTL_LIST_PRIORITY, - mr->req.req_id, ep_obj->ctrl_tgq); + ret = cxip_pte_unlink(ep_obj->ctrl.pte, C_PTL_LIST_PRIORITY, + mr->req.req_id, ep_obj->ctrl.tgq); assert(ret == FI_SUCCESS); do { @@ -229,7 +229,7 @@ static int cxip_mr_disable_std(struct cxip_mr *mr) if (mr->count_events) CXIP_WARN("Match events required pte LE invalidate\n"); - ret = cxil_invalidate_pte_le(ep_obj->ctrl_pte->pte, mr->key, + ret = cxil_invalidate_pte_le(ep_obj->ctrl.pte->pte, mr->key, C_PTL_LIST_PRIORITY); if (ret) CXIP_WARN("MR %p key 0x%016lX invalidate failed %d\n", @@ -284,7 +284,7 @@ static int cxip_mr_enable_opt(struct cxip_mr *mr) mr->req.cb = cxip_mr_cb; - ret = cxip_pte_alloc_nomap(ep_obj->ptable, ep_obj->ctrl_tgt_evtq, + ret = cxip_pte_alloc_nomap(ep_obj->ptable, ep_obj->ctrl.tgt_evtq, &opts, cxip_mr_opt_pte_cb, mr, &mr->pte); if (ret != FI_SUCCESS) { CXIP_WARN("Failed to allocate PTE: %d\n", ret); @@ -307,7 +307,7 @@ static int cxip_mr_enable_opt(struct cxip_mr *mr) goto err_pte_free; } - ret = cxip_pte_set_state(mr->pte, ep_obj->ctrl_tgq, C_PTLTE_ENABLED, 0); + ret = cxip_pte_set_state(mr->pte, ep_obj->ctrl.tgq, C_PTLTE_ENABLED, 0); if (ret != FI_SUCCESS) { /* This is a bug, we have exclusive access to this CMDQ. */ CXIP_WARN("Failed to enqueue command: %d\n", ret); @@ -339,7 +339,7 @@ static int cxip_mr_enable_opt(struct cxip_mr *mr) mr->len, mr->len ? mr->md->md->lac : 0, C_PTL_LIST_PRIORITY, mr->req.req_id, 0, ib, CXI_MATCH_ID_ANY, - 0, le_flags, mr->cntr, ep_obj->ctrl_tgq, true); + 0, le_flags, mr->cntr, ep_obj->ctrl.tgq, true); if (ret != FI_SUCCESS) { CXIP_WARN("Failed to write Append command: %d\n", ret); goto err_pte_free; @@ -373,7 +373,7 @@ static int cxip_mr_disable_opt(struct cxip_mr *mr) struct cxip_ep_obj *ep_obj = mr->ep->ep_obj; ret = cxip_pte_unlink(mr->pte, C_PTL_LIST_PRIORITY, - mr->req.req_id, ep_obj->ctrl_tgq); + mr->req.req_id, ep_obj->ctrl.tgq); if (ret) { CXIP_WARN("Failed to enqueue Unlink: %d\n", ret); goto cleanup; @@ -443,7 +443,7 @@ static int cxip_mr_prov_cache_enable_opt(struct cxip_mr *mr) uint32_t le_flags; uint64_t ib = 0; - mr_cache = &ep_obj->opt_mr_cache[lac]; + mr_cache = &ep_obj->ctrl.opt_mr_cache[lac]; ofi_atomic_inc32(&mr_cache->ref); if (mr_cache->ctrl_req) @@ -478,7 +478,7 @@ static int cxip_mr_prov_cache_enable_opt(struct cxip_mr *mr) mr_cache->ctrl_req->mr.mr->optimized = true; mr_cache->ctrl_req->mr.mr->mr_state = CXIP_MR_DISABLED; - ret = cxip_pte_alloc_nomap(ep_obj->ptable, ep_obj->ctrl_tgt_evtq, + ret = cxip_pte_alloc_nomap(ep_obj->ptable, ep_obj->ctrl.tgt_evtq, &opts, cxip_mr_opt_pte_cb, _mr, &_mr->pte); if (ret != FI_SUCCESS) { @@ -500,7 +500,7 @@ static int cxip_mr_prov_cache_enable_opt(struct cxip_mr *mr) goto err_pte_free; } - ret = cxip_pte_set_state(_mr->pte, ep_obj->ctrl_tgq, + ret = cxip_pte_set_state(_mr->pte, ep_obj->ctrl.tgq, C_PTLTE_ENABLED, 0); if (ret != FI_SUCCESS) { /* This is a bug, we have exclusive access to this CMDQ. 
*/ @@ -526,7 +526,7 @@ static int cxip_mr_prov_cache_enable_opt(struct cxip_mr *mr) C_PTL_LIST_PRIORITY, mr_cache->ctrl_req->req_id, 0, ib, CXI_MATCH_ID_ANY, - 0, le_flags, NULL, ep_obj->ctrl_tgq, true); + 0, le_flags, NULL, ep_obj->ctrl.tgq, true); if (ret != FI_SUCCESS) { CXIP_WARN("Failed to write Append command: %d\n", ret); goto err_pte_free; @@ -576,11 +576,11 @@ static int cxip_mr_prov_cache_disable_opt(struct cxip_mr *mr) CXIP_DBG("Disable optimized cached MR: %p (key: 0x%016lX)\n", mr, mr->key); - if (ofi_atomic_get32(&ep_obj->opt_mr_cache[lac].ref) <= 0) { + if (ofi_atomic_get32(&ep_obj->ctrl.opt_mr_cache[lac].ref) <= 0) { CXIP_WARN("Cached optimized MR reference underflow\n"); return -FI_EINVAL; } - ofi_atomic_dec32(&ep_obj->opt_mr_cache[lac].ref); + ofi_atomic_dec32(&ep_obj->ctrl.opt_mr_cache[lac].ref); mr->enabled = false; return FI_SUCCESS; @@ -603,7 +603,7 @@ static int cxip_mr_prov_cache_enable_std(struct cxip_mr *mr) uint32_t le_flags; /* TODO: Handle enabling for each bound endpoint */ - mr_cache = &ep_obj->std_mr_cache[lac]; + mr_cache = &ep_obj->ctrl.std_mr_cache[lac]; ofi_atomic_inc32(&mr_cache->ref); if (mr_cache->ctrl_req) @@ -646,11 +646,11 @@ static int cxip_mr_prov_cache_enable_std(struct cxip_mr *mr) le_flags = C_LE_EVENT_SUCCESS_DISABLE | C_LE_UNRESTRICTED_BODY_RO | C_LE_OP_PUT | C_LE_OP_GET; - ret = cxip_pte_append(ep_obj->ctrl_pte, 0, -1ULL, + ret = cxip_pte_append(ep_obj->ctrl.pte, 0, -1ULL, mb.mr_lac, C_PTL_LIST_PRIORITY, mr_cache->ctrl_req->req_id, mb.raw, ib.raw, CXI_MATCH_ID_ANY, - 0, le_flags, NULL, ep_obj->ctrl_tgq, true); + 0, le_flags, NULL, ep_obj->ctrl.tgq, true); if (ret != FI_SUCCESS) { CXIP_WARN("Failed to write Append command: %d\n", ret); @@ -698,11 +698,11 @@ static int cxip_mr_prov_cache_disable_std(struct cxip_mr *mr) CXIP_DBG("Disable standard cached MR: %p (key: 0x%016lX)\n", mr, mr->key); - if (ofi_atomic_get32(&ep_obj->std_mr_cache[lac].ref) <= 0) { + if (ofi_atomic_get32(&ep_obj->ctrl.std_mr_cache[lac].ref) <= 0) { CXIP_WARN("Cached standard MR reference underflow\n"); return -FI_EINVAL; } - ofi_atomic_dec32(&ep_obj->std_mr_cache[lac].ref); + ofi_atomic_dec32(&ep_obj->ctrl.std_mr_cache[lac].ref); mr->enabled = false; return FI_SUCCESS; @@ -994,15 +994,15 @@ void cxip_ctrl_mr_cache_flush(struct cxip_ep_obj *ep_obj) /* Flush standard MR resources hardware resources not in use */ for (lac = 0; lac < CXIP_NUM_CACHED_KEY_LE; lac++) { - mr_cache = &ep_obj->std_mr_cache[lac]; + mr_cache = &ep_obj->ctrl.std_mr_cache[lac]; if (!mr_cache->ctrl_req || ofi_atomic_get32(&mr_cache->ref)) continue; - ret = cxip_pte_unlink(ep_obj->ctrl_pte, C_PTL_LIST_PRIORITY, + ret = cxip_pte_unlink(ep_obj->ctrl.pte, C_PTL_LIST_PRIORITY, mr_cache->ctrl_req->req_id, - ep_obj->ctrl_tgq); + ep_obj->ctrl.tgq); assert(ret == FI_SUCCESS); do { @@ -1011,7 +1011,7 @@ void cxip_ctrl_mr_cache_flush(struct cxip_ep_obj *ep_obj) } while (mr_cache->ctrl_req->mr.mr->mr_state != CXIP_MR_UNLINKED); - ret = cxil_invalidate_pte_le(ep_obj->ctrl_pte->pte, + ret = cxil_invalidate_pte_le(ep_obj->ctrl.pte->pte, mr_cache->ctrl_req->req_id, C_PTL_LIST_PRIORITY); if (ret) @@ -1026,7 +1026,7 @@ void cxip_ctrl_mr_cache_flush(struct cxip_ep_obj *ep_obj) /* Flush optimized MR resources hardware resources not in use */ for (lac = 0; lac < CXIP_NUM_CACHED_KEY_LE; lac++) { - mr_cache = &ep_obj->opt_mr_cache[lac]; + mr_cache = &ep_obj->ctrl.opt_mr_cache[lac]; if (!mr_cache->ctrl_req || ofi_atomic_get32(&mr_cache->ref)) @@ -1035,7 +1035,7 @@ void cxip_ctrl_mr_cache_flush(struct cxip_ep_obj *ep_obj) ret 
= cxip_pte_unlink(mr_cache->ctrl_req->mr.mr->pte, C_PTL_LIST_PRIORITY, mr_cache->ctrl_req->req_id, - ep_obj->ctrl_tgq); + ep_obj->ctrl.tgq); if (ret) { CXIP_WARN("Failed to enqueue Unlink: %d\n", ret); goto cleanup; diff --git a/prov/cxi/src/cxip_msg.c b/prov/cxi/src/cxip_msg.c index a9f3c0f6ec9..4d3830dc18f 100644 --- a/prov/cxi/src/cxip_msg.c +++ b/prov/cxi/src/cxip_msg.c @@ -1,6 +1,7 @@ /* - * Copyright (c) 2018,2021-2023 Hewlett Packard Enterprise Development LP - * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * SPDX-License-Identifier: BSD-2 Clause or GPL-2.0-only + * + * Copyright (c) 2018-2024 Hewlett Packard Enterprise Development LP */ #include "config.h" @@ -19,127 +20,10 @@ #include "cxip.h" -#define FC_SW_LE_MSG_FATAL "LE exhaustion during flow control, "\ - "FI_CXI_RX_MATCH_MODE=[hybrid|software] is required\n" -#define FC_SW_ONLOAD_MSG_FATAL "LE resources not recovered during "\ - "flow control. FI_CXI_RX_MATCH_MODE=[hybrid|software] is required\n" -#define FC_OFLOW_NO_MATCH_MSG "Flow control overflow no match, increasing "\ - "FI_CXI_OFLOW_BUF_SIZE (current is %ldB) may reduce occurrence\n" -#define FC_REQ_FULL_MSG "Flow control request list full, increasing"\ - " FI_CXI_REQ_BUF_SIZE value (current is %ldB) may reduce occurrence\n" -#define FC_DROP_COUNT_MSG "Re-enable Drop count mismatch, re-enable will "\ - "be retried on notify\n" - -#define WARN_RESTRICTED_DISABLED "Insufficient resources for %s "\ - "protocol, switching to %s protocol\n" - -/* Defines the posted receive interval for checking LE allocation if - * in hybrid RX match mode and preemptive transitions to software - * managed EP are requested. - */ -#define CXIP_HYBRID_RECV_CHECK_INTERVAL (64-1) - -static int cxip_recv_cb(struct cxip_req *req, const union c_event *event); -static void cxip_ux_onload_complete(struct cxip_req *req); -static int cxip_ux_onload(struct cxip_rxc *rxc); -static int cxip_recv_req_queue(struct cxip_req *req, bool restart_seq); -static int cxip_recv_req_dropped(struct cxip_req *req); -static ssize_t _cxip_recv_req(struct cxip_req *req, bool restart_seq); - -static int cxip_send_req_dropped(struct cxip_txc *txc, struct cxip_req *req); -static int cxip_send_req_dequeue(struct cxip_txc *txc, struct cxip_req *req); - -static void cxip_fc_progress_ctrl(struct cxip_rxc *rxc); -static void cxip_send_buf_fini(struct cxip_req *req); - -/* - * match_put_event() - Find/add a matching event. - * - * For every Put Overflow event there is a matching Put event. These events can - * be generated in any order. Both events must be received before progress can - * be made. - * - * If the matching event exists in the mapping, matched is set to true and - * the deferred event is returned. If a match was not found, matched is set to - * false and the event is added to the deferred event mapping. - * - * The deferred match event is returned; unless it must be added to the - * deferred mapping and memory is insufficient. - * - * Caller must hold ep_obj->lock. - */ -static struct cxip_deferred_event * -match_put_event(struct cxip_rxc *rxc, struct cxip_req *req, - const union c_event *event, bool *matched) -{ - union cxip_def_event_key key = {}; - struct cxip_deferred_event *def_ev; - union cxip_match_bits mb; - int bucket; - enum c_event_type match_type = - event->tgt_long.event_type == C_EVENT_PUT ? 
C_EVENT_PUT_OVERFLOW : C_EVENT_PUT; - - if (event->tgt_long.rendezvous) { - key.initiator = event->tgt_long.initiator.initiator.process; - mb.raw = event->tgt_long.match_bits; - key.rdzv_id = (mb.rdzv_id_hi << CXIP_RDZV_ID_CMD_WIDTH) | - event->tgt_long.rendezvous_id; - key.rdzv = 1; - } else { - key.start_addr = event->tgt_long.start; - } - - bucket = fasthash64(&key.raw, sizeof(key.raw), 0) % - CXIP_DEF_EVENT_HT_BUCKETS; - dlist_foreach_container(&rxc->deferred_events.bh[bucket], - struct cxip_deferred_event, def_ev, - rxc_entry) { - if (def_ev->key.raw == key.raw && - def_ev->ev.tgt_long.event_type == match_type && - def_ev->ev.tgt_long.return_code == event->tgt_long.return_code && - def_ev->ev.tgt_long.initiator.initiator.process == event->tgt_long.initiator.initiator.process && - def_ev->ev.tgt_long.match_bits == event->tgt_long.match_bits) { - *matched = true; - return def_ev; - } - } - - /* Not found, add mapping to hash bucket */ - *matched = false; - - def_ev = calloc(1, sizeof(*def_ev)); - if (!def_ev) { - RXC_WARN(rxc, "Failed allocate to memory\n"); - return NULL; - } - - def_ev->key.raw = key.raw; - def_ev->req = req; - def_ev->ev = *event; - - dlist_insert_tail(&def_ev->rxc_entry, &rxc->deferred_events.bh[bucket]); - - return def_ev; -} - -/* - * free_put_event() - Free a deferred put event. - * - * Free an event previously allocated added with match_put_event(). - * - * Caller must hold ep_obj->lock. - */ -static void free_put_event(struct cxip_rxc *rxc, - struct cxip_deferred_event *def_ev) -{ - dlist_remove(&def_ev->rxc_entry); - free(def_ev); -} - /* - * recv_req_src_addr() - Translate request source address to FI address. + * cxip_recv_req_src_addr() - Translate request source address to FI address. */ -static fi_addr_t recv_req_src_addr(struct cxip_req *req) +fi_addr_t cxip_recv_req_src_addr(struct cxip_req *req) { struct cxip_rxc *rxc = req->recv.rxc; @@ -172,8 +56,10 @@ static fi_addr_t recv_req_src_addr(struct cxip_req *req) * * Caller must hold ep->ep_obj->lock. */ -static int cxip_recv_req_alloc(struct cxip_rxc *rxc, void *buf, size_t len, - struct cxip_req **cxip_req) +int cxip_recv_req_alloc(struct cxip_rxc *rxc, void *buf, size_t len, + struct cxip_md *md, struct cxip_req **cxip_req, + int (*recv_cb)(struct cxip_req *req, + const union c_event *event)) { struct cxip_domain *dom = rxc->domain; struct cxip_req *req; @@ -192,17 +78,25 @@ static int cxip_recv_req_alloc(struct cxip_rxc *rxc, void *buf, size_t len, } if (len) { - ret = cxip_map(dom, (void *)buf, len, 0, &recv_md); - if (ret) { - RXC_WARN(rxc, "Map of recv buffer failed: %d, %s\n", - ret, fi_strerror(-ret)); - goto err_free_request; + /* If hybrid descriptor not passed, map for dma */ + if (!md) { + ret = cxip_map(dom, (void *)buf, len, 0, &recv_md); + if (ret) { + RXC_WARN(rxc, + "Map of recv buffer failed: %d, %s\n", + ret, fi_strerror(-ret)); + goto err_free_request; + } + req->recv.hybrid_md = false; + } else { + req->recv.hybrid_md = true; + recv_md = md; } } /* Initialize common receive request attributes. 
*/ req->type = CXIP_REQ_RECV; - req->cb = cxip_recv_cb; + req->cb = recv_cb; req->recv.rxc = rxc; req->recv.recv_buf = buf; req->recv.recv_md = recv_md; @@ -221,7 +115,7 @@ static int cxip_recv_req_alloc(struct cxip_rxc *rxc, void *buf, size_t len, return ret; } -static void cxip_recv_req_free(struct cxip_req *req) +void cxip_recv_req_free(struct cxip_req *req) { struct cxip_rxc *rxc = req->recv.rxc; @@ -231,7 +125,7 @@ static void cxip_recv_req_free(struct cxip_req *req) ofi_atomic_dec32(&rxc->orx_reqs); - if (req->recv.recv_md) + if (req->recv.recv_md && !req->recv.hybrid_md) cxip_unmap(req->recv.recv_md); cxip_evtq_req_free(req); @@ -247,8 +141,16 @@ static inline int recv_req_event_success(struct cxip_rxc *rxc, fi_addr_t src_addr; struct cxip_addr *addr; + /* If this is a FI_MULTI_RECV mandatory completion not associated + * with a receive completion then source information is not required. + */ + if (req->recv.multi_recv && !(req->flags & FI_RECV)) { + req->flags |= FI_MULTI_RECV; + return cxip_cq_req_complete(req); + } + if (req->recv.rxc->attr.caps & FI_SOURCE) { - src_addr = recv_req_src_addr(req); + src_addr = cxip_recv_req_src_addr(req); if (src_addr != FI_ADDR_NOTAVAIL || !(rxc->attr.caps & FI_SOURCE_ERR)) return cxip_cq_req_complete_addr(req, src_addr); @@ -276,13 +178,15 @@ static inline int recv_req_event_success(struct cxip_rxc *rxc, } /* - * recv_req_report() - Report the completion of a receive operation. + * recv_recv_req_report() - Report the completion of a receive operation. */ -static void recv_req_report(struct cxip_req *req) +void cxip_recv_req_report(struct cxip_req *req) { int ret; int err; - int success_event = (req->flags & FI_COMPLETION); + int success_event = (req->flags & FI_COMPLETION) || + (req->flags & FI_MULTI_RECV && + !(req->flags & FI_COMPLETION)); struct cxip_rxc *rxc = req->recv.rxc; ssize_t truncated = req->recv.rlen - req->data_len; @@ -325,10 +229,16 @@ static void recv_req_report(struct cxip_req *req) } } - if (req->recv.rc == C_RC_OK && !truncated) { + if (req->recv.rc == C_RC_OK && (!truncated || rxc->trunc_ok)) { RXC_DBG(rxc, "Request success: %p\n", req); + /* Completion requested or mandatory FI_MULTI_RECV + * buffer un-link completion + */ if (success_event) { + if (truncated) + req->flags |= FI_CXI_TRUNC; + ret = recv_req_event_success(rxc, req); if (ret != FI_SUCCESS) RXC_WARN(rxc, @@ -379,196 +289,13 @@ static void recv_req_report(struct cxip_req *req) } } -/* - * recv_req_tgt_event() - Update common receive request fields - * - * Populate a receive request with information found in all receive event - * types. - */ -static void -recv_req_tgt_event(struct cxip_req *req, const union c_event *event) -{ - struct cxip_rxc *rxc = req->recv.rxc; - union cxip_match_bits mb = { - .raw = event->tgt_long.match_bits - }; - uint32_t init = event->tgt_long.initiator.initiator.process; - - assert(event->hdr.event_type == C_EVENT_PUT || - event->hdr.event_type == C_EVENT_PUT_OVERFLOW || - event->hdr.event_type == C_EVENT_RENDEZVOUS || - event->hdr.event_type == C_EVENT_SEARCH); - - /* Rendezvous events contain the wrong match bits and do not provide - * initiator context for symmetric AVs. - */ - if (event->hdr.event_type != C_EVENT_RENDEZVOUS) { - req->tag = mb.tag; - req->recv.initiator = init; - - if (mb.cq_data) - req->flags |= FI_REMOTE_CQ_DATA; - } - - /* remote_offset is not provided in Overflow events. 
*/ - if (event->hdr.event_type != C_EVENT_PUT_OVERFLOW) - req->recv.src_offset = event->tgt_long.remote_offset; - - /* For rendezvous, initiator is the RGet DFA. */ - if (event->hdr.event_type == C_EVENT_RENDEZVOUS) { - init = cxi_dfa_to_init(init, rxc->pid_bits); - req->recv.rget_nic = CXI_MATCH_ID_EP(rxc->pid_bits, init); - req->recv.rget_pid = CXI_MATCH_ID_PID(rxc->pid_bits, init); - } - - /* Only need one event to set remaining fields. */ - if (req->recv.tgt_event) - return; - req->recv.tgt_event = true; - - /* VNI is needed to support FI_AV_AUTH_KEY. */ - req->recv.vni = event->tgt_long.vni; - - /* rlen is used to detect truncation. */ - req->recv.rlen = event->tgt_long.rlength; - - /* RC is used when generating completion events. */ - req->recv.rc = cxi_tgt_event_rc(event); - - /* Header data is provided in all completion events. */ - req->data = event->tgt_long.header_data; - - /* rdzv_id is used to correlate Put and Put Overflow events when using - * offloaded RPut. Otherwise, Overflow buffer start address is used to - * correlate events. - */ - if (event->tgt_long.rendezvous) - req->recv.rdzv_id = (mb.rdzv_id_hi << CXIP_RDZV_ID_CMD_WIDTH) | - event->tgt_long.rendezvous_id; - else - req->recv.oflow_start = event->tgt_long.start; - - req->recv.rdzv_lac = mb.rdzv_lac; - req->recv.rdzv_proto = mb.rdzv_proto; - req->recv.rdzv_mlen = event->tgt_long.mlength; - - /* data_len must be set uniquely for each protocol! */ -} - -/* - * rdzv_mrecv_req_lookup() - Search for a matching rendezvous, multi-receive - * child request. - */ -static int rdzv_mrecv_req_lookup(struct cxip_req *req, - const union c_event *event, - uint32_t *initiator, uint32_t *rdzv_id, - bool perform_event_checks, - struct cxip_req **req_out) -{ - struct cxip_rxc *rxc = req->recv.rxc; - struct cxip_req *child_req; - union cxip_match_bits mb; - uint32_t ev_init; - uint32_t ev_rdzv_id; - struct cxip_addr caddr; - int ret; - int i; - - if (event->hdr.event_type == C_EVENT_REPLY) { - struct cxi_rdzv_user_ptr *user_ptr; - - /* Events for software-issued operations will return a - * reference to the correct request. - */ - if (!event->init_short.rendezvous) { - *req_out = req; - return FI_SUCCESS; - } - - user_ptr = (struct cxi_rdzv_user_ptr *) - &event->init_short.user_ptr; - - ev_init = CXI_MATCH_ID(rxc->pid_bits, user_ptr->src_pid, - user_ptr->src_nid); - ev_rdzv_id = user_ptr->rendezvous_id; - } else if (event->hdr.event_type == C_EVENT_RENDEZVOUS) { - struct cxip_rxc *rxc = req->recv.rxc; - uint32_t dfa = event->tgt_long.initiator.initiator.process; - - ev_init = cxi_dfa_to_init(dfa, rxc->pid_bits); - mb.raw = event->tgt_long.match_bits; - - ev_rdzv_id = (mb.rdzv_id_hi << CXIP_RDZV_ID_CMD_WIDTH) | - event->tgt_long.rendezvous_id; - } else { - ev_init = event->tgt_long.initiator.initiator.process; - mb.raw = event->tgt_long.match_bits; - - ev_rdzv_id = (mb.rdzv_id_hi << CXIP_RDZV_ID_CMD_WIDTH) | - event->tgt_long.rendezvous_id; - } - - if ((event->hdr.event_type == C_EVENT_PUT_OVERFLOW || - event->hdr.event_type == C_EVENT_PUT) && - rxc->ep_obj->av->symmetric) { - ret = cxip_av_lookup_addr(rxc->ep_obj->av, - CXI_MATCH_ID_EP(rxc->pid_bits, ev_init), - &caddr); - if (ret != FI_SUCCESS) - RXC_FATAL(rxc, "Lookup of FI addr 0x%x: failed %d\n", - ev_init, ret); - - ev_init = CXI_MATCH_ID(rxc->pid_bits, - CXI_MATCH_ID_PID(rxc->pid_bits, ev_init), - caddr.nic); - } - - *initiator = ev_init; - *rdzv_id = ev_rdzv_id; - - /* Events for hardware-issued operations will return a rendezvous_id - * and initiator data. 
Use these fields to find a matching child - * request. - */ - dlist_foreach_container(&req->recv.children, - struct cxip_req, child_req, - recv.children) { - if (child_req->recv.rdzv_id == ev_rdzv_id && - child_req->recv.rdzv_initiator == ev_init) { - - if (perform_event_checks) { - /* There is an edge case where source may reuse the - * same rendezvous ID before the target has had time to - * process the C_EVENT_REPLY. If this is the case, an - * incorrect child_req match would occur. To prevent - * this, the events seen are stored with the child_req. - * If a redundant event is seen, this is a sign - * C_EVENT_REPLY needs to be process. Thus, return - * -FI_EAGAIN to process TX EQ. - */ - for (i = 0; i < child_req->recv.rdzv_events; i++) { - if (child_req->recv.rdzv_event_types[i] == event->hdr.event_type) { - assert(event->hdr.event_type != C_EVENT_REPLY); - return -FI_EAGAIN; - } - } - } - - *req_out = child_req; - return FI_SUCCESS; - } - } - - return -FI_ENOMSG; -} - /* * mrecv_req_dup() - Create a new request using an event targeting a * multi-recv buffer. * * @mrecv_req: A previously posted multi-recv buffer request. */ -static struct cxip_req *mrecv_req_dup(struct cxip_req *mrecv_req) +struct cxip_req *cxip_mrecv_req_dup(struct cxip_req *mrecv_req) { struct cxip_rxc *rxc = mrecv_req->recv.rxc; struct cxip_req *req; @@ -593,4848 +320,381 @@ static struct cxip_req *mrecv_req_dup(struct cxip_req *mrecv_req) } /* - * rdzv_mrecv_req_event() - Look up a multi-recieve child request using an - * event and multi-recv request. - * - * Each rendezvous Put transaction targeting a multi-receive buffer is tracked - * using a separate child request. A child request is uniquely identified by - * rendezvous ID and source address. Return a reference to a child request - * which matches the event. Allocate a new child request, if necessary. + * recv_req_peek_complete - FI_PEEK operation completed */ -static struct cxip_req * -rdzv_mrecv_req_event(struct cxip_req *mrecv_req, const union c_event *event) +void cxip_recv_req_peek_complete(struct cxip_req *req, + struct cxip_ux_send *ux_send) { - uint32_t ev_init; - uint32_t ev_rdzv_id; - struct cxip_req *req; - struct cxip_rxc *rxc __attribute__((unused)) = mrecv_req->recv.rxc; - int ret; - - assert(event->hdr.event_type == C_EVENT_REPLY || - event->hdr.event_type == C_EVENT_PUT || - event->hdr.event_type == C_EVENT_PUT_OVERFLOW || - event->hdr.event_type == C_EVENT_RENDEZVOUS); - - ret = rdzv_mrecv_req_lookup(mrecv_req, event, &ev_init, &ev_rdzv_id, - true, &req); - switch (ret) { - case -FI_EAGAIN: - return NULL; - - case -FI_ENOMSG: - req = mrecv_req_dup(mrecv_req); - if (!req) - return NULL; - - /* Store event initiator and rdzv_id for matching. */ - req->recv.rdzv_id = ev_rdzv_id; - req->recv.rdzv_initiator = ev_init; - - dlist_insert_tail(&req->recv.children, - &mrecv_req->recv.children); - - RXC_DBG(rxc, "New child: %p parent: %p event: %s\n", req, - mrecv_req, cxi_event_to_str(event)); - return req; - - case FI_SUCCESS: - RXC_DBG(rxc, "Found child: %p parent: %p event: %s\n", req, - mrecv_req, cxi_event_to_str(event)); - return req; - - default: - RXC_FATAL(rxc, "Unhandled rdzv_mrecv_req_lookup %d\n", ret); - } -} + /* If no unexpected message match we need to return original + * tag in the completion. + */ + if (req->recv.rc != C_RC_OK) + req->tag = req->recv.tag; + else if (req->recv.flags & FI_CLAIM) + ((struct fi_context *)req->context)->internal[0] = ux_send; -/* - * rdzv_recv_req_event() - Count a rendezvous event. 
- * - * Call for each target rendezvous event generated on a user receive buffer. - * After three events, a rendezvous receive is complete. The three events could - * be either: - * -Put, Rendezvous, Reply -- or - * -Put Overflow, Rendezvous, Reply - * - * For a restricted Get there is a fourth event, the ACK of the notify. - * - * In either case, the events could be generated in any order. As soon as the - * events expected are processed, the request is complete. - */ -static void rdzv_recv_req_event(struct cxip_req *req, enum c_event_type type) -{ - int total_events = req->recv.done_notify ? 4 : 3; + /* Avoid truncation processing, peek does not receive data */ + req->data_len = req->recv.rlen; - req->recv.rdzv_event_types[req->recv.rdzv_events] = type; + cxip_recv_req_report(req); - if (++req->recv.rdzv_events == total_events) { - if (req->recv.multi_recv) { - dlist_remove(&req->recv.children); - recv_req_report(req); - cxip_evtq_req_free(req); - } else { - recv_req_report(req); - cxip_recv_req_free(req); - } - } + cxip_recv_req_free(req); } /* - * oflow_req_put_bytes() - Consume bytes in the Overflow buffer. + * cxip_complete_put() - Common C_EVENT_PUT success event processing * - * An Overflow buffer is freed when all bytes are consumed by the NIC. - * - * Caller must hold ep_obj->lock. + * Data is delivered directly to user buffer. */ -static void oflow_req_put_bytes(struct cxip_req *req, size_t bytes) +int cxip_complete_put(struct cxip_req *req, const union c_event *event) { - struct cxip_ptelist_buf *oflow_buf = req->req_ctx; - - /* Non-zero length UX messages with 0 eager portion do not - * have a dependency on the oflow buffer. - */ - if (bytes == 0) - return; + if (req->recv.multi_recv) { + if (event->tgt_long.auto_unlinked) { + uintptr_t mrecv_head; - oflow_buf->cur_offset += bytes; + /* Special C_EVENT_PUT case when FI_MULTI_RECV was + * requested, but FI_COMPLETION was not specified. + * Must generate an FI_MULTI_RECV completion associated + * with only the un-link of the buffer. + */ + if (!(req->flags & FI_COMPLETION)) { + req->recv.auto_unlinked = true; + req->recv.rxc->ops.recv_req_tgt_event(req, + event); + req->flags = FI_MULTI_RECV; + req->recv.rlen = 0; + req->data_len = 0; + req->tag = 0; + req->buf = (uint64_t)NULL; - RXC_DBG(oflow_buf->rxc, "Putting %lu bytes (%lu/%lu): %p\n", bytes, - oflow_buf->cur_offset, oflow_buf->unlink_length, req); + cxip_recv_req_report(req); + cxip_recv_req_free(req); - if (oflow_buf->cur_offset == oflow_buf->unlink_length) - cxip_ptelist_buf_consumed(oflow_buf); -} + return FI_SUCCESS; + } -/* - * issue_rdzv_get() - Perform a Get to pull source data from the Initiator of a - * Send operation. - */ -static int issue_rdzv_get(struct cxip_req *req) -{ - struct c_full_dma_cmd cmd = {}; - uint64_t local_addr; - uint64_t rem_offset; - uint32_t align_bytes; - uint32_t mlen; - struct cxip_rxc *rxc = req->recv.rxc; - uint32_t pid_idx = rxc->domain->iface->dev->info.rdzv_get_idx; - uint8_t idx_ext; - union cxip_match_bits mb = {}; - int ret; - union c_fab_addr dfa; + /* For C_EVENT_PUT, need to calculate how much of the + * multi-recv buffer was consumed while factoring in + * any truncation. 
+ */ + mrecv_head = CXI_IOVA_TO_VA(req->recv.recv_md->md, + event->tgt_long.start); - if (req->recv.rdzv_proto == CXIP_RDZV_PROTO_ALT_WRITE) - RXC_WARN_ONCE(rxc, "Rendezvous protocol: %s not implemented\n", - cxip_rdzv_proto_to_str(req->recv.rdzv_proto)); + req->recv.auto_unlinked = true; + req->recv.mrecv_unlink_bytes = mrecv_head - + (uintptr_t)req->recv.recv_buf + + event->tgt_long.mlength; + } - cmd.command.cmd_type = C_CMD_TYPE_DMA; - cmd.command.opcode = C_CMD_GET; - cmd.lac = req->recv.recv_md->md->lac; - cmd.event_send_disable = 1; + req = cxip_mrecv_req_dup(req); + if (!req) + return -FI_EAGAIN; - /* Must deliver to TX event queue */ - cmd.eq = cxip_evtq_eqn(&rxc->ep_obj->txc.tx_evtq); + req->recv.rxc->ops.recv_req_tgt_event(req, event); + req->buf = (uint64_t)(CXI_IOVA_TO_VA(req->recv.recv_md->md, + event->tgt_long.start)); + req->data_len = event->tgt_long.mlength; - if (req->recv.rdzv_proto == CXIP_RDZV_PROTO_ALT_READ) { - pid_idx = CXIP_PTL_IDX_RDZV_RESTRICTED(req->recv.rdzv_lac); - cmd.restricted = 1; - req->recv.done_notify = true; + cxip_recv_req_report(req); + cxip_evtq_req_free(req); } else { - pid_idx = rxc->domain->iface->dev->info.rdzv_get_idx; - mb.rdzv_lac = req->recv.rdzv_lac; - mb.rdzv_id_lo = req->recv.rdzv_id; - mb.rdzv_id_hi = req->recv.rdzv_id >> CXIP_RDZV_ID_CMD_WIDTH; + req->data_len = event->tgt_long.mlength; + req->recv.rxc->ops.recv_req_tgt_event(req, event); + cxip_recv_req_report(req); + cxip_recv_req_free(req); } - cmd.match_bits = mb.raw; - - cmd.user_ptr = (uint64_t)req; - cxi_build_dfa(req->recv.rget_nic, req->recv.rget_pid, rxc->pid_bits, - pid_idx, &dfa, &idx_ext); - cmd.dfa = dfa; - cmd.index_ext = idx_ext; - local_addr = CXI_VA_TO_IOVA(req->recv.recv_md->md, - req->recv.recv_buf); - local_addr += req->recv.rdzv_mlen; + return FI_SUCCESS; +} - rem_offset = req->recv.src_offset; - mlen = req->recv.rdzv_mlen; +/* Caller must hold ep_obj->lock */ +int cxip_recv_pending_ptlte_disable(struct cxip_rxc *rxc, + bool check_fc) +{ + int ret; - RXC_DBG(rxc, "SW RGet addr: 0x%" PRIx64 " len %" PRId64 - " rem_off: %" PRId64 " restricted: %d\n", local_addr, - req->data_len - req->recv.rdzv_mlen, rem_offset, - cmd.restricted); + assert(rxc->state == RXC_ENABLED || + rxc->state == RXC_ONLOAD_FLOW_CONTROL || + rxc->state == RXC_ONLOAD_FLOW_CONTROL_REENABLE || + rxc->state == RXC_FLOW_CONTROL || + rxc->state == RXC_PENDING_PTLTE_SOFTWARE_MANAGED || + rxc->state == RXC_PENDING_PTLTE_DISABLE); - /* Align mask will be non-zero if local DMA address cache-line - * alignment is desired. + /* Having flow control triggered while in flow control is a sign of LE + * exhaustion. Software endpoint mode is required to scale past hardware + * LE limit. 
*/ - if (mlen >= rxc->rget_align_mask) { - align_bytes = local_addr & rxc->rget_align_mask; - local_addr -= align_bytes; - rem_offset -= align_bytes; - mlen -= align_bytes; - } - - if (req->data_len < mlen) - cmd.request_len = 0; - else - cmd.request_len = req->data_len - mlen; - - cmd.local_addr = local_addr; - cmd.remote_offset = rem_offset; + if (check_fc && rxc->state == RXC_FLOW_CONTROL) + RXC_FATAL(rxc, FC_SW_LE_MSG_FATAL); - RXC_DBG(rxc, "Aligned addr: 0x%" PRIx64 " len %d rem_off %" PRId64 "\n", - (uint64_t)cmd.local_addr, cmd.request_len, - (uint64_t)cmd.remote_offset); + if (rxc->state != RXC_ENABLED) + return FI_SUCCESS; - /* Issue Rendezvous Get command */ - ret = cxi_cq_emit_dma(rxc->tx_cmdq->dev_cmdq, &cmd); - if (ret) { - RXC_DBG(rxc, "Failed to queue GET command: %d\n", ret); - return -FI_EAGAIN; - } + RXC_DBG(rxc, "Manual request PTLTE_DISABLED\n"); - cxi_cq_ring(rxc->tx_cmdq->dev_cmdq); + ret = cxip_pte_set_state(rxc->rx_pte, rxc->rx_cmdq, + C_PTLTE_DISABLED, 0); + if (ret == FI_SUCCESS) + rxc->state = RXC_PENDING_PTLTE_DISABLE; - return FI_SUCCESS; + return ret; } -/* - * cxip_notify_match_cb() - Callback function for match complete notifiction - * Ack events. - */ -static int -cxip_notify_match_cb(struct cxip_req *req, const union c_event *event) +void cxip_rxc_record_req_stat(struct cxip_rxc *rxc, enum c_ptl_list list, + size_t rlength, struct cxip_req *req) { - RXC_DBG(req->recv.rxc, "Match complete: %p\n", req); - - recv_req_report(req); - - if (req->recv.multi_recv) - cxip_evtq_req_free(req); - else - cxip_recv_req_free(req); + enum fi_hmem_iface iface = rlength ? req->recv.recv_md->info.iface : + FI_HMEM_SYSTEM; - return FI_SUCCESS; + cxip_msg_counters_msg_record(&rxc->cntrs, list, iface, rlength); } /* - * cxip_notify_match() - Notify the initiator of a Send that the match is - * complete at the target. - * - * A transaction ID corresponding to the matched Send request is sent back to - * the initiator in the match_bits field of a zero-byte Put. + * cxip_recv_cancel() - Cancel outstanding receive request. 
*/ -static int cxip_notify_match(struct cxip_req *req, const union c_event *event) +int cxip_recv_cancel(struct cxip_req *req) { + int ret = FI_SUCCESS; struct cxip_rxc *rxc = req->recv.rxc; - uint32_t pid_idx = rxc->domain->iface->dev->info.rdzv_get_idx; - uint32_t init = event->tgt_long.initiator.initiator.process; - uint32_t nic = CXI_MATCH_ID_EP(rxc->pid_bits, init); - uint32_t pid = CXI_MATCH_ID_PID(rxc->pid_bits, init); - union c_fab_addr dfa; - uint8_t idx_ext; - union cxip_match_bits mb = { - .le_type = CXIP_LE_TYPE_ZBP, - }; - union cxip_match_bits event_mb; - union c_cmdu cmd = {}; - int ret; - - event_mb.raw = event->tgt_long.match_bits; - mb.tx_id = event_mb.tx_id; - - cxi_build_dfa(nic, pid, rxc->pid_bits, pid_idx, &dfa, &idx_ext); - - cmd.c_state.event_send_disable = 1; - cmd.c_state.index_ext = idx_ext; - cmd.c_state.eq = cxip_evtq_eqn(&rxc->ep_obj->txc.tx_evtq); - - ret = cxip_cmdq_emit_c_state(rxc->tx_cmdq, &cmd.c_state); - if (ret) { - RXC_DBG(rxc, "Failed to issue C_STATE command: %d\n", ret); - return ret; - } - - memset(&cmd.idc_msg, 0, sizeof(cmd.idc_msg)); - cmd.idc_msg.dfa = dfa; - cmd.idc_msg.match_bits = mb.raw; - cmd.idc_msg.user_ptr = (uint64_t)req; - - ret = cxi_cq_emit_idc_msg(rxc->tx_cmdq->dev_cmdq, &cmd.idc_msg, - NULL, 0); - if (ret) { - RXC_DBG(rxc, "Failed to write IDC: %d\n", ret); - - /* Return error according to Domain Resource Management - */ - return -FI_EAGAIN; + /* In hybrid mode requests could be on priority list + * or software receive list. + */ + if (req->recv.software_list) { + dlist_remove_init(&req->recv.rxc_entry); + req->recv.canceled = true; + req->recv.unlinked = true; + cxip_recv_req_report(req); + cxip_recv_req_free(req); + } else { + ret = cxip_pte_unlink(rxc->rx_pte, C_PTL_LIST_PRIORITY, + req->req_id, rxc->rx_cmdq); + if (ret == FI_SUCCESS) + req->recv.canceled = true; } - - req->cb = cxip_notify_match_cb; - - cxi_cq_ring(rxc->tx_cmdq->dev_cmdq); - - RXC_DBG(rxc, "Queued match completion message: %p\n", req); - - return FI_SUCCESS; + return ret; } /* - * mrecv_req_oflow_event() - Set start and length uniquely for an unexpected - * mrecv request. - * - * Overflow buffer events contain a start address representing the offset into - * the Overflow buffer where data was written. When a unexpected header is - * later matched to a multi-receive buffer in the priority list, The Put - * Overflow event does not contain the offset into the Priority list buffer - * where data should be copied. Software must track the the Priority list - * buffer offset using ordered Put Overflow events. + * tag_match() - Compare match bits */ -static int mrecv_req_put_bytes(struct cxip_req *req, uint32_t rlen) +bool tag_match(uint64_t init_mb, uint64_t mb, uint64_t ib) { - uintptr_t mrecv_head; - uintptr_t mrecv_tail; - size_t mrecv_bytes_remaining; - - mrecv_head = (uintptr_t)req->recv.recv_buf + req->recv.start_offset; - mrecv_tail = (uintptr_t)req->recv.recv_buf + req->recv.ulen; - mrecv_bytes_remaining = mrecv_tail - mrecv_head; - - rlen = MIN(mrecv_bytes_remaining, rlen); - req->recv.start_offset += rlen; - - return rlen; + return !((init_mb ^ mb) & ~ib); } -/* cxip_recv_req_set_rget_info() - Set RGet NIC and PID fields. Used for - * messages where a rendezvous event will not be generated. Current usages are - * for the eager long protocol and rendezvous operations which have unexpected - * headers onloaded due to flow control. +/* + * init_match() - Compare UX Send initiator and Receive initiator in SW. 
*/ -static void cxip_recv_req_set_rget_info(struct cxip_req *req) +bool init_match(struct cxip_rxc *rxc, uint32_t init, uint32_t match_id) { - struct cxip_rxc *rxc = req->recv.rxc; - int ret; + if (match_id == CXI_MATCH_ID_ANY) + return true; if (rxc->ep_obj->av->symmetric) { - struct cxip_addr caddr; - - RXC_DBG(rxc, "Translating initiator: %x, req: %p\n", - req->recv.initiator, req); - - ret = cxip_av_lookup_addr(rxc->ep_obj->av, - CXI_MATCH_ID_EP(rxc->pid_bits, req->recv.initiator), - &caddr); - if (ret != FI_SUCCESS) - RXC_FATAL(rxc, "Failed to look up FI addr: %d\n", ret); - - req->recv.rget_nic = caddr.nic; - } else { - req->recv.rget_nic = CXI_MATCH_ID_EP(rxc->pid_bits, - req->recv.initiator); + init = CXI_MATCH_ID_EP(rxc->pid_bits, init); + match_id = CXI_MATCH_ID_EP(rxc->pid_bits, match_id); } - req->recv.rget_pid = CXI_MATCH_ID_PID(rxc->pid_bits, - req->recv.initiator); + return init == match_id; } /* - * cxip_ux_send() - Progress an unexpected Send after receiving matching Put - * and Put and Put Overflow events. + * cxip_flush_appends() - Flush all user appends for a RXC. * - * Caller must hold ep_obj->lock. - */ -static int cxip_ux_send(struct cxip_req *match_req, struct cxip_req *oflow_req, - const union c_event *put_event, uint64_t mrecv_start, - uint32_t mrecv_len, bool remove_recv_entry) -{ - struct cxip_ptelist_buf *buf; - void *oflow_va; - size_t oflow_bytes; - union cxip_match_bits mb; - ssize_t ret; - struct cxip_req *parent_req = match_req; - - assert(match_req->type == CXIP_REQ_RECV); - - if (match_req->recv.multi_recv) { - if (put_event->tgt_long.rendezvous) - match_req = rdzv_mrecv_req_event(match_req, put_event); - else - match_req = mrecv_req_dup(match_req); - if (!match_req) - return -FI_EAGAIN; - - /* Set start and length uniquely for an unexpected - * mrecv request. - */ - match_req->recv.recv_buf = (uint8_t *) - match_req->recv.parent->recv.recv_buf + - mrecv_start; - match_req->buf = (uint64_t)match_req->recv.recv_buf; - match_req->data_len = mrecv_len; - } else { - match_req->data_len = put_event->tgt_long.rlength; - if (match_req->data_len > match_req->recv.ulen) - match_req->data_len = match_req->recv.ulen; - } - - recv_req_tgt_event(match_req, put_event); - buf = oflow_req->req_ctx; - oflow_va = (void *)CXI_IOVA_TO_VA(buf->md->md, - put_event->tgt_long.start); - - /* Copy data out of overflow buffer. */ - oflow_bytes = MIN(put_event->tgt_long.mlength, match_req->data_len); - cxip_copy_to_md(match_req->recv.recv_md, match_req->recv.recv_buf, - oflow_va, oflow_bytes); - - if (oflow_req->type == CXIP_REQ_OFLOW) - oflow_req_put_bytes(oflow_req, put_event->tgt_long.mlength); - - /* Remaining unexpected rendezvous processing is deferred until RGet - * completes. - */ - if (put_event->tgt_long.rendezvous) { - if (remove_recv_entry) - dlist_remove_init(&parent_req->recv.rxc_entry); - - rdzv_recv_req_event(match_req, put_event->hdr.event_type); - return FI_SUCCESS; - } - - mb.raw = put_event->tgt_long.match_bits; - - /* Check if the initiator requires match completion guarantees. - * If so, notify the initiator that the match is now complete. - * Delay the Receive event until the notification is complete. 
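The software matching helpers added above are terse; the following standalone sketch, which is illustration only and not part of this patch, shows how the XOR/ignore-bit rule from tag_match() behaves with a few made-up values.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Same rule as tag_match() above: bits covered by the ignore mask are
 * excluded from the comparison. */
static bool example_tag_match(uint64_t init_mb, uint64_t mb, uint64_t ib)
{
	return !((init_mb ^ mb) & ~ib);
}

int main(void)
{
	uint64_t sender_tag = 0x1234;

	/* Exact match: no ignore bits, tags must be identical. */
	printf("exact:    %d\n", example_tag_match(sender_tag, 0x1234, 0x0));

	/* Wildcard low byte: the low 8 bits are ignored, so 0x1200 matches. */
	printf("wildcard: %d\n", example_tag_match(sender_tag, 0x1200, 0xff));

	/* A difference outside the ignore mask does not match. */
	printf("miss:     %d\n", example_tag_match(sender_tag, 0x5234, 0xff));

	return 0;
}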
- */ - if (mb.match_comp) { - ret = cxip_notify_match(match_req, put_event); - if (ret != FI_SUCCESS) { - if (match_req->recv.multi_recv) - cxip_evtq_req_free(match_req); - - return -FI_EAGAIN; - } - - if (remove_recv_entry) - dlist_remove_init(&parent_req->recv.rxc_entry); - - return FI_SUCCESS; - } - - if (remove_recv_entry) - dlist_remove_init(&parent_req->recv.rxc_entry); - - recv_req_report(match_req); - - if (match_req->recv.multi_recv) - cxip_evtq_req_free(match_req); - else - cxip_recv_req_free(match_req); - - return FI_SUCCESS; -} - -/* - * cxip_ux_send_zb() - Progress an unexpected zero-byte Send after receiving - * a Put Overflow event. - * - * Zero-byte Put events for unexpected Sends are discarded. Progress the Send - * using only the Overflow event. There is no Send data to be copied out. - */ -static int cxip_ux_send_zb(struct cxip_req *match_req, - const union c_event *oflow_event, - uint64_t mrecv_start, bool remove_recv_entry) -{ - union cxip_match_bits mb; - int ret; - struct cxip_req *parent_req = match_req; - - assert(!oflow_event->tgt_long.rlength); - - if (match_req->recv.multi_recv) { - match_req = mrecv_req_dup(match_req); - if (!match_req) - return -FI_EAGAIN; - - match_req->buf = (uint64_t) - match_req->recv.parent->recv.recv_buf + - mrecv_start; - } - - recv_req_tgt_event(match_req, oflow_event); - - match_req->data_len = 0; - - mb.raw = oflow_event->tgt_long.match_bits; - - /* Check if the initiator requires match completion guarantees. - * If so, notify the initiator that the match is now complete. - * Delay the Receive event until the notification is complete. - */ - if (mb.match_comp) { - ret = cxip_notify_match(match_req, oflow_event); - if (ret != FI_SUCCESS) { - if (match_req->recv.multi_recv) - cxip_evtq_req_free(match_req); - - return -FI_EAGAIN; - } - - if (remove_recv_entry) - dlist_remove_init(&parent_req->recv.rxc_entry); - - return FI_SUCCESS; - } - - if (remove_recv_entry) - dlist_remove_init(&parent_req->recv.rxc_entry); - - recv_req_report(match_req); - - if (match_req->recv.multi_recv) - cxip_evtq_req_free(match_req); - else - cxip_recv_req_free(match_req); - - return FI_SUCCESS; -} - -static bool cxip_ux_is_onload_complete(struct cxip_req *req) -{ - return !req->search.puts_pending && req->search.complete; -} - -/* - * recv_req_peek_complete - FI_PEEK operation completed - */ -static void recv_req_peek_complete(struct cxip_req *req, - struct cxip_ux_send *ux_send) -{ - /* If no unexpected message match we need to return original - * tag in the completion. - */ - if (req->recv.rc != C_RC_OK) - req->tag = req->recv.tag; - else if (req->recv.flags & FI_CLAIM) - ((struct fi_context *)req->context)->internal[0] = ux_send; - - /* Avoid truncation processing, peek does not receive data */ - req->data_len = req->recv.rlen; - - recv_req_report(req); - - cxip_recv_req_free(req); -} - -/* Caller must hold ep_obj->lock. */ -static int cxip_oflow_process_put_event(struct cxip_rxc *rxc, - struct cxip_req *req, - const union c_event *event) -{ - int ret; - struct cxip_deferred_event *def_ev; - struct cxip_req *save_req; - bool matched; - - def_ev = match_put_event(rxc, req, event, &matched); - if (!matched) - return !def_ev ? 
-FI_EAGAIN : FI_SUCCESS; - - RXC_DBG(rxc, "Overflow beat Put event: %p\n", def_ev->req); - - if (def_ev->ux_send) { - /* UX Send was onloaded for one of these reasons: - * 1) Flow control - * 2) ULE was claimed by a FI_CLAIM - */ - save_req = def_ev->req; - def_ev->ux_send->req = req; - def_ev->ux_send->put_ev = *event; - - if (def_ev->ux_send->claimed) { - recv_req_tgt_event(save_req, &def_ev->ux_send->put_ev); - recv_req_peek_complete(save_req, def_ev->ux_send); - RXC_DBG(rxc, "FI_CLAIM put complete: %p, ux_send %p\n", - save_req, def_ev->ux_send); - goto done; - } else { - def_ev->req->search.puts_pending--; - RXC_DBG(rxc, "put complete: %p\n", def_ev->req); - } - - if (cxip_ux_is_onload_complete(def_ev->req)) - cxip_ux_onload_complete(def_ev->req); - - } else { - ret = cxip_ux_send(def_ev->req, req, event, def_ev->mrecv_start, - def_ev->mrecv_len, false); - if (ret != FI_SUCCESS) - return -FI_EAGAIN; - } - -done: - free_put_event(rxc, def_ev); - - return FI_SUCCESS; -} - -/* Caller must hold ep_obj->lock */ -static int cxip_recv_pending_ptlte_disable(struct cxip_rxc *rxc, - bool check_fc) -{ - int ret; - - assert(rxc->state == RXC_ENABLED || - rxc->state == RXC_ONLOAD_FLOW_CONTROL || - rxc->state == RXC_ONLOAD_FLOW_CONTROL_REENABLE || - rxc->state == RXC_FLOW_CONTROL || - rxc->state == RXC_PENDING_PTLTE_SOFTWARE_MANAGED || - rxc->state == RXC_PENDING_PTLTE_DISABLE); - - /* Having flow control triggered while in flow control is a sign of LE - * exhaustion. Software endpoint mode is required to scale past hardware - * LE limit. - */ - if (check_fc && rxc->state == RXC_FLOW_CONTROL) - RXC_FATAL(rxc, FC_SW_LE_MSG_FATAL); - - if (rxc->state != RXC_ENABLED) - return FI_SUCCESS; - - RXC_DBG(rxc, "Manual request PTLTE_DISABLED\n"); - - ret = cxip_pte_set_state(rxc->rx_pte, rxc->rx_cmdq, C_PTLTE_DISABLED, - 0); - if (ret == FI_SUCCESS) - rxc->state = RXC_PENDING_PTLTE_DISABLE; - - return ret; -} - -/* cxip_rxp_check_le_usage_hybrid_preempt() - Examines LE Pool usage and forces - * a preemptive hardware to software transition if needed. - * - * In cases where the LE pool entry reservation is insufficient to meet request - * list buffers (due to multiple EP sharing an LE Pool or insufficient LE Pool - * reservation value), then enabling the periodic checking of LE allocations - * can be used to force preemptive transitions to software match mode before - * resources are exhausted or so depleted they are starve software managed - * endpoint. The lpe_stat_2 is set to the number of LE pool entries allocated - * to the LE pool and lpe_stat_1 is the current allocation. Skid is required - * as stats are relative to hardware processing, not software processing of - * the event. - * - * Caller should hold ep_obj->lock. 
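The preemption heuristic described in this comment reduces to a simple threshold: transition to software match mode once current LE usage passes half of the pool. A minimal sketch of that check follows (illustration only, mirroring the lpe_stat_1/lpe_stat_2 comparison in the function below; the helper name is hypothetical).

#include <stdbool.h>
#include <stdint.h>

/* Hypothetical helper mirroring the check below: preempt to software
 * match mode once current LE usage exceeds half of the LE pool. */
static bool le_usage_should_preempt(uint32_t lpe_stat_1, uint32_t lpe_stat_2)
{
	return lpe_stat_1 > (lpe_stat_2 >> 1);
}

/* Example: with a 1024-entry pool, a current allocation of 513 or more
 * triggers the preemptive transition; 512 or fewer does not. */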
- */ -static inline bool -cxip_rxp_check_le_usage_hybrid_preempt(struct cxip_rxc *rxc, - const union c_event *event) -{ - if (event->tgt_long.lpe_stat_1 > (event->tgt_long.lpe_stat_2 >> 1) && - rxc->state == RXC_ENABLED) { - if (cxip_recv_pending_ptlte_disable(rxc, false)) - RXC_WARN(rxc, "Force FC failed\n"); - return true; - } - return false; -} - -static int cxip_rxc_check_ule_hybrid_preempt(struct cxip_rxc *rxc) -{ - int ret; - int count; - - if (cxip_env.rx_match_mode == CXIP_PTLTE_HYBRID_MODE && - cxip_env.hybrid_unexpected_msg_preemptive == 1) { - count = ofi_atomic_get32(&rxc->orx_hw_ule_cnt); - - if (rxc->state == RXC_ENABLED && count > rxc->attr.size) { - ret = cxip_recv_pending_ptlte_disable(rxc, false); - if (ret == FI_SUCCESS) { - RXC_WARN(rxc, - "Transitioning to SW EP due to too many unexpected messages: posted_count=%u request_size=%lu\n", - ret, rxc->attr.size); - } else { - assert(ret == -FI_EAGAIN); - RXC_WARN(rxc, - "Failed to transition to SW EP: %d\n", - ret); - } - - return ret; - } - } - - return FI_SUCCESS; -} - -/* - * cxip_oflow_cb() - Process an Overflow buffer event. - * - * Overflow buffers are used to land unexpected Send data. Link, Unlink - * and Put events are expected from Overflow buffers. However, Link - * events will only be requested when running in hybrid RX match mode - * with FI_CXI_HYBRID_PREEMPTIVE=1. - * - * An Unlink event indicates that buffer space was exhausted. Overflow buffers - * are configured to use locally managed LEs. When enough Puts match in an - * Overflow buffer, consuming its space, the NIC automatically unlinks the LE. - * An automatic Unlink event is generated before the final Put which caused - * buffer space to become exhausted. - * - * An Unlink event is generated by an Unlink command. Overflow buffers are - * manually unlinked in this way during teardown. When an LE is manually - * unlinked the auto_unlinked field in the corresponding event is zero. In this - * case, the request is freed immediately. - * - * A Put event is generated for each Put that matches the Overflow buffer LE. - * This event indicates that data is available in the Overflow buffer. This - * event must be correlated to a Put Overflow event from a user receive buffer - * LE. The Put Overflow event may arrive before or after the Put event. - * - * When each Put event arrives, check for the existence of a previously posted - * receive buffer which generated a matching Put Overflow event. If such a - * buffer exists, copy data from the Overflow buffer to the user receive - * buffer. Otherwise, store a record of the Put event for matching once a user - * posts a new buffer that matches the unexpected Put. - * - * If data will remain in the Overflow buffer, take a reference to it to - * prevent it from being freed. If an Unlink-Put event is detected, drop a - * reference to the Overflow buffer so it is automatically freed once all user - * data is copied out. - */ -static int cxip_oflow_cb(struct cxip_req *req, const union c_event *event) -{ - struct cxip_ptelist_buf *oflow_buf = req->req_ctx; - struct cxip_rxc *rxc = oflow_buf->rxc; - int ret = FI_SUCCESS; - - switch (event->hdr.event_type) { - case C_EVENT_LINK: - /* Success events only used with hybrid preemptive */ - if (cxi_event_rc(event) == C_RC_OK) { - - if (!cxip_env.hybrid_preemptive) - return FI_SUCCESS; - - /* Check for possible hybrid mode preemptive - * transitions to software managed mode. 
- */ - if (cxip_rxp_check_le_usage_hybrid_preempt(rxc, event)) - RXC_WARN(rxc, - "Force preemptive switch to SW EP\n"); - return FI_SUCCESS; - } - - assert(cxi_event_rc(event) == C_RC_NO_SPACE); - - RXC_DBG(rxc, "Oflow LE append failed\n"); - - ret = cxip_recv_pending_ptlte_disable(rxc, true); - if (ret != FI_SUCCESS) - RXC_WARN(rxc, "Force disable failed %d %s\n", - ret, fi_strerror(-ret)); - cxip_ptelist_buf_link_err(oflow_buf, cxi_event_rc(event)); - return ret; - case C_EVENT_UNLINK: - assert(!event->tgt_long.auto_unlinked); - - cxip_ptelist_buf_unlink(oflow_buf); - return FI_SUCCESS; - case C_EVENT_PUT: - /* Put event handling is complicated. Handle below. */ - break; - default: - RXC_FATAL(rxc, CXIP_UNEXPECTED_EVENT, - cxi_event_to_str(event), - cxi_rc_to_str(cxi_event_rc(event))); - } - - ofi_atomic_inc32(&rxc->orx_hw_ule_cnt); - - if (event->tgt_long.auto_unlinked) { - - oflow_buf->unlink_length = event->tgt_long.start - - CXI_VA_TO_IOVA(oflow_buf->md->md, oflow_buf->data) - + event->tgt_long.mlength; - - ofi_atomic_dec32(&oflow_buf->pool->bufs_linked); - - RXC_DBG(rxc, "Oflow auto unlink buf %p, linked %u\n", oflow_buf, - ofi_atomic_get32(&oflow_buf->pool->bufs_linked)); - - /* Replace the eager overflow buffer. */ - cxip_ptelist_buf_replenish(rxc->oflow_list_bufpool, false); - } - - ret = cxip_rxc_check_ule_hybrid_preempt(rxc); - if (ret) - goto err_dec_ule; - - /* Drop all unexpected 0-byte Put events. */ - if (!event->tgt_long.rlength) - return FI_SUCCESS; - - /* Handle Put events */ - ret = cxip_oflow_process_put_event(rxc, req, event); - if (ret) - goto err_dec_ule; - - return FI_SUCCESS; - -err_dec_ule: - ofi_atomic_dec32(&rxc->orx_hw_ule_cnt); - - return ret; -} - -static void report_send_completion(struct cxip_req *req, bool sw_cntr); -static void rdzv_send_req_event(struct cxip_req *req); - -/* - * cxip_rdzv_pte_zbp_cb() - Process zero-byte Put events. - * - * Zero-byte Puts (ZBP) are used to transfer small messages without consuming - * buffers outside of the EQ. ZBPs are currently only used for match complete - * messages. 
- */ -int cxip_rdzv_pte_zbp_cb(struct cxip_req *req, const union c_event *event) -{ - struct cxip_rdzv_pte *rdzv_pte = req->req_ctx; - struct cxip_txc *txc = rdzv_pte->txc; - struct cxip_req *put_req; - union cxip_match_bits mb; - int event_rc = cxi_event_rc(event); - int rdzv_id; - int ret; - - switch (event->hdr.event_type) { - case C_EVENT_LINK: - if (event_rc == C_RC_OK) - ofi_atomic_inc32(&rdzv_pte->le_linked_success_count); - else - ofi_atomic_inc32(&rdzv_pte->le_linked_failure_count); - return FI_SUCCESS; - - case C_EVENT_PUT: - mb.raw = event->tgt_long.match_bits; - - if (mb.rdzv_done) { - rdzv_id = (mb.rdzv_id_hi << CXIP_RDZV_ID_CMD_WIDTH) | - mb.rdzv_id_lo; - put_req = cxip_rdzv_id_lookup(txc, rdzv_id); - if (!put_req) { - TXC_WARN(txc, "Failed to find RDZV ID: %d\n", - rdzv_id); - return FI_SUCCESS; - } - - if (event_rc != C_RC_OK) - TXC_WARN(txc, "RDZV Done error: %p rc: %s\n", - put_req, cxi_rc_to_str(event_rc)); - else - TXC_DBG(txc, "RDZV Done ACK: %p rc: %s\n", - put_req, cxi_rc_to_str(event_rc)); - - put_req->send.rc = event_rc; - rdzv_send_req_event(put_req); - - return FI_SUCCESS; - } - - /* Match complete */ - put_req = cxip_tx_id_lookup(txc, mb.tx_id); - if (!put_req) { - TXC_WARN(txc, "Failed to find TX ID: %d\n", mb.tx_id); - return FI_SUCCESS; - } - - event_rc = cxi_tgt_event_rc(event); - if (event_rc != C_RC_OK) - TXC_WARN(txc, "ZBP error: %p rc: %s\n", put_req, - cxi_rc_to_str(event_rc)); - else - TXC_DBG(txc, "ZBP received: %p rc: %s\n", put_req, - cxi_rc_to_str(event_rc)); - - ret = cxip_send_req_dequeue(put_req->send.txc, put_req); - if (ret != FI_SUCCESS) - return ret; - - cxip_tx_id_free(txc, mb.tx_id); - - /* The unexpected message has been matched. Generate a - * completion event. The ZBP event is guaranteed to arrive - * after the eager Send Ack, so the transfer is always done at - * this point. - * - * If MATCH_COMPLETE was requested, software must manage - * counters. - */ - report_send_completion(put_req, true); - - ofi_atomic_dec32(&put_req->send.txc->otx_reqs); - cxip_evtq_req_free(put_req); - - return FI_SUCCESS; - - default: - TXC_FATAL(txc, CXIP_UNEXPECTED_EVENT, - cxi_event_to_str(event), - cxi_rc_to_str(cxi_event_rc(event))); - } -} - -/* - * cxip_oflow_bufpool_fini() - Finalize overflow buffers used for messaging. - * - * Must be called with the RX PtlTE disabled. - */ -void cxip_oflow_bufpool_fini(struct cxip_rxc *rxc) -{ - struct cxip_deferred_event *def_ev = NULL; - struct cxip_ptelist_buf *oflow_buf; - struct dlist_entry *tmp; - int i; - int def_events = 0; - - /* Clean up unexpected Put records. The PtlTE is disabled, so no more - * events can be expected. - */ - for (i = 0; i < CXIP_DEF_EVENT_HT_BUCKETS; i++) { - dlist_foreach_container_safe(&rxc->deferred_events.bh[i], - struct cxip_deferred_event, - def_ev, rxc_entry, tmp) { - /* Dropping the last reference will cause the - * oflow_buf to be removed from the RXC list and - * freed. 
- */ - oflow_buf = def_ev->req->req_ctx; - - if (oflow_buf->le_type == CXIP_LE_TYPE_RX) - oflow_req_put_bytes(def_ev->req, - def_ev->ev.tgt_long.mlength); - - free_put_event(rxc, def_ev); - def_events++; - } - } - - if (def_events) - RXC_DBG(rxc, "Freed %d deferred event(s)\n", def_events); - - cxip_ptelist_bufpool_fini(rxc->oflow_list_bufpool); -} - -int cxip_oflow_bufpool_init(struct cxip_rxc *rxc) -{ - struct cxip_ptelist_bufpool_attr attr = { - .list_type = C_PTL_LIST_OVERFLOW, - .ptelist_cb = cxip_oflow_cb, - .buf_size = cxip_env.oflow_buf_size, - .min_posted = cxip_env.oflow_buf_min_posted, - .max_posted = cxip_env.oflow_buf_min_posted, /* min == max */ - .max_cached = cxip_env.oflow_buf_max_cached, - .min_space_avail = rxc->max_eager_size, - }; - - return cxip_ptelist_bufpool_init(rxc, &rxc->oflow_list_bufpool, &attr); -} - -/* - * cxip_rdzv_done_notify() - Sends a rendezvous complete from target to source - * - * Sends a zero byte matching notification to the source of rendezvous - * indicating completion of a rendezvous. This is used when restricted get - * DMA (CXIP_RDZV_PROTO_ALT_READ) is used to transfer non-eager data. - */ -static int cxip_rdzv_done_notify(struct cxip_req *req) -{ - struct cxip_rxc *rxc = req->recv.rxc; - union c_fab_addr dfa; - uint32_t pid_idx = CXIP_PTL_IDX_RDZV_DEST; - uint32_t match_id; - struct c_full_dma_cmd cmd = {}; - union cxip_match_bits mb = {}; - int ret; - uint8_t idx_ext; - - mb.rdzv_id_lo = req->recv.rdzv_id; - mb.rdzv_id_hi = req->recv.rdzv_id >> CXIP_RDZV_ID_CMD_WIDTH; - mb.rdzv_done = 1; - mb.le_type = CXIP_LE_TYPE_ZBP; - - cxi_build_dfa(req->recv.rget_nic, req->recv.rget_pid, rxc->pid_bits, - pid_idx, &dfa, &idx_ext); - match_id = CXI_MATCH_ID(rxc->pid_bits, rxc->ep_obj->src_addr.pid, - rxc->ep_obj->src_addr.nic); - - cmd.command.cmd_type = C_CMD_TYPE_DMA; - cmd.command.opcode = C_CMD_PUT; - cmd.index_ext = idx_ext; - cmd.event_send_disable = 1; - cmd.dfa = dfa; - cmd.eq = cxip_evtq_eqn(&rxc->ep_obj->txc.tx_evtq); - cmd.user_ptr = (uint64_t)req; - cmd.initiator = match_id; - cmd.match_bits = mb.raw; - - ret = cxi_cq_emit_dma(rxc->tx_cmdq->dev_cmdq, &cmd); - if (ret != FI_SUCCESS) { - RXC_DBG(rxc, "Faile to write notify IDC: %d %s\n", - ret, fi_strerror(-ret)); - return -FI_EAGAIN; - } - - cxi_cq_ring(rxc->tx_cmdq->dev_cmdq); - - RXC_DBG(rxc, "RDZV done notify send RDZV ID: %d\n", - req->recv.rdzv_id); - - return FI_SUCCESS; -} - -static int cxip_recv_rdzv_cb(struct cxip_req *req, const union c_event *event) -{ - struct cxip_rxc *rxc = req->recv.rxc; - struct cxip_deferred_event *def_ev; - int event_rc; - int ret; - bool matched; - - switch (event->hdr.event_type) { - /* When errors happen, send events can occur before the put/get event. - * These events should just be dropped. - */ - case C_EVENT_SEND: - RXC_WARN(rxc, CXIP_UNEXPECTED_EVENT, - cxi_event_to_str(event), - cxi_rc_to_str(cxi_event_rc(event))); - return FI_SUCCESS; - - case C_EVENT_PUT_OVERFLOW: - /* We matched an unexpected header */ - /* Check for a previously received unexpected Put event, - * if not found defer until it arrives. - */ - def_ev = match_put_event(rxc, req, event, &matched); - if (!def_ev) - return -FI_EAGAIN; - - /* For multi-recv, management of start_offset requires events - * manage_local related events to arrive in order. - * Only C_EVENT_PUT_OVERFLOW events meet this criteria. 
- */ - def_ev->mrecv_start = req->recv.start_offset; - def_ev->mrecv_len = - mrecv_req_put_bytes(req, event->tgt_long.rlength); - - if (req->recv.multi_recv && event->tgt_long.auto_unlinked) { - /* If a C_EVENT_PUT_OVERFLOW unlinks a multi-recv - * buffer, mrecv_start contains the number of bytes - * consumed before this C_EVENT_PUT_OVERFLOW. Adding in - * mrecv_len gets the total bytes consumed. - */ - req->recv.auto_unlinked = true; - req->recv.mrecv_unlink_bytes = - def_ev->mrecv_start + def_ev->mrecv_len; - } - - ofi_atomic_dec32(&rxc->orx_hw_ule_cnt); - - if (!matched) - return FI_SUCCESS; - - RXC_DBG(rxc, "Matched deferred event: %p\n", def_ev); - - ret = cxip_ux_send(req, def_ev->req, &def_ev->ev, - def_ev->mrecv_start, def_ev->mrecv_len, - false); - if (ret == FI_SUCCESS) { - free_put_event(rxc, def_ev); - } else { - /* undo mrecv_req_put_bytes() and orx_hw_ule_cnt dec */ - req->recv.start_offset -= def_ev->mrecv_len; - ofi_atomic_inc32(&rxc->orx_hw_ule_cnt); - } - - return ret; - case C_EVENT_PUT: - /* Eager data was delivered directly to the user buffer. */ - if (req->recv.multi_recv) { - if (event->tgt_long.auto_unlinked) { - uintptr_t mrecv_head; - uintptr_t mrecv_tail; - size_t mrecv_bytes_remaining; - size_t rlen; - - /* For C_EVENT_PUT, need to calculate how much - * of the multi-recv buffer was consumed while - * factoring in any truncation. - */ - mrecv_head = - CXI_IOVA_TO_VA(req->recv.recv_md->md, - event->tgt_long.start); - mrecv_tail = (uintptr_t)req->recv.recv_buf + - req->recv.ulen; - mrecv_bytes_remaining = mrecv_tail - mrecv_head; - rlen = MIN(mrecv_bytes_remaining, - event->tgt_long.rlength); - - req->recv.auto_unlinked = true; - req->recv.mrecv_unlink_bytes = - mrecv_head - - (uintptr_t)req->recv.recv_buf + rlen; - } - - req = rdzv_mrecv_req_event(req, event); - if (!req) - return -FI_EAGAIN; - - /* Set start pointer and data_len using Rendezvous or - * Put Overflow event (depending on if message was - * unexpected). - */ - } - - recv_req_tgt_event(req, event); - - /* Count the rendezvous event. */ - rdzv_recv_req_event(req, event->hdr.event_type); - return FI_SUCCESS; - case C_EVENT_RENDEZVOUS: - if (req->recv.multi_recv) { - req = rdzv_mrecv_req_event(req, event); - if (!req) - return -FI_EAGAIN; - - /* Use Rendezvous event to set start pointer and - * data_len for expected Sends. - */ - struct cxip_req *parent = req->recv.parent; - size_t mrecv_bytes_remaining; - - req->buf = CXI_IOVA_TO_VA( - parent->recv.recv_md->md, - event->tgt_long.start) - - event->tgt_long.mlength; - req->recv.recv_buf = (void *)req->buf; - - mrecv_bytes_remaining = - (uint64_t)parent->recv.recv_buf + - parent->recv.ulen - - (uint64_t)req->recv.recv_buf; - req->data_len = MIN(mrecv_bytes_remaining, - event->tgt_long.rlength); - } else { - req->data_len = MIN(req->recv.ulen, event->tgt_long.rlength); - } - - recv_req_tgt_event(req, event); - - if (!event->tgt_long.get_issued) { - if (ofi_atomic_inc32(&rxc->orx_tx_reqs) > - rxc->max_tx || issue_rdzv_get(req)) { - - /* Could not issue get */ - ofi_atomic_dec32(&rxc->orx_tx_reqs); - - /* Undo multi-recv event processing. */ - if (req->recv.multi_recv && - !req->recv.rdzv_events) { - dlist_remove(&req->recv.children); - cxip_evtq_req_free(req); - } - return -FI_EAGAIN; - } - - RXC_DBG(rxc, "Software issued Get, req: %p\n", req); - } - - /* Count the rendezvous event. */ - rdzv_recv_req_event(req, event->hdr.event_type); - return FI_SUCCESS; - case C_EVENT_REPLY: - /* If mrecv, look up the correct child request. 
*/ - if (req->recv.multi_recv) { - req = rdzv_mrecv_req_event(req, event); - if (!req) - return -FI_EAGAIN; - } - - /* If a rendezvous operation requires a done notification - * send it. Must wait for the ACK from the notify to be returned - * before completing the target operation. - */ - if (req->recv.done_notify) { - if (ofi_atomic_inc32(&rxc->orx_tx_reqs) > rxc->max_tx || - cxip_rdzv_done_notify(req)) { - - /* Could not issue notify, will be retried */ - ofi_atomic_dec32(&rxc->orx_tx_reqs); - return -FI_EAGAIN; - } - } - - /* Rendezvous Get completed, update event counts and - * complete if using unrestricted get protocol. - */ - req->recv.rc = cxi_init_event_rc(event); - rdzv_recv_req_event(req, event->hdr.event_type); - - /* If RGet initiated by software return the TX credit */ - if (!event->init_short.rendezvous) { - ofi_atomic_dec32(&rxc->orx_tx_reqs); - assert(ofi_atomic_get32(&rxc->orx_tx_reqs) >= 0); - } - - return FI_SUCCESS; - - case C_EVENT_ACK: - event_rc = cxi_init_event_rc(event); - if (event_rc != C_RC_OK) - RXC_WARN(rxc, "%#x:%u Bad RDZV notify ACK status %s\n", - req->recv.rget_nic, req->recv.rget_pid, - cxi_rc_to_str(event_rc)); - - /* Special case of the ZBP destination EQ being full and ZBP - * could not complete. This must be retried, we use the TX - * credit already allocated. - */ - if (event_rc == C_RC_ENTRY_NOT_FOUND) { - usleep(CXIP_DONE_NOTIFY_RETRY_DELAY_US); - - if (cxip_rdzv_done_notify(req)) - return -FI_EAGAIN; - - return FI_SUCCESS; - } - - /* Reflect the completion status of the ACK in the target - * side completion so that a failure will not go undetected. - */ - req->recv.rc = event_rc; - ofi_atomic_dec32(&req->recv.rxc->orx_tx_reqs); - rdzv_recv_req_event(req, event->hdr.event_type); - - return FI_SUCCESS; - - default: - RXC_FATAL(rxc, CXIP_UNEXPECTED_EVENT, - cxi_event_to_str(event), - cxi_rc_to_str(cxi_event_rc(event))); - } -} - -static void cxip_rxc_record_req_stat(struct cxip_rxc *rxc, enum c_ptl_list list, - size_t rlength, struct cxip_req *req) -{ - enum fi_hmem_iface iface = rlength ? req->recv.recv_md->info.iface : FI_HMEM_SYSTEM; - - cxip_msg_counters_msg_record(&rxc->cntrs, list, iface, rlength); -} - -/* - * cxip_recv_cb() - Process a user receive buffer event. - * - * A user receive buffer is described by an LE linked to the Priority list. - * Link, Unlink, Put, Put Overflow, and Reply events are expected from a user - * receive buffer. - * - * A Link event indicates that a new user buffer has been linked to the - * priority list. Successful Link events may be suppressed. - * - * An Unlink event indicates that a user buffer has been unlinked. Normally, a - * receive is used once and unlinked when it is matched with a Send. In this - * case, a successful Unlink event may be suppressed. - * - * For expected, eager Sends, a Put will be matched to a user receive buffer by - * the NIC. Send data is copied directly to the user buffer. A Put event is - * generated describing the match. - * - * For unexpected, eager Sends, a Put will first match a buffer in the Overflow - * list. See cxip_oflow_cb() for details on Overflow event handling. Once a - * matching user receive buffer is appended to the Priority list, a Put - * Overflow event is generated. Put and Put Overflow events for an unexpected, - * eager Send must be correlated. These events may arrive in any order. Once - * both events are accounted, data is copied from the Overflow buffer to the - * user receive buffer. 
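The Put / Put Overflow correlation described above follows a common two-event pattern: whichever event arrives first leaves a record keyed by the unexpected Send, and the arrival of the second event completes the match and allows the copy out of the Overflow buffer. The following sketch is illustration only (hypothetical types, no hashing, locking, or multi-recv handling), not the provider's deferred-event code.

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

/* Hypothetical record of whichever event arrived first. */
struct pending_match {
	bool     valid;
	uint64_t match_bits;   /* identifies the unexpected Send */
	size_t   oflow_bytes;  /* bytes landed in the Overflow buffer */
};

/* Called once for the Put event (Overflow buffer side) and once for the
 * Put Overflow event (user buffer side), in either order. Returns true
 * when both halves have been seen and the copy may proceed. */
static bool correlate(struct pending_match *slot, uint64_t match_bits,
		      size_t bytes)
{
	if (!slot->valid) {
		/* First event: remember it and wait for its partner. */
		slot->valid = true;
		slot->match_bits = match_bits;
		slot->oflow_bytes = bytes;
		return false;
	}

	/* Second event: the pair is complete. */
	slot->valid = false;
	return slot->match_bits == match_bits;
}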
- * - * Unexpected, eager Sends that are longer than the eager threshold have their - * data truncated to zero. This is to avoid long messages consuming too much - * Overflow buffer space at the target. Once a match is made with a user - * receive buffer, data is re-read from the initiator using a Get. - * - * Rendezvous receive events are handled by cxip_recv_rdzv_cb(). - */ -static int cxip_recv_cb(struct cxip_req *req, const union c_event *event) -{ - int ret; - struct cxip_rxc *rxc = req->recv.rxc; - struct cxip_deferred_event *def_ev; - bool rdzv = false; - bool matched; - - /* Common processing for rendezvous and non-rendezvous events. - * TODO: Avoid having two switch statements for event_type. - */ - switch (event->hdr.event_type) { - case C_EVENT_LINK: - /* In cases where the LE pool entry reservation is insufficient - * to meet priority list buffers (due to multiple EP sharing an - * LE Pool or insufficient LE Pool reservation value), then - * enabling the periodic checking of LE allocations can be - * used to force preemptive transitions to software match mode. - */ - if (cxi_tgt_event_rc(event) == C_RC_OK) { - - if (!cxip_env.hybrid_recv_preemptive) - return FI_SUCCESS; - - /* Check for possible hybrid mode preemptive - * transitions to software managed mode. - */ - if (cxip_rxp_check_le_usage_hybrid_preempt(rxc, event)) - RXC_WARN(rxc, - "Force preemptive switch to SW EP\n"); - - return FI_SUCCESS; - } - - /* If endpoint has been disabled and an append fails, free the - * user request without reporting any event. - */ - if (rxc->state == RXC_DISABLED) { - cxip_recv_req_free(req); - return FI_SUCCESS; - } - - /* Save append to repost, NIC will initiate transition to - * software managed EP. - */ - if (cxi_tgt_event_rc(event) == C_RC_PTLTE_SW_MANAGED) { - RXC_WARN(rxc, "Append err, transitioning to SW\n"); - cxip_recv_req_dropped(req); - - return FI_SUCCESS; - } - - /* Transition into onload and flow control if an append - * fails. - */ - if (cxi_tgt_event_rc(event) != C_RC_NO_SPACE) - RXC_FATAL(rxc, CXIP_UNEXPECTED_EVENT_STS, - cxi_event_to_str(event), - cxi_rc_to_str(cxi_tgt_event_rc(event))); - - RXC_WARN(rxc, "Append err, priority LE exhaustion\n"); - - /* Manually transition to DISABLED to initiate flow control - * and onload instead of waiting for eventual NIC no match - * transition. - */ - ret = cxip_recv_pending_ptlte_disable(rxc, true); - if (ret != FI_SUCCESS) - RXC_WARN(rxc, "Force disable failed %d %s\n", - ret, fi_strerror(-ret)); - - ret = FI_SUCCESS; - cxip_recv_req_dropped(req); - - return ret; - - case C_EVENT_UNLINK: - assert(!event->tgt_long.auto_unlinked); - - /* TODO: This is broken with multi-recv. The multi-recv request - * may be freed with pending child requests. - */ - req->recv.unlinked = true; - recv_req_report(req); - cxip_recv_req_free(req); - - return FI_SUCCESS; - - case C_EVENT_PUT_OVERFLOW: - cxip_rxc_record_req_stat(rxc, C_PTL_LIST_OVERFLOW, - event->tgt_long.rlength, req); - - /* ULE freed. Update RXC state to signal that the RXC should - * be reenabled. - */ - /* TODO: this is not atomic, there must be a better way */ - if (rxc->state == RXC_ONLOAD_FLOW_CONTROL) - rxc->state = RXC_ONLOAD_FLOW_CONTROL_REENABLE; - break; - - case C_EVENT_PUT: - cxip_rxc_record_req_stat(rxc, C_PTL_LIST_PRIORITY, - event->tgt_long.rlength, req); - break; - default: - break; - } - - /* All events related to an offloaded rendezvous receive will be - * handled by cxip_recv_rdzv_cb(). Those events are identified by the - * event rendezvous field. 
Two exceptions are a Reply event generated - * from a SW-issued Get, and a Ack for a software done notification - * when using restricted eager get. When such an event is generated, - * the request will have already processed a Rendezvous event. If the - * rendezvous field is not set, but the rdzv_events count is elevated, - * this must be a SW-issued Reply or Ack event. - */ - if (event->hdr.event_type == C_EVENT_REPLY || - event->hdr.event_type == C_EVENT_ACK) - rdzv = (event->init_short.rendezvous || req->recv.rdzv_events); - else - rdzv = event->tgt_long.rendezvous; - - if (rdzv) - return cxip_recv_rdzv_cb(req, event); - - switch (event->hdr.event_type) { - case C_EVENT_SEND: - /* TODO Handle Send event errors. */ - assert(cxi_event_rc(event) == C_RC_OK); - return FI_SUCCESS; - case C_EVENT_PUT_OVERFLOW: - /* We matched an unexpected header */ - /* Unexpected 0-byte Put events are dropped. Skip matching. */ - if (!event->tgt_long.rlength) { - ret = cxip_ux_send_zb(req, event, - req->recv.start_offset, false); - if (ret == FI_SUCCESS) - ofi_atomic_dec32(&rxc->orx_hw_ule_cnt); - - return ret; - } - - /* Check for a previously received unexpected Put event, - * if not found defer until it arrives. - */ - def_ev = match_put_event(rxc, req, event, &matched); - if (!def_ev) - return -FI_EAGAIN; - - /* For multi-recv, management of start_offset requires events - * manage_local related events to arrive in order. - * Only C_EVENT_PUT_OVERFLOW events meet this criteria. - */ - def_ev->mrecv_start = req->recv.start_offset; - def_ev->mrecv_len = - mrecv_req_put_bytes(req, event->tgt_long.rlength); - - if (req->recv.multi_recv && event->tgt_long.auto_unlinked) { - /* If a C_EVENT_PUT_OVERFLOW unlinks a multi-recv - * buffer, mrecv_start contains the number of bytes - * consumed before this C_EVENT_PUT_OVERFLOW. Adding in - * mrecv_len gets the total bytes consumed. - */ - req->recv.auto_unlinked = true; - req->recv.mrecv_unlink_bytes = - def_ev->mrecv_start + def_ev->mrecv_len; - } - - ofi_atomic_dec32(&rxc->orx_hw_ule_cnt); - - if (!matched) - return FI_SUCCESS; - - ret = cxip_ux_send(req, def_ev->req, &def_ev->ev, - def_ev->mrecv_start, def_ev->mrecv_len, - false); - if (ret == FI_SUCCESS) { - free_put_event(rxc, def_ev); - } else { - /* undo mrecv_req_put_bytes() and orx_hw_ule_cnt dec */ - req->recv.start_offset -= def_ev->mrecv_len; - ofi_atomic_inc32(&rxc->orx_hw_ule_cnt); - } - - return ret; - case C_EVENT_PUT: - /* Data was delivered directly to the user buffer. Complete the - * request. - */ - if (req->recv.multi_recv) { - if (event->tgt_long.auto_unlinked) { - uintptr_t mrecv_head; - - /* For C_EVENT_PUT, need to calculate how much - * of the multi-recv buffer was consumed while - * factoring in any truncation. - */ - mrecv_head = - CXI_IOVA_TO_VA(req->recv.recv_md->md, - event->tgt_long.start); - - req->recv.auto_unlinked = true; - req->recv.mrecv_unlink_bytes = - mrecv_head - - (uintptr_t)req->recv.recv_buf + - event->tgt_long.mlength; - } - - req = mrecv_req_dup(req); - if (!req) - return -FI_EAGAIN; - recv_req_tgt_event(req, event); - - req->buf = (uint64_t)(CXI_IOVA_TO_VA( - req->recv.recv_md->md, - event->tgt_long.start)); - req->data_len = event->tgt_long.mlength; - - recv_req_report(req); - cxip_evtq_req_free(req); - } else { - req->data_len = event->tgt_long.mlength; - recv_req_tgt_event(req, event); - recv_req_report(req); - cxip_recv_req_free(req); - } - return FI_SUCCESS; - - case C_EVENT_REPLY: - /* Long-send Get completed. Complete the request. 
*/ - req->recv.rc = cxi_init_event_rc(event); - - recv_req_report(req); - if (req->recv.multi_recv) - cxip_evtq_req_free(req); - else - cxip_recv_req_free(req); - - return FI_SUCCESS; - default: - RXC_FATAL(rxc, CXIP_UNEXPECTED_EVENT, - cxi_event_to_str(event), - cxi_rc_to_str(cxi_event_rc(event))); - } -} - -/* - * cxip_recv_cancel() - Cancel outstanding receive request. - */ -int cxip_recv_cancel(struct cxip_req *req) -{ - int ret = FI_SUCCESS; - struct cxip_rxc *rxc = req->recv.rxc; - - /* In hybrid mode requests could be on priority list - * or software receive list. - */ - if (req->recv.software_list) { - dlist_remove_init(&req->recv.rxc_entry); - req->recv.canceled = true; - req->recv.unlinked = true; - recv_req_report(req); - cxip_recv_req_free(req); - } else { - ret = cxip_pte_unlink(rxc->rx_pte, C_PTL_LIST_PRIORITY, - req->req_id, rxc->rx_cmdq); - if (ret == FI_SUCCESS) - req->recv.canceled = true; - } - return ret; -} - -/* - * cxip_recv_reenable() - Attempt to re-enable the RX queue. - * - * Called by disabled EP ready to re-enable. - * - * Determine if the RX queue can be re-enabled and perform a state change - * command if necessary. The Endpoint must receive dropped Send notifications - * from all peers who experienced drops before re-enabling the RX queue. - * - * Caller must hold ep_obj->lock. - */ -int cxip_recv_reenable(struct cxip_rxc *rxc) -{ - struct cxi_pte_status pte_status = {}; - int ret __attribute__((unused)); - - if (rxc->drop_count == -1) { - RXC_WARN(rxc, "Waiting for pending FC_NOTIFY messages\n"); - return -FI_EAGAIN; - } - - ret = cxil_pte_status(rxc->rx_pte->pte, &pte_status); - assert(!ret); - - if (rxc->drop_count != pte_status.drop_count) { - RXC_DBG(rxc, "Processed %d/%d drops\n", - rxc->drop_count, pte_status.drop_count); - return -FI_EAGAIN; - } - - RXC_WARN(rxc, "Re-enabling PTE, drop_count %d\n", - rxc->drop_count); - - do { - ret = cxip_rxc_msg_enable(rxc, rxc->drop_count); - if (ret == -FI_EAGAIN && - rxc->new_state == RXC_ENABLED_SOFTWARE) { - RXC_WARN(rxc, - "PTE disable->sm drop mismatch, will retry\n"); - break; - } - } while (ret == -FI_EAGAIN); - - if (ret != FI_SUCCESS && ret != -FI_EAGAIN) - RXC_FATAL(rxc, "cxip_rxc_msg_enable failed: %d\n", ret); - - return ret; -} - -/* - * cxip_fc_resume_cb() - Process FC resume completion events. - */ -int cxip_fc_resume_cb(struct cxip_ctrl_req *req, const union c_event *event) -{ - struct cxip_fc_drops *fc_drops = container_of(req, - struct cxip_fc_drops, req); - struct cxip_rxc *rxc = fc_drops->rxc; - int ret = FI_SUCCESS; - - switch (event->hdr.event_type) { - case C_EVENT_ACK: - switch (cxi_event_rc(event)) { - case C_RC_OK: - RXC_DBG(rxc, - "FC_RESUME to %#x:%u successfully sent: retry_count=%u\n", - fc_drops->nic_addr, fc_drops->pid, - fc_drops->retry_count); - free(fc_drops); - break; - - /* This error occurs when the target's control event queue has - * run out of space. Since the target should be processing the - * event queue, it is safe to replay messages until C_RC_OK is - * returned. 
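The replay behavior described in this comment can be reduced to a small loop: on C_RC_ENTRY_NOT_FOUND, delay briefly and resend the side-band resume message until the peer returns C_RC_OK. The sketch below is illustration only; it stubs the send with a counter rather than reusing the queued control request and cxip_env.fc_retry_usec_delay as the real code does.

#include <stdio.h>
#include <unistd.h>

/* Hypothetical return codes standing in for C_RC_OK and C_RC_ENTRY_NOT_FOUND. */
enum resume_rc { RESUME_OK, RESUME_EQ_FULL };

/* Stub: pretend the peer's control event queue is full for the first two
 * attempts, then accepts the resume message. */
static enum resume_rc resume_send_once(int *attempts)
{
	return ++(*attempts) < 3 ? RESUME_EQ_FULL : RESUME_OK;
}

int main(void)
{
	int attempts = 0;

	/* Replay until the peer acknowledges; the short delay gives the
	 * target time to drain its control event queue. */
	while (resume_send_once(&attempts) == RESUME_EQ_FULL)
		usleep(1000);

	printf("resume accepted after %d attempt(s)\n", attempts);
	return 0;
}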
- */ - case C_RC_ENTRY_NOT_FOUND: - fc_drops->retry_count++; - RXC_WARN(rxc, - "%#x:%u dropped FC message: retry_delay_usecs=%d retry_count=%u\n", - fc_drops->nic_addr, fc_drops->pid, - cxip_env.fc_retry_usec_delay, - fc_drops->retry_count); - usleep(cxip_env.fc_retry_usec_delay); - ret = cxip_ctrl_msg_send(req); - break; - default: - RXC_FATAL(rxc, CXIP_UNEXPECTED_EVENT_STS, - cxi_event_to_str(event), - cxi_rc_to_str(cxi_event_rc(event))); - } - break; - default: - RXC_FATAL(rxc, CXIP_UNEXPECTED_EVENT, - cxi_event_to_str(event), - cxi_rc_to_str(cxi_event_rc(event))); - } - - return ret; -} - -/* - * cxip_fc_process_drops() - Process a dropped Send notification from a peer. - * - * Called by disabled EP waiting to re-enable. - * - * When a peer detects dropped Sends it follows up by sending a message to the - * disabled Endpoint indicating the number of drops experienced. The disabled - * Endpoint peer must count all drops before re-enabling its RX queue. - */ -int cxip_fc_process_drops(struct cxip_ep_obj *ep_obj, uint32_t nic_addr, - uint32_t pid, uint16_t drops) -{ - struct cxip_rxc *rxc = &ep_obj->rxc; - struct cxip_fc_drops *fc_drops; - int ret __attribute__((unused)); - - fc_drops = calloc(1, sizeof(*fc_drops)); - if (!fc_drops) { - RXC_WARN(rxc, "Failed to allocate drops\n"); - return -FI_ENOMEM; - } - - /* TODO: Cleanup cxip_fc_drops fields. Many of the fields are redundant - * with the req structure. - */ - fc_drops->rxc = rxc; - fc_drops->nic_addr = nic_addr; - fc_drops->pid = pid; - fc_drops->drops = drops; - - fc_drops->req.send.nic_addr = nic_addr; - fc_drops->req.send.pid = pid; - fc_drops->req.send.mb.drops = drops; - - fc_drops->req.send.mb.ctrl_le_type = CXIP_CTRL_LE_TYPE_CTRL_MSG; - fc_drops->req.send.mb.ctrl_msg_type = CXIP_CTRL_MSG_FC_RESUME; - fc_drops->req.cb = cxip_fc_resume_cb; - fc_drops->req.ep_obj = rxc->ep_obj; - - dlist_insert_tail(&fc_drops->rxc_entry, &rxc->fc_drops); - - RXC_DBG(rxc, "Processed drops: %d NIC: %#x PID: %d\n", - drops, nic_addr, pid); - - rxc->drop_count += drops; - - /* Wait until search and delete completes before attempting to - * re-enable. - */ - if (rxc->state == RXC_FLOW_CONTROL) { - ret = cxip_recv_reenable(rxc); - assert(ret == FI_SUCCESS || ret == -FI_EAGAIN); - - /* Disable to software managed transition is synchronous - * in order to handle drop count mismatches correctly. If - * successful the H/W transition completed, otherwise it - * will be retried when notified and count matches. - */ - if (rxc->new_state == RXC_ENABLED_SOFTWARE && - ret == FI_SUCCESS) { - cxip_fc_progress_ctrl(rxc); - rxc->state = RXC_ENABLED_SOFTWARE; - RXC_WARN(rxc, "Now in RXC_ENABLED_SOFTWARE\n"); - } - } - - return FI_SUCCESS; -} - -/* - * cxip_recv_replay() - Replay dropped Receive requests. - * - * When no LE is available while processing an Append command, the command is - * dropped and future appends are disabled. After all outstanding commands are - * dropped and resources are recovered, replayed all Receive requests in order. - * - * Caller must hold ep_obj->lock. - */ -static int cxip_recv_replay(struct cxip_rxc *rxc) -{ - struct cxip_req *req; - struct dlist_entry *tmp; - bool restart_seq = true; - int ret; - - dlist_foreach_container_safe(&rxc->replay_queue, - struct cxip_req, req, - recv.rxc_entry, tmp) { - dlist_remove_init(&req->recv.rxc_entry); - - /* Since the RXC and PtlTE are in a controlled state and no new - * user receives are being posted, it is safe to ignore the RXC - * state when replaying failed user posted receives. 
- */ - ret = cxip_recv_req_queue(req, restart_seq); - - /* Match made in software? */ - if (ret == -FI_EALREADY) - continue; - - /* TODO: Low memory or full CQ during SW matching would cause - * -FI_EAGAIN to be seen here. - */ - assert(ret == FI_SUCCESS); - - restart_seq = false; - } - - return FI_SUCCESS; -} - -/* - * cxip_recv_resume() - Send a resume message to all peers who reported dropped - * Sends. - * - * Called by disabled EP after re-enable. - * - * After counting all dropped sends targeting a disabled RX queue and - * re-enabling the queue, notify all peers who experienced dropped Sends so - * they can be replayed. - * - * Caller must hold ep_obj->lock. - */ -int cxip_recv_resume(struct cxip_rxc *rxc) -{ - struct cxip_fc_drops *fc_drops; - struct dlist_entry *tmp; - int ret; - - dlist_foreach_container_safe(&rxc->fc_drops, - struct cxip_fc_drops, fc_drops, - rxc_entry, tmp) { - ret = cxip_ctrl_msg_send(&fc_drops->req); - if (ret) - return ret; - - dlist_remove(&fc_drops->rxc_entry); - } - - return FI_SUCCESS; -} - -/* - * cxip_fc_progress_ctrl() - Progress the control EP until all resume - * control messages can be queued. - * - * Caller must hold ep_obj->lock. - */ -static void cxip_fc_progress_ctrl(struct cxip_rxc *rxc) -{ - int ret __attribute__((unused)); - - assert(rxc->state == RXC_FLOW_CONTROL); - - /* Successful transition from disabled occurred, reset - * drop count. - */ - rxc->drop_count = rxc->ep_obj->asic_ver < CASSINI_2_0 ? -1 : 0; - - while ((ret = cxip_recv_resume(rxc)) == -FI_EAGAIN) - cxip_ep_tx_ctrl_progress_locked(rxc->ep_obj); - - assert(ret == FI_SUCCESS); -} - -/* - * cxip_post_ux_onload_sw() - Nic HW-to-SW EP post UX onload processing. - * - * PTE transitioned from enabled to software managed. Onloading - * was done and appends that failed need to be replayed. - */ -static void cxip_post_ux_onload_sw(struct cxip_rxc *rxc) -{ - int ret; - - assert(cxip_env.rx_match_mode == CXIP_PTLTE_HYBRID_MODE); - assert(rxc->prev_state == RXC_ENABLED); - assert(rxc->new_state == RXC_ENABLED_SOFTWARE); - - ret = cxip_ptelist_buf_replenish(rxc->req_list_bufpool, - true); - if (ret != FI_SUCCESS) - RXC_WARN(rxc, "Request list replenish failed %d %s\n", - ret, fi_strerror(-ret)); - - /* Priority list appends that failed during the transition can - * now be replayed. - */ - ret = cxip_recv_replay(rxc); - assert(ret == FI_SUCCESS || ret == -FI_EAGAIN); - - if (rxc->state == RXC_PENDING_PTLTE_SOFTWARE_MANAGED) { - /* Transition from enabled to software managed is complete. - * Allow posting of receive operations. - */ - RXC_WARN(rxc, "Now in RXC_ENABLED_SOFTWARE\n"); - rxc->state = RXC_ENABLED_SOFTWARE; - } -} - -/* - * cxip_post_ux_onload_fc() - Flow control onload complete processing. - * - * PTE transitioned to disabled and UX onload has completed. - */ -static void cxip_post_ux_onload_fc(struct cxip_rxc *rxc) -{ - int ret; - - /* Disable RX matching offload if transitioning to - * software enabled EP. - */ - if (rxc->new_state == RXC_ENABLED_SOFTWARE) { - RXC_DBG(rxc, "Transitioning to SW EP\n"); - rxc->msg_offload = 0; - } - - if (rxc->fc_reason == C_SC_FC_EQ_FULL) - goto replay; - - if (rxc->new_state == RXC_ENABLED_SOFTWARE) - ret = cxip_ptelist_buf_replenish(rxc->req_list_bufpool, - true); - else - ret = cxip_ptelist_buf_replenish(rxc->oflow_list_bufpool, - true); - if (ret != FI_SUCCESS) - RXC_WARN(rxc, "%s buffer replenish failed %d %s\n", - rxc->new_state == RXC_ENABLED_SOFTWARE ? 
- "Request" : "Overflow", ret, fi_strerror(-ret)); - -replay: - /* Any priority list appends that failed during the transition - * can now be replayed. - */ - if (rxc->new_state == RXC_ENABLED) - rxc->msg_offload = 1; - - ret = cxip_recv_replay(rxc); - RXC_DBG(rxc, "Replay of failed receives ret: %d %s\n", - ret, fi_strerror(-ret)); - assert(ret == FI_SUCCESS || ret == -FI_EAGAIN); - - if (rxc->state != RXC_ONLOAD_FLOW_CONTROL_REENABLE && - rxc->new_state != RXC_ENABLED_SOFTWARE) - RXC_FATAL(rxc, FC_SW_ONLOAD_MSG_FATAL); - - rxc->state = RXC_FLOW_CONTROL; - ret = cxip_recv_reenable(rxc); - assert(ret == FI_SUCCESS || ret == -FI_EAGAIN); - RXC_WARN(rxc, "Now in RXC_FLOW_CONTROL\n"); - - /* Disable to software managed transition is synchronous in order to - * handle drop count mismatches correctly. If successful the H/W - * transition completed, otherwise the transition will occur when - * additional drop notifies are received. - */ - if (rxc->new_state == RXC_ENABLED_SOFTWARE && ret == FI_SUCCESS) { - cxip_fc_progress_ctrl(rxc); - rxc->state = RXC_ENABLED_SOFTWARE; - RXC_WARN(rxc, "Now in RXC_ENABLED_SOFTWARE\n"); - } -} - -/* - * cxip_ux_onload_complete() - Unexpected list entry onload complete. - * - * All unexpected message headers have been onloaded from hardware. - */ -static void cxip_ux_onload_complete(struct cxip_req *req) -{ - struct cxip_rxc *rxc = req->search.rxc; - - assert(rxc->state == RXC_ONLOAD_FLOW_CONTROL_REENABLE || - rxc->state == RXC_PENDING_PTLTE_SOFTWARE_MANAGED); - - free(rxc->ule_offsets); - rxc->ule_offsets = 0; - - /* During a transition to software managed PtlTE, received - * request list entries resulting from hardware not matching - * the priority list on an incoming packet were added to a - * pending unexpected message list. We merge the two - * expected list here. - */ - RXC_DBG(rxc, "Req pending %d UX entries, SW list %d UX entries\n", - rxc->sw_pending_ux_list_len, rxc->sw_ux_list_len); - - dlist_splice_tail(&rxc->sw_ux_list, &rxc->sw_pending_ux_list); - rxc->sw_ux_list_len += rxc->sw_pending_ux_list_len; - rxc->sw_pending_ux_list_len = 0; - - RXC_WARN(rxc, "Software UX list updated, %d SW UX entries\n", - rxc->sw_ux_list_len); - - if (rxc->state == RXC_PENDING_PTLTE_SOFTWARE_MANAGED) - cxip_post_ux_onload_sw(rxc); - else - cxip_post_ux_onload_fc(rxc); - - ofi_atomic_dec32(&rxc->orx_reqs); - cxip_evtq_req_free(req); -} - -/* - * cxip_get_ule_offsets() - Initialize an in-order array of ULE offsets - * - * If snapshot is requested, no more than two passes at getting offsets - * will be made. This is intended to be used with FI_CLAIM processing, - * where the PtlTE is enabled. - */ -static int cxip_get_ule_offsets(struct cxip_rxc *rxc, uint64_t **ule_offsets, - unsigned int *num_ule_offsets, bool snapshot) -{ - struct cxi_pte_status pte_status = { - .ule_count = 512 - }; - size_t cur_ule_count = 0; - int ret; - int calls = 0; - - /* Get all the unexpected header remote offsets. 
*/ - *ule_offsets = NULL; - *num_ule_offsets = 0; - - do { - cur_ule_count = pte_status.ule_count; - *ule_offsets = reallocarray(*ule_offsets, cur_ule_count, - sizeof(*ule_offsets)); - if (*ule_offsets == NULL) { - RXC_WARN(rxc, "Failed allocate ule offset memory\n"); - ret = -FI_ENOMEM; - goto err; - } - - pte_status.ule_offsets = (void *)*ule_offsets; - ret = cxil_pte_status(rxc->rx_pte->pte, &pte_status); - assert(!ret); - } while (cur_ule_count < pte_status.ule_count && - !(snapshot && ++calls > 1)); - - *num_ule_offsets = pte_status.ule_count; - - return FI_SUCCESS; -err: - free(*ule_offsets); - - return ret; -} - -/* - * cxip_ux_onload_cb() - Process SEARCH_AND_DELETE command events. - */ -static int cxip_ux_onload_cb(struct cxip_req *req, const union c_event *event) -{ - struct cxip_rxc *rxc = req->search.rxc; - struct cxip_deferred_event *def_ev; - struct cxip_ux_send *ux_send; - bool matched; - - assert(rxc->state == RXC_ONLOAD_FLOW_CONTROL || - rxc->state == RXC_ONLOAD_FLOW_CONTROL_REENABLE || - rxc->state == RXC_PENDING_PTLTE_SOFTWARE_MANAGED); - - switch (event->hdr.event_type) { - case C_EVENT_PUT_OVERFLOW: - assert(cxi_event_rc(event) == C_RC_OK); - - ux_send = calloc(1, sizeof(*ux_send)); - if (!ux_send) { - RXC_WARN(rxc, "Failed allocate to memory\n"); - return -FI_EAGAIN; - } - - /* Zero-byte unexpected onloads require special handling since - * no deferred structure would be allocated. - */ - if (event->tgt_long.rlength) { - - def_ev = match_put_event(rxc, req, event, &matched); - if (!matched) { - if (!def_ev) { - free(ux_send); - return -FI_EAGAIN; - } - - /* Gather Put events later */ - def_ev->ux_send = ux_send; - req->search.puts_pending++; - } else { - ux_send->req = def_ev->req; - ux_send->put_ev = def_ev->ev; - - free_put_event(rxc, def_ev); - } - } else { - ux_send->put_ev = *event; - } - - /* For flow control transition if a ULE is freed, then - * set state so that re-enable will be attempted. - */ - if (rxc->state == RXC_ONLOAD_FLOW_CONTROL) - rxc->state = RXC_ONLOAD_FLOW_CONTROL_REENABLE; - - /* Fixup event with the expected remote offset for an RGet. */ - if (event->tgt_long.rlength) { - ux_send->put_ev.tgt_long.remote_offset = - rxc->ule_offsets[rxc->cur_ule_offsets] + - event->tgt_long.mlength; - } - rxc->cur_ule_offsets++; - - dlist_insert_tail(&ux_send->rxc_entry, &rxc->sw_ux_list); - rxc->sw_ux_list_len++; - - RXC_DBG(rxc, "Onloaded Send: %p\n", ux_send); - - ofi_atomic_dec32(&rxc->orx_hw_ule_cnt); - - break; - case C_EVENT_SEARCH: - if (rxc->new_state == RXC_ENABLED_SOFTWARE && - rxc->state == RXC_ONLOAD_FLOW_CONTROL) - rxc->state = RXC_ONLOAD_FLOW_CONTROL_REENABLE; - - if (rxc->state == RXC_ONLOAD_FLOW_CONTROL) - RXC_FATAL(rxc, FC_SW_ONLOAD_MSG_FATAL); - - req->search.complete = true; - rxc->rx_evtq.ack_batch_size = rxc->rx_evtq.cq->ack_batch_size; - - RXC_DBG(rxc, "UX Onload Search done\n"); - - if (cxip_ux_is_onload_complete(req)) - cxip_ux_onload_complete(req); - - break; - default: - RXC_FATAL(rxc, CXIP_UNEXPECTED_EVENT, - cxi_event_to_str(event), - cxi_rc_to_str(cxi_event_rc(event))); - } - - return FI_SUCCESS; -} - -/* - * cxip_ux_onload() - Issue SEARCH_AND_DELETE command to on-load unexpected - * Send headers queued on the RXC message queue. - * - * Caller must hold ep_obj->lock. 
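One detail of the onload path above worth calling out: the remote offset recorded for a later software RGet is the onloaded ULE offset plus mlength, so the Get starts just past the eager bytes that already landed at the target. A minimal arithmetic sketch (hypothetical helper name, illustration only):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical helper: the software RGet must start reading just past the
 * eager bytes (mlength) that were already delivered to the target. */
static uint64_t rget_remote_offset(uint64_t ule_offset, uint64_t mlength)
{
	return ule_offset + mlength;
}

int main(void)
{
	/* e.g. a ULE at remote offset 0x1000 with 2048 eager bytes taken:
	 * the rendezvous Get begins at 0x1800. */
	printf("0x%" PRIx64 "\n", rget_remote_offset(0x1000, 2048));
	return 0;
}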
- */ -static int cxip_ux_onload(struct cxip_rxc *rxc) -{ - struct cxip_req *req; - union c_cmdu cmd = {}; - int ret; - - assert(rxc->state == RXC_ONLOAD_FLOW_CONTROL || - rxc->state == RXC_ONLOAD_FLOW_CONTROL_REENABLE || - rxc->state == RXC_PENDING_PTLTE_SOFTWARE_MANAGED); - - RXC_DBG(rxc, "Initiate hardware UX list onload\n"); - - /* Get all the unexpected header remote offsets. */ - rxc->ule_offsets = NULL; - rxc->num_ule_offsets = 0; - rxc->cur_ule_offsets = 0; - - ret = cxip_get_ule_offsets(rxc, &rxc->ule_offsets, - &rxc->num_ule_offsets, false); - if (ret) { - RXC_WARN(rxc, "Failed to read UX remote offsets: %d %s\n", - ret, fi_strerror(-ret)); - goto err; - } - - /* Populate request */ - req = cxip_evtq_req_alloc(&rxc->rx_evtq, 1, NULL); - if (!req) { - RXC_DBG(rxc, "Failed to allocate request\n"); - ret = -FI_EAGAIN; - goto err_free_onload_offset; - } - ofi_atomic_inc32(&rxc->orx_reqs); - - req->cb = cxip_ux_onload_cb; - req->type = CXIP_REQ_SEARCH; - req->search.rxc = rxc; - - cmd.command.opcode = C_CMD_TGT_SEARCH_AND_DELETE; - cmd.target.ptl_list = C_PTL_LIST_UNEXPECTED; - cmd.target.ptlte_index = rxc->rx_pte->pte->ptn; - cmd.target.buffer_id = req->req_id; - cmd.target.length = -1U; - cmd.target.ignore_bits = -1UL; - cmd.target.match_id = CXI_MATCH_ID_ANY; - - ret = cxi_cq_emit_target(rxc->rx_cmdq->dev_cmdq, &cmd); - if (ret) { - RXC_WARN(rxc, "Failed to write Search command: %d\n", ret); - ret = -FI_EAGAIN; - goto err_dec_free_cq_req; - } - - cxi_cq_ring(rxc->rx_cmdq->dev_cmdq); - - return FI_SUCCESS; - -err_dec_free_cq_req: - ofi_atomic_dec32(&rxc->orx_reqs); - cxip_evtq_req_free(req); -err_free_onload_offset: - free(rxc->ule_offsets); -err: - RXC_WARN(rxc, "Hardware UX list onload initiation error, ret: %d\n", - ret); - return ret; -} - -static int cxip_flush_appends_cb(struct cxip_req *req, - const union c_event *event) -{ - struct cxip_rxc *rxc = req->req_ctx; - int ret; - - assert(rxc->state == RXC_ONLOAD_FLOW_CONTROL || - rxc->state == RXC_ONLOAD_FLOW_CONTROL_REENABLE || - rxc->state == RXC_PENDING_PTLTE_SOFTWARE_MANAGED); - - assert(event->hdr.event_type == C_EVENT_SEARCH); - assert(cxi_event_rc(event) == C_RC_NO_MATCH); - - ret = cxip_ux_onload(rxc); - if (ret == FI_SUCCESS) { - ofi_atomic_dec32(&rxc->orx_reqs); - cxip_evtq_req_free(req); - } - - return ret; -} - -/* - * cxip_flush_appends() - Flush all user appends for a RXC. - * - * Before cxip_ux_onload() can be called, all user appends in the command queue - * must be flushed. If not, this can cause cxip_ux_onload() to read incorrect - * remote offsets from cxil_pte_status(). The flush is implemented by issuing - * a search command which will match zero ULEs. When the search event is - * processed, all pending user appends will have been processed. Since the RXC - * is not enabled, new appends cannot occur during this time. - * - * Caller must hold ep_obj->lock. - */ -static int cxip_flush_appends(struct cxip_rxc *rxc) -{ - struct cxip_req *req; - union c_cmdu cmd = {}; - int ret; - - assert(rxc->state == RXC_ONLOAD_FLOW_CONTROL || - rxc->state == RXC_ONLOAD_FLOW_CONTROL_REENABLE || - rxc->state == RXC_PENDING_PTLTE_SOFTWARE_MANAGED); - - /* Populate request */ - req = cxip_evtq_req_alloc(&rxc->rx_evtq, 1, rxc); - if (!req) { - RXC_DBG(rxc, "Failed to allocate request\n"); - ret = -FI_EAGAIN; - goto err; - } - ofi_atomic_inc32(&rxc->orx_reqs); - - rxc->rx_evtq.ack_batch_size = 1; - - req->cb = cxip_flush_appends_cb; - req->type = CXIP_REQ_SEARCH; - - /* Search command which should match nothing. 
*/ - cmd.command.opcode = C_CMD_TGT_SEARCH; - cmd.target.ptl_list = C_PTL_LIST_UNEXPECTED; - cmd.target.ptlte_index = rxc->rx_pte->pte->ptn; - cmd.target.buffer_id = req->req_id; - cmd.target.match_bits = -1UL; - cmd.target.length = 0; - - ret = cxi_cq_emit_target(rxc->rx_cmdq->dev_cmdq, &cmd); - if (ret) { - RXC_WARN(rxc, "Failed to write Search command: %d\n", ret); - ret = -FI_EAGAIN; - goto err_dec_free_cq_req; - } - - cxi_cq_ring(rxc->rx_cmdq->dev_cmdq); - - return FI_SUCCESS; - -err_dec_free_cq_req: - ofi_atomic_dec32(&rxc->orx_reqs); - cxip_evtq_req_free(req); -err: - return ret; -} - -/* - * cxip_recv_pte_cb() - Process receive PTE state change events. - */ -void cxip_recv_pte_cb(struct cxip_pte *pte, const union c_event *event) -{ - struct cxip_rxc *rxc = (struct cxip_rxc *)pte->ctx; - int fc_reason = cxip_fc_reason(event); - int ret __attribute__((unused)); - - switch (pte->state) { - case C_PTLTE_ENABLED: - assert(rxc->state == RXC_FLOW_CONTROL || - rxc->state == RXC_DISABLED || - rxc->state == RXC_PENDING_PTLTE_HARDWARE); - - /* Queue any flow control resume messages */ - if (rxc->state == RXC_FLOW_CONTROL) { - cxip_fc_progress_ctrl(rxc); - RXC_WARN(rxc, "Now in RXC_ENABLED\n"); - } - - rxc->state = RXC_ENABLED; - break; - - case C_PTLTE_DISABLED: - if (rxc->state == RXC_DISABLED) - break; - - if (fc_reason == C_SC_DIS_UNCOR) - RXC_FATAL(rxc, "Disabled, LE uncorrectable err\n"); - - /* An incorrect drop count was used during PTE enable. - * Another attempt will be made when a peer sends a side-band - * drop message. - */ - if (cxi_event_rc(event) == C_RC_NO_MATCH) { - assert(rxc->state == RXC_FLOW_CONTROL || - rxc->state == RXC_ONLOAD_FLOW_CONTROL || - rxc->state == RXC_ONLOAD_FLOW_CONTROL_REENABLE || - rxc->state == - RXC_PENDING_PTLTE_SOFTWARE_MANAGED); - RXC_WARN(rxc, FC_DROP_COUNT_MSG); - break; - } - - /* Flow control occurred while transitioning from HW to SW - * managed PTE. Since onloading of all UX entries will have - * been initiated (i.e. no new ones will be added) and the - * PTE state change from RXC_PENDING_PTLTE_SOFTWARE_MANAGED - * to RXC_ENABLED_SOFTWARE following onload complete is - * protected by the ep_obj->lock, it is safe to indicate that - * SW managed EP must be re-enabled on onload complete. - * The request list will have been replenished. - */ - if (rxc->state == RXC_PENDING_PTLTE_SOFTWARE_MANAGED) { - RXC_WARN(rxc, - "Flow control during HW to SW transition\n"); - rxc->state = RXC_ONLOAD_FLOW_CONTROL_REENABLE; - break; - } - - /* Check for flow control during flow control */ - if (rxc->state != RXC_ENABLED && - rxc->state != RXC_ENABLED_SOFTWARE && - rxc->state != RXC_PENDING_PTLTE_DISABLE) { - - /* There is race between SW disable on priority list - * and HW initiated LE flow control which can be - * ignored; otherwise it is a fatal error. - */ - if (fc_reason == CXIP_FC_SOFTWARE_INITIATED) - break; - RXC_FATAL(rxc, FC_SW_LE_MSG_FATAL); - } - - /* Starting flow control processing. The default is for - * flow control should re-enable in the previous - * hardware/software managed state. - */ - rxc->prev_state = rxc->state; - rxc->new_state = rxc->state; - rxc->state = RXC_ONLOAD_FLOW_CONTROL; - - RXC_DBG(rxc, "Flow control detected, H/W: %d reason: %d\n", - event->tgt_long.initiator.state_change.sc_nic_auto, - fc_reason); - - switch (fc_reason) { - case CXIP_FC_SOFTWARE_INITIATED: - /* Software initiated state change, drop count - * needs to start at zero instead of -1. Add 1 to - * account for this. 
Note this is only initiated - * from an hardware enabled PTE state. - */ - RXC_WARN(rxc, "SW initiated flow control\n"); - if (rxc->ep_obj->asic_ver < CASSINI_2_0) - rxc->drop_count++; - - /* If running in hybrid mode, resume operation as a - * software managed EP to reduce LE resource load. - */ - if (cxip_env.rx_match_mode == CXIP_PTLTE_HYBRID_MODE) - rxc->new_state = RXC_ENABLED_SOFTWARE; - - rxc->num_fc_append_fail++; - break; - - case C_SC_FC_EQ_FULL: - /* EQ full does not require LE resources be recovered - * to re-enable. - */ - RXC_WARN(rxc, "Flow control EQ full\n"); - rxc->state = RXC_ONLOAD_FLOW_CONTROL_REENABLE; - rxc->num_fc_eq_full++; - break; - - case C_SC_FC_NO_MATCH: - /* Overflow list buffers were full/could not be matched - * against. Must replenish buffers, but does not in - * itself require resources be recovered. - */ - RXC_WARN(rxc, FC_OFLOW_NO_MATCH_MSG, - cxip_env.oflow_buf_size); - - rxc->state = RXC_ONLOAD_FLOW_CONTROL_REENABLE; - rxc->num_fc_no_match++; - break; - - case C_SC_FC_UNEXPECTED_FAIL: - /* Hybrid mode is not enabled and overflow matches, but - * LE resources prevent unexpected message allocation. - */ - RXC_WARN(rxc, "Flow control UX LE resources\n"); - rxc->num_fc_unexp++; - break; - - case C_SC_FC_REQUEST_FULL: - /* Running as software managed EP and request list - * buffers were full/could not be matched against. - * Must replenish buffers, but does not require that - * LE resources are recovered. - */ - RXC_WARN(rxc, FC_REQ_FULL_MSG, cxip_env.req_buf_size); - rxc->state = RXC_ONLOAD_FLOW_CONTROL_REENABLE; - rxc->num_fc_req_full++; - break; - - case C_SC_SM_APPEND_FAIL: - case C_SC_SM_UNEXPECTED_FAIL: - default: - RXC_FATAL(rxc, "Invalid disable PTE c_sc_reason: %d\n", - fc_reason); - } - rxc->fc_reason = fc_reason; - - do { - ret = cxip_flush_appends(rxc); - } while (ret == -FI_EAGAIN); - - if (ret != FI_SUCCESS) - RXC_FATAL(rxc, "cxip_flush_appends failed: %d\n", ret); - - break; - - case C_PTLTE_SOFTWARE_MANAGED: - /* There is an inherent race between hardware and software - * in setting the PtlTE state. If software requested to - * disable the PtlTE after hardware started a HW to SW - * transition; just wait for the disable event. - */ - if (rxc->state == RXC_PENDING_PTLTE_DISABLE) - break; - - RXC_DBG(rxc, "SW Managed: nic auto: %d, reason: %d\n", - event->tgt_long.initiator.state_change.sc_nic_auto, - event->tgt_long.initiator.state_change.sc_nic_auto ? - event->tgt_long.initiator.state_change.sc_reason : -1); - - /* We should not get a bad drop count status since the - * transition is synchronous but we will need this in - * the future. - */ - if (cxi_event_rc(event) == C_RC_NO_MATCH) { - RXC_WARN(rxc, "Bad drop count, ignored\n"); - break; - } - - /* Sanity check */ - if (rxc->state == RXC_FLOW_CONTROL) - RXC_FATAL(rxc, "FC to SW EP should be synchronous\n"); - - assert(rxc->state == RXC_DISABLED || - rxc->state == RXC_ENABLED || - rxc->state == RXC_PENDING_PTLTE_SOFTWARE_MANAGED); - - /* Hardware should only generate PTE software managed events - * in two cases: - * 1. Initial start in software mode: disabled->software. - * 2. NIC initiated software transition: enabled->software. - */ - switch (fc_reason) { - case CXIP_FC_SOFTWARE_INITIATED: - /* If messaging was initially offloaded then this - * state transition can only happen if the RXC has - * been disabled; it is safe to ignore this change. 
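Each disable reason above determines whether LE resources must be recovered before the PtlTE can be re-enabled: EQ full, overflow no-match and request-list full only require buffers to be replenished, while unexpected-LE exhaustion keeps the RXC in plain onload flow control. A compressed, illustrative reduction of that switch is sketched below; the enum values echo the names above, but the helper itself is not the provider's code.

#include <stdio.h>

enum rxc_state {
	RXC_ONLOAD_FLOW_CONTROL,		/* LE resources must be recovered */
	RXC_ONLOAD_FLOW_CONTROL_REENABLE,	/* re-enable after UX onload */
};

enum fc_reason {
	FC_SOFTWARE_INITIATED,
	FC_EQ_FULL,
	FC_NO_MATCH,
	FC_UNEXPECTED_FAIL,
	FC_REQUEST_FULL,
};

/* Illustrative reduction of the fc_reason handling above. */
static enum rxc_state fc_next_state(enum fc_reason reason)
{
	switch (reason) {
	case FC_EQ_FULL:
	case FC_NO_MATCH:
	case FC_REQUEST_FULL:
		/* Buffers must be replenished, but no LEs recovered. */
		return RXC_ONLOAD_FLOW_CONTROL_REENABLE;
	case FC_SOFTWARE_INITIATED:
	case FC_UNEXPECTED_FAIL:
	default:
		return RXC_ONLOAD_FLOW_CONTROL;
	}
}

int main(void)
{
	printf("%d %d\n", fc_next_state(FC_EQ_FULL),
	       fc_next_state(FC_UNEXPECTED_FAIL));
	return 0;
}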
- */ - assert(rxc->state == RXC_DISABLED); - if (!cxip_env.msg_offload) { - RXC_WARN(rxc, "Software managed EP enabled\n"); - rxc->state = RXC_ENABLED_SOFTWARE; - } - break; - - case C_SC_SM_APPEND_FAIL: - case C_SC_SM_UNEXPECTED_FAIL: - /* The NIC initiated the transition; priority list - * appends that are in flight will fail and be added - * to the receive replay list. Update state so that - * no additional appends will be attempted until - * onload completes and the failed appends are - * replayed. - */ - RXC_WARN(rxc, - "NIC transition to SW EP, c_sc_reason: %d\n", - fc_reason); - rxc->fc_reason = fc_reason; - rxc->prev_state = rxc->state; - rxc->new_state = RXC_ENABLED_SOFTWARE; - - if (rxc->fc_reason == C_SC_SM_UNEXPECTED_FAIL) - rxc->num_sc_nic_hw2sw_unexp++; - else if (rxc->fc_reason == C_SC_SM_APPEND_FAIL) - rxc->num_sc_nic_hw2sw_append_fail++; - - rxc->msg_offload = 0; - rxc->state = RXC_PENDING_PTLTE_SOFTWARE_MANAGED; - do { - /* Flush and kick-off onloading of UX list */ - ret = cxip_flush_appends(rxc); - } while (ret == -FI_EAGAIN); - if (ret != FI_SUCCESS) - RXC_WARN(rxc, "Flush/UX onload err: %d\n", ret); - break; - default: - RXC_FATAL(rxc, "Invalid PTE c_sc_reason: %d\n", - fc_reason); - } - - break; - default: - RXC_FATAL(rxc, "Unexpected state received: %u\n", pte->state); - } -} - -/* - * tag_match() - Compare UX Send tag and Receive tags in SW. - */ -static bool tag_match(uint64_t init_mb, uint64_t mb, uint64_t ib) -{ - return !((init_mb ^ mb) & ~ib); -} - -/* - * tag_match() - Compare UX Send initiator and Receive initiator in SW. - */ -static bool init_match(struct cxip_rxc *rxc, uint32_t init, uint32_t match_id) -{ - if (match_id == CXI_MATCH_ID_ANY) - return true; - - if (rxc->ep_obj->av->symmetric) { - init = CXI_MATCH_ID_EP(rxc->pid_bits, init); - match_id = CXI_MATCH_ID_EP(rxc->pid_bits, match_id); - } - - return init == match_id; -} - -/* - * cxip_claim_onload_cb() - Process SEARCH and DELETE of claimed UX message. - */ -static int cxip_claim_onload_cb(struct cxip_req *req, - const union c_event *evt) -{ - struct cxip_rxc *rxc = req->req_ctx; - struct cxip_deferred_event *def_ev; - struct cxip_ux_send *ux_send; - bool matched = false; - - if (evt->hdr.event_type != C_EVENT_PUT_OVERFLOW) - RXC_FATAL(rxc, CXIP_UNEXPECTED_EVENT, - cxi_event_to_str(evt), - cxi_rc_to_str(cxi_event_rc(evt))); - - /* Failed to onload UX message, return ENOMSG */ - if (cxi_event_rc(evt) != C_RC_OK) { - RXC_WARN(rxc, "FI_CLAIM HW onload failed: %d\n", - cxi_event_rc(evt)); - recv_req_peek_complete(req, NULL); - - return FI_SUCCESS; - } - - ofi_atomic_dec32(&rxc->orx_hw_ule_cnt); - - /* FI_CLAIM UX message onloaded from hardware */ - ux_send = calloc(1, sizeof(*ux_send)); - if (!ux_send) { - RXC_WARN(rxc, "Failed allocate UX memory\n"); - return -FI_EAGAIN; - } - ux_send->claimed = true; - - /* Zero-byte unexpected onloads require special handling - * since no deferred structure would be allocated. - */ - if (evt->tgt_long.rlength) { - def_ev = match_put_event(rxc, req, evt, &matched); - if (!matched) { - /* The EVENT_PUT to the overflow list has not been - * processed. The FI_CLAIM operation will be completed - * when the matching put is received. - */ - if (!def_ev) { - free(ux_send); - return -FI_EAGAIN; - } - def_ev->ux_send = ux_send; - } else { - ux_send->req = def_ev->req; - ux_send->put_ev = def_ev->ev; - free_put_event(rxc, def_ev); - } - - /* Fixup event remote offset for an RGet. 
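tag_match() above is the usual ignore-bits rule: two tags match when they agree on every bit that is not set in the ignore mask. A tiny self-contained demonstration of that formula; the sample values and main() are illustrative, the expression is the one used above.

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

/* Bits set in 'ignore' are don't-care bits; all others must agree. */
static bool tag_match(uint64_t send_tag, uint64_t recv_tag, uint64_t ignore)
{
	return !((send_tag ^ recv_tag) & ~ignore);
}

int main(void)
{
	/* Low 8 bits are a "message id" the receiver does not care about. */
	assert(tag_match(0x1234ab, 0x123400, 0xff));
	/* Tags differ outside the ignore mask: no match. */
	assert(!tag_match(0x993400, 0x123400, 0xff));
	/* Ignore everything: always a match. */
	assert(tag_match(0xdead, 0xbeef, ~UINT64_C(0)));
	return 0;
}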
*/ - if (evt->tgt_long.rlength) - ux_send->put_ev.tgt_long.remote_offset = - req->recv.ule_offset + evt->tgt_long.mlength; - - } else { - matched = true; - ux_send->put_ev = *evt; - } - - /* Add to the sw UX list as a claimed entry, it will be ignored in - * recieve matching of UX list entries. Its order no longer matters. - */ - dlist_insert_tail(&ux_send->rxc_entry, &rxc->sw_ux_list); - rxc->sw_ux_list_len++; - - RXC_DBG(rxc, "FI_CLAIM Onload req: %p ux_send %p\n", req, ux_send); - recv_req_tgt_event(req, &ux_send->put_ev); - - /* Put was already received, return FI_CLAIM completion */ - if (matched) { - recv_req_peek_complete(req, ux_send); - RXC_DBG(rxc, "FI_CLAIM onload complete, req %p, ux_send %p\n", - req, ux_send); - } - - ofi_atomic_dec32(&rxc->orx_hw_ule_cnt); - - return FI_SUCCESS; -} - -/* - * cxip_claim_ux_onload() - Initiate SEARCH and DELETE of FI_CLAIM ux entry. - */ -static int cxip_claim_ux_onload(struct cxip_req *req) -{ - struct cxip_rxc *rxc = req->req_ctx; - int ret = FI_SUCCESS; - union c_cmdu cmd = {}; - union cxip_match_bits mb = {}; - union cxip_match_bits ib = {}; - - if (rxc->state != RXC_ENABLED) { - RXC_DBG(rxc, "FC inprogress, fail claim req %p\n", req); - goto err; - } - - /* Initiate a search to get the remote offset for the - * unexpected list entry we matched. - */ - req->cb = cxip_claim_onload_cb; - mb.tag = req->recv.tag; - mb.tagged = 1; - ib.tx_id = ~0; - ib.cq_data = ~0; - ib.match_comp = ~0; - ib.rdzv_done = ~0; - ib.le_type = ~0; - ib.tag = req->recv.ignore; - - cmd.command.opcode = C_CMD_TGT_SEARCH_AND_DELETE; - - cmd.target.ptl_list = C_PTL_LIST_UNEXPECTED; - cmd.target.ptlte_index = rxc->rx_pte->pte->ptn; - cmd.target.buffer_id = req->req_id; - cmd.target.length = -1U; - cmd.target.ignore_bits = ib.raw; - cmd.target.match_bits = mb.raw; - cmd.target.match_id = req->recv.match_id; - /* Delete first match */ - cmd.target.use_once = 1; - - ret = cxi_cq_emit_target(rxc->rx_cmdq->dev_cmdq, &cmd); - if (ret) { - /* This condition should clear */ - RXC_WARN(rxc, - "Cannot emit of UX delete cmd, return -FI_EAGAIN\n"); - return -FI_EAGAIN; - } - - cxi_cq_ring(rxc->rx_cmdq->dev_cmdq); - - /* Hardware handles the race between subsequent priority list - * appends to the search and delete command. Re-enable. - */ - rxc->hw_claim_in_progress = false; - RXC_DBG(rxc, "FI_CLAIM Search and Delete of UX entry initiated\n"); - - return FI_SUCCESS; - -err: - /* Unable to initiate FI_CLAIM, report as ENOMSG */ - rxc->hw_claim_in_progress = false; - recv_req_peek_complete(req, NULL); - - return FI_SUCCESS; -} - -/* - * cxip_hw_claim_offset_cb() - Process SEARCH command events to get remote - * offset of entry to be deleted. 
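For context, this claim machinery backs the application-visible FI_PEEK|FI_CLAIM flow. The sketch below shows how a libfabric consumer would typically pair the probe with a later claimed receive through the same fi_context, per fi_tagged(3); endpoint setup and completion handling are omitted, and peek_then_claim is an illustrative helper, not part of any API.

#include <sys/types.h>
#include <sys/uio.h>
#include <rdma/fabric.h>
#include <rdma/fi_tagged.h>

/* Probe the unexpected queue for a tagged message and claim it, then
 * receive the claimed message into 'buf'. The fi_context must stay valid
 * until the claimed receive completes.
 */
static ssize_t peek_then_claim(struct fid_ep *ep, uint64_t tag,
			       void *buf, size_t len)
{
	struct fi_context claim_ctx;
	struct iovec iov = { .iov_base = buf, .iov_len = len };
	struct fi_msg_tagged msg = {
		.addr = FI_ADDR_UNSPEC,
		.tag = tag,
		.ignore = 0,
		.context = &claim_ctx,	/* FI_CLAIM requires an fi_context */
	};
	ssize_t ret;

	/* Step 1: peek and claim; no data is delivered yet. */
	ret = fi_trecvmsg(ep, &msg, FI_PEEK | FI_CLAIM);
	if (ret)
		return ret;

	/* ... wait for the FI_PEEK completion (or FI_ENOMSG error) on the CQ ... */

	/* Step 2: receive the previously claimed message using the same context. */
	msg.msg_iov = &iov;
	msg.iov_count = 1;
	return fi_trecvmsg(ep, &msg, FI_CLAIM);
}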
- */ -static int cxip_hw_claim_offset_cb(struct cxip_req *req, - const union c_event *evt) -{ - struct cxip_rxc *rxc = req->recv.rxc; - union cxip_match_bits ux_mb; - uint32_t ux_init; - int ret; - - switch (evt->hdr.event_type) { - case C_EVENT_SEARCH: - if (cxi_event_rc(evt) == C_RC_OK) { - RXC_DBG(rxc, "Claim UX offset search entry, req: %p\n", - req); - - if (req->recv.offset_found) - break; - - req->recv.cur_ule_offsets++; - - /* Not found in range of the offsets we have */ - if (req->recv.cur_ule_offsets > - req->recv.num_ule_offsets) { - RXC_DBG(rxc, "Claim UX offsets exceeded\n"); - break; - } - - /* Check for a match against the FI_PEEK */ - ux_mb.raw = evt->tgt_long.match_bits; - ux_init = evt->tgt_long.initiator.initiator.process; - - if (req->recv.tagged != ux_mb.tagged) - break; - if (ux_mb.tagged - && !tag_match(ux_mb.tag, req->recv.tag, - req->recv.ignore)) - break; - if (!init_match(rxc, ux_init, req->recv.match_id)) - break; - - /* Matched, update to ignore any future events */ - req->recv.offset_found = true; - req->recv.ule_offset = - req->recv.ule_offsets[req->recv.cur_ule_offsets - 1]; - - RXC_DBG(rxc, "Found offset for claim %p, %d : 0x%lX\n", - req, req->recv.cur_ule_offsets - 1, - req->recv.ule_offset); - break; - } - - assert(cxi_event_rc(evt) == C_RC_NO_MATCH); - - RXC_DBG(rxc, "FI_CLAIM remote offset search done, status %d\n", - cxi_event_rc(evt)); - - if (!req->recv.offset_found) { - RXC_DBG(rxc, "Req %p, FI_CLAIM UX not found\n", req); - goto err_not_found; - } - - ret = cxip_claim_ux_onload(req); - if (ret) { - /* Unable to initiate SEARCH and DELETE, this - * should clear. All other errors return ENOMSG. - */ - if (ret == -FI_EAGAIN) - return ret; - - RXC_WARN(rxc, "claim_ux_onload failed %d\n", ret); - goto err_not_found; - } - - RXC_DBG(rxc, "FI_CLAIM req %p remote offset 0x%lX\n", - req, req->recv.ule_offset); - break; - default: - RXC_FATAL(rxc, CXIP_UNEXPECTED_EVENT, - cxi_event_to_str(evt), - cxi_rc_to_str(cxi_event_rc(evt))); - } - - return FI_SUCCESS; - -err_not_found: - /* Terminate FI_PEEK with FI_CLAIM with ENOMSG */ - rxc->hw_claim_in_progress = false; - free(req->recv.ule_offsets); - req->recv.ule_offsets = NULL; - recv_req_peek_complete(req, NULL); - - return FI_SUCCESS; -} - -/* - * cxip_initiate_hw_claim() - Onload the specified peek, claiming it. - */ -static int cxip_initiate_hw_claim(struct cxip_req *req) -{ - struct cxip_rxc *rxc = req->req_ctx; - union c_cmdu cmd = {}; - int ret = FI_SUCCESS; - - if (rxc->state != RXC_ENABLED) { - RXC_DBG(rxc, "FC inprogress, unable to claim req %p\n", req); - goto err; - } - - /* UX entry exists in hardware, the initial search acts as a flush of - * the event queue for priority list appends. Get remote offset for - * the associated unexpected list entry. - */ - req->recv.cur_ule_offsets = 0; - ret = cxip_get_ule_offsets(rxc, &req->recv.ule_offsets, - &req->recv.num_ule_offsets, true); - if (ret) { - RXC_WARN(rxc, "Unable to get FI_CLAIM UX offsets\n"); - goto err; - } - - RXC_DBG(rxc, "ule_offsets %p, num offsets %d\n", - req->recv.ule_offsets, req->recv.num_ule_offsets); - - /* Initiate a search to get the remote offset for the - * unexpected list entry we matched. This requires going - * through the list. 
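The callback above keeps a running index into the ule_offsets array captured before the search and records the offset at the index of the first event that matches the peek. A reduced sketch of that index alignment, with illustrative arrays and predicate:

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

/* Walk search events and previously captured remote offsets in the same
 * order; return the offset belonging to the first event the predicate
 * accepts, or -1 if nothing matches within the captured range.
 */
static int64_t find_claim_offset(const uint64_t *event_tags, size_t nevents,
				 const uint64_t *offsets, size_t noffsets,
				 bool (*match)(uint64_t tag))
{
	for (size_t i = 0; i < nevents && i < noffsets; i++) {
		if (match(event_tags[i]))
			return (int64_t)offsets[i];
	}

	return -1;
}

static bool match_even(uint64_t tag)
{
	return (tag & 1) == 0;
}

int main(void)
{
	uint64_t tags[]    = { 3, 5, 8, 9 };
	uint64_t offsets[] = { 0x100, 0x180, 0x240, 0x2c0 };

	/* The third event matches, so 0x240 is the entry to search-and-delete. */
	return find_claim_offset(tags, 4, offsets, 4, match_even) == 0x240 ? 0 : 1;
}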
- */ - req->cb = cxip_hw_claim_offset_cb; - - cmd.command.opcode = C_CMD_TGT_SEARCH; - cmd.target.ptl_list = C_PTL_LIST_UNEXPECTED; - cmd.target.ptlte_index = rxc->rx_pte->pte->ptn; - cmd.target.buffer_id = req->req_id; - cmd.target.length = -1U; - cmd.target.ignore_bits = -1UL; - cmd.target.match_id = CXI_MATCH_ID_ANY; - - ret = cxi_cq_emit_target(rxc->rx_cmdq->dev_cmdq, &cmd); - if (ret) { - RXC_WARN(rxc, "Failed to write Search command: %d\n", ret); - goto err_free_offsets; - } - - cxi_cq_ring(rxc->rx_cmdq->dev_cmdq); - - RXC_DBG(rxc, "Search for remote offsets initiated, req %p\n", req); - - return FI_SUCCESS; - -err_free_offsets: - free(req->recv.ule_offsets); - req->recv.ule_offsets = NULL; -err: - /* Unable to initiate FI_CLAIM, report as ENOMSG */ - rxc->hw_claim_in_progress = false; - recv_req_peek_complete(req, NULL); - - return FI_SUCCESS; -} - -/* - * cxip_ux_peek_cb() - Process UX list SEARCH command events. - */ -static int cxip_ux_peek_cb(struct cxip_req *req, const union c_event *event) -{ - struct cxip_rxc *rxc = req->req_ctx; - - assert(req->recv.flags & FI_PEEK); - - switch (event->hdr.event_type) { - case C_EVENT_SEARCH: - /* Will receive event for only first match or failure */ - if (cxi_event_rc(event) == C_RC_OK) { - RXC_DBG(rxc, "Peek UX search req: %p matched\n", req); - if (req->recv.flags & FI_CLAIM) { - RXC_DBG(rxc, "req: %p UX must be claimed\n", - req); - return cxip_initiate_hw_claim(req); - } - - /* FI_PEEK only was found */ - recv_req_tgt_event(req, event); - } else { - RXC_DBG(rxc, "Peek UX search req: %p no match\n", req); - } - - recv_req_peek_complete(req, NULL); - break; - - default: - RXC_FATAL(rxc, CXIP_UNEXPECTED_EVENT, - cxi_event_to_str(event), - cxi_rc_to_str(cxi_event_rc(event))); - } - - return FI_SUCCESS; -} - -/* - * cxip_ux_peek() - Issue a SEARCH command to peek for a matching send - * on the RXC offloaded unexpected message list. - * - * Caller must hold ep_obj->lock. - */ -static int cxip_ux_peek(struct cxip_req *req) -{ - struct cxip_rxc *rxc = req->req_ctx; - union c_cmdu cmd = {}; - union cxip_match_bits mb = {}; - union cxip_match_bits ib = {}; - int ret; - - assert(req->recv.flags & FI_PEEK); - - req->cb = cxip_ux_peek_cb; - - mb.tag = req->recv.tag; - mb.tagged = 1; - ib.tx_id = ~0; - ib.cq_data = ~0; - ib.match_comp = ~0; - ib.rdzv_done = ~0; - ib.le_type = ~0; - ib.tag = req->recv.ignore; - - cmd.command.opcode = C_CMD_TGT_SEARCH; - cmd.target.ptl_list = C_PTL_LIST_UNEXPECTED; - cmd.target.ptlte_index = rxc->rx_pte->pte->ptn; - cmd.target.buffer_id = req->req_id; - cmd.target.length = -1U; - cmd.target.ignore_bits = ib.raw; - cmd.target.match_bits = mb.raw; - cmd.target.match_id = req->recv.match_id; - /* First match only */ - cmd.target.use_once = 1; - - if (cxip_evtq_saturated(&rxc->rx_evtq)) { - RXC_DBG(rxc, "Target HW EQ saturated\n"); - return -FI_EAGAIN; - } - - RXC_DBG(rxc, "Peek UX search req: %p mb.raw: 0x%" PRIx64 " match_id: 0x%x ignore: 0x%" PRIx64 "\n", - req, mb.raw, req->recv.match_id, req->recv.ignore); - - ret = cxi_cq_emit_target(rxc->rx_cmdq->dev_cmdq, &cmd); - if (ret) { - RXC_WARN(rxc, "Failed to write Search command: %d\n", ret); - return -FI_EAGAIN; - } - - cxi_cq_ring(rxc->rx_cmdq->dev_cmdq); - - /* If FI_CLAIM, we disable priority list appends so the - * search acts as a flush of outstanding appends. - */ - if (req->flags & FI_CLAIM) - rxc->hw_claim_in_progress = true; - - return FI_SUCCESS; -} - -/* cxip_set_ux_dump_entry() - initialize a CQ entry structure - * and/or source address with UX message info. 
- */ -static void cxip_set_ux_dump_entry(struct cxip_req *req, - const union c_event *evt) -{ - struct cxip_ux_dump_state *ux_dump = req->recv.ux_dump; - union cxip_match_bits mb; - struct fi_cq_tagged_entry *cq_entry = NULL; - fi_addr_t *src_addr = NULL; - - ux_dump->ux_count++; - - /* If exceeding caller provided space updating the total - * available UX message count is all that is required. - */ - if (ux_dump->ret_count >= ux_dump->max_count) - return; - - if (ux_dump->entry) - cq_entry = &ux_dump->entry[ux_dump->ret_count]; - if (ux_dump->src_addr) - src_addr = &ux_dump->src_addr[ux_dump->ret_count]; - - if (cq_entry || src_addr) { - ux_dump->ret_count++; - - req->recv.tgt_event = false; - req->flags = 0; - recv_req_tgt_event(req, evt); - - if (cq_entry) { - /* Need to add FI_TAGGED or FI_MSG directly */ - mb.raw = evt->tgt_long.match_bits; - if (mb.tagged) - req->flags |= FI_TAGGED; - else - req->flags |= FI_MSG; - cq_entry->op_context = NULL; - cq_entry->flags = req->flags; - cq_entry->len = req->recv.rlen; - cq_entry->buf = NULL; - cq_entry->data = req->data; - cq_entry->tag = req->tag; - } - - if (src_addr && req->recv.rxc->attr.caps & FI_SOURCE) - *src_addr = recv_req_src_addr(req); - } -} - -/* - * cxip_unexp_msg_dump_cb() - Process search command dumping H/W UX entries. - */ -static int cxip_unexp_msg_dump_cb(struct cxip_req *req, - const union c_event *evt) -{ - struct cxip_rxc *rxc = req->recv.rxc; - - if (evt->hdr.event_type != C_EVENT_SEARCH) - RXC_FATAL(rxc, CXIP_UNEXPECTED_EVENT, - cxi_event_to_str(evt), - cxi_rc_to_str(cxi_event_rc(evt))); - - if (cxi_event_rc(evt) == C_RC_NO_MATCH) { - req->recv.ux_dump->done = true; - return FI_SUCCESS; - } - assert(cxi_event_rc(evt) == C_RC_OK); - - cxip_set_ux_dump_entry(req, evt); - - return FI_SUCCESS; -} - -/* - * cxip_build_debug_ux_entry_info() - Initialize UX info array from ULE. - * - * It is expected that a debugger is utilizing this interface and is - * expecting synchronous behavior. - * - * Caller should hold ep_obj->lock. - */ -int cxip_build_ux_entry_info(struct cxip_ep *ep, - struct fi_cq_tagged_entry *entry, size_t count, - fi_addr_t *src_addr, size_t *ux_count) -{ - struct cxip_rxc *rxc = &ep->ep_obj->rxc; - struct cxip_ux_dump_state *ux_dump; - struct cxip_ux_send *ux_send; - struct dlist_entry *tmp; - struct cxip_req *req = NULL; - union c_cmdu cmd = {}; - int ret_count; - int ret; - - ret = cxip_recv_req_alloc(rxc, NULL, 0, &req); - if (ret) - return ret; - - ux_dump = calloc(1, sizeof(struct cxip_ux_dump_state)); - if (!ux_dump) { - RXC_WARN(rxc, "ENOMEM on allocate of UX state buffer\n"); - ret_count = -FI_ENOMEM; - goto done; - } - - ux_dump->max_count = count; - ux_dump->entry = entry; - ux_dump->src_addr = src_addr; - req->recv.ux_dump = ux_dump; - - /* Get entries from software UX list first */ - dlist_foreach_container_safe(&rxc->sw_ux_list, struct cxip_ux_send, - ux_send, rxc_entry, tmp) - cxip_set_ux_dump_entry(req, &ux_send->put_ev); - - if (!rxc->msg_offload) - goto done; - - /* Read H/W UX list processing the request events synchronously - * until we set "Done" in the request callback. 
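The dump interface fills the caller's arrays only up to the requested count but keeps counting, so *ux_count always reports the full unexpected-list depth and a debugger can size a second call correctly. A standalone sketch of that fill-up-to-capacity, report-the-total contract; names and types are illustrative.

#include <stddef.h>
#include <stdio.h>

/* Copy at most max_out entries into 'out', but report via *total how many
 * entries exist overall so the caller can retry with a larger array.
 */
static size_t dump_entries(const int *list, size_t list_len,
			   int *out, size_t max_out, size_t *total)
{
	size_t copied = 0;

	*total = list_len;
	for (size_t i = 0; i < list_len && copied < max_out; i++)
		out[copied++] = list[i];

	return copied;
}

int main(void)
{
	int ux[] = { 1, 2, 3, 4, 5 };
	int out[3];
	size_t total;
	size_t got = dump_entries(ux, 5, out, 3, &total);

	printf("returned %zu of %zu unexpected entries\n", got, total);
	return 0;
}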
- */ - req->cb = cxip_unexp_msg_dump_cb; - cmd.command.opcode = C_CMD_TGT_SEARCH; - cmd.target.ptl_list = C_PTL_LIST_UNEXPECTED; - cmd.target.ptlte_index = rxc->rx_pte->pte->ptn; - cmd.target.buffer_id = req->req_id; - cmd.target.length = -1U; - cmd.target.ignore_bits = -1UL; - cmd.target.match_id = CXI_MATCH_ID_ANY; - - ret = cxi_cq_emit_target(rxc->rx_cmdq->dev_cmdq, &cmd); - if (ret) { - RXC_WARN(rxc, "Failed to write ULE Search command: %d\n", ret); - ret_count = ret; - goto done; - } - cxi_cq_ring(rxc->rx_cmdq->dev_cmdq); - - RXC_DBG(rxc, "Search for ULE dump initiated, req %p\n", req); - do { - cxip_evtq_progress(&rxc->rx_evtq); - sched_yield(); - } while (!ux_dump->done); - - RXC_DBG(rxc, "Search ULE dump done, req %p, count %ld\n", - req, ux_dump->ret_count); -done: - ret_count = ux_dump->ret_count; - *ux_count = ux_dump->ux_count; - - free(ux_dump); - cxip_recv_req_free(req); - - return ret_count; -} - -/* - * cxip_recv_sw_matched() - Progress the SW Receive match. - * - * Progress the operation which matched in SW. - */ -static int cxip_recv_sw_matched(struct cxip_req *req, - struct cxip_ux_send *ux_send) -{ - int ret; - uint64_t mrecv_start; - uint32_t mrecv_len; - bool req_done = true; - uint32_t ev_init; - uint32_t ev_rdzv_id; - struct cxip_req *rdzv_req; - struct cxip_rxc *rxc = req->recv.rxc; - - assert(req->type == CXIP_REQ_RECV); - - mrecv_start = req->recv.start_offset; - mrecv_len = mrecv_req_put_bytes(req, ux_send->put_ev.tgt_long.rlength); - - if (req->recv.multi_recv && - (req->recv.ulen - req->recv.start_offset) >= - req->recv.rxc->min_multi_recv) - req_done = false; - - if (ux_send->put_ev.tgt_long.rendezvous) { - - /* Make sure we can issue the RGet; if not we stall - * and TX event queue progress will free up credits. - */ - if (ofi_atomic_inc32(&rxc->orx_tx_reqs) > rxc->max_tx) { - ofi_atomic_dec32(&rxc->orx_tx_reqs); - return -FI_EAGAIN; - } - - ret = cxip_ux_send(req, ux_send->req, &ux_send->put_ev, - mrecv_start, mrecv_len, req_done); - if (ret != FI_SUCCESS) { - req->recv.start_offset -= mrecv_len; - ofi_atomic_dec32(&rxc->orx_tx_reqs); - - return ret; - } - - /* If multi-recv, a child request was created from - * cxip_ux_send(). Need to lookup this request. - * - * NOTE: Since the same event will be used, the evenet checks - * must be NOT be performed. The event checks are only needed - * when hardware is generating put and put overflow events for - * an mrecv buffer. If we have reached here, we know a put - * overflow event will never occur since the mrecv buffer has - * not been offloaded to hardware. - */ - if (req->recv.multi_recv) { - ret = rdzv_mrecv_req_lookup(req, &ux_send->put_ev, - &ev_init, &ev_rdzv_id, - false, &rdzv_req); - - /* If the previous cxip_ux_send() returns FI_SUCCESS, - * a matching rdzv mrecv req will always exist. - */ - assert(ret == FI_SUCCESS); - } else { - rdzv_req = req; - } - - /* Rendezvous event will not happen. So ack rendezvous event - * now. - */ - rdzv_recv_req_event(rdzv_req, ux_send->put_ev.hdr.event_type); - - cxip_recv_req_set_rget_info(rdzv_req); - - - /* A TX credit has been reserved and user receive request may - * have been removed from the ordered SW queue. If the command - * queue is backed up the condition will clear and the rget - * must get sent out, so wait for it. 
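The multi-receive handling above carves each matched send out of the posted buffer and retires the buffer only once the unconsumed remainder drops below the minimum multi-recv threshold. A minimal model of that accounting, with illustrative types:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct mrecv_buf {
	uint64_t len;		/* total posted length */
	uint64_t start_offset;	/* bytes already consumed */
	uint64_t min_free;	/* keep buffer posted while this much remains */
};

/* Consume up to 'rlen' bytes; *buf_done tells the caller whether the
 * buffer should be retired after this match.
 */
static uint64_t mrecv_consume(struct mrecv_buf *b, uint64_t rlen,
			      bool *buf_done)
{
	uint64_t avail = b->len - b->start_offset;
	uint64_t used = rlen < avail ? rlen : avail;

	b->start_offset += used;
	*buf_done = (b->len - b->start_offset) < b->min_free;

	return used;
}

int main(void)
{
	struct mrecv_buf b = { .len = 4096, .min_free = 1024 };
	bool done;

	mrecv_consume(&b, 2048, &done);	/* 2048 bytes left: keep posted */
	printf("done=%d offset=%llu\n", done, (unsigned long long)b.start_offset);
	mrecv_consume(&b, 1536, &done);	/* 512 bytes left: retire the buffer */
	printf("done=%d offset=%llu\n", done, (unsigned long long)b.start_offset);
	return 0;
}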
- */ - do { - ret = issue_rdzv_get(rdzv_req); - } while (ret == -FI_EAGAIN); - assert(ret == FI_SUCCESS); - } else { - if (ux_send->put_ev.tgt_long.rlength) - ret = cxip_ux_send(req, ux_send->req, &ux_send->put_ev, - mrecv_start, mrecv_len, req_done); - else - ret = cxip_ux_send_zb(req, &ux_send->put_ev, - mrecv_start, req_done); - - if (ret != FI_SUCCESS) { - /* undo mrecv_req_put_bytes() */ - req->recv.start_offset -= mrecv_len; - return ret; - } - } - - /* If this is a multi-receive request and there is still space, return - * a special code to indicate SW should keep matching messages to it. - */ - if (ret == FI_SUCCESS && !req_done) - return -FI_EINPROGRESS; - - return ret; -} - -static bool cxip_match_recv_sw(struct cxip_rxc *rxc, struct cxip_req *req, - struct cxip_ux_send *ux, bool claimed) -{ - union cxip_match_bits ux_mb; - uint32_t ux_init; - - if (claimed != ux->claimed) - return false; - - ux_mb.raw = ux->put_ev.tgt_long.match_bits; - ux_init = ux->put_ev.tgt_long.initiator.initiator.process; - - if (req->recv.tagged != ux_mb.tagged) - return false; - - if (ux_mb.tagged && - !tag_match(ux_mb.tag, req->recv.tag, req->recv.ignore)) - return false; - - if (!init_match(rxc, ux_init, req->recv.match_id)) - return false; - - return true; -} - -static int cxip_recv_sw_matcher(struct cxip_rxc *rxc, struct cxip_req *req, - struct cxip_ux_send *ux, bool claimed) -{ - int ret; - - if (!cxip_match_recv_sw(rxc, req, ux, claimed)) - return -FI_ENOMSG; - - ret = cxip_recv_sw_matched(req, ux); - if (ret == -FI_EAGAIN) - return -FI_EAGAIN; - - /* FI_EINPROGRESS is return for a multi-recv match. */ - assert(ret == FI_SUCCESS || ret == -FI_EINPROGRESS); - - /* TODO: Manage freeing of UX entries better. */ - dlist_remove(&ux->rxc_entry); - if (ux->req && ux->req->type == CXIP_REQ_RBUF) { - cxip_req_buf_ux_free(ux); - rxc->sw_ux_list_len--; - } else { - free(ux); - rxc->sw_ux_list_len--; - } - - RXC_DBG(rxc, - "Software match, req: %p ux_send: %p (sw_ux_list_len: %u)\n", - req, ux, req->recv.rxc->sw_ux_list_len); - - return ret; -} - -/* - * cxip_recv_ux_sw_matcher() - Attempt to match an unexpected message to a user - * posted receive. - * - * User must hold the ep_obj->lock. - */ -int cxip_recv_ux_sw_matcher(struct cxip_ux_send *ux) -{ - struct cxip_ptelist_buf *rbuf = ux->req->req_ctx; - struct cxip_rxc *rxc = rbuf->rxc; - struct cxip_req *req; - struct dlist_entry *tmp; - int ret; - - if (dlist_empty(&rxc->sw_recv_queue)) - return -FI_ENOMSG; - - dlist_foreach_container_safe(&rxc->sw_recv_queue, struct cxip_req, req, - recv.rxc_entry, tmp) { - /* Only matches against unclaimed UX messages */ - ret = cxip_recv_sw_matcher(rxc, req, ux, false); - - /* Unexpected message found match but unable to progress */ - if (ret == -FI_EAGAIN) - return ret; - - /* Unexpected message found a match. */ - if (ret == FI_SUCCESS || ret == -FI_EINPROGRESS) - return FI_SUCCESS; - } - - return -FI_ENOMSG; -} - -/* - * cxip_recv_req_sw_matcher() - Attempt to match the receive request in SW. - * - * Loop through all onloaded UX Sends looking for a match for the Receive - * request. If a match is found, progress the operation. - * - * Caller must hold ep_obj->lock. 
- */ -int cxip_recv_req_sw_matcher(struct cxip_req *req) -{ - struct cxip_rxc *rxc = req->recv.rxc; - struct cxip_ux_send *ux_send; - struct dlist_entry *tmp; - int ret; - - if (dlist_empty(&rxc->sw_ux_list)) - return -FI_ENOMSG; - - dlist_foreach_container_safe(&rxc->sw_ux_list, struct cxip_ux_send, - ux_send, rxc_entry, tmp) { - /* Only match against unclaimed UX messages */ - ret = cxip_recv_sw_matcher(rxc, req, ux_send, false); - switch (ret) { - /* On successful multi-recv or no match, keep matching. */ - case -FI_EINPROGRESS: - case -FI_ENOMSG: - break; - - /* Stop matching. */ - default: - return ret; - } - } - - return -FI_ENOMSG; -} - -/* - * cxip_recv_req_dropped() - Mark the Received request dropped. - * - * If HW does not have sufficient LEs to perform an append, the command is - * dropped. Queue the request for replay. When all outstanding append commands - * complete, replay all Receives. - * - * Caller must hold ep_obj->lock - */ -static int cxip_recv_req_dropped(struct cxip_req *req) -{ - struct cxip_rxc *rxc = req->recv.rxc; - int ret __attribute__((unused)); - - assert(dlist_empty(&req->recv.rxc_entry)); - dlist_insert_tail(&req->recv.rxc_entry, &rxc->replay_queue); - - RXC_DBG(rxc, "Receive dropped: %p\n", req); - - return FI_SUCCESS; -} - -/* - * cxip_recv_req_peek() - Peek for matching unexpected message on RXC. - * - * Examine onloaded UX sends, if not found there and HW offload is enabled, - * initiate check of HW UX list. In either case the operation will not - * consume the UX send, but only report the results of the peek to the CQ. - * - * Caller must hold the ep_obj->lock. - */ -static int cxip_recv_req_peek(struct cxip_req *req, bool check_rxc_state) -{ - struct cxip_rxc *rxc = req->recv.rxc; - struct cxip_ux_send *ux_send; - struct dlist_entry *tmp; - int ret; - - if (check_rxc_state && rxc->state != RXC_ENABLED && - rxc->state != RXC_ENABLED_SOFTWARE) - return -FI_EAGAIN; - - /* Attempt to match the onloaded UX list first */ - dlist_foreach_container_safe(&rxc->sw_ux_list, struct cxip_ux_send, - ux_send, rxc_entry, tmp) { - if (cxip_match_recv_sw(rxc, req, ux_send, false)) { - if (req->recv.flags & FI_CLAIM) - ux_send->claimed = true; - - recv_req_tgt_event(req, &ux_send->put_ev); - recv_req_peek_complete(req, ux_send); - return FI_SUCCESS; - } - } - - if (rxc->msg_offload) { - /* Must serialize H/W FI_CLAIM due to getting remote offsets */ - if (rxc->hw_claim_in_progress) - return -FI_EAGAIN; - - ret = cxip_ux_peek(req); - } else { - req->recv.rc = C_RC_NO_MATCH; - recv_req_peek_complete(req, NULL); - ret = FI_SUCCESS; - } - - return ret; -} - -/* - * cxip_recv_req_queue() - Queue Receive request on RXC. - * - * Before appending a new Receive request to a HW list, attempt to match the - * Receive to any onloaded UX Sends. - * - * Caller must hold the RXC lock and ensure correct RXC state if required. - */ -static int cxip_recv_req_queue(struct cxip_req *req, bool restart_seq) -{ - struct cxip_rxc *rxc = req->recv.rxc; - int ret; - - /* Try to match against onloaded Sends first. 
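cxip_recv_req_dropped() above parks a receive whose hardware append was dropped for lack of LEs on a replay queue, to be replayed in order once outstanding appends drain. A small standalone sketch of that drop-and-replay pattern with a fixed slot budget; the structures and limits are illustrative, not the provider's.

#include <stdio.h>

#define MAX_LE 2
#define MAX_REQS 8

struct recv_req {
	int id;
};

static int le_in_use;
static struct recv_req *replay_queue[MAX_REQS];
static int replay_len;

/* Try the hardware append; when no LE is available the request is queued
 * for replay instead of being failed back to the application.
 */
static int try_append(struct recv_req *req)
{
	if (le_in_use == MAX_LE) {
		replay_queue[replay_len++] = req;
		return -1;	/* dropped, will be replayed */
	}

	le_in_use++;
	return 0;
}

/* Once resources are recovered, replay the dropped appends in order. */
static void replay_all(void)
{
	for (int i = 0; i < replay_len; i++)
		printf("replaying receive %d\n", replay_queue[i]->id);

	replay_len = 0;
}

int main(void)
{
	struct recv_req reqs[3] = { {1}, {2}, {3} };

	try_append(&reqs[0]);
	try_append(&reqs[1]);
	try_append(&reqs[2]);	/* no LE left: parked on the replay queue */

	le_in_use = 0;		/* pretend flow-control recovery freed the LEs */
	replay_all();
	return 0;
}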
*/ - ret = cxip_recv_req_sw_matcher(req); - if (ret == FI_SUCCESS) - return -FI_EALREADY; - else if (ret == -FI_EAGAIN) - return -FI_EAGAIN; - else if (ret != -FI_ENOMSG) - RXC_FATAL(rxc, "SW matching failed: %d\n", ret); - - if (rxc->msg_offload) { - /* Can not append to priority list if claimng UX */ - if (rxc->hw_claim_in_progress) - goto err_dequeue_req; - - ret = _cxip_recv_req(req, restart_seq); - if (ret) - goto err_dequeue_req; - } else { - - req->recv.software_list = true; - dlist_insert_tail(&req->recv.rxc_entry, &rxc->sw_recv_queue); - } - - return FI_SUCCESS; - -err_dequeue_req: - dlist_remove_init(&req->recv.rxc_entry); - - return -FI_EAGAIN; -} - -static int cxip_rxc_check_recv_count_hybrid_preempt(struct cxip_rxc *rxc) -{ - int ret; - int count; - - if (cxip_env.rx_match_mode == CXIP_PTLTE_HYBRID_MODE && - cxip_env.hybrid_posted_recv_preemptive == 1) { - count = ofi_atomic_get32(&rxc->orx_reqs); - - if (count > rxc->attr.size) { - assert(rxc->state == RXC_ENABLED); - - /* On success, need to return -FI_EAGAIN which will - * propagate back to the user. In addition, RXC state - * will have transitioned to RXC_PENDING_PTLTE_DISABLE. - */ - ret = cxip_recv_pending_ptlte_disable(rxc, false); - if (ret == FI_SUCCESS) { - RXC_WARN(rxc, - "Transitioning to SW EP due to too many posted recvs: posted_count=%u request_size=%lu\n", - ret, rxc->attr.size); - return -FI_EAGAIN; - } - - RXC_WARN(rxc, "Failed to transition to SW EP: %d\n", - ret); - return ret; - } - } - - return FI_SUCCESS; -} - -/* - * _cxip_recv_req() - Submit Receive request to hardware. - */ -static ssize_t _cxip_recv_req(struct cxip_req *req, bool restart_seq) -{ - struct cxip_rxc *rxc = req->recv.rxc; - uint32_t le_flags = 0; - union cxip_match_bits mb = {}; - union cxip_match_bits ib = { - .tx_id = ~0, - .match_comp = 1, - .cq_data = 1, - .rdzv_done = 1, - .le_type = ~0, - }; - int ret; - struct cxip_md *recv_md = req->recv.recv_md; - uint64_t recv_iova = 0; - - ret = cxip_rxc_check_recv_count_hybrid_preempt(rxc); - if (ret != FI_SUCCESS) - return ret; - - if (req->recv.tagged) { - mb.tagged = 1; - mb.tag = req->recv.tag; - ib.tag = req->recv.ignore; - } - - /* For poorly written applications a periodic check LE pool - * resources can be requested to force transitions to software mode. - * For this to occur, the code must be executing in hybrid mode, - * still matching in hardware, and FI_CXI_HYBRID_RECV_PREEMPTIVE - * explicitly set by the application. - */ - if (cxip_env.rx_match_mode != CXIP_PTLTE_HYBRID_MODE || - ++rxc->recv_appends & CXIP_HYBRID_RECV_CHECK_INTERVAL) - le_flags = C_LE_EVENT_LINK_DISABLE; - - /* Always set manage_local in Receive LEs. This makes Cassini ignore - * initiator remote_offset in all Puts. With this, remote_offset in Put - * events can be used by the initiator for protocol data. The behavior - * of use_once is not impacted by manage_local. - */ - le_flags |= C_LE_EVENT_UNLINK_DISABLE | C_LE_MANAGE_LOCAL | - C_LE_UNRESTRICTED_BODY_RO | C_LE_UNRESTRICTED_END_RO | - C_LE_OP_PUT; - - if (!req->recv.multi_recv) - le_flags |= C_LE_USE_ONCE; - if (restart_seq) - le_flags |= C_LE_RESTART_SEQ; - - if (recv_md) - recv_iova = CXI_VA_TO_IOVA(recv_md->md, - (uint64_t)req->recv.recv_buf + - req->recv.start_offset); - - req->recv.hw_offloaded = true; - - /* Issue Append command */ - ret = cxip_pte_append(rxc->rx_pte, recv_iova, - req->recv.ulen - req->recv.start_offset, - recv_md ? recv_md->md->lac : 0, - C_PTL_LIST_PRIORITY, req->req_id, - mb.raw, ib.raw, req->recv.match_id, - req->recv.multi_recv ? 
rxc->min_multi_recv : 0, - le_flags, NULL, rxc->rx_cmdq, - !(req->recv.flags & FI_MORE)); - if (ret != FI_SUCCESS) { - RXC_WARN(rxc, "Failed to write Append command: %d\n", ret); - return ret; - } - - return FI_SUCCESS; -} - -/* - * cxip_recv_common() - Common message receive function. Used for tagged and - * untagged sends of all sizes. - */ -ssize_t cxip_recv_common(struct cxip_rxc *rxc, void *buf, size_t len, - void *desc, fi_addr_t src_addr, uint64_t tag, - uint64_t ignore, void *context, uint64_t flags, - bool tagged, struct cxip_cntr *comp_cntr) -{ - int ret; - struct cxip_req *req; - struct cxip_addr caddr; - struct cxip_ux_send *ux_msg; - uint32_t match_id; - - if (len && !buf) - return -FI_EINVAL; - - if (rxc->state == RXC_DISABLED) - return -FI_EOPBADSTATE; - - /* HW to SW PtlTE transition, ensure progress is made */ - if (rxc->state != RXC_ENABLED && rxc->state != RXC_ENABLED_SOFTWARE) { - cxip_cq_progress(rxc->recv_cq); - return -FI_EAGAIN; - } - - if (tagged) { - if (tag & ~CXIP_TAG_MASK || ignore & ~CXIP_TAG_MASK) { - RXC_WARN(rxc, - "Invalid tag: %#018lx ignore: %#018lx (%#018lx)\n", - tag, ignore, CXIP_TAG_MASK); - return -FI_EINVAL; - } - flags &= ~FI_MULTI_RECV; - } - - /* If FI_DIRECTED_RECV and a src_addr is specified, encode the address - * in the LE for matching. If application AVs are symmetric, use - * logical FI address for matching. Otherwise, use physical address. - */ - if (rxc->attr.caps & FI_DIRECTED_RECV && - src_addr != FI_ADDR_UNSPEC) { - if (rxc->ep_obj->av->symmetric) { - /* PID is not used for matching */ - match_id = CXI_MATCH_ID(rxc->pid_bits, C_PID_ANY, - src_addr); - } else { - ret = cxip_av_lookup_addr(rxc->ep_obj->av, src_addr, - &caddr); - if (ret != FI_SUCCESS) { - RXC_WARN(rxc, "Failed to look up FI addr: %d\n", - ret); - return -FI_EINVAL; - } - - match_id = CXI_MATCH_ID(rxc->pid_bits, caddr.pid, - caddr.nic); - } - } else { - match_id = CXI_MATCH_ID_ANY; - } - - ofi_genlock_lock(&rxc->ep_obj->lock); - ret = cxip_recv_req_alloc(rxc, buf, len, &req); - if (ret) - goto err; - - /* req->data_len, req->tag, req->data must be set later. req->buf may - * be overwritten later. - */ - req->context = (uint64_t)context; - - req->flags = FI_RECV | (flags & FI_COMPLETION); - if (tagged) - req->flags |= FI_TAGGED; - else - req->flags |= FI_MSG; - - req->recv.cntr = comp_cntr ? comp_cntr : rxc->recv_cntr; - req->recv.match_id = match_id; - req->recv.tag = tag; - req->recv.ignore = ignore; - req->recv.flags = flags; - req->recv.tagged = tagged; - req->recv.multi_recv = (flags & FI_MULTI_RECV ? true : false); - - if (rxc->state != RXC_ENABLED && rxc->state != RXC_ENABLED_SOFTWARE) { - ret = -FI_EAGAIN; - goto err_free_request; - } - - if (!(req->recv.flags & (FI_PEEK | FI_CLAIM))) { - - ret = cxip_recv_req_queue(req, false); - /* Match made in software? */ - if (ret == -FI_EALREADY) { - ofi_genlock_unlock(&rxc->ep_obj->lock); - - return FI_SUCCESS; - } - - /* RXC busy (onloading Sends or full CQ)? */ - if (ret != FI_SUCCESS) - goto err_free_request; - - ofi_genlock_unlock(&rxc->ep_obj->lock); - - RXC_DBG(rxc, - "req: %p buf: %p len: %lu src_addr: %ld tag(%c):" - " 0x%lx ignore: 0x%lx context: %p\n", - req, buf, len, src_addr, tagged ? 
'*' : '-', tag, - ignore, context); - - return FI_SUCCESS; - } - - /* FI_PEEK with/without FI_CLAIM */ - if (req->recv.flags & FI_PEEK) { - if (req->recv.flags & FI_CLAIM && !req->context) { - RXC_WARN(rxc, "FI_CLAIM requires fi_context\n"); - ret = -FI_EINVAL; - goto err_free_request; - } - ret = cxip_recv_req_peek(req, true); - if (ret == FI_SUCCESS) { - ofi_genlock_unlock(&rxc->ep_obj->lock); - - return ret; - } - - goto err_free_request; - } - - /* FI_CLAIM without FI_PEEK */ - ux_msg = ((struct fi_context *)req->context)->internal[0]; - if (!ux_msg->claimed) { - RXC_WARN(rxc, "Bad fi_context specified with FI_CLAIM\n"); - ret = -FI_EINVAL; - goto err_free_request; - } - - RXC_DBG(rxc, "FI_CLAIM invoke sw matcher %p\n", ux_msg); - ret = cxip_recv_sw_matcher(rxc, req, ux_msg, true); - if (ret == FI_SUCCESS || ret == -FI_EINPROGRESS) { - ofi_genlock_unlock(&rxc->ep_obj->lock); - - return FI_SUCCESS; - } - -err_free_request: - cxip_recv_req_free(req); -err: - ofi_genlock_unlock(&rxc->ep_obj->lock); - - return ret; -} - -/* - * cxip_txc_fi_addr() - Return the FI address of the TXC. - */ -static fi_addr_t _txc_fi_addr(struct cxip_txc *txc) -{ - if (txc->ep_obj->fi_addr == FI_ADDR_NOTAVAIL) { - txc->ep_obj->fi_addr = - cxip_av_lookup_fi_addr(txc->ep_obj->av, - &txc->ep_obj->src_addr); - TXC_DBG(txc, "Found EP FI Addr: %lu\n", txc->ep_obj->fi_addr); - } - - return txc->ep_obj->fi_addr; -} - -/* - * cxip_msg_match_id() - Return the TXC's initiator address used to transmit a - * message. - * - * By default, the physical address of the TXC is returned. This address is - * sent along with message data and is used for source address matching at the - * target. When the target receives a message, the physical ID is translated to - * a logical FI address. Translation adds overhead to the receive path. - * - * As an optimization, if rendezvous offload is not being used and the process - * is part of a job with symmetric AVs, a logical FI address is returned. This - * way, there is no source address translation overhead involved in the - * receive. - */ -static uint32_t cxip_msg_match_id(struct cxip_txc *txc) -{ - /* PID is not used for logical matching, but is used for rendezvous. */ - if (txc->ep_obj->av->symmetric) - return CXI_MATCH_ID(txc->pid_bits, txc->ep_obj->src_addr.pid, - _txc_fi_addr(txc)); - - return CXI_MATCH_ID(txc->pid_bits, txc->ep_obj->src_addr.pid, - txc->ep_obj->src_addr.nic); -} - -/* - * report_send_completion() - Report the completion of a send operation. 
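The initiator address described above is what the target matches against when the application posts a directed receive; on the application side that corresponds to passing a specific src_addr to fi_trecv (the endpoint must have the FI_DIRECTED_RECV capability), while FI_ADDR_UNSPEC accepts any sender. A usage sketch, with setup and completion handling omitted:

#include <sys/types.h>
#include <rdma/fabric.h>
#include <rdma/fi_tagged.h>

/* Post a tagged receive that only matches messages sent by 'peer'.
 * 'ep' is assumed to be an endpoint opened with FI_TAGGED and
 * FI_DIRECTED_RECV capabilities.
 */
static ssize_t post_directed_recv(struct fid_ep *ep, fi_addr_t peer,
				  void *buf, size_t len, uint64_t tag,
				  void *context)
{
	uint64_t ignore = 0;	/* every tag bit is significant */

	return fi_trecv(ep, buf, len, NULL /* desc */, peer, tag, ignore,
			context);
}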
- */ -static void report_send_completion(struct cxip_req *req, bool sw_cntr) -{ - int ret; - int ret_err; - int success_event = (req->flags & FI_COMPLETION); - struct cxip_txc *txc = req->send.txc; - - req->flags &= (FI_MSG | FI_TAGGED | FI_SEND); - - if (req->send.rc == C_RC_OK) { - TXC_DBG(txc, "Request success: %p\n", req); - - if (success_event) { - ret = cxip_cq_req_complete(req); - if (ret != FI_SUCCESS) - TXC_WARN(txc, - "Failed to report completion: %d\n", - ret); - } - - if (sw_cntr && req->send.cntr) { - ret = cxip_cntr_mod(req->send.cntr, 1, false, false); - if (ret) - TXC_WARN(txc, "cxip_cntr_mod returned: %d\n", - ret); - } - } else { - ret_err = proverr2errno(req->send.rc); - TXC_WARN(txc, "Request dest_addr: %ld caddr.nic: %#X caddr.pid: %u error: %p (err: %d, %s)\n", - req->send.dest_addr, req->send.caddr.nic, - req->send.caddr.pid, req, ret_err, - cxi_rc_to_str(req->send.rc)); - - ret = cxip_cq_req_error(req, 0, ret_err, - req->send.rc, NULL, 0, - FI_ADDR_UNSPEC); - if (ret != FI_SUCCESS) - TXC_WARN(txc, "Failed to report error: %d\n", ret); - - if (sw_cntr && req->send.cntr) { - ret = cxip_cntr_mod(req->send.cntr, 1, false, true); - if (ret) - TXC_WARN(txc, "cxip_cntr_mod returned: %d\n", - ret); - } - } -} - -/* - * rdzv_send_req_complete() - Complete long send request. - */ -static void rdzv_send_req_complete(struct cxip_req *req) -{ - cxip_rdzv_id_free(req->send.txc, req->send.rdzv_id); - - cxip_send_buf_fini(req); - - report_send_completion(req, true); - - ofi_atomic_dec32(&req->send.txc->otx_reqs); - cxip_evtq_req_free(req); -} - -/* - * rdzv_send_req_event() - Count a rendezvous send event. - * - * Call for each initiator event. The events could be generated in any order. - * Once all expected events are received, complete the request. - * - * A successful rendezvous Send generates two events: Ack and Get. - */ -static void rdzv_send_req_event(struct cxip_req *req) -{ - if (++req->send.rdzv_send_events == 2) - rdzv_send_req_complete(req); -} - -/* - * cxip_send_rdzv_put_cb() - Long send callback. - * - * Progress a long send operation to completion. - */ -static int cxip_send_rdzv_put_cb(struct cxip_req *req, - const union c_event *event) -{ - int event_rc; - int ret; - struct cxip_txc *txc = req->send.txc; - - switch (event->hdr.event_type) { - case C_EVENT_ACK: - /* The source Put completed. */ - event_rc = cxi_init_event_rc(event); - - TXC_DBG(txc, "Acked: %p (rc: %s list: %s)\n", req, - cxi_rc_to_str(event_rc), - cxi_ptl_list_to_str(event->init_short.ptl_list)); - - /* If the message was dropped, mark the peer as disabled. Do - * not generate a completion. Free associated resources. Do not - * free the request (it will be used to replay the Send). - */ - if (event_rc == C_RC_PT_DISABLED) { - ret = cxip_send_req_dropped(req->send.txc, req); - if (ret == FI_SUCCESS) - cxip_rdzv_id_free(req->send.txc, - req->send.rdzv_id); - else - ret = -FI_EAGAIN; - - return ret; - } - - /* Message was accepted by the peer. Match order is preserved. - * The request can be dequeued from the SW message queue. This - * allows flow-control recovery to be performed before - * outstanding long Send operations have completed. - */ - ret = cxip_send_req_dequeue(req->send.txc, req); - if (ret != FI_SUCCESS) - return ret; - - /* The transaction is complete if the put failed */ - if (event_rc != C_RC_OK) { - req->send.rc = event_rc; - rdzv_send_req_complete(req); - } else { - /* Count the event, another may be expected. 
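A rendezvous send only completes after both initiator events, the Ack for the Put and the Get of the remainder, have been counted, and they may arrive in either order. The pattern reduces to a small event counter; a standalone sketch:

#include <stdio.h>

struct rdzv_send {
	int events_seen;	/* Ack + Get expected */
	int done;
};

/* Call once per initiator event; arrival order does not matter. */
static void rdzv_send_event(struct rdzv_send *req)
{
	if (++req->events_seen == 2) {
		req->done = 1;
		printf("rendezvous send complete\n");
	}
}

int main(void)
{
	struct rdzv_send req = {0};

	rdzv_send_event(&req);	/* e.g. the Get arrived first */
	rdzv_send_event(&req);	/* the Ack arrives later */
	return req.done ? 0 : 1;
}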
*/ - rdzv_send_req_event(req); - } - return FI_SUCCESS; - - /* When errors happen, send events can occur before the put/get event. - * These events should just be dropped. - */ - case C_EVENT_SEND: - { - struct cxi_md *md = req->send.send_md->md; - - TXC_WARN(txc, "Unexpected %s event: rc:%s buf:%p len:0x%lx iova:0x%llx md.va:0x%llx lac:%d\n", - cxi_event_to_str(event), - cxi_rc_to_str(cxi_event_rc(event)), req->send.buf, - req->send.len, CXI_VA_TO_IOVA(md, req->send.buf), - md->iova, md->lac); - } - return FI_SUCCESS; - - default: - TXC_FATAL(txc, CXIP_UNEXPECTED_EVENT, - cxi_event_to_str(event), - cxi_rc_to_str(cxi_event_rc(event))); - } -} - -/* - * cxip_rdzv_pte_src_cb() - Process rendezvous source buffer events. - * - * A Get event is generated for each rendezvous Send indicating Send completion. - */ -int cxip_rdzv_pte_src_cb(struct cxip_req *req, const union c_event *event) -{ - struct cxip_rdzv_pte *rdzv_pte = req->req_ctx; - struct cxip_txc *txc = rdzv_pte->txc; - struct cxip_req *get_req; - union cxip_match_bits mb; - int event_rc = cxi_event_rc(event); - int rdzv_id; - - switch (event->hdr.event_type) { - case C_EVENT_LINK: - if (event_rc == C_RC_OK) - ofi_atomic_inc32(&rdzv_pte->le_linked_success_count); - else - ofi_atomic_inc32(&rdzv_pte->le_linked_failure_count); - return FI_SUCCESS; - - case C_EVENT_GET: - mb.raw = event->tgt_long.match_bits; - rdzv_id = (mb.rdzv_id_hi << CXIP_RDZV_ID_CMD_WIDTH) | - mb.rdzv_id_lo; - get_req = cxip_rdzv_id_lookup(txc, rdzv_id); - if (!get_req) { - TXC_WARN(txc, "Failed to find RDZV ID: %d\n", - mb.rdzv_id_lo); - return FI_SUCCESS; - } - - if (event_rc != C_RC_OK) - TXC_WARN(txc, "Get error: %p rc: %s\n", get_req, - cxi_rc_to_str(event_rc)); - else - TXC_DBG(txc, "Get received: %p rc: %s\n", get_req, - cxi_rc_to_str(event_rc)); - - get_req->send.rc = event_rc; - - /* Count the event, another may be expected. */ - rdzv_send_req_event(get_req); - - return FI_SUCCESS; - default: - TXC_FATAL(txc, CXIP_UNEXPECTED_EVENT, - cxi_event_to_str(event), - cxi_rc_to_str(cxi_event_rc(event))); - } -} - -static inline int cxip_send_prep_cmdq(struct cxip_cmdq *cmdq, - struct cxip_req *req, - uint32_t tclass) -{ - struct cxip_txc *txc = req->send.txc; - int ret; - uint16_t vni; - - if (!req->triggered) { - if (txc->ep_obj->av_auth_key) - vni = req->send.caddr.vni; - else - vni = txc->ep_obj->auth_key.vni; - - ret = cxip_txq_cp_set(cmdq, vni, - cxip_ofi_to_cxi_tc(txc->tclass), - CXI_TC_TYPE_DEFAULT); - if (ret != FI_SUCCESS) - return ret; - } - - if (req->send.flags & FI_FENCE) { - ret = cxi_cq_emit_cq_cmd(cmdq->dev_cmdq, C_CMD_CQ_FENCE); - if (ret) { - TXC_DBG(txc, "Failed to issue CQ_FENCE command: %d\n", - ret); - return -FI_EAGAIN; - } - } - - return FI_SUCCESS; -} - -/* - * _cxip_send_rdzv_put() - Initiate a send rendezvous put operation. - * - * The rendezvous protocol works as follows: - * - * 1. The Initiator performs a Rendezvous Put command which includes a portion - * of the source buffer data. - * 2. Once the Put is matched to a user receive buffer (in the Priority list), - * a Get of the remaining source data is performed. - */ -static ssize_t _cxip_send_rdzv_put(struct cxip_req *req) -{ - struct cxip_txc *txc = req->send.txc; - union c_fab_addr dfa; - uint8_t idx_ext; - struct c_full_dma_cmd cmd = {}; - union cxip_match_bits put_mb = {}; - int rdzv_id; - int lac = req->send.send_md->md->lac; - int ret; - struct cxip_cmdq *cmdq = - req->triggered ? txc->domain->trig_cmdq : txc->tx_cmdq; - - /* Zero length rendezvous not supported. 
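The rendezvous Put carries at most rdzv_eager_size bytes of payload and the target later Gets the remainder. The sketch below shows the arithmetic of that split in a simplified form; the real protocol derives the Get offset from the event's mlength and the onloaded remote offset (see the fixup above), so the field names and offset rule here are illustrative only.

#include <stdint.h>
#include <stdio.h>

struct rdzv_split {
	uint64_t eager_len;	/* bytes carried by the rendezvous Put */
	uint64_t get_len;	/* bytes the target must Get afterwards */
	uint64_t get_offset;	/* source offset the Get starts from */
};

static struct rdzv_split rdzv_plan(uint64_t msg_len, uint64_t eager_size)
{
	struct rdzv_split s;

	s.eager_len = msg_len < eager_size ? msg_len : eager_size;
	s.get_len = msg_len - s.eager_len;
	s.get_offset = s.eager_len;

	return s;
}

int main(void)
{
	struct rdzv_split s = rdzv_plan(1 << 20, 2048);

	printf("put carries %llu bytes, get fetches %llu from offset %llu\n",
	       (unsigned long long)s.eager_len,
	       (unsigned long long)s.get_len,
	       (unsigned long long)s.get_offset);
	return 0;
}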
*/ - assert(req->send.send_md); - assert(req->send.len); - - /* Allocate rendezvous ID */ - rdzv_id = cxip_rdzv_id_alloc(txc, req); - if (rdzv_id < 0) - return -FI_EAGAIN; - - /* Calculate DFA */ - cxi_build_dfa(req->send.caddr.nic, req->send.caddr.pid, txc->pid_bits, - CXIP_PTL_IDX_RXQ, &dfa, &idx_ext); - - /* Allocate a source request for the given LAC. This makes the source - * memory accessible for rendezvous. - */ - ret = cxip_rdzv_pte_src_req_alloc(txc->rdzv_pte, lac); - if (ret) { - TXC_WARN(txc, "Failed to prepare source window: %d\n", ret); - goto err_free_rdzv_id; - } - - - /* Allocate restricted source window. If resources can not be allocated - * discontinue use of the restricted protocol, falling back - * to unrestricted. TODO: keep track and only switch for LAC that - * failed. - */ - if (txc->rdzv_proto == CXIP_RDZV_PROTO_ALT_READ && - !txc->rdzv_nomatch_pte[lac]) { - TXC_DBG(txc, "allocate restricted PTE lac %d\n", lac); - - ret = cxip_rdzv_nomatch_pte_alloc(txc, lac, - &txc->rdzv_nomatch_pte[lac]); - if (ret) { - TXC_WARN(txc, WARN_RESTRICTED_DISABLED, - cxip_rdzv_proto_to_str(txc->rdzv_proto), - cxip_rdzv_proto_to_str(CXIP_RDZV_PROTO_DEFAULT)); - txc->rdzv_proto = CXIP_RDZV_PROTO_DEFAULT; - } - } - - /* Build match bits */ - if (req->send.tagged) { - put_mb.tagged = 1; - put_mb.tag = req->send.tag; - } - - if (req->send.flags & FI_REMOTE_CQ_DATA) - put_mb.cq_data = 1; - - put_mb.rdzv_proto = txc->rdzv_proto; - - req->send.rdzv_id = rdzv_id; - req->cb = cxip_send_rdzv_put_cb; - req->send.rdzv_send_events = 0; - - /* Build Put command descriptor */ - cmd.command.cmd_type = C_CMD_TYPE_DMA; - cmd.index_ext = idx_ext; - cmd.lac = req->send.send_md->md->lac; - cmd.event_send_disable = 1; - cmd.restricted = 0; - cmd.dfa = dfa; - cmd.local_addr = CXI_VA_TO_IOVA(req->send.send_md->md, req->send.buf); - cmd.request_len = req->send.len; - cmd.eq = cxip_evtq_eqn(&txc->tx_evtq); - cmd.user_ptr = (uint64_t)req; - cmd.initiator = cxip_msg_match_id(txc); - cmd.header_data = req->send.data; - cmd.remote_offset = - CXI_VA_TO_IOVA(req->send.send_md->md, req->send.buf); - cmd.command.opcode = C_CMD_RENDEZVOUS_PUT; - cmd.eager_length = txc->rdzv_eager_size; - cmd.use_offset_for_get = 1; - - put_mb.rdzv_id_hi = rdzv_id >> CXIP_RDZV_ID_CMD_WIDTH; - put_mb.rdzv_lac = req->send.send_md->md->lac; - put_mb.le_type = CXIP_LE_TYPE_RX; - cmd.match_bits = put_mb.raw; - cmd.rendezvous_id = rdzv_id; - - if (req->triggered) { - const struct c_ct_cmd ct_cmd = { - .trig_ct = req->trig_cntr->ct->ctn, - .threshold = req->trig_thresh, - }; - - /* Triggered command queue is domain resource, lock. */ - ofi_genlock_lock(&txc->domain->trig_cmdq_lock); - - ret = cxip_send_prep_cmdq(cmdq, req, req->send.tclass); - if (ret) { - ofi_genlock_unlock(&txc->domain->trig_cmdq_lock); - goto err_free_rdzv_id; - } - - /* Clear the triggered flag to prevent retrying of operation, - * due to flow control, from using the triggered path. 
- */ - req->triggered = false; - - ret = cxi_cq_emit_trig_full_dma(cmdq->dev_cmdq, &ct_cmd, - &cmd); - if (ret) { - ofi_genlock_unlock(&txc->domain->trig_cmdq_lock); - goto err_enqueue; - } - - cxip_txq_ring(cmdq, !!(req->send.flags & FI_MORE), - ofi_atomic_get32(&req->send.txc->otx_reqs) - 1); - ofi_genlock_unlock(&txc->domain->trig_cmdq_lock); - } else { - - ret = cxip_send_prep_cmdq(cmdq, req, req->send.tclass); - if (ret) - goto err_free_rdzv_id; - - ret = cxi_cq_emit_dma(cmdq->dev_cmdq, &cmd); - if (ret) - goto err_enqueue; - - cxip_txq_ring(cmdq, !!(req->send.flags & FI_MORE), - ofi_atomic_get32(&req->send.txc->otx_reqs) - 1); - } - - return FI_SUCCESS; - -err_enqueue: - TXC_DBG(txc, "Failed to enqueue Put: %d, return -FI_EAGAIN\n", ret); -err_free_rdzv_id: - cxip_rdzv_id_free(txc, rdzv_id); - - return -FI_EAGAIN; -} - -/* - * cxip_send_eager_cb() - Eager send callback. Used for both tagged and - * untagged messages. - */ -static int cxip_send_eager_cb(struct cxip_req *req, - const union c_event *event) -{ - int match_complete = req->flags & FI_MATCH_COMPLETE; - int ret; - - /* When errors happen, send events can occur before the put/get event. - * These events should just be dropped. - */ - if (event->hdr.event_type == C_EVENT_SEND) { - TXC_WARN(req->send.txc, CXIP_UNEXPECTED_EVENT, - cxi_event_to_str(event), - cxi_rc_to_str(cxi_event_rc(event))); - return FI_SUCCESS; - } - - assert(event->hdr.event_type == C_EVENT_ACK); - - req->send.rc = cxi_init_event_rc(event); - - /* If the message was dropped, mark the peer as disabled. Do not - * generate a completion. Free associated resources. Do not free the - * request (it will be used to replay the Send). - */ - if (req->send.rc == C_RC_PT_DISABLED) { - - ret = cxip_send_req_dropped(req->send.txc, req); - if (ret != FI_SUCCESS) - return -FI_EAGAIN; - - if (match_complete) - cxip_tx_id_free(req->send.txc, req->send.tx_id); - - return FI_SUCCESS; - } - - ret = cxip_send_req_dequeue(req->send.txc, req); - if (ret != FI_SUCCESS) - return ret; - - cxip_send_buf_fini(req); - - /* If MATCH_COMPLETE was requested and the the Put did not match a user - * buffer, do not generate a completion event until the target notifies - * the initiator that the match is complete. - */ - if (match_complete) { - if (req->send.rc == C_RC_OK && - event->init_short.ptl_list != C_PTL_LIST_PRIORITY) { - TXC_DBG(req->send.txc, - "Waiting for match complete: %p\n", req); - return FI_SUCCESS; - } - - TXC_DBG(req->send.txc, "Match complete with Ack: %p\n", req); - cxip_tx_id_free(req->send.txc, req->send.tx_id); - } - - /* If MATCH_COMPLETE was requested, software must manage counters. 
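With FI_MATCH_COMPLETE an eager send may not complete on the Ack alone: completion is immediate only if the Ack failed or the Put already landed on the priority list, otherwise the initiator waits for the target's match-complete notification before releasing the TX id. A compact sketch of that decision, with illustrative names:

#include <stdbool.h>
#include <stdio.h>

enum ptl_list { PTL_LIST_PRIORITY, PTL_LIST_OVERFLOW };

/* Decide on the Ack whether an eager send using FI_MATCH_COMPLETE can be
 * completed immediately, or must wait for the target's match-complete
 * notification.
 */
static bool complete_on_ack(bool match_complete, bool ack_ok,
			    enum ptl_list list)
{
	if (!match_complete)
		return true;	/* ordinary eager send: the Ack is enough */

	/* Failed Acks complete (in error) now; a Put that already matched a
	 * posted receive will not generate a separate notification either.
	 */
	return !ack_ok || list == PTL_LIST_PRIORITY;
}

int main(void)
{
	printf("%d\n", complete_on_ack(true, true, PTL_LIST_OVERFLOW));	/* 0: wait */
	printf("%d\n", complete_on_ack(true, true, PTL_LIST_PRIORITY));	/* 1 */
	return 0;
}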
*/ - report_send_completion(req, match_complete); - - ofi_atomic_dec32(&req->send.txc->otx_reqs); - cxip_evtq_req_free(req); - - return FI_SUCCESS; -} - -static inline int cxip_set_eager_mb(struct cxip_req *req, - union cxip_match_bits *mb) -{ - int tx_id; - - mb->raw = 0; - mb->le_type = CXIP_LE_TYPE_RX; - mb->tagged = req->send.tagged; - mb->tag = req->send.tag; - mb->cq_data = !!(req->send.flags & FI_REMOTE_CQ_DATA); - - /* Allocate a TX ID if match completion guarantees are required */ - if (req->send.flags & FI_MATCH_COMPLETE) { - - tx_id = cxip_tx_id_alloc(req->send.txc, req); - if (tx_id < 0) { - TXC_DBG(req->send.txc, - "Failed to allocate TX ID: %d\n", tx_id); - return -FI_EAGAIN; - } - - req->send.tx_id = tx_id; - mb->match_comp = 1; - mb->tx_id = tx_id; - } - - return FI_SUCCESS; -} - -/* - * _cxip_send_eager_idc() - Enqueue eager IDC message - */ -static ssize_t _cxip_send_eager_idc(struct cxip_req *req) -{ - struct cxip_txc *txc = req->send.txc; - union c_fab_addr dfa; - uint8_t idx_ext; - union cxip_match_bits mb; - ssize_t ret; - struct cxip_cmdq *cmdq = txc->tx_cmdq; - const void *buf; - struct c_cstate_cmd cstate_cmd = {}; - struct c_idc_msg_hdr idc_cmd; - - assert(req->send.len > 0); - -#if ENABLE_DEBUG - if (req->send.flags & FI_INJECT) - assert(req->send.ibuf); - - /* ibuf and send_md are mutually exclusive. */ - if (req->send.ibuf) { - assert(req->send.send_md == NULL); - } else if (req->send.send_md) { - assert(req->send.ibuf == NULL); - - /* All non FI_HMEM_SYSTEM buffers require an ibuf. */ - assert(req->send.send_md->info.iface == FI_HMEM_SYSTEM); - } -#endif - - /* Calculate DFA */ - cxi_build_dfa(req->send.caddr.nic, req->send.caddr.pid, txc->pid_bits, - CXIP_PTL_IDX_RXQ, &dfa, &idx_ext); - - /* Favor bounce buffer if allocated. */ - if (req->send.ibuf) - buf = req->send.ibuf; - else - buf = req->send.buf; - - ret = cxip_set_eager_mb(req, &mb); - if (ret) - goto err; - - req->cb = cxip_send_eager_cb; - - /* Build commands before taking lock */ - cstate_cmd.event_send_disable = 1; - cstate_cmd.index_ext = idx_ext; - cstate_cmd.eq = cxip_evtq_eqn(&txc->tx_evtq); - cstate_cmd.initiator = cxip_msg_match_id(txc); - - /* If MATCH_COMPLETE was requested, software must manage - * counters. - */ - if (req->send.cntr && !mb.match_comp) { - cstate_cmd.event_ct_ack = 1; - cstate_cmd.ct = req->send.cntr->ct->ctn; - } - - /* Note: IDC command completely filled in */ - idc_cmd.unused_0 = 0; - idc_cmd.dfa = dfa; - idc_cmd.match_bits = mb.raw; - idc_cmd.header_data = req->send.data; - idc_cmd.user_ptr = (uint64_t)req; - - /* Submit command */ - ret = cxip_send_prep_cmdq(cmdq, req, req->send.tclass); - if (ret) - goto err_cleanup; - - ret = cxip_cmdq_emit_c_state(cmdq, &cstate_cmd); - if (ret) { - TXC_DBG(txc, "Failed to issue C_STATE command: %ld\n", ret); - goto err_cleanup; - } - - ret = cxi_cq_emit_idc_msg(cmdq->dev_cmdq, &idc_cmd, buf, req->send.len); - if (ret) { - TXC_DBG(txc, "Failed to write IDC: %ld\n", ret); - - /* Return error according to Domain Resource Management */ - ret = -FI_EAGAIN; - goto err_cleanup; - } - - cxip_txq_ring(cmdq, !!(req->send.flags & FI_MORE), - ofi_atomic_get32(&req->send.txc->otx_reqs) - 1); - - return FI_SUCCESS; - -err_cleanup: - if (mb.match_comp) - cxip_tx_id_free(txc, req->send.tx_id); -err: - return ret; -} - -/* - * _cxip_send_eager() - Enqueue eager send command. + * Before cxip_ux_onload() can be called, all user appends in the command queue + * must be flushed. 
If not, this can cause cxip_ux_onload() to read incorrect + * remote offsets from cxil_pte_status(). The flush is implemented by issuing + * a search command which will match zero ULEs. When the search event is + * processed, all pending user appends will have been processed. Since the RXC + * is not enabled, new appends cannot occur during this time. + * + * Caller must hold ep_obj->lock. */ -static ssize_t _cxip_send_eager(struct cxip_req *req) +int cxip_flush_appends(struct cxip_rxc_hpc *rxc, + int (*flush_cb)(struct cxip_req *req, + const union c_event *event)) { - struct cxip_txc *txc = req->send.txc; - union c_fab_addr dfa; - uint8_t idx_ext; - union cxip_match_bits mb; - ssize_t ret; - struct cxip_cmdq *cmdq = - req->triggered ? txc->domain->trig_cmdq : txc->tx_cmdq; - bool trig = req->triggered; - struct c_full_dma_cmd cmd = {}; - - /* Calculate DFA */ - cxi_build_dfa(req->send.caddr.nic, req->send.caddr.pid, txc->pid_bits, - CXIP_PTL_IDX_RXQ, &dfa, &idx_ext); - - ret = cxip_set_eager_mb(req, &mb); - if (ret) - goto err; - - req->cb = cxip_send_eager_cb; - - cmd.command.cmd_type = C_CMD_TYPE_DMA; - cmd.command.opcode = C_CMD_PUT; - cmd.index_ext = idx_ext; - cmd.event_send_disable = 1; - cmd.dfa = dfa; - cmd.eq = cxip_evtq_eqn(&txc->tx_evtq); - cmd.user_ptr = (uint64_t)req; - cmd.initiator = cxip_msg_match_id(txc); - cmd.match_bits = mb.raw; - cmd.header_data = req->send.data; - - /* Triggered ops could result in 0 length DMA */ - if (req->send.send_md) { - cmd.lac = req->send.send_md->md->lac; - cmd.local_addr = CXI_VA_TO_IOVA(req->send.send_md->md, - req->send.buf); - cmd.request_len = req->send.len; - } + struct cxip_req *req; + union c_cmdu cmd = {}; + int ret; - /* If MATCH_COMPLETE was requested, software must manage - * counters. - */ - if (req->send.cntr && !mb.match_comp) { - cmd.event_ct_ack = 1; - cmd.ct = req->send.cntr->ct->ctn; + /* Populate request */ + req = cxip_evtq_req_alloc(&rxc->base.rx_evtq, 1, rxc); + if (!req) { + RXC_DBG(rxc, "Failed to allocate request\n"); + ret = -FI_EAGAIN; + goto err; } + ofi_atomic_inc32(&rxc->base.orx_reqs); - /* Issue Eager Put command */ - if (trig) { - const struct c_ct_cmd ct_cmd = { - .trig_ct = req->trig_cntr->ct->ctn, - .threshold = req->trig_thresh, - }; - - /* Triggered command queue is domain resource, lock. */ - ofi_genlock_lock(&txc->domain->trig_cmdq_lock); - ret = cxip_send_prep_cmdq(cmdq, req, req->send.tclass); - if (ret) { - ofi_genlock_unlock(&txc->domain->trig_cmdq_lock); - goto err; - } - - /* Clear the triggered flag to prevent retrying of - * operation, due to flow control, from using the - * triggered path. - */ - req->triggered = false; - - ret = cxi_cq_emit_trig_full_dma(cmdq->dev_cmdq, &ct_cmd, - &cmd); - if (ret) { - ofi_genlock_unlock(&txc->domain->trig_cmdq_lock); - goto err_enqueue; - } - cxip_txq_ring(cmdq, !!(req->send.flags & FI_MORE), - ofi_atomic_get32(&req->send.txc->otx_reqs) - 1); - ofi_genlock_unlock(&txc->domain->trig_cmdq_lock); + rxc->base.rx_evtq.ack_batch_size = 1; - } else { - ret = cxip_send_prep_cmdq(cmdq, req, req->send.tclass); - if (ret) - goto err; + req->cb = flush_cb; + req->type = CXIP_REQ_SEARCH; - ret = cxi_cq_emit_dma(cmdq->dev_cmdq, &cmd); - if (ret) - goto err_enqueue; + /* Search command which should match nothing. 
*/ + cmd.command.opcode = C_CMD_TGT_SEARCH; + cmd.target.ptl_list = C_PTL_LIST_UNEXPECTED; + cmd.target.ptlte_index = rxc->base.rx_pte->pte->ptn; + cmd.target.buffer_id = req->req_id; + cmd.target.match_bits = -1UL; + cmd.target.length = 0; - cxip_txq_ring(cmdq, !!(req->send.flags & FI_MORE), - ofi_atomic_get32(&req->send.txc->otx_reqs) - 1); + ret = cxi_cq_emit_target(rxc->base.rx_cmdq->dev_cmdq, &cmd); + if (ret) { + RXC_WARN(rxc, "Failed to write Search command: %d\n", ret); + ret = -FI_EAGAIN; + goto err_dec_free_cq_req; } - return FI_SUCCESS; + cxi_cq_ring(rxc->base.rx_cmdq->dev_cmdq); -err_enqueue: - TXC_DBG(txc, "Failed to write DMA command: %ld\n", ret); - ret = -FI_EAGAIN; + return FI_SUCCESS; - if (mb.match_comp) - cxip_tx_id_free(txc, req->send.tx_id); +err_dec_free_cq_req: + ofi_atomic_dec32(&rxc->base.orx_reqs); + cxip_evtq_req_free(req); err: return ret; } -static bool cxip_send_eager_idc(struct cxip_req *req) -{ - return (req->send.len <= CXIP_INJECT_SIZE) && - !cxip_env.disable_non_inject_msg_idc; -} - -static ssize_t _cxip_send_req(struct cxip_req *req) -{ - /* Force all zero-byte operations to use the eager path. This utilizes - * a smaller command format. - */ - if (req->send.len == 0) - return _cxip_send_eager(req); - - /* IDC commands are not supported with triggered operations. */ - if (!req->triggered && - ((req->send.flags & FI_INJECT) || cxip_send_eager_idc(req))) - return _cxip_send_eager_idc(req); - - if (req->send.len <= req->send.txc->max_eager_size) - return _cxip_send_eager(req); - - return _cxip_send_rdzv_put(req); -} - -/* - * cxip_fc_peer_lookup() - Check if a peer is disabled. - * - * Look up disabled peer state and return it, if available. - * - * Caller must hold ep_obj->lock. - */ -static struct cxip_fc_peer *cxip_fc_peer_lookup(struct cxip_txc *txc, - struct cxip_addr caddr) -{ - struct cxip_fc_peer *peer; - - dlist_foreach_container(&txc->fc_peers, struct cxip_fc_peer, - peer, txc_entry) { - if (CXIP_ADDR_EQUAL(peer->caddr, caddr)) - return peer; - } - - return NULL; -} - /* - * cxip_fc_peer_put() - Account for completion of an outstanding Send targeting - * a disabled peer. + * cxip_recv_req_dropped() - Mark the Received request dropped. * - * Drop a reference to a disabled peer. When the last reference is dropped, - * attempt flow-control recovery. + * If HW does not have sufficient LEs to perform an append, the command is + * dropped. Queue the request for replay. When all outstanding append commands + * complete, replay all Receives. * - * Caller must hold ep_obj->lock. + * Caller must hold ep_obj->lock */ -static int cxip_fc_peer_put(struct cxip_fc_peer *peer) +int cxip_recv_req_dropped(struct cxip_req *req) { - int ret; - - assert(peer->pending > 0); + struct cxip_rxc_hpc *rxc = req->recv.rxc_hpc; - /* Account for the completed Send */ - if (!--peer->pending) { - peer->req.send.mb.drops = peer->dropped; - - ret = cxip_ctrl_msg_send(&peer->req); - if (ret != FI_SUCCESS) { - peer->pending++; - return ret; - } + assert(rxc->base.protocol == FI_PROTO_CXI); + assert(dlist_empty(&req->recv.rxc_entry)); - peer->pending_acks++; + dlist_insert_tail(&req->recv.rxc_entry, &rxc->replay_queue); - TXC_DBG(peer->txc, - "Notified disabled peer NIC: %#x PID: %u dropped: %u\n", - peer->caddr.nic, peer->caddr.pid, peer->dropped); - } + RXC_DBG(rxc, "Receive dropped: %p\n", req); return FI_SUCCESS; } /* - * cxip_fc_peer_fini() - Remove disabled peer state. - * - * Caller must hold ep_obj->lock. 
- */ -static void cxip_fc_peer_fini(struct cxip_fc_peer *peer) -{ - assert(dlist_empty(&peer->msg_queue)); - dlist_remove(&peer->txc_entry); - free(peer); -} - -/* - * cxip_fc_notify_cb() - Process FC notify completion events. - */ -int cxip_fc_notify_cb(struct cxip_ctrl_req *req, const union c_event *event) -{ - struct cxip_fc_peer *peer = container_of(req, struct cxip_fc_peer, req); - struct cxip_txc *txc = peer->txc; - - switch (event->hdr.event_type) { - case C_EVENT_ACK: - switch (cxi_event_rc(event)) { - case C_RC_OK: - TXC_DBG(txc, - "FC_NOTIFY to %#x:%u successfully sent: retry_count=%u\n", - peer->caddr.nic, peer->caddr.pid, - peer->retry_count); - - /* Peer flow control structure can only be freed if - * replay is complete and all acks accounted for. - */ - peer->pending_acks--; - if (!peer->pending_acks && peer->replayed) - cxip_fc_peer_fini(peer); - - return FI_SUCCESS; - - /* This error occurs when the target's control event queue has - * run out of space. Since the target should be processing the - * event queue, it is safe to replay messages until C_RC_OK is - * returned. - */ - case C_RC_ENTRY_NOT_FOUND: - peer->retry_count++; - TXC_WARN(txc, - "%#x:%u dropped FC message: retry_delay_usecs=%d retry_count=%u\n", - peer->caddr.nic, peer->caddr.pid, - cxip_env.fc_retry_usec_delay, - peer->retry_count); - usleep(cxip_env.fc_retry_usec_delay); - return cxip_ctrl_msg_send(req); - default: - TXC_FATAL(txc, CXIP_UNEXPECTED_EVENT_STS, - cxi_event_to_str(event), - cxi_rc_to_str(cxi_event_rc(event))); - } - default: - TXC_FATAL(txc, CXIP_UNEXPECTED_EVENT, - cxi_event_to_str(event), - cxi_rc_to_str(cxi_event_rc(event))); - } -} - -/* - * cxip_fc_peer_init() - Mark a peer as disabled. - * - * Called by sending EP after experiencing first dropped Send to a peer. - * - * Allocate state to track the disabled peer. Locate all outstanding Sends - * targeting the peer. - * - * Caller must hold ep_obj->lock. + * _txc_fi_addr() - Return the FI address of the TXC. */ -static int cxip_fc_peer_init(struct cxip_txc *txc, struct cxip_addr caddr, - struct cxip_fc_peer **peer) +static fi_addr_t _txc_fi_addr(struct cxip_txc *txc) { - struct cxip_fc_peer *p; - struct cxip_req *req; - struct dlist_entry *tmp; - - p = calloc(1, sizeof(*p)); - if (!p) { - TXC_WARN(txc, "Failed to allocate FC Peer\n"); - return -FI_ENOMEM; - } - - p->caddr = caddr; - p->txc = txc; - dlist_init(&p->msg_queue); - dlist_insert_tail(&p->txc_entry, &txc->fc_peers); - - p->req.send.nic_addr = caddr.nic; - p->req.send.pid = caddr.pid; - /* TODO: remove */ - p->req.send.mb.txc_id = 0; - p->req.send.mb.rxc_id = 0; - - p->req.send.mb.ctrl_le_type = CXIP_CTRL_LE_TYPE_CTRL_MSG; - p->req.send.mb.ctrl_msg_type = CXIP_CTRL_MSG_FC_NOTIFY; - p->req.cb = cxip_fc_notify_cb; - p->req.ep_obj = txc->ep_obj; - - /* Queue all Sends to the FC'ed peer */ - dlist_foreach_container_safe(&txc->msg_queue, struct cxip_req, - req, send.txc_entry, tmp) { - if (CXIP_ADDR_EQUAL(req->send.caddr, caddr)) { - dlist_remove(&req->send.txc_entry); - dlist_insert_tail(&req->send.txc_entry, &p->msg_queue); - p->pending++; - req->send.fc_peer = p; - } + if (txc->ep_obj->fi_addr == FI_ADDR_NOTAVAIL) { + txc->ep_obj->fi_addr = + cxip_av_lookup_fi_addr(txc->ep_obj->av, + &txc->ep_obj->src_addr); + TXC_DBG(txc, "Found EP FI Addr: %lu\n", txc->ep_obj->fi_addr); } - *peer = p; - - return FI_SUCCESS; + return txc->ep_obj->fi_addr; } /* - * cxip_fc_resume() - Replay dropped Sends. + * cxip_msg_match_id() - Return the TXC's initiator address used to transmit a + * message. 
* - * Called by sending EP after being notified disabled peer was re-enabled. + * By default, the physical address of the TXC is returned. This address is + * sent along with message data and is used for source address matching at the + * target. When the target receives a message, the physical ID is translated to + * a logical FI address. Translation adds overhead to the receive path. * - * Replay all dropped Sends in order. + * As an optimization, if rendezvous offload is not being used and the process + * is part of a job with symmetric AVs, a logical FI address is returned. This + * way, there is no source address translation overhead involved in the + * receive. */ -int cxip_fc_resume(struct cxip_ep_obj *ep_obj, uint32_t nic_addr, uint32_t pid) +uint32_t cxip_msg_match_id(struct cxip_txc *txc) { - struct cxip_txc *txc = &ep_obj->txc; - struct cxip_fc_peer *peer; - struct cxip_addr caddr = { - .nic = nic_addr, - .pid = pid, - }; - struct cxip_req *req; - struct dlist_entry *tmp; - int ret __attribute__((unused)); - - peer = cxip_fc_peer_lookup(txc, caddr); - if (!peer) - TXC_FATAL(txc, "Fatal, FC peer not found: NIC: %#x PID: %d\n", - nic_addr, pid); - - TXC_DBG(txc, "Replaying dropped sends, NIC: %#x PID: %d\n", - nic_addr, pid); - - dlist_foreach_container_safe(&peer->msg_queue, struct cxip_req, - req, send.txc_entry, tmp) { - /* -FI_EAGAIN can be return if the command queue is full. Loop - * until this goes through. - */ - do { - ret = _cxip_send_req(req); - } while (ret == -FI_EAGAIN); - assert(ret == FI_SUCCESS); - - /* Move request back to the message queue. */ - dlist_remove(&req->send.txc_entry); - req->send.fc_peer = NULL; - dlist_insert_tail(&req->send.txc_entry, &txc->msg_queue); - - TXC_DBG(txc, "Replayed %p\n", req); - } - - /* Peer flow control structure can only be freed if replay is complete - * and all acks accounted for. - */ - if (!peer->pending_acks) - cxip_fc_peer_fini(peer); - else - peer->replayed = true; + /* PID is not used for logical matching, but is used for rendezvous. */ + if (txc->ep_obj->av->symmetric) + return CXI_MATCH_ID(txc->pid_bits, txc->ep_obj->src_addr.pid, + _txc_fi_addr(txc)); - return FI_SUCCESS; + return CXI_MATCH_ID(txc->pid_bits, txc->ep_obj->src_addr.pid, + txc->ep_obj->src_addr.nic); } /* - * cxip_send_req_dropped() - Mark the Send request dropped. - * - * Mark the Send request dropped. Mark the target peer as disabled. Track all - * outstanding Sends targeting the disabled peer. When all outstanding Sends - * are completed, recovery will be performed. + * report_send_completion() - Report the completion of a send operation. */ -static int cxip_send_req_dropped(struct cxip_txc *txc, struct cxip_req *req) +void cxip_report_send_completion(struct cxip_req *req, bool sw_cntr) { - struct cxip_fc_peer *peer; int ret; + int ret_err; + int success_event = (req->flags & FI_COMPLETION); + struct cxip_txc *txc = req->send.txc; - /* Check if peer is already disabled */ - peer = cxip_fc_peer_lookup(txc, req->send.caddr); - if (!peer) { - ret = cxip_fc_peer_init(txc, req->send.caddr, &peer); - if (ret != FI_SUCCESS) - return ret; + req->flags &= (FI_MSG | FI_TAGGED | FI_SEND | FI_CXI_TRUNC); - TXC_DBG(txc, - "Disabled peer detected, NIC: %#x PID: %u pending: %u\n", - peer->caddr.nic, peer->caddr.pid, peer->pending); - } + if (req->send.rc == C_RC_OK) { + TXC_DBG(txc, "Request success: %p\n", req); - /* Account for the dropped message. 
*/ - peer->dropped++; - ret = cxip_fc_peer_put(peer); - if (ret) - peer->dropped--; - else - TXC_DBG(txc, - "Send dropped, req: %p NIC: %#x PID: %u pending: %u dropped: %u\n", - req, peer->caddr.nic, peer->caddr.pid, peer->pending, - peer->dropped); + if (success_event) { + ret = cxip_cq_req_complete(req); + if (ret != FI_SUCCESS) + TXC_WARN(txc, + "Failed to report completion: %d\n", + ret); + } - return ret; -} + if (sw_cntr && req->send.cntr) { + ret = cxip_cntr_mod(req->send.cntr, 1, false, false); + if (ret) + TXC_WARN(txc, "cxip_cntr_mod returned: %d\n", + ret); + } + } else { + if (req->send.canceled) { + ret_err = FI_ECANCELED; + TXC_DBG(txc, "Request canceled: %p (err: %d)\n", + req, ret_err); + } else { + ret_err = proverr2errno(req->send.rc); + } -/* - * cxip_send_req_queue() - Queue Send request on TXC. - * - * Place the Send request in an ordered SW queue. Return error if the target - * peer is disabled. - */ -static int cxip_send_req_queue(struct cxip_txc *txc, struct cxip_req *req) -{ - struct cxip_fc_peer *peer; + TXC_WARN(txc, "Request dest_addr: %ld caddr.nic: %#X caddr.pid: %u error: %p (err: %d, %s)\n", + req->send.dest_addr, req->send.caddr.nic, + req->send.caddr.pid, req, ret_err, + cxi_rc_to_str(req->send.rc)); - if (!dlist_empty(&txc->fc_peers)) { - peer = cxip_fc_peer_lookup(txc, req->send.caddr); - if (peer) { - /* Peer is disabled. Progress control EQs so future - * cxip_send_req_queue() may succeed. - */ - cxip_ep_ctrl_progress_locked(txc->ep_obj); + ret = cxip_cq_req_error(req, 0, ret_err, + req->send.rc, NULL, 0, + FI_ADDR_UNSPEC); + if (ret != FI_SUCCESS) + TXC_WARN(txc, "Failed to report error: %d\n", ret); - return -FI_EAGAIN; + if (sw_cntr && req->send.cntr) { + ret = cxip_cntr_mod(req->send.cntr, 1, false, true); + if (ret) + TXC_WARN(txc, "cxip_cntr_mod returned: %d\n", + ret); } } - - dlist_insert_tail(&req->send.txc_entry, &txc->msg_queue); - - return FI_SUCCESS; } -/* - * cxip_send_req_dequeue() - Dequeue Send request from TXC. - * - * Remove the Send requst from the ordered message queue. Update peer - * flow-control state, if necessary. - */ -static int cxip_send_req_dequeue(struct cxip_txc *txc, struct cxip_req *req) +bool cxip_send_eager_idc(struct cxip_req *req) { - int ret; - - if (req->send.fc_peer) { - /* The peer was disabled after this message arrived. */ - TXC_DBG(txc, - "Send not dropped, req: %p NIC: %#x PID: %u pending: %u dropped: %u\n", - req, req->send.fc_peer->caddr.nic, - req->send.fc_peer->caddr.pid, - req->send.fc_peer->pending, req->send.fc_peer->dropped); - - ret = cxip_fc_peer_put(req->send.fc_peer); - if (ret != FI_SUCCESS) - return ret; - - req->send.fc_peer = NULL; - } - - dlist_remove(&req->send.txc_entry); - - return FI_SUCCESS; + return (req->send.len <= CXIP_INJECT_SIZE) && + !cxip_env.disable_non_inject_msg_idc; } -static void cxip_send_buf_fini(struct cxip_req *req) +void cxip_send_buf_fini(struct cxip_req *req) { - if (req->send.send_md) + if (req->send.send_md && !req->send.hybrid_md) cxip_unmap(req->send.send_md); if (req->send.ibuf) cxip_txc_ibuf_free(req->send.txc, req->send.ibuf); } -static int cxip_send_buf_init(struct cxip_req *req) +int cxip_send_buf_init(struct cxip_req *req) { struct cxip_txc *txc = req->send.txc; int ret; @@ -5513,137 +773,6 @@ static int cxip_send_buf_init(struct cxip_req *req) return ret; } -/* - * cxip_send_common() - Common message send function. Used for tagged and - * untagged sends of all sizes. This includes triggered operations. 
- */ -ssize_t cxip_send_common(struct cxip_txc *txc, uint32_t tclass, const void *buf, - size_t len, void *desc, uint64_t data, - fi_addr_t dest_addr, uint64_t tag, void *context, - uint64_t flags, bool tagged, bool triggered, - uint64_t trig_thresh, struct cxip_cntr *trig_cntr, - struct cxip_cntr *comp_cntr) -{ - struct cxip_req *req; - struct cxip_addr caddr; - int ret; - - if (len && !buf) - return -FI_EINVAL; - - if (len > CXIP_EP_MAX_MSG_SZ) - return -FI_EMSGSIZE; - - if (tagged && tag & ~CXIP_TAG_MASK) { - TXC_WARN(txc, "Invalid tag: %#018lx (%#018lx)\n", - tag, CXIP_TAG_MASK); - return -FI_EINVAL; - } - - if (flags & FI_INJECT && len > CXIP_INJECT_SIZE) { - TXC_WARN(txc, "Invalid inject length: %lu\n", len); - return -FI_EMSGSIZE; - } - - ofi_genlock_lock(&txc->ep_obj->lock); - - req = cxip_evtq_req_alloc(&txc->tx_evtq, false, txc); - if (!req) { - TXC_DBG(txc, "Failed to allocate request, return -FI_EAGAIN\n"); - ret = -FI_EAGAIN; - goto unlock; - } - - /* Restrict outstanding success event requests to queue size */ - if (ofi_atomic_inc32(&txc->otx_reqs) > txc->attr.size) { - ret = -FI_EAGAIN; - goto err_req_free; - } - - req->triggered = triggered; - req->trig_thresh = trig_thresh; - req->trig_cntr = trig_cntr; - - /* Save Send parameters to replay */ - req->type = CXIP_REQ_SEND; - req->send.txc = txc; - req->send.tclass = tclass; - - req->send.cntr = triggered ? comp_cntr : txc->send_cntr; - req->send.buf = buf; - req->send.len = len; - req->send.data = data; - req->send.flags = flags; - - /* Set completion parameters */ - req->context = (uint64_t)context; - req->flags = FI_SEND | (flags & (FI_COMPLETION | FI_MATCH_COMPLETE)); - if (tagged) { - req->send.tagged = tagged; - req->send.tag = tag; - req->flags |= FI_TAGGED; - } else { - req->flags |= FI_MSG; - } - - ret = cxip_send_buf_init(req); - if (ret) { - TXC_WARN(txc, "cxip_send_buf_init failed: %d:%s\n", ret, - fi_strerror(-ret)); - goto err_req_free; - } - - /* Look up target CXI address */ - ret = cxip_av_lookup_addr(txc->ep_obj->av, dest_addr, &caddr); - if (ret != FI_SUCCESS) { - TXC_WARN(txc, "Failed to look up FI addr: %d\n", ret); - goto err_req_buf_fini; - } - - req->send.caddr = caddr; - req->send.dest_addr = dest_addr; - - if (cxip_evtq_saturated(&txc->tx_evtq)) { - TXC_DBG(txc, "TX HW EQ saturated\n"); - ret = -FI_EAGAIN; - goto err_req_buf_fini; - } - - /* Check if target peer is disabled */ - ret = cxip_send_req_queue(req->send.txc, req); - if (ret != FI_SUCCESS) { - TXC_DBG(txc, "Target peer disabled\n"); - goto err_req_buf_fini; - } - - /* Try Send */ - ret = _cxip_send_req(req); - if (ret != FI_SUCCESS) - goto err_req_dequeue; - - ofi_genlock_unlock(&txc->ep_obj->lock); - - TXC_DBG(txc, - "req: %p buf: %p len: %lu dest_addr: 0x%lX nic: %d pid: %d tag(%c): 0x%lx context %#lx\n", - req, req->send.buf, req->send.len, dest_addr, caddr.nic, - caddr.pid, req->send.tagged ? 
'*' : '-', req->send.tag, - req->context); - - return FI_SUCCESS; - -err_req_dequeue: - cxip_send_req_dequeue(req->send.txc, req); -err_req_buf_fini: - cxip_send_buf_fini(req); -err_req_free: - ofi_atomic_dec32(&txc->otx_reqs); - cxip_evtq_req_free(req); -unlock: - ofi_genlock_unlock(&txc->ep_obj->lock); - - return ret; -} - /* * Libfabric APIs */ @@ -5652,10 +781,11 @@ static ssize_t cxip_trecv(struct fid_ep *fid_ep, void *buf, size_t len, uint64_t ignore, void *context) { struct cxip_ep *ep = container_of(fid_ep, struct cxip_ep, ep); + struct cxip_rxc *rxc = ep->ep_obj->rxc; + uint64_t flags = ep->rx_attr.op_flags & ~FI_MULTI_RECV; - return cxip_recv_common(&ep->ep_obj->rxc, buf, len, desc, src_addr, - tag, ignore, context, ep->rx_attr.op_flags, - true, NULL); + return rxc->ops.recv_common(rxc, buf, len, desc, src_addr, tag, ignore, + context, flags, true, NULL); } static ssize_t cxip_trecvv(struct fid_ep *fid_ep, const struct iovec *iov, @@ -5663,6 +793,8 @@ static ssize_t cxip_trecvv(struct fid_ep *fid_ep, const struct iovec *iov, uint64_t tag, uint64_t ignore, void *context) { struct cxip_ep *ep = container_of(fid_ep, struct cxip_ep, ep); + uint64_t flags = ep->rx_attr.op_flags & ~FI_MULTI_RECV; + struct cxip_rxc *rxc = ep->ep_obj->rxc; size_t len; void *buf; void *mr_desc; @@ -5676,19 +808,19 @@ static ssize_t cxip_trecvv(struct fid_ep *fid_ep, const struct iovec *iov, buf = iov[0].iov_base; mr_desc = desc ? desc[0] : NULL; } else { - RXC_WARN(&ep->ep_obj->rxc, "Invalid IOV\n"); + RXC_WARN(rxc, "Invalid IOV\n"); return -FI_EINVAL; } - return cxip_recv_common(&ep->ep_obj->rxc, buf, len, mr_desc, src_addr, - tag, ignore, context, ep->rx_attr.op_flags, - true, NULL); + return rxc->ops.recv_common(rxc, buf, len, mr_desc, src_addr, tag, + ignore, context, flags, true, NULL); } static ssize_t cxip_trecvmsg(struct fid_ep *fid_ep, const struct fi_msg_tagged *msg, uint64_t flags) { struct cxip_ep *ep = container_of(fid_ep, struct cxip_ep, ep); + struct cxip_rxc *rxc = ep->ep_obj->rxc; size_t len; void *buf; void *mr_desc; @@ -5698,14 +830,16 @@ static ssize_t cxip_trecvmsg(struct fid_ep *fid_ep, return -FI_EBADFLAGS; if (!msg) { - RXC_WARN(&ep->ep_obj->rxc, "NULL msg not supported\n"); + RXC_WARN(rxc, "NULL msg not supported\n"); return -FI_EINVAL; } + flags &= ~FI_MULTI_RECV; + /* If selective completion is not requested, always generate * completions. */ - if (!ep->ep_obj->rxc.selective_completion) + if (!rxc->selective_completion) flags |= FI_COMPLETION; if (!(flags & FI_PEEK)) { @@ -5718,19 +852,19 @@ static ssize_t cxip_trecvmsg(struct fid_ep *fid_ep, buf = msg->msg_iov[0].iov_base; mr_desc = msg->desc ? 
msg->desc[0] : NULL; } else { - RXC_WARN(&ep->ep_obj->rxc, "Invalid IOV\n"); + RXC_WARN(rxc, "Invalid IOV\n"); return -FI_EINVAL; } - return cxip_recv_common(&ep->ep_obj->rxc, buf, len, mr_desc, - msg->addr, msg->tag, msg->ignore, - msg->context, flags, true, NULL); + return rxc->ops.recv_common(rxc, buf, len, mr_desc, msg->addr, + msg->tag, msg->ignore, msg->context, + flags, true, NULL); } /* FI_PEEK does not post a recv or return message payload */ - return cxip_recv_common(&ep->ep_obj->rxc, NULL, 0UL, NULL, msg->addr, - msg->tag, msg->ignore, msg->context, flags, - true, NULL); + return rxc->ops.recv_common(rxc, NULL, 0UL, NULL, msg->addr, msg->tag, + msg->ignore, msg->context, flags, true, + NULL); } static ssize_t cxip_tsend(struct fid_ep *fid_ep, const void *buf, size_t len, @@ -5738,11 +872,12 @@ static ssize_t cxip_tsend(struct fid_ep *fid_ep, const void *buf, size_t len, void *context) { struct cxip_ep *ep = container_of(fid_ep, struct cxip_ep, ep); + struct cxip_txc *txc = ep->ep_obj->txc; - return cxip_send_common(&ep->ep_obj->txc, ep->tx_attr.tclass, buf, len, - desc, 0, dest_addr, tag, context, - ep->tx_attr.op_flags, true, false, 0, - NULL, NULL); + return txc->ops.send_common(txc, ep->tx_attr.tclass, buf, len, desc, 0, + dest_addr, tag, context, + ep->tx_attr.op_flags, true, false, 0, + NULL, NULL); } static ssize_t cxip_tsendv(struct fid_ep *fid_ep, const struct iovec *iov, @@ -5750,6 +885,7 @@ static ssize_t cxip_tsendv(struct fid_ep *fid_ep, const struct iovec *iov, uint64_t tag, void *context) { struct cxip_ep *ep = container_of(fid_ep, struct cxip_ep, ep); + struct cxip_txc *txc = ep->ep_obj->txc; size_t len; const void *buf; void *mr_desc; @@ -5763,27 +899,27 @@ static ssize_t cxip_tsendv(struct fid_ep *fid_ep, const struct iovec *iov, buf = iov[0].iov_base; mr_desc = desc ? desc[0] : NULL; } else { - TXC_WARN(&ep->ep_obj->txc, "Invalid IOV\n"); + TXC_WARN(txc, "Invalid IOV\n"); return -FI_EINVAL; } - return cxip_send_common(&ep->ep_obj->txc, ep->tx_attr.tclass, buf, len, - mr_desc, 0, dest_addr, tag, context, - ep->tx_attr.op_flags, true, false, 0, NULL, - NULL); + return txc->ops.send_common(txc, ep->tx_attr.tclass, buf, len, mr_desc, + 0, dest_addr, tag, context, + ep->tx_attr.op_flags, true, false, 0, + NULL, NULL); } static ssize_t cxip_tsendmsg(struct fid_ep *fid_ep, const struct fi_msg_tagged *msg, uint64_t flags) { struct cxip_ep *ep = container_of(fid_ep, struct cxip_ep, ep); - struct cxip_txc *txc = &ep->ep_obj->txc; + struct cxip_txc *txc = ep->ep_obj->txc; size_t len; const void *buf; void *mr_desc; if (!msg) { - TXC_WARN(&ep->ep_obj->txc, "NULL msg not supported\n"); + TXC_WARN(txc, "NULL msg not supported\n"); return -FI_EINVAL; } @@ -5796,7 +932,7 @@ static ssize_t cxip_tsendmsg(struct fid_ep *fid_ep, buf = msg->msg_iov[0].iov_base; mr_desc = msg->desc ? 
msg->desc[0] : NULL; } else { - TXC_WARN(&ep->ep_obj->txc, "Invalid IOV\n"); + TXC_WARN(txc, "Invalid IOV\n"); return -FI_EINVAL; } @@ -5812,19 +948,21 @@ static ssize_t cxip_tsendmsg(struct fid_ep *fid_ep, if (!txc->selective_completion) flags |= FI_COMPLETION; - return cxip_send_common(txc, ep->tx_attr.tclass, buf, len, mr_desc, - msg->data, msg->addr, msg->tag, msg->context, - flags, true, false, 0, NULL, NULL); + return txc->ops.send_common(txc, ep->tx_attr.tclass, buf, len, mr_desc, + msg->data, msg->addr, msg->tag, + msg->context, flags, true, false, 0, + NULL, NULL); } static ssize_t cxip_tinject(struct fid_ep *fid_ep, const void *buf, size_t len, fi_addr_t dest_addr, uint64_t tag) { struct cxip_ep *ep = container_of(fid_ep, struct cxip_ep, ep); + struct cxip_txc *txc = ep->ep_obj->txc; - return cxip_send_common(&ep->ep_obj->txc, ep->tx_attr.tclass, buf, len, - NULL, 0, dest_addr, tag, NULL, FI_INJECT, - true, false, 0, NULL, NULL); + return txc->ops.send_common(txc, ep->tx_attr.tclass, buf, len, NULL, 0, + dest_addr, tag, NULL, FI_INJECT, true, + false, 0, NULL, NULL); } static ssize_t cxip_tsenddata(struct fid_ep *fid_ep, const void *buf, @@ -5832,11 +970,12 @@ static ssize_t cxip_tsenddata(struct fid_ep *fid_ep, const void *buf, fi_addr_t dest_addr, uint64_t tag, void *context) { struct cxip_ep *ep = container_of(fid_ep, struct cxip_ep, ep); + struct cxip_txc *txc = ep->ep_obj->txc; - return cxip_send_common(&ep->ep_obj->txc, ep->tx_attr.tclass, buf, len, - desc, data, dest_addr, tag, context, - ep->tx_attr.op_flags | FI_REMOTE_CQ_DATA, - true, false, 0, NULL, NULL); + return txc->ops.send_common(txc, ep->tx_attr.tclass, buf, len, desc, + data, dest_addr, tag, context, + ep->tx_attr.op_flags | FI_REMOTE_CQ_DATA, + true, false, 0, NULL, NULL); } static ssize_t cxip_tinjectdata(struct fid_ep *fid_ep, const void *buf, @@ -5844,11 +983,12 @@ static ssize_t cxip_tinjectdata(struct fid_ep *fid_ep, const void *buf, uint64_t tag) { struct cxip_ep *ep = container_of(fid_ep, struct cxip_ep, ep); + struct cxip_txc *txc = ep->ep_obj->txc; - return cxip_send_common(&ep->ep_obj->txc, ep->tx_attr.tclass, buf, - len, NULL, data, dest_addr, tag, NULL, - FI_INJECT | FI_REMOTE_CQ_DATA, - true, false, 0, NULL, NULL); + return txc->ops.send_common(txc, ep->tx_attr.tclass, buf, len, NULL, + data, dest_addr, tag, NULL, + FI_INJECT | FI_REMOTE_CQ_DATA, true, false, + 0, NULL, NULL); } struct fi_ops_tagged cxip_ep_tagged_no_ops = { @@ -5907,9 +1047,10 @@ static ssize_t cxip_recv(struct fid_ep *fid_ep, void *buf, size_t len, void *desc, fi_addr_t src_addr, void *context) { struct cxip_ep *ep = container_of(fid_ep, struct cxip_ep, ep); + struct cxip_rxc *rxc = ep->ep_obj->rxc; - return cxip_recv_common(&ep->ep_obj->rxc, buf, len, desc, src_addr, 0, - 0, context, ep->rx_attr.op_flags, false, NULL); + return rxc->ops.recv_common(rxc, buf, len, desc, src_addr, 0, 0, + context, ep->rx_attr.op_flags, false, NULL); } static ssize_t cxip_recvv(struct fid_ep *fid_ep, const struct iovec *iov, @@ -5917,6 +1058,7 @@ static ssize_t cxip_recvv(struct fid_ep *fid_ep, const struct iovec *iov, void *context) { struct cxip_ep *ep = container_of(fid_ep, struct cxip_ep, ep); + struct cxip_rxc *rxc = ep->ep_obj->rxc; size_t len; void *buf; void *mr_desc; @@ -5930,20 +1072,19 @@ static ssize_t cxip_recvv(struct fid_ep *fid_ep, const struct iovec *iov, buf = iov[0].iov_base; mr_desc = desc ? 
desc[0] : NULL; } else { - RXC_WARN(&ep->ep_obj->rxc, "Invalid IOV\n"); + RXC_WARN(rxc, "Invalid IOV\n"); return -FI_EINVAL; } - return cxip_recv_common(&ep->ep_obj->rxc, buf, len, mr_desc, src_addr, - 0, 0, context, ep->rx_attr.op_flags, false, - NULL); + return rxc->ops.recv_common(rxc, buf, len, mr_desc, src_addr, 0, 0, + context, ep->rx_attr.op_flags, false, NULL); } static ssize_t cxip_recvmsg(struct fid_ep *fid_ep, const struct fi_msg *msg, uint64_t flags) { struct cxip_ep *ep = container_of(fid_ep, struct cxip_ep, ep); - struct cxip_rxc *rxc = &ep->ep_obj->rxc; + struct cxip_rxc *rxc = ep->ep_obj->rxc; size_t len; void *buf; void *mr_desc; @@ -5965,7 +1106,7 @@ static ssize_t cxip_recvmsg(struct fid_ep *fid_ep, const struct fi_msg *msg, buf = msg->msg_iov[0].iov_base; mr_desc = msg->desc ? msg->desc[0] : NULL; } else { - RXC_WARN(&ep->ep_obj->rxc, "Invalid IOV\n"); + RXC_WARN(rxc, "Invalid IOV\n"); return -FI_EINVAL; } @@ -5975,19 +1116,19 @@ static ssize_t cxip_recvmsg(struct fid_ep *fid_ep, const struct fi_msg *msg, if (!rxc->selective_completion) flags |= FI_COMPLETION; - return cxip_recv_common(rxc, buf, len, mr_desc, msg->addr, 0, 0, - msg->context, flags, false, NULL); + return rxc->ops.recv_common(rxc, buf, len, mr_desc, msg->addr, 0, 0, + msg->context, flags, false, NULL); } static ssize_t cxip_send(struct fid_ep *fid_ep, const void *buf, size_t len, void *desc, fi_addr_t dest_addr, void *context) { struct cxip_ep *ep = container_of(fid_ep, struct cxip_ep, ep); + struct cxip_txc *txc = ep->ep_obj->txc; - return cxip_send_common(&ep->ep_obj->txc, ep->tx_attr.tclass, - buf, len, desc, 0, dest_addr, 0, context, - ep->tx_attr.op_flags, false, false, 0, - NULL, NULL); + return txc->ops.send_common(txc, ep->tx_attr.tclass, buf, len, desc, 0, + dest_addr, 0, context, ep->tx_attr.op_flags, + false, false, 0, NULL, NULL); } static ssize_t cxip_sendv(struct fid_ep *fid_ep, const struct iovec *iov, @@ -5995,6 +1136,7 @@ static ssize_t cxip_sendv(struct fid_ep *fid_ep, const struct iovec *iov, void *context) { struct cxip_ep *ep = container_of(fid_ep, struct cxip_ep, ep); + struct cxip_txc *txc = ep->ep_obj->txc; size_t len; const void *buf; void *mr_desc; @@ -6008,27 +1150,27 @@ static ssize_t cxip_sendv(struct fid_ep *fid_ep, const struct iovec *iov, buf = iov[0].iov_base; mr_desc = desc ? desc[0] : NULL; } else { - TXC_WARN(&ep->ep_obj->txc, "Invalid IOV\n"); + TXC_WARN(txc, "Invalid IOV\n"); return -FI_EINVAL; } - return cxip_send_common(&ep->ep_obj->txc, ep->tx_attr.tclass, - buf, len, mr_desc, 0, dest_addr, 0, context, - ep->tx_attr.op_flags, false, false, 0, - NULL, NULL); + return txc->ops.send_common(txc, ep->tx_attr.tclass, buf, len, mr_desc, + 0, dest_addr, 0, context, + ep->tx_attr.op_flags, false, false, 0, + NULL, NULL); } static ssize_t cxip_sendmsg(struct fid_ep *fid_ep, const struct fi_msg *msg, uint64_t flags) { struct cxip_ep *ep = container_of(fid_ep, struct cxip_ep, ep); - struct cxip_txc *txc = &ep->ep_obj->txc; + struct cxip_txc *txc = ep->ep_obj->txc; size_t len; const void *buf; void *mr_desc; if (!msg) { - TXC_WARN(&ep->ep_obj->txc, "NULL msg not supported\n"); + TXC_WARN(txc, "NULL msg not supported\n"); return -FI_EINVAL; } @@ -6041,7 +1183,7 @@ static ssize_t cxip_sendmsg(struct fid_ep *fid_ep, const struct fi_msg *msg, buf = msg->msg_iov[0].iov_base; mr_desc = msg->desc ? 
msg->desc[0] : NULL; } else { - TXC_WARN(&ep->ep_obj->txc, "Invalid IOV\n"); + TXC_WARN(txc, "Invalid IOV\n"); return -FI_EINVAL; } @@ -6057,19 +1199,20 @@ static ssize_t cxip_sendmsg(struct fid_ep *fid_ep, const struct fi_msg *msg, if (!txc->selective_completion) flags |= FI_COMPLETION; - return cxip_send_common(txc, ep->tx_attr.tclass, buf, len, mr_desc, - msg->data, msg->addr, 0, msg->context, flags, - false, false, 0, NULL, NULL); + return txc->ops.send_common(txc, ep->tx_attr.tclass, buf, len, mr_desc, + msg->data, msg->addr, 0, msg->context, + flags, false, false, 0, NULL, NULL); } static ssize_t cxip_inject(struct fid_ep *fid_ep, const void *buf, size_t len, fi_addr_t dest_addr) { struct cxip_ep *ep = container_of(fid_ep, struct cxip_ep, ep); + struct cxip_txc *txc = ep->ep_obj->txc; - return cxip_send_common(&ep->ep_obj->txc, ep->tx_attr.tclass, buf, - len, NULL, 0, dest_addr, 0, NULL, FI_INJECT, - false, false, 0, NULL, NULL); + return txc->ops.send_common(txc, ep->tx_attr.tclass, buf, len, NULL, 0, + dest_addr, 0, NULL, FI_INJECT, false, false, + 0, NULL, NULL); } static ssize_t cxip_senddata(struct fid_ep *fid_ep, const void *buf, size_t len, @@ -6077,22 +1220,24 @@ static ssize_t cxip_senddata(struct fid_ep *fid_ep, const void *buf, size_t len, void *context) { struct cxip_ep *ep = container_of(fid_ep, struct cxip_ep, ep); + struct cxip_txc *txc = ep->ep_obj->txc; - return cxip_send_common(&ep->ep_obj->txc, ep->tx_attr.tclass, buf, - len, desc, data, dest_addr, 0, context, - ep->tx_attr.op_flags | FI_REMOTE_CQ_DATA, - false, false, 0, NULL, NULL); + return txc->ops.send_common(txc, ep->tx_attr.tclass, buf, len, desc, + data, dest_addr, 0, context, + ep->tx_attr.op_flags | FI_REMOTE_CQ_DATA, + false, false, 0, NULL, NULL); } static ssize_t cxip_injectdata(struct fid_ep *fid_ep, const void *buf, size_t len, uint64_t data, fi_addr_t dest_addr) { struct cxip_ep *ep = container_of(fid_ep, struct cxip_ep, ep); + struct cxip_txc *txc = ep->ep_obj->txc; - return cxip_send_common(&ep->ep_obj->txc, ep->tx_attr.tclass, buf, - len, NULL, data, dest_addr, 0, NULL, - FI_INJECT | FI_REMOTE_CQ_DATA, - false, false, 0, NULL, NULL); + return txc->ops.send_common(txc, ep->tx_attr.tclass, buf, len, NULL, + data, dest_addr, 0, NULL, + FI_INJECT | FI_REMOTE_CQ_DATA, false, false, + 0, NULL, NULL); } struct fi_ops_msg cxip_ep_msg_no_ops = { diff --git a/prov/cxi/src/cxip_msg_hpc.c b/prov/cxi/src/cxip_msg_hpc.c new file mode 100644 index 00000000000..5d68d40c51a --- /dev/null +++ b/prov/cxi/src/cxip_msg_hpc.c @@ -0,0 +1,5275 @@ +/* + * SPDX-License-Identifier: BSD-2 Clause or GPL-2.0-only + * + * Copyright (c) 2018-2024 Hewlett Packard Enterprise Development LP + */ + +#include "config.h" + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "cxip.h" + +#define CXIP_WARN(...) _CXIP_WARN(FI_LOG_EP_CTRL, __VA_ARGS__) +#define CXIP_DBG(...) _CXIP_DBG(FI_LOG_EP_CTRL, __VA_ARGS__) +#define CXIP_INFO(...) _CXIP_INFO(FI_LOG_EP_CTRL, __VA_ARGS__) + +#define CXIP_SC_STATS "FC/SC stats - EQ full: %d append fail: %d no match: %d"\ + " request full: %d unexpected: %d, NIC HW2SW unexp: %d"\ + " NIC HW2SW append fail: %d\n" +#define FC_SW_ONLOAD_MSG_FATAL "LE resources not recovered during "\ + "flow control. 
FI_CXI_RX_MATCH_MODE=[hybrid|software] is required\n" +#define FC_OFLOW_NO_MATCH_MSG "Flow control overflow no match, increasing "\ + "FI_CXI_OFLOW_BUF_SIZE (current is %ldB) may reduce occurrence\n" +#define FC_REQ_FULL_MSG "Flow control request list full, increasing"\ + " FI_CXI_REQ_BUF_SIZE value (current is %ldB) may reduce occurrence\n" +#define FC_DROP_COUNT_MSG "Re-enable Drop count mismatch, re-enable will "\ + "be retried on notify\n" + +#define WARN_RESTRICTED_DISABLED "Insufficient resources for %s "\ + "protocol, switching to %s protocol\n" + +static int cxip_recv_cb(struct cxip_req *req, const union c_event *event); +static void cxip_ux_onload_complete(struct cxip_req *req); +static int cxip_ux_onload(struct cxip_rxc_hpc *rxc); +static int cxip_recv_req_queue(struct cxip_req *req, bool restart_seq); +static int cxip_send_req_dropped(struct cxip_txc_hpc *txc, + struct cxip_req *req); +static ssize_t _cxip_recv_req(struct cxip_req *req, bool restart_seq); +static void cxip_rxc_hpc_recv_req_tgt_event(struct cxip_req *, + const union c_event *event); +static int cxip_send_req_dequeue(struct cxip_txc_hpc *txc, + struct cxip_req *req); + +static void cxip_fc_progress_ctrl(struct cxip_rxc_hpc *rxc); + +/* + * match_put_event() - Find/add a matching event. + * + * For every Put Overflow event there is a matching Put event. These events can + * be generated in any order. Both events must be received before progress can + * be made. + * + * If the matching event exists in the mapping, matched is set to true and + * the deferred event is returned. If a match was not found, matched is set to + * false and the event is added to the deferred event mapping. + * + * The deferred match event is returned; unless it must be added to the + * deferred mapping and memory is insufficient. + * + * Caller must hold ep_obj->lock. + */ +static struct cxip_deferred_event * +match_put_event(struct cxip_rxc_hpc *rxc, struct cxip_req *req, + const union c_event *event, bool *matched) +{ + union cxip_def_event_key key = {}; + struct cxip_deferred_event *def_ev; + union cxip_match_bits mb; + int bucket; + enum c_event_type match_type = + event->tgt_long.event_type == C_EVENT_PUT ? + C_EVENT_PUT_OVERFLOW : C_EVENT_PUT; + + if (event->tgt_long.rendezvous) { + key.initiator = event->tgt_long.initiator.initiator.process; + mb.raw = event->tgt_long.match_bits; + key.rdzv_id = (mb.rdzv_id_hi << CXIP_RDZV_ID_CMD_WIDTH) | + event->tgt_long.rendezvous_id; + key.rdzv = 1; + } else { + key.start_addr = event->tgt_long.start; + } + + bucket = fasthash64(&key.raw, sizeof(key.raw), 0) % + CXIP_DEF_EVENT_HT_BUCKETS; + dlist_foreach_container(&rxc->deferred_events.bh[bucket], + struct cxip_deferred_event, def_ev, + rxc_entry) { + if (def_ev->key.raw == key.raw && + def_ev->ev.tgt_long.event_type == match_type && + def_ev->ev.tgt_long.return_code == + event->tgt_long.return_code && + def_ev->ev.tgt_long.initiator.initiator.process == + event->tgt_long.initiator.initiator.process && + def_ev->ev.tgt_long.match_bits == + event->tgt_long.match_bits) { + *matched = true; + return def_ev; + } + } + + /* Not found, add mapping to hash bucket */ + *matched = false; + + def_ev = calloc(1, sizeof(*def_ev)); + if (!def_ev) { + RXC_WARN(rxc, "Failed allocate to memory\n"); + return NULL; + } + + def_ev->key.raw = key.raw; + def_ev->req = req; + def_ev->ev = *event; + + dlist_insert_tail(&def_ev->rxc_entry, &rxc->deferred_events.bh[bucket]); + + return def_ev; +} + +/* + * free_put_event() - Free a deferred put event. 
+ * + * Free an event previously allocated added with match_put_event(). + * + * Caller must hold ep_obj->lock. + */ +static void free_put_event(struct cxip_rxc_hpc *rxc, + struct cxip_deferred_event *def_ev) +{ + dlist_remove(&def_ev->rxc_entry); + free(def_ev); +} + +/* + * rdzv_mrecv_req_lookup() - Search for a matching rendezvous, multi-receive + * child request. + */ +static int rdzv_mrecv_req_lookup(struct cxip_req *req, + const union c_event *event, + uint32_t *initiator, uint32_t *rdzv_id, + bool perform_event_checks, + struct cxip_req **req_out) +{ + struct cxip_rxc *rxc = req->recv.rxc; + struct cxip_req *child_req; + union cxip_match_bits mb; + uint32_t ev_init; + uint32_t ev_rdzv_id; + struct cxip_addr caddr; + int ret; + int i; + + if (event->hdr.event_type == C_EVENT_REPLY) { + struct cxi_rdzv_user_ptr *user_ptr; + + /* Events for software-issued operations will return a + * reference to the correct request. + */ + if (!event->init_short.rendezvous) { + *req_out = req; + return FI_SUCCESS; + } + + user_ptr = (struct cxi_rdzv_user_ptr *) + &event->init_short.user_ptr; + + ev_init = CXI_MATCH_ID(rxc->pid_bits, user_ptr->src_pid, + user_ptr->src_nid); + ev_rdzv_id = user_ptr->rendezvous_id; + } else if (event->hdr.event_type == C_EVENT_RENDEZVOUS) { + struct cxip_rxc *rxc = req->recv.rxc; + uint32_t dfa = event->tgt_long.initiator.initiator.process; + + ev_init = cxi_dfa_to_init(dfa, rxc->pid_bits); + mb.raw = event->tgt_long.match_bits; + + ev_rdzv_id = (mb.rdzv_id_hi << CXIP_RDZV_ID_CMD_WIDTH) | + event->tgt_long.rendezvous_id; + } else { + ev_init = event->tgt_long.initiator.initiator.process; + mb.raw = event->tgt_long.match_bits; + + ev_rdzv_id = (mb.rdzv_id_hi << CXIP_RDZV_ID_CMD_WIDTH) | + event->tgt_long.rendezvous_id; + } + + if ((event->hdr.event_type == C_EVENT_PUT_OVERFLOW || + event->hdr.event_type == C_EVENT_PUT) && + rxc->ep_obj->av->symmetric) { + ret = cxip_av_lookup_addr(rxc->ep_obj->av, + CXI_MATCH_ID_EP(rxc->pid_bits, + ev_init), + &caddr); + if (ret != FI_SUCCESS) + RXC_FATAL(rxc, "Lookup of FI addr 0x%x: failed %d\n", + ev_init, ret); + + ev_init = CXI_MATCH_ID(rxc->pid_bits, + CXI_MATCH_ID_PID(rxc->pid_bits, ev_init), + caddr.nic); + } + + *initiator = ev_init; + *rdzv_id = ev_rdzv_id; + + /* Events for hardware-issued operations will return a rendezvous_id + * and initiator data. Use these fields to find a matching child + * request. + */ + dlist_foreach_container(&req->recv.children, + struct cxip_req, child_req, + recv.children) { + if (child_req->recv.rdzv_id == ev_rdzv_id && + child_req->recv.rdzv_initiator == ev_init) { + + if (perform_event_checks) { + /* There is an edge case where source may reuse the + * same rendezvous ID before the target has had time to + * process the C_EVENT_REPLY. If this is the case, an + * incorrect child_req match would occur. To prevent + * this, the events seen are stored with the child_req. + * If a redundant event is seen, this is a sign + * C_EVENT_REPLY needs to be process. Thus, return + * -FI_EAGAIN to process TX EQ. + */ + for (i = 0; i < child_req->recv.rdzv_events; i++) { + if (child_req->recv.rdzv_event_types[i] == event->hdr.event_type) { + assert(event->hdr.event_type != C_EVENT_REPLY); + return -FI_EAGAIN; + } + } + } + + *req_out = child_req; + return FI_SUCCESS; + } + } + + return -FI_ENOMSG; +} + +/* + * rdzv_mrecv_req_event() - Look up a multi-recieve child request using an + * event and multi-recv request. 
+ * + * Each rendezvous Put transaction targeting a multi-receive buffer is tracked + * using a separate child request. A child request is uniquely identified by + * rendezvous ID and source address. Return a reference to a child request + * which matches the event. Allocate a new child request, if necessary. + */ +static struct cxip_req * +rdzv_mrecv_req_event(struct cxip_req *mrecv_req, const union c_event *event) +{ + uint32_t ev_init; + uint32_t ev_rdzv_id; + struct cxip_req *req; + struct cxip_rxc *rxc __attribute__((unused)) = mrecv_req->recv.rxc; + int ret; + + assert(event->hdr.event_type == C_EVENT_REPLY || + event->hdr.event_type == C_EVENT_PUT || + event->hdr.event_type == C_EVENT_PUT_OVERFLOW || + event->hdr.event_type == C_EVENT_RENDEZVOUS); + + ret = rdzv_mrecv_req_lookup(mrecv_req, event, &ev_init, &ev_rdzv_id, + true, &req); + switch (ret) { + case -FI_EAGAIN: + return NULL; + + case -FI_ENOMSG: + req = cxip_mrecv_req_dup(mrecv_req); + if (!req) + return NULL; + + /* Store event initiator and rdzv_id for matching. */ + req->recv.rdzv_id = ev_rdzv_id; + req->recv.rdzv_initiator = ev_init; + + dlist_insert_tail(&req->recv.children, + &mrecv_req->recv.children); + + RXC_DBG(rxc, "New child: %p parent: %p event: %s\n", req, + mrecv_req, cxi_event_to_str(event)); + return req; + + case FI_SUCCESS: + RXC_DBG(rxc, "Found child: %p parent: %p event: %s\n", req, + mrecv_req, cxi_event_to_str(event)); + return req; + + default: + RXC_FATAL(rxc, "Unhandled rdzv_mrecv_req_lookup %d\n", ret); + } +} + +/* + * rdzv_recv_req_event() - Count a rendezvous event. + * + * Call for each target rendezvous event generated on a user receive buffer. + * After three events, a rendezvous receive is complete. The three events could + * be either: + * -Put, Rendezvous, Reply -- or + * -Put Overflow, Rendezvous, Reply + * + * For a restricted Get there is a fourth event, the ACK of the notify. + * + * In either case, the events could be generated in any order. As soon as the + * events expected are processed, the request is complete. + */ +static void rdzv_recv_req_event(struct cxip_req *req, enum c_event_type type) +{ + int total_events = req->recv.done_notify ? 4 : 3; + + req->recv.rdzv_event_types[req->recv.rdzv_events] = type; + + if (++req->recv.rdzv_events == total_events) { + if (req->recv.multi_recv) { + dlist_remove(&req->recv.children); + cxip_recv_req_report(req); + cxip_evtq_req_free(req); + } else { + cxip_recv_req_report(req); + cxip_recv_req_free(req); + } + } +} + +/* + * oflow_req_put_bytes() - Consume bytes in the Overflow buffer. + * + * An Overflow buffer is freed when all bytes are consumed by the NIC. + * + * Caller must hold ep_obj->lock. + */ +static void oflow_req_put_bytes(struct cxip_req *req, size_t bytes) +{ + struct cxip_ptelist_buf *oflow_buf = req->req_ctx; + + /* Non-zero length UX messages with 0 eager portion do not + * have a dependency on the oflow buffer. + */ + if (bytes == 0) + return; + + oflow_buf->cur_offset += bytes; + + RXC_DBG(oflow_buf->rxc, "Putting %lu bytes (%lu/%lu): %p\n", bytes, + oflow_buf->cur_offset, oflow_buf->unlink_length, req); + + if (oflow_buf->cur_offset == oflow_buf->unlink_length) + cxip_ptelist_buf_consumed(oflow_buf); +} + +/* + * issue_rdzv_get() - Perform a Get to pull source data from the Initiator of a + * Send operation. 
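+ *
+ * In outline, the Get is built and issued as sketched below (simplified;
+ * the alternate rendezvous protocols and the cache-line alignment
+ * adjustment performed in the body are omitted):
+ *
+ *   cmd.command.opcode = C_CMD_GET;
+ *   cmd.local_addr     = CXI_VA_TO_IOVA(recv_md->md, recv_buf) + rdzv_mlen;
+ *   cmd.remote_offset  = src_offset;
+ *   cmd.request_len    = data_len - rdzv_mlen;
+ *   cxip_rxc_emit_dma(rxc, vni, cxip_ofi_to_cxi_tc(cxip_env.rget_tc),
+ *                     CXI_TC_TYPE_DEFAULT, &cmd, 0);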
+ */ +static int issue_rdzv_get(struct cxip_req *req) +{ + struct c_full_dma_cmd cmd = {}; + uint64_t local_addr; + uint64_t rem_offset; + uint32_t align_bytes; + uint32_t mlen; + struct cxip_rxc_hpc *rxc = req->recv.rxc_hpc; + uint32_t pid_idx = rxc->base.domain->iface->dev->info.rdzv_get_idx; + uint8_t idx_ext; + union cxip_match_bits mb = {}; + int ret; + union c_fab_addr dfa; + + if (req->recv.rdzv_proto == CXIP_RDZV_PROTO_ALT_WRITE) + RXC_WARN_ONCE(rxc, "Rendezvous protocol: %s not implemented\n", + cxip_rdzv_proto_to_str(req->recv.rdzv_proto)); + + cmd.command.cmd_type = C_CMD_TYPE_DMA; + cmd.command.opcode = C_CMD_GET; + cmd.lac = req->recv.recv_md->md->lac; + cmd.event_send_disable = 1; + + /* Must deliver to TX event queue */ + cmd.eq = cxip_evtq_eqn(&rxc->base.ep_obj->txc->tx_evtq); + + if (req->recv.rdzv_proto == CXIP_RDZV_PROTO_ALT_READ) { + pid_idx = CXIP_PTL_IDX_RDZV_RESTRICTED(req->recv.rdzv_lac); + cmd.restricted = 1; + req->recv.done_notify = true; + } else { + pid_idx = rxc->base.domain->iface->dev->info.rdzv_get_idx; + mb.rdzv_lac = req->recv.rdzv_lac; + mb.rdzv_id_lo = req->recv.rdzv_id; + mb.rdzv_id_hi = req->recv.rdzv_id >> CXIP_RDZV_ID_CMD_WIDTH; + } + cmd.match_bits = mb.raw; + + cmd.user_ptr = (uint64_t)req; + cxi_build_dfa(req->recv.rget_nic, req->recv.rget_pid, + rxc->base.pid_bits, pid_idx, &dfa, &idx_ext); + cmd.dfa = dfa; + cmd.index_ext = idx_ext; + + local_addr = CXI_VA_TO_IOVA(req->recv.recv_md->md, + req->recv.recv_buf); + local_addr += req->recv.rdzv_mlen; + + rem_offset = req->recv.src_offset; + mlen = req->recv.rdzv_mlen; + + RXC_DBG(rxc, "SW RGet addr: 0x%" PRIx64 " len %" PRId64 + " rem_off: %" PRId64 " restricted: %d\n", local_addr, + req->data_len - req->recv.rdzv_mlen, rem_offset, + cmd.restricted); + + /* Align mask will be non-zero if local DMA address cache-line + * alignment is desired. + */ + if (mlen >= rxc->rget_align_mask) { + align_bytes = local_addr & rxc->rget_align_mask; + local_addr -= align_bytes; + rem_offset -= align_bytes; + mlen -= align_bytes; + } + + if (req->data_len < mlen) + cmd.request_len = 0; + else + cmd.request_len = req->data_len - mlen; + + cmd.local_addr = local_addr; + cmd.remote_offset = rem_offset; + + RXC_DBG(rxc, "Aligned addr: 0x%" PRIx64 " len %d rem_off %" PRId64 "\n", + (uint64_t)cmd.local_addr, cmd.request_len, + (uint64_t)cmd.remote_offset); + + ret = cxip_rxc_emit_dma(rxc, req->recv.vni, + cxip_ofi_to_cxi_tc(cxip_env.rget_tc), + CXI_TC_TYPE_DEFAULT, &cmd, 0); + if (ret) + RXC_WARN(rxc, "Failed to issue rendezvous get: %d\n", ret); + + return ret; +} + +/* + * cxip_notify_match_cb() - Callback function for match complete notifiction + * Ack events. + */ +static int +cxip_notify_match_cb(struct cxip_req *req, const union c_event *event) +{ + RXC_DBG(req->recv.rxc, "Match complete: %p\n", req); + + cxip_recv_req_report(req); + + if (req->recv.multi_recv) + cxip_evtq_req_free(req); + else + cxip_recv_req_free(req); + + return FI_SUCCESS; +} + +/* + * cxip_notify_match() - Notify the initiator of a Send that the match is + * complete at the target. + * + * A transaction ID corresponding to the matched Send request is sent back to + * the initiator in the match_bits field of a zero-byte Put. 
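+ *
+ * The notification reduces to a zero-byte IDC Put carrying the TX ID back
+ * to the initiator (sketch only; DFA construction and C-state setup are
+ * shown in the body):
+ *
+ *   mb.le_type = CXIP_LE_TYPE_ZBP;
+ *   mb.tx_id   = event_mb.tx_id;
+ *   idc_msg.match_bits = mb.raw;
+ *   req->cb = cxip_notify_match_cb;
+ *   cxip_rxc_emit_idc_msg(rxc, event->tgt_long.vni,
+ *                         cxip_ofi_to_cxi_tc(cxip_env.rget_tc),
+ *                         CXI_TC_TYPE_DEFAULT, &c_state, &idc_msg,
+ *                         NULL, 0, 0);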
+ */ +static int cxip_notify_match(struct cxip_req *req, const union c_event *event) +{ + struct cxip_rxc_hpc *rxc = req->recv.rxc_hpc; + uint32_t pid_idx = rxc->base.domain->iface->dev->info.rdzv_get_idx; + uint32_t init = event->tgt_long.initiator.initiator.process; + uint32_t nic = CXI_MATCH_ID_EP(rxc->base.pid_bits, init); + uint32_t pid = CXI_MATCH_ID_PID(rxc->base.pid_bits, init); + union c_fab_addr dfa; + uint8_t idx_ext; + union cxip_match_bits mb = { + .le_type = CXIP_LE_TYPE_ZBP, + }; + union cxip_match_bits event_mb; + int ret; + struct c_cstate_cmd c_state = {}; + struct c_idc_msg_hdr idc_msg = {}; + + event_mb.raw = event->tgt_long.match_bits; + mb.tx_id = event_mb.tx_id; + + cxi_build_dfa(nic, pid, rxc->base.pid_bits, pid_idx, &dfa, &idx_ext); + + c_state.event_send_disable = 1; + c_state.index_ext = idx_ext; + c_state.eq = cxip_evtq_eqn(&rxc->base.ep_obj->txc->tx_evtq); + + idc_msg.dfa = dfa; + idc_msg.match_bits = mb.raw; + idc_msg.user_ptr = (uint64_t)req; + + req->cb = cxip_notify_match_cb; + + ret = cxip_rxc_emit_idc_msg(rxc, event->tgt_long.vni, + cxip_ofi_to_cxi_tc(cxip_env.rget_tc), + CXI_TC_TYPE_DEFAULT, &c_state, &idc_msg, + NULL, 0, 0); + + RXC_DBG(rxc, "Queued match completion message: %p\n", req); + + return ret; +} + +/* + * mrecv_req_oflow_event() - Set start and length uniquely for an unexpected + * mrecv request. + * + * Overflow buffer events contain a start address representing the offset into + * the Overflow buffer where data was written. When a unexpected header is + * later matched to a multi-receive buffer in the priority list, The Put + * Overflow event does not contain the offset into the Priority list buffer + * where data should be copied. Software must track the the Priority list + * buffer offset using ordered Put Overflow events. + */ +static int mrecv_req_put_bytes(struct cxip_req *req, uint32_t rlen) +{ + uintptr_t mrecv_head; + uintptr_t mrecv_tail; + size_t mrecv_bytes_remaining; + + mrecv_head = (uintptr_t)req->recv.recv_buf + req->recv.start_offset; + mrecv_tail = (uintptr_t)req->recv.recv_buf + req->recv.ulen; + mrecv_bytes_remaining = mrecv_tail - mrecv_head; + + rlen = MIN(mrecv_bytes_remaining, rlen); + req->recv.start_offset += rlen; + + return rlen; +} + +/* cxip_recv_req_set_rget_info() - Set RGet NIC and PID fields. Used for + * messages where a rendezvous event will not be generated. Current usages are + * for the eager long protocol and rendezvous operations which have unexpected + * headers onloaded due to flow control. + */ +static void cxip_recv_req_set_rget_info(struct cxip_req *req) +{ + struct cxip_rxc *rxc = req->recv.rxc; + int ret; + + if (rxc->ep_obj->av->symmetric) { + struct cxip_addr caddr; + + RXC_DBG(rxc, "Translating initiator: %x, req: %p\n", + req->recv.initiator, req); + + ret = cxip_av_lookup_addr(rxc->ep_obj->av, + CXI_MATCH_ID_EP(rxc->pid_bits, + req->recv.initiator), + &caddr); + if (ret != FI_SUCCESS) + RXC_FATAL(rxc, "Failed to look up FI addr: %d\n", ret); + + req->recv.rget_nic = caddr.nic; + } else { + req->recv.rget_nic = CXI_MATCH_ID_EP(rxc->pid_bits, + req->recv.initiator); + } + + req->recv.rget_pid = CXI_MATCH_ID_PID(rxc->pid_bits, + req->recv.initiator); +} + +/* + * cxip_ux_send() - Progress an unexpected Send after receiving matching Put + * and Put and Put Overflow events. + * + * Caller must hold ep_obj->lock. 
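+ *
+ * The core copy-out from the Overflow buffer looks like the following
+ * (simplified; multi-receive duplication, rendezvous deferral and
+ * match-complete notification are handled in the body):
+ *
+ *   oflow_va    = (void *)CXI_IOVA_TO_VA(buf->md->md,
+ *                                        put_event->tgt_long.start);
+ *   oflow_bytes = MIN(put_event->tgt_long.mlength, match_req->data_len);
+ *   cxip_copy_to_md(match_req->recv.recv_md, match_req->recv.recv_buf,
+ *                   oflow_va, oflow_bytes);
+ *   oflow_req_put_bytes(oflow_req, put_event->tgt_long.mlength);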
+ */ +static int cxip_ux_send(struct cxip_req *match_req, struct cxip_req *oflow_req, + const union c_event *put_event, uint64_t mrecv_start, + uint32_t mrecv_len, bool remove_recv_entry) +{ + struct cxip_ptelist_buf *buf; + void *oflow_va; + size_t oflow_bytes; + union cxip_match_bits mb; + ssize_t ret; + struct cxip_req *parent_req = match_req; + + assert(match_req->type == CXIP_REQ_RECV); + + if (match_req->recv.multi_recv) { + if (put_event->tgt_long.rendezvous) + match_req = rdzv_mrecv_req_event(match_req, put_event); + else + match_req = cxip_mrecv_req_dup(match_req); + if (!match_req) + return -FI_EAGAIN; + + /* Set start and length uniquely for an unexpected + * mrecv request. + */ + match_req->recv.recv_buf = (uint8_t *) + match_req->recv.parent->recv.recv_buf + + mrecv_start; + match_req->buf = (uint64_t)match_req->recv.recv_buf; + match_req->data_len = mrecv_len; + } else { + match_req->data_len = put_event->tgt_long.rlength; + if (match_req->data_len > match_req->recv.ulen) + match_req->data_len = match_req->recv.ulen; + } + + cxip_rxc_hpc_recv_req_tgt_event(match_req, put_event); + buf = oflow_req->req_ctx; + oflow_va = (void *)CXI_IOVA_TO_VA(buf->md->md, + put_event->tgt_long.start); + + /* Copy data out of overflow buffer. */ + oflow_bytes = MIN(put_event->tgt_long.mlength, match_req->data_len); + cxip_copy_to_md(match_req->recv.recv_md, match_req->recv.recv_buf, + oflow_va, oflow_bytes); + + if (oflow_req->type == CXIP_REQ_OFLOW) + oflow_req_put_bytes(oflow_req, put_event->tgt_long.mlength); + + /* Remaining unexpected rendezvous processing is deferred until RGet + * completes. + */ + if (put_event->tgt_long.rendezvous) { + if (remove_recv_entry) + dlist_remove_init(&parent_req->recv.rxc_entry); + + rdzv_recv_req_event(match_req, put_event->hdr.event_type); + return FI_SUCCESS; + } + + mb.raw = put_event->tgt_long.match_bits; + + /* Check if the initiator requires match completion guarantees. + * If so, notify the initiator that the match is now complete. + * Delay the Receive event until the notification is complete. + */ + if (mb.match_comp) { + ret = cxip_notify_match(match_req, put_event); + if (ret != FI_SUCCESS) { + if (match_req->recv.multi_recv) + cxip_evtq_req_free(match_req); + + return -FI_EAGAIN; + } + + if (remove_recv_entry) + dlist_remove_init(&parent_req->recv.rxc_entry); + + return FI_SUCCESS; + } + + if (remove_recv_entry) + dlist_remove_init(&parent_req->recv.rxc_entry); + + cxip_recv_req_report(match_req); + + if (match_req->recv.multi_recv) + cxip_evtq_req_free(match_req); + else + cxip_recv_req_free(match_req); + + return FI_SUCCESS; +} + +/* + * cxip_ux_send_zb() - Progress an unexpected zero-byte Send after receiving + * a Put Overflow event. + * + * Zero-byte Put events for unexpected Sends are discarded. Progress the Send + * using only the Overflow event. There is no Send data to be copied out. 
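+ *
+ * The handling reduces to target-event bookkeeping plus an optional
+ * match-complete notification (simplified; multi-receive duplication and
+ * request cleanup are shown in the body):
+ *
+ *   cxip_rxc_hpc_recv_req_tgt_event(match_req, oflow_event);
+ *   match_req->data_len = 0;
+ *   if (mb.match_comp)
+ *           return cxip_notify_match(match_req, oflow_event);
+ *   cxip_recv_req_report(match_req);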
+ */ +static int cxip_ux_send_zb(struct cxip_req *match_req, + const union c_event *oflow_event, + uint64_t mrecv_start, bool remove_recv_entry) +{ + union cxip_match_bits mb; + int ret; + struct cxip_req *parent_req = match_req; + + assert(!oflow_event->tgt_long.rlength); + + if (match_req->recv.multi_recv) { + match_req = cxip_mrecv_req_dup(match_req); + if (!match_req) + return -FI_EAGAIN; + + match_req->buf = (uint64_t) + match_req->recv.parent->recv.recv_buf + + mrecv_start; + } + + cxip_rxc_hpc_recv_req_tgt_event(match_req, oflow_event); + + match_req->data_len = 0; + + mb.raw = oflow_event->tgt_long.match_bits; + + /* Check if the initiator requires match completion guarantees. + * If so, notify the initiator that the match is now complete. + * Delay the Receive event until the notification is complete. + */ + if (mb.match_comp) { + ret = cxip_notify_match(match_req, oflow_event); + if (ret != FI_SUCCESS) { + if (match_req->recv.multi_recv) + cxip_evtq_req_free(match_req); + + return -FI_EAGAIN; + } + + if (remove_recv_entry) + dlist_remove_init(&parent_req->recv.rxc_entry); + + return FI_SUCCESS; + } + + if (remove_recv_entry) + dlist_remove_init(&parent_req->recv.rxc_entry); + + cxip_recv_req_report(match_req); + + if (match_req->recv.multi_recv) + cxip_evtq_req_free(match_req); + else + cxip_recv_req_free(match_req); + + return FI_SUCCESS; +} + +static bool cxip_ux_is_onload_complete(struct cxip_req *req) +{ + return !req->search.puts_pending && req->search.complete; +} + +/* Caller must hold ep_obj->lock. */ +static int cxip_oflow_process_put_event(struct cxip_rxc_hpc *rxc, + struct cxip_req *req, + const union c_event *event) +{ + int ret; + struct cxip_deferred_event *def_ev; + struct cxip_req *save_req; + bool matched; + + def_ev = match_put_event(rxc, req, event, &matched); + if (!matched) + return !def_ev ? -FI_EAGAIN : FI_SUCCESS; + + RXC_DBG(rxc, "Overflow beat Put event: %p\n", def_ev->req); + + if (def_ev->ux_send) { + /* UX Send was onloaded for one of these reasons: + * 1) Flow control + * 2) ULE was claimed by a FI_CLAIM + */ + save_req = def_ev->req; + def_ev->ux_send->req = req; + def_ev->ux_send->put_ev = *event; + + if (def_ev->ux_send->claimed) { + cxip_rxc_hpc_recv_req_tgt_event(save_req, + &def_ev->ux_send->put_ev); + cxip_recv_req_peek_complete(save_req, def_ev->ux_send); + RXC_DBG(rxc, "FI_CLAIM put complete: %p, ux_send %p\n", + save_req, def_ev->ux_send); + goto done; + } else { + def_ev->req->search.puts_pending--; + RXC_DBG(rxc, "put complete: %p\n", def_ev->req); + } + + if (cxip_ux_is_onload_complete(def_ev->req)) + cxip_ux_onload_complete(def_ev->req); + + } else { + ret = cxip_ux_send(def_ev->req, req, event, def_ev->mrecv_start, + def_ev->mrecv_len, false); + if (ret != FI_SUCCESS) + return -FI_EAGAIN; + } + +done: + free_put_event(rxc, def_ev); + + return FI_SUCCESS; +} + +/* cxip_rxp_check_le_usage_hybrid_preempt() - Examines LE Pool usage and forces + * a preemptive hardware to software transition if needed. + * + * In cases where the LE pool entry reservation is insufficient to meet request + * list buffers (due to multiple EP sharing an LE Pool or insufficient LE Pool + * reservation value), then enabling the periodic checking of LE allocations + * can be used to force preemptive transitions to software match mode before + * resources are exhausted or so depleted they are starve software managed + * endpoint. The lpe_stat_2 is set to the number of LE pool entries allocated + * to the LE pool and lpe_stat_1 is the current allocation. 
Skid is required + * as stats are relative to hardware processing, not software processing of + * the event. + * + * Caller should hold ep_obj->lock. + */ +static inline bool +cxip_rxp_check_le_usage_hybrid_preempt(struct cxip_rxc_hpc *rxc, + const union c_event *event) +{ + if (event->tgt_long.lpe_stat_1 > (event->tgt_long.lpe_stat_2 >> 1) && + rxc->base.state == RXC_ENABLED) { + if (cxip_recv_pending_ptlte_disable(&rxc->base, false)) + RXC_WARN(rxc, "Force FC failed\n"); + return true; + } + return false; +} + +static int cxip_rxc_check_ule_hybrid_preempt(struct cxip_rxc_hpc *rxc) +{ + int ret; + int count; + + if (cxip_env.rx_match_mode == CXIP_PTLTE_HYBRID_MODE && + cxip_env.hybrid_unexpected_msg_preemptive == 1) { + count = ofi_atomic_get32(&rxc->orx_hw_ule_cnt); + + if (rxc->base.state == RXC_ENABLED && + count > rxc->base.attr.size) { + ret = cxip_recv_pending_ptlte_disable(&rxc->base, + false); + if (ret == FI_SUCCESS) { + RXC_WARN(rxc, + "Transitioning to SW EP due to too many unexpected messages: posted_count=%u request_size=%lu\n", + ret, rxc->base.attr.size); + } else { + assert(ret == -FI_EAGAIN); + RXC_WARN(rxc, + "Failed to transition to SW EP: %d\n", + ret); + } + + return ret; + } + } + + return FI_SUCCESS; +} + +/* + * cxip_oflow_cb() - Process an Overflow buffer event. + * + * Overflow buffers are used to land unexpected Send data. Link, Unlink + * and Put events are expected from Overflow buffers. However, Link + * events will only be requested when running in hybrid RX match mode + * with FI_CXI_HYBRID_PREEMPTIVE=1. + * + * An Unlink event indicates that buffer space was exhausted. Overflow buffers + * are configured to use locally managed LEs. When enough Puts match in an + * Overflow buffer, consuming its space, the NIC automatically unlinks the LE. + * An automatic Unlink event is generated before the final Put which caused + * buffer space to become exhausted. + * + * An Unlink event is generated by an Unlink command. Overflow buffers are + * manually unlinked in this way during teardown. When an LE is manually + * unlinked the auto_unlinked field in the corresponding event is zero. In this + * case, the request is freed immediately. + * + * A Put event is generated for each Put that matches the Overflow buffer LE. + * This event indicates that data is available in the Overflow buffer. This + * event must be correlated to a Put Overflow event from a user receive buffer + * LE. The Put Overflow event may arrive before or after the Put event. + * + * When each Put event arrives, check for the existence of a previously posted + * receive buffer which generated a matching Put Overflow event. If such a + * buffer exists, copy data from the Overflow buffer to the user receive + * buffer. Otherwise, store a record of the Put event for matching once a user + * posts a new buffer that matches the unexpected Put. + * + * If data will remain in the Overflow buffer, take a reference to it to + * prevent it from being freed. If an Unlink-Put event is detected, drop a + * reference to the Overflow buffer so it is automatically freed once all user + * data is copied out. 
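+ *
+ * The Put-event path below condenses to (simplified; auto-unlink
+ * replenishment and hybrid preemption checks omitted):
+ *
+ *   ofi_atomic_inc32(&rxc->orx_hw_ule_cnt);
+ *   if (!event->tgt_long.rlength)
+ *           return FI_SUCCESS;      /* unexpected 0-byte Puts are dropped */
+ *   return cxip_oflow_process_put_event(rxc, req, event);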
+ */ +static int cxip_oflow_cb(struct cxip_req *req, const union c_event *event) +{ + struct cxip_ptelist_buf *oflow_buf = req->req_ctx; + struct cxip_rxc_hpc *rxc = oflow_buf->rxc; + int ret = FI_SUCCESS; + + switch (event->hdr.event_type) { + case C_EVENT_LINK: + /* Success events only used with hybrid preemptive */ + if (cxi_event_rc(event) == C_RC_OK) { + + if (!cxip_env.hybrid_preemptive) + return FI_SUCCESS; + + /* Check for possible hybrid mode preemptive + * transitions to software managed mode. + */ + if (cxip_rxp_check_le_usage_hybrid_preempt(rxc, event)) + RXC_WARN(rxc, + "Force preemptive switch to SW EP\n"); + return FI_SUCCESS; + } + + assert(cxi_event_rc(event) == C_RC_NO_SPACE); + + RXC_DBG(rxc, "Oflow LE append failed\n"); + + ret = cxip_recv_pending_ptlte_disable(&rxc->base, true); + if (ret != FI_SUCCESS) + RXC_WARN(rxc, "Force disable failed %d %s\n", + ret, fi_strerror(-ret)); + cxip_ptelist_buf_link_err(oflow_buf, cxi_event_rc(event)); + return ret; + case C_EVENT_UNLINK: + assert(!event->tgt_long.auto_unlinked); + + cxip_ptelist_buf_unlink(oflow_buf); + return FI_SUCCESS; + case C_EVENT_PUT: + /* Put event handling is complicated. Handle below. */ + break; + default: + RXC_FATAL(rxc, CXIP_UNEXPECTED_EVENT, + cxi_event_to_str(event), + cxi_rc_to_str(cxi_event_rc(event))); + } + + ofi_atomic_inc32(&rxc->orx_hw_ule_cnt); + + if (event->tgt_long.auto_unlinked) { + + oflow_buf->unlink_length = event->tgt_long.start - + CXI_VA_TO_IOVA(oflow_buf->md->md, oflow_buf->data) + + event->tgt_long.mlength; + + ofi_atomic_dec32(&oflow_buf->pool->bufs_linked); + + RXC_DBG(rxc, "Oflow auto unlink buf %p, linked %u\n", oflow_buf, + ofi_atomic_get32(&oflow_buf->pool->bufs_linked)); + + /* Replace the eager overflow buffer. */ + cxip_ptelist_buf_replenish(rxc->oflow_list_bufpool, false); + } + + ret = cxip_rxc_check_ule_hybrid_preempt(rxc); + if (ret) + goto err_dec_ule; + + /* Drop all unexpected 0-byte Put events. */ + if (!event->tgt_long.rlength) + return FI_SUCCESS; + + /* Handle Put events */ + ret = cxip_oflow_process_put_event(rxc, req, event); + if (ret) + goto err_dec_ule; + + return FI_SUCCESS; + +err_dec_ule: + ofi_atomic_dec32(&rxc->orx_hw_ule_cnt); + + return ret; +} + +static void rdzv_send_req_event(struct cxip_req *req); + +/* + * cxip_rdzv_pte_zbp_cb() - Process zero-byte Put events. + * + * Zero-byte Puts (ZBP) are used to transfer small messages without consuming + * buffers outside of the EQ. ZBPs are currently only used for match complete + * messages. 
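+ *
+ * The handler below also processes rendezvous-done notifications
+ * (mb.rdzv_done set), which arrive on the same zero-byte Put path when a
+ * software done notification is required (see cxip_rdzv_done_notify()).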
+ */ +int cxip_rdzv_pte_zbp_cb(struct cxip_req *req, const union c_event *event) +{ + struct cxip_rdzv_pte *rdzv_pte = req->req_ctx; + struct cxip_txc_hpc *txc = rdzv_pte->txc; + struct cxip_req *put_req; + union cxip_match_bits mb; + int event_rc = cxi_event_rc(event); + int rdzv_id; + int ret; + + switch (event->hdr.event_type) { + case C_EVENT_LINK: + if (event_rc == C_RC_OK) + ofi_atomic_inc32(&rdzv_pte->le_linked_success_count); + else + ofi_atomic_inc32(&rdzv_pte->le_linked_failure_count); + return FI_SUCCESS; + + case C_EVENT_PUT: + mb.raw = event->tgt_long.match_bits; + + if (mb.rdzv_done) { + rdzv_id = (mb.rdzv_id_hi << CXIP_RDZV_ID_CMD_WIDTH) | + mb.rdzv_id_lo; + put_req = cxip_rdzv_id_lookup(txc, rdzv_id); + if (!put_req) { + TXC_WARN(txc, "Failed to find RDZV ID: %d\n", + rdzv_id); + return FI_SUCCESS; + } + + if (event_rc != C_RC_OK) + TXC_WARN(txc, "RDZV Done error: %p rc: %s\n", + put_req, cxi_rc_to_str(event_rc)); + else + TXC_DBG(txc, "RDZV Done ACK: %p rc: %s\n", + put_req, cxi_rc_to_str(event_rc)); + + put_req->send.rc = event_rc; + rdzv_send_req_event(put_req); + + return FI_SUCCESS; + } + + /* Match complete */ + put_req = cxip_tx_id_lookup(txc, mb.tx_id); + if (!put_req) { + TXC_WARN(txc, "Failed to find TX ID: %d\n", mb.tx_id); + return FI_SUCCESS; + } + + event_rc = cxi_tgt_event_rc(event); + if (event_rc != C_RC_OK) + TXC_WARN(txc, "ZBP error: %p rc: %s\n", put_req, + cxi_rc_to_str(event_rc)); + else + TXC_DBG(txc, "ZBP received: %p rc: %s\n", put_req, + cxi_rc_to_str(event_rc)); + + ret = cxip_send_req_dequeue(put_req->send.txc_hpc, put_req); + if (ret != FI_SUCCESS) + return ret; + + cxip_tx_id_free(txc, mb.tx_id); + + /* The unexpected message has been matched. Generate a + * completion event. The ZBP event is guaranteed to arrive + * after the eager Send Ack, so the transfer is always done at + * this point. + * + * If MATCH_COMPLETE was requested, software must manage + * counters. + */ + cxip_report_send_completion(put_req, true); + + ofi_atomic_dec32(&put_req->send.txc->otx_reqs); + cxip_evtq_req_free(put_req); + + return FI_SUCCESS; + + default: + TXC_FATAL(txc, CXIP_UNEXPECTED_EVENT, + cxi_event_to_str(event), + cxi_rc_to_str(cxi_event_rc(event))); + } +} + +/* + * cxip_oflow_bufpool_fini() - Finalize overflow buffers used for messaging. + * + * Must be called with the RX PtlTE disabled. + */ +void cxip_oflow_bufpool_fini(struct cxip_rxc_hpc *rxc) +{ + struct cxip_deferred_event *def_ev = NULL; + struct cxip_ptelist_buf *oflow_buf; + struct dlist_entry *tmp; + int i; + int def_events = 0; + + /* Clean up unexpected Put records. The PtlTE is disabled, so no more + * events can be expected. + */ + for (i = 0; i < CXIP_DEF_EVENT_HT_BUCKETS; i++) { + dlist_foreach_container_safe(&rxc->deferred_events.bh[i], + struct cxip_deferred_event, + def_ev, rxc_entry, tmp) { + /* Dropping the last reference will cause the + * oflow_buf to be removed from the RXC list and + * freed. 
+ */ + oflow_buf = def_ev->req->req_ctx; + + if (oflow_buf->le_type == CXIP_LE_TYPE_RX) + oflow_req_put_bytes(def_ev->req, + def_ev->ev.tgt_long.mlength); + + free_put_event(rxc, def_ev); + def_events++; + } + } + + if (def_events) + RXC_DBG(rxc, "Freed %d deferred event(s)\n", def_events); + + cxip_ptelist_bufpool_fini(rxc->oflow_list_bufpool); +} + +int cxip_oflow_bufpool_init(struct cxip_rxc_hpc *rxc) +{ + struct cxip_ptelist_bufpool_attr attr = { + .list_type = C_PTL_LIST_OVERFLOW, + .ptelist_cb = cxip_oflow_cb, + .buf_size = cxip_env.oflow_buf_size, + .min_posted = cxip_env.oflow_buf_min_posted, + .max_posted = cxip_env.oflow_buf_min_posted, /* min == max */ + .max_cached = cxip_env.oflow_buf_max_cached, + .min_space_avail = rxc->max_eager_size, + }; + + return cxip_ptelist_bufpool_init(rxc, &rxc->oflow_list_bufpool, &attr); +} + +/* + * cxip_rdzv_done_notify() - Sends a rendezvous complete from target to source + * + * Sends a zero byte matching notification to the source of rendezvous + * indicating completion of a rendezvous. This is used when restricted get + * DMA (CXIP_RDZV_PROTO_ALT_READ) is used to transfer non-eager data. + */ +static int cxip_rdzv_done_notify(struct cxip_req *req) +{ + struct cxip_rxc_hpc *rxc = req->recv.rxc_hpc; + union c_fab_addr dfa; + uint32_t pid_idx = CXIP_PTL_IDX_RDZV_DEST; + uint32_t match_id; + struct c_full_dma_cmd cmd = {}; + union cxip_match_bits mb = {}; + int ret; + uint8_t idx_ext; + + mb.rdzv_id_lo = req->recv.rdzv_id; + mb.rdzv_id_hi = req->recv.rdzv_id >> CXIP_RDZV_ID_CMD_WIDTH; + mb.rdzv_done = 1; + mb.le_type = CXIP_LE_TYPE_ZBP; + + cxi_build_dfa(req->recv.rget_nic, req->recv.rget_pid, + rxc->base.pid_bits, pid_idx, &dfa, &idx_ext); + match_id = CXI_MATCH_ID(rxc->base.pid_bits, + rxc->base.ep_obj->src_addr.pid, + rxc->base.ep_obj->src_addr.nic); + + cmd.command.cmd_type = C_CMD_TYPE_DMA; + cmd.command.opcode = C_CMD_PUT; + cmd.index_ext = idx_ext; + cmd.event_send_disable = 1; + cmd.dfa = dfa; + cmd.eq = cxip_evtq_eqn(&rxc->base.ep_obj->txc->tx_evtq); + cmd.user_ptr = (uint64_t)req; + cmd.initiator = match_id; + cmd.match_bits = mb.raw; + + RXC_DBG(rxc, "RDZV done notify send RDZV ID: %d\n", + req->recv.rdzv_id); + + ret = cxip_rxc_emit_dma(rxc, req->recv.vni, + cxip_ofi_to_cxi_tc(cxip_env.rget_tc), + CXI_TC_TYPE_DEFAULT, &cmd, 0); + if (ret) + RXC_WARN(rxc, "Failed to issue rdvz done: %d\n", ret); + + return ret; +} + +static int cxip_recv_rdzv_cb(struct cxip_req *req, const union c_event *event) +{ + struct cxip_rxc_hpc *rxc = req->recv.rxc_hpc; + struct cxip_deferred_event *def_ev; + int event_rc; + int ret; + bool matched; + + assert(rxc->base.protocol == FI_PROTO_CXI); + + switch (event->hdr.event_type) { + /* When errors happen, send events can occur before the put/get event. + * These events should just be dropped. + */ + case C_EVENT_SEND: + RXC_WARN(rxc, CXIP_UNEXPECTED_EVENT, + cxi_event_to_str(event), + cxi_rc_to_str(cxi_event_rc(event))); + return FI_SUCCESS; + + case C_EVENT_PUT_OVERFLOW: + /* We matched an unexpected header */ + /* Check for a previously received unexpected Put event, + * if not found defer until it arrives. + */ + def_ev = match_put_event(rxc, req, event, &matched); + if (!def_ev) + return -FI_EAGAIN; + + /* For multi-recv, management of start_offset requires events + * manage_local related events to arrive in order. + * Only C_EVENT_PUT_OVERFLOW events meet this criteria. 
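+	 *
+	 * Bookkeeping sketch: mrecv_start records the offset already consumed
+	 * in the multi-recv buffer and mrecv_len the bytes this Put consumes;
+	 * if the event auto-unlinked the buffer, their sum becomes
+	 * mrecv_unlink_bytes.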
+ */ + def_ev->mrecv_start = req->recv.start_offset; + def_ev->mrecv_len = + mrecv_req_put_bytes(req, event->tgt_long.rlength); + + if (req->recv.multi_recv && event->tgt_long.auto_unlinked) { + /* If a C_EVENT_PUT_OVERFLOW unlinks a multi-recv + * buffer, mrecv_start contains the number of bytes + * consumed before this C_EVENT_PUT_OVERFLOW. Adding in + * mrecv_len gets the total bytes consumed. + */ + req->recv.auto_unlinked = true; + req->recv.mrecv_unlink_bytes = + def_ev->mrecv_start + def_ev->mrecv_len; + } + + ofi_atomic_dec32(&rxc->orx_hw_ule_cnt); + + if (!matched) + return FI_SUCCESS; + + RXC_DBG(rxc, "Matched deferred event: %p\n", def_ev); + + ret = cxip_ux_send(req, def_ev->req, &def_ev->ev, + def_ev->mrecv_start, def_ev->mrecv_len, + false); + if (ret == FI_SUCCESS) { + free_put_event(rxc, def_ev); + } else { + /* undo mrecv_req_put_bytes() and orx_hw_ule_cnt dec */ + req->recv.start_offset -= def_ev->mrecv_len; + ofi_atomic_inc32(&rxc->orx_hw_ule_cnt); + } + + return ret; + case C_EVENT_PUT: + /* Eager data was delivered directly to the user buffer. */ + if (req->recv.multi_recv) { + if (event->tgt_long.auto_unlinked) { + uintptr_t mrecv_head; + uintptr_t mrecv_tail; + size_t mrecv_bytes_remaining; + size_t rlen; + + /* For C_EVENT_PUT, need to calculate how much + * of the multi-recv buffer was consumed while + * factoring in any truncation. + */ + mrecv_head = + CXI_IOVA_TO_VA(req->recv.recv_md->md, + event->tgt_long.start); + mrecv_tail = (uintptr_t)req->recv.recv_buf + + req->recv.ulen; + mrecv_bytes_remaining = mrecv_tail - mrecv_head; + rlen = MIN(mrecv_bytes_remaining, + event->tgt_long.rlength); + + req->recv.auto_unlinked = true; + req->recv.mrecv_unlink_bytes = + mrecv_head - + (uintptr_t)req->recv.recv_buf + rlen; + } + + req = rdzv_mrecv_req_event(req, event); + if (!req) + return -FI_EAGAIN; + + /* Set start pointer and data_len using Rendezvous or + * Put Overflow event (depending on if message was + * unexpected). + */ + } + + cxip_rxc_hpc_recv_req_tgt_event(req, event); + + /* Count the rendezvous event. */ + rdzv_recv_req_event(req, event->hdr.event_type); + return FI_SUCCESS; + case C_EVENT_RENDEZVOUS: + if (req->recv.multi_recv) { + req = rdzv_mrecv_req_event(req, event); + if (!req) + return -FI_EAGAIN; + + /* Use Rendezvous event to set start pointer and + * data_len for expected Sends. + */ + struct cxip_req *parent = req->recv.parent; + size_t mrecv_bytes_remaining; + + req->buf = CXI_IOVA_TO_VA( + parent->recv.recv_md->md, + event->tgt_long.start) - + event->tgt_long.mlength; + req->recv.recv_buf = (void *)req->buf; + + mrecv_bytes_remaining = + (uint64_t)parent->recv.recv_buf + + parent->recv.ulen - + (uint64_t)req->recv.recv_buf; + req->data_len = MIN(mrecv_bytes_remaining, + event->tgt_long.rlength); + } else { + req->data_len = MIN(req->recv.ulen, + event->tgt_long.rlength); + } + + cxip_rxc_hpc_recv_req_tgt_event(req, event); + + if (!event->tgt_long.get_issued) { + if (ofi_atomic_inc32(&rxc->orx_tx_reqs) > + rxc->base.max_tx || issue_rdzv_get(req)) { + + /* Could not issue get */ + ofi_atomic_dec32(&rxc->orx_tx_reqs); + + /* Undo multi-recv event processing. */ + if (req->recv.multi_recv && + !req->recv.rdzv_events) { + dlist_remove(&req->recv.children); + cxip_evtq_req_free(req); + } + return -FI_EAGAIN; + } + + RXC_DBG(rxc, "Software issued Get, req: %p\n", req); + } + + /* Count the rendezvous event. 
*/ + rdzv_recv_req_event(req, event->hdr.event_type); + return FI_SUCCESS; + case C_EVENT_REPLY: + /* If mrecv, look up the correct child request. */ + if (req->recv.multi_recv) { + req = rdzv_mrecv_req_event(req, event); + if (!req) + return -FI_EAGAIN; + } + + /* If a rendezvous operation requires a done notification + * send it. Must wait for the ACK from the notify to be returned + * before completing the target operation. + */ + if (req->recv.done_notify) { + if (ofi_atomic_inc32(&rxc->orx_tx_reqs) > + rxc->base.max_tx || cxip_rdzv_done_notify(req)) { + + /* Could not issue notify, will be retried */ + ofi_atomic_dec32(&rxc->orx_tx_reqs); + return -FI_EAGAIN; + } + } + + /* Rendezvous Get completed, update event counts and + * complete if using unrestricted get protocol. + */ + req->recv.rc = cxi_init_event_rc(event); + rdzv_recv_req_event(req, event->hdr.event_type); + + /* If RGet initiated by software return the TX credit */ + if (!event->init_short.rendezvous) { + ofi_atomic_dec32(&req->recv.rxc_hpc->orx_tx_reqs); + assert(ofi_atomic_get32(&req->recv.rxc_hpc->orx_tx_reqs) + >= 0); + } + + return FI_SUCCESS; + + case C_EVENT_ACK: + event_rc = cxi_init_event_rc(event); + if (event_rc != C_RC_OK) + RXC_WARN(rxc, "%#x:%u Bad RDZV notify ACK status %s\n", + req->recv.rget_nic, req->recv.rget_pid, + cxi_rc_to_str(event_rc)); + + /* Special case of the ZBP destination EQ being full and ZBP + * could not complete. This must be retried, we use the TX + * credit already allocated. + */ + if (event_rc == C_RC_ENTRY_NOT_FOUND) { + usleep(CXIP_DONE_NOTIFY_RETRY_DELAY_US); + + if (cxip_rdzv_done_notify(req)) + return -FI_EAGAIN; + + return FI_SUCCESS; + } + + /* Reflect the completion status of the ACK in the target + * side completion so that a failure will not go undetected. + */ + req->recv.rc = event_rc; + ofi_atomic_dec32(&req->recv.rxc_hpc->orx_tx_reqs); + rdzv_recv_req_event(req, event->hdr.event_type); + + return FI_SUCCESS; + + default: + RXC_FATAL(rxc, CXIP_UNEXPECTED_EVENT, + cxi_event_to_str(event), + cxi_rc_to_str(cxi_event_rc(event))); + } +} + +/* + * cxip_recv_cb() - Process a user receive buffer event. + * + * A user receive buffer is described by an LE linked to the Priority list. + * Link, Unlink, Put, Put Overflow, and Reply events are expected from a user + * receive buffer. + * + * A Link event indicates that a new user buffer has been linked to the + * priority list. Successful Link events may be suppressed. + * + * An Unlink event indicates that a user buffer has been unlinked. Normally, a + * receive is used once and unlinked when it is matched with a Send. In this + * case, a successful Unlink event may be suppressed. + * + * For expected, eager Sends, a Put will be matched to a user receive buffer by + * the NIC. Send data is copied directly to the user buffer. A Put event is + * generated describing the match. + * + * For unexpected, eager Sends, a Put will first match a buffer in the Overflow + * list. See cxip_oflow_cb() for details on Overflow event handling. Once a + * matching user receive buffer is appended to the Priority list, a Put + * Overflow event is generated. Put and Put Overflow events for an unexpected, + * eager Send must be correlated. These events may arrive in any order. Once + * both events are accounted, data is copied from the Overflow buffer to the + * user receive buffer. + * + * Unexpected, eager Sends that are longer than the eager threshold have their + * data truncated to zero. 
This is to avoid long messages consuming too much + * Overflow buffer space at the target. Once a match is made with a user + * receive buffer, data is re-read from the initiator using a Get. + * + * Rendezvous receive events are handled by cxip_recv_rdzv_cb(). + */ +static int cxip_recv_cb(struct cxip_req *req, const union c_event *event) +{ + int ret; + struct cxip_rxc_hpc *rxc = req->recv.rxc_hpc; + struct cxip_deferred_event *def_ev; + bool rdzv = false; + bool matched; + + /* Common processing for rendezvous and non-rendezvous events. + * TODO: Avoid having two switch statements for event_type. + */ + switch (event->hdr.event_type) { + case C_EVENT_LINK: + /* In cases where the LE pool entry reservation is insufficient + * to meet priority list buffers (due to multiple EP sharing an + * LE Pool or insufficient LE Pool reservation value), then + * enabling the periodic checking of LE allocations can be + * used to force preemptive transitions to software match mode. + */ + if (cxi_tgt_event_rc(event) == C_RC_OK) { + + if (!cxip_env.hybrid_recv_preemptive) + return FI_SUCCESS; + + /* Check for possible hybrid mode preemptive + * transitions to software managed mode. + */ + if (cxip_rxp_check_le_usage_hybrid_preempt(rxc, event)) + RXC_WARN(rxc, + "Force preemptive switch to SW EP\n"); + + return FI_SUCCESS; + } + + /* If endpoint has been disabled and an append fails, free the + * user request without reporting any event. + */ + if (rxc->base.state == RXC_DISABLED) { + cxip_recv_req_free(req); + return FI_SUCCESS; + } + + /* Save append to repost, NIC will initiate transition to + * software managed EP. + */ + if (cxi_tgt_event_rc(event) == C_RC_PTLTE_SW_MANAGED) { + RXC_WARN(rxc, "Append err, transitioning to SW\n"); + cxip_recv_req_dropped(req); + + return FI_SUCCESS; + } + + /* Transition into onload and flow control if an append + * fails. + */ + if (cxi_tgt_event_rc(event) != C_RC_NO_SPACE) + RXC_FATAL(rxc, CXIP_UNEXPECTED_EVENT_STS, + cxi_event_to_str(event), + cxi_rc_to_str(cxi_tgt_event_rc(event))); + + RXC_WARN(rxc, "Append err, priority LE exhaustion\n"); + + /* Manually transition to DISABLED to initiate flow control + * and onload instead of waiting for eventual NIC no match + * transition. + */ + ret = cxip_recv_pending_ptlte_disable(&rxc->base, true); + if (ret != FI_SUCCESS) + RXC_WARN(rxc, "Force disable failed %d %s\n", + ret, fi_strerror(-ret)); + + ret = FI_SUCCESS; + cxip_recv_req_dropped(req); + + return ret; + + case C_EVENT_UNLINK: + assert(!event->tgt_long.auto_unlinked); + + /* TODO: This is broken with multi-recv. The multi-recv request + * may be freed with pending child requests. + */ + req->recv.unlinked = true; + cxip_recv_req_report(req); + cxip_recv_req_free(req); + + return FI_SUCCESS; + + case C_EVENT_PUT_OVERFLOW: + cxip_rxc_record_req_stat(&rxc->base, C_PTL_LIST_OVERFLOW, + event->tgt_long.rlength, req); + + /* ULE freed. Update RXC state to signal that the RXC should + * be reenabled. + */ + /* TODO: this is not atomic, there must be a better way */ + if (rxc->base.state == RXC_ONLOAD_FLOW_CONTROL) + rxc->base.state = RXC_ONLOAD_FLOW_CONTROL_REENABLE; + break; + + case C_EVENT_PUT: + cxip_rxc_record_req_stat(&rxc->base, C_PTL_LIST_PRIORITY, + event->tgt_long.rlength, req); + break; + default: + break; + } + + /* All events related to an offloaded rendezvous receive will be + * handled by cxip_recv_rdzv_cb(). Those events are identified by the + * event rendezvous field. 
Two exceptions are a Reply event generated + * from a SW-issued Get, and a Ack for a software done notification + * when using restricted eager get. When such an event is generated, + * the request will have already processed a Rendezvous event. If the + * rendezvous field is not set, but the rdzv_events count is elevated, + * this must be a SW-issued Reply or Ack event. + */ + if (event->hdr.event_type == C_EVENT_REPLY || + event->hdr.event_type == C_EVENT_ACK) + rdzv = (event->init_short.rendezvous || req->recv.rdzv_events); + else + rdzv = event->tgt_long.rendezvous; + + if (rdzv) + return cxip_recv_rdzv_cb(req, event); + + switch (event->hdr.event_type) { + case C_EVENT_SEND: + /* TODO Handle Send event errors. */ + assert(cxi_event_rc(event) == C_RC_OK); + return FI_SUCCESS; + case C_EVENT_PUT_OVERFLOW: + /* We matched an unexpected header */ + /* Unexpected 0-byte Put events are dropped. Skip matching. */ + if (!event->tgt_long.rlength) { + ret = cxip_ux_send_zb(req, event, + req->recv.start_offset, false); + if (ret == FI_SUCCESS) + ofi_atomic_dec32(&rxc->orx_hw_ule_cnt); + + return ret; + } + + /* Check for a previously received unexpected Put event, + * if not found defer until it arrives. + */ + def_ev = match_put_event(rxc, req, event, &matched); + if (!def_ev) + return -FI_EAGAIN; + + /* For multi-recv, management of start_offset requires events + * manage_local related events to arrive in order. + * Only C_EVENT_PUT_OVERFLOW events meet this criteria. + */ + def_ev->mrecv_start = req->recv.start_offset; + def_ev->mrecv_len = + mrecv_req_put_bytes(req, event->tgt_long.rlength); + + if (req->recv.multi_recv && event->tgt_long.auto_unlinked) { + /* If a C_EVENT_PUT_OVERFLOW unlinks a multi-recv + * buffer, mrecv_start contains the number of bytes + * consumed before this C_EVENT_PUT_OVERFLOW. Adding in + * mrecv_len gets the total bytes consumed. + */ + req->recv.auto_unlinked = true; + req->recv.mrecv_unlink_bytes = + def_ev->mrecv_start + def_ev->mrecv_len; + } + + ofi_atomic_dec32(&rxc->orx_hw_ule_cnt); + + if (!matched) + return FI_SUCCESS; + + ret = cxip_ux_send(req, def_ev->req, &def_ev->ev, + def_ev->mrecv_start, def_ev->mrecv_len, + false); + if (ret == FI_SUCCESS) { + free_put_event(rxc, def_ev); + } else { + /* undo mrecv_req_put_bytes() and orx_hw_ule_cnt dec */ + req->recv.start_offset -= def_ev->mrecv_len; + ofi_atomic_inc32(&rxc->orx_hw_ule_cnt); + } + + return ret; + case C_EVENT_PUT: + /* Data was delivered directly to the user buffer. Complete the + * request. + */ + return cxip_complete_put(req, event); + + case C_EVENT_REPLY: + /* Long-send Get completed. Complete the request. */ + req->recv.rc = cxi_init_event_rc(event); + + cxip_recv_req_report(req); + if (req->recv.multi_recv) + cxip_evtq_req_free(req); + else + cxip_recv_req_free(req); + + return FI_SUCCESS; + default: + RXC_FATAL(rxc, CXIP_UNEXPECTED_EVENT, + cxi_event_to_str(event), + cxi_rc_to_str(cxi_event_rc(event))); + } +} + +/* + * cxip_recv_reenable() - Attempt to re-enable the RX queue. + * + * Called by disabled EP ready to re-enable. + * + * Determine if the RX queue can be re-enabled and perform a state change + * command if necessary. The Endpoint must receive dropped Send notifications + * from all peers who experienced drops before re-enabling the RX queue. + * + * Caller must hold ep_obj->lock. 
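+ *
+ * Concretely (see the body below): re-enable is attempted only once the
+ * locally accumulated drop_count equals the drop_count reported by
+ * cxil_pte_status(); until then -FI_EAGAIN is returned.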
+ */ +int cxip_recv_reenable(struct cxip_rxc_hpc *rxc) +{ + struct cxi_pte_status pte_status = {}; + int ret; + + if (rxc->drop_count == -1) { + RXC_WARN(rxc, "Waiting for pending FC_NOTIFY messages\n"); + return -FI_EAGAIN; + } + + ret = cxil_pte_status(rxc->base.rx_pte->pte, &pte_status); + assert(!ret); + + if (rxc->drop_count != pte_status.drop_count) { + RXC_DBG(rxc, "Processed %d/%d drops\n", + rxc->drop_count, pte_status.drop_count); + return -FI_EAGAIN; + } + + RXC_WARN(rxc, "Re-enabling PTE, drop_count %d\n", + rxc->drop_count); + + do { + ret = cxip_rxc_msg_enable(rxc, rxc->drop_count); + if (ret == -FI_EAGAIN && + rxc->new_state == RXC_ENABLED_SOFTWARE) { + RXC_WARN(rxc, + "PTE disable->sm drop mismatch, will retry\n"); + break; + } + } while (ret == -FI_EAGAIN); + + if (ret != FI_SUCCESS && ret != -FI_EAGAIN) + RXC_FATAL(rxc, "cxip_rxc_msg_enable failed: %d\n", ret); + + return ret; +} + +/* + * cxip_fc_resume_cb() - Process FC resume completion events. + */ +int cxip_fc_resume_cb(struct cxip_ctrl_req *req, const union c_event *event) +{ + struct cxip_fc_drops *fc_drops = container_of(req, + struct cxip_fc_drops, req); + struct cxip_rxc_hpc *rxc = fc_drops->rxc; + int ret = FI_SUCCESS; + + switch (event->hdr.event_type) { + case C_EVENT_ACK: + switch (cxi_event_rc(event)) { + case C_RC_OK: + RXC_DBG(rxc, + "FC_RESUME to %#x:%u:%u successfully sent: retry_count=%u\n", + fc_drops->nic_addr, fc_drops->pid, + fc_drops->vni, fc_drops->retry_count); + free(fc_drops); + break; + + /* This error occurs when the target's control event queue has + * run out of space. Since the target should be processing the + * event queue, it is safe to replay messages until C_RC_OK is + * returned. + */ + case C_RC_ENTRY_NOT_FOUND: + fc_drops->retry_count++; + RXC_WARN(rxc, + "%#x:%u:%u dropped FC message: retry_delay_usecs=%d retry_count=%u\n", + fc_drops->nic_addr, fc_drops->pid, + fc_drops->vni, cxip_env.fc_retry_usec_delay, + fc_drops->retry_count); + usleep(cxip_env.fc_retry_usec_delay); + ret = cxip_ctrl_msg_send(req); + break; + default: + RXC_FATAL(rxc, CXIP_UNEXPECTED_EVENT_STS, + cxi_event_to_str(event), + cxi_rc_to_str(cxi_event_rc(event))); + } + break; + default: + RXC_FATAL(rxc, CXIP_UNEXPECTED_EVENT, + cxi_event_to_str(event), + cxi_rc_to_str(cxi_event_rc(event))); + } + + return ret; +} + +/* + * cxip_fc_process_drops() - Process a dropped Send notification from a peer. + * + * Called by disabled EP waiting to re-enable. + * + * When a peer detects dropped Sends it follows up by sending a message to the + * disabled Endpoint indicating the number of drops experienced. The disabled + * Endpoint peer must count all drops before re-enabling its RX queue. + */ +int cxip_fc_process_drops(struct cxip_ep_obj *ep_obj, uint32_t nic_addr, + uint32_t pid, uint16_t vni, uint16_t drops) +{ + struct cxip_rxc_hpc *rxc = container_of(ep_obj->rxc, + struct cxip_rxc_hpc, base); + struct cxip_fc_drops *fc_drops; + int ret; + + fc_drops = calloc(1, sizeof(*fc_drops)); + if (!fc_drops) { + RXC_WARN(rxc, "Failed to allocate drops\n"); + return -FI_ENOMEM; + } + + /* TODO: Cleanup cxip_fc_drops fields. Many of the fields are redundant + * with the req structure. 
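+	 * The same addressing information (nic_addr/pid/vni) is also copied
+	 * into req.send below so that the CXIP_CTRL_MSG_FC_RESUME side-band
+	 * message can be issued directly from the embedded control request.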
+ */ + fc_drops->rxc = rxc; + fc_drops->nic_addr = nic_addr; + fc_drops->pid = pid; + fc_drops->vni = vni; + fc_drops->drops = drops; + + fc_drops->req.send.nic_addr = nic_addr; + fc_drops->req.send.pid = pid; + fc_drops->req.send.vni = vni; + fc_drops->req.send.mb.drops = drops; + + fc_drops->req.send.mb.ctrl_le_type = CXIP_CTRL_LE_TYPE_CTRL_MSG; + fc_drops->req.send.mb.ctrl_msg_type = CXIP_CTRL_MSG_FC_RESUME; + fc_drops->req.cb = cxip_fc_resume_cb; + fc_drops->req.ep_obj = rxc->base.ep_obj; + + dlist_insert_tail(&fc_drops->rxc_entry, &rxc->fc_drops); + + RXC_DBG(rxc, "Processed drops: %d NIC: %#x PID: %d\n", + drops, nic_addr, pid); + + rxc->drop_count += drops; + + /* Wait until search and delete completes before attempting to + * re-enable. + */ + if (rxc->base.state == RXC_FLOW_CONTROL) { + ret = cxip_recv_reenable(rxc); + assert(ret == FI_SUCCESS || ret == -FI_EAGAIN); + + /* Disable to software managed transition is synchronous + * in order to handle drop count mismatches correctly. If + * successful the H/W transition completed, otherwise it + * will be retried when notified and count matches. + */ + if (rxc->new_state == RXC_ENABLED_SOFTWARE && + ret == FI_SUCCESS) { + cxip_fc_progress_ctrl(rxc); + rxc->base.state = RXC_ENABLED_SOFTWARE; + RXC_WARN(rxc, "Now in RXC_ENABLED_SOFTWARE\n"); + } + } + + return FI_SUCCESS; +} + +/* + * cxip_recv_replay() - Replay dropped Receive requests. + * + * When no LE is available while processing an Append command, the command is + * dropped and future appends are disabled. After all outstanding commands are + * dropped and resources are recovered, replayed all Receive requests in order. + * + * Caller must hold ep_obj->lock. + */ +static int cxip_recv_replay(struct cxip_rxc_hpc *rxc) +{ + struct cxip_req *req; + struct dlist_entry *tmp; + bool restart_seq = true; + int ret; + + dlist_foreach_container_safe(&rxc->replay_queue, struct cxip_req, req, + recv.rxc_entry, tmp) { + dlist_remove_init(&req->recv.rxc_entry); + + /* Since the RXC and PtlTE are in a controlled state and no new + * user receives are being posted, it is safe to ignore the RXC + * state when replaying failed user posted receives. + */ + ret = cxip_recv_req_queue(req, restart_seq); + + /* Match made in software? */ + if (ret == -FI_EALREADY) + continue; + + /* TODO: Low memory or full CQ during SW matching would cause + * -FI_EAGAIN to be seen here. + */ + assert(ret == FI_SUCCESS); + + restart_seq = false; + } + + return FI_SUCCESS; +} + +/* + * cxip_recv_resume() - Send a resume message to all peers who reported dropped + * Sends. + * + * Called by disabled EP after re-enable. + * + * After counting all dropped sends targeting a disabled RX queue and + * re-enabling the queue, notify all peers who experienced dropped Sends so + * they can be replayed. + * + * Caller must hold ep_obj->lock. + */ +int cxip_recv_resume(struct cxip_rxc_hpc *rxc) +{ + struct cxip_fc_drops *fc_drops; + struct dlist_entry *tmp; + int ret; + + dlist_foreach_container_safe(&rxc->fc_drops, + struct cxip_fc_drops, fc_drops, + rxc_entry, tmp) { + ret = cxip_ctrl_msg_send(&fc_drops->req); + if (ret) + return ret; + + dlist_remove(&fc_drops->rxc_entry); + } + + return FI_SUCCESS; +} + +/* + * cxip_fc_progress_ctrl() - Progress the control EP until all resume + * control messages can be queued. + * + * Caller must hold ep_obj->lock. 
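+ *
+ * Note: the drop count reset below is ASIC dependent; pre-Cassini 2.0
+ * parts restart at -1 while newer parts restart at 0.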
+ */ +static void cxip_fc_progress_ctrl(struct cxip_rxc_hpc *rxc) +{ + int ret __attribute__((unused)); + + assert(rxc->base.state == RXC_FLOW_CONTROL); + + /* Successful transition from disabled occurred, reset + * drop count. + */ + rxc->drop_count = rxc->base.ep_obj->asic_ver < CASSINI_2_0 ? -1 : 0; + + while ((ret = cxip_recv_resume(rxc)) == -FI_EAGAIN) + cxip_ep_tx_ctrl_progress_locked(rxc->base.ep_obj); + + assert(ret == FI_SUCCESS); +} + +/* + * cxip_post_ux_onload_sw() - Nic HW-to-SW EP post UX onload processing. + * + * PTE transitioned from enabled to software managed. Onloading + * was done and appends that failed need to be replayed. + */ +static void cxip_post_ux_onload_sw(struct cxip_rxc_hpc *rxc) +{ + int ret; + + assert(cxip_env.rx_match_mode == CXIP_PTLTE_HYBRID_MODE); + assert(rxc->prev_state == RXC_ENABLED); + assert(rxc->new_state == RXC_ENABLED_SOFTWARE); + + ret = cxip_ptelist_buf_replenish(rxc->req_list_bufpool, + true); + if (ret != FI_SUCCESS) + RXC_WARN(rxc, "Request list replenish failed %d %s\n", + ret, fi_strerror(-ret)); + + /* Priority list appends that failed during the transition can + * now be replayed. + */ + ret = cxip_recv_replay(rxc); + assert(ret == FI_SUCCESS || ret == -FI_EAGAIN); + + if (rxc->base.state == RXC_PENDING_PTLTE_SOFTWARE_MANAGED) { + /* Transition from enabled to software managed is complete. + * Allow posting of receive operations. + */ + RXC_WARN(rxc, "Now in RXC_ENABLED_SOFTWARE\n"); + rxc->base.state = RXC_ENABLED_SOFTWARE; + } +} + +/* + * cxip_post_ux_onload_fc() - Flow control onload complete processing. + * + * PTE transitioned to disabled and UX onload has completed. + */ +static void cxip_post_ux_onload_fc(struct cxip_rxc_hpc *rxc) +{ + int ret; + + /* Disable RX matching offload if transitioning to + * software enabled EP. + */ + if (rxc->new_state == RXC_ENABLED_SOFTWARE) { + RXC_DBG(rxc, "Transitioning to SW EP\n"); + rxc->base.msg_offload = 0; + } + + if (rxc->fc_reason == C_SC_FC_EQ_FULL) + goto replay; + + if (rxc->new_state == RXC_ENABLED_SOFTWARE) + ret = cxip_ptelist_buf_replenish(rxc->req_list_bufpool, + true); + else + ret = cxip_ptelist_buf_replenish(rxc->oflow_list_bufpool, + true); + if (ret != FI_SUCCESS) + RXC_WARN(rxc, "%s buffer replenish failed %d %s\n", + rxc->new_state == RXC_ENABLED_SOFTWARE ? + "Request" : "Overflow", ret, fi_strerror(-ret)); + +replay: + /* Any priority list appends that failed during the transition + * can now be replayed. + */ + if (rxc->new_state == RXC_ENABLED) + rxc->base.msg_offload = 1; + + ret = cxip_recv_replay(rxc); + RXC_DBG(rxc, "Replay of failed receives ret: %d %s\n", + ret, fi_strerror(-ret)); + assert(ret == FI_SUCCESS || ret == -FI_EAGAIN); + + if (rxc->base.state != RXC_ONLOAD_FLOW_CONTROL_REENABLE && + rxc->new_state != RXC_ENABLED_SOFTWARE) + RXC_FATAL(rxc, FC_SW_ONLOAD_MSG_FATAL); + + rxc->base.state = RXC_FLOW_CONTROL; + ret = cxip_recv_reenable(rxc); + assert(ret == FI_SUCCESS || ret == -FI_EAGAIN); + RXC_WARN(rxc, "Now in RXC_FLOW_CONTROL\n"); + + /* Disable to software managed transition is synchronous in order to + * handle drop count mismatches correctly. If successful the H/W + * transition completed, otherwise the transition will occur when + * additional drop notifies are received. + */ + if (rxc->new_state == RXC_ENABLED_SOFTWARE && ret == FI_SUCCESS) { + cxip_fc_progress_ctrl(rxc); + rxc->base.state = RXC_ENABLED_SOFTWARE; + RXC_WARN(rxc, "Now in RXC_ENABLED_SOFTWARE\n"); + } +} + +/* + * cxip_ux_onload_complete() - Unexpected list entry onload complete. 
+ *
+ * All unexpected message headers have been onloaded from hardware.
+ */
+static void cxip_ux_onload_complete(struct cxip_req *req)
+{
+	struct cxip_rxc_hpc *rxc = req->search.rxc;
+
+	assert(rxc->base.state == RXC_ONLOAD_FLOW_CONTROL_REENABLE ||
+	       rxc->base.state == RXC_PENDING_PTLTE_SOFTWARE_MANAGED);
+
+	free(rxc->ule_offsets);
+	rxc->ule_offsets = NULL;
+
+	/* During a transition to a software managed PtlTE, request list
+	 * entries received because hardware did not match the priority
+	 * list were added to a pending unexpected message list. We merge
+	 * the two unexpected message lists here.
+	 */
+	RXC_DBG(rxc, "Req pending %d UX entries, SW list %d UX entries\n",
+		rxc->sw_pending_ux_list_len, rxc->sw_ux_list_len);
+
+	dlist_splice_tail(&rxc->sw_ux_list, &rxc->sw_pending_ux_list);
+	rxc->sw_ux_list_len += rxc->sw_pending_ux_list_len;
+	rxc->sw_pending_ux_list_len = 0;
+
+	RXC_WARN(rxc, "Software UX list updated, %d SW UX entries\n",
+		 rxc->sw_ux_list_len);
+
+	if (rxc->base.state == RXC_PENDING_PTLTE_SOFTWARE_MANAGED)
+		cxip_post_ux_onload_sw(rxc);
+	else
+		cxip_post_ux_onload_fc(rxc);
+
+	ofi_atomic_dec32(&rxc->base.orx_reqs);
+	cxip_evtq_req_free(req);
+}
+
+/*
+ * cxip_get_ule_offsets() - Initialize an in-order array of ULE offsets.
+ *
+ * If snapshot is requested, no more than two passes at getting offsets
+ * will be made. This is intended to be used with FI_CLAIM processing,
+ * where the PtlTE is enabled.
+ */
+static int cxip_get_ule_offsets(struct cxip_rxc_hpc *rxc,
+				uint64_t **ule_offsets,
+				unsigned int *num_ule_offsets, bool snapshot)
+{
+	struct cxi_pte_status pte_status = {
+		.ule_count = 512
+	};
+	size_t cur_ule_count = 0;
+	int ret;
+	int calls = 0;
+
+	/* Get all the unexpected header remote offsets. */
+	*ule_offsets = NULL;
+	*num_ule_offsets = 0;
+
+	do {
+		cur_ule_count = pte_status.ule_count;
+		*ule_offsets = reallocarray(*ule_offsets, cur_ule_count,
+					    sizeof(*ule_offsets));
+		if (*ule_offsets == NULL) {
+			RXC_WARN(rxc, "Failed to allocate ULE offset memory\n");
+			ret = -FI_ENOMEM;
+			goto err;
+		}
+
+		pte_status.ule_offsets = (void *)*ule_offsets;
+		ret = cxil_pte_status(rxc->base.rx_pte->pte, &pte_status);
+		assert(!ret);
+	} while (cur_ule_count < pte_status.ule_count &&
+		 !(snapshot && ++calls > 1));
+
+	*num_ule_offsets = pte_status.ule_count;
+
+	return FI_SUCCESS;
+err:
+	free(*ule_offsets);
+
+	return ret;
+}
+
+/*
+ * cxip_ux_onload_cb() - Process SEARCH_AND_DELETE command events.
+ */
+static int cxip_ux_onload_cb(struct cxip_req *req, const union c_event *event)
+{
+	struct cxip_rxc_hpc *rxc = req->search.rxc;
+	struct cxip_deferred_event *def_ev;
+	struct cxip_ux_send *ux_send;
+	bool matched;
+
+	assert(rxc->base.state == RXC_ONLOAD_FLOW_CONTROL ||
+	       rxc->base.state == RXC_ONLOAD_FLOW_CONTROL_REENABLE ||
+	       rxc->base.state == RXC_PENDING_PTLTE_SOFTWARE_MANAGED);
+
+	switch (event->hdr.event_type) {
+	case C_EVENT_PUT_OVERFLOW:
+		assert(cxi_event_rc(event) == C_RC_OK);
+
+		ux_send = calloc(1, sizeof(*ux_send));
+		if (!ux_send) {
+			RXC_WARN(rxc, "Failed to allocate memory\n");
+			return -FI_EAGAIN;
+		}
+
+		/* Zero-byte unexpected onloads require special handling since
+		 * no deferred structure would be allocated.
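+		 * For onloads with data, the remote offset needed for a later
+		 * RGet is patched into the saved Put event from the
+		 * ule_offsets array read before SEARCH_AND_DELETE was issued.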
+ */ + if (event->tgt_long.rlength) { + + def_ev = match_put_event(rxc, req, event, &matched); + if (!matched) { + if (!def_ev) { + free(ux_send); + return -FI_EAGAIN; + } + + /* Gather Put events later */ + def_ev->ux_send = ux_send; + req->search.puts_pending++; + } else { + ux_send->req = def_ev->req; + ux_send->put_ev = def_ev->ev; + + free_put_event(rxc, def_ev); + } + } else { + ux_send->put_ev = *event; + } + + /* For flow control transition if a ULE is freed, then + * set state so that re-enable will be attempted. + */ + if (rxc->base.state == RXC_ONLOAD_FLOW_CONTROL) + rxc->base.state = RXC_ONLOAD_FLOW_CONTROL_REENABLE; + + /* Fixup event with the expected remote offset for an RGet. */ + if (event->tgt_long.rlength) { + ux_send->put_ev.tgt_long.remote_offset = + rxc->ule_offsets[rxc->cur_ule_offsets] + + event->tgt_long.mlength; + } + rxc->cur_ule_offsets++; + + dlist_insert_tail(&ux_send->rxc_entry, &rxc->sw_ux_list); + rxc->sw_ux_list_len++; + + RXC_DBG(rxc, "Onloaded Send: %p\n", ux_send); + + ofi_atomic_dec32(&rxc->orx_hw_ule_cnt); + + break; + case C_EVENT_SEARCH: + if (rxc->new_state == RXC_ENABLED_SOFTWARE && + rxc->base.state == RXC_ONLOAD_FLOW_CONTROL) + rxc->base.state = RXC_ONLOAD_FLOW_CONTROL_REENABLE; + + if (rxc->base.state == RXC_ONLOAD_FLOW_CONTROL) + RXC_FATAL(rxc, FC_SW_ONLOAD_MSG_FATAL); + + req->search.complete = true; + rxc->base.rx_evtq.ack_batch_size = + rxc->base.rx_evtq.cq->ack_batch_size; + + RXC_DBG(rxc, "UX Onload Search done\n"); + + if (cxip_ux_is_onload_complete(req)) + cxip_ux_onload_complete(req); + + break; + default: + RXC_FATAL(rxc, CXIP_UNEXPECTED_EVENT, + cxi_event_to_str(event), + cxi_rc_to_str(cxi_event_rc(event))); + } + + return FI_SUCCESS; +} + +/* + * cxip_ux_onload() - Issue SEARCH_AND_DELETE command to on-load unexpected + * Send headers queued on the RXC message queue. + * + * Caller must hold ep_obj->lock. + */ +static int cxip_ux_onload(struct cxip_rxc_hpc *rxc) +{ + struct cxip_req *req; + union c_cmdu cmd = {}; + int ret; + + assert(rxc->base.state == RXC_ONLOAD_FLOW_CONTROL || + rxc->base.state == RXC_ONLOAD_FLOW_CONTROL_REENABLE || + rxc->base.state == RXC_PENDING_PTLTE_SOFTWARE_MANAGED); + + RXC_DBG(rxc, "Initiate hardware UX list onload\n"); + + /* Get all the unexpected header remote offsets. 
*/ + rxc->ule_offsets = NULL; + rxc->num_ule_offsets = 0; + rxc->cur_ule_offsets = 0; + + ret = cxip_get_ule_offsets(rxc, &rxc->ule_offsets, + &rxc->num_ule_offsets, false); + if (ret) { + RXC_WARN(rxc, "Failed to read UX remote offsets: %d %s\n", + ret, fi_strerror(-ret)); + goto err; + } + + /* Populate request */ + req = cxip_evtq_req_alloc(&rxc->base.rx_evtq, 1, NULL); + if (!req) { + RXC_DBG(rxc, "Failed to allocate request\n"); + ret = -FI_EAGAIN; + goto err_free_onload_offset; + } + ofi_atomic_inc32(&rxc->base.orx_reqs); + + req->cb = cxip_ux_onload_cb; + req->type = CXIP_REQ_SEARCH; + req->search.rxc = rxc; + + cmd.command.opcode = C_CMD_TGT_SEARCH_AND_DELETE; + cmd.target.ptl_list = C_PTL_LIST_UNEXPECTED; + cmd.target.ptlte_index = rxc->base.rx_pte->pte->ptn; + cmd.target.buffer_id = req->req_id; + cmd.target.length = -1U; + cmd.target.ignore_bits = -1UL; + cmd.target.match_id = CXI_MATCH_ID_ANY; + + ret = cxi_cq_emit_target(rxc->base.rx_cmdq->dev_cmdq, &cmd); + if (ret) { + RXC_WARN(rxc, "Failed to write Search command: %d\n", ret); + ret = -FI_EAGAIN; + goto err_dec_free_cq_req; + } + + cxi_cq_ring(rxc->base.rx_cmdq->dev_cmdq); + + return FI_SUCCESS; + +err_dec_free_cq_req: + ofi_atomic_dec32(&rxc->base.orx_reqs); + cxip_evtq_req_free(req); +err_free_onload_offset: + free(rxc->ule_offsets); +err: + RXC_WARN(rxc, "Hardware UX list onload initiation error, ret: %d\n", + ret); + return ret; +} + +static int cxip_flush_appends_cb(struct cxip_req *req, + const union c_event *event) +{ + struct cxip_rxc_hpc *rxc = req->req_ctx; + int ret; + + assert(rxc->base.state == RXC_ONLOAD_FLOW_CONTROL || + rxc->base.state == RXC_ONLOAD_FLOW_CONTROL_REENABLE || + rxc->base.state == RXC_PENDING_PTLTE_SOFTWARE_MANAGED); + + assert(event->hdr.event_type == C_EVENT_SEARCH); + assert(cxi_event_rc(event) == C_RC_NO_MATCH); + + ret = cxip_ux_onload(rxc); + if (ret == FI_SUCCESS) { + ofi_atomic_dec32(&rxc->base.orx_reqs); + cxip_evtq_req_free(req); + } + + return ret; +} + +/* + * cxip_recv_pte_cb() - Process receive PTE state change events. + */ +void cxip_recv_pte_cb(struct cxip_pte *pte, const union c_event *event) +{ + struct cxip_rxc_hpc *rxc = (struct cxip_rxc_hpc *)pte->ctx; + int fc_reason = cxip_fc_reason(event); + int ret; + + switch (pte->state) { + case C_PTLTE_ENABLED: + assert(rxc->base.state == RXC_FLOW_CONTROL || + rxc->base.state == RXC_DISABLED || + rxc->base.state == RXC_PENDING_PTLTE_HARDWARE); + + /* Queue any flow control resume messages */ + if (rxc->base.state == RXC_FLOW_CONTROL) { + cxip_fc_progress_ctrl(rxc); + RXC_WARN(rxc, "Now in RXC_ENABLED\n"); + } + + rxc->base.state = RXC_ENABLED; + break; + + case C_PTLTE_DISABLED: + if (rxc->base.state == RXC_DISABLED) + break; + + if (fc_reason == C_SC_DIS_UNCOR) + RXC_FATAL(rxc, "Disabled, LE uncorrectable err\n"); + + /* An incorrect drop count was used during PTE enable. + * Another attempt will be made when a peer sends a side-band + * drop message. + */ + if (cxi_event_rc(event) == C_RC_NO_MATCH) { + assert(rxc->base.state == RXC_FLOW_CONTROL || + rxc->base.state == RXC_ONLOAD_FLOW_CONTROL || + rxc->base.state == + RXC_ONLOAD_FLOW_CONTROL_REENABLE || + rxc->base.state == + RXC_PENDING_PTLTE_SOFTWARE_MANAGED); + RXC_WARN(rxc, FC_DROP_COUNT_MSG); + break; + } + + /* Flow control occurred while transitioning from HW to SW + * managed PTE. Since onloading of all UX entries will have + * been initiated (i.e. 
no new ones will be added) and the + * PTE state change from RXC_PENDING_PTLTE_SOFTWARE_MANAGED + * to RXC_ENABLED_SOFTWARE following onload complete is + * protected by the ep_obj->lock, it is safe to indicate that + * SW managed EP must be re-enabled on onload complete. + * The request list will have been replenished. + */ + if (rxc->base.state == RXC_PENDING_PTLTE_SOFTWARE_MANAGED) { + RXC_WARN(rxc, + "Flow control during HW to SW transition\n"); + rxc->base.state = RXC_ONLOAD_FLOW_CONTROL_REENABLE; + break; + } + + /* Check for flow control during flow control */ + if (rxc->base.state != RXC_ENABLED && + rxc->base.state != RXC_ENABLED_SOFTWARE && + rxc->base.state != RXC_PENDING_PTLTE_DISABLE) { + + /* There is race between SW disable on priority list + * and HW initiated LE flow control which can be + * ignored; otherwise it is a fatal error. + */ + if (fc_reason == CXIP_FC_SOFTWARE_INITIATED) + break; + RXC_FATAL(rxc, FC_SW_LE_MSG_FATAL); + } + + /* Starting flow control processing. The default is for + * flow control should re-enable in the previous + * hardware/software managed state. + */ + rxc->prev_state = rxc->base.state; + rxc->new_state = rxc->base.state; + rxc->base.state = RXC_ONLOAD_FLOW_CONTROL; + + RXC_DBG(rxc, "Flow control detected, H/W: %d reason: %d\n", + event->tgt_long.initiator.state_change.sc_nic_auto, + fc_reason); + + switch (fc_reason) { + case CXIP_FC_SOFTWARE_INITIATED: + /* Software initiated state change, drop count + * needs to start at zero instead of -1. Add 1 to + * account for this. Note this is only initiated + * from an hardware enabled PTE state. + */ + RXC_WARN(rxc, "SW initiated flow control\n"); + if (rxc->base.ep_obj->asic_ver < CASSINI_2_0) + rxc->drop_count++; + + /* If running in hybrid mode, resume operation as a + * software managed EP to reduce LE resource load. + */ + if (cxip_env.rx_match_mode == CXIP_PTLTE_HYBRID_MODE) + rxc->new_state = RXC_ENABLED_SOFTWARE; + + rxc->num_fc_append_fail++; + break; + + case C_SC_FC_EQ_FULL: + /* EQ full does not require LE resources be recovered + * to re-enable. + */ + RXC_WARN(rxc, "Flow control EQ full\n"); + rxc->base.state = RXC_ONLOAD_FLOW_CONTROL_REENABLE; + rxc->num_fc_eq_full++; + break; + + case C_SC_FC_NO_MATCH: + /* Overflow list buffers were full/could not be matched + * against. Must replenish buffers, but does not in + * itself require resources be recovered. + */ + RXC_WARN(rxc, FC_OFLOW_NO_MATCH_MSG, + cxip_env.oflow_buf_size); + + rxc->base.state = RXC_ONLOAD_FLOW_CONTROL_REENABLE; + rxc->num_fc_no_match++; + break; + + case C_SC_FC_UNEXPECTED_FAIL: + /* Hybrid mode is not enabled and overflow matches, but + * LE resources prevent unexpected message allocation. + */ + RXC_WARN(rxc, "Flow control UX LE resources\n"); + rxc->num_fc_unexp++; + break; + + case C_SC_FC_REQUEST_FULL: + /* Running as software managed EP and request list + * buffers were full/could not be matched against. + * Must replenish buffers, but does not require that + * LE resources are recovered. 
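+		 * As with the overflow no-match case above, the state moves
+		 * straight to RXC_ONLOAD_FLOW_CONTROL_REENABLE.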
+ */ + RXC_WARN(rxc, FC_REQ_FULL_MSG, cxip_env.req_buf_size); + rxc->base.state = RXC_ONLOAD_FLOW_CONTROL_REENABLE; + rxc->num_fc_req_full++; + break; + + case C_SC_SM_APPEND_FAIL: + case C_SC_SM_UNEXPECTED_FAIL: + default: + RXC_FATAL(rxc, "Invalid disable PTE c_sc_reason: %d\n", + fc_reason); + } + rxc->fc_reason = fc_reason; + + do { + ret = cxip_flush_appends(rxc, cxip_flush_appends_cb); + } while (ret == -FI_EAGAIN); + + if (ret != FI_SUCCESS) + RXC_FATAL(rxc, "cxip_flush_appends failed: %d\n", ret); + + break; + + case C_PTLTE_SOFTWARE_MANAGED: + /* There is an inherent race between hardware and software + * in setting the PtlTE state. If software requested to + * disable the PtlTE after hardware started a HW to SW + * transition; just wait for the disable event. + */ + if (rxc->base.state == RXC_PENDING_PTLTE_DISABLE) + break; + + RXC_DBG(rxc, "SW Managed: nic auto: %d, reason: %d\n", + event->tgt_long.initiator.state_change.sc_nic_auto, + event->tgt_long.initiator.state_change.sc_nic_auto ? + event->tgt_long.initiator.state_change.sc_reason : -1); + + /* We should not get a bad drop count status since the + * transition is synchronous but we will need this in + * the future. + */ + if (cxi_event_rc(event) == C_RC_NO_MATCH) { + RXC_WARN(rxc, "Bad drop count, ignored\n"); + break; + } + + /* Sanity check */ + if (rxc->base.state == RXC_FLOW_CONTROL) + RXC_FATAL(rxc, "FC to SW EP should be synchronous\n"); + + assert(rxc->base.state == RXC_DISABLED || + rxc->base.state == RXC_ENABLED || + rxc->base.state == RXC_PENDING_PTLTE_SOFTWARE_MANAGED); + + /* Hardware should only generate PTE software managed events + * in two cases: + * 1. Initial start in software mode: disabled->software. + * 2. NIC initiated software transition: enabled->software. + */ + switch (fc_reason) { + case CXIP_FC_SOFTWARE_INITIATED: + /* If messaging was initially offloaded then this + * state transition can only happen if the RXC has + * been disabled; it is safe to ignore this change. + */ + assert(rxc->base.state == RXC_DISABLED); + if (!cxip_env.msg_offload) { + RXC_WARN(rxc, "Software managed EP enabled\n"); + rxc->base.state = RXC_ENABLED_SOFTWARE; + } + break; + + case C_SC_SM_APPEND_FAIL: + case C_SC_SM_UNEXPECTED_FAIL: + /* The NIC initiated the transition; priority list + * appends that are in flight will fail and be added + * to the receive replay list. Update state so that + * no additional appends will be attempted until + * onload completes and the failed appends are + * replayed. + */ + RXC_WARN(rxc, + "NIC transition to SW EP, c_sc_reason: %d\n", + fc_reason); + rxc->fc_reason = fc_reason; + rxc->prev_state = rxc->base.state; + rxc->new_state = RXC_ENABLED_SOFTWARE; + + if (rxc->fc_reason == C_SC_SM_UNEXPECTED_FAIL) + rxc->num_sc_nic_hw2sw_unexp++; + else if (rxc->fc_reason == C_SC_SM_APPEND_FAIL) + rxc->num_sc_nic_hw2sw_append_fail++; + + rxc->base.msg_offload = 0; + rxc->base.state = RXC_PENDING_PTLTE_SOFTWARE_MANAGED; + do { + /* Flush and kick-off onloading of UX list */ + ret = cxip_flush_appends(rxc, + cxip_flush_appends_cb); + } while (ret == -FI_EAGAIN); + if (ret != FI_SUCCESS) + RXC_WARN(rxc, "Flush/UX onload err: %d\n", ret); + break; + default: + RXC_FATAL(rxc, "Invalid PTE c_sc_reason: %d\n", + fc_reason); + } + + break; + default: + RXC_FATAL(rxc, "Unexpected state received: %u\n", pte->state); + } +} + +/* + * cxip_claim_onload_cb() - Process SEARCH and DELETE of claimed UX message. 
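+ *
+ * FI_PEEK|FI_CLAIM flow, for orientation (each step is defined later in
+ * this file):
+ *   1. cxip_ux_peek()/cxip_ux_peek_cb() match a send on the UX list.
+ *   2. cxip_initiate_hw_claim() snapshots ULE offsets and starts a SEARCH.
+ *   3. cxip_hw_claim_offset_cb() locates the matching entry's remote offset.
+ *   4. cxip_claim_ux_onload() issues SEARCH_AND_DELETE for that entry.
+ *   5. This callback onloads the claimed entry onto the software UX list.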
+ */ +static int cxip_claim_onload_cb(struct cxip_req *req, + const union c_event *evt) +{ + struct cxip_rxc_hpc *rxc = req->req_ctx; + struct cxip_deferred_event *def_ev; + struct cxip_ux_send *ux_send; + bool matched = false; + + if (evt->hdr.event_type != C_EVENT_PUT_OVERFLOW) + RXC_FATAL(rxc, CXIP_UNEXPECTED_EVENT, + cxi_event_to_str(evt), + cxi_rc_to_str(cxi_event_rc(evt))); + + /* Failed to onload UX message, return ENOMSG */ + if (cxi_event_rc(evt) != C_RC_OK) { + RXC_WARN(rxc, "FI_CLAIM HW onload failed: %d\n", + cxi_event_rc(evt)); + cxip_recv_req_peek_complete(req, NULL); + + return FI_SUCCESS; + } + + ofi_atomic_dec32(&rxc->orx_hw_ule_cnt); + + /* FI_CLAIM UX message onloaded from hardware */ + ux_send = calloc(1, sizeof(*ux_send)); + if (!ux_send) { + RXC_WARN(rxc, "Failed allocate UX memory\n"); + return -FI_EAGAIN; + } + ux_send->claimed = true; + + /* Zero-byte unexpected onloads require special handling + * since no deferred structure would be allocated. + */ + if (evt->tgt_long.rlength) { + def_ev = match_put_event(rxc, req, evt, &matched); + if (!matched) { + /* The EVENT_PUT to the overflow list has not been + * processed. The FI_CLAIM operation will be completed + * when the matching put is received. + */ + if (!def_ev) { + free(ux_send); + return -FI_EAGAIN; + } + def_ev->ux_send = ux_send; + } else { + ux_send->req = def_ev->req; + ux_send->put_ev = def_ev->ev; + free_put_event(rxc, def_ev); + } + + /* Fixup event remote offset for an RGet. */ + if (evt->tgt_long.rlength) + ux_send->put_ev.tgt_long.remote_offset = + req->recv.ule_offset + evt->tgt_long.mlength; + + } else { + matched = true; + ux_send->put_ev = *evt; + } + + /* Add to the sw UX list as a claimed entry, it will be ignored in + * recieve matching of UX list entries. Its order no longer matters. + */ + dlist_insert_tail(&ux_send->rxc_entry, &rxc->sw_ux_list); + rxc->sw_ux_list_len++; + + RXC_DBG(rxc, "FI_CLAIM Onload req: %p ux_send %p\n", req, ux_send); + cxip_rxc_hpc_recv_req_tgt_event(req, &ux_send->put_ev); + + /* Put was already received, return FI_CLAIM completion */ + if (matched) { + cxip_recv_req_peek_complete(req, ux_send); + RXC_DBG(rxc, "FI_CLAIM onload complete, req %p, ux_send %p\n", + req, ux_send); + } + + ofi_atomic_dec32(&rxc->orx_hw_ule_cnt); + + return FI_SUCCESS; +} + +/* + * cxip_claim_ux_onload() - Initiate SEARCH and DELETE of FI_CLAIM ux entry. + */ +static int cxip_claim_ux_onload(struct cxip_req *req) +{ + struct cxip_rxc_hpc *rxc = req->req_ctx; + int ret = FI_SUCCESS; + union c_cmdu cmd = {}; + union cxip_match_bits mb = {}; + union cxip_match_bits ib = {}; + + if (rxc->base.state != RXC_ENABLED) { + RXC_DBG(rxc, "FC inprogress, fail claim req %p\n", req); + goto err; + } + + /* Initiate a search to get the remote offset for the + * unexpected list entry we matched. 
+ */ + req->cb = cxip_claim_onload_cb; + mb.tag = req->recv.tag; + mb.tagged = 1; + ib.tx_id = ~0; + ib.cq_data = ~0; + ib.match_comp = ~0; + ib.rdzv_done = ~0; + ib.le_type = ~0; + ib.tag = req->recv.ignore; + + cmd.command.opcode = C_CMD_TGT_SEARCH_AND_DELETE; + + cmd.target.ptl_list = C_PTL_LIST_UNEXPECTED; + cmd.target.ptlte_index = rxc->base.rx_pte->pte->ptn; + cmd.target.buffer_id = req->req_id; + cmd.target.length = -1U; + cmd.target.ignore_bits = ib.raw; + cmd.target.match_bits = mb.raw; + cmd.target.match_id = req->recv.match_id; + /* Delete first match */ + cmd.target.use_once = 1; + + ret = cxi_cq_emit_target(rxc->base.rx_cmdq->dev_cmdq, &cmd); + if (ret) { + /* This condition should clear */ + RXC_WARN(rxc, + "Cannot emit of UX delete cmd, return -FI_EAGAIN\n"); + return -FI_EAGAIN; + } + + cxi_cq_ring(rxc->base.rx_cmdq->dev_cmdq); + + /* Hardware handles the race between subsequent priority list + * appends to the search and delete command. Re-enable. + */ + rxc->hw_claim_in_progress = false; + RXC_DBG(rxc, "FI_CLAIM Search and Delete of UX entry initiated\n"); + + return FI_SUCCESS; + +err: + /* Unable to initiate FI_CLAIM, report as ENOMSG */ + rxc->hw_claim_in_progress = false; + cxip_recv_req_peek_complete(req, NULL); + + return FI_SUCCESS; +} + +/* + * cxip_hw_claim_offset_cb() - Process SEARCH command events to get remote + * offset of entry to be deleted. + */ +static int cxip_hw_claim_offset_cb(struct cxip_req *req, + const union c_event *evt) +{ + struct cxip_rxc_hpc *rxc = req->recv.rxc_hpc; + union cxip_match_bits ux_mb; + uint32_t ux_init; + int ret; + + switch (evt->hdr.event_type) { + case C_EVENT_SEARCH: + if (cxi_event_rc(evt) == C_RC_OK) { + RXC_DBG(rxc, "Claim UX offset search entry, req: %p\n", + req); + + if (req->recv.offset_found) + break; + + req->recv.cur_ule_offsets++; + + /* Not found in range of the offsets we have */ + if (req->recv.cur_ule_offsets > + req->recv.num_ule_offsets) { + RXC_DBG(rxc, "Claim UX offsets exceeded\n"); + break; + } + + /* Check for a match against the FI_PEEK */ + ux_mb.raw = evt->tgt_long.match_bits; + ux_init = evt->tgt_long.initiator.initiator.process; + + if (req->recv.tagged != ux_mb.tagged) + break; + if (ux_mb.tagged + && !tag_match(ux_mb.tag, req->recv.tag, + req->recv.ignore)) + break; + if (!init_match(&rxc->base, ux_init, + req->recv.match_id)) + break; + + /* Matched, update to ignore any future events */ + req->recv.offset_found = true; + req->recv.ule_offset = + req->recv.ule_offsets[req->recv.cur_ule_offsets - 1]; + + RXC_DBG(rxc, "Found offset for claim %p, %d : 0x%lX\n", + req, req->recv.cur_ule_offsets - 1, + req->recv.ule_offset); + break; + } + + assert(cxi_event_rc(evt) == C_RC_NO_MATCH); + + RXC_DBG(rxc, "FI_CLAIM remote offset search done, status %d\n", + cxi_event_rc(evt)); + + if (!req->recv.offset_found) { + RXC_DBG(rxc, "Req %p, FI_CLAIM UX not found\n", req); + goto err_not_found; + } + + ret = cxip_claim_ux_onload(req); + if (ret) { + /* Unable to initiate SEARCH and DELETE, this + * should clear. All other errors return ENOMSG. 
+ */ + if (ret == -FI_EAGAIN) + return ret; + + RXC_WARN(rxc, "claim_ux_onload failed %d\n", ret); + goto err_not_found; + } + + RXC_DBG(rxc, "FI_CLAIM req %p remote offset 0x%lX\n", + req, req->recv.ule_offset); + break; + default: + RXC_FATAL(rxc, CXIP_UNEXPECTED_EVENT, + cxi_event_to_str(evt), + cxi_rc_to_str(cxi_event_rc(evt))); + } + + return FI_SUCCESS; + +err_not_found: + /* Terminate FI_PEEK with FI_CLAIM with ENOMSG */ + rxc->hw_claim_in_progress = false; + free(req->recv.ule_offsets); + req->recv.ule_offsets = NULL; + cxip_recv_req_peek_complete(req, NULL); + + return FI_SUCCESS; +} + +/* + * cxip_initiate_hw_claim() - Onload the specified peek, claiming it. + */ +static int cxip_initiate_hw_claim(struct cxip_req *req) +{ + struct cxip_rxc_hpc *rxc = req->req_ctx; + union c_cmdu cmd = {}; + int ret = FI_SUCCESS; + + if (rxc->base.state != RXC_ENABLED) { + RXC_DBG(rxc, "FC inprogress, unable to claim req %p\n", req); + goto err; + } + + /* UX entry exists in hardware, the initial search acts as a flush of + * the event queue for priority list appends. Get remote offset for + * the associated unexpected list entry. + */ + req->recv.cur_ule_offsets = 0; + ret = cxip_get_ule_offsets(rxc, &req->recv.ule_offsets, + &req->recv.num_ule_offsets, true); + if (ret) { + RXC_WARN(rxc, "Unable to get FI_CLAIM UX offsets\n"); + goto err; + } + + RXC_DBG(rxc, "ule_offsets %p, num offsets %d\n", + req->recv.ule_offsets, req->recv.num_ule_offsets); + + /* Initiate a search to get the remote offset for the + * unexpected list entry we matched. This requires going + * through the list. + */ + req->cb = cxip_hw_claim_offset_cb; + + cmd.command.opcode = C_CMD_TGT_SEARCH; + cmd.target.ptl_list = C_PTL_LIST_UNEXPECTED; + cmd.target.ptlte_index = rxc->base.rx_pte->pte->ptn; + cmd.target.buffer_id = req->req_id; + cmd.target.length = -1U; + cmd.target.ignore_bits = -1UL; + cmd.target.match_id = CXI_MATCH_ID_ANY; + + ret = cxi_cq_emit_target(rxc->base.rx_cmdq->dev_cmdq, &cmd); + if (ret) { + RXC_WARN(rxc, "Failed to write Search command: %d\n", ret); + goto err_free_offsets; + } + + cxi_cq_ring(rxc->base.rx_cmdq->dev_cmdq); + + RXC_DBG(rxc, "Search for remote offsets initiated, req %p\n", req); + + return FI_SUCCESS; + +err_free_offsets: + free(req->recv.ule_offsets); + req->recv.ule_offsets = NULL; +err: + /* Unable to initiate FI_CLAIM, report as ENOMSG */ + rxc->hw_claim_in_progress = false; + cxip_recv_req_peek_complete(req, NULL); + + return FI_SUCCESS; +} + +/* + * cxip_ux_peek_cb() - Process UX list SEARCH command events. 
+ */ +static int cxip_ux_peek_cb(struct cxip_req *req, const union c_event *event) +{ + struct cxip_rxc_hpc *rxc = req->req_ctx; + + assert(req->recv.flags & FI_PEEK); + + switch (event->hdr.event_type) { + case C_EVENT_SEARCH: + /* Will receive event for only first match or failure */ + if (cxi_event_rc(event) == C_RC_OK) { + RXC_DBG(rxc, "Peek UX search req: %p matched\n", req); + if (req->recv.flags & FI_CLAIM) { + RXC_DBG(rxc, "req: %p UX must be claimed\n", + req); + return cxip_initiate_hw_claim(req); + } + + /* FI_PEEK only was found */ + cxip_rxc_hpc_recv_req_tgt_event(req, event); + } else { + RXC_DBG(rxc, "Peek UX search req: %p no match\n", req); + } + + cxip_recv_req_peek_complete(req, NULL); + break; + + default: + RXC_FATAL(rxc, CXIP_UNEXPECTED_EVENT, + cxi_event_to_str(event), + cxi_rc_to_str(cxi_event_rc(event))); + } + + return FI_SUCCESS; +} + +/* + * cxip_ux_peek() - Issue a SEARCH command to peek for a matching send + * on the RXC offloaded unexpected message list. + * + * Caller must hold ep_obj->lock. + */ +static int cxip_ux_peek(struct cxip_req *req) +{ + struct cxip_rxc_hpc *rxc = req->req_ctx; + union c_cmdu cmd = {}; + union cxip_match_bits mb = {}; + union cxip_match_bits ib = {}; + int ret; + + assert(rxc->base.protocol == FI_PROTO_CXI); + assert(req->recv.flags & FI_PEEK); + + req->cb = cxip_ux_peek_cb; + + mb.tag = req->recv.tag; + mb.tagged = 1; + ib.tx_id = ~0; + ib.cq_data = ~0; + ib.match_comp = ~0; + ib.rdzv_done = ~0; + ib.le_type = ~0; + ib.tag = req->recv.ignore; + + cmd.command.opcode = C_CMD_TGT_SEARCH; + cmd.target.ptl_list = C_PTL_LIST_UNEXPECTED; + cmd.target.ptlte_index = rxc->base.rx_pte->pte->ptn; + cmd.target.buffer_id = req->req_id; + cmd.target.length = -1U; + cmd.target.ignore_bits = ib.raw; + cmd.target.match_bits = mb.raw; + cmd.target.match_id = req->recv.match_id; + /* First match only */ + cmd.target.use_once = 1; + + if (cxip_evtq_saturated(&rxc->base.rx_evtq)) { + RXC_DBG(rxc, "Target HW EQ saturated\n"); + return -FI_EAGAIN; + } + + RXC_DBG(rxc, "Peek UX search req: %p mb.raw: 0x%" PRIx64 " match_id: 0x%x ignore: 0x%" PRIx64 "\n", + req, mb.raw, req->recv.match_id, req->recv.ignore); + + ret = cxi_cq_emit_target(rxc->base.rx_cmdq->dev_cmdq, &cmd); + if (ret) { + RXC_WARN(rxc, "Failed to write Search command: %d\n", ret); + return -FI_EAGAIN; + } + + cxi_cq_ring(rxc->base.rx_cmdq->dev_cmdq); + + /* If FI_CLAIM, we disable priority list appends so the + * search acts as a flush of outstanding appends. + */ + if (req->flags & FI_CLAIM) + rxc->hw_claim_in_progress = true; + + return FI_SUCCESS; +} + +/* cxip_set_ux_dump_entry() - initialize a CQ entry structure + * and/or source address with UX message info. + */ +static void cxip_set_ux_dump_entry(struct cxip_req *req, + const union c_event *evt) +{ + struct cxip_ux_dump_state *ux_dump = req->recv.ux_dump; + union cxip_match_bits mb; + struct fi_cq_tagged_entry *cq_entry = NULL; + fi_addr_t *src_addr = NULL; + + ux_dump->ux_count++; + + /* If exceeding caller provided space updating the total + * available UX message count is all that is required. 
+ */ + if (ux_dump->ret_count >= ux_dump->max_count) + return; + + if (ux_dump->entry) + cq_entry = &ux_dump->entry[ux_dump->ret_count]; + if (ux_dump->src_addr) + src_addr = &ux_dump->src_addr[ux_dump->ret_count]; + + if (cq_entry || src_addr) { + ux_dump->ret_count++; + + req->recv.tgt_event = false; + req->flags = 0; + cxip_rxc_hpc_recv_req_tgt_event(req, evt); + + if (cq_entry) { + /* Need to add FI_TAGGED or FI_MSG directly */ + mb.raw = evt->tgt_long.match_bits; + if (mb.tagged) + req->flags |= FI_TAGGED; + else + req->flags |= FI_MSG; + cq_entry->op_context = NULL; + cq_entry->flags = req->flags; + cq_entry->len = req->recv.rlen; + cq_entry->buf = NULL; + cq_entry->data = req->data; + cq_entry->tag = req->tag; + } + + if (src_addr && req->recv.rxc->attr.caps & FI_SOURCE) + *src_addr = cxip_recv_req_src_addr(req); + } +} + +/* + * cxip_unexp_msg_dump_cb() - Process search command dumping H/W UX entries. + */ +static int cxip_unexp_msg_dump_cb(struct cxip_req *req, + const union c_event *evt) +{ + struct cxip_rxc_hpc *rxc = req->recv.rxc_hpc; + + if (evt->hdr.event_type != C_EVENT_SEARCH) + RXC_FATAL(rxc, CXIP_UNEXPECTED_EVENT, + cxi_event_to_str(evt), + cxi_rc_to_str(cxi_event_rc(evt))); + + if (cxi_event_rc(evt) == C_RC_NO_MATCH) { + req->recv.ux_dump->done = true; + return FI_SUCCESS; + } + assert(cxi_event_rc(evt) == C_RC_OK); + + cxip_set_ux_dump_entry(req, evt); + + return FI_SUCCESS; +} + +/* + * cxip_build_debug_ux_entry_info() - Initialize UX info array from ULE. + * + * It is expected that a debugger is utilizing this interface and is + * expecting synchronous behavior. + * + * Caller should hold ep_obj->lock. + */ +int cxip_build_ux_entry_info(struct cxip_ep *ep, + struct fi_cq_tagged_entry *entry, size_t count, + fi_addr_t *src_addr, size_t *ux_count) +{ + struct cxip_rxc_hpc *rxc = container_of(ep->ep_obj->rxc, + struct cxip_rxc_hpc, base); + struct cxip_ux_dump_state *ux_dump; + struct cxip_ux_send *ux_send; + struct dlist_entry *tmp; + struct cxip_req *req = NULL; + union c_cmdu cmd = {}; + int ret_count; + int ret; + + ret = cxip_recv_req_alloc(&rxc->base, NULL, 0, NULL, + &req, cxip_recv_cb); + if (ret) + return ret; + + ux_dump = calloc(1, sizeof(struct cxip_ux_dump_state)); + if (!ux_dump) { + RXC_WARN(rxc, "ENOMEM on allocate of UX state buffer\n"); + ret_count = -FI_ENOMEM; + goto done; + } + + ux_dump->max_count = count; + ux_dump->entry = entry; + ux_dump->src_addr = src_addr; + req->recv.ux_dump = ux_dump; + + /* Get entries from software UX list first */ + dlist_foreach_container_safe(&rxc->sw_ux_list, struct cxip_ux_send, + ux_send, rxc_entry, tmp) + cxip_set_ux_dump_entry(req, &ux_send->put_ev); + + if (!rxc->base.msg_offload) + goto done; + + /* Read H/W UX list processing the request events synchronously + * until we set "Done" in the request callback. 
+ */ + req->cb = cxip_unexp_msg_dump_cb; + cmd.command.opcode = C_CMD_TGT_SEARCH; + cmd.target.ptl_list = C_PTL_LIST_UNEXPECTED; + cmd.target.ptlte_index = rxc->base.rx_pte->pte->ptn; + cmd.target.buffer_id = req->req_id; + cmd.target.length = -1U; + cmd.target.ignore_bits = -1UL; + cmd.target.match_id = CXI_MATCH_ID_ANY; + + ret = cxi_cq_emit_target(rxc->base.rx_cmdq->dev_cmdq, &cmd); + if (ret) { + RXC_WARN(rxc, "Failed to write ULE Search command: %d\n", ret); + ret_count = ret; + goto done; + } + cxi_cq_ring(rxc->base.rx_cmdq->dev_cmdq); + + RXC_DBG(rxc, "Search for ULE dump initiated, req %p\n", req); + do { + cxip_evtq_progress(&rxc->base.rx_evtq); + sched_yield(); + } while (!ux_dump->done); + + RXC_DBG(rxc, "Search ULE dump done, req %p, count %ld\n", + req, ux_dump->ret_count); +done: + ret_count = ux_dump->ret_count; + *ux_count = ux_dump->ux_count; + + free(ux_dump); + cxip_recv_req_free(req); + + return ret_count; +} + +/* + * cxip_recv_sw_matched() - Progress the SW Receive match. + * + * Progress the operation which matched in SW. + */ +static int cxip_recv_sw_matched(struct cxip_req *req, + struct cxip_ux_send *ux_send) +{ + int ret; + uint64_t mrecv_start; + uint32_t mrecv_len; + bool req_done = true; + uint32_t ev_init; + uint32_t ev_rdzv_id; + struct cxip_req *rdzv_req; + struct cxip_rxc_hpc *rxc = req->recv.rxc_hpc; + + assert(req->type == CXIP_REQ_RECV); + + mrecv_start = req->recv.start_offset; + mrecv_len = mrecv_req_put_bytes(req, ux_send->put_ev.tgt_long.rlength); + + if (req->recv.multi_recv && + (req->recv.ulen - req->recv.start_offset) >= + req->recv.rxc->min_multi_recv) + req_done = false; + + if (ux_send->put_ev.tgt_long.rendezvous) { + + /* Make sure we can issue the RGet; if not we stall + * and TX event queue progress will free up credits. + */ + if (ofi_atomic_inc32(&rxc->orx_tx_reqs) > rxc->base.max_tx) { + ofi_atomic_dec32(&rxc->orx_tx_reqs); + return -FI_EAGAIN; + } + + ret = cxip_ux_send(req, ux_send->req, &ux_send->put_ev, + mrecv_start, mrecv_len, req_done); + if (ret != FI_SUCCESS) { + req->recv.start_offset -= mrecv_len; + ofi_atomic_dec32(&rxc->orx_tx_reqs); + + return ret; + } + + /* If multi-recv, a child request was created from + * cxip_ux_send(). Need to look up this request. + * + * NOTE: Since the same event will be used, the event checks + * must not be performed. The event checks are only needed + * when hardware is generating put and put overflow events for + * an mrecv buffer. If we have reached here, we know a put + * overflow event will never occur since the mrecv buffer has + * not been offloaded to hardware. + */ + if (req->recv.multi_recv) { + ret = rdzv_mrecv_req_lookup(req, &ux_send->put_ev, + &ev_init, &ev_rdzv_id, + false, &rdzv_req); + + /* If the previous cxip_ux_send() returns FI_SUCCESS, + * a matching rdzv mrecv req will always exist. + */ + assert(ret == FI_SUCCESS); + } else { + rdzv_req = req; + } + + /* Rendezvous event will not happen. So ack rendezvous event + * now. + */ + rdzv_recv_req_event(rdzv_req, ux_send->put_ev.hdr.event_type); + + cxip_recv_req_set_rget_info(rdzv_req); + + + /* A TX credit has been reserved and user receive request may + * have been removed from the ordered SW queue. If the command + * queue is backed up the condition will clear and the rget + * must get sent out, so wait for it.
+ */ + do { + ret = issue_rdzv_get(rdzv_req); + } while (ret == -FI_EAGAIN); + assert(ret == FI_SUCCESS); + } else { + if (ux_send->put_ev.tgt_long.rlength) + ret = cxip_ux_send(req, ux_send->req, &ux_send->put_ev, + mrecv_start, mrecv_len, req_done); + else + ret = cxip_ux_send_zb(req, &ux_send->put_ev, + mrecv_start, req_done); + + if (ret != FI_SUCCESS) { + /* undo mrecv_req_put_bytes() */ + req->recv.start_offset -= mrecv_len; + return ret; + } + } + + /* If this is a multi-receive request and there is still space, return + * a special code to indicate SW should keep matching messages to it. + */ + if (ret == FI_SUCCESS && !req_done) + return -FI_EINPROGRESS; + + return ret; +} + +static bool cxip_match_recv_sw(struct cxip_rxc_hpc *rxc, struct cxip_req *req, + struct cxip_ux_send *ux, bool claimed) +{ + union cxip_match_bits ux_mb; + uint32_t ux_init; + + if (claimed != ux->claimed) + return false; + + ux_mb.raw = ux->put_ev.tgt_long.match_bits; + ux_init = ux->put_ev.tgt_long.initiator.initiator.process; + + if (req->recv.tagged != ux_mb.tagged) + return false; + + if (ux_mb.tagged && + !tag_match(ux_mb.tag, req->recv.tag, req->recv.ignore)) + return false; + + if (!init_match(&rxc->base, ux_init, req->recv.match_id)) + return false; + + return true; +} + +static int cxip_recv_sw_matcher(struct cxip_rxc_hpc *rxc, struct cxip_req *req, + struct cxip_ux_send *ux, bool claimed) +{ + int ret; + + if (!cxip_match_recv_sw(rxc, req, ux, claimed)) + return -FI_ENOMSG; + + ret = cxip_recv_sw_matched(req, ux); + if (ret == -FI_EAGAIN) + return -FI_EAGAIN; + + /* FI_EINPROGRESS is returned for a multi-recv match. */ + assert(ret == FI_SUCCESS || ret == -FI_EINPROGRESS); + + /* TODO: Manage freeing of UX entries better. */ + dlist_remove(&ux->rxc_entry); + if (ux->req && ux->req->type == CXIP_REQ_RBUF) { + cxip_req_buf_ux_free(ux); + rxc->sw_ux_list_len--; + } else { + free(ux); + rxc->sw_ux_list_len--; + } + + RXC_DBG(rxc, + "Software match, req: %p ux_send: %p (sw_ux_list_len: %u)\n", + req, ux, req->recv.rxc_hpc->sw_ux_list_len); + + return ret; +} + +/* + * cxip_recv_ux_sw_matcher() - Attempt to match an unexpected message to a user + * posted receive. + * + * Caller must hold the ep_obj->lock. + */ +int cxip_recv_ux_sw_matcher(struct cxip_ux_send *ux) +{ + struct cxip_ptelist_buf *rbuf = ux->req->req_ctx; + struct cxip_rxc_hpc *rxc = rbuf->rxc; + struct cxip_req *req; + struct dlist_entry *tmp; + int ret; + + if (dlist_empty(&rxc->sw_recv_queue)) + return -FI_ENOMSG; + + dlist_foreach_container_safe(&rxc->sw_recv_queue, struct cxip_req, req, + recv.rxc_entry, tmp) { + /* Only matches against unclaimed UX messages */ + ret = cxip_recv_sw_matcher(rxc, req, ux, false); + + /* Unexpected message found a match but unable to progress */ + if (ret == -FI_EAGAIN) + return ret; + + /* Unexpected message found a match. */ + if (ret == FI_SUCCESS || ret == -FI_EINPROGRESS) + return FI_SUCCESS; + } + + return -FI_ENOMSG; +} + +/* + * cxip_recv_req_sw_matcher() - Attempt to match the receive request in SW. + * + * Loop through all onloaded UX Sends looking for a match for the Receive + * request. If a match is found, progress the operation. + * + * Caller must hold ep_obj->lock.
+ */ +int cxip_recv_req_sw_matcher(struct cxip_req *req) +{ + struct cxip_rxc_hpc *rxc = req->recv.rxc_hpc; + struct cxip_ux_send *ux_send; + struct dlist_entry *tmp; + int ret; + + if (dlist_empty(&rxc->sw_ux_list)) + return -FI_ENOMSG; + + dlist_foreach_container_safe(&rxc->sw_ux_list, struct cxip_ux_send, + ux_send, rxc_entry, tmp) { + /* Only match against unclaimed UX messages */ + ret = cxip_recv_sw_matcher(rxc, req, ux_send, false); + switch (ret) { + /* On successful multi-recv or no match, keep matching. */ + case -FI_EINPROGRESS: + case -FI_ENOMSG: + break; + + /* Stop matching. */ + default: + return ret; + } + } + + return -FI_ENOMSG; +} + +/* + * cxip_recv_req_peek() - Peek for matching unexpected message on RXC. + * + * Examine onloaded UX sends, if not found there and HW offload is enabled, + * initiate check of HW UX list. In either case the operation will not + * consume the UX send, but only report the results of the peek to the CQ. + * + * Caller must hold the ep_obj->lock. + */ +static int cxip_recv_req_peek(struct cxip_req *req, bool check_rxc_state) +{ + struct cxip_rxc_hpc *rxc = req->recv.rxc_hpc; + struct cxip_ux_send *ux_send; + struct dlist_entry *tmp; + int ret; + + if (check_rxc_state && rxc->base.state != RXC_ENABLED && + rxc->base.state != RXC_ENABLED_SOFTWARE) + return -FI_EAGAIN; + + /* Attempt to match the onloaded UX list first */ + dlist_foreach_container_safe(&rxc->sw_ux_list, struct cxip_ux_send, + ux_send, rxc_entry, tmp) { + if (cxip_match_recv_sw(rxc, req, ux_send, false)) { + if (req->recv.flags & FI_CLAIM) + ux_send->claimed = true; + + cxip_rxc_hpc_recv_req_tgt_event(req, &ux_send->put_ev); + cxip_recv_req_peek_complete(req, ux_send); + return FI_SUCCESS; + } + } + + if (rxc->base.msg_offload) { + /* Must serialize H/W FI_CLAIM due to getting remote offsets */ + if (rxc->hw_claim_in_progress) + return -FI_EAGAIN; + + ret = cxip_ux_peek(req); + } else { + req->recv.rc = C_RC_NO_MATCH; + cxip_recv_req_peek_complete(req, NULL); + ret = FI_SUCCESS; + } + + return ret; +} + +/* + * cxip_recv_req_queue() - Queue Receive request on RXC. + * + * Before appending a new Receive request to a HW list, attempt to match the + * Receive to any onloaded UX Sends. + * + * Caller must hold the RXC lock and ensure correct RXC state if required. + */ +static int cxip_recv_req_queue(struct cxip_req *req, bool restart_seq) +{ + struct cxip_rxc_hpc *rxc = req->recv.rxc_hpc; + int ret; + + /* Try to match against onloaded Sends first. 
*/ + ret = cxip_recv_req_sw_matcher(req); + if (ret == FI_SUCCESS) + return -FI_EALREADY; + else if (ret == -FI_EAGAIN) + return -FI_EAGAIN; + else if (ret != -FI_ENOMSG) + RXC_FATAL(rxc, "SW matching failed: %d\n", ret); + + if (rxc->base.msg_offload) { + /* Cannot append to priority list if claiming UX */ + if (rxc->hw_claim_in_progress) + goto err_dequeue_req; + + ret = _cxip_recv_req(req, restart_seq); + if (ret) + goto err_dequeue_req; + } else { + + req->recv.software_list = true; + dlist_insert_tail(&req->recv.rxc_entry, &rxc->sw_recv_queue); + } + + return FI_SUCCESS; + +err_dequeue_req: + dlist_remove_init(&req->recv.rxc_entry); + + return -FI_EAGAIN; +} + +static void cxip_rxc_hpc_progress(struct cxip_rxc *rxc) +{ + cxip_evtq_progress(&rxc->rx_evtq); +} + +static void cxip_rxc_hpc_recv_req_tgt_event(struct cxip_req *req, + const union c_event *event) +{ + struct cxip_rxc *rxc = req->recv.rxc; + union cxip_match_bits mb = { + .raw = event->tgt_long.match_bits + }; + uint32_t init = event->tgt_long.initiator.initiator.process; + + assert(event->hdr.event_type == C_EVENT_PUT || + event->hdr.event_type == C_EVENT_PUT_OVERFLOW || + event->hdr.event_type == C_EVENT_RENDEZVOUS || + event->hdr.event_type == C_EVENT_SEARCH); + + /* Rendezvous events contain the wrong match bits and do not provide + * initiator context for symmetric AVs. + */ + if (event->hdr.event_type != C_EVENT_RENDEZVOUS) { + req->tag = mb.tag; + req->recv.initiator = init; + + if (mb.cq_data) + req->flags |= FI_REMOTE_CQ_DATA; + } + + /* remote_offset is not provided in Overflow events. */ + if (event->hdr.event_type != C_EVENT_PUT_OVERFLOW) + req->recv.src_offset = event->tgt_long.remote_offset; + + /* For rendezvous, initiator is the RGet DFA. */ + if (event->hdr.event_type == C_EVENT_RENDEZVOUS) { + init = cxi_dfa_to_init(init, rxc->pid_bits); + req->recv.rget_nic = CXI_MATCH_ID_EP(rxc->pid_bits, init); + req->recv.rget_pid = CXI_MATCH_ID_PID(rxc->pid_bits, init); + } + + /* Only need one event to set remaining fields. */ + if (req->recv.tgt_event) + return; + req->recv.tgt_event = true; + + /* VNI is needed to support FI_AV_AUTH_KEY. */ + req->recv.vni = event->tgt_long.vni; + + /* rlen is used to detect truncation. */ + req->recv.rlen = event->tgt_long.rlength; + + /* RC is used when generating completion events. */ + req->recv.rc = cxi_tgt_event_rc(event); + + /* Header data is provided in all completion events. */ + req->data = event->tgt_long.header_data; + + /* rdzv_id is used to correlate Put and Put Overflow events when using + * offloaded RPut. Otherwise, Overflow buffer start address is used to + * correlate events. + */ + if (event->tgt_long.rendezvous) + req->recv.rdzv_id = (mb.rdzv_id_hi << CXIP_RDZV_ID_CMD_WIDTH) | + event->tgt_long.rendezvous_id; + else + req->recv.oflow_start = event->tgt_long.start; + + req->recv.rdzv_lac = mb.rdzv_lac; + req->recv.rdzv_proto = mb.rdzv_proto; + req->recv.rdzv_mlen = event->tgt_long.mlength; + + /* data_len must be set uniquely for each protocol!
*/ +} + +static int cxip_rxc_hpc_cancel_msg_recv(struct cxip_req *req) +{ + /* Perform default */ + return cxip_recv_cancel(req); +} + +/* Handle any control messaging callbacks specific to protocol */ +static int cxip_rxc_hpc_ctrl_msg_cb(struct cxip_ctrl_req *req, + const union c_event *event) +{ + uint32_t pid_bits = req->ep_obj->domain->iface->dev->info.pid_bits; + union cxip_match_bits mb = { + .raw = event->tgt_long.match_bits, + }; + uint32_t init = event->tgt_long.initiator.initiator.process; + uint32_t nic_addr; + uint32_t pid; + uint16_t vni; + int ret; + + switch (event->hdr.event_type) { + case C_EVENT_PUT: + assert(cxi_event_rc(event) == C_RC_OK); + + nic_addr = CXI_MATCH_ID_EP(pid_bits, init); + pid = CXI_MATCH_ID_PID(pid_bits, init); + vni = event->tgt_long.vni; + + switch (mb.ctrl_msg_type) { + case CXIP_CTRL_MSG_FC_NOTIFY: + ret = cxip_fc_process_drops(req->ep_obj, nic_addr, pid, + vni, mb.drops); + assert(ret == FI_SUCCESS); + break; + case CXIP_CTRL_MSG_FC_RESUME: + ret = cxip_fc_resume(req->ep_obj, nic_addr, pid, vni); + assert(ret == FI_SUCCESS); + break; + default: + ret = -FI_ENOSYS; + break; + } + break; + default: + ret = -FI_ENOSYS; + break; + } + + return ret; +} + +static void cxip_rxc_hpc_init_struct(struct cxip_rxc *rxc_base, + struct cxip_ep_obj *ep_obj) +{ + struct cxip_rxc_hpc *rxc = container_of(rxc_base, struct cxip_rxc_hpc, + base); + int i; + + assert(rxc->base.protocol == FI_PROTO_CXI); + rxc->base.recv_ptl_idx = CXIP_PTL_IDX_RXQ; + ofi_atomic_initialize32(&rxc->orx_hw_ule_cnt, 0); + ofi_atomic_initialize32(&rxc->orx_tx_reqs, 0); + + for (i = 0; i < CXIP_DEF_EVENT_HT_BUCKETS; i++) + dlist_init(&rxc->deferred_events.bh[i]); + + dlist_init(&rxc->fc_drops); + dlist_init(&rxc->sw_ux_list); + dlist_init(&rxc->sw_recv_queue); + dlist_init(&rxc->sw_pending_ux_list); + + rxc->max_eager_size = cxip_env.rdzv_threshold + cxip_env.rdzv_get_min; + rxc->drop_count = rxc->base.ep_obj->asic_ver < CASSINI_2_0 ? -1 : 0; + rxc->rget_align_mask = cxip_env.rdzv_aligned_sw_rget ? + cxip_env.cacheline_size - 1 : 0; +} + +static void cxip_rxc_hpc_fini_struct(struct cxip_rxc *rxc) +{ + /* place holder */ +} + +static int cxip_rxc_hpc_msg_init(struct cxip_rxc *rxc_base) +{ + struct cxip_rxc_hpc *rxc = container_of(rxc_base, struct cxip_rxc_hpc, + base); + struct cxi_pt_alloc_opts pt_opts = { + .use_long_event = 1, + .is_matching = 1, + .en_flowctrl = 1, + .lossless = cxip_env.msg_lossless, + }; + struct cxi_cq_alloc_opts cq_opts = {}; + enum c_ptlte_state state; + int ret; + + assert(rxc->base.protocol == FI_PROTO_CXI); + dlist_init(&rxc->replay_queue); + + /* For FI_TC_UNSPEC, reuse the TX context command queue if possible. If + * a specific traffic class is requested, allocate a new command queue. + * This is done to prevent performance issues with reusing the TX + * context command queue and changing the communication profile. 
+ */ + if (cxip_env.rget_tc == FI_TC_UNSPEC) { + ret = cxip_ep_cmdq(rxc->base.ep_obj, true, FI_TC_UNSPEC, + rxc->base.rx_evtq.eq, &rxc->tx_cmdq); + if (ret != FI_SUCCESS) { + CXIP_WARN("Unable to allocate TX CMDQ, ret: %d\n", ret); + return -FI_EDOMAIN; + } + } else { + cq_opts.count = rxc->base.ep_obj->txq_size * 4; + cq_opts.flags = CXI_CQ_IS_TX; + cq_opts.policy = cxip_env.cq_policy; + + ret = cxip_cmdq_alloc(rxc->base.ep_obj->domain->lni, + rxc->base.rx_evtq.eq, &cq_opts, + rxc->base.ep_obj->auth_key.vni, + cxip_ofi_to_cxi_tc(cxip_env.rget_tc), + CXI_TC_TYPE_DEFAULT, &rxc->tx_cmdq); + if (ret != FI_SUCCESS) { + CXIP_WARN("Unable to allocate CMDQ, ret: %d\n", ret); + return -FI_ENOSPC; + } + } + + /* If applications AVs are symmetric, use logical FI addresses for + * matching. Otherwise, physical addresses will be used. + */ + if (rxc->base.ep_obj->av->symmetric) { + CXIP_DBG("Using logical PTE matching\n"); + pt_opts.use_logical = 1; + } + + ret = cxip_pte_alloc(rxc->base.ep_obj->ptable, + rxc->base.rx_evtq.eq, rxc->base.recv_ptl_idx, + false, &pt_opts, cxip_recv_pte_cb, rxc, + &rxc->base.rx_pte); + if (ret != FI_SUCCESS) { + CXIP_WARN("Failed to allocate RX PTE: %d\n", ret); + goto put_tx_cmdq; + } + + /* One slot must be reserved to support hardware generated state change + * events. + */ + ret = cxip_evtq_adjust_reserved_fc_event_slots(&rxc->base.rx_evtq, + RXC_RESERVED_FC_SLOTS); + if (ret) { + CXIP_WARN("Unable to adjust RX reserved event slots: %d\n", + ret); + goto free_pte; + } + + /* If starting in or able to transition to software managed + * PtlTE, append request list entries first. + */ + if (cxip_software_pte_allowed()) { + ret = cxip_req_bufpool_init(rxc); + if (ret != FI_SUCCESS) + goto free_slots; + } + + if (rxc->base.msg_offload) { + state = C_PTLTE_ENABLED; + ret = cxip_oflow_bufpool_init(rxc); + if (ret != FI_SUCCESS) + goto free_req_buf; + } else { + state = C_PTLTE_SOFTWARE_MANAGED; + } + + /* Start accepting Puts. 
*/ + ret = cxip_pte_set_state(rxc->base.rx_pte, rxc->base.rx_cmdq, state, 0); + if (ret != FI_SUCCESS) { + CXIP_WARN("cxip_pte_set_state returned: %d\n", ret); + goto free_oflow_buf; + } + + /* Wait for PTE state change */ + do { + sched_yield(); + cxip_evtq_progress(&rxc->base.rx_evtq); + } while (rxc->base.rx_pte->state != state); + + CXIP_DBG("RXC HPC messaging enabled: %p, pid_bits: %d\n", + rxc, rxc->base.pid_bits); + + return FI_SUCCESS; + +free_oflow_buf: + if (rxc->base.msg_offload) + cxip_oflow_bufpool_fini(rxc); +free_req_buf: + if (cxip_software_pte_allowed()) + cxip_req_bufpool_fini(rxc); +free_slots: + cxip_evtq_adjust_reserved_fc_event_slots(&rxc->base.rx_evtq, + -1 * RXC_RESERVED_FC_SLOTS); +free_pte: + cxip_pte_free(rxc->base.rx_pte); + +put_tx_cmdq: + if (cxip_env.rget_tc == FI_TC_UNSPEC) + cxip_ep_cmdq_put(rxc->base.ep_obj, true); + else + cxip_cmdq_free(rxc->tx_cmdq); + + return ret; +} + +static int cxip_rxc_hpc_msg_fini(struct cxip_rxc *rxc_base) +{ + struct cxip_rxc_hpc *rxc = container_of(rxc_base, struct cxip_rxc_hpc, + base); + + assert(rxc->base.protocol == FI_PROTO_CXI); + + if (cxip_env.rget_tc == FI_TC_UNSPEC) + cxip_ep_cmdq_put(rxc->base.ep_obj, true); + else + cxip_cmdq_free(rxc->tx_cmdq); + + cxip_evtq_adjust_reserved_fc_event_slots(&rxc->base.rx_evtq, + -1 * RXC_RESERVED_FC_SLOTS); + + return FI_SUCCESS; +} + +static void cxip_rxc_hpc_cleanup(struct cxip_rxc *rxc_base) +{ + struct cxip_rxc_hpc *rxc = container_of(rxc_base, struct cxip_rxc_hpc, + base); + struct cxip_fc_drops *fc_drops; + struct cxip_ux_send *ux_send; + struct dlist_entry *tmp; + + assert(rxc->base.protocol == FI_PROTO_CXI); + + /* TODO: Manage freeing of UX entries better. This code is redundant + * with the freeing in cxip_recv_sw_matcher(). + */ + dlist_foreach_container_safe(&rxc->sw_ux_list, struct cxip_ux_send, + ux_send, rxc_entry, tmp) { + dlist_remove(&ux_send->rxc_entry); + if (ux_send->req && ux_send->req->type == CXIP_REQ_RBUF) + cxip_req_buf_ux_free(ux_send); + else + free(ux_send); + + rxc->sw_ux_list_len--; + } + + if (rxc->sw_ux_list_len != 0) + CXIP_WARN("sw_ux_list_len %d != 0\n", rxc->sw_ux_list_len); + assert(rxc->sw_ux_list_len == 0); + + /* Free any pending UX entries waiting from the request list */ + dlist_foreach_container_safe(&rxc->sw_pending_ux_list, + struct cxip_ux_send, ux_send, + rxc_entry, tmp) { + dlist_remove(&ux_send->rxc_entry); + if (ux_send->req->type == CXIP_REQ_RBUF) + cxip_req_buf_ux_free(ux_send); + else + free(ux_send); + + rxc->sw_pending_ux_list_len--; + } + + if (rxc->sw_pending_ux_list_len != 0) + CXIP_WARN("sw_pending_ux_list_len %d != 0\n", + rxc->sw_pending_ux_list_len); + assert(rxc->sw_pending_ux_list_len == 0); + + /* Cancel Receives */ + cxip_rxc_recv_req_cleanup(&rxc->base); + + /* Cleanup drops */ + dlist_foreach_container_safe(&rxc->fc_drops, + struct cxip_fc_drops, fc_drops, + rxc_entry, tmp) { + dlist_remove(&fc_drops->rxc_entry); + free(fc_drops); + } + + if (rxc->num_fc_eq_full || rxc->num_fc_no_match || + rxc->num_fc_req_full || rxc->num_fc_unexp || + rxc->num_fc_append_fail || rxc->num_sc_nic_hw2sw_unexp || + rxc->num_sc_nic_hw2sw_append_fail) + CXIP_INFO(CXIP_SC_STATS, rxc->num_fc_eq_full, + rxc->num_fc_append_fail, rxc->num_fc_no_match, + rxc->num_fc_req_full, rxc->num_fc_unexp, + rxc->num_sc_nic_hw2sw_unexp, + rxc->num_sc_nic_hw2sw_append_fail); + + if (cxip_software_pte_allowed()) + cxip_req_bufpool_fini(rxc); + if (cxip_env.msg_offload) + cxip_oflow_bufpool_fini(rxc); +} + +static int 
cxip_rxc_check_recv_count_hybrid_preempt(struct cxip_rxc *rxc) +{ + int ret; + int count; + + /* Only HPC protocol can transition */ + if (rxc->protocol != FI_PROTO_CXI) + return FI_SUCCESS; + + if (cxip_env.rx_match_mode == CXIP_PTLTE_HYBRID_MODE && + cxip_env.hybrid_posted_recv_preemptive == 1) { + count = ofi_atomic_get32(&rxc->orx_reqs); + + if (count > rxc->attr.size) { + assert(rxc->state == RXC_ENABLED); + + /* On success, need to return -FI_EAGAIN which will + * propagate back to the user. In addition, RXC state + * will have transitioned to RXC_PENDING_PTLTE_DISABLE. + */ + ret = cxip_recv_pending_ptlte_disable(rxc, false); + if (ret == FI_SUCCESS) { + RXC_WARN(rxc, + "Transitioning to SW EP due to too many posted recvs: posted_count=%u request_size=%lu\n", + count, rxc->attr.size); + return -FI_EAGAIN; + } + + RXC_WARN(rxc, "Failed to transition to SW EP: %d\n", + ret); + return ret; + } + } + + return FI_SUCCESS; +} + +/* + * _cxip_recv_req() - Submit Receive request to hardware. + */ +ssize_t _cxip_recv_req(struct cxip_req *req, bool restart_seq) +{ + struct cxip_rxc *rxc = req->recv.rxc; + uint32_t le_flags = 0; + union cxip_match_bits mb = {}; + union cxip_match_bits ib = { + .tx_id = ~0, + .match_comp = 1, + .cq_data = 1, + .rdzv_done = 1, + .le_type = ~0, + }; + int ret; + struct cxip_md *recv_md = req->recv.recv_md; + uint64_t recv_iova = 0; + + ret = cxip_rxc_check_recv_count_hybrid_preempt(rxc); + if (ret != FI_SUCCESS) + return ret; + + if (req->recv.tagged) { + mb.tagged = 1; + mb.tag = req->recv.tag; + ib.tag = req->recv.ignore; + } + + /* For poorly written applications, a periodic check of LE pool + * resources can be requested to force transitions to software mode. + * For this to occur, the code must be executing in hybrid mode, + * still matching in hardware, and FI_CXI_HYBRID_RECV_PREEMPTIVE must + * be explicitly set by the application. + */ + if (cxip_env.rx_match_mode != CXIP_PTLTE_HYBRID_MODE || + ++rxc->recv_appends & CXIP_HYBRID_RECV_CHECK_INTERVAL) + le_flags = C_LE_EVENT_LINK_DISABLE; + + /* Always set manage_local in Receive LEs. This makes Cassini ignore + * initiator remote_offset in all Puts. With this, remote_offset in Put + * events can be used by the initiator for protocol data. The behavior + * of use_once is not impacted by manage_local. + */ + le_flags |= C_LE_EVENT_LINK_DISABLE | C_LE_EVENT_UNLINK_DISABLE | + C_LE_MANAGE_LOCAL | C_LE_UNRESTRICTED_BODY_RO | + C_LE_UNRESTRICTED_END_RO | C_LE_OP_PUT; + + if (!req->recv.multi_recv) + le_flags |= C_LE_USE_ONCE; + if (restart_seq) + le_flags |= C_LE_RESTART_SEQ; + + if (recv_md) + recv_iova = CXI_VA_TO_IOVA(recv_md->md, + (uint64_t)req->recv.recv_buf + + req->recv.start_offset); + + req->recv.hw_offloaded = true; + + /* Issue Append command */ + ret = cxip_pte_append(rxc->rx_pte, recv_iova, + req->recv.ulen - req->recv.start_offset, + recv_md ? recv_md->md->lac : 0, + C_PTL_LIST_PRIORITY, req->req_id, + mb.raw, ib.raw, req->recv.match_id, + req->recv.multi_recv ? + rxc->min_multi_recv : 0, + le_flags, NULL, rxc->rx_cmdq, + !(req->recv.flags & FI_MORE)); + if (ret != FI_SUCCESS) { + RXC_WARN(rxc, "Failed to write Append command: %d\n", ret); + return ret; + } + + return FI_SUCCESS; +} + +/* + * cxip_recv_common() - Common message receive function. Used to receive + * tagged and untagged sends of all sizes.
+ */ +static ssize_t +cxip_recv_common(struct cxip_rxc *rxc, void *buf, size_t len, void *desc, + fi_addr_t src_addr, uint64_t tag, uint64_t ignore, + void *context, uint64_t flags, bool tagged, + struct cxip_cntr *comp_cntr) +{ + struct cxip_rxc_hpc *rxc_hpc = container_of(rxc, struct cxip_rxc_hpc, + base); + int ret; + struct cxip_req *req; + struct cxip_ux_send *ux_msg; + uint32_t match_id; + uint16_t vni; + + assert(rxc_hpc->base.protocol == FI_PROTO_CXI); + + if (len && !buf) + return -FI_EINVAL; + + if (rxc->state == RXC_DISABLED) + return -FI_EOPBADSTATE; + + /* HW to SW PtlTE transition, ensure progress is made */ + if (rxc->state != RXC_ENABLED && rxc->state != RXC_ENABLED_SOFTWARE) { + cxip_cq_progress(rxc->recv_cq); + return -FI_EAGAIN; + } + + if (tagged) { + if (tag & ~CXIP_TAG_MASK || ignore & ~CXIP_TAG_MASK) { + RXC_WARN(rxc, + "Invalid tag: %#018lx ignore: %#018lx (%#018lx)\n", + tag, ignore, CXIP_TAG_MASK); + return -FI_EINVAL; + } + } + + ret = cxip_set_recv_match_id(rxc, src_addr, rxc->ep_obj->av_auth_key && + (flags & FI_AUTH_KEY), &match_id, &vni); + if (ret) { + RXC_WARN(rxc, "Error setting match_id: %d %s\n", + ret, fi_strerror(-ret)); + return ret; + } + + ofi_genlock_lock(&rxc->ep_obj->lock); + ret = cxip_recv_req_alloc(rxc, buf, len, NULL, &req, cxip_recv_cb); + if (ret) + goto err; + + /* req->data_len, req->tag, req->data must be set later. req->buf may + * be overwritten later. + */ + req->context = (uint64_t)context; + + req->flags = FI_RECV | (flags & FI_COMPLETION); + if (tagged) + req->flags |= FI_TAGGED; + else + req->flags |= FI_MSG; + + req->recv.cntr = comp_cntr ? comp_cntr : rxc->recv_cntr; + req->recv.match_id = match_id; + req->recv.tag = tag; + req->recv.ignore = ignore; + req->recv.flags = flags; + req->recv.tagged = tagged; + req->recv.multi_recv = (flags & FI_MULTI_RECV ? true : false); + + if (rxc->state != RXC_ENABLED && rxc->state != RXC_ENABLED_SOFTWARE) { + ret = -FI_EAGAIN; + goto err_free_request; + } + + if (!(req->recv.flags & (FI_PEEK | FI_CLAIM))) { + + ret = cxip_recv_req_queue(req, false); + /* Match made in software? */ + if (ret == -FI_EALREADY) { + ofi_genlock_unlock(&rxc->ep_obj->lock); + + return FI_SUCCESS; + } + + /* RXC busy (onloading Sends or full CQ)? */ + if (ret != FI_SUCCESS) + goto err_free_request; + + ofi_genlock_unlock(&rxc->ep_obj->lock); + + RXC_DBG(rxc, + "req: %p buf: %p len: %lu src_addr: %ld tag(%c):" + " 0x%lx ignore: 0x%lx context: %p\n", + req, buf, len, src_addr, tagged ? 
'*' : '-', tag, + ignore, context); + + return FI_SUCCESS; + } + + /* FI_PEEK with/without FI_CLAIM */ + if (req->recv.flags & FI_PEEK) { + if (req->recv.flags & FI_CLAIM && !req->context) { + RXC_WARN(rxc, "FI_CLAIM requires fi_context\n"); + ret = -FI_EINVAL; + goto err_free_request; + } + ret = cxip_recv_req_peek(req, true); + if (ret == FI_SUCCESS) { + ofi_genlock_unlock(&rxc->ep_obj->lock); + + return ret; + } + + goto err_free_request; + } + + /* FI_CLAIM without FI_PEEK */ + ux_msg = ((struct fi_context *)req->context)->internal[0]; + if (!ux_msg->claimed) { + RXC_WARN(rxc, "Bad fi_context specified with FI_CLAIM\n"); + ret = -FI_EINVAL; + goto err_free_request; + } + + RXC_DBG(rxc, "FI_CLAIM invoke sw matcher %p\n", ux_msg); + ret = cxip_recv_sw_matcher(rxc_hpc, req, ux_msg, true); + if (ret == FI_SUCCESS || ret == -FI_EINPROGRESS) { + ofi_genlock_unlock(&rxc->ep_obj->lock); + + return FI_SUCCESS; + } + +err_free_request: + cxip_recv_req_free(req); +err: + ofi_genlock_unlock(&rxc->ep_obj->lock); + + return ret; +} + +/* + * rdzv_send_req_complete() - Complete long send request. + */ +static void rdzv_send_req_complete(struct cxip_req *req) +{ + cxip_rdzv_id_free(req->send.txc_hpc, req->send.rdzv_id); + + cxip_send_buf_fini(req); + + cxip_report_send_completion(req, true); + + ofi_atomic_dec32(&req->send.txc->otx_reqs); + cxip_evtq_req_free(req); +} + +/* + * rdzv_send_req_event() - Count a rendezvous send event. + * + * Call for each initiator event. The events could be generated in any order. + * Once all expected events are received, complete the request. + * + * A successful rendezvous Send generates two events: Ack and Get. + */ +static void rdzv_send_req_event(struct cxip_req *req) +{ + if (++req->send.rdzv_send_events == 2) + rdzv_send_req_complete(req); +} + +/* + * cxip_send_rdzv_put_cb() - Long send callback. + * + * Progress a long send operation to completion. + */ +static int cxip_send_rdzv_put_cb(struct cxip_req *req, + const union c_event *event) +{ + int event_rc; + int ret; + struct cxip_txc_hpc *txc = req->send.txc_hpc; + + switch (event->hdr.event_type) { + case C_EVENT_ACK: + /* The source Put completed. */ + event_rc = cxi_init_event_rc(event); + + TXC_DBG(txc, "Acked: %p (rc: %s list: %s)\n", req, + cxi_rc_to_str(event_rc), + cxi_ptl_list_to_str(event->init_short.ptl_list)); + + /* If the message was dropped, mark the peer as disabled. Do + * not generate a completion. Free associated resources. Do not + * free the request (it will be used to replay the Send). + */ + if (event_rc == C_RC_PT_DISABLED) { + ret = cxip_send_req_dropped(txc, req); + if (ret == FI_SUCCESS) + cxip_rdzv_id_free(txc, req->send.rdzv_id); + else + ret = -FI_EAGAIN; + + return ret; + } + + /* Message was accepted by the peer. Match order is preserved. + * The request can be dequeued from the SW message queue. This + * allows flow-control recovery to be performed before + * outstanding long Send operations have completed. + */ + ret = cxip_send_req_dequeue(txc, req); + if (ret != FI_SUCCESS) + return ret; + + /* The transaction is complete if the put failed */ + if (event_rc != C_RC_OK) { + req->send.rc = event_rc; + rdzv_send_req_complete(req); + } else { + /* Count the event, another may be expected. */ + rdzv_send_req_event(req); + } + return FI_SUCCESS; + + /* When errors happen, send events can occur before the put/get event. + * These events should just be dropped. 
+ */ + case C_EVENT_SEND: + { + struct cxi_md *md = req->send.send_md->md; + + TXC_WARN(txc, "Unexpected %s event: rc:%s buf:%p len:0x%lx iova:0x%llx md.va:0x%llx lac:%d\n", + cxi_event_to_str(event), + cxi_rc_to_str(cxi_event_rc(event)), req->send.buf, + req->send.len, CXI_VA_TO_IOVA(md, req->send.buf), + md->iova, md->lac); + } + return FI_SUCCESS; + + default: + TXC_FATAL(txc, CXIP_UNEXPECTED_EVENT, + cxi_event_to_str(event), + cxi_rc_to_str(cxi_event_rc(event))); + } +} + +/* + * cxip_rdzv_pte_src_cb() - Process rendezvous source buffer events. + * + * A Get event is generated for each rendezvous Send indicating Send completion. + */ +int cxip_rdzv_pte_src_cb(struct cxip_req *req, const union c_event *event) +{ + struct cxip_rdzv_pte *rdzv_pte = req->req_ctx; + struct cxip_txc_hpc *txc = rdzv_pte->txc; + struct cxip_req *get_req; + union cxip_match_bits mb; + int event_rc = cxi_event_rc(event); + int rdzv_id; + + switch (event->hdr.event_type) { + case C_EVENT_LINK: + if (event_rc == C_RC_OK) + ofi_atomic_inc32(&rdzv_pte->le_linked_success_count); + else + ofi_atomic_inc32(&rdzv_pte->le_linked_failure_count); + return FI_SUCCESS; + + case C_EVENT_GET: + mb.raw = event->tgt_long.match_bits; + rdzv_id = (mb.rdzv_id_hi << CXIP_RDZV_ID_CMD_WIDTH) | + mb.rdzv_id_lo; + get_req = cxip_rdzv_id_lookup(txc, rdzv_id); + if (!get_req) { + TXC_WARN(txc, "Failed to find RDZV ID: %d\n", + mb.rdzv_id_lo); + return FI_SUCCESS; + } + + if (event_rc != C_RC_OK) + TXC_WARN(txc, "Get error: %p rc: %s\n", get_req, + cxi_rc_to_str(event_rc)); + else + TXC_DBG(txc, "Get received: %p rc: %s\n", get_req, + cxi_rc_to_str(event_rc)); + + get_req->send.rc = event_rc; + + /* Count the event, another may be expected. */ + rdzv_send_req_event(get_req); + + return FI_SUCCESS; + default: + TXC_FATAL(txc, CXIP_UNEXPECTED_EVENT, + cxi_event_to_str(event), + cxi_rc_to_str(cxi_event_rc(event))); + } +} + +/* + * _cxip_send_rdzv_put() - Initiate a send rendezvous put operation. + * + * The rendezvous protocol works as follows: + * + * 1. The Initiator performs a Rendezvous Put command which includes a portion + * of the source buffer data. + * 2. Once the Put is matched to a user receive buffer (in the Priority list), + * a Get of the remaining source data is performed. + */ +static ssize_t _cxip_send_rdzv_put(struct cxip_req *req) +{ + struct cxip_txc_hpc *txc = req->send.txc_hpc; + union c_fab_addr dfa; + uint8_t idx_ext; + struct c_full_dma_cmd cmd = {}; + union cxip_match_bits put_mb = {}; + int rdzv_id; + int lac = req->send.send_md->md->lac; + int ret; + + /* Zero length rendezvous not supported. */ + assert(req->send.send_md); + assert(req->send.len); + + /* Allocate rendezvous ID */ + rdzv_id = cxip_rdzv_id_alloc(txc, req); + if (rdzv_id < 0) + return -FI_EAGAIN; + + /* Calculate DFA */ + cxi_build_dfa(req->send.caddr.nic, req->send.caddr.pid, + txc->base.pid_bits, CXIP_PTL_IDX_RXQ, &dfa, &idx_ext); + + /* Allocate a source request for the given LAC. This makes the source + * memory accessible for rendezvous. + */ + ret = cxip_rdzv_pte_src_req_alloc(txc->rdzv_pte, lac); + if (ret) { + TXC_WARN(txc, "Failed to prepare source window: %d\n", ret); + goto err_free_rdzv_id; + } + + + /* Allocate restricted source window. If resources can not be allocated + * discontinue use of the restricted protocol, falling back + * to unrestricted. TODO: keep track and only switch for LAC that + * failed. 
+ */ + if (txc->rdzv_proto == CXIP_RDZV_PROTO_ALT_READ && + !txc->rdzv_nomatch_pte[lac]) { + TXC_DBG(txc, "allocate restricted PTE lac %d\n", lac); + + ret = cxip_rdzv_nomatch_pte_alloc(txc, lac, + &txc->rdzv_nomatch_pte[lac]); + if (ret) { + TXC_WARN(txc, WARN_RESTRICTED_DISABLED, + cxip_rdzv_proto_to_str(txc->rdzv_proto), + cxip_rdzv_proto_to_str(CXIP_RDZV_PROTO_DEFAULT)); + txc->rdzv_proto = CXIP_RDZV_PROTO_DEFAULT; + } + } + + /* Build match bits */ + if (req->send.tagged) { + put_mb.tagged = 1; + put_mb.tag = req->send.tag; + } + + if (req->send.flags & FI_REMOTE_CQ_DATA) + put_mb.cq_data = 1; + + put_mb.rdzv_proto = txc->rdzv_proto; + + req->send.rdzv_id = rdzv_id; + req->cb = cxip_send_rdzv_put_cb; + req->send.rdzv_send_events = 0; + + /* Build Put command descriptor */ + cmd.command.cmd_type = C_CMD_TYPE_DMA; + cmd.index_ext = idx_ext; + cmd.lac = req->send.send_md->md->lac; + cmd.event_send_disable = 1; + cmd.restricted = 0; + cmd.dfa = dfa; + cmd.local_addr = CXI_VA_TO_IOVA(req->send.send_md->md, req->send.buf); + cmd.request_len = req->send.len; + cmd.eq = cxip_evtq_eqn(&txc->base.tx_evtq); + cmd.user_ptr = (uint64_t)req; + cmd.initiator = cxip_msg_match_id(&txc->base); + cmd.header_data = req->send.data; + cmd.remote_offset = + CXI_VA_TO_IOVA(req->send.send_md->md, req->send.buf); + cmd.command.opcode = C_CMD_RENDEZVOUS_PUT; + cmd.eager_length = txc->rdzv_eager_size; + cmd.use_offset_for_get = 1; + + put_mb.rdzv_id_hi = rdzv_id >> CXIP_RDZV_ID_CMD_WIDTH; + put_mb.rdzv_lac = req->send.send_md->md->lac; + put_mb.le_type = CXIP_LE_TYPE_RX; + cmd.match_bits = put_mb.raw; + cmd.rendezvous_id = rdzv_id; + + ret = cxip_txc_emit_dma(&txc->base, req->send.caddr.vni, + cxip_ofi_to_cxi_tc(req->send.tclass), + CXI_TC_TYPE_DEFAULT, req->triggered ? + req->trig_cntr : NULL, req->trig_thresh, + &cmd, req->send.flags); + if (ret) { + TXC_DBG(txc, "Failed to write DMA command: %d\n", ret); + goto err_free_rdzv_id; + } + + req->triggered = false; + + return FI_SUCCESS; + +err_free_rdzv_id: + cxip_rdzv_id_free(txc, rdzv_id); + + return ret; +} + +/* + * cxip_send_eager_cb() - Eager send callback. Used for both tagged and + * untagged messages. + */ +static int cxip_send_eager_cb(struct cxip_req *req, + const union c_event *event) +{ + int match_complete = req->flags & FI_MATCH_COMPLETE; + int ret; + + /* When errors happen, send events can occur before the put/get event. + * These events should just be dropped. + */ + if (event->hdr.event_type == C_EVENT_SEND) { + TXC_WARN(req->send.txc, CXIP_UNEXPECTED_EVENT, + cxi_event_to_str(event), + cxi_rc_to_str(cxi_event_rc(event))); + return FI_SUCCESS; + } + + assert(event->hdr.event_type == C_EVENT_ACK); + + req->send.rc = cxi_init_event_rc(event); + + /* If the message was dropped, mark the peer as disabled. Do not + * generate a completion. Free associated resources. Do not free the + * request (it will be used to replay the Send). + */ + if (req->send.rc == C_RC_PT_DISABLED) { + + ret = cxip_send_req_dropped(req->send.txc_hpc, req); + if (ret != FI_SUCCESS) + return -FI_EAGAIN; + + if (match_complete) + cxip_tx_id_free(req->send.txc_hpc, req->send.tx_id); + + return FI_SUCCESS; + } + + ret = cxip_send_req_dequeue(req->send.txc_hpc, req); + if (ret != FI_SUCCESS) + return ret; + + cxip_send_buf_fini(req); + + /* If MATCH_COMPLETE was requested and the the Put did not match a user + * buffer, do not generate a completion event until the target notifies + * the initiator that the match is complete. 
+ */ + if (match_complete) { + if (req->send.rc == C_RC_OK && + event->init_short.ptl_list != C_PTL_LIST_PRIORITY) { + TXC_DBG(req->send.txc, + "Waiting for match complete: %p\n", req); + return FI_SUCCESS; + } + + TXC_DBG(req->send.txc, "Match complete with Ack: %p\n", req); + cxip_tx_id_free(req->send.txc_hpc, req->send.tx_id); + } + + /* If MATCH_COMPLETE was requested, software must manage counters. */ + cxip_report_send_completion(req, match_complete); + + ofi_atomic_dec32(&req->send.txc->otx_reqs); + cxip_evtq_req_free(req); + + return FI_SUCCESS; +} + +static inline int cxip_set_eager_mb(struct cxip_req *req, + union cxip_match_bits *mb) +{ + int tx_id; + + mb->raw = 0; + mb->le_type = CXIP_LE_TYPE_RX; + mb->tagged = req->send.tagged; + mb->tag = req->send.tag; + mb->cq_data = !!(req->send.flags & FI_REMOTE_CQ_DATA); + + /* Allocate a TX ID if match completion guarantees are required */ + if (req->send.flags & FI_MATCH_COMPLETE) { + + tx_id = cxip_tx_id_alloc(req->send.txc_hpc, req); + if (tx_id < 0) { + TXC_DBG(req->send.txc, + "Failed to allocate TX ID: %d\n", tx_id); + return -FI_EAGAIN; + } + + req->send.tx_id = tx_id; + mb->match_comp = 1; + mb->tx_id = tx_id; + } + + return FI_SUCCESS; +} + +/* + * _cxip_send_eager_idc() - Enqueue eager IDC message + */ +static ssize_t _cxip_send_eager_idc(struct cxip_req *req) +{ + struct cxip_txc *txc = req->send.txc; + union c_fab_addr dfa; + uint8_t idx_ext; + union cxip_match_bits mb; + ssize_t ret; + const void *buf; + struct c_cstate_cmd cstate_cmd = {}; + struct c_idc_msg_hdr idc_cmd; + + assert(req->send.len > 0); + +#if ENABLE_DEBUG + if (req->send.flags & FI_INJECT) + assert(req->send.ibuf); + + /* ibuf and send_md are mutually exclusive. */ + if (req->send.ibuf) { + assert(req->send.send_md == NULL); + } else if (req->send.send_md) { + assert(req->send.ibuf == NULL); + + /* All non FI_HMEM_SYSTEM buffers require an ibuf. */ + assert(req->send.send_md->info.iface == FI_HMEM_SYSTEM); + } +#endif + + /* Calculate DFA */ + cxi_build_dfa(req->send.caddr.nic, req->send.caddr.pid, txc->pid_bits, + CXIP_PTL_IDX_RXQ, &dfa, &idx_ext); + + /* Favor bounce buffer if allocated. */ + if (req->send.ibuf) + buf = req->send.ibuf; + else + buf = req->send.buf; + + ret = cxip_set_eager_mb(req, &mb); + if (ret) + goto err; + + req->cb = cxip_send_eager_cb; + + /* Build commands before taking lock */ + cstate_cmd.event_send_disable = 1; + cstate_cmd.index_ext = idx_ext; + cstate_cmd.eq = cxip_evtq_eqn(&txc->tx_evtq); + cstate_cmd.initiator = cxip_msg_match_id(txc); + + /* If MATCH_COMPLETE was requested, software must manage + * counters. + */ + if (req->send.cntr && !mb.match_comp) { + cstate_cmd.event_ct_ack = 1; + cstate_cmd.ct = req->send.cntr->ct->ctn; + } + + /* Note: IDC command completely filled in */ + idc_cmd.unused_0 = 0; + idc_cmd.dfa = dfa; + idc_cmd.match_bits = mb.raw; + idc_cmd.header_data = req->send.data; + idc_cmd.user_ptr = (uint64_t)req; + + ret = cxip_txc_emit_idc_msg(txc, req->send.caddr.vni, + cxip_ofi_to_cxi_tc(req->send.tclass), + CXI_TC_TYPE_DEFAULT, &cstate_cmd, &idc_cmd, + buf, req->send.len, req->send.flags); + if (ret) { + TXC_DBG(txc, "Failed to write IDC command: %ld\n", ret); + goto err_cleanup; + } + + return FI_SUCCESS; + +err_cleanup: + if (mb.match_comp) + cxip_tx_id_free(req->send.txc_hpc, req->send.tx_id); +err: + return ret; +} + +/* + * _cxip_send_eager() - Enqueue eager send command. 
+ */ +static ssize_t _cxip_send_eager(struct cxip_req *req) +{ + struct cxip_txc *txc = req->send.txc; + union c_fab_addr dfa; + uint8_t idx_ext; + union cxip_match_bits mb; + ssize_t ret; + struct c_full_dma_cmd cmd = {}; + + /* Calculate DFA */ + cxi_build_dfa(req->send.caddr.nic, req->send.caddr.pid, txc->pid_bits, + CXIP_PTL_IDX_RXQ, &dfa, &idx_ext); + + ret = cxip_set_eager_mb(req, &mb); + if (ret) + goto err; + + req->cb = cxip_send_eager_cb; + + cmd.command.cmd_type = C_CMD_TYPE_DMA; + cmd.command.opcode = C_CMD_PUT; + cmd.index_ext = idx_ext; + cmd.event_send_disable = 1; + cmd.dfa = dfa; + cmd.eq = cxip_evtq_eqn(&txc->tx_evtq); + cmd.user_ptr = (uint64_t)req; + cmd.initiator = cxip_msg_match_id(txc); + cmd.match_bits = mb.raw; + cmd.header_data = req->send.data; + + /* Triggered ops could result in 0 length DMA */ + if (req->send.send_md) { + cmd.lac = req->send.send_md->md->lac; + cmd.local_addr = CXI_VA_TO_IOVA(req->send.send_md->md, + req->send.buf); + cmd.request_len = req->send.len; + } + + /* If MATCH_COMPLETE was requested, software must manage + * counters. + */ + if (req->send.cntr && !mb.match_comp) { + cmd.event_ct_ack = 1; + cmd.ct = req->send.cntr->ct->ctn; + } + + ret = cxip_txc_emit_dma(txc, req->send.caddr.vni, + cxip_ofi_to_cxi_tc(req->send.tclass), + CXI_TC_TYPE_DEFAULT, req->triggered ? + req->trig_cntr : NULL, req->trig_thresh, + &cmd, req->send.flags); + if (ret) { + TXC_DBG(txc, "Failed to write DMA command: %ld\n", ret); + goto err_enqueue; + } else { + req->triggered = false; + } + + return FI_SUCCESS; + +err_enqueue: + + if (mb.match_comp) + cxip_tx_id_free(req->send.txc_hpc, req->send.tx_id); +err: + return ret; +} + +static ssize_t _cxip_send_req(struct cxip_req *req) +{ + /* Force all zero-byte operations to use the eager path. This utilizes + * a smaller command format. + */ + if (req->send.len == 0) + return _cxip_send_eager(req); + + /* IDC commands are not supported with triggered operations. */ + if (!req->triggered && + ((req->send.flags & FI_INJECT) || cxip_send_eager_idc(req))) + return _cxip_send_eager_idc(req); + + if (req->send.len <= req->send.txc_hpc->max_eager_size) + return _cxip_send_eager(req); + + return _cxip_send_rdzv_put(req); +} + +/* + * cxip_fc_peer_lookup() - Check if a peer is disabled. + * + * Look up disabled peer state and return it, if available. + * + * Caller must hold ep_obj->lock. + */ +static struct cxip_fc_peer *cxip_fc_peer_lookup(struct cxip_txc_hpc *txc, + struct cxip_addr caddr) +{ + struct cxip_fc_peer *peer; + + dlist_foreach_container(&txc->fc_peers, struct cxip_fc_peer, + peer, txc_entry) { + if (CXIP_ADDR_VNI_EQUAL(peer->caddr, caddr)) + return peer; + } + + return NULL; +} + +/* + * cxip_fc_peer_put() - Account for completion of an outstanding Send targeting + * a disabled peer. + * + * Drop a reference to a disabled peer. When the last reference is dropped, + * attempt flow-control recovery. + * + * Caller must hold ep_obj->lock. 
+ */ +static int cxip_fc_peer_put(struct cxip_fc_peer *peer) +{ + int ret; + + assert(peer->pending > 0); + + /* Account for the completed Send */ + if (!--peer->pending) { + peer->req.send.mb.drops = peer->dropped; + + ret = cxip_ctrl_msg_send(&peer->req); + if (ret != FI_SUCCESS) { + peer->pending++; + return ret; + } + + peer->pending_acks++; + + TXC_DBG(peer->txc, + "Notified disabled peer NIC: %#x PID: %u VNI: %u dropped: %u\n", + peer->caddr.nic, peer->caddr.pid, peer->caddr.vni, + peer->dropped); + } + + return FI_SUCCESS; +} + +/* + * cxip_fc_peer_fini() - Remove disabled peer state. + * + * Caller must hold ep_obj->lock. + */ +static void cxip_fc_peer_fini(struct cxip_fc_peer *peer) +{ + assert(dlist_empty(&peer->msg_queue)); + dlist_remove(&peer->txc_entry); + free(peer); +} + +/* + * cxip_fc_notify_cb() - Process FC notify completion events. + */ +int cxip_fc_notify_cb(struct cxip_ctrl_req *req, const union c_event *event) +{ + struct cxip_fc_peer *peer = container_of(req, struct cxip_fc_peer, req); + struct cxip_txc_hpc *txc = peer->txc; + + switch (event->hdr.event_type) { + case C_EVENT_ACK: + switch (cxi_event_rc(event)) { + case C_RC_OK: + TXC_DBG(txc, + "FC_NOTIFY to %#x:%u:%u successfully sent: retry_count=%u\n", + peer->caddr.nic, peer->caddr.pid, + peer->caddr.vni, peer->retry_count); + + /* Peer flow control structure can only be freed if + * replay is complete and all acks accounted for. + */ + peer->pending_acks--; + if (!peer->pending_acks && peer->replayed) + cxip_fc_peer_fini(peer); + + return FI_SUCCESS; + + /* This error occurs when the target's control event queue has + * run out of space. Since the target should be processing the + * event queue, it is safe to replay messages until C_RC_OK is + * returned. + */ + case C_RC_ENTRY_NOT_FOUND: + peer->retry_count++; + TXC_WARN(txc, + "%#x:%u:%u dropped FC message: retry_delay_usecs=%d retry_count=%u\n", + peer->caddr.nic, peer->caddr.pid, + peer->caddr.vni, cxip_env.fc_retry_usec_delay, + peer->retry_count); + usleep(cxip_env.fc_retry_usec_delay); + return cxip_ctrl_msg_send(req); + default: + TXC_FATAL(txc, CXIP_UNEXPECTED_EVENT_STS, + cxi_event_to_str(event), + cxi_rc_to_str(cxi_event_rc(event))); + } + default: + TXC_FATAL(txc, CXIP_UNEXPECTED_EVENT, + cxi_event_to_str(event), + cxi_rc_to_str(cxi_event_rc(event))); + } +} + +/* + * cxip_fc_peer_init() - Mark a peer as disabled. + * + * Called by sending EP after experiencing first dropped Send to a peer. + * + * Allocate state to track the disabled peer. Locate all outstanding Sends + * targeting the peer. + * + * Caller must hold ep_obj->lock. 
+ */ +static int cxip_fc_peer_init(struct cxip_txc_hpc *txc, struct cxip_addr caddr, + struct cxip_fc_peer **peer) +{ + struct cxip_fc_peer *p; + struct cxip_req *req; + struct dlist_entry *tmp; + + p = calloc(1, sizeof(*p)); + if (!p) { + TXC_WARN(txc, "Failed to allocate FC Peer\n"); + return -FI_ENOMEM; + } + + p->caddr = caddr; + p->txc = txc; + dlist_init(&p->msg_queue); + dlist_insert_tail(&p->txc_entry, &txc->fc_peers); + + p->req.send.nic_addr = caddr.nic; + p->req.send.pid = caddr.pid; + p->req.send.vni = caddr.vni; + /* TODO: remove */ + p->req.send.mb.txc_id = 0; + p->req.send.mb.rxc_id = 0; + + p->req.send.mb.ctrl_le_type = CXIP_CTRL_LE_TYPE_CTRL_MSG; + p->req.send.mb.ctrl_msg_type = CXIP_CTRL_MSG_FC_NOTIFY; + p->req.cb = cxip_fc_notify_cb; + p->req.ep_obj = txc->base.ep_obj; + + /* Queue all Sends to the FC'ed peer */ + dlist_foreach_container_safe(&txc->base.msg_queue, struct cxip_req, + req, send.txc_entry, tmp) { + if (CXIP_ADDR_VNI_EQUAL(req->send.caddr, caddr)) { + dlist_remove(&req->send.txc_entry); + dlist_insert_tail(&req->send.txc_entry, &p->msg_queue); + p->pending++; + req->send.fc_peer = p; + } + } + + *peer = p; + + return FI_SUCCESS; +} + +/* + * cxip_fc_resume() - Replay dropped Sends. + * + * Called by sending EP after being notified disabled peer was re-enabled. + * + * Replay all dropped Sends in order. + */ +int cxip_fc_resume(struct cxip_ep_obj *ep_obj, uint32_t nic_addr, uint32_t pid, + uint16_t vni) +{ + struct cxip_txc_hpc *txc = container_of(ep_obj->txc, + struct cxip_txc_hpc, base); + struct cxip_fc_peer *peer; + struct cxip_addr caddr = { + .nic = nic_addr, + .pid = pid, + .vni = vni, + }; + struct cxip_req *req; + struct dlist_entry *tmp; + int ret; + + peer = cxip_fc_peer_lookup(txc, caddr); + if (!peer) + TXC_FATAL(txc, "Fatal, FC peer not found: NIC: %#x PID: %d\n", + nic_addr, pid); + + TXC_DBG(txc, "Replaying dropped sends, NIC: %#x PID: %d\n", + nic_addr, pid); + + dlist_foreach_container_safe(&peer->msg_queue, struct cxip_req, + req, send.txc_entry, tmp) { + + /* Since messaging does not have events disabled, need to return + * a TXC credit for replay. _cxip_send_req() will take the + * credit again. + */ + ofi_atomic_dec32(&txc->base.otx_reqs); + + /* -FI_EAGAIN can be return if the command queue is full. Loop + * until this goes through. + */ + do { + ret = _cxip_send_req(req); + } while (ret == -FI_EAGAIN); + assert(ret == FI_SUCCESS); + + /* Move request back to the message queue. */ + dlist_remove(&req->send.txc_entry); + req->send.fc_peer = NULL; + dlist_insert_tail(&req->send.txc_entry, &txc->base.msg_queue); + + TXC_DBG(txc, "Replayed %p\n", req); + } + + /* Peer flow control structure can only be freed if replay is complete + * and all acks accounted for. + */ + if (!peer->pending_acks) + cxip_fc_peer_fini(peer); + else + peer->replayed = true; + + return FI_SUCCESS; +} + +/* + * cxip_send_req_dropped() - Mark the Send request dropped. + * + * Mark the Send request dropped. Mark the target peer as disabled. Track all + * outstanding Sends targeting the disabled peer. When all outstanding Sends + * are completed, recovery will be performed. 
+ */ +static int cxip_send_req_dropped(struct cxip_txc_hpc *txc, struct cxip_req *req) +{ + struct cxip_fc_peer *peer; + int ret; + + /* Check if peer is already disabled */ + peer = cxip_fc_peer_lookup(txc, req->send.caddr); + if (!peer) { + ret = cxip_fc_peer_init(txc, req->send.caddr, &peer); + if (ret != FI_SUCCESS) + return ret; + + TXC_DBG(txc, + "Disabled peer detected, NIC: %#x PID: %u VNI: %u pending: %u\n", + peer->caddr.nic, peer->caddr.pid, peer->caddr.vni, + peer->pending); + } + + /* Account for the dropped message. */ + peer->dropped++; + ret = cxip_fc_peer_put(peer); + if (ret) + peer->dropped--; + else + TXC_DBG(txc, + "Send dropped, req: %p NIC: %#x PID: %u VNI: %u pending: %u dropped: %u\n", + req, peer->caddr.nic, peer->caddr.pid, peer->caddr.vni, + peer->pending, peer->dropped); + + return ret; +} + +/* + * cxip_send_req_queue() - Queue Send request on TXC. + * + * Place the Send request in an ordered SW queue. Return error if the target + * peer is disabled. + */ +static int cxip_send_req_queue(struct cxip_txc_hpc *txc, struct cxip_req *req) +{ + struct cxip_fc_peer *peer; + + if (!dlist_empty(&txc->fc_peers)) { + peer = cxip_fc_peer_lookup(txc, req->send.caddr); + if (peer) { + /* Peer is disabled. Progress control EQs so future + * cxip_send_req_queue() may succeed. + */ + cxip_ep_ctrl_progress_locked(txc->base.ep_obj); + + return -FI_EAGAIN; + } + } + + dlist_insert_tail(&req->send.txc_entry, &txc->base.msg_queue); + + return FI_SUCCESS; +} + +/* + * cxip_send_req_dequeue() - Dequeue Send request from TXC. + * + * Remove the Send request from the ordered message queue. Update peer + * flow-control state, if necessary. + */ +static int cxip_send_req_dequeue(struct cxip_txc_hpc *txc, struct cxip_req *req) +{ + int ret; + + if (req->send.fc_peer) { + /* The peer was disabled after this message arrived.
*/ + TXC_DBG(txc, + "Send not dropped, req: %p NIC: %#x PID: %u pending: %u dropped: %u\n", + req, req->send.fc_peer->caddr.nic, + req->send.fc_peer->caddr.pid, + req->send.fc_peer->pending, req->send.fc_peer->dropped); + + ret = cxip_fc_peer_put(req->send.fc_peer); + if (ret != FI_SUCCESS) + return ret; + + req->send.fc_peer = NULL; + } + + dlist_remove(&req->send.txc_entry); + + return FI_SUCCESS; +} + +static void cxip_txc_hpc_progress(struct cxip_txc *txc) +{ + cxip_evtq_progress(&txc->tx_evtq); +} + +static int cxip_txc_hpc_cancel_msg_send(struct cxip_req *req) +{ + /* Once command is submitted for HPC we do not cancel */ + return -FI_ENOENT; +} + +static void cxip_txc_hpc_init_struct(struct cxip_txc *txc_base, + struct cxip_ep_obj *ep_obj) +{ + struct cxip_txc_hpc *txc = container_of(txc_base, struct cxip_txc_hpc, + base); + + assert(txc->base.protocol == FI_PROTO_CXI); + txc->base.recv_ptl_idx = CXIP_PTL_IDX_RXQ; + dlist_init(&txc->fc_peers); + txc->max_eager_size = cxip_env.rdzv_threshold + cxip_env.rdzv_get_min; + txc->rdzv_eager_size = cxip_env.rdzv_eager_size; +} + +static void cxip_txc_hpc_fini_struct(struct cxip_txc *txc) +{ +} + +static int cxip_txc_hpc_msg_init(struct cxip_txc *txc_base) +{ + struct cxip_txc_hpc *txc = container_of(txc_base, struct cxip_txc_hpc, + base); + int ret; + + assert(txc->base.protocol == FI_PROTO_CXI); + + /* Protected with ep_obj->lock */ + memset(&txc->rdzv_ids, 0, sizeof(txc->rdzv_ids)); + memset(&txc->msg_rdzv_ids, 0, sizeof(txc->msg_rdzv_ids)); + memset(&txc->tx_ids, 0, sizeof(txc->tx_ids)); + + /* Allocate TGQ for posting source data */ + ret = cxip_ep_cmdq(txc->base.ep_obj, false, FI_TC_UNSPEC, + txc->base.tx_evtq.eq, &txc->rx_cmdq); + if (ret != FI_SUCCESS) { + CXIP_WARN("Unable to allocate TGQ, ret: %d\n", ret); + return -FI_EDOMAIN; + } + + ret = cxip_rdzv_match_pte_alloc(txc, &txc->rdzv_pte); + if (ret) { + CXIP_WARN("Failed to allocate rendezvous PtlTE: %d:%s\n", + ret, fi_strerror(-ret)); + goto err_put_rx_cmdq; + } + + txc->rdzv_proto = cxip_env.rdzv_proto; + CXIP_DBG("TXC RDZV PtlTE enabled: %p proto: %s\n", + txc, cxip_rdzv_proto_to_str(txc->rdzv_proto)); + + return FI_SUCCESS; + +err_put_rx_cmdq: + cxip_ep_cmdq_put(txc->base.ep_obj, false); + + return ret; +} + +static int cxip_txc_hpc_msg_fini(struct cxip_txc *txc_base) +{ + struct cxip_txc_hpc *txc = container_of(txc_base, struct cxip_txc_hpc, + base); + int i; + + assert(txc->base.protocol == FI_PROTO_CXI); + ofi_idx_reset(&txc->rdzv_ids); + ofi_idx_reset(&txc->rdzv_ids); + ofi_idx_reset(&txc->msg_rdzv_ids); + + cxip_rdzv_match_pte_free(txc->rdzv_pte); + + for (i = 0; i < RDZV_NO_MATCH_PTES; i++) { + if (txc->rdzv_nomatch_pte[i]) + cxip_rdzv_nomatch_pte_free(txc->rdzv_nomatch_pte[i]); + } + cxip_ep_cmdq_put(txc->base.ep_obj, false); + + return FI_SUCCESS; +} + +static void cxip_txc_hpc_cleanup(struct cxip_txc *txc_base) +{ + struct cxip_txc_hpc *txc = container_of(txc_base, struct cxip_txc_hpc, + base); + struct cxip_fc_peer *fc_peer; + struct dlist_entry *tmp; + + assert(txc->base.protocol == FI_PROTO_CXI); + dlist_foreach_container_safe(&txc->fc_peers, struct cxip_fc_peer, + fc_peer, txc_entry, tmp) { + dlist_remove(&fc_peer->txc_entry); + free(fc_peer); + } +} + +/* + * cxip_send_common() - Common message send function. Used for tagged and + * untagged sends of all sizes. This includes triggered operations. 
+ */ +static ssize_t +cxip_send_common(struct cxip_txc *txc, uint32_t tclass, const void *buf, + size_t len, void *desc, uint64_t data, fi_addr_t dest_addr, + uint64_t tag, void *context, uint64_t flags, bool tagged, + bool triggered, uint64_t trig_thresh, + struct cxip_cntr *trig_cntr, struct cxip_cntr *comp_cntr) +{ + struct cxip_req *req; + struct cxip_addr caddr; + int ret; + + assert(txc->protocol == FI_PROTO_CXI); + + if (len && !buf) + return -FI_EINVAL; + + if (len > CXIP_EP_MAX_MSG_SZ) + return -FI_EMSGSIZE; + + if (tagged && tag & ~CXIP_TAG_MASK) { + TXC_WARN(txc, "Invalid tag: %#018lx (%#018lx)\n", + tag, CXIP_TAG_MASK); + return -FI_EINVAL; + } + + if (flags & FI_INJECT && len > CXIP_INJECT_SIZE) { + TXC_WARN(txc, "Invalid inject length: %lu\n", len); + return -FI_EMSGSIZE; + } + + ofi_genlock_lock(&txc->ep_obj->lock); + + req = cxip_evtq_req_alloc(&txc->tx_evtq, false, txc); + if (!req) { + TXC_DBG(txc, "Failed to allocate request, return -FI_EAGAIN\n"); + ret = -FI_EAGAIN; + goto unlock; + } + + /* Restrict outstanding success event requests to queue size */ + if (ofi_atomic_get32(&txc->otx_reqs) >= txc->attr.size) { + ret = -FI_EAGAIN; + goto err_req_free; + } + + req->triggered = triggered; + req->trig_thresh = trig_thresh; + req->trig_cntr = trig_cntr; + + /* Save Send parameters to replay */ + req->type = CXIP_REQ_SEND; + req->send.txc = txc; + req->send.tclass = tclass; + + req->send.cntr = triggered ? comp_cntr : txc->send_cntr; + req->send.buf = buf; + req->send.len = len; + req->send.data = data; + req->send.flags = flags; + + /* Set completion parameters */ + req->context = (uint64_t)context; + req->flags = FI_SEND | (flags & (FI_COMPLETION | FI_MATCH_COMPLETE)); + if (tagged) { + req->send.tagged = tagged; + req->send.tag = tag; + req->flags |= FI_TAGGED; + } else { + req->flags |= FI_MSG; + } + + ret = cxip_send_buf_init(req); + if (ret) { + TXC_WARN(txc, "cxip_send_buf_init failed: %d:%s\n", ret, + fi_strerror(-ret)); + goto err_req_free; + } + + /* Look up target CXI address */ + ret = cxip_av_lookup_addr(txc->ep_obj->av, dest_addr, &caddr); + if (ret != FI_SUCCESS) { + TXC_WARN(txc, "Failed to look up FI addr: %d\n", ret); + goto err_req_buf_fini; + } + + if (!txc->ep_obj->av_auth_key) + caddr.vni = txc->ep_obj->auth_key.vni; + + req->send.caddr = caddr; + req->send.dest_addr = dest_addr; + + if (cxip_evtq_saturated(&txc->tx_evtq)) { + TXC_DBG(txc, "TX HW EQ saturated\n"); + ret = -FI_EAGAIN; + goto err_req_buf_fini; + } + + /* Check if target peer is disabled */ + ret = cxip_send_req_queue(req->send.txc_hpc, req); + if (ret != FI_SUCCESS) { + TXC_DBG(txc, "Target peer disabled\n"); + goto err_req_buf_fini; + } + + /* Try Send */ + ret = _cxip_send_req(req); + if (ret != FI_SUCCESS) + goto err_req_dequeue; + + ofi_genlock_unlock(&txc->ep_obj->lock); + + TXC_DBG(txc, + "req: %p buf: %p len: %lu dest_addr: 0x%lX nic: %d pid: %d tag(%c): 0x%lx context %#lx\n", + req, req->send.buf, req->send.len, dest_addr, caddr.nic, + caddr.pid, req->send.tagged ? 
'*' : '-', req->send.tag, + req->context); + + return FI_SUCCESS; + +err_req_dequeue: + cxip_send_req_dequeue(req->send.txc_hpc, req); +err_req_buf_fini: + cxip_send_buf_fini(req); +err_req_free: + cxip_evtq_req_free(req); +unlock: + ofi_genlock_unlock(&txc->ep_obj->lock); + + return ret; +} + +struct cxip_rxc_ops hpc_rxc_ops = { + .recv_common = cxip_recv_common, + .progress = cxip_rxc_hpc_progress, + .recv_req_tgt_event = cxip_rxc_hpc_recv_req_tgt_event, + .cancel_msg_recv = cxip_rxc_hpc_cancel_msg_recv, + .ctrl_msg_cb = cxip_rxc_hpc_ctrl_msg_cb, + .init_struct = cxip_rxc_hpc_init_struct, + .fini_struct = cxip_rxc_hpc_fini_struct, + .cleanup = cxip_rxc_hpc_cleanup, + .msg_init = cxip_rxc_hpc_msg_init, + .msg_fini = cxip_rxc_hpc_msg_fini, +}; + +struct cxip_txc_ops hpc_txc_ops = { + .send_common = cxip_send_common, + .progress = cxip_txc_hpc_progress, + .cancel_msg_send = cxip_txc_hpc_cancel_msg_send, + .init_struct = cxip_txc_hpc_init_struct, + .fini_struct = cxip_txc_hpc_fini_struct, + .cleanup = cxip_txc_hpc_cleanup, + .msg_init = cxip_txc_hpc_msg_init, + .msg_fini = cxip_txc_hpc_msg_fini, +}; diff --git a/prov/cxi/src/cxip_msg_rnr.c b/prov/cxi/src/cxip_msg_rnr.c new file mode 100644 index 00000000000..b5ae1410e7d --- /dev/null +++ b/prov/cxi/src/cxip_msg_rnr.c @@ -0,0 +1,1266 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only + * + * Copyright (c) 2018-2024 Hewlett Packard Enterprise Development LP + */ + +#include "config.h" + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "cxip.h" + +#define CXIP_WARN(...) _CXIP_WARN(FI_LOG_EP_CTRL, __VA_ARGS__) +#define CXIP_DBG(...) _CXIP_DBG(FI_LOG_EP_CTRL, __VA_ARGS__) +#define CXIP_INFO(...) _CXIP_INFO(FI_LOG_EP_CTRL, __VA_ARGS__) + +#define APPEND_LE_FATAL "Receive LE resources exhausted.
Requires use " \ + " of FI_PROTO_CXI endpoint protocol\n" + +static int cxip_rnr_send_cb(struct cxip_req *req, const union c_event *event); + +static void cxip_rnr_recv_pte_cb(struct cxip_pte *pte, + const union c_event *event) +{ + struct cxip_rxc *rxc = (struct cxip_rxc *)pte->ctx; + uint32_t state; + + assert(rxc->protocol == FI_PROTO_CXI_RNR); + + switch (event->hdr.event_type) { + case C_EVENT_STATE_CHANGE: + if (cxi_event_rc(event) != C_RC_OK || + event->tgt_long.ptlte_index != rxc->rx_pte->pte->ptn) + CXIP_FATAL("Failed receive PtlTE state change, %s\n", + cxi_rc_to_str(cxi_event_rc(event))); + + state = event->tgt_long.initiator.state_change.ptlte_state; + + switch (state) { + case C_PTLTE_ENABLED: + assert(rxc->state == RXC_DISABLED); + rxc->state = RXC_ENABLED; + RXC_DBG(rxc, "Receive PtlTE enabled\n"); + break; + case C_PTLTE_DISABLED: + /* Set to disabled before issuing command */ + assert(rxc->state == RXC_DISABLED); + rxc->state = RXC_DISABLED; + RXC_DBG(rxc, "Receive PtlTE disabled\n"); + break; + default: + CXIP_FATAL("Unexpected receive PtlTE state %d\n", + state); + } + break; + + case C_EVENT_COMMAND_FAILURE: + CXIP_FATAL("Command failure: cq=%u target=%u fail_loc=%u " + "cmd_type=%u cmd_size=%u opcode=%u\n", + event->cmd_fail.cq_id, event->cmd_fail.is_target, + event->cmd_fail.fail_loc, + event->cmd_fail.fail_command.cmd_type, + event->cmd_fail.fail_command.cmd_size, + event->cmd_fail.fail_command.opcode); + break; + default: + CXIP_FATAL("Invalid event type: %s\n", cxi_event_to_str(event)); + } +} + +static int cxip_rnr_recv_selective_comp_cb(struct cxip_req *req, + const union c_event *event) +{ + int event_rc; + int ret_err; + + /* When errors happen, send events can occur before the put/get + * event. These events should just be dropped. + */ + if (event->hdr.event_type == C_EVENT_SEND) { + CXIP_WARN("Unexpected %s event: rc=%s\n", + cxi_event_to_str(event), + cxi_rc_to_str(cxi_event_rc(event))); + return FI_SUCCESS; + } + + if (req->recv.tagged) + req->flags = FI_RECV | FI_TAGGED; + else + req->flags = FI_RECV | FI_MSG; + + event_rc = cxi_init_event_rc(event); + ret_err = proverr2errno(event_rc); + + return cxip_cq_req_error(req, 0, ret_err, + cxi_event_rc(event), NULL, 0, + FI_ADDR_UNSPEC); +} + +/* + * cxip_rnr_recv_req() - Submit Receive request to hardware. + */ +static ssize_t cxip_rnr_recv_req(struct cxip_req *req, struct cxip_cntr *cntr, + bool restart_seq) +{ + struct cxip_rxc *rxc = req->recv.rxc; + uint32_t le_flags; + union cxip_match_bits mb = {}; + union cxip_match_bits ib = { + .rnr_cq_data = 1, + }; + int ret; + struct cxip_md *recv_md = req->recv.recv_md; + uint64_t recv_iova = 0; + + if (req->recv.tagged) { + mb.tagged = 1; + mb.rnr_tag = req->recv.tag; + ib.rnr_tag = req->recv.ignore; + } + + if (req->recv.match_id == CXI_MATCH_ID_ANY) + ib.rnr_vni = ~0; + else + mb.rnr_vni = req->recv.vni; + + /* Always set manage_local in Receive LEs. This makes Cassini ignore + * initiator remote_offset in all Puts. With this, remote_offset in Put + * events can be used by the initiator for protocol data. The behavior + * of use_once is not impacted by manage_local. 
+ */ + le_flags = C_LE_EVENT_LINK_DISABLE | C_LE_EVENT_UNLINK_DISABLE | + C_LE_MANAGE_LOCAL | C_LE_UNRESTRICTED_BODY_RO | + C_LE_UNRESTRICTED_END_RO | C_LE_OP_PUT; + + if (req->recv.success_disable) + le_flags |= C_LE_EVENT_SUCCESS_DISABLE; + + /* Note: FI_CXI_CNTR_EVENTS_BYTES == FI_CNTR_EVENTS_BYTES */ + if (cntr) { + le_flags |= C_LE_EVENT_CT_COMM; + + if (cntr->attr.events == FI_CXI_CNTR_EVENTS_BYTES) + le_flags |= C_LE_EVENT_CT_BYTES; + } + + if (!req->recv.multi_recv) + le_flags |= C_LE_USE_ONCE; + if (restart_seq) + le_flags |= C_LE_RESTART_SEQ; + + if (recv_md) + recv_iova = CXI_VA_TO_IOVA(recv_md->md, + (uint64_t)req->recv.recv_buf + + req->recv.start_offset); + + req->recv.hw_offloaded = true; + + /* Issue Append command */ + ret = cxip_pte_append(rxc->rx_pte, recv_iova, + req->recv.ulen - req->recv.start_offset, + recv_md ? recv_md->md->lac : 0, + C_PTL_LIST_PRIORITY, req->req_id, + mb.raw, ib.raw, req->recv.match_id, + req->recv.multi_recv ? + rxc->min_multi_recv : 0, + le_flags, cntr, rxc->rx_cmdq, + !(req->recv.flags & FI_MORE)); + if (ret != FI_SUCCESS) { + RXC_WARN(rxc, "Failed to write Append command: %d\n", ret); + return ret; + } + + return FI_SUCCESS; +} + +/* + * cxip_rnr_recv_cb() - Process user receive buffer events. + * + * For the CS protocol a receive buffer is described by an LE linked to + * the Priority List. Local unexpected message buffering and rendezvous + * messaging are not enabled. + */ +static int cxip_rnr_recv_cb(struct cxip_req *req, const union c_event *event) +{ + struct cxip_rxc_rnr *rxc = req->recv.rxc_rnr; + + switch (event->hdr.event_type) { + case C_EVENT_LINK: + /* Success events are disabled */ + assert(cxi_tgt_event_rc(event) != C_RC_OK); + + /* Failure to link a receive buffer is a fatal operation and + * indicates that FI_PROTO_CXI and portals flow-control is + * required. + */ + RXC_FATAL(rxc, APPEND_LE_FATAL); + break; + + case C_EVENT_UNLINK: + /* If request is for FI_MULTI_RECV and success events are being + * taken (completions required) then cxip_recv_req_report() + * will handle making sure the unlink is not reported prior to + * all messages being reported. + */ + req->recv.unlinked = true; + cxip_recv_req_report(req); + cxip_recv_req_free(req); + + return FI_SUCCESS; + + case C_EVENT_PUT: + cxip_rxc_record_req_stat(&rxc->base, C_PTL_LIST_PRIORITY, + event->tgt_long.rlength, req); + return cxip_complete_put(req, event); + default: + RXC_FATAL(rxc, CXIP_UNEXPECTED_EVENT, + cxi_event_to_str(event), + cxi_rc_to_str(cxi_event_rc(event))); + } +} + +static void cxip_rxc_rnr_progress(struct cxip_rxc *rxc) +{ + cxip_evtq_progress(&rxc->rx_evtq); +} + +static void cxip_rxc_rnr_recv_req_tgt_event(struct cxip_req *req, + const union c_event *event) +{ + union cxip_match_bits mb = { + .raw = event->tgt_long.match_bits + }; + uint32_t init = event->tgt_long.initiator.initiator.process; + + assert(req->recv.rxc->protocol == FI_PROTO_CXI_RNR); + assert(event->hdr.event_type == C_EVENT_PUT); + + req->tag = mb.rnr_tag; + req->recv.initiator = init; + + if (mb.rnr_cq_data) + req->flags |= FI_REMOTE_CQ_DATA; + + req->recv.src_offset = event->tgt_long.remote_offset; + + /* Only need one event to set remaining fields. */ + if (req->recv.tgt_event) + return; + + req->recv.tgt_event = true; + + /* VNI is needed to support FI_AV_AUTH_KEY. */ + req->recv.vni = event->tgt_long.vni; + + /* rlen is used to detect truncation. */ + req->recv.rlen = event->tgt_long.rlength; + + /* RC is used when generating completion events. 
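+ * A return code other than C_RC_OK is surfaced as an error completion
+ * when the receive is reported.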
*/ + req->recv.rc = cxi_tgt_event_rc(event); + + /* Header data is provided in all completion events. */ + req->data = event->tgt_long.header_data; + + /* data_len must be set uniquely for each protocol! */ +} + +static int cxip_rxc_rnr_cancel_msg_recv(struct cxip_req *req) +{ + /* Perform default */ + return cxip_recv_cancel(req); +} + +/* Handle any control messaging callbacks specific to protocol */ +static int cxip_rxc_rnr_ctrl_msg_cb(struct cxip_ctrl_req *req, + const union c_event *event) +{ + /* Placeholder */ + return -FI_ENOSYS; +} + +static void cxip_rxc_rnr_init_struct(struct cxip_rxc *rxc_base, + struct cxip_ep_obj *ep_obj) +{ + struct cxip_rxc_rnr *rxc = container_of(rxc_base, struct cxip_rxc_rnr, + base); + + assert(rxc->base.protocol == FI_PROTO_CXI_RNR); + + /* Supports treating truncation as success */ + rxc->base.trunc_ok = cxip_env.trunc_ok; + + /* Overrides */ + rxc->base.recv_ptl_idx = CXIP_PTL_IDX_RNR_RXQ; +} + +static void cxip_rxc_rnr_fini_struct(struct cxip_rxc *rxc) +{ + /* Placeholder */ +} + +static int cxip_rxc_rnr_msg_init(struct cxip_rxc *rxc_base) +{ + struct cxip_rxc_rnr *rxc = container_of(rxc_base, struct cxip_rxc_rnr, + base); + struct cxi_pt_alloc_opts pt_opts = { + .use_long_event = 1, + .is_matching = 1, + .lossless = cxip_env.msg_lossless, + }; + struct cxip_req *req; + int ret; + + assert(rxc->base.protocol == FI_PROTO_CXI_RNR); + + if (rxc->base.domain->hybrid_mr_desc) { + ret = cxip_recv_req_alloc(&rxc->base, NULL, 0, NULL, &req, + cxip_rnr_recv_selective_comp_cb); + if (ret) { + CXIP_WARN("FI_MSG hybrid req alloc failed\n"); + return -FI_ENOMEM; + } + req->context = (uint64_t)rxc->base.context; + req->flags = FI_MSG | FI_RECV; + req->recv.success_disable = true; + rxc->req_selective_comp_msg = req; + + /* Will not be used */ + dlist_init(&req->recv.children); + dlist_init(&req->recv.rxc_entry); + + /* Selective does not count toward outstanding RX operations */ + ofi_atomic_dec32(&rxc->base.orx_reqs); + + ret = cxip_recv_req_alloc(&rxc->base, NULL, 0, NULL, &req, + cxip_rnr_recv_selective_comp_cb); + if (ret) { + CXIP_WARN("FI_MSG hybrid req alloc failed\n"); + ret = -FI_ENOMEM; + goto free_req_msg; + } + req->context = (uint64_t)rxc->base.context; + req->recv.tagged = true; + req->flags = FI_TAGGED | FI_RECV; + req->recv.success_disable = true; + rxc->req_selective_comp_tag = req; + + /* Will not be used */ + dlist_init(&req->recv.children); + dlist_init(&req->recv.rxc_entry); + + /* Selective does not count toward outstanding RX operations */ + ofi_atomic_dec32(&rxc->base.orx_reqs); + rxc->hybrid_mr_desc = true; + } + + /* If applications AVs are symmetric, use logical FI addresses for + * matching. Otherwise, physical addresses will be used. + */ + if (rxc->base.ep_obj->av->symmetric) { + CXIP_DBG("Using logical PTE matching\n"); + pt_opts.use_logical = 1; + } + + ret = cxip_pte_alloc(rxc->base.ep_obj->ptable, rxc->base.rx_evtq.eq, + rxc->base.recv_ptl_idx, false, &pt_opts, + cxip_rnr_recv_pte_cb, &rxc->base, + &rxc->base.rx_pte); + if (ret != FI_SUCCESS) { + CXIP_WARN("Failed to allocate RX PTE: %d\n", ret); + goto free_req_tag; + } + + /* Start accepting Puts. 
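+ * The PtlTE is transitioned to C_PTLTE_ENABLED and the receive event
+ * queue is progressed until hardware confirms the state change.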
*/ + ret = cxip_pte_set_state(rxc->base.rx_pte, rxc->base.rx_cmdq, + C_PTLTE_ENABLED, 0); + if (ret != FI_SUCCESS) { + CXIP_WARN("cxip_pte_set_state returned: %d\n", ret); + goto free_pte; + } + + /* Wait for PTE state change */ + do { + sched_yield(); + cxip_evtq_progress(&rxc->base.rx_evtq); + } while (rxc->base.rx_pte->state != C_PTLTE_ENABLED); + + return FI_SUCCESS; + +free_pte: + cxip_pte_free(rxc->base.rx_pte); +free_req_tag: + if (rxc->req_selective_comp_tag) { + ofi_atomic_inc32(&rxc->base.orx_reqs); + cxip_recv_req_free(rxc->req_selective_comp_tag); + } +free_req_msg: + if (rxc->req_selective_comp_msg) { + ofi_atomic_inc32(&rxc->base.orx_reqs); + cxip_recv_req_free(rxc->req_selective_comp_msg); + } + + return ret; +} + +static int cxip_rxc_rnr_msg_fini(struct cxip_rxc *rxc_base) +{ + struct cxip_rxc_rnr *rxc = container_of(rxc_base, struct cxip_rxc_rnr, + base); + + assert(rxc->base.protocol == FI_PROTO_CXI_RNR); + + /* Must add selective completion requests RX reference counts + * back before freeing. + */ + if (rxc->req_selective_comp_msg) { + ofi_atomic_inc32(&rxc->base.orx_reqs); + cxip_recv_req_free(rxc->req_selective_comp_msg); + } + if (rxc->req_selective_comp_tag) { + ofi_atomic_inc32(&rxc->base.orx_reqs); + cxip_recv_req_free(rxc->req_selective_comp_tag); + } + + return FI_SUCCESS; +} + +static void cxip_rxc_rnr_cleanup(struct cxip_rxc *rxc_base) +{ + cxip_rxc_recv_req_cleanup(rxc_base); +} + +/* + * cxip_recv_common() - Common message receive function. Used for tagged and + * untagged receives of all sizes. + */ +static ssize_t +cxip_recv_common(struct cxip_rxc *rxc, void *buf, size_t len, void *desc, + fi_addr_t src_addr, uint64_t tag, uint64_t ignore, + void *context, uint64_t flags, bool tagged, + struct cxip_cntr *comp_cntr) +{ + struct cxip_rxc_rnr *rxc_rnr = container_of(rxc, struct cxip_rxc_rnr, + base); + struct cxip_req *req = NULL; + struct cxip_req *recv_req; + struct cxip_mr *mr = rxc_rnr->hybrid_mr_desc ? desc : NULL; + struct cxip_cntr *cntr; + int ret; + uint32_t match_id; + uint16_t vni; + + assert(rxc_rnr->base.protocol == FI_PROTO_CXI_RNR); + +#if ENABLE_DEBUG + if (len && !buf) { + RXC_WARN(rxc, "Length %ld but local buffer NULL\n", len); + return -FI_EINVAL; + } + + if (rxc->state == RXC_DISABLED) + return -FI_EOPBADSTATE; + + if (tagged) { + if (tag & ~CXIP_CS_TAG_MASK || ignore & ~CXIP_CS_TAG_MASK) { + RXC_WARN(rxc, + "Invalid tag: %#018lx ignore: %#018lx (%#018lx)\n", + tag, ignore, CXIP_CS_TAG_MASK); + return -FI_EINVAL; + } + } +#endif + + if (!rxc->selective_completion) + flags |= FI_COMPLETION; + + ret = cxip_set_recv_match_id(rxc, src_addr, rxc->ep_obj->av_auth_key && + (flags & FI_AUTH_KEY), &match_id, &vni); + if (ret) { + RXC_WARN(rxc, "Error setting match_id: %d %s\n", + ret, fi_strerror(-ret)); + return ret; + } + + ofi_genlock_lock(&rxc->ep_obj->lock); + + if (mr && !(flags & (FI_COMPLETION | FI_MULTI_RECV | + FI_PEEK | FI_CLAIM))) { + recv_req = tagged ? rxc_rnr->req_selective_comp_tag : + rxc_rnr->req_selective_comp_msg; + assert(recv_req != NULL); + + recv_req->recv.recv_md = mr->md; + recv_req->recv.hybrid_md = true; + recv_req->recv.recv_buf = buf; + recv_req->recv.ulen = len; + } else { + ret = cxip_recv_req_alloc(rxc, buf, len, mr ? mr->md : NULL, + &req, cxip_rnr_recv_cb); + if (ret) + goto err; + + recv_req = req; + recv_req->context = (uint64_t)context; + recv_req->flags = ((tagged ?
FI_TAGGED : FI_MSG) | FI_RECV | + (flags & FI_COMPLETION)); + recv_req->recv.tagged = tagged; + + /* Can still disable success events if multi-recv and + * completions are not requested since final mandatory unlink + * will cleanup resources. However, if buffer will be auto- + * unlinked take internal completions to handle the + * accounting to ensure all data has landed. + */ + if (flags & FI_MULTI_RECV && !(flags & FI_COMPLETION) && + !rxc->min_multi_recv) + recv_req->recv.success_disable = true; + else + recv_req->recv.success_disable = false; + } + cntr = comp_cntr ? comp_cntr : rxc->recv_cntr; + recv_req->recv.match_id = match_id; + recv_req->recv.vni = vni; + recv_req->recv.tag = tag; + recv_req->recv.ignore = ignore; + recv_req->recv.flags = flags; + recv_req->recv.multi_recv = (flags & FI_MULTI_RECV ? true : false); + + if (!(recv_req->recv.flags & (FI_PEEK | FI_CLAIM))) { + ret = cxip_rnr_recv_req(recv_req, cntr, false); + if (ret) { + RXC_WARN(rxc, "Receive append failed: %d %s\n", + ret, fi_strerror(-ret)); + goto free_req; + } + ofi_genlock_unlock(&rxc->ep_obj->lock); + + RXC_DBG(rxc, + "req: %p buf: %p len: %lu src_addr: %ld tag(%c):" + " 0x%lx ignore: 0x%lx context: %p\n", + recv_req, buf, len, src_addr, tagged ? '*' : '-', tag, + ignore, context); + + return FI_SUCCESS; + } + + /* No buffered unexpected messages, so FI_PEEK always fails */ + if (recv_req->recv.flags & FI_PEEK) { + recv_req->recv.rc = C_RC_NO_MATCH; + cxip_recv_req_peek_complete(recv_req, NULL); + ofi_genlock_unlock(&rxc->ep_obj->lock); + + return FI_SUCCESS; + } + + /* FI_CLAIM specified by itself cannot be valid */ + RXC_WARN(rxc, "FI_CLAIM not valid\n"); + ret = -FI_EINVAL; + +free_req: + if (req) + cxip_recv_req_free(req); +err: + ofi_genlock_unlock(&rxc->ep_obj->lock); + + return ret; +} + +static inline bool cxip_rnr_req_uses_idc(struct cxip_txc_rnr *txc, + ssize_t len, bool triggered) + +{ + /* TODO: Consider supporting HMEM and IDC by mapping memory */ + return !txc->base.hmem && len && len <= CXIP_INJECT_SIZE && + !triggered && !cxip_env.disable_non_inject_msg_idc; +} + +static inline bool cxip_rnr_tx_success_disable(struct cxip_txc_rnr *txc, + struct cxip_mr *mr, + bool idc, uint64_t flags) +{ + /* Success events can be avoided if we do not require local + * memory registration, RNR retries will not be done, and + * a user completion is not requested, + */ + return (mr || idc) && !txc->max_retry_wait_us && + !(flags & FI_COMPLETION); +} + +static int cxip_rnr_send_selective_comp_cb(struct cxip_req *req, + const union c_event *event) +{ + int event_rc; + int ret_err; + + /* When errors happen, send events can occur before the put/get + * event. These events should just be dropped. 
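+ * The remaining event carries the initiator return code, which is
+ * converted to an errno and reported as an error completion.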
+ */ + if (event->hdr.event_type == C_EVENT_SEND) { + CXIP_WARN("Unexpected %s event: rc=%s\n", + cxi_event_to_str(event), + cxi_rc_to_str(cxi_event_rc(event))); + return FI_SUCCESS; + } + + if (req->send.tagged) + req->flags = FI_SEND | FI_TAGGED; + else + req->flags = FI_SEND | FI_MSG; + + event_rc = cxi_init_event_rc(event); + ret_err = proverr2errno(event_rc); + + return cxip_cq_req_error(req, 0, ret_err, + cxi_event_rc(event), NULL, 0, + FI_ADDR_UNSPEC); +} + +static inline ssize_t cxip_rnr_send_dma(struct cxip_req *req, + union cxip_match_bits *mb, + union c_fab_addr *dfa, uint8_t idx_ext) +{ + struct cxip_txc *txc = req->send.txc; + struct c_full_dma_cmd cmd = {}; + + cmd.command.cmd_type = C_CMD_TYPE_DMA; + cmd.command.opcode = C_CMD_PUT; + cmd.index_ext = idx_ext; + cmd.event_send_disable = 1; + cmd.dfa = *dfa; + cmd.eq = cxip_evtq_eqn(&txc->tx_evtq); + cmd.user_ptr = (uint64_t)req; + cmd.initiator = cxip_msg_match_id(txc); + cmd.match_bits = mb->raw; + cmd.header_data = req->send.data; + cmd.event_success_disable = req->send.success_disable; + + /* Triggered ops could result in 0 length DMA */ + if (req->send.send_md) { + cmd.lac = req->send.send_md->md->lac; + cmd.local_addr = CXI_VA_TO_IOVA(req->send.send_md->md, + req->send.buf); + cmd.request_len = req->send.len; + } + + if (req->send.cntr) { + if (req->send.cntr->attr.events == FI_CXI_CNTR_EVENTS_BYTES) + cmd.event_ct_bytes = 1; + + cmd.event_ct_ack = 1; + cmd.ct = req->send.cntr->ct->ctn; + } + + return cxip_txc_emit_dma(txc, req->send.caddr.vni, + cxip_ofi_to_cxi_tc(req->send.tclass), + CXI_TC_TYPE_DEFAULT, + req->triggered ? req->trig_cntr : NULL, + req->trig_thresh, &cmd, req->send.flags); +} + +static inline ssize_t cxip_rnr_send_idc(struct cxip_req *req, + union cxip_match_bits *mb, + union c_fab_addr *dfa, uint8_t idx_ext) +{ + struct cxip_txc *txc = req->send.txc; + struct c_cstate_cmd cstate_cmd = {}; + struct c_idc_msg_hdr idc_cmd; + + assert(req->send.len > 0); + assert(!txc->hmem); + + cstate_cmd.event_send_disable = 1; + cstate_cmd.index_ext = idx_ext; + cstate_cmd.eq = cxip_evtq_eqn(&txc->tx_evtq); + cstate_cmd.initiator = cxip_msg_match_id(txc); + cstate_cmd.event_success_disable = req->send.success_disable; + + if (req->send.cntr) { + if (req->send.cntr->attr.events == FI_CXI_CNTR_EVENTS_BYTES) + cstate_cmd.event_ct_bytes = 1; + + cstate_cmd.event_ct_ack = 1; + cstate_cmd.ct = req->send.cntr->ct->ctn; + } + + /* Note: IDC command completely filled in */ + idc_cmd.unused_0 = 0; + idc_cmd.dfa = *dfa; + idc_cmd.match_bits = mb->raw; + idc_cmd.header_data = req->send.data; + idc_cmd.user_ptr = (uint64_t)req; + + return cxip_txc_emit_idc_msg(txc, req->send.caddr.vni, + cxip_ofi_to_cxi_tc(req->send.tclass), + CXI_TC_TYPE_DEFAULT, &cstate_cmd, &idc_cmd, + req->send.buf, req->send.len, + req->send.flags); +} + +/* Caller must hold ep_obj->lock */ +static ssize_t cxip_rnr_msg_send(struct cxip_req *req) +{ + struct cxip_txc *txc = req->send.txc; + union cxip_match_bits mb = { + .rnr_vni = req->send.caddr.vni, + .rnr_tag = req->send.tag, + .rnr_cq_data = !!(req->send.flags & FI_REMOTE_CQ_DATA), + }; + union c_fab_addr dfa; + uint8_t idx_ext; + ssize_t ret; + bool idc = !req->send.send_md || !req->send.len; + + /* Calculate DFA */ + cxi_build_dfa(req->send.caddr.nic, req->send.caddr.pid, txc->pid_bits, + txc->recv_ptl_idx, &dfa, &idx_ext); + + if (req->send.send_md || !req->send.len) + ret = cxip_rnr_send_dma(req, &mb, &dfa, idx_ext); + else + ret = cxip_rnr_send_idc(req, &mb, &dfa, idx_ext); + if (ret) { + TXC_WARN(txc, 
"Failed to write %s command: %ld\n", + idc ? "IDC" : "DMA", ret); + return ret; + } + + TXC_DBG(txc, "Send %s command submitted for req %p\n", + idc ? "IDC" : "DMA", req); + + return FI_SUCCESS; +} + +/* Queue RNR retry. There are CXIP_NUM_RNR_WAIT_QUEUE, each + * has a consistent time wait for that queue (smaller to larger). + * Therefore, appends to tail will keep each queue in retry time + * order. + * + * Caller must hold ep_obj->lock + */ +static int cxip_rnr_queue_retry(struct cxip_txc_rnr *txc, struct cxip_req *req) +{ + uint64_t cur_time; + uint64_t retry_time; + int index; + + cur_time = ofi_gettime_us(); + + index = req->send.retries < CXIP_NUM_RNR_WAIT_QUEUE ? + req->send.retries : CXIP_NUM_RNR_WAIT_QUEUE - 1; + + /* 1us, 11us, 81us 271us, 641us (max) */ + retry_time = cur_time + 1 + (index * index * index) * 10; +#if 0 + TXC_WARN(txc, "retry_time %ld req->send.max_rnr_time %ld\n", + retry_time, req->send.max_rnr_time); +#endif + if (retry_time > req->send.max_rnr_time) + return -FI_ETIMEDOUT; + + /* Insert and update next timeout */ + req->send.retry_rnr_time = retry_time; + + dlist_insert_tail(&req->send.rnr_entry, &txc->time_wait_queue[index]); + if (retry_time < txc->next_retry_wait_us) + txc->next_retry_wait_us = retry_time; + + req->send.retries++; + ofi_atomic_inc32(&txc->time_wait_reqs); + + TXC_DBG(txc, "Entry added to txc->time_wait_queue[%d]\n", index); +#if 0 + TXC_WARN(txc, + "txc->next_retry_wait_us %ld, req->send.retry_rnr_time %ld\n", + txc->next_retry_wait_us, req->send.retry_rnr_time); +#endif + + return FI_SUCCESS; +} + +static int cxip_process_rnr_time_wait(struct cxip_txc_rnr *txc) +{ + struct cxip_req *req; + struct dlist_entry *tmp; + uint64_t cur_time; + uint64_t next_time; + int index; + int ret; + +#if 0 + TXC_WARN(txc, "Process RNR timewait, wait_reqs %d " + "txc->next_retry_wait_us %ld\n", + ofi_atomic_get32(&txc->time_wait_reqs), + txc->next_retry_wait_us); +#endif + if (!ofi_atomic_get32(&txc->time_wait_reqs)) + return FI_SUCCESS; + + cur_time = ofi_gettime_us(); + if (cur_time < txc->next_retry_wait_us) + return FI_SUCCESS; + + ret = FI_SUCCESS; + for (index = 0; index < CXIP_NUM_RNR_WAIT_QUEUE; index++) { + dlist_foreach_container_safe(&txc->time_wait_queue[index], + struct cxip_req, req, + send.rnr_entry, tmp) { +#if 0 + TXC_WARN(txc, "req %p, req->send.retry_rnr_time " + "%ld cur_time %ld\n", req, + req->send.retry_rnr_time, cur_time); +#endif + if (req->send.retry_rnr_time <= cur_time) { + + /* Do not retry if TX canceled */ + if (req->send.canceled) { + dlist_remove_init(&req->send.rnr_entry); + ofi_atomic_dec32(&txc->time_wait_reqs); + cxip_send_buf_fini(req); + cxip_report_send_completion(req, true); + ofi_atomic_dec32(&txc->base.otx_reqs); + cxip_evtq_req_free(req); + + continue; + } + + /* Must TX return credit, will take it back if + * we could not send. 
+ */ + ofi_atomic_dec32(&txc->base.otx_reqs); + ret = cxip_rnr_msg_send(req); + if (ret != FI_SUCCESS) { + ofi_atomic_inc32(&txc->base.otx_reqs); + goto reset_min_time_wait; + } + + txc->total_retries++; + dlist_remove_init(&req->send.rnr_entry); + ofi_atomic_dec32(&txc->time_wait_reqs); + } else { + break; + } + } + } + +reset_min_time_wait: + next_time = UINT64_MAX; + + for (index = 0; index < CXIP_NUM_RNR_WAIT_QUEUE; index++) { + req = dlist_first_entry_or_null(&txc->time_wait_queue[index], + struct cxip_req, + send.rnr_entry); + if (req && req->send.retry_rnr_time < next_time) + next_time = req->send.retry_rnr_time; + } +#if 0 + TXC_WARN(txc, "Set txc->next_retry_wait_us to %ld\n", next_time); +#endif + txc->next_retry_wait_us = next_time; + + return ret; +} + +static void cxip_txc_rnr_progress(struct cxip_txc *txc_base) +{ + struct cxip_txc_rnr *txc = container_of(txc_base, struct cxip_txc_rnr, + base); + + assert(txc->base.protocol == FI_PROTO_CXI_RNR); + + cxip_evtq_progress(&txc->base.tx_evtq); + cxip_process_rnr_time_wait(txc); +} + +static int cxip_txc_rnr_cancel_msg_send(struct cxip_req *req) +{ + if (req->type != CXIP_REQ_SEND) + return -FI_ENOENT; + + req->send.canceled = true; + + return FI_SUCCESS; +} + +static void cxip_txc_rnr_init_struct(struct cxip_txc *txc_base, + struct cxip_ep_obj *ep_obj) +{ + struct cxip_txc_rnr *txc = container_of(txc_base, struct cxip_txc_rnr, + base); + int i; + + assert(txc->base.protocol == FI_PROTO_CXI_RNR); + + /* Supports treating truncation as success */ + txc->base.trunc_ok = cxip_env.trunc_ok; + + txc->base.recv_ptl_idx = CXIP_PTL_IDX_RNR_RXQ; + ofi_atomic_initialize32(&txc->time_wait_reqs, 0); + txc->max_retry_wait_us = cxip_env.rnr_max_timeout_us; + txc->next_retry_wait_us = UINT64_MAX; + + for (i = 0; i < CXIP_NUM_RNR_WAIT_QUEUE; i++) + dlist_init(&txc->time_wait_queue[i]); +} + +static void cxip_txc_rnr_fini_struct(struct cxip_txc *txc) +{ + /* Placeholder */ +} + +static int cxip_txc_rnr_msg_init(struct cxip_txc *txc_base) +{ + struct cxip_txc_rnr *txc = container_of(txc_base, struct cxip_txc_rnr, + base); + struct cxip_req *req; + + assert(txc->base.protocol == FI_PROTO_CXI_RNR); + + if (txc->base.domain->hybrid_mr_desc) { + req = cxip_evtq_req_alloc(&txc->base.tx_evtq, 0, &txc->base); + if (!req) { + TXC_WARN(txc, "FI_MSG hybrid req alloc failed\n"); + return -FI_ENOMEM; + } + req->type = CXIP_REQ_SEND; + req->cb = cxip_rnr_send_selective_comp_cb; + req->context = (uint64_t)txc->base.context; + req->flags = FI_MSG | FI_SEND; + req->addr = FI_ADDR_UNSPEC; + req->send.success_disable = true; + req->send.txc_rnr = txc; + txc->req_selective_comp_msg = req; + + req = cxip_evtq_req_alloc(&txc->base.tx_evtq, 0, &txc->base); + if (!req) { + TXC_WARN(txc, "FI_TAGGED hybrid req alloc failed\n"); + cxip_evtq_req_free(txc->req_selective_comp_msg); + txc->req_selective_comp_msg = NULL; + return -FI_ENOMEM; + } + req->type = CXIP_REQ_SEND; + req->cb = cxip_rnr_send_selective_comp_cb; + req->context = (uint64_t)txc->base.context; + req->flags = FI_TAGGED | FI_SEND; + req->addr = FI_ADDR_UNSPEC; + req->send.success_disable = true; + req->send.txc_rnr = txc; + txc->req_selective_comp_tag = req; + + txc->hybrid_mr_desc = true; + } + + return FI_SUCCESS; +} + +static int cxip_txc_rnr_msg_fini(struct cxip_txc *txc_base) +{ + struct cxip_txc_rnr *txc = container_of(txc_base, struct cxip_txc_rnr, + base); + + assert(txc->base.protocol == FI_PROTO_CXI_RNR); + + if (txc->req_selective_comp_msg) + cxip_evtq_req_free(txc->req_selective_comp_msg); + if 
(txc->req_selective_comp_tag) + cxip_evtq_req_free(txc->req_selective_comp_tag); + + TXC_INFO(txc, "Total received RNR nacks %ld, TX retries %ld\n", + txc->total_rnr_nacks, txc->total_retries); + + return FI_SUCCESS; +} + +static void cxip_txc_rnr_cleanup(struct cxip_txc *txc_base) +{ + /* Placeholder */ +} + +static void cxip_rnr_send_req_dequeue(struct cxip_req *req) +{ + /* TODO: Place holder for anything additional */ + + dlist_remove(&req->send.txc_entry); +} + +static int cxip_rnr_send_cb(struct cxip_req *req, const union c_event *event) +{ + struct cxip_txc_rnr *txc = req->send.txc_rnr; + int rc = cxi_event_rc(event); + int ret; + +#if 0 + TXC_WARN(txc, "Event %s RC %s received\n", + cxi_event_to_str(event), + cxi_rc_to_str(rc)); +#endif + + /* Handle at TX FI_MSG/FI_TAGGED message events */ + if (event->hdr.event_type != C_EVENT_ACK) { + TXC_WARN(req->send.txc, CXIP_UNEXPECTED_EVENT, + cxi_event_to_str(event), + cxi_rc_to_str(rc)); + return FI_SUCCESS; + } + + req->send.rc = rc; + + /* Handle RNR acks */ + if (rc == C_RC_ENTRY_NOT_FOUND && + txc->base.enabled && !req->send.canceled) { + + txc->total_rnr_nacks++; + ret = cxip_rnr_queue_retry(txc, req); + + if (ret == FI_SUCCESS) + return ret; + + TXC_WARN(&txc->base, "req %p RNR max timeout buf: %p len: %lu, " + "dest_addr: 0x%lX nic: %#x pid: %d tag(%c) 0x%lx " + "retries %u TX outstanding %u\n", req, req->send.buf, + req->send.len, req->send.dest_addr, + req->send.caddr.nic, req->send.caddr.pid, + req->send.tagged ? '*' : '-', + req->send.tag, req->send.retries, + ofi_atomic_get32(&txc->base.otx_reqs)); + } + + cxip_rnr_send_req_dequeue(req); + cxip_send_buf_fini(req); + + /* If status is good, then the request completed before it could + * be canceled. If canceled, indicate software update of the + * error count is required. + */ + if (rc == C_RC_OK) { + req->send.canceled = false; + + /* Report truncation if requested */ + if (txc->base.trunc_ok) { + req->data_len = event->init_short.mlength; + if (req->send.len > req->data_len) + req->flags |= FI_CXI_TRUNC; + } + } + + cxip_report_send_completion(req, req->send.canceled); + + ofi_atomic_dec32(&txc->base.otx_reqs); + cxip_evtq_req_free(req); + + return FI_SUCCESS; +} + +/* + * cxip_send_common() - Common message send function. Used for tagged and + * untagged sends of all sizes. This includes triggered operations. + */ +static ssize_t +cxip_send_common(struct cxip_txc *txc, uint32_t tclass, const void *buf, + size_t len, void *desc, uint64_t data, fi_addr_t dest_addr, + uint64_t tag, void *context, uint64_t flags, bool tagged, + bool triggered, uint64_t trig_thresh, + struct cxip_cntr *trig_cntr, struct cxip_cntr *comp_cntr) +{ + struct cxip_txc_rnr *txc_rnr = container_of(txc, struct cxip_txc_rnr, + base); + struct cxip_mr *mr = txc->domain->hybrid_mr_desc ? desc : NULL; + struct cxip_req *req = NULL; + struct cxip_req *send_req; + struct cxip_addr caddr; + int ret; + bool idc; + + assert(txc->protocol == FI_PROTO_CXI_RNR); + +#if ENABLE_DEBUG + if (len && !buf) { + TXC_WARN(txc, "Length %ld but source buffer NULL\n", len); + return -FI_EINVAL; + } + if (flags & FI_INJECT) { + TXC_WARN(txc, "FI_INJECT not supported\n"); + return -FI_EINVAL; + } +#endif + /* TODO: This check should not be required in other than debug builds, + * to do that we would need to return -FI_EFAULT, so leaving here for + * now. 
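+ * The length check below is always compiled in and fails oversized
+ * sends with -FI_EMSGSIZE.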
+ */ + if (len > CXIP_EP_MAX_MSG_SZ) + return -FI_EMSGSIZE; + + /* TODO: Move to tagged sends */ + if (tagged && tag & ~CXIP_CS_TAG_MASK) { + TXC_WARN(txc, "Invalid tag: %#018lx (%#018lx)\n", + tag, CXIP_CS_TAG_MASK); + return -FI_EINVAL; + } + + ofi_genlock_lock(&txc->ep_obj->lock); + /* If RNR list is not empty, check if the first retry entry time + * wait has expired, and if so force progress to initiate any + * read retry/retries. + */ + if (txc_rnr->next_retry_wait_us != UINT64_MAX && + ofi_atomic_get32(&txc_rnr->time_wait_reqs)) { + if (ofi_gettime_us() >= txc_rnr->next_retry_wait_us) + cxip_txc_rnr_progress(txc); + } + + idc = cxip_rnr_req_uses_idc(txc_rnr, len, triggered); + + if (cxip_rnr_tx_success_disable(txc_rnr, mr, idc, flags)) { + /* This request cannot be retried, we use the common request + * to pass parameters to the send function. This is done + * with exclusive access to the request. + */ + send_req = tagged ? txc_rnr->req_selective_comp_tag : + txc_rnr->req_selective_comp_msg; + send_req->send.send_md = NULL; + send_req->send.hybrid_md = false; + } else { + req = cxip_evtq_req_alloc(&txc->tx_evtq, false, txc); + if (!req) { + TXC_DBG(txc, + "Failed to allocate request, ret -FI_EAGAIN\n"); + ret = -FI_EAGAIN; + goto unlock; + } + send_req = req; + send_req->cb = cxip_rnr_send_cb; + send_req->type = CXIP_REQ_SEND; + send_req->send.txc = txc; + send_req->context = (uint64_t)context; + send_req->flags = FI_SEND | + (flags & (FI_COMPLETION | FI_MATCH_COMPLETE)); + send_req->send.success_disable = false; + } + + /* Restrict outstanding success event requests to queue size */ + if (ofi_atomic_get32(&txc->otx_reqs) > txc->attr.size) { + ret = -FI_EAGAIN; + goto free_req; + } + + send_req->triggered = triggered; + send_req->trig_thresh = trig_thresh; + send_req->trig_cntr = trig_cntr; + + /* Save Send parameters to replay */ + send_req->send.tclass = tclass; + send_req->send.cntr = triggered ? 
comp_cntr : txc->send_cntr; + send_req->send.buf = buf; + send_req->send.len = len; + send_req->send.data = data; + send_req->send.flags = flags; + /* Set completion parameters */ + if (tagged) { + send_req->send.tagged = tagged; + send_req->send.tag = tag; + send_req->flags |= FI_TAGGED; + } else { + send_req->flags |= FI_MSG; + } + + if (send_req->send.len && !idc) { + if (!mr) { + ret = cxip_map(txc->domain, send_req->send.buf, + send_req->send.len, 0, + &send_req->send.send_md); + if (ret) { + TXC_WARN(txc, + "Local buffer map failed: %d %s\n", + ret, fi_strerror(-ret)); + goto free_req; + } + } else { + send_req->send.send_md = mr->md; + send_req->send.hybrid_md = true; + } + } + + /* Look up target CXI address */ + ret = cxip_av_lookup_addr(txc->ep_obj->av, dest_addr, &caddr); + if (ret != FI_SUCCESS) { + TXC_WARN(txc, "Failed to look up FI addr: %d %s\n", + ret, fi_strerror(-ret)); + goto free_map; + } + + if (!txc->ep_obj->av_auth_key) + caddr.vni = txc->ep_obj->auth_key.vni; + + send_req->send.caddr = caddr; + send_req->send.dest_addr = dest_addr; + + if (cxip_evtq_saturated(&txc->tx_evtq)) { + TXC_DBG(txc, "TX HW EQ saturated\n"); + ret = -FI_EAGAIN; + goto free_map; + } + + dlist_insert_tail(&send_req->send.txc_entry, &txc->msg_queue); + send_req->send.max_rnr_time = ofi_gettime_us() + + txc_rnr->max_retry_wait_us; + + ret = cxip_rnr_msg_send(send_req); + if (ret != FI_SUCCESS) + goto req_dequeue; + + TXC_DBG(txc, + "req: %p buf: %p len: %lu dest_addr: 0x%lX nic: %d " + "pid: %d tag(%c): 0x%lx context %#lx\n", + send_req, send_req->send.buf, send_req->send.len, dest_addr, + caddr.nic, caddr.pid, send_req->send.tagged ? '*' : '-', + send_req->send.tag, send_req->context); + ofi_genlock_unlock(&txc->ep_obj->lock); + + return FI_SUCCESS; + +req_dequeue: + cxip_rnr_send_req_dequeue(send_req); +free_map: + if (send_req->send.send_md && !send_req->send.hybrid_md) + cxip_unmap(send_req->send.send_md); +free_req: + if (req) + cxip_evtq_req_free(req); +unlock: + ofi_genlock_unlock(&txc->ep_obj->lock); + + return ret; +} + +struct cxip_rxc_ops rnr_rxc_ops = { + .recv_common = cxip_recv_common, + .progress = cxip_rxc_rnr_progress, + .recv_req_tgt_event = cxip_rxc_rnr_recv_req_tgt_event, + .cancel_msg_recv = cxip_rxc_rnr_cancel_msg_recv, + .ctrl_msg_cb = cxip_rxc_rnr_ctrl_msg_cb, + .init_struct = cxip_rxc_rnr_init_struct, + .fini_struct = cxip_rxc_rnr_fini_struct, + .cleanup = cxip_rxc_rnr_cleanup, + .msg_init = cxip_rxc_rnr_msg_init, + .msg_fini = cxip_rxc_rnr_msg_fini, +}; + +struct cxip_txc_ops rnr_txc_ops = { + .send_common = cxip_send_common, + .progress = cxip_txc_rnr_progress, + .cancel_msg_send = cxip_txc_rnr_cancel_msg_send, + .init_struct = cxip_txc_rnr_init_struct, + .fini_struct = cxip_txc_rnr_fini_struct, + .cleanup = cxip_txc_rnr_cleanup, + .msg_init = cxip_txc_rnr_msg_init, + .msg_fini = cxip_txc_rnr_msg_fini, +}; diff --git a/prov/cxi/src/cxip_ptelist_buf.c b/prov/cxi/src/cxip_ptelist_buf.c index bfeaddb058c..b8ee08a3733 100644 --- a/prov/cxi/src/cxip_ptelist_buf.c +++ b/prov/cxi/src/cxip_ptelist_buf.c @@ -20,11 +20,11 @@ cxip_ptelist_to_str(struct cxip_ptelist_bufpool *pool) static int cxip_ptelist_unlink_buf(struct cxip_ptelist_buf *buf) { - struct cxip_rxc *rxc = buf->rxc; + struct cxip_rxc_hpc *rxc = buf->rxc; int ret; - ret = cxip_pte_unlink(rxc->rx_pte, buf->pool->attr.list_type, - buf->req->req_id, rxc->rx_cmdq); + ret = cxip_pte_unlink(rxc->base.rx_pte, buf->pool->attr.list_type, + buf->req->req_id, rxc->base.rx_cmdq); if (ret) RXC_DBG(rxc, "Failed to write command 
%d %s\n", ret, fi_strerror(-ret)); @@ -35,7 +35,7 @@ static int cxip_ptelist_unlink_buf(struct cxip_ptelist_buf *buf) static int cxip_ptelist_link_buf(struct cxip_ptelist_buf *buf, bool seq_restart) { - struct cxip_rxc *rxc = buf->rxc; + struct cxip_rxc_hpc *rxc = buf->rxc; uint32_t le_flags = C_LE_MANAGE_LOCAL | C_LE_NO_TRUNCATE | C_LE_UNRESTRICTED_BODY_RO | C_LE_OP_PUT | C_LE_UNRESTRICTED_END_RO | C_LE_EVENT_UNLINK_DISABLE; @@ -72,14 +72,14 @@ static int cxip_ptelist_link_buf(struct cxip_ptelist_buf *buf, buf->cur_offset = 0; /* Take a request buffer reference for the link. */ - ret = cxip_pte_append(rxc->rx_pte, + ret = cxip_pte_append(rxc->base.rx_pte, CXI_VA_TO_IOVA(buf->md->md, buf->data), buf->pool->attr.buf_size, buf->md->md->lac, buf->pool->attr.list_type, buf->req->req_id, mb.raw, ib.raw, CXI_MATCH_ID_ANY, buf->pool->attr.min_space_avail, - le_flags, NULL, rxc->rx_cmdq, true); + le_flags, NULL, rxc->base.rx_cmdq, true); if (ret) { RXC_WARN(rxc, "Failed to write %s append %d %s\n", cxip_ptelist_to_str(buf->pool), @@ -110,7 +110,7 @@ static int cxip_ptelist_link_buf(struct cxip_ptelist_buf *buf, static struct cxip_ptelist_buf* cxip_ptelist_buf_alloc(struct cxip_ptelist_bufpool *pool) { - struct cxip_rxc *rxc = pool->rxc; + struct cxip_rxc_hpc *rxc = pool->rxc; struct cxip_ptelist_buf *buf; int ret; @@ -122,7 +122,7 @@ cxip_ptelist_buf_alloc(struct cxip_ptelist_bufpool *pool) if (!buf->data) goto err_free_buf; - if (rxc->hmem && !cxip_env.disable_host_register) { + if (rxc->base.hmem && !cxip_env.disable_host_register) { ret = ofi_hmem_host_register(buf->data, pool->attr.buf_size); if (ret) { RXC_WARN(rxc, @@ -132,12 +132,12 @@ cxip_ptelist_buf_alloc(struct cxip_ptelist_bufpool *pool) } } - ret = cxip_map(rxc->domain, buf->data, pool->attr.buf_size, + ret = cxip_map(rxc->base.domain, buf->data, pool->attr.buf_size, OFI_MR_NOCACHE, &buf->md); if (ret) goto err_unreg_buf; - buf->req = cxip_evtq_req_alloc(&rxc->rx_evtq, true, buf); + buf->req = cxip_evtq_req_alloc(&rxc->base.rx_evtq, true, buf); if (!buf->req) goto err_unmap_buf; @@ -165,7 +165,7 @@ cxip_ptelist_buf_alloc(struct cxip_ptelist_bufpool *pool) err_unmap_buf: cxip_unmap(buf->md); err_unreg_buf: - if (rxc->hmem && !cxip_env.disable_host_register) + if (rxc->base.hmem && !cxip_env.disable_host_register) ofi_hmem_host_unregister(buf); err_free_data_buf: free(buf->data); @@ -179,7 +179,7 @@ static void cxip_ptelist_buf_free(struct cxip_ptelist_buf *buf) { struct cxip_ux_send *ux; struct dlist_entry *tmp; - struct cxip_rxc *rxc = buf->rxc; + struct cxip_rxc_hpc *rxc = buf->rxc; /* Sanity check making sure the buffer was properly removed before * freeing. 
@@ -201,7 +201,7 @@ static void cxip_ptelist_buf_free(struct cxip_ptelist_buf *buf) ofi_atomic_get32(&buf->refcount)); cxip_evtq_req_free(buf->req); cxip_unmap(buf->md); - if (rxc->hmem && !cxip_env.disable_host_register) + if (rxc->base.hmem && !cxip_env.disable_host_register) ofi_hmem_host_unregister(buf->data); ofi_atomic_dec32(&buf->pool->bufs_allocated); @@ -228,7 +228,7 @@ static void cxip_ptelist_buf_dlist_free(struct dlist_entry *head) void cxip_ptelist_buf_link_err(struct cxip_ptelist_buf *buf, int rc_link_error) { - struct cxip_rxc *rxc = buf->pool->rxc; + struct cxip_rxc_hpc *rxc = buf->pool->rxc; RXC_WARN(rxc, "%s buffer %p link error %s\n", cxip_ptelist_to_str(buf->pool), @@ -258,7 +258,7 @@ void cxip_ptelist_buf_unlink(struct cxip_ptelist_buf *buf) RXC_DBG(pool->rxc, "%s buffer unlink\n", cxip_ptelist_to_str(pool)); } -int cxip_ptelist_bufpool_init(struct cxip_rxc *rxc, +int cxip_ptelist_bufpool_init(struct cxip_rxc_hpc *rxc, struct cxip_ptelist_bufpool **pool, struct cxip_ptelist_bufpool_attr *attr) { @@ -331,11 +331,11 @@ int cxip_ptelist_bufpool_init(struct cxip_rxc *rxc, void cxip_ptelist_bufpool_fini(struct cxip_ptelist_bufpool *pool) { - struct cxip_rxc *rxc = pool->rxc; + struct cxip_rxc_hpc *rxc = pool->rxc; struct cxip_ptelist_buf *buf; int ret; - assert(rxc->rx_pte->state == C_PTLTE_DISABLED); + assert(rxc->base.rx_pte->state == C_PTLTE_DISABLED); RXC_INFO(rxc, "Number of %s buffers allocated %d\n", cxip_ptelist_to_str(pool), @@ -349,13 +349,13 @@ void cxip_ptelist_bufpool_fini(struct cxip_ptelist_bufpool *pool) ret = cxip_ptelist_unlink_buf(buf); if (ret != FI_SUCCESS) CXIP_FATAL("PtlTE %d failed to unlink %s buf %d %s\n", - rxc->rx_pte->pte->ptn, + rxc->base.rx_pte->pte->ptn, cxip_ptelist_to_str(pool), ret, fi_strerror(-ret)); } do { - cxip_evtq_progress(&rxc->rx_evtq); + cxip_evtq_progress(&rxc->base.rx_evtq); } while (ofi_atomic_get32(&pool->bufs_linked)); cxip_ptelist_buf_dlist_free(&pool->active_bufs); @@ -377,12 +377,13 @@ void cxip_ptelist_bufpool_fini(struct cxip_ptelist_bufpool *pool) int cxip_ptelist_buf_replenish(struct cxip_ptelist_bufpool *pool, bool seq_restart) { - struct cxip_rxc *rxc = pool->rxc; + struct cxip_rxc_hpc *rxc = pool->rxc; struct cxip_ptelist_buf *buf; int bufs_added = 0; int ret = FI_SUCCESS; - if (rxc->msg_offload && pool->attr.list_type == C_PTL_LIST_REQUEST) + if (rxc->base.msg_offload && + pool->attr.list_type == C_PTL_LIST_REQUEST) return FI_SUCCESS; while (ofi_atomic_get32(&pool->bufs_linked) < pool->attr.min_posted) { @@ -474,11 +475,12 @@ void cxip_ptelist_buf_put(struct cxip_ptelist_buf *buf, bool repost) * in hardware RX match mode. */ if (buf->pool->attr.list_type == C_PTL_LIST_OVERFLOW && - (!buf->rxc->msg_offload || buf->rxc->state != RXC_ENABLED)) + (!buf->rxc->base.msg_offload || + buf->rxc->base.state != RXC_ENABLED)) goto free_buf; if (buf->pool->attr.list_type == C_PTL_LIST_REQUEST && - buf->rxc->state != RXC_ENABLED_SOFTWARE) + buf->rxc->base.state != RXC_ENABLED_SOFTWARE) goto skip_repost; /* Limit immediate repost if already sufficient */ diff --git a/prov/cxi/src/cxip_rdzv_pte.c b/prov/cxi/src/cxip_rdzv_pte.c index a0af4ea10ce..d99bda07f5c 100644 --- a/prov/cxi/src/cxip_rdzv_pte.c +++ b/prov/cxi/src/cxip_rdzv_pte.c @@ -37,7 +37,7 @@ static int cxip_rdzv_pte_wait_append(struct cxip_rdzv_pte *pte, /* Poll until the LE is linked or a failure occurs. 
*/ do { - cxip_evtq_progress(&pte->txc->tx_evtq); + cxip_evtq_progress(&pte->txc->base.tx_evtq); sched_yield(); } while (!cxip_rdzv_pte_append_done(pte, expected_count)); @@ -86,7 +86,7 @@ int cxip_rdzv_pte_src_req_alloc(struct cxip_rdzv_match_pte *pte, int lac) le_flags = C_LE_UNRESTRICTED_BODY_RO | C_LE_UNRESTRICTED_END_RO | C_LE_OP_GET | C_LE_EVENT_UNLINK_DISABLE; - req = cxip_evtq_req_alloc(&base->txc->tx_evtq, 1, base); + req = cxip_evtq_req_alloc(&base->txc->base.tx_evtq, 1, base); if (!req) { ret = -FI_EAGAIN; CXIP_WARN("Failed to allocate %d rendezvous source request: %d:%s\n", @@ -151,7 +151,7 @@ static int cxip_rdzv_pte_zbp_req_alloc(struct cxip_rdzv_match_pte *pte) int ret; int expected_success_count; - pte->zbp_req = cxip_evtq_req_alloc(&base->txc->tx_evtq, 1, pte); + pte->zbp_req = cxip_evtq_req_alloc(&base->txc->base.tx_evtq, 1, pte); if (!pte->zbp_req) { ret = -FI_ENOMEM; CXIP_WARN("Failed to allocate zero byte put request: %d:%s\n", @@ -196,7 +196,7 @@ static void cxip_rdzv_pte_free(struct cxip_rdzv_pte *pte) /* Flush the CQ to ensure any events referencing the rendezvous requests * are processed. */ - cxip_evtq_progress(&pte->txc->tx_evtq); + cxip_evtq_progress(&pte->txc->base.tx_evtq); } void cxip_rdzv_match_pte_free(struct cxip_rdzv_match_pte *pte) @@ -218,7 +218,7 @@ void cxip_rdzv_nomatch_pte_free(struct cxip_rdzv_nomatch_pte *pte) free(pte); } -static int cxip_rdzv_base_pte_alloc(struct cxip_txc *txc, +static int cxip_rdzv_base_pte_alloc(struct cxip_txc_hpc *txc, uint32_t write_pid_idx, bool write, uint32_t read_pid_idx, bool read, bool matching, @@ -233,11 +233,12 @@ static int cxip_rdzv_base_pte_alloc(struct cxip_txc *txc, ofi_atomic_initialize32(&base_pte->le_linked_success_count, 0); ofi_atomic_initialize32(&base_pte->le_linked_failure_count, 0); - if (matching && txc->ep_obj->av->symmetric) + if (matching && txc->base.ep_obj->av->symmetric) pt_opts.use_logical = 1; /* Reserve the Rendezvous Send PTE */ - ret = cxip_pte_alloc_nomap(txc->ep_obj->ptable, txc->tx_evtq.eq, + ret = cxip_pte_alloc_nomap(txc->base.ep_obj->ptable, + txc->base.tx_evtq.eq, &pt_opts, cxip_rdzv_pte_cb, txc, &base_pte->pte); if (ret != FI_SUCCESS) { @@ -281,12 +282,12 @@ static int cxip_rdzv_base_pte_alloc(struct cxip_txc *txc, } /* ep_obj->lock should be held by caller */ -int cxip_rdzv_match_pte_alloc(struct cxip_txc *txc, +int cxip_rdzv_match_pte_alloc(struct cxip_txc_hpc *txc, struct cxip_rdzv_match_pte **rdzv_pte) { int ret; struct cxip_rdzv_match_pte *match_pte; - uint32_t pid_idx = txc->domain->iface->dev->info.rdzv_get_idx; + uint32_t pid_idx = txc->base.domain->iface->dev->info.rdzv_get_idx; struct cxip_rdzv_pte *base; match_pte = calloc(1, sizeof(*match_pte)); @@ -325,7 +326,7 @@ int cxip_rdzv_match_pte_alloc(struct cxip_txc *txc, } /* ep_obj->lock should be held by caller */ -int cxip_rdzv_nomatch_pte_alloc(struct cxip_txc *txc, int lac, +int cxip_rdzv_nomatch_pte_alloc(struct cxip_txc_hpc *txc, int lac, struct cxip_rdzv_nomatch_pte **rdzv_pte) { int ret; @@ -359,7 +360,7 @@ int cxip_rdzv_nomatch_pte_alloc(struct cxip_txc *txc, int lac, /* Non-matching specific initialization */ base = &nomatch_pte->base_pte; - nomatch_pte->le_req = cxip_evtq_req_alloc(&txc->tx_evtq, 1, + nomatch_pte->le_req = cxip_evtq_req_alloc(&txc->base.tx_evtq, 1, nomatch_pte); if (!nomatch_pte->le_req) { ret = -FI_EAGAIN; diff --git a/prov/cxi/src/cxip_req_buf.c b/prov/cxi/src/cxip_req_buf.c index 84c72cd3488..4a4624c59b7 100644 --- a/prov/cxi/src/cxip_req_buf.c +++ b/prov/cxi/src/cxip_req_buf.c @@ -87,7 +87,7 
@@ static struct cxip_ux_send *cxip_req_buf_ux_alloc(struct cxip_ptelist_buf *buf, static int cxip_req_buf_process_ux(struct cxip_ptelist_buf *buf, struct cxip_ux_send *ux) { - struct cxip_rxc *rxc = buf->rxc; + struct cxip_rxc_hpc *rxc = buf->rxc; size_t header_length; uint64_t remote_offset; int ret; @@ -139,8 +139,8 @@ static int cxip_req_buf_process_ux(struct cxip_ptelist_buf *buf, * will be appended to software UX message list following * completion of the on-loading. */ - if (rxc->state != RXC_ENABLED_SOFTWARE && - rxc->state != RXC_FLOW_CONTROL) { + if (rxc->base.state != RXC_ENABLED_SOFTWARE && + rxc->base.state != RXC_FLOW_CONTROL) { rxc->sw_ux_list_len--; dlist_insert_tail(&ux->rxc_entry, &rxc->sw_pending_ux_list); @@ -220,7 +220,7 @@ static int cxip_req_buf_process_put_event(struct cxip_ptelist_buf *buf, { struct cxip_ux_send *ux; int ret = FI_SUCCESS; - struct cxip_rxc *rxc = buf->rxc; + struct cxip_rxc_hpc *rxc = buf->rxc; struct cxip_ptelist_bufpool *pool = buf->pool; assert(event->tgt_long.mlength >= CXIP_REQ_BUF_HEADER_MIN_SIZE); @@ -298,7 +298,7 @@ static int cxip_req_buf_cb(struct cxip_req *req, const union c_event *event) } } -int cxip_req_bufpool_init(struct cxip_rxc *rxc) +int cxip_req_bufpool_init(struct cxip_rxc_hpc *rxc) { struct cxip_ptelist_bufpool_attr attr = { .list_type = C_PTL_LIST_REQUEST, @@ -315,7 +315,7 @@ int cxip_req_bufpool_init(struct cxip_rxc *rxc) return cxip_ptelist_bufpool_init(rxc, &rxc->req_list_bufpool, &attr); } -void cxip_req_bufpool_fini(struct cxip_rxc *rxc) +void cxip_req_bufpool_fini(struct cxip_rxc_hpc *rxc) { return cxip_ptelist_bufpool_fini(rxc->req_list_bufpool); } diff --git a/prov/cxi/src/cxip_rma.c b/prov/cxi/src/cxip_rma.c index 25877a73b04..9c36addddd3 100644 --- a/prov/cxi/src/cxip_rma.c +++ b/prov/cxi/src/cxip_rma.c @@ -503,6 +503,10 @@ static bool cxip_rma_is_idc(struct cxip_txc *txc, uint64_t key, size_t len, { size_t max_idc_size = unr ? CXIP_INJECT_SIZE : C_MAX_IDC_PAYLOAD_RES; + /* DISABLE_NON_INJECT_MSG_IDC disables the IDC + */ + if (cxip_env.disable_non_inject_msg_idc) + return false; /* IDC commands are not supported for unoptimized MR since the IDC * small message format does not support remote offset which is needed * for RMA commands. @@ -650,7 +654,7 @@ static ssize_t cxip_rma_write(struct fid_ep *fid_ep, const void *buf, { struct cxip_ep *ep = container_of(fid_ep, struct cxip_ep, ep); - return cxip_rma_common(FI_OP_WRITE, &ep->ep_obj->txc, buf, len, desc, + return cxip_rma_common(FI_OP_WRITE, ep->ep_obj->txc, buf, len, desc, dest_addr, addr, key, 0, ep->tx_attr.op_flags, ep->tx_attr.tclass, ep->tx_attr.msg_order, context, false, 0, NULL, NULL); @@ -674,11 +678,11 @@ static ssize_t cxip_rma_writev(struct fid_ep *fid_ep, const struct iovec *iov, buf = iov[0].iov_base; mr_desc = desc ? 
desc[0] : NULL; } else { - TXC_WARN(&ep->ep_obj->txc, "Invalid IOV\n"); + TXC_WARN(ep->ep_obj->txc, "Invalid IOV\n"); return -FI_EINVAL; } - return cxip_rma_common(FI_OP_WRITE, &ep->ep_obj->txc, buf, len, + return cxip_rma_common(FI_OP_WRITE, ep->ep_obj->txc, buf, len, mr_desc, dest_addr, addr, key, 0, ep->tx_attr.op_flags, ep->tx_attr.tclass, ep->tx_attr.msg_order, context, false, 0, NULL, @@ -689,7 +693,7 @@ static ssize_t cxip_rma_writemsg(struct fid_ep *fid_ep, const struct fi_msg_rma *msg, uint64_t flags) { struct cxip_ep *ep = container_of(fid_ep, struct cxip_ep, ep); - struct cxip_txc *txc = &ep->ep_obj->txc; + struct cxip_txc *txc = ep->ep_obj->txc; size_t len; const void *buf; void *mr_desc; @@ -713,7 +717,7 @@ static ssize_t cxip_rma_writemsg(struct fid_ep *fid_ep, buf = msg->msg_iov[0].iov_base; mr_desc = msg->desc ? msg->desc[0] : NULL; } else { - TXC_WARN(&ep->ep_obj->txc, "Invalid IOV\n"); + TXC_WARN(ep->ep_obj->txc, "Invalid IOV\n"); return -FI_EINVAL; } @@ -742,7 +746,7 @@ ssize_t cxip_rma_inject(struct fid_ep *fid_ep, const void *buf, size_t len, { struct cxip_ep *ep = container_of(fid_ep, struct cxip_ep, ep); - return cxip_rma_common(FI_OP_WRITE, &ep->ep_obj->txc, buf, len, NULL, + return cxip_rma_common(FI_OP_WRITE, ep->ep_obj->txc, buf, len, NULL, dest_addr, addr, key, 0, FI_INJECT, ep->tx_attr.tclass, ep->tx_attr.msg_order, NULL, false, 0, NULL, NULL); @@ -754,7 +758,7 @@ static ssize_t cxip_rma_read(struct fid_ep *fid_ep, void *buf, size_t len, { struct cxip_ep *ep = container_of(fid_ep, struct cxip_ep, ep); - return cxip_rma_common(FI_OP_READ, &ep->ep_obj->txc, buf, len, desc, + return cxip_rma_common(FI_OP_READ, ep->ep_obj->txc, buf, len, desc, src_addr, addr, key, 0, ep->tx_attr.op_flags, ep->tx_attr.tclass, ep->tx_attr.msg_order, context, false, 0, NULL, NULL); @@ -778,11 +782,11 @@ static ssize_t cxip_rma_readv(struct fid_ep *fid_ep, const struct iovec *iov, buf = iov[0].iov_base; mr_desc = desc ? desc[0] : NULL; } else { - TXC_WARN(&ep->ep_obj->txc, "Invalid IOV\n"); + TXC_WARN(ep->ep_obj->txc, "Invalid IOV\n"); return -FI_EINVAL; } - return cxip_rma_common(FI_OP_READ, &ep->ep_obj->txc, buf, len, mr_desc, + return cxip_rma_common(FI_OP_READ, ep->ep_obj->txc, buf, len, mr_desc, src_addr, addr, key, 0, ep->tx_attr.op_flags, ep->tx_attr.tclass, ep->tx_attr.msg_order, context, false, 0, NULL, NULL); @@ -792,7 +796,7 @@ static ssize_t cxip_rma_readmsg(struct fid_ep *fid_ep, const struct fi_msg_rma *msg, uint64_t flags) { struct cxip_ep *ep = container_of(fid_ep, struct cxip_ep, ep); - struct cxip_txc *txc = &ep->ep_obj->txc; + struct cxip_txc *txc = ep->ep_obj->txc; size_t len; const void *buf; void *mr_desc; @@ -816,7 +820,7 @@ static ssize_t cxip_rma_readmsg(struct fid_ep *fid_ep, buf = msg->msg_iov[0].iov_base; mr_desc = msg->desc ? msg->desc[0] : NULL; } else { - TXC_WARN(&ep->ep_obj->txc, "Invalid IOV\n"); + TXC_WARN(ep->ep_obj->txc, "Invalid IOV\n"); return -FI_EINVAL; } diff --git a/prov/cxi/src/cxip_rxc.c b/prov/cxi/src/cxip_rxc.c index 3fce655a6d7..cdcaed39a59 100644 --- a/prov/cxi/src/cxip_rxc.c +++ b/prov/cxi/src/cxip_rxc.c @@ -18,9 +18,8 @@ #define CXIP_WARN(...) _CXIP_WARN(FI_LOG_EP_CTRL, __VA_ARGS__) #define CXIP_INFO(...) 
_CXIP_INFO(FI_LOG_EP_CTRL, __VA_ARGS__) -#define CXIP_SC_STATS "FC/SC stats - EQ full: %d append fail: %d no match: %d"\ - " request full: %d unexpected: %d, NIC HW2SW unexp: %d"\ - " NIC HW2SW append fail: %d\n" +extern struct cxip_rxc_ops hpc_rxc_ops; +extern struct cxip_rxc_ops rnr_rxc_ops; /* * cxip_rxc_msg_enable() - Enable RXC messaging. @@ -31,7 +30,7 @@ * * Caller must hold ep_obj->lock. */ -int cxip_rxc_msg_enable(struct cxip_rxc *rxc, uint32_t drop_count) +int cxip_rxc_msg_enable(struct cxip_rxc_hpc *rxc, uint32_t drop_count) { int ret; @@ -39,15 +38,15 @@ int cxip_rxc_msg_enable(struct cxip_rxc *rxc, uint32_t drop_count) * synchronous call is used which handles drop count mismatches. */ if (rxc->new_state == RXC_ENABLED_SOFTWARE) { - ret = cxil_pte_transition_sm(rxc->rx_pte->pte, drop_count); + ret = cxil_pte_transition_sm(rxc->base.rx_pte->pte, drop_count); if (ret) RXC_WARN(rxc, "Error transitioning to SW EP %d %s\n", - ret, fi_strerror(-ret)); + ret, fi_strerror(-ret)); return ret; } - return cxip_pte_set_state(rxc->rx_pte, rxc->rx_cmdq, + return cxip_pte_set_state(rxc->base.rx_pte, rxc->base.rx_cmdq, C_PTLTE_ENABLED, drop_count); } @@ -78,7 +77,40 @@ static int rxc_msg_disable(struct cxip_rxc *rxc) return ret; } -#define RXC_RESERVED_FC_SLOTS 1 +static size_t cxip_rxc_get_num_events(struct cxip_rxc *rxc) +{ + size_t num_events; + + /* Hardware will ensure incoming RDMA operations have event queue space. + * It is the responsibility of software to ensure that any SW initiated + * target commands which may generate an event (e.g. append with failure + * or search) have enough space in the EQ. This can be done in two ways. + * + * 1. Continually increase EQ buffer size until EQ overflows go away. + * This option is not ideal since many application variables are in play + * which impact number of events needed. + * + * 2. Use hybrid endpoint mode to preemptively transition to software + * endpoint when event queue space may be under pressure. When in + * software endpoint mode, software should not be issuing commands, like + * append and search/search & delete, which could result in events being + * generated. + * + * For both cases, RXC size will be used to size number of events. To + * accommodate a stream of unexpected puts and append failures, RXC size + * is added again. With correct credit control for hybrid endpoint to + * preemptively transition to software endpoint, 2* RXC size should be + * enough to prevent EQ overflow. For all other cases, EQ size needs to + * be increased. + */ + + num_events = rxc->attr.size * 2; + + /* Add 1 more event for software initiated state change. */ + num_events++; + + return num_events; +} /* * rxc_msg_init() - Initialize an RX context for messaging. @@ -90,90 +122,38 @@ static int rxc_msg_disable(struct cxip_rxc *rxc) */ static int rxc_msg_init(struct cxip_rxc *rxc) { + size_t num_events; int ret; - struct cxi_pt_alloc_opts pt_opts = { - .use_long_event = 1, - .is_matching = 1, - .en_flowctrl = 1, - .lossless = cxip_env.msg_lossless, - }; - struct cxi_cq_alloc_opts cq_opts = {}; - - ret = cxip_ep_cmdq(rxc->ep_obj, false, FI_TC_UNSPEC, - rxc->rx_evtq.eq, &rxc->rx_cmdq); - if (ret != FI_SUCCESS) { - CXIP_WARN("Unable to allocate RX CMDQ, ret: %d\n", ret); - return -FI_EDOMAIN; - } - - /* For FI_TC_UNSPEC, reuse the TX context command queue if possible. If - * a specific traffic class is requested, allocate a new command queue. 
- * This is done to prevent performance issues with reusing the TX - * context command queue and changing the communication profile. - */ - if (cxip_env.rget_tc == FI_TC_UNSPEC) { - ret = cxip_ep_cmdq(rxc->ep_obj, true, FI_TC_UNSPEC, - rxc->rx_evtq.eq, &rxc->tx_cmdq); - if (ret != FI_SUCCESS) { - CXIP_WARN("Unable to allocate TX CMDQ, ret: %d\n", ret); - ret = -FI_EDOMAIN; - goto put_rx_cmdq; - } - } else { - cq_opts.count = rxc->ep_obj->txq_size * 4; - cq_opts.flags = CXI_CQ_IS_TX; - cq_opts.policy = cxip_env.cq_policy; - - ret = cxip_cmdq_alloc(rxc->ep_obj->domain->lni, - rxc->rx_evtq.eq, &cq_opts, - rxc->ep_obj->auth_key.vni, - cxip_ofi_to_cxi_tc(cxip_env.rget_tc), - CXI_TC_TYPE_DEFAULT, &rxc->tx_cmdq); - if (ret != FI_SUCCESS) { - CXIP_WARN("Unable to allocate CMDQ, ret: %d\n", ret); - ret = -FI_ENOSPC; - goto put_rx_cmdq; - } - } - /* If applications AVs are symmetric, use logical FI addresses for - * matching. Otherwise, physical addresses will be used. - */ - if (rxc->ep_obj->av->symmetric) { - CXIP_DBG("Using logical PTE matching\n"); - pt_opts.use_logical = 1; + /* Base message initialization */ + num_events = cxip_rxc_get_num_events(rxc); + ret = cxip_evtq_init(&rxc->rx_evtq, rxc->recv_cq, num_events, 1); + if (ret) { + CXIP_WARN("Failed to initialize RXC event queue: %d, %s\n", + ret, fi_strerror(-ret)); + return ret; } - ret = cxip_pte_alloc(rxc->ep_obj->ptable, - rxc->rx_evtq.eq, CXIP_PTL_IDX_RXQ, false, - &pt_opts, cxip_recv_pte_cb, rxc, &rxc->rx_pte); + ret = cxip_ep_cmdq(rxc->ep_obj, false, FI_TC_UNSPEC, rxc->rx_evtq.eq, + &rxc->rx_cmdq); if (ret != FI_SUCCESS) { - CXIP_WARN("Failed to allocate RX PTE: %d\n", ret); - goto put_tx_cmdq; + CXIP_WARN("Unable to allocate RX CMDQ, ret: %d\n", ret); + goto free_evtq; } - /* One slot must be reserved to support hardware generated state change - * events. - */ - ret = cxip_evtq_adjust_reserved_fc_event_slots(&rxc->rx_evtq, - RXC_RESERVED_FC_SLOTS); + /* Derived messaging initialization/overrides */ + ret = rxc->ops.msg_init(rxc); if (ret) { - CXIP_WARN("Unable to adjust RX reserved event slots: %d\n", - ret); - goto free_pte; + CXIP_WARN("RXC derived initialization failed %d\n", ret); + goto put_rx_cmdq; } return FI_SUCCESS; -free_pte: - cxip_pte_free(rxc->rx_pte); -put_tx_cmdq: - if (cxip_env.rget_tc == FI_TC_UNSPEC) - cxip_ep_cmdq_put(rxc->ep_obj, true); - else - cxip_cmdq_free(rxc->tx_cmdq); put_rx_cmdq: cxip_ep_cmdq_put(rxc->ep_obj, false); +free_evtq: + cxip_evtq_fini(&rxc->rx_evtq); return ret; } @@ -188,102 +168,19 @@ static int rxc_msg_init(struct cxip_rxc *rxc) */ static int rxc_msg_fini(struct cxip_rxc *rxc) { - int ret __attribute__((unused)); + int ret; - cxip_pte_free(rxc->rx_pte); + ret = rxc->ops.msg_fini(rxc); + if (ret) + return ret; + cxip_pte_free(rxc->rx_pte); cxip_ep_cmdq_put(rxc->ep_obj, false); - - if (cxip_env.rget_tc == FI_TC_UNSPEC) - cxip_ep_cmdq_put(rxc->ep_obj, true); - else - cxip_cmdq_free(rxc->tx_cmdq); - - cxip_evtq_adjust_reserved_fc_event_slots(&rxc->rx_evtq, - -1 * RXC_RESERVED_FC_SLOTS); - cxip_evtq_fini(&rxc->rx_evtq); return FI_SUCCESS; } -static void cxip_rxc_free_ux_entries(struct cxip_rxc *rxc) -{ - struct cxip_ux_send *ux_send; - struct dlist_entry *tmp; - - /* TODO: Manage freeing of UX entries better. This code is redundant - * with the freeing in cxip_recv_sw_matcher(). 
- */ - dlist_foreach_container_safe(&rxc->sw_ux_list, struct cxip_ux_send, - ux_send, rxc_entry, tmp) { - dlist_remove(&ux_send->rxc_entry); - if (ux_send->req && ux_send->req->type == CXIP_REQ_RBUF) - cxip_req_buf_ux_free(ux_send); - else - free(ux_send); - - rxc->sw_ux_list_len--; - } - - if (rxc->sw_ux_list_len != 0) - CXIP_WARN("sw_ux_list_len %d != 0\n", rxc->sw_ux_list_len); - assert(rxc->sw_ux_list_len == 0); - - /* Free any pending UX entries waiting from the request list */ - dlist_foreach_container_safe(&rxc->sw_pending_ux_list, - struct cxip_ux_send, ux_send, - rxc_entry, tmp) { - dlist_remove(&ux_send->rxc_entry); - if (ux_send->req->type == CXIP_REQ_RBUF) - cxip_req_buf_ux_free(ux_send); - else - free(ux_send); - - rxc->sw_pending_ux_list_len--; - } - - if (rxc->sw_pending_ux_list_len != 0) - CXIP_WARN("sw_pending_ux_list_len %d != 0\n", - rxc->sw_pending_ux_list_len); - assert(rxc->sw_pending_ux_list_len == 0); -} - -static size_t cxip_rxc_get_num_events(struct cxip_rxc *rxc) -{ - size_t num_events; - - /* Hardware will ensure incoming RDMA operations have event queue space. - * It is the responsibility of software to ensure that any SW initiated - * target commands which may generate an event (e.g. append with failure - * or search) have enough space in the EQ. This can be done in two ways. - * - * 1. Continually increase EQ buffer size until EQ overflows go away. - * This option is not ideal since many application variables are in play - * which impact number of events needed. - * - * 2. Use hybrid endpoint mode to preemptively transition to software - * endpoint when event queue space may be under pressure. When in - * software endpoint mode, software should not be issuing commands, like - * append and search/search & delete, which could result in events being - * generated. - * - * For both cases, RXC size will be used to size number of events. To - * accommodate a stream of unexpected puts and append failures, RXC size - * is added again. With correct credit control for hybrid endpoint to - * preemptively transition to software endpoint, 2* RXC size should be - * enough to prevent EQ overflow. For all other cases, EQ size needs to - * be increased. - */ - - num_events = rxc->attr.size * 2; - - /* Add 1 more event for software initiated state change. */ - num_events++; - - return num_events; -} - /* * cxip_rxc_enable() - Enable an RX context for use. * @@ -293,9 +190,6 @@ static size_t cxip_rxc_get_num_events(struct cxip_rxc *rxc) int cxip_rxc_enable(struct cxip_rxc *rxc) { int ret; - int tmp; - size_t num_events; - enum c_ptlte_state state; if (rxc->state != RXC_DISABLED) return FI_SUCCESS; @@ -310,92 +204,28 @@ int cxip_rxc_enable(struct cxip_rxc *rxc) return -FI_ENOCQ; } - num_events = cxip_rxc_get_num_events(rxc); - ret = cxip_evtq_init(&rxc->rx_evtq, rxc->recv_cq, num_events, 1); - if (ret) { - CXIP_WARN("Failed to initialize RXC event queue: %d, %s\n", - ret, fi_strerror(-ret)); - return ret; - } - ret = rxc_msg_init(rxc); if (ret != FI_SUCCESS) { CXIP_WARN("rxc_msg_init returned: %d\n", ret); - ret = -FI_EDOMAIN; - goto evtq_fini; - } - - /* If starting in or able to transition to software managed - * PtlTE, append request list entries first. 
- */ - if (cxip_software_pte_allowed()) { - ret = cxip_req_bufpool_init(rxc); - if (ret != FI_SUCCESS) - goto err_msg_fini; - } - - if (rxc->msg_offload) { - state = C_PTLTE_ENABLED; - ret = cxip_oflow_bufpool_init(rxc); - if (ret != FI_SUCCESS) - goto err_req_buf_fini; - } else { - state = C_PTLTE_SOFTWARE_MANAGED; - } - - /* Start accepting Puts. */ - ret = cxip_pte_set_state(rxc->rx_pte, rxc->rx_cmdq, state, 0); - if (ret != FI_SUCCESS) { - CXIP_WARN("cxip_pte_set_state returned: %d\n", ret); - goto err_oflow_buf_fini; + return -FI_EDOMAIN; } - /* Wait for PTE state change */ - do { - sched_yield(); - cxip_evtq_progress(&rxc->rx_evtq); - } while (rxc->rx_pte->state != state); - - rxc->pid_bits = rxc->domain->iface->dev->info.pid_bits; - CXIP_DBG("RXC messaging enabled: %p, pid_bits: %d\n", - rxc, rxc->pid_bits); - return FI_SUCCESS; - -err_oflow_buf_fini: - if (rxc->msg_offload) - cxip_oflow_bufpool_fini(rxc); - -err_req_buf_fini: - if (cxip_software_pte_allowed()) - cxip_req_bufpool_fini(rxc); - -err_msg_fini: - tmp = rxc_msg_fini(rxc); - if (tmp != FI_SUCCESS) - CXIP_WARN("rxc_msg_fini returned: %d\n", tmp); - -evtq_fini: - cxip_evtq_fini(&rxc->rx_evtq); - - return ret; } /* - * rxc_cleanup() - Attempt to free outstanding requests. + * cxip_rxc_recv_req_cleanup() - Attempt to free outstanding requests. * * Outstanding commands may be dropped when the RX Command Queue is freed. * This leads to missing events. Attempt to gather all events before freeing * the RX CQ. If events go missing, resources will be leaked until the * Completion Queue is freed. */ -static void rxc_cleanup(struct cxip_rxc *rxc) +void cxip_rxc_recv_req_cleanup(struct cxip_rxc *rxc) { int ret; uint64_t start; int canceled = 0; - struct cxip_fc_drops *fc_drops; - struct dlist_entry *tmp; if (!ofi_atomic_get32(&rxc->orx_reqs)) return; @@ -421,22 +251,6 @@ static void rxc_cleanup(struct cxip_rxc *rxc) break; } } - - dlist_foreach_container_safe(&rxc->fc_drops, struct cxip_fc_drops, - fc_drops, rxc_entry, tmp) { - dlist_remove(&fc_drops->rxc_entry); - free(fc_drops); - } - - if (rxc->num_fc_eq_full || rxc->num_fc_no_match || - rxc->num_fc_req_full || rxc->num_fc_unexp || - rxc->num_fc_append_fail || rxc->num_sc_nic_hw2sw_unexp || - rxc->num_sc_nic_hw2sw_append_fail) - CXIP_INFO(CXIP_SC_STATS, rxc->num_fc_eq_full, - rxc->num_fc_append_fail, rxc->num_fc_no_match, - rxc->num_fc_req_full, rxc->num_fc_unexp, - rxc->num_sc_nic_hw2sw_unexp, - rxc->num_sc_nic_hw2sw_append_fail); } static void cxip_rxc_dump_counters(struct cxip_rxc *rxc) @@ -478,44 +292,6 @@ static void cxip_rxc_dump_counters(struct cxip_rxc *rxc) } } -void cxip_rxc_struct_init(struct cxip_rxc *rxc, const struct fi_rx_attr *attr, - void *context) -{ - int i; - - dlist_init(&rxc->ep_list); - ofi_atomic_initialize32(&rxc->orx_hw_ule_cnt, 0); - ofi_atomic_initialize32(&rxc->orx_reqs, 0); - ofi_atomic_initialize32(&rxc->orx_tx_reqs, 0); - rxc->max_tx = cxip_env.sw_rx_tx_init_max; - - rxc->context = context; - rxc->attr = *attr; - - for (i = 0; i < CXIP_DEF_EVENT_HT_BUCKETS; i++) - dlist_init(&rxc->deferred_events.bh[i]); - - dlist_init(&rxc->fc_drops); - dlist_init(&rxc->replay_queue); - dlist_init(&rxc->sw_ux_list); - dlist_init(&rxc->sw_recv_queue); - dlist_init(&rxc->sw_pending_ux_list); - - rxc->max_eager_size = cxip_env.rdzv_threshold + cxip_env.rdzv_get_min; - rxc->drop_count = rxc->ep_obj->asic_ver < CASSINI_2_0 ? 
-1 : 0; - - /* TODO make configurable */ - rxc->min_multi_recv = CXIP_EP_MIN_MULTI_RECV; - rxc->state = RXC_DISABLED; - rxc->msg_offload = cxip_env.msg_offload; - rxc->hmem = !!(attr->caps & FI_HMEM); - rxc->sw_ep_only = cxip_env.rx_match_mode == CXIP_PTLTE_SOFTWARE_MODE; - rxc->rget_align_mask = cxip_env.rdzv_aligned_sw_rget ? - cxip_env.cacheline_size - 1 : 0; - - cxip_msg_counters_init(&rxc->cntrs); -} - /* * cxip_rxc_disable() - Disable the RX context of an base endpoint object. * @@ -537,15 +313,8 @@ void cxip_rxc_disable(struct cxip_rxc *rxc) if (ret != FI_SUCCESS) CXIP_WARN("rxc_msg_disable returned: %d\n", ret); - cxip_rxc_free_ux_entries(rxc); - - rxc_cleanup(rxc); - - if (cxip_software_pte_allowed()) - cxip_req_bufpool_fini(rxc); - - if (cxip_env.msg_offload) - cxip_oflow_bufpool_fini(rxc); + /* Protocol cleanup must call cxip_rxc_recv_req_cleanup() */ + rxc->ops.cleanup(rxc); /* Free hardware resources. */ ret = rxc_msg_fini(rxc); @@ -553,3 +322,141 @@ void cxip_rxc_disable(struct cxip_rxc *rxc) CXIP_WARN("rxc_msg_fini returned: %d\n", ret); } } + +int cxip_rxc_emit_dma(struct cxip_rxc_hpc *rxc, uint16_t vni, + enum cxi_traffic_class tc, + enum cxi_traffic_class_type tc_type, + struct c_full_dma_cmd *dma, uint64_t flags) +{ + int ret; + + if (rxc->base.ep_obj->av_auth_key) { + ret = cxip_domain_emit_dma(rxc->base.domain, vni, tc, + dma, flags); + if (ret) + RXC_WARN(rxc, "Failed to emit domain dma command: %d\n", + ret); + return ret; + } + + /* Ensure correct traffic class is used. */ + ret = cxip_cmdq_cp_set(rxc->tx_cmdq, vni, tc, tc_type); + if (ret) { + RXC_WARN(rxc, "Failed to set traffic class: %d:%s\n", ret, + fi_strerror(-ret)); + return ret; + } + + ret = cxip_cmdq_emit_dma(rxc->tx_cmdq, dma, flags); + if (ret) { + RXC_WARN(rxc, "Failed to emit dma command: %d:%s\n", ret, + fi_strerror(-ret)); + return ret; + } + + cxip_txq_ring(rxc->tx_cmdq, 0, 1); + + return FI_SUCCESS; +} + +int cxip_rxc_emit_idc_msg(struct cxip_rxc_hpc *rxc, uint16_t vni, + enum cxi_traffic_class tc, + enum cxi_traffic_class_type tc_type, + const struct c_cstate_cmd *c_state, + const struct c_idc_msg_hdr *msg, const void *buf, + size_t len, uint64_t flags) +{ + int ret; + + if (rxc->base.ep_obj->av_auth_key) { + ret = cxip_domain_emit_idc_msg(rxc->base.domain, vni, tc, + c_state, msg, buf, len, flags); + if (ret) + RXC_WARN(rxc, "Failed to emit domain idc msg: %d\n", + ret); + return ret; + } + + /* Ensure correct traffic class is used. 
*/ + ret = cxip_cmdq_cp_set(rxc->tx_cmdq, vni, tc, tc_type); + if (ret) { + RXC_WARN(rxc, "Failed to set traffic class: %d:%s\n", ret, + fi_strerror(-ret)); + return ret; + } + + ret = cxip_cmdq_emit_idc_msg(rxc->tx_cmdq, c_state, msg, buf, len, + flags); + if (ret) { + RXC_WARN(rxc, "Failed to emit idc_msg command: %d:%s\n", ret, + fi_strerror(-ret)); + return ret; + } + + cxip_txq_ring(rxc->tx_cmdq, 0, 1); + + return FI_SUCCESS; +} + +struct cxip_rxc *cxip_rxc_calloc(struct cxip_ep_obj *ep_obj, void *context) +{ + struct cxip_rxc *rxc = NULL; + + switch (ep_obj->protocol) { + case FI_PROTO_CXI: + rxc = calloc(1, sizeof(struct cxip_rxc_hpc)); + if (rxc) + rxc->ops = hpc_rxc_ops; + break; + case FI_PROTO_CXI_RNR: + rxc = calloc(1, sizeof(struct cxip_rxc_rnr)); + if (rxc) + rxc->ops = rnr_rxc_ops; + break; + default: + CXIP_WARN("Unsupported EP protocol requested %d\n", + ep_obj->protocol); + return NULL; + } + + if (!rxc) { + CXIP_WARN("Memory allocation failure\n"); + return NULL; + } + + /* Base initialization */ + rxc->protocol = ep_obj->protocol; + rxc->context = context; + rxc->ep_obj = ep_obj; + rxc->domain = ep_obj->domain; + rxc->min_multi_recv = CXIP_EP_MIN_MULTI_RECV; + rxc->state = RXC_DISABLED; + rxc->msg_offload = cxip_env.msg_offload; + rxc->max_tx = cxip_env.sw_rx_tx_init_max; + rxc->attr = ep_obj->rx_attr; + rxc->hmem = !!(rxc->attr.caps & FI_HMEM); + rxc->pid_bits = ep_obj->domain->iface->dev->info.pid_bits; + ofi_atomic_initialize32(&rxc->orx_reqs, 0); + + rxc->sw_ep_only = cxip_env.rx_match_mode == + CXIP_PTLTE_SOFTWARE_MODE; + cxip_msg_counters_init(&rxc->cntrs); + + /* Derived initialization/overrides */ + rxc->ops.init_struct(rxc, ep_obj); + + return rxc; +} + +void cxip_rxc_free(struct cxip_rxc *rxc) +{ + if (!rxc) + return; + + /* Derived structure free */ + rxc->ops.fini_struct(rxc); + + /* Any base stuff */ + + free(rxc); +} diff --git a/prov/cxi/src/cxip_trace.c b/prov/cxi/src/cxip_trace.c deleted file mode 100644 index 5d3a371b5f4..00000000000 --- a/prov/cxi/src/cxip_trace.c +++ /dev/null @@ -1,193 +0,0 @@ -/* - * Copyright (c) 2021-2023 Hewlett Packard Enterprise Development LP - * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only - */ - -/** - * @brief TRACE function for producing runtime debugging logs - * - * The following should be inserted at the top of a code module to trace: - * - * #define TRACE(fmt, ...) CXIP_TRACE(, fmt, ##__VA_ARGS__) - * - * If ENABLE_DEBUG is false at compile time, CXIP_TRACE is a syntactically - * robust NOOP which results in no code being emitted, ensuring that these - * trace calls do not affect performance in production, and none of the - * following comment apply. - * - * - cxip_trace_fn is the function that logs a trace message. - * - cxip_trace_flush_fn can be used to flush buffered trace messages. - * - cxip_trace_close_fn can be used to flush and close the output. - * - cxip_trace_enable_fn is used to enable/disable all tracing. - * - cxip_trace_set() is used to enable a tracing module. - * - cxip_trace_clr() is used to disable a tracing module. - * - * Modules are defined by the list of enum cxip_trace_module values, which - * can be extended as needed to provide finer control over tracing. - * - * The initial values are set in cxip_trace_init() below, using run-time - * environment variables. cxip_trace_enable() can be used to dynamically - * enable or disable tracing. cxip_trace_set() and cxip_trace_clr() can be - * used to dynamically modify which traces will generate output. 
- * - * Some initialization is required by the use of environment variables: - * - * Specifying the environment variable CXIP_TRACE_FILENAME will deliver - * output to a file with the specified name, followed by the PMI_RANK value - * (if there is one). - * - * Specifying CXIP_TRACE_APPEND in conjunction with CXIP_TRACE_FILENAME will - * open the file in append mode. This is important for NETSIM tests under - * Criterion, since each test is run in a separate process and closes all - * files at completion of each test. - * - * Specifying PMI_RANK as a rank value will apply a prefix to the trace lines - * that identifies the rank of the trace. - * - * Specifying PMI_SIZE will expand the prefix to show the number of ranks. - * - * cxip_trace_fid is exposed, and can be manipulated using the normal stream - * file functions. Default buffering is fully buffered output, which can - * result in delays in the appearance of logging information. Using - * setlinebuf() will run slower, but will display lines more quickly. - * - * cxip_trace_flush() forces all output be flushed AND written to disk, but - * leaves the file open for more writing. - * - * cxip_trace_close() flushes all output and closes the file. - */ -#include "config.h" - -#include -#include -#include -#include -#include - -#include "cxip.h" - -bool cxip_trace_initialized; -bool cxip_trace_enabled; -bool cxip_trace_append; -bool cxip_trace_linebuf; // set line buffering for trace -int cxip_trace_rank; -int cxip_trace_numranks; -char *cxip_trace_filename; -FILE *cxip_trace_fid; -uint64_t cxip_trace_mask; - -/* Static initialization of default trace functions, can be overridden */ -cxip_trace_t cxip_trace_attr cxip_trace_fn = cxip_trace; -cxip_trace_flush_t cxip_trace_flush_fn = cxip_trace_flush; -cxip_trace_close_t cxip_trace_close_fn = cxip_trace_close; -cxip_trace_enable_t cxip_trace_enable_fn = cxip_trace_enable; - -/* Get environment variable as string representation of int */ -static int getenv_int(const char *name) -{ - char *env; - int value; - - value = -1; - env = getenv(name); - if (env) - sscanf(env, "%d", &value); - return value; -} - -void cxip_trace_init(void) -{ - const char *fname; - - if (cxip_trace_initialized) - return; - - cxip_trace_initialized = true; - cxip_trace_enabled = !!getenv("CXIP_TRACE_ENABLE"); - cxip_trace_append = !!getenv("CXIP_TRACE_APPEND"); - cxip_trace_linebuf = !!getenv("CXIP_TRACE_LINEBUF"); - cxip_trace_rank = getenv_int("PMI_RANK"); - cxip_trace_numranks = getenv_int("PMI_SIZE"); - cxip_trace_append = getenv("CXIP_TRACE_APPEND"); - fname = getenv("CXIP_TRACE_FILENAME"); - - cxip_trace_mask = 0L; - if (getenv("CXIP_TRC_CTRL")) - cxip_trace_set(CXIP_TRC_CTRL); - if (getenv("CXIP_TRC_ZBCOLL")) - cxip_trace_set(CXIP_TRC_ZBCOLL); - if (getenv("CXIP_TRC_CURL")) - cxip_trace_set(CXIP_TRC_CURL); - if (getenv("CXIP_TRC_COLL_PKT")) - cxip_trace_set(CXIP_TRC_COLL_PKT); - if (getenv("CXIP_TRC_COLL_JOIN")) - cxip_trace_set(CXIP_TRC_COLL_JOIN); - if (getenv("CXIP_TRC_COLL_DEBUG")) - cxip_trace_set(CXIP_TRC_COLL_DEBUG); - if (getenv("CXIP_TRC_TEST_CODE")) - cxip_trace_set(CXIP_TRC_TEST_CODE); - - if (!fname) - fname = "trace"; - if (fname) { - asprintf(&cxip_trace_filename, "./%s%d", - fname, cxip_trace_rank); - cxip_trace_fid = fopen(cxip_trace_filename, - cxip_trace_append ? 
"a" : "w"); - if (!cxip_trace_fid) { - fprintf(stderr, "open(%s) failed: %s\n", - cxip_trace_filename, strerror(errno)); - } - if (cxip_trace_linebuf && cxip_trace_fid) - setlinebuf(cxip_trace_fid); - } -} - -void cxip_trace_flush(void) -{ - cxip_trace_init(); - if (cxip_trace_fid) { - fflush(cxip_trace_fid); - fsync(fileno(cxip_trace_fid)); - } -} - -void cxip_trace_close(void) -{ - cxip_trace_init(); - if (cxip_trace_fid) { - cxip_trace_flush(); - fclose(cxip_trace_fid); - cxip_trace_fid = NULL; - cxip_trace_initialized = false; - } -} - -int cxip_trace_attr cxip_trace(const char *fmt, ...) -{ - va_list args; - char *str; - int len; - - cxip_trace_init(); - if (!cxip_trace_enabled) - return 0; - va_start(args, fmt); - len = vasprintf(&str, fmt, args); - va_end(args); - if (len >= 0) { - len = fprintf(cxip_trace_fid, "[%2d|%2d] %s", - cxip_trace_rank, cxip_trace_numranks, str); - free(str); - } - return len; -} - -bool cxip_trace_enable(bool enable) -{ - bool was_enabled = cxip_trace_enabled; - - cxip_trace_init(); - cxip_trace_enabled = enable; - return was_enabled; -} diff --git a/prov/cxi/src/cxip_txc.c b/prov/cxi/src/cxip_txc.c index a15ed8ee65b..24564a5ef72 100644 --- a/prov/cxi/src/cxip_txc.c +++ b/prov/cxi/src/cxip_txc.c @@ -20,6 +20,9 @@ /* 8 Rendezvous, 2 RMA and 2 Atomic + 4 extra */ #define CXIP_INTERNAL_TX_REQS 16 +extern struct cxip_txc_ops hpc_txc_ops; +extern struct cxip_txc_ops rnr_txc_ops; + struct cxip_md *cxip_txc_ibuf_md(void *ibuf) { return ofi_buf_hdr(ibuf)->region->context; @@ -112,7 +115,7 @@ int cxip_txc_ibuf_create(struct cxip_txc *txc) * * Caller must hold txc->ep_obj.lock */ -int cxip_tx_id_alloc(struct cxip_txc *txc, void *ctx) +int cxip_tx_id_alloc(struct cxip_txc_hpc *txc, void *ctx) { int id; @@ -135,7 +138,7 @@ int cxip_tx_id_alloc(struct cxip_txc *txc, void *ctx) * * Caller must hold txc->ep_obj.lock */ -int cxip_tx_id_free(struct cxip_txc *txc, int id) +int cxip_tx_id_free(struct cxip_txc_hpc *txc, int id) { if (id < 0 || id >= CXIP_TX_IDS) return -FI_EINVAL; @@ -147,7 +150,7 @@ int cxip_tx_id_free(struct cxip_txc *txc, int id) } /* Caller must hold txc->ep_obj.lock */ -void *cxip_tx_id_lookup(struct cxip_txc *txc, int id) +void *cxip_tx_id_lookup(struct cxip_txc_hpc *txc, int id) { return ofi_idx_lookup(&txc->tx_ids, id); } @@ -160,7 +163,7 @@ void *cxip_tx_id_lookup(struct cxip_txc *txc, int id) * * Caller must hold txc->ep_obj->lock. */ -int cxip_rdzv_id_alloc(struct cxip_txc *txc, struct cxip_req *req) +int cxip_rdzv_id_alloc(struct cxip_txc_hpc *txc, struct cxip_req *req) { struct indexer *rdzv_ids; int max_rdzv_id; @@ -202,7 +205,7 @@ int cxip_rdzv_id_alloc(struct cxip_txc *txc, struct cxip_req *req) * * Caller must hold txc->ep_obj->lock. */ -int cxip_rdzv_id_free(struct cxip_txc *txc, int id) +int cxip_rdzv_id_free(struct cxip_txc_hpc *txc, int id) { if (id < 0 || id >= CXIP_RDZV_IDS) return -FI_EINVAL; @@ -221,7 +224,7 @@ int cxip_rdzv_id_free(struct cxip_txc *txc, int id) } /* Caller must hold txc->ep_obj->lock. 
*/ -void *cxip_rdzv_id_lookup(struct cxip_txc *txc, int id) +void *cxip_rdzv_id_lookup(struct cxip_txc_hpc *txc, int id) { if (id >= CXIP_RDZV_IDS_MULTI_RECV) { @@ -240,31 +243,12 @@ void *cxip_rdzv_id_lookup(struct cxip_txc *txc, int id) */ static int txc_msg_init(struct cxip_txc *txc) { - int ret; - - /* Allocate TGQ for posting source data */ - ret = cxip_ep_cmdq(txc->ep_obj, false, FI_TC_UNSPEC, - txc->tx_evtq.eq, &txc->rx_cmdq); - if (ret != FI_SUCCESS) { - CXIP_WARN("Unable to allocate TGQ, ret: %d\n", ret); - return -FI_EDOMAIN; - } - - ret = cxip_rdzv_match_pte_alloc(txc, &txc->rdzv_pte); - if (ret) { - CXIP_WARN("Failed to allocate rendezvous PtlTE: %d:%s\n", ret, - fi_strerror(-ret)); - goto err_put_rx_cmdq; - } - txc->rdzv_proto = cxip_env.rdzv_proto; - - CXIP_DBG("TXC RDZV PtlTE enabled: %p proto: %s\n", - txc, cxip_rdzv_proto_to_str(txc->rdzv_proto)); + int ret = FI_SUCCESS; - return FI_SUCCESS; + /* Any common initialization should be added here */ -err_put_rx_cmdq: - cxip_ep_cmdq_put(txc->ep_obj, false); + /* Derived TXC message initialization */ + ret = txc->ops.msg_init(txc); return ret; } @@ -279,18 +263,14 @@ static int txc_msg_init(struct cxip_txc *txc) */ static int txc_msg_fini(struct cxip_txc *txc) { - int i; - - cxip_rdzv_match_pte_free(txc->rdzv_pte); + int ret; - for (i = 0; i < RDZV_NO_MATCH_PTES; i++) { - if (txc->rdzv_nomatch_pte[i]) - cxip_rdzv_nomatch_pte_free(txc->rdzv_nomatch_pte[i]); - } + /* Any common cleanup should be added here */ - cxip_ep_cmdq_put(txc->ep_obj, false); + /* Derived TXC message cleanup */ + ret = txc->ops.msg_fini(txc); - return FI_SUCCESS; + return ret; } static size_t cxip_txc_get_num_events(struct cxip_txc *txc) @@ -343,12 +323,8 @@ int cxip_txc_enable(struct cxip_txc *txc) return ret; } - /* Protected with ep_obj->lock */ - memset(&txc->rdzv_ids, 0, sizeof(txc->rdzv_ids)); - memset(&txc->msg_rdzv_ids, 0, sizeof(txc->msg_rdzv_ids)); - memset(&txc->tx_ids, 0, sizeof(txc->tx_ids)); - num_events = cxip_txc_get_num_events(txc); + ret = cxip_evtq_init(&txc->tx_evtq, txc->send_cq, num_events, 0); if (ret) { CXIP_WARN("Failed to initialize TX event queue: %d, %s\n", @@ -356,8 +332,8 @@ int cxip_txc_enable(struct cxip_txc *txc) goto destroy_ibuf; } - ret = cxip_ep_cmdq(txc->ep_obj, true, txc->tclass, - txc->tx_evtq.eq, &txc->tx_cmdq); + ret = cxip_ep_cmdq(txc->ep_obj, true, txc->tclass, txc->tx_evtq.eq, + &txc->tx_cmdq); if (ret != FI_SUCCESS) { CXIP_WARN("Unable to allocate TX CMDQ, ret: %d\n", ret); ret = -FI_EDOMAIN; @@ -373,7 +349,6 @@ int cxip_txc_enable(struct cxip_txc *txc) } } - txc->pid_bits = txc->domain->iface->dev->info.pid_bits; txc->enabled = true; return FI_SUCCESS; @@ -383,9 +358,6 @@ int cxip_txc_enable(struct cxip_txc *txc) destroy_evtq: cxip_evtq_fini(&txc->tx_evtq); destroy_ibuf: - ofi_idx_reset(&txc->tx_ids); - ofi_idx_reset(&txc->rdzv_ids); - ofi_idx_reset(&txc->msg_rdzv_ids); ofi_bufpool_destroy(txc->ibuf_pool); return ret; @@ -402,11 +374,9 @@ int cxip_txc_enable(struct cxip_txc *txc) static void txc_cleanup(struct cxip_txc *txc) { uint64_t start; - struct cxip_fc_peer *fc_peer; - struct dlist_entry *tmp; if (!ofi_atomic_get32(&txc->otx_reqs)) - goto free_fc_peers; + goto proto_cleanup; cxip_evtq_req_discard(&txc->tx_evtq, txc); @@ -425,27 +395,10 @@ static void txc_cleanup(struct cxip_txc *txc) assert(ofi_atomic_get32(&txc->otx_reqs) == 0); -free_fc_peers: - dlist_foreach_container_safe(&txc->fc_peers, struct cxip_fc_peer, - fc_peer, txc_entry, tmp) { - dlist_remove(&fc_peer->txc_entry); - free(fc_peer); - } -} 
+proto_cleanup: + txc->ops.cleanup(txc); -void cxip_txc_struct_init(struct cxip_txc *txc, const struct fi_tx_attr *attr, - void *context) -{ - dlist_init(&txc->ep_list); - ofi_atomic_initialize32(&txc->otx_reqs, 0); - dlist_init(&txc->msg_queue); - dlist_init(&txc->fc_peers); - - txc->context = context; - txc->attr = *attr; - txc->max_eager_size = cxip_env.rdzv_threshold + cxip_env.rdzv_get_min; - txc->rdzv_eager_size = cxip_env.rdzv_eager_size; - txc->hmem = !!(attr->caps & FI_HMEM); + ofi_bufpool_destroy(txc->ibuf_pool); } /* @@ -464,11 +417,6 @@ void cxip_txc_disable(struct cxip_txc *txc) txc->enabled = false; txc_cleanup(txc); - ofi_idx_reset(&txc->tx_ids); - ofi_idx_reset(&txc->rdzv_ids); - ofi_idx_reset(&txc->msg_rdzv_ids); - ofi_bufpool_destroy(txc->ibuf_pool); - if (ofi_send_allowed(txc->attr.caps)) { ret = txc_msg_fini(txc); if (ret) @@ -528,8 +476,20 @@ int cxip_txc_emit_idc_put(struct cxip_txc *txc, uint16_t vni, if (!cxip_txc_can_emit_op(txc, c_state->event_success_disable)) return -FI_EAGAIN; + if (txc->ep_obj->av_auth_key) { + ret = cxip_domain_emit_idc_put(txc->domain, vni, tc, c_state, + put, buf, len, flags); + if (ret) + TXC_WARN(txc, "Failed to emit domain idc put: %d\n", + ret); + else if (!c_state->event_success_disable) + ofi_atomic_inc32(&txc->otx_reqs); + + return ret; + } + /* Ensure correct traffic class is used. */ - ret = cxip_txq_cp_set(txc->tx_cmdq, vni, tc, tc_type); + ret = cxip_cmdq_cp_set(txc->tx_cmdq, vni, tc, tc_type); if (ret) { TXC_WARN(txc, "Failed to set traffic class: %d:%s\n", ret, fi_strerror(-ret)); @@ -579,8 +539,19 @@ int cxip_txc_emit_dma(struct cxip_txc *txc, uint16_t vni, return ret; } + if (txc->ep_obj->av_auth_key) { + ret = cxip_domain_emit_dma(txc->domain, vni, tc, dma, flags); + if (ret) + TXC_WARN(txc, "Failed to emit domain dma command: %d\n", + ret); + else if (!dma->event_success_disable) + ofi_atomic_inc32(&txc->otx_reqs); + + return ret; + } + /* Ensure correct traffic class is used. */ - ret = cxip_txq_cp_set(txc->tx_cmdq, vni, tc, tc_type); + ret = cxip_cmdq_cp_set(txc->tx_cmdq, vni, tc, tc_type); if (ret) { TXC_WARN(txc, "Failed to set traffic class: %d:%s\n", ret, fi_strerror(-ret)); @@ -616,8 +587,20 @@ int cxip_txc_emit_idc_amo(struct cxip_txc *txc, uint16_t vni, if (!cxip_txc_can_emit_op(txc, c_state->event_success_disable)) return -FI_EAGAIN; + if (txc->ep_obj->av_auth_key) { + ret = cxip_domain_emit_idc_amo(txc->domain, vni, tc, c_state, + amo, flags, fetching, flush); + if (ret) + TXC_WARN(txc, "Failed to emit domain idc amo: %d\n", + ret); + else if (!c_state->event_success_disable) + ofi_atomic_inc32(&txc->otx_reqs); + + return ret; + } + /* Ensure correct traffic class is used. */ - ret = cxip_txq_cp_set(txc->tx_cmdq, vni, tc, tc_type); + ret = cxip_cmdq_cp_set(txc->tx_cmdq, vni, tc, tc_type); if (ret) { TXC_WARN(txc, "Failed to set traffic class: %d:%s\n", ret, fi_strerror(-ret)); @@ -669,8 +652,20 @@ int cxip_txc_emit_dma_amo(struct cxip_txc *txc, uint16_t vni, return ret; } + if (txc->ep_obj->av_auth_key) { + ret = cxip_domain_emit_dma_amo(txc->domain, vni, tc, amo, flags, + fetching, flush); + if (ret) + TXC_WARN(txc, "Failed to emit domain amo: %d\n", + ret); + else if (!amo->event_success_disable) + ofi_atomic_inc32(&txc->otx_reqs); + + return ret; + } + /* Ensure correct traffic class is used. 
*/ - ret = cxip_txq_cp_set(txc->tx_cmdq, vni, tc, tc_type); + ret = cxip_cmdq_cp_set(txc->tx_cmdq, vni, tc, tc_type); if (ret) { TXC_WARN(txc, "Failed to set traffic class: %d:%s\n", ret, fi_strerror(-ret)); @@ -693,3 +688,113 @@ int cxip_txc_emit_dma_amo(struct cxip_txc *txc, uint16_t vni, return FI_SUCCESS; } + +int cxip_txc_emit_idc_msg(struct cxip_txc *txc, uint16_t vni, + enum cxi_traffic_class tc, + enum cxi_traffic_class_type tc_type, + const struct c_cstate_cmd *c_state, + const struct c_idc_msg_hdr *msg, const void *buf, + size_t len, uint64_t flags) +{ + int ret; + + if (!cxip_txc_can_emit_op(txc, c_state->event_success_disable)) + return -FI_EAGAIN; + + if (txc->ep_obj->av_auth_key) { + ret = cxip_domain_emit_idc_msg(txc->domain, vni, tc, c_state, + msg, buf, len, flags); + if (ret) + TXC_WARN(txc, "Failed to emit domain idc msg: %d\n", + ret); + else if (!c_state->event_success_disable) + ofi_atomic_inc32(&txc->otx_reqs); + + return ret; + } + + /* Ensure correct traffic class is used. */ + ret = cxip_cmdq_cp_set(txc->tx_cmdq, vni, tc, tc_type); + if (ret) { + TXC_WARN(txc, "Failed to set traffic class: %d:%s\n", ret, + fi_strerror(-ret)); + return ret; + } + + ret = cxip_cmdq_emit_idc_msg(txc->tx_cmdq, c_state, msg, buf, len, + flags); + if (ret) { + TXC_WARN(txc, "Failed to emit idc_msg command: %d:%s\n", ret, + fi_strerror(-ret)); + return ret; + } + + /* Kick the command queue. */ + cxip_txq_ring(txc->tx_cmdq, !!(flags & FI_MORE), + ofi_atomic_get32(&txc->otx_reqs)); + + if (!c_state->event_success_disable) + ofi_atomic_inc32(&txc->otx_reqs); + + return FI_SUCCESS; +} + +struct cxip_txc *cxip_txc_calloc(struct cxip_ep_obj *ep_obj, void *context) +{ + struct cxip_txc *txc = NULL; + + switch (ep_obj->protocol) { + case FI_PROTO_CXI: + txc = calloc(1, sizeof(struct cxip_txc_hpc)); + if (txc) + txc->ops = hpc_txc_ops; + break; + case FI_PROTO_CXI_RNR: + txc = calloc(1, sizeof(struct cxip_txc_rnr)); + if (txc) + txc->ops = rnr_txc_ops; + break; + default: + CXIP_WARN("Unsupported EP protocol requested %d\n", + ep_obj->protocol); + return NULL; + } + + if (!txc) { + CXIP_WARN("Memory allocation failure\n"); + return NULL; + } + + /* Common structure initialization */ + txc->protocol = ep_obj->protocol; + txc->context = context; + txc->ep_obj = ep_obj; + txc->domain = ep_obj->domain; + txc->tclass = ep_obj->tx_attr.tclass; + txc->hrp_war_req = ep_obj->asic_ver < CASSINI_2_0; + txc->attr = ep_obj->tx_attr; + txc->hmem = !!(txc->attr.caps & FI_HMEM); + txc->pid_bits = txc->domain->iface->dev->info.pid_bits; + + dlist_init(&txc->msg_queue); + dlist_init(&txc->dom_entry); + ofi_atomic_initialize32(&txc->otx_reqs, 0); + + /* Derived initialization/overrides */ + txc->ops.init_struct(txc, ep_obj); + + return txc; +} + +void cxip_txc_free(struct cxip_txc *txc) +{ + if (!txc) + return; + + /* Derived structure free */ + txc->ops.fini_struct(txc); + + /* Any base stuff */ + + free(txc); +} diff --git a/prov/cxi/src/cxip_zbcoll.c b/prov/cxi/src/cxip_zbcoll.c index 7f59b2ba599..74d5c225481 100644 --- a/prov/cxi/src/cxip_zbcoll.c +++ b/prov/cxi/src/cxip_zbcoll.c @@ -34,7 +34,7 @@ #define CXIP_INFO(...) _CXIP_INFO(FI_LOG_EP_CTRL, __VA_ARGS__) #define CXIP_WARN(...) _CXIP_WARN(FI_LOG_EP_CTRL, __VA_ARGS__) -#define TRACE(fmt, ...) CXIP_TRACE(CXIP_TRC_ZBCOLL, fmt, ##__VA_ARGS__) +#define TRACE(fmt, ...) 
CXIP_COLL_TRACE(CXIP_TRC_ZBCOLL, fmt, ##__VA_ARGS__) /* see data packing structures below */ #define ZB_MAP_BITS 54 @@ -783,6 +783,7 @@ static void zbsend(struct cxip_ep_obj *ep_obj, uint32_t dstnic, uint32_t dstpid, req->cb = zbdata_send_cb; req->send.nic_addr = dstnic; req->send.pid = dstpid; + req->send.vni = ep_obj->auth_key.vni; req->send.mb.raw = mbv; req->send.mb.ctrl_le_type = CXIP_CTRL_LE_TYPE_CTRL_MSG; req->send.mb.ctrl_msg_type = CXIP_CTRL_MSG_ZB_DATA; diff --git a/prov/cxi/test/auth_key.c b/prov/cxi/test/auth_key.c index 57c30d37a15..b7d4baec521 100644 --- a/prov/cxi/test/auth_key.c +++ b/prov/cxi/test/auth_key.c @@ -1297,17 +1297,17 @@ Test(auth_key, max_ep_auth_key_null_hints) int i = 0; size_t expected_ep_auth_key; - ret = setenv("FI_CXI_COMPAT", "0", 1); - cr_assert(ret == 0); - ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), "cxi0", NULL, FI_SOURCE, NULL, &info); cr_assert_eq(ret, FI_SUCCESS, "fi_getinfo failed: %d", ret); tmp = info; while (tmp) { - /* The first 2 fi_info's should have max_ep_auth_key == 1*/ - if (i < 2) + /* The first and second fi_info are for FI_PROTO_CXI and have a + * max_ep_auth_key == 1. The fourth and fifth fi_info are for + * FI_PROTO_CXI_RNR and have a max_ep_auth_key == 1. + */ + if (i < 2 || (i > 3 && i < 6)) expected_ep_auth_key = 1; else expected_ep_auth_key = DEFAULT_MAX_EP_AUTH_KEY; @@ -1334,9 +1334,6 @@ Test(auth_key, zero_max_ep_auth_key_null_hint) int i = 0; size_t expected_ep_auth_key; - ret = setenv("FI_CXI_COMPAT", "0", 1); - cr_assert(ret == 0); - hints = fi_allocinfo(); cr_assert_not_null(hints, "fi_allocinfo failed"); diff --git a/prov/cxi/test/cntr.c b/prov/cxi/test/cntr.c index 9ab420b9993..f16655e0fbc 100644 --- a/prov/cxi/test/cntr.c +++ b/prov/cxi/test/cntr.c @@ -8,6 +8,7 @@ #include #include +#include #include "cxip.h" #include "cxip_test_common.h" @@ -718,3 +719,128 @@ Test(cntr, cntr_wait_bad_threshold) ret = fi_close(&cntr->fid); cr_assert(ret == FI_SUCCESS); } + +struct cntr_waiter_args { + struct fid_cntr *cntr; + int timeout; + uint64_t thresh; + uint64_t error_count; + uint64_t success_count; +}; + +static void *cntr_waiter(void *data) +{ + struct cntr_waiter_args *args = data; + uint64_t error; + uint64_t success; + int ret; + + ret = fi_cntr_wait(args->cntr, args->thresh, args->timeout); + if (args->error_count && args->thresh > args->success_count) { + cr_assert(ret == -FI_EAVAIL, "fi_cntr_wait ret %d", ret); + error = fi_cntr_readerr(args->cntr); + cr_assert(error == args->error_count, + "Unexpected counter error count %lu", error); + } else if (args->thresh <= args->success_count) { + cr_assert(ret == FI_SUCCESS, "fi_cntr_wait ret %d", ret); + } else { + cr_assert(ret == -FI_ETIMEDOUT, + "fi_cntr_wait ret %d", ret); + } + + if (args->success_count) { + success = fi_cntr_read(args->cntr); + cr_assert(success == args->success_count, + "Unexpected counter success count %lu", success); + } + + pthread_exit(NULL); +} + +static void cntr_wait_success_and_error_runner(struct cntr_waiter_args *args) +{ + struct fid_cntr *cntr; + struct fi_cntr_attr cntr_attr = { + .wait_obj = FI_WAIT_UNSPEC, + }; + pthread_t thread; + pthread_attr_t attr; + int ret; + + ret = fi_cntr_open(cxit_domain, &cntr_attr, &cntr, NULL); + cr_assert(ret == FI_SUCCESS); + args->cntr = cntr; + + pthread_attr_init(&attr); + pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); + + ret = pthread_create(&thread, &attr, cntr_waiter, (void *)args); + cr_assert_eq(ret, 0, "Counter waiter create failed %d", ret); + + /* make sure 
wait thread is in fi_cntr_wait() */ + usleep(1000); + + if (args->success_count) { + ret = fi_cntr_set(cntr, args->success_count); + cr_assert(ret == FI_SUCCESS, "fi_cntr_seterr ret %d", ret); + } + + if (args->error_count) { + ret = fi_cntr_seterr(cntr, args->error_count); + cr_assert(ret == FI_SUCCESS, "fi_cntr_seterr ret %d", ret); + } + + ret = pthread_join(thread, NULL); + cr_assert_eq(ret, 0, "Counter waiter join failed %d", ret); + + ret = fi_close(&cntr->fid); + cr_assert(ret == FI_SUCCESS); +} + +Test(cntr, cntr_wait_error_increment) +{ + struct cntr_waiter_args args = { + .timeout = 2000, + .thresh = 2, + .error_count = 1, + .success_count = 0, + }; + + cntr_wait_success_and_error_runner(&args); +} + +Test(cntr, cntr_wait_success_and_error_increment) +{ + struct cntr_waiter_args args = { + .timeout = 2000, + .thresh = 3, + .error_count = 1, + .success_count = 2, + }; + + cntr_wait_success_and_error_runner(&args); +} + +Test(cntr, cntr_wait_success_increment_timeout) +{ + struct cntr_waiter_args args = { + .timeout = 1000, + .thresh = 3, + .error_count = 0, + .success_count = 2, + }; + + cntr_wait_success_and_error_runner(&args); +} + +Test(cntr, cntr_wait_success_increment) +{ + struct cntr_waiter_args args = { + .timeout = 1000, + .thresh = 3, + .error_count = 0, + .success_count = 4, + }; + + cntr_wait_success_and_error_runner(&args); +} diff --git a/prov/cxi/test/coll.c b/prov/cxi/test/coll.c index b6cc6732253..5ffb811567e 100644 --- a/prov/cxi/test/coll.c +++ b/prov/cxi/test/coll.c @@ -33,7 +33,7 @@ #include "cxip_test_common.h" /* If not compiled with DEBUG=1, this is a no-op */ -#define TRACE(fmt, ...) CXIP_TRACE(CXIP_TRC_TEST_CODE, fmt, ##__VA_ARGS__) +#define TRACE(fmt, ...) CXIP_COLL_TRACE(CXIP_TRC_TEST_CODE, fmt, ##__VA_ARGS__) #define MIN(a,b) (((a)<(b))?(a):(b)) @@ -538,7 +538,7 @@ void _put_data(int count, int from_rank, int to_rank) /* check final counts */ TRACE("check counts\n"); if (count * sizeof(*buf) > - ep->ep_obj->coll.buffer_size - ep->ep_obj->rxc.min_multi_recv) { + ep->ep_obj->coll.buffer_size - ep->ep_obj->rxc->min_multi_recv) { cnt = ofi_atomic_get32(&mc_obj_recv->coll_pte->buf_swap_cnt); cr_assert(cnt > 0, "Did not recirculate buffers\n"); } diff --git a/prov/cxi/test/ctrl.c b/prov/cxi/test/ctrl.c index 4651a52a94a..9551559c651 100644 --- a/prov/cxi/test/ctrl.c +++ b/prov/cxi/test/ctrl.c @@ -14,7 +14,7 @@ #include "cxip.h" #include "cxip_test_common.h" -#define TRACE(fmt, ...) CXIP_TRACE(CXIP_TRC_CTRL, fmt, ##__VA_ARGS__) +#define TRACE(fmt, ...) 
CXIP_COLL_TRACE(CXIP_TRC_CTRL, fmt, ##__VA_ARGS__) TestSuite(ctrl, .init = cxit_setup_rma, .fini = cxit_teardown_rma, .timeout = CXIT_DEFAULT_TIMEOUT); @@ -164,7 +164,8 @@ Test(ctrl, zb_config) "no tree: simcnt=%d\n", zb->simcount); cr_assert(zb->num_caddrs == 1, "no_tree: num_caddrs=%d\n", zb->num_caddrs); - cr_assert(memcmp(&zb->caddrs[0], &ep_obj->src_addr, sizeof(ep_obj->src_addr)) == 0); + cr_assert(memcmp(&zb->caddrs[0], &ep_obj->src_addr, + sizeof(ep_obj->src_addr)) == 0); cxip_zbcoll_free(zb); /* request simulation */ diff --git a/prov/cxi/test/cxip_test_common.c b/prov/cxi/test/cxip_test_common.c index 999a12627df..b0fe3ccd622 100644 --- a/prov/cxi/test/cxip_test_common.c +++ b/prov/cxi/test/cxip_test_common.c @@ -343,6 +343,32 @@ void cxit_create_local_cntrs(void) cr_assert(ret == FI_SUCCESS, "fi_cntr_open (write)"); } +void cxit_create_local_byte_cntrs(void) +{ + struct fi_cntr_attr attr = { + .events = FI_CXI_CNTR_EVENTS_BYTES, + .wait_obj = FI_WAIT_YIELD, + }; + int ret; + + ret = fi_cntr_open(cxit_domain, &attr, &cxit_send_cntr, + NULL); + cr_assert(ret == FI_SUCCESS, "fi_cntr_open (send)"); + + ret = fi_cntr_open(cxit_domain, &attr, &cxit_recv_cntr, + NULL); + cr_assert(ret == FI_SUCCESS, "fi_cntr_open (recv)"); + + /* For now have read/write still use event counting */ + ret = fi_cntr_open(cxit_domain, NULL, &cxit_read_cntr, + NULL); + cr_assert(ret == FI_SUCCESS, "fi_cntr_open (read)"); + + ret = fi_cntr_open(cxit_domain, NULL, &cxit_write_cntr, + NULL); + cr_assert(ret == FI_SUCCESS, "fi_cntr_open (write)"); +} + void cxit_create_cntrs(void) { cxit_create_local_cntrs(); @@ -458,7 +484,7 @@ void cxit_init(void) fi_freeinfo(hints); } -struct fi_info *cxit_allocinfo(void) +struct fi_info *cxit_allocinfo_common(uint32_t proto) { struct fi_info *info; char *odp_env; @@ -482,13 +508,27 @@ struct fi_info *cxit_allocinfo(void) } /* If remote ODP is enabled then test with ODP */ - odp_env = getenv("CXIP_TEST_ODP"); + odp_env = getenv("FI_CXI_ODP"); if (odp_env && strtol(odp_env, NULL, 10)) info->domain_attr->mr_mode &= ~FI_MR_ALLOCATED; + /* If a EP protocol was specified indicate to use it */ + if (proto) + info->ep_attr->protocol = proto; + return info; } +struct fi_info *cxit_allocinfo(void) +{ + return cxit_allocinfo_common(0); +} + +struct fi_info *cxit_allocinfo_proto(uint32_t proto) +{ + return cxit_allocinfo_common(proto); +} + void cxit_setup_getinfo(void) { cxit_init(); @@ -497,6 +537,14 @@ void cxit_setup_getinfo(void) cxit_fi_hints = cxit_allocinfo(); } +void cxit_setup_getinfo_proto(uint32_t proto) +{ + cxit_init(); + + if (!cxit_fi_hints) + cxit_fi_hints = cxit_allocinfo_proto(proto); +} + void cxit_teardown_getinfo(void) { fi_freeinfo(cxit_fi_hints); @@ -539,6 +587,47 @@ void cxit_teardown_ep(void) cxit_teardown_domain(); } +void cxit_setup_enabled_rnr_msg_ep(void) +{ + int ret; + size_t addrlen = sizeof(cxit_ep_addr); + + cxit_setup_getinfo(); + + cxit_tx_cq_attr.format = FI_CQ_FORMAT_TAGGED; + cxit_av_attr.type = FI_AV_TABLE; + + cxit_fi_hints->domain_attr->data_progress = FI_PROGRESS_MANUAL; + cxit_fi_hints->domain_attr->data_progress = FI_PROGRESS_MANUAL; + + /* Indicate we want to use the CS protocol */ + cxit_fi_hints->ep_attr->protocol = FI_PROTO_CXI_RNR; + + cxit_setup_ep(); + + /* Set up RMA objects */ + cxit_create_ep(); + cxit_create_eq(); + cxit_bind_eq(); + cxit_create_cqs(); + cxit_bind_cqs(); + + /* No FI_RMA_EVENT, don't create/bind remote counters */ + cxit_create_local_cntrs(); + cxit_bind_cntrs(); + + cxit_create_av(); + cxit_bind_av(); + + ret = 
fi_enable(cxit_ep); + cr_assert(ret == FI_SUCCESS, "ret is: %d\n", ret); + + /* Find assigned Endpoint address. Address is assigned during enable. */ + ret = fi_getname(&cxit_ep->fid, &cxit_ep_addr, &addrlen); + cr_assert(ret == FI_SUCCESS, "ret is %d\n", ret); + cr_assert(addrlen == sizeof(cxit_ep_addr)); +} + void cxit_setup_enabled_ep_disable_fi_rma_event(void) { int ret; @@ -746,6 +835,23 @@ void cxit_setup_rma_mr_events(void) fi_control(&cxit_domain->fid, FI_OPT_CXI_SET_PROV_KEY_CACHE, &disable); } +void cxit_setup_rnr_msg_ep(void) +{ + int ret; + struct cxip_addr fake_addr = {.nic = 0xad, .pid = 0xbc}; + + cxit_setup_enabled_rnr_msg_ep(); + + /* Insert local address into AV to prepare to send to self */ + ret = fi_av_insert(cxit_av, (void *)&fake_addr, 1, NULL, 0, NULL); + cr_assert(ret == 1); + + /* Insert local address into AV to prepare to send to self */ + ret = fi_av_insert(cxit_av, (void *)&cxit_ep_addr, 1, &cxit_ep_fi_addr, + 0, NULL); + cr_assert(ret == 1); +} + void cxit_bind_cqs_hybrid_mr_desc(void) { int ret; @@ -853,13 +959,143 @@ void cxit_setup_rma_hybrid_mr_desc(void) cr_assert(ret == 1); } +void cxit_setup_enabled_rnr_ep_hybrid_mr_desc(void) +{ + int ret; + size_t addrlen = sizeof(cxit_ep_addr); + + cxit_setup_getinfo(); + + cxit_tx_cq_attr.format = FI_CQ_FORMAT_TAGGED; + cxit_av_attr.type = FI_AV_TABLE; + + cxit_fi_hints->domain_attr->data_progress = FI_PROGRESS_MANUAL; + cxit_fi_hints->domain_attr->data_progress = FI_PROGRESS_MANUAL; + + /* Indicate we want to use the CS protocol */ + cxit_fi_hints->ep_attr->protocol = FI_PROTO_CXI_RNR; + cxit_fi_hints->domain_attr->mr_mode = FI_MR_PROV_KEY | FI_MR_ALLOCATED | + FI_MR_ENDPOINT; + + cxit_setup_ep_hybrid_mr_desc(); + + cxit_fi->caps &= ~FI_RMA_EVENT; + cxit_fi->domain_attr->caps &= ~FI_RMA_EVENT; + cxit_fi->tx_attr->caps &= ~FI_RMA_EVENT; + cxit_fi->rx_attr->caps &= ~FI_RMA_EVENT; + + /* Set up RMA objects */ + cxit_create_ep(); + cxit_create_eq(); + cxit_bind_eq(); + cxit_create_cqs(); + cxit_bind_cqs_hybrid_mr_desc(); + + /* No FI_RMA_EVENT, don't create/bind remote counters */ + cxit_create_local_cntrs(); + cxit_bind_cntrs(); + + cxit_create_av(); + cxit_bind_av(); + + ret = fi_enable(cxit_ep); + cr_assert(ret == FI_SUCCESS, "ret is: %d\n", ret); + + /* Find assigned Endpoint address. Address is assigned during enable. 
*/ + ret = fi_getname(&cxit_ep->fid, &cxit_ep_addr, &addrlen); + cr_assert(ret == FI_SUCCESS, "ret is %d\n", ret); + cr_assert(addrlen == sizeof(cxit_ep_addr)); +} + +void cxit_setup_enabled_rnr_ep_hybrid_mr_desc_byte_cntr(void) +{ + int ret; + size_t addrlen = sizeof(cxit_ep_addr); + + cxit_setup_getinfo(); + + cxit_tx_cq_attr.format = FI_CQ_FORMAT_TAGGED; + cxit_av_attr.type = FI_AV_TABLE; + + cxit_fi_hints->domain_attr->data_progress = FI_PROGRESS_MANUAL; + cxit_fi_hints->domain_attr->data_progress = FI_PROGRESS_MANUAL; + + /* Indicate we want to use the CS protocol */ + cxit_fi_hints->ep_attr->protocol = FI_PROTO_CXI_RNR; + cxit_fi_hints->domain_attr->mr_mode = FI_MR_PROV_KEY | FI_MR_ALLOCATED | + FI_MR_ENDPOINT; + + cxit_setup_ep_hybrid_mr_desc(); + + cxit_fi->caps &= ~FI_RMA_EVENT; + cxit_fi->domain_attr->caps &= ~FI_RMA_EVENT; + cxit_fi->tx_attr->caps &= ~FI_RMA_EVENT; + cxit_fi->rx_attr->caps &= ~FI_RMA_EVENT; + + /* Set up RMA objects */ + cxit_create_ep(); + cxit_create_eq(); + cxit_bind_eq(); + cxit_create_cqs(); + cxit_bind_cqs_hybrid_mr_desc(); + + /* No FI_RMA_EVENT, don't create/bind remote counters */ + cxit_create_local_byte_cntrs(); + cxit_bind_cntrs(); + + cxit_create_av(); + cxit_bind_av(); + + ret = fi_enable(cxit_ep); + cr_assert(ret == FI_SUCCESS, "ret is: %d\n", ret); + + /* Find assigned Endpoint address. Address is assigned during enable. */ + ret = fi_getname(&cxit_ep->fid, &cxit_ep_addr, &addrlen); + cr_assert(ret == FI_SUCCESS, "ret is %d\n", ret); + cr_assert(addrlen == sizeof(cxit_ep_addr)); +} + +void cxit_setup_rma_rnr_hybrid_mr_desc(void) +{ + int ret; + struct cxip_addr fake_addr = {.nic = 0xad, .pid = 0xbc}; + + cxit_setup_enabled_rnr_ep_hybrid_mr_desc(); + + /* Insert local address into AV to prepare to send to self */ + ret = fi_av_insert(cxit_av, (void *)&fake_addr, 1, NULL, 0, NULL); + cr_assert(ret == 1); + + /* Insert local address into AV to prepare to send to self */ + ret = fi_av_insert(cxit_av, (void *)&cxit_ep_addr, 1, &cxit_ep_fi_addr, + 0, NULL); + cr_assert(ret == 1); +} + +void cxit_setup_rma_rnr_hybrid_mr_desc_byte_cntr(void) +{ + int ret; + struct cxip_addr fake_addr = {.nic = 0xad, .pid = 0xbc}; + + cxit_setup_enabled_rnr_ep_hybrid_mr_desc_byte_cntr(); + + /* Insert local address into AV to prepare to send to self */ + ret = fi_av_insert(cxit_av, (void *)&fake_addr, 1, NULL, 0, NULL); + cr_assert(ret == 1); + + /* Insert local address into AV to prepare to send to self */ + ret = fi_av_insert(cxit_av, (void *)&cxit_ep_addr, 1, &cxit_ep_fi_addr, + 0, NULL); + cr_assert(ret == 1); +} + void cxit_setup_rma(void) { int ret; struct cxip_addr fake_addr = {.nic = 0xad, .pid = 0xbc}; - cxip_trace_append = true; - cxip_trace_enable(true); + cxip_coll_trace_append = true; + cxip_coll_trace_muted = false; cxit_setup_enabled_ep(); /* Insert local address into AV to prepare to send to self */ diff --git a/prov/cxi/test/cxip_test_common.h b/prov/cxi/test/cxip_test_common.h index d04e8fdf56d..664ef3d36e3 100644 --- a/prov/cxi/test/cxip_test_common.h +++ b/prov/cxi/test/cxip_test_common.h @@ -41,9 +41,6 @@ extern bool cxit_prov_key; extern int s_page_size; extern bool enable_cxi_hmem_ops; -extern bool cxip_trace_enable(bool enable); -extern void cxip_trace_flush(void); - void cxit_init(void); void cxit_create_fabric_info(void); void cxit_destroy_fabric_info(void); @@ -68,8 +65,12 @@ void cxit_destroy_av(void); void cxit_bind_av(void); void cxit_setup_rma_disable_fi_rma_event(void); +void cxit_setup_enabled_rnr_msg_ep(void); +void 
cxit_setup_rnr_msg_ep(void); struct fi_info *cxit_allocinfo(void); +struct fi_info *cxit_allocinfo_proto(uint32_t proto); void cxit_setup_getinfo(void); +void cxit_setup_getinfo_proto(uint32_t proto); void cxit_teardown_getinfo(void); void cxit_setup_fabric(void); void cxit_teardown_fabric(void); @@ -88,6 +89,8 @@ void cxit_setup_enabled_ep_fd(void); void cxit_setup_rma(void); void cxit_setup_rma_fd(void); void cxit_setup_rma_hybrid_mr_desc(void); +void cxit_setup_rma_rnr_hybrid_mr_desc(void); +void cxit_setup_rma_rnr_hybrid_mr_desc_byte_cntr(void); void cxit_setup_rma_mr_events(void); #define cxit_setup_tagged cxit_setup_rma #define cxit_setup_msg cxit_setup_rma diff --git a/prov/cxi/test/ep.c b/prov/cxi/test/ep.c index dab6ed9a37b..e415c11f018 100644 --- a/prov/cxi/test/ep.c +++ b/prov/cxi/test/ep.c @@ -59,10 +59,6 @@ static struct ep_test_params ep_ep_params[] = { .retval = -FI_EINVAL}, {.type = FI_EP_DGRAM, .retval = -FI_EINVAL}, - {.type = FI_EP_SOCK_STREAM, - .retval = -FI_EINVAL}, - {.type = FI_EP_SOCK_DGRAM, - .retval = -FI_EINVAL}, {.type = FI_EP_RDM, .context = (void *)0xabcdef, .retval = FI_SUCCESS}, @@ -215,8 +211,8 @@ Test(ep, ep_bind_cq) cr_assert_not_null(ep->ep_obj); cr_assert_eq(ep->ep.fid.fclass, FI_CLASS_EP); - cr_assert_eq(ep->ep_obj->txc.send_cq, tx_cq); - cr_assert_eq(ep->ep_obj->rxc.recv_cq, rx_cq); + cr_assert_eq(ep->ep_obj->txc->send_cq, tx_cq); + cr_assert_eq(ep->ep_obj->rxc->recv_cq, rx_cq); cxit_destroy_ep(); cxit_destroy_cqs(); @@ -253,9 +249,9 @@ Test(ep, ep_bind_cq_eps) ep2 = container_of(fid_ep2, struct cxip_ep, ep.fid); cr_assert_not_null(ep2->ep_obj); - cr_assert_eq(ep->ep_obj->txc.send_cq, ep2->ep_obj->txc.send_cq, + cr_assert_eq(ep->ep_obj->txc->send_cq, ep2->ep_obj->txc->send_cq, "Send CQ mismatch"); - cr_assert_eq(ep->ep_obj->rxc.recv_cq, ep2->ep_obj->rxc.recv_cq, + cr_assert_eq(ep->ep_obj->rxc->recv_cq, ep2->ep_obj->rxc->recv_cq, "Receive CQ mismatch"); ret = fi_close(&fid_ep2->fid); @@ -812,10 +808,11 @@ ParameterizedTest(struct ep_getopt_args *param, ep, getopt_args) if (ret == FI_SUCCESS) { cr_assert_not_null(cxi_ep->ep_obj); - cr_assert_eq(*param->optval, cxi_ep->ep_obj->rxc.min_multi_recv, + cr_assert_eq(*param->optval, + cxi_ep->ep_obj->rxc->min_multi_recv, "fi_getopt val mismatch. %zd != %zd", *param->optval, - cxi_ep->ep_obj->rxc.min_multi_recv); + cxi_ep->ep_obj->rxc->min_multi_recv); cr_assert_eq(*param->optlen, sizeof(size_t), "fi_getopt len mismatch. %zd != %zd", *param->optlen, sizeof(size_t)); @@ -890,9 +887,11 @@ ParameterizedTest(struct ep_setopt_args *param, ep, setopt_args) if (ret == FI_SUCCESS) { cr_assert_not_null(cxi_ep->ep_obj); - cr_assert_eq(param->optval, cxi_ep->ep_obj->rxc.min_multi_recv, + cr_assert_eq(param->optval, + cxi_ep->ep_obj->rxc->min_multi_recv, "fi_setopt val mismatch. 
%zd != %zd", - param->optval, cxi_ep->ep_obj->rxc.min_multi_recv); + param->optval, + cxi_ep->ep_obj->rxc->min_multi_recv); } cxit_destroy_ep(); @@ -958,7 +957,7 @@ Test(ep, stx_ctx) return; ep = container_of(stx, struct cxip_ep, ep); - txc = &ep->ep_obj->txc; + txc = ep->ep_obj->txc; /* Validate stx */ cr_assert_eq(txc->domain, dom); @@ -1003,7 +1002,7 @@ Test(ep, srx_ctx) return; srx_ep = container_of(srx, struct cxip_ep, ep); - rxc = &srx_ep->ep_obj->rxc; + rxc = srx_ep->ep_obj->rxc; /* Validate stx */ cr_assert_eq(rxc->domain, dom); @@ -1290,6 +1289,7 @@ TestSuite(ep_caps, .timeout = CXIT_DEFAULT_TIMEOUT); void verify_ep_msg_cap(uint64_t flags) { struct cxip_ep *ep; + struct cxip_rxc_hpc *rxc_hpc = NULL; int ret; cxit_setup_ep(); @@ -1313,25 +1313,31 @@ void verify_ep_msg_cap(uint64_t flags) ep = container_of(&cxit_ep->fid, struct cxip_ep, ep.fid); + if (ep->ep_obj->rxc->protocol == FI_PROTO_CXI) + rxc_hpc = container_of(ep->ep_obj->rxc, struct cxip_rxc_hpc, + base); + /* Requires knowledge of implementation */ if (flags & FI_SEND) { - cr_assert(ep->ep_obj->txc.enabled, "TX Enabled"); - cr_assert(ep->ep_obj->txc.send_cq != NULL, "Send CQ"); + cr_assert(ep->ep_obj->txc->enabled, "TX Enabled"); + cr_assert(ep->ep_obj->txc->send_cq != NULL, "Send CQ"); } if (flags & FI_RECV) { - cr_assert(ep->ep_obj->rxc.state == RXC_ENABLED || - ep->ep_obj->rxc.state == RXC_ENABLED_SOFTWARE, + cr_assert(ep->ep_obj->rxc->state == RXC_ENABLED || + ep->ep_obj->rxc->state == RXC_ENABLED_SOFTWARE, "RX Enabled"); - cr_assert(ep->ep_obj->rxc.recv_cq != NULL, "Receive CQ"); - cr_assert(ep->ep_obj->rxc.rx_evtq.eq != NULL, "RX H/W EQ"); - cr_assert(ep->ep_obj->rxc.rx_cmdq != NULL, "RX TGT CMDQ"); - cr_assert(ep->ep_obj->rxc.tx_cmdq != NULL, "RX TX CMDQ"); + cr_assert(ep->ep_obj->rxc->recv_cq != NULL, "Receive CQ"); + cr_assert(ep->ep_obj->rxc->rx_evtq.eq != NULL, "RX H/W EQ"); + cr_assert(ep->ep_obj->rxc->rx_cmdq != NULL, "RX TGT CMDQ"); + if (rxc_hpc) + cr_assert(rxc_hpc->tx_cmdq != NULL, "RX TX CMDQ"); } else { - cr_assert(ep->ep_obj->rxc.state == RXC_ENABLED, "R/X enabled"); - cr_assert(ep->ep_obj->rxc.rx_evtq.eq == NULL, "RX H/W EQ"); - cr_assert(ep->ep_obj->rxc.rx_cmdq == NULL, "RX TGT CMDQ"); - cr_assert(ep->ep_obj->rxc.tx_cmdq == NULL, "RX TX CMDQ"); + cr_assert(ep->ep_obj->rxc->state == RXC_ENABLED, "R/X enabled"); + cr_assert(ep->ep_obj->rxc->rx_evtq.eq == NULL, "RX H/W EQ"); + cr_assert(ep->ep_obj->rxc->rx_cmdq == NULL, "RX TGT CMDQ"); + if (rxc_hpc) + cr_assert(rxc_hpc->tx_cmdq == NULL, "RX TX CMDQ"); } cxit_teardown_rma(); diff --git a/prov/cxi/test/fabric.c b/prov/cxi/test/fabric.c index 4eb04e4cf0b..55bc435ff4e 100644 --- a/prov/cxi/test/fabric.c +++ b/prov/cxi/test/fabric.c @@ -379,12 +379,12 @@ Test(getinfo, invalid_fi_directed_recv_with_multiple_auth_keys_per_ep) TestSuite(getinfo_infos, .timeout = CXIT_DEFAULT_TIMEOUT); -#define MAX_INFOS 16 -#define FI_ADDR_CXI_COMPAT FI_ADDR_OPX +#define MAX_INFOS 24 struct info_check { int mr_mode; uint32_t format; + uint32_t protocol; size_t max_ep_auth_key; }; @@ -396,9 +396,10 @@ Test(getinfo_infos, nohints) struct fi_info *fi_ptr; char *dom_name; char *odp; - char *compat; struct info_check infos[MAX_INFOS]; size_t max_ep_auth_key; + uint32_t proto; + uint32_t format; cxit_init(); cr_assert(!cxit_fi_hints, "hints not NULL"); @@ -411,56 +412,56 @@ Test(getinfo_infos, nohints) infos[i].mr_mode = -1; } + odp = getenv("FI_CXI_ODP"); + /* By default when no hints are specified, each interface - * should have 4 fi_info. 
+	 * can have 8 HPC fi_info and 8 CS fi_info. */ - for (i = 0; i < 2; i++) { - if (i < 1) + for (i = 0; i < 4; i++) { + if (i == 0 || i == 2) max_ep_auth_key = 1; else max_ep_auth_key = 4; + /* Set protocol based on compatibility. Note FI_PROTO_CXI_RNR + * does not exist if only old address format/protocol values + * are used. + */ + if (i < 2) + proto = FI_PROTO_CXI; + else + proto = FI_PROTO_CXI_RNR; + + format = FI_ADDR_CXI; infos[info_per_if].mr_mode = FI_MR_ENDPOINT | FI_MR_ALLOCATED | FI_MR_PROV_KEY; - infos[info_per_if].format = FI_ADDR_CXI; + infos[info_per_if].format = format; infos[info_per_if].max_ep_auth_key = max_ep_auth_key; + infos[info_per_if].protocol = proto; info_per_if++; infos[info_per_if].mr_mode = FI_MR_ENDPOINT | FI_MR_ALLOCATED; - infos[info_per_if].format = FI_ADDR_CXI; + infos[info_per_if].format = format; infos[info_per_if].max_ep_auth_key = max_ep_auth_key; + infos[info_per_if].protocol = proto; info_per_if++; /* Add ODP versions if enabled */ - odp = getenv("FI_CXI_ODP"); if (odp && strtol(odp, NULL, 10)) { - infos[info_per_if].format = FI_ADDR_CXI; + infos[info_per_if].format = format; infos[info_per_if].mr_mode = FI_MR_ENDPOINT | FI_MR_PROV_KEY; infos[info_per_if].max_ep_auth_key = max_ep_auth_key; + infos[info_per_if].protocol = proto; info_per_if++; - infos[info_per_if].format = FI_ADDR_CXI; + infos[info_per_if].format = format; infos[info_per_if].mr_mode = FI_MR_ENDPOINT; infos[info_per_if].max_ep_auth_key = max_ep_auth_key; + infos[info_per_if].protocol = proto; info_per_if++; } } - /* If we are supporting compatibility with old constants, - * then fi_info are repeated with compatibility constants. - */ - compat = getenv("FI_CXI_COMPAT"); - if (!compat || strtol(compat, NULL, 10) == 1) { - for (i = 0; i < info_per_if; i++) { - infos[info_per_if + i].mr_mode = - infos[i].mr_mode; - infos[info_per_if + i].format = - FI_ADDR_CXI_COMPAT; - infos[info_per_if + i].max_ep_auth_key = - infos[i].max_ep_auth_key; - } - info_per_if += i; - } cr_assert(info_per_if <= MAX_INFOS, "Too many infos"); fi_ptr = cxit_fi; @@ -507,37 +508,46 @@ Test(getinfo_infos, nohints) cxit_destroy_fabric_info(); } -Test(getinfo_infos, hints) +void getinfo_infos_hints(uint32_t proto) { int num_info; int i; int info_per_if = 0; struct fi_info *fi_ptr; char *dom_name; - char *compat; - struct info_check infos[2]; + char *odp; + int odp_val; + struct info_check infos[3]; + + odp = getenv("FI_CXI_ODP"); + odp_val = !odp ? 
0 : strtol(odp, NULL, 10); + + cr_assert(cxit_fi_hints == NULL, "hints not null"); + cxit_setup_getinfo_proto(proto); + cr_assert(cxit_fi_hints != NULL, "hints still null"); + cr_assert(cxit_fi_hints->ep_attr->protocol == proto, + "hints proto %d expected %d failure", + cxit_fi_hints->ep_attr->protocol, proto); - cxit_setup_fabric(); + cxit_create_fabric_info(); cr_assert(cxit_fi != NULL); - cr_assert(cxit_fi_hints != NULL); - for (i = 0; i < 2; i++) { + for (i = 0; i < 3; i++) { infos[i].format = 0; + infos[i].protocol = 0; infos[i].mr_mode = -1; } - infos[0].format = FI_ADDR_CXI; - infos[0].mr_mode = FI_MR_ENDPOINT | FI_MR_ALLOCATED; + /* We have address format FI_ADDR_CXI */ + infos[info_per_if].mr_mode = FI_MR_ENDPOINT; + if (!odp_val) + infos[info_per_if].mr_mode |= FI_MR_ALLOCATED; if (cxit_prov_key) - infos[0].mr_mode |= FI_MR_PROV_KEY; - info_per_if++; + infos[info_per_if].mr_mode |= FI_MR_PROV_KEY; - compat = getenv("FI_CXI_COMPAT"); - if (!compat || strtol(compat, NULL, 10) == 1) { - infos[1].format = FI_ADDR_CXI_COMPAT; - infos[1].mr_mode = infos[0].mr_mode; - info_per_if++; - } + infos[info_per_if].format = FI_ADDR_CXI; + infos[info_per_if].protocol = proto; + info_per_if++; fi_ptr = cxit_fi; @@ -556,9 +566,8 @@ Test(getinfo_infos, hints) break; num_info++; - cr_assert(num_info <= 2, "too many fi_info %d", + cr_assert(num_info <= 3, "too many fi_info %d", num_info); - cr_assert(infos[num_info - 1].mr_mode == fi_ptr->domain_attr->mr_mode, "expected MR mode %x got %x", @@ -581,6 +590,21 @@ Test(getinfo_infos, hints) cxit_teardown_fabric(); } +Test(getinfo_infos, hints_default_proto) +{ + getinfo_infos_hints(0); +} + +Test(getinfo_infos, hints_proto_hpc) +{ + getinfo_infos_hints(FI_PROTO_CXI); +} + +Test(getinfo_infos, hints_proto_cs) +{ + getinfo_infos_hints(FI_PROTO_CXI_RNR); +} + Test(getinfo_infos, hints_no_rma) { int ret; diff --git a/prov/cxi/test/fi_info_test.sh b/prov/cxi/test/fi_info_test.sh old mode 100644 new mode 100755 diff --git a/prov/cxi/test/msg.c b/prov/cxi/test/msg.c index 058761b2745..c7cbeda7399 100644 --- a/prov/cxi/test/msg.c +++ b/prov/cxi/test/msg.c @@ -14,11 +14,8 @@ #include "cxip.h" #include "cxip_test_common.h" -TestSuite(msg, .init = cxit_setup_msg, .fini = cxit_teardown_msg, - .timeout = CXIT_DEFAULT_TIMEOUT); - -/* Test basic send/recv */ -Test(msg, ping) +/* Test basic send/recv - expected or unexpected*/ +static void ping(bool ux) { int i, ret; uint8_t *recv_buf, @@ -40,14 +37,26 @@ Test(msg, ping) for (i = 0; i < send_len; i++) send_buf[i] = i + 0xa0; - /* Post RX buffer */ - ret = fi_recv(cxit_ep, recv_buf, recv_len, NULL, FI_ADDR_UNSPEC, NULL); - cr_assert_eq(ret, FI_SUCCESS, "fi_recv failed %d", ret); - + /* Post RX if not testing unexpected behavior */ + if (!ux) { + ret = fi_recv(cxit_ep, recv_buf, recv_len, NULL, + FI_ADDR_UNSPEC, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_recv failed %d", ret); + } /* Send 64 bytes to self */ ret = fi_send(cxit_ep, send_buf, send_len, NULL, cxit_ep_fi_addr, NULL); cr_assert_eq(ret, FI_SUCCESS, "fi_send failed %d", ret); + /* Post RX if testing unexpected behavior */ + if (ux) { + /* Make sure RX progress has occurred */ + fi_cq_read(cxit_rx_cq, &rx_cqe, 0); + + /* Post RX buffer */ + ret = fi_recv(cxit_ep, recv_buf, recv_len, NULL, + FI_ADDR_UNSPEC, NULL); + } + /* Wait for async event indicating data has been received */ do { ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); @@ -88,9 +97,8 @@ Test(msg, ping) free(send_buf); free(recv_buf); } - /* Test basic send/recv with data */ -Test(msg, pingdata) 
+static void pingdata(void) { int i, ret; uint8_t *recv_buf, @@ -163,8 +171,8 @@ Test(msg, pingdata) free(recv_buf); } -/* Test basic inject send */ -Test(msg, inject_ping) + +static void vping(void) { int i, ret; uint8_t *recv_buf, @@ -175,6 +183,8 @@ Test(msg, inject_ping) rx_cqe; int err = 0; fi_addr_t from; + struct iovec riovec; + struct iovec siovec; recv_buf = aligned_alloc(s_page_size, recv_len); cr_assert(recv_buf); @@ -187,12 +197,16 @@ Test(msg, inject_ping) send_buf[i] = i + 0xa0; /* Post RX buffer */ - ret = fi_recv(cxit_ep, recv_buf, recv_len, NULL, FI_ADDR_UNSPEC, NULL); + riovec.iov_base = recv_buf; + riovec.iov_len = recv_len; + ret = fi_recvv(cxit_ep, &riovec, NULL, 1, FI_ADDR_UNSPEC, NULL); cr_assert_eq(ret, FI_SUCCESS, "fi_recv failed %d", ret); /* Send 64 bytes to self */ - ret = fi_inject(cxit_ep, send_buf, send_len, cxit_ep_fi_addr); - cr_assert_eq(ret, FI_SUCCESS, "fi_inject failed %d", ret); + siovec.iov_base = send_buf; + siovec.iov_len = send_len; + ret = fi_sendv(cxit_ep, &siovec, NULL, 1, cxit_ep_fi_addr, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_send failed %d", ret); /* Wait for async event indicating data has been received */ do { @@ -200,10 +214,29 @@ Test(msg, inject_ping) } while (ret == -FI_EAGAIN); cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); - validate_rx_event(&rx_cqe, NULL, send_len, FI_MSG | FI_RECV, NULL, - 0, 0); + /* Validate RX event fields */ + cr_assert(rx_cqe.op_context == NULL, "RX CQE Context mismatch"); + cr_assert(rx_cqe.flags == (FI_MSG | FI_RECV), + "RX CQE flags mismatch"); + cr_assert(rx_cqe.len == send_len, "Invalid RX CQE length"); + cr_assert(rx_cqe.buf == 0, "Invalid RX CQE address"); + cr_assert(rx_cqe.data == 0, "Invalid RX CQE data"); + cr_assert(rx_cqe.tag == 0, "Invalid RX CQE tag"); cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &tx_cqe); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + /* Validate TX event fields */ + cr_assert(tx_cqe.op_context == NULL, "TX CQE Context mismatch"); + cr_assert(tx_cqe.flags == (FI_MSG | FI_SEND), + "TX CQE flags mismatch"); + cr_assert(tx_cqe.len == 0, "Invalid TX CQE length"); + cr_assert(tx_cqe.buf == 0, "Invalid TX CQE address"); + cr_assert(tx_cqe.data == 0, "Invalid TX CQE data"); + cr_assert(tx_cqe.tag == 0, "Invalid TX CQE tag"); + /* Validate sent data */ for (i = 0; i < send_len; i++) { cr_expect_eq(recv_buf[i], send_buf[i], @@ -212,16 +245,12 @@ Test(msg, inject_ping) } cr_assert_eq(err, 0, "Data errors seen\n"); - /* Make sure a TX event wasn't delivered */ - ret = fi_cq_read(cxit_tx_cq, &tx_cqe, 1); - cr_assert(ret == -FI_EAGAIN); - free(send_buf); free(recv_buf); } -/* Test basic injectdata */ -Test(msg, injectdata_ping) +/* Test basic sendmsg/recvmsg */ +static void msgping(void) { int i, ret; uint8_t *recv_buf, @@ -232,7 +261,10 @@ Test(msg, injectdata_ping) rx_cqe; int err = 0; fi_addr_t from; - uint64_t data = 0xabcdabcdabcdabcd; + struct fi_msg rmsg = {}; + struct fi_msg smsg = {}; + struct iovec riovec; + struct iovec siovec; recv_buf = aligned_alloc(s_page_size, recv_len); cr_assert(recv_buf); @@ -245,13 +277,26 @@ Test(msg, injectdata_ping) send_buf[i] = i + 0xa0; /* Post RX buffer */ - ret = fi_recv(cxit_ep, recv_buf, recv_len, NULL, FI_ADDR_UNSPEC, NULL); + riovec.iov_base = recv_buf; + riovec.iov_len = recv_len; + rmsg.msg_iov = &riovec; + rmsg.iov_count = 1; + rmsg.addr = FI_ADDR_UNSPEC; + rmsg.context = NULL; + + ret 
= fi_recvmsg(cxit_ep, &rmsg, 0); cr_assert_eq(ret, FI_SUCCESS, "fi_recv failed %d", ret); /* Send 64 bytes to self */ - ret = fi_injectdata(cxit_ep, send_buf, send_len, data, - cxit_ep_fi_addr); - cr_assert_eq(ret, FI_SUCCESS, "fi_inject failed %d", ret); + siovec.iov_base = send_buf; + siovec.iov_len = send_len; + smsg.msg_iov = &siovec; + smsg.iov_count = 1; + smsg.addr = cxit_ep_fi_addr; + smsg.context = NULL; + + ret = fi_sendmsg(cxit_ep, &smsg, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_send failed %d", ret); /* Wait for async event indicating data has been received */ do { @@ -259,10 +304,29 @@ Test(msg, injectdata_ping) } while (ret == -FI_EAGAIN); cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); - validate_rx_event(&rx_cqe, NULL, send_len, - FI_MSG | FI_RECV | FI_REMOTE_CQ_DATA, NULL, data, 0); + /* Validate RX event fields */ + cr_assert(rx_cqe.op_context == NULL, "RX CQE Context mismatch"); + cr_assert(rx_cqe.flags == (FI_MSG | FI_RECV), + "RX CQE flags mismatch"); + cr_assert(rx_cqe.len == send_len, "Invalid RX CQE length"); + cr_assert(rx_cqe.buf == 0, "Invalid RX CQE address"); + cr_assert(rx_cqe.data == 0, "Invalid RX CQE data"); + cr_assert(rx_cqe.tag == 0, "Invalid RX CQE tag"); cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &tx_cqe); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + /* Validate TX event fields */ + cr_assert(tx_cqe.op_context == NULL, "TX CQE Context mismatch"); + cr_assert(tx_cqe.flags == (FI_MSG | FI_SEND), + "TX CQE flags mismatch"); + cr_assert(tx_cqe.len == 0, "Invalid TX CQE length"); + cr_assert(tx_cqe.buf == 0, "Invalid TX CQE address"); + cr_assert(tx_cqe.data == 0, "Invalid TX CQE data"); + cr_assert(tx_cqe.tag == 0, "Invalid TX CQE tag"); + /* Validate sent data */ for (i = 0; i < send_len; i++) { cr_expect_eq(recv_buf[i], send_buf[i], @@ -271,16 +335,32 @@ Test(msg, injectdata_ping) } cr_assert_eq(err, 0, "Data errors seen\n"); - /* Make sure a TX event wasn't delivered */ - ret = fi_cq_read(cxit_tx_cq, &tx_cqe, 1); - cr_assert(ret == -FI_EAGAIN); - free(send_buf); free(recv_buf); } -/* Test basic sendv/recvv */ -Test(msg, vping) +TestSuite(msg, .init = cxit_setup_msg, .fini = cxit_teardown_msg, + .timeout = CXIT_DEFAULT_TIMEOUT); + +/* Test basic send/recv */ +Test(msg, ping) +{ + ping(false); +} + +Test(msg, ping_retry) +{ + ping(true); +} + +/* Test basic send/recv with data */ +Test(msg, pingdata) +{ + pingdata(); +} + +/* Test basic inject send */ +Test(msg, inject_ping) { int i, ret; uint8_t *recv_buf, @@ -291,8 +371,6 @@ Test(msg, vping) rx_cqe; int err = 0; fi_addr_t from; - struct iovec riovec; - struct iovec siovec; recv_buf = aligned_alloc(s_page_size, recv_len); cr_assert(recv_buf); @@ -305,16 +383,12 @@ Test(msg, vping) send_buf[i] = i + 0xa0; /* Post RX buffer */ - riovec.iov_base = recv_buf; - riovec.iov_len = recv_len; - ret = fi_recvv(cxit_ep, &riovec, NULL, 1, FI_ADDR_UNSPEC, NULL); + ret = fi_recv(cxit_ep, recv_buf, recv_len, NULL, FI_ADDR_UNSPEC, NULL); cr_assert_eq(ret, FI_SUCCESS, "fi_recv failed %d", ret); /* Send 64 bytes to self */ - siovec.iov_base = send_buf; - siovec.iov_len = send_len; - ret = fi_sendv(cxit_ep, &siovec, NULL, 1, cxit_ep_fi_addr, NULL); - cr_assert_eq(ret, FI_SUCCESS, "fi_send failed %d", ret); + ret = fi_inject(cxit_ep, send_buf, send_len, cxit_ep_fi_addr); + cr_assert_eq(ret, FI_SUCCESS, "fi_inject failed %d", ret); /* Wait for async event indicating data has 
been received */ do { @@ -322,29 +396,10 @@ Test(msg, vping) } while (ret == -FI_EAGAIN); cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); - /* Validate RX event fields */ - cr_assert(rx_cqe.op_context == NULL, "RX CQE Context mismatch"); - cr_assert(rx_cqe.flags == (FI_MSG | FI_RECV), - "RX CQE flags mismatch"); - cr_assert(rx_cqe.len == send_len, "Invalid RX CQE length"); - cr_assert(rx_cqe.buf == 0, "Invalid RX CQE address"); - cr_assert(rx_cqe.data == 0, "Invalid RX CQE data"); - cr_assert(rx_cqe.tag == 0, "Invalid RX CQE tag"); + validate_rx_event(&rx_cqe, NULL, send_len, FI_MSG | FI_RECV, NULL, + 0, 0); cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); - /* Wait for async event indicating data has been sent */ - ret = cxit_await_completion(cxit_tx_cq, &tx_cqe); - cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); - - /* Validate TX event fields */ - cr_assert(tx_cqe.op_context == NULL, "TX CQE Context mismatch"); - cr_assert(tx_cqe.flags == (FI_MSG | FI_SEND), - "TX CQE flags mismatch"); - cr_assert(tx_cqe.len == 0, "Invalid TX CQE length"); - cr_assert(tx_cqe.buf == 0, "Invalid TX CQE address"); - cr_assert(tx_cqe.data == 0, "Invalid TX CQE data"); - cr_assert(tx_cqe.tag == 0, "Invalid TX CQE tag"); - /* Validate sent data */ for (i = 0; i < send_len; i++) { cr_expect_eq(recv_buf[i], send_buf[i], @@ -353,12 +408,16 @@ Test(msg, vping) } cr_assert_eq(err, 0, "Data errors seen\n"); + /* Make sure a TX event wasn't delivered */ + ret = fi_cq_read(cxit_tx_cq, &tx_cqe, 1); + cr_assert(ret == -FI_EAGAIN); + free(send_buf); free(recv_buf); } -/* Test basic sendmsg/recvmsg */ -Test(msg, msgping) +/* Test basic injectdata */ +Test(msg, injectdata_ping) { int i, ret; uint8_t *recv_buf, @@ -369,10 +428,7 @@ Test(msg, msgping) rx_cqe; int err = 0; fi_addr_t from; - struct fi_msg rmsg = {}; - struct fi_msg smsg = {}; - struct iovec riovec; - struct iovec siovec; + uint64_t data = 0xabcdabcdabcdabcd; recv_buf = aligned_alloc(s_page_size, recv_len); cr_assert(recv_buf); @@ -385,26 +441,13 @@ Test(msg, msgping) send_buf[i] = i + 0xa0; /* Post RX buffer */ - riovec.iov_base = recv_buf; - riovec.iov_len = recv_len; - rmsg.msg_iov = &riovec; - rmsg.iov_count = 1; - rmsg.addr = FI_ADDR_UNSPEC; - rmsg.context = NULL; - - ret = fi_recvmsg(cxit_ep, &rmsg, 0); + ret = fi_recv(cxit_ep, recv_buf, recv_len, NULL, FI_ADDR_UNSPEC, NULL); cr_assert_eq(ret, FI_SUCCESS, "fi_recv failed %d", ret); /* Send 64 bytes to self */ - siovec.iov_base = send_buf; - siovec.iov_len = send_len; - smsg.msg_iov = &siovec; - smsg.iov_count = 1; - smsg.addr = cxit_ep_fi_addr; - smsg.context = NULL; - - ret = fi_sendmsg(cxit_ep, &smsg, 0); - cr_assert_eq(ret, FI_SUCCESS, "fi_send failed %d", ret); + ret = fi_injectdata(cxit_ep, send_buf, send_len, data, + cxit_ep_fi_addr); + cr_assert_eq(ret, FI_SUCCESS, "fi_inject failed %d", ret); /* Wait for async event indicating data has been received */ do { @@ -412,29 +455,10 @@ Test(msg, msgping) } while (ret == -FI_EAGAIN); cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); - /* Validate RX event fields */ - cr_assert(rx_cqe.op_context == NULL, "RX CQE Context mismatch"); - cr_assert(rx_cqe.flags == (FI_MSG | FI_RECV), - "RX CQE flags mismatch"); - cr_assert(rx_cqe.len == send_len, "Invalid RX CQE length"); - cr_assert(rx_cqe.buf == 0, "Invalid RX CQE address"); - cr_assert(rx_cqe.data == 0, "Invalid RX CQE data"); - cr_assert(rx_cqe.tag == 0, "Invalid RX CQE tag"); + validate_rx_event(&rx_cqe, NULL, send_len, + FI_MSG | FI_RECV | 
FI_REMOTE_CQ_DATA, NULL, data, 0); cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); - /* Wait for async event indicating data has been sent */ - ret = cxit_await_completion(cxit_tx_cq, &tx_cqe); - cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); - - /* Validate TX event fields */ - cr_assert(tx_cqe.op_context == NULL, "TX CQE Context mismatch"); - cr_assert(tx_cqe.flags == (FI_MSG | FI_SEND), - "TX CQE flags mismatch"); - cr_assert(tx_cqe.len == 0, "Invalid TX CQE length"); - cr_assert(tx_cqe.buf == 0, "Invalid TX CQE address"); - cr_assert(tx_cqe.data == 0, "Invalid TX CQE data"); - cr_assert(tx_cqe.tag == 0, "Invalid TX CQE tag"); - /* Validate sent data */ for (i = 0; i < send_len; i++) { cr_expect_eq(recv_buf[i], send_buf[i], @@ -443,10 +467,26 @@ Test(msg, msgping) } cr_assert_eq(err, 0, "Data errors seen\n"); + /* Make sure a TX event wasn't delivered */ + ret = fi_cq_read(cxit_tx_cq, &tx_cqe, 1); + cr_assert(ret == -FI_EAGAIN); + free(send_buf); free(recv_buf); } +/* Test basic sendv/recvv */ +Test(msg, vping) +{ + vping(); +} + +/* Test basic sendmsg/recvmsg */ +Test(msg, msgping) +{ + msgping(); +} + /* Test basic sendmsg/recvmsg with two EP bound to same CQ */ Test(msg, msgping_cq_share) { @@ -774,7 +814,7 @@ Test(msg, inject_msgping) } /* Test send/recv sizes small to large */ -Test(msg, sizes) +static void sizes(void) { int i, j, ret; uint8_t *recv_buf, @@ -867,6 +907,12 @@ Test(msg, sizes) free(recv_buf); } +/* Test send/recv sizes small to large */ +Test(msg, sizes) +{ + sizes(); +} + /* Test send/recv sizes large to small (this exercises MR caching) */ Test(msg, sizes_desc) { @@ -1922,7 +1968,7 @@ Test(msg, fc_no_eq_space_expected_multi_recv_onload_ules, .timeout = 10) test_fc_multi_recv(1, false); } -Test(msg, zero_byte_send_recv_iov) +static void zero_byte_send_recv_iov(void) { int ret; struct fi_cq_tagged_entry cqe; @@ -1944,10 +1990,15 @@ Test(msg, zero_byte_send_recv_iov) cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); } -Test(msg, zero_byte_send_recv_msg) +Test(msg, zero_byte_send_recv_iov) { - int ret; - struct fi_cq_tagged_entry cqe; + zero_byte_send_recv_iov(); +} + +static void zero_byte_send_recv_msg(void) +{ + int ret; + struct fi_cq_tagged_entry cqe; struct fi_msg rmsg = {}; struct fi_msg smsg = {}; @@ -1972,6 +2023,11 @@ Test(msg, zero_byte_send_recv_msg) cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); } +Test(msg, zero_byte_send_recv_msg) +{ + zero_byte_send_recv_msg(); +} + /* Verify that FI_AV_USER_ID is returned from fi_cq_readfrom(). */ Test(msg, av_user_id) { @@ -2008,6 +2064,360 @@ Test(msg, av_user_id) ret = cxit_await_completion(cxit_tx_cq, &tx_cqe); cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); } +/* Note: the FI_PROTO_CXI_RNR message test suite uses rnr_msg + * so that it will not be included in flow-control and software + * EP tests, which it does not support. 
+ */ +TestSuite(rnr_msg, .init = cxit_setup_rnr_msg_ep, + .fini = cxit_teardown_msg, .timeout = CXIT_DEFAULT_TIMEOUT); + +Test(rnr_msg, ping) +{ + ping(false); +} + +Test(rnr_msg, ping_retry) +{ + ping(true); +} + +Test(rnr_msg, ping_retry_b2b) +{ + /* unexpected, RNR retries */ + ping(true); + ping(true); + /*expected, no RNR retries */ + ping(false); + /* unexpected, RNR retries */ + ping(true); +} + +Test(rnr_msg, pingdata) +{ + pingdata(); +} + +Test(rnr_msg, vping) +{ + vping(); +} + +Test(rnr_msg, msgping) +{ + msgping(); +} + +Test(rnr_msg, sizes) +{ + sizes(); +} + +Test(rnr_msg, zero_byte_send_recv_iov) +{ + zero_byte_send_recv_iov(); +} + +Test(rnr_msg, zero_byte_send_recv_msg) +{ + zero_byte_send_recv_msg(); +} +/* CS - expected messages only */ +static struct msg_multi_recv_params rnr_params[] = { + /* expected eager */ + {.send_len = SHORT_SEND_LEN, + .recv_len = SHORT_SENDS * SHORT_SEND_LEN, + .ux = false}, + + /* exp long */ + {.send_len = LONG_SEND_LEN, + .recv_len = LONG_SENDS*LONG_SEND_LEN, + .ux = false}, + + /* exp overflow */ + {.send_len = LONG_SEND_LEN, + .recv_len = LONG_SENDS*LONG_SEND_LEN + (LONG_SEND_LEN - LONG_OLEN), + .ux = false, + .sends = LONG_SENDS+1, + .olen = LONG_OLEN}, + + /* exp overflow */ + {.send_len = LONG_SEND_LEN, + .recv_len = LONG_SENDS*LONG_SEND_LEN + (LONG_SEND_LEN - SHORT_OLEN), + .ux = false, + .sends = LONG_SENDS+1, + .olen = SHORT_OLEN}, +}; + +ParameterizedTestParameters(rnr_msg, multi_recv) +{ + size_t param_sz; + + param_sz = ARRAY_SIZE(rnr_params); + return cr_make_param_array(struct msg_multi_recv_params, rnr_params, + param_sz); +} + +/* Test multi-recv messaging */ +ParameterizedTest(struct msg_multi_recv_params *param, rnr_msg, multi_recv) +{ + void *recv_buf; + void *send_buf; + + + recv_buf = aligned_alloc(s_page_size, param->recv_len); + cr_assert(recv_buf); + + send_buf = aligned_alloc(s_page_size, param->send_len); + cr_assert(send_buf); + + do_multi_recv(send_buf, param->send_len, recv_buf, + param->recv_len, param->ux, param->sends, + param->olen); + + free(send_buf); + free(recv_buf); +} + +Test(rnr_msg, timeout) +{ + int i, ret; + uint8_t *send_buf; + int send_len = 64; + struct fi_cq_tagged_entry tx_cqe; + struct fi_cq_err_entry err_cqe = {}; + + send_buf = aligned_alloc(s_page_size, send_len); + cr_assert(send_buf); + + for (i = 0; i < send_len; i++) + send_buf[i] = i + 0xa0; + + /* Send 64 bytes to self, no receive posted */ + ret = fi_send(cxit_ep, send_buf, send_len, NULL, cxit_ep_fi_addr, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_send failed %d", ret); + + /* Wait for async event indicating data has been received */ + do { + ret = fi_cq_read(cxit_tx_cq, &tx_cqe, 1); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, -FI_EAVAIL, "fi_cq_read unexpected status %d", ret); + + /* Read the error data */ + ret = fi_cq_readerr(cxit_tx_cq, &err_cqe, 0); + cr_assert_eq(ret, 1); + + cr_assert(err_cqe.err == FI_EIO, + "Invalid Error TX CQE err %d", err_cqe.err); + cr_assert(err_cqe.prov_errno == C_RC_ENTRY_NOT_FOUND, + "Invalid Error TX CQE prov_errno %d", err_cqe.prov_errno); + + free(send_buf); +} + +Test(rnr_msg, rnr_cancel) +{ + int i, ret; + uint8_t *send_buf1; + uint8_t *send_buf2; + uint8_t *recv_buf; + int send_len = 64; + struct fi_context ctxt[2]; + struct fi_cq_tagged_entry tx_cqe; + struct fi_cq_tagged_entry rx_cqe; + struct fi_cq_err_entry err_cqe = {}; + send_buf1 = aligned_alloc(s_page_size, send_len); + cr_assert(send_buf1); + send_buf2 = aligned_alloc(s_page_size, send_len); + cr_assert(send_buf2); + recv_buf 
= aligned_alloc(s_page_size, send_len); + cr_assert(recv_buf); + + for (i = 0; i < send_len; i++) { + send_buf1[i] = i + 0xa0; + send_buf2[i] = i + 0x05; + } + + /* Post two sends of 64 bytes each using a unique context */ + ret = fi_send(cxit_ep, send_buf1, send_len, NULL, cxit_ep_fi_addr, + &ctxt[0]); + cr_assert_eq(ret, FI_SUCCESS, "fi_send one failed %d", ret); + + ret = fi_send(cxit_ep, send_buf2, send_len, NULL, + cxit_ep_fi_addr, &ctxt[1]); + cr_assert_eq(ret, FI_SUCCESS, "fi_send two failed %d", ret); + + /* Cancel the first send */ + ret = fi_cancel(&cxit_ep->fid, &ctxt[0]); + cr_assert_eq(ret, FI_SUCCESS, "Request not found %d", ret); + + /* Give time for a retry to complete */ + usleep(100); + + /* Read the canceled TX completion status */ + do { + ret = fi_cq_read(cxit_tx_cq, &tx_cqe, 1); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, -FI_EAVAIL, "fi_cq_read unexpected status %d", ret); + + /* Read the error data */ + ret = fi_cq_readerr(cxit_tx_cq, &err_cqe, 0); + cr_assert_eq(ret, 1); + + cr_assert(err_cqe.err == FI_ECANCELED, + "Invalid Error TX CQE err %d", err_cqe.err); + cr_assert(err_cqe.prov_errno == C_RC_ENTRY_NOT_FOUND, + "Invalid Error TX CQE prov_errno %d", err_cqe.prov_errno); + + /* Post a receive, the second request should land */ + ret = fi_recv(cxit_ep, recv_buf, send_len, NULL, + FI_ADDR_UNSPEC, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_recv failed %d", ret); + + /* Wait for async event indicating data has been sent */ + do { + ret = fi_cq_read(cxit_tx_cq, &tx_cqe, 1); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected status %d", ret); + + /* Wait for async event indicating data has been received */ + do { + ret = fi_cq_read(cxit_rx_cq, &rx_cqe, 1); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected status %d", ret); + + /* Second TX data should have landed */ + cr_expect_arr_eq(recv_buf, send_buf2, send_len); + + free(send_buf1); + free(send_buf2); + free(recv_buf); +} + +/* Test many CS retries in flight */ +Test(rnr_msg, multi_recv_retries) +{ + int i, j, ret; + int err = 0; + fi_addr_t from; + struct fi_msg rmsg = {}; + struct fi_msg smsg = {}; + struct iovec riovec; + struct iovec siovec; + uint64_t rxe_flags; + int bytes_sent = 0; + uint8_t *recv_buf; + uint8_t *send_buf; + size_t send_len = 8*1024; + int sends = 10; + size_t recv_len = send_len * 5 + 64 * 5; + int sent = 0; + int recved = 0; + struct fi_cq_tagged_entry tx_cqe[sends]; + struct fi_cq_tagged_entry rx_cqe[sends]; + + recv_buf = aligned_alloc(s_page_size, recv_len); + cr_assert(recv_buf); + memset(recv_buf, 0, recv_len); + + send_buf = aligned_alloc(s_page_size, send_len); + cr_assert(send_buf); + for (i = 0; i < send_len; i++) + send_buf[i] = i + 0xa0; + + /* Post all TX before posting a buffer, these + * will all go into the CS retry flow as they + * are unexpected. + */ + siovec.iov_base = send_buf; + siovec.iov_len = send_len; + smsg.msg_iov = &siovec; + smsg.iov_count = 1; + smsg.addr = cxit_ep_fi_addr; + smsg.context = NULL; + + for (i = 0; i < sends; i++) { + /* Interleave long and short sends. They will complete in a + * different order than they were sent or received. 
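The timeout and cancel tests above share one error-handling idiom: a send that exhausts its RNR retries (or is cancelled) surfaces as -FI_EAVAIL on the TX CQ, and the detail is pulled with fi_cq_readerr(). A condensed sketch of that idiom, with the error codes taken from the tests and the helper signature purely illustrative:

#include <rdma/fabric.h>
#include <rdma/fi_eq.h>

/* Sketch: drain one TX completion, distinguishing success from an RNR
 * timeout (FI_EIO) or an explicit cancel (FI_ECANCELED). */
static int check_tx_completion(struct fid_cq *txcq)
{
    struct fi_cq_tagged_entry cqe;
    struct fi_cq_err_entry err = {};
    ssize_t ret;

    do {
        ret = fi_cq_read(txcq, &cqe, 1);
    } while (ret == -FI_EAGAIN);

    if (ret == 1)
        return FI_SUCCESS;
    if (ret != -FI_EAVAIL)
        return (int)ret;

    /* The error entry carries err (FI_EIO or FI_ECANCELED in the tests
     * above) plus the provider code (e.g. C_RC_ENTRY_NOT_FOUND). */
    ret = fi_cq_readerr(txcq, &err, 0);
    return ret == 1 ? -err.err : (int)ret;
}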
+ */ + if (i % 2) + siovec.iov_len = 64; + else + siovec.iov_len = 8*1024; + + ret = fi_sendmsg(cxit_ep, &smsg, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_send failed %d", + ret); + } + + /* Post an RX multi-recv buffer to receive all the sends */ + riovec.iov_base = recv_buf; + riovec.iov_len = recv_len; + rmsg.msg_iov = &riovec; + rmsg.iov_count = 1; + rmsg.addr = FI_ADDR_UNSPEC; + rmsg.context = NULL; + + /* Force more RNR Acks and retries */ + usleep(100); + ret = fi_cq_read(cxit_tx_cq, &tx_cqe[0], 0); + usleep(100); + + /* Start accepting sends */ + ret = fi_recvmsg(cxit_ep, &rmsg, FI_MULTI_RECV); + cr_assert_eq(ret, FI_SUCCESS, "fi_recv failed %d", ret); + + for (i = 0; i < sends; i++) { + /* Gather both events, ensure progress on both sides. */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe[recved], 1, + &from); + if (ret == 1) { + recved++; + } else { + cr_assert_eq(ret, -FI_EAGAIN, + "fi_cq_read unexpected value %d", + ret); + } + + ret = fi_cq_read(cxit_tx_cq, &tx_cqe[sent], 1); + if (ret == 1) { + sent++; + } else { + cr_assert_eq(ret, -FI_EAGAIN, + "fi_cq_read unexpected value %d", + ret); + } + } while (!(sent == sends && recved == sends)); + } + + /* All TX and RX completions have been received */ + + for (i = 0; i < sends; i++) { + bytes_sent += rx_cqe[i].len; + rxe_flags = FI_MSG | FI_RECV; + if (bytes_sent > (recv_len - CXIP_EP_MIN_MULTI_RECV)) + rxe_flags |= FI_MULTI_RECV; + + cr_assert(rx_cqe[i].flags == rxe_flags, "CQE flags mismatch"); + cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); + + validate_tx_event(&tx_cqe[i], FI_MSG | FI_SEND, NULL); + + /* Validate sent data */ + uint8_t *rbuf = rx_cqe[i].buf; + + for (j = 0; j < rx_cqe[i].len; j++) { + cr_expect_eq(rbuf[j], send_buf[j], + "data mismatch, element[%d], exp=%d saw=%d, err=%d\n", + j, send_buf[j], recv_buf[j], err++); + } + cr_assert_eq(err, 0, "Data errors seen\n"); + } + + free(send_buf); + free(recv_buf); +} + /* Verify that FI_AV_USER_ID is returned from fi_cq_readfrom(). */ Test(msg, av_user_id_domain_cap) @@ -2160,10 +2570,485 @@ Test(hybrid_preemptive, unexpected_msg_preemptive) cr_assert(ret == FI_SUCCESS); } - while (cxip_ep->ep_obj->rxc.state != RXC_ENABLED_SOFTWARE) + while (cxip_ep->ep_obj->rxc->state != RXC_ENABLED_SOFTWARE) fi_cq_read(cxit_rx_cq, NULL, 0); cr_assert(ret == FI_SUCCESS); cxit_teardown_msg(); } + +static void msg_hybrid_mr_desc_test_runner(bool multirecv, + bool cq_events) +{ + struct mem_region send_window; + struct mem_region recv_window; + uint64_t send_key = 0x2; + uint64_t recv_key = 0x1; + int iters = 10; + int send_len = 1024; + int recv_len = multirecv ? iters * send_len + 20 : send_len; + int recv_msg_len = send_len; + int send_win_len = send_len * iters; + int recv_win_len = multirecv ? recv_len : recv_len * iters; + uint64_t recv_flags = cq_events ? FI_COMPLETION : 0; + uint64_t send_flags = cq_events ? 
FI_COMPLETION | FI_TRANSMIT_COMPLETE : + FI_TRANSMIT_COMPLETE; + uint64_t max_rnr_wait_us = 0; + struct iovec riovec; + struct iovec siovec; + struct fi_msg msg = {}; + struct fi_cq_tagged_entry cqe; + int ret; + int i; + void *send_desc[1]; + void *recv_desc[1]; + + ret = mr_create(send_win_len, FI_READ | FI_WRITE, 0xa, &send_key, + &send_window); + cr_assert(ret == FI_SUCCESS); + + send_desc[0] = fi_mr_desc(send_window.mr); + cr_assert(send_desc[0] != NULL); + + ret = mr_create(recv_win_len, FI_READ | FI_WRITE, 0x3, &recv_key, + &recv_window); + cr_assert(ret == FI_SUCCESS); + recv_desc[0] = fi_mr_desc(recv_window.mr); + cr_assert(recv_desc[0] != NULL); + + msg.iov_count = 1; + msg.addr = FI_ADDR_UNSPEC; + msg.context = NULL; + msg.desc = recv_desc; + msg.msg_iov = &riovec; + + /* Always pre-post receives */ + if (multirecv) { + riovec.iov_base = recv_window.mem; + riovec.iov_len = recv_win_len; + recv_flags |= FI_MULTI_RECV; + ret = fi_recvmsg(cxit_ep, &msg, recv_flags); + cr_assert_eq(ret, FI_SUCCESS, "fi_recv failed %d", ret); + } else { + for (i = 0; i < iters; i++) { + riovec.iov_base = recv_window.mem + recv_len * i; + riovec.iov_len = recv_len; + ret = fi_recvmsg(cxit_ep, &msg, recv_flags); + cr_assert_eq(ret, FI_SUCCESS, "fi_recv failed %d", ret); + } + } + + /* If not using completions to avoid internal completion + * set MAX RNR time to 0 + */ + if (!cq_events) { + ret = fi_set_val(&cxit_ep->fid, + FI_OPT_CXI_SET_RNR_MAX_RETRY_TIME, + (void *) &max_rnr_wait_us); + cr_assert(ret == FI_SUCCESS, "Set max RNR = 0 failed %d", ret); + } + + /* Send messages */ + msg.addr = cxit_ep_fi_addr; + msg.iov_count = 1; + msg.context = NULL; + msg.desc = send_desc; + msg.msg_iov = &siovec; + + for (i = 0; i < iters; i++) { + siovec.iov_base = send_window.mem + send_len * i; + siovec.iov_len = send_len; + ret = fi_sendmsg(cxit_ep, &msg, send_flags); + cr_assert_eq(ret, FI_SUCCESS, "fi_sendmsg failed %d", ret); + } + + /* Await Send completions or counter updates */ + if (cq_events) { + for (i = 0; i < iters; i++) { + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + + validate_tx_event(&cqe, FI_MSG | FI_SEND, NULL); + } + } else { + ret = fi_cntr_wait(cxit_send_cntr, iters, 1000); + cr_assert(ret == FI_SUCCESS); + } + + /* Make sure only expected completions were generated */ + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + cr_assert(ret == -FI_EAGAIN); + + /* Await Receive completions or counter updates */ + if (cq_events) { + for (i = 0; i < iters; i++) { + ret = cxit_await_completion(cxit_rx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + + recv_flags = FI_MSG | FI_RECV; + if (multirecv) { + /* We've sized so last message will unlink */ + if (i == iters - 1) + recv_flags |= FI_MULTI_RECV; + validate_rx_event(&cqe, NULL, recv_msg_len, + recv_flags, + recv_window.mem + + recv_msg_len * i, 0, 0); + } else { + validate_rx_event(&cqe, NULL, recv_msg_len, + recv_flags, NULL, 0, 0); + } + } + } else { + ret = fi_cntr_wait(cxit_recv_cntr, iters, 1000); + cr_assert(ret == FI_SUCCESS, "Recv cntr wait returned %d", ret); + + /* With FI_MULTI_RECV, a single completion associated with + * the buffer un-link should be reported. 
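Several of the new tests (multi_recv_retries and the hybrid append runners) revolve around a single FI_MULTI_RECV buffer that absorbs many sends. A hedged sketch of the posting side, mirroring the fi_setopt()/fi_recvmsg() calls used above; the threshold value and the helper signature are illustrative, not the tests' exact code:

#include <sys/uio.h>
#include <rdma/fabric.h>
#include <rdma/fi_endpoint.h>

/* Sketch: post one FI_MULTI_RECV buffer and lower the unlink threshold. */
static int post_multi_recv(struct fid_ep *ep, void *buf, size_t len,
                           void *context)
{
    size_t min_multi_recv = 0;  /* illustrative: never unlink early */
    struct iovec iov = { .iov_base = buf, .iov_len = len };
    struct fi_msg msg = {
        .msg_iov = &iov,
        .iov_count = 1,
        .addr = FI_ADDR_UNSPEC,
        .context = context,
    };
    int ret;

    ret = fi_setopt(&ep->fid, FI_OPT_ENDPOINT, FI_OPT_MIN_MULTI_RECV,
                    &min_multi_recv, sizeof(min_multi_recv));
    if (ret)
        return ret;

    /* Each matched send produces an FI_RECV completion; the completion
     * that unlinks the buffer additionally carries FI_MULTI_RECV. */
    return (int)fi_recvmsg(ep, &msg, FI_MULTI_RECV);
}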
+ */ + if (multirecv) { + ret = fi_cq_read(cxit_rx_cq, &cqe, 1); + cr_assert(ret == 1); + cr_assert(cqe.flags & FI_MULTI_RECV, + "No FI_MULTI_RECV, flags 0x%lX", cqe.flags); + cr_assert(!(cqe.flags & FI_RECV), "FI_RECV flag set"); + cr_assert(cqe.buf == NULL, + "Unexpected cqe.buf value %p", cqe.buf); + cr_assert(cqe.len == 0, + "Unexpected cqe.len value %ld", cqe.len); + } + } + + /* Make sure only expected completions were generated */ + ret = fi_cq_read(cxit_rx_cq, &cqe, 1); + cr_assert(ret == -FI_EAGAIN); + + for (i = 0; i < send_win_len; i++) + cr_assert_eq(send_window.mem[i], recv_window.mem[i], + "data mismatch, element: (%d) %02x != %02x\n", i, + send_window.mem[i], recv_window.mem[i]); + + mr_destroy(&send_window); + mr_destroy(&recv_window); +} + +TestSuite(rnr_msg_hybrid_mr_desc, .init = cxit_setup_rma_rnr_hybrid_mr_desc, + .fini = cxit_teardown_rma, .timeout = CXIT_DEFAULT_TIMEOUT); + +Test(rnr_msg_hybrid_mr_desc, non_multirecv_comp) +{ + msg_hybrid_mr_desc_test_runner(false, true); +} + +Test(rnr_msg_hybrid_mr_desc, multirecv_comp) +{ + msg_hybrid_mr_desc_test_runner(true, true); +} + +Test(rnr_msg_hybrid_mr_desc, non_multirecv_non_comp) +{ + msg_hybrid_mr_desc_test_runner(false, false); +} + +Test(rnr_msg_hybrid_mr_desc, multirecv_non_comp) +{ + msg_hybrid_mr_desc_test_runner(true, false); +} + +/* Verify non-descriptor traffic works */ +Test(rnr_msg_hybrid_mr_desc, sizes_comp) +{ + uint64_t flags; + int ret; + + /* Turn on completions notifications */ + flags = FI_SEND; + ret = fi_control(&cxit_ep->fid, FI_GETOPSFLAG, (void *)&flags); + cr_assert_eq(ret, FI_SUCCESS, "fi_control FI_GETOPSFLAG TX ret %d", + ret); + flags |= FI_SEND | FI_COMPLETION | FI_TRANSMIT_COMPLETE; + ret = fi_control(&cxit_ep->fid, FI_SETOPSFLAG, (void *)&flags); + cr_assert_eq(ret, FI_SUCCESS, "fi_control FI_SETOPSFLAG TX ret %d", + ret); + + flags = FI_RECV; + ret = fi_control(&cxit_ep->fid, FI_GETOPSFLAG, (void *)&flags); + cr_assert_eq(ret, FI_SUCCESS, "fi_control FI_GETOPSFLAG RX ret %d", + ret); + flags |= FI_RECV | FI_COMPLETION; + ret = fi_control(&cxit_ep->fid, FI_SETOPSFLAG, (void *)&flags); + cr_assert_eq(ret, FI_SUCCESS, "fi_control FI_SETOPSFLAG RX ret %d", + ret); + + sizes(); +} + +static void msg_hybrid_append_test_runner(bool recv_truncation, + bool byte_counts, + bool cq_events) +{ + struct cxip_ep *cxip_ep = container_of(&cxit_ep->fid, struct cxip_ep, + ep.fid); + struct mem_region send_window; + struct mem_region recv_window; + uint64_t send_key = 0x2; + uint64_t recv_key = 0x1; + int iters = 10; + int send_len = 1024; + int recv_len = recv_truncation ? (iters - 2) * send_len : + iters * send_len; + int trunc_byte_len = recv_len; + int send_win_len = send_len * iters; + int recv_win_len = recv_len; + uint64_t recv_flags = cq_events ? FI_COMPLETION : 0; + uint64_t send_flags = cq_events ? 
FI_COMPLETION | FI_TRANSMIT_COMPLETE : + FI_TRANSMIT_COMPLETE; + uint64_t recv_cnt; + uint64_t max_rnr_wait_us = 0; + size_t min_multi_recv = 0; + size_t opt_len = sizeof(size_t); + struct iovec riovec; + struct iovec siovec; + struct fi_msg msg = {}; + struct fi_context ctxt[1]; + struct fi_cq_tagged_entry cqe; + struct fi_cq_err_entry err_cqe = {}; + int ret; + int i; + void *send_desc[1]; + void *recv_desc[1]; + + ret = mr_create(send_win_len, FI_READ | FI_WRITE, 0xa, &send_key, + &send_window); + cr_assert(ret == FI_SUCCESS); + + send_desc[0] = fi_mr_desc(send_window.mr); + cr_assert(send_desc[0] != NULL); + + ret = mr_create(recv_win_len, FI_READ | FI_WRITE, 0x3, &recv_key, + &recv_window); + cr_assert(ret == FI_SUCCESS); + recv_desc[0] = fi_mr_desc(recv_window.mr); + cr_assert(recv_desc[0] != NULL); + + /* Update min_multi_recv to ensure append buffer does not unlink */ + ret = fi_setopt(&cxit_ep->fid, FI_OPT_ENDPOINT, FI_OPT_MIN_MULTI_RECV, + &min_multi_recv, opt_len); + cr_assert(ret == FI_SUCCESS); + + msg.iov_count = 1; + msg.addr = FI_ADDR_UNSPEC; + msg.context = &ctxt[0]; + msg.desc = recv_desc; + msg.msg_iov = &riovec; + riovec.iov_base = recv_window.mem; + riovec.iov_len = recv_win_len; + recv_flags |= FI_MULTI_RECV; + ret = fi_recvmsg(cxit_ep, &msg, recv_flags); + cr_assert_eq(ret, FI_SUCCESS, "fi_recv failed %d", ret); + + /* Set MAX RNR time to 0 so that message will not be retried */ + ret = fi_set_val(&cxit_ep->fid, + FI_OPT_CXI_SET_RNR_MAX_RETRY_TIME, + (void *) &max_rnr_wait_us); + cr_assert(ret == FI_SUCCESS, "Set max RNR = 0 failed %d", ret); + + /* Send messages */ + msg.addr = cxit_ep_fi_addr; + msg.iov_count = 1; + msg.context = NULL; + msg.desc = send_desc; + msg.msg_iov = &siovec; + + for (i = 0; i < iters; i++) { + siovec.iov_base = send_window.mem + send_len * i; + siovec.iov_len = send_len; + ret = fi_sendmsg(cxit_ep, &msg, send_flags); + cr_assert_eq(ret, FI_SUCCESS, "fi_sendmsg failed %d", ret); + } + + /* Await Send completions or counter updates */ + if (cq_events) { + int write_len = 0; + uint64_t flags = FI_MSG | FI_SEND; + + for (i = 0; i < iters; i++) { + ret = cxit_await_completion(cxit_tx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + + write_len += send_len; + if (cxip_ep->ep_obj->txc->trunc_ok) { + if (write_len > trunc_byte_len) + flags |= FI_CXI_TRUNC; + } + validate_tx_event(&cqe, flags, NULL); + } + + /* Validate that only non-truncated counts were updated */ + ret = fi_cntr_wait(cxit_send_cntr, + byte_counts ? trunc_byte_len : iters, 1000); + cr_assert(ret == FI_SUCCESS, "Bad count %ld", + fi_cntr_read(cxit_send_cntr)); + } else { + ret = fi_cntr_wait(cxit_send_cntr, + byte_counts ? 
trunc_byte_len : iters, 1000); + cr_assert(ret == FI_SUCCESS, "Bad count %ld", + fi_cntr_read(cxit_send_cntr)); + } + + /* Make sure only expected completions were generated */ + ret = fi_cq_read(cxit_tx_cq, &cqe, 1); + cr_assert(ret == -FI_EAGAIN); + + /* Await Receive completions or counter updates */ + if (cq_events) { + int received_len = 0; + int expected_len; + + for (i = 0; i < iters; i++) { + + ret = cxit_await_completion(cxit_rx_cq, &cqe); + cr_assert_eq(ret, 1, "fi_cq_read failed %d", ret); + + recv_flags = FI_MSG | FI_RECV; + + if (trunc_byte_len - received_len >= send_len) { + expected_len = send_len; + } else { + expected_len = trunc_byte_len - received_len; + recv_flags |= FI_CXI_TRUNC; + } + + validate_rx_event(&cqe, &ctxt[0], expected_len, + recv_flags, + recv_window.mem + + received_len, 0, 0); + received_len += expected_len; + } + + /* Validate that only bytes received were updated */ + ret = fi_cntr_wait(cxit_recv_cntr, byte_counts ? + trunc_byte_len : iters, 1000); + cr_assert(ret == FI_SUCCESS, "Bad return %d count %ld", + ret, fi_cntr_read(cxit_recv_cntr)); + + } else { + ret = fi_cntr_wait(cxit_recv_cntr, byte_counts ? + trunc_byte_len : iters, 1000); + cr_assert(ret == FI_SUCCESS, "Bad return %d count %ld", + ret, fi_cntr_read(cxit_recv_cntr)); + + /* Verify that the truncated messages updated the success + * event count. + */ + if (recv_truncation & !byte_counts) { + recv_cnt = fi_cntr_read(cxit_recv_cntr); + cr_assert(recv_cnt == iters, + "Truncation receive count %ld is wrong", + recv_cnt); + } + } + + /* Verify no completions have been written */ + ret = fi_cq_read(cxit_rx_cq, &cqe, 1); + cr_assert(ret == -FI_EAGAIN); + + /* Cancel append FI_MULIT_RECV buffer */ + ret = fi_cancel(&cxit_ep->fid, &ctxt[0]); + cr_assert_eq(ret, FI_SUCCESS, "fi_cancel failed %d", ret); + + /* Get cancelled entry */ + do { + ret = fi_cq_read(cxit_rx_cq, &cqe, 1); + } while (ret == -FI_EAGAIN); + cr_assert(ret == -FI_EAVAIL, "Did not get cancel status\n"); + + ret = fi_cq_readerr(cxit_rx_cq, &err_cqe, 0); + cr_assert_eq(ret, 1, "Did not get cancel error CQE\n"); + + cr_assert(err_cqe.op_context == &ctxt[0], + "Error CQE coontext mismatch\n"); + cr_assert(err_cqe.flags == (FI_MSG | FI_RECV | FI_MULTI_RECV), + "Error CQE flags mismatch\n"); + cr_assert(err_cqe.err == FI_ECANCELED, + "Error CQE error code mismatch\n"); + cr_assert(err_cqe.prov_errno == 0, + "Error CQE provider error code mismatch\n"); + + /* Make sure only expected completions were generated */ + ret = fi_cq_read(cxit_rx_cq, &cqe, 1); + cr_assert(ret == -FI_EAGAIN); + + for (i = 0; i < recv_win_len; i++) + cr_assert_eq(send_window.mem[i], recv_window.mem[i], + "data mismatch, element: (%d) %02x != %02x\n", i, + send_window.mem[i], recv_window.mem[i]); + + mr_destroy(&send_window); + mr_destroy(&recv_window); +} + +TestSuite(rnr_msg_append_hybrid_mr_desc, + .init = cxit_setup_rma_rnr_hybrid_mr_desc, + .fini = cxit_teardown_rma, .timeout = CXIT_DEFAULT_TIMEOUT); + +Test(rnr_msg_append_hybrid_mr_desc, no_trunc_count_events_non_comp) +{ + msg_hybrid_append_test_runner(false, false, false); +} + +Test(rnr_msg_append_hybrid_mr_desc, no_trunc_count_events_comp) +{ + msg_hybrid_append_test_runner(false, false, true); +} + +Test(rnr_msg_append_hybrid_mr_desc, trunc_count_events_non_comp) +{ + msg_hybrid_append_test_runner(true, false, false); +} + +Test(rnr_msg_append_hybrid_mr_desc, trunc_count_events_comp) +{ + struct cxip_ep *cxip_ep = container_of(&cxit_ep->fid, struct cxip_ep, + ep.fid); + + /* This test requires that 
experimental truncation a success + * is enabled. + */ + cxip_ep->ep_obj->rxc->trunc_ok = true; + + msg_hybrid_append_test_runner(true, false, true); +} + +TestSuite(rnr_msg_append_hybrid_mr_desc_byte_cntr, + .init = cxit_setup_rma_rnr_hybrid_mr_desc_byte_cntr, + .fini = cxit_teardown_rma, .timeout = CXIT_DEFAULT_TIMEOUT); + +Test(rnr_msg_append_hybrid_mr_desc_byte_cntr, no_trunc_count_bytes_non_comp) +{ + msg_hybrid_append_test_runner(false, true, false); +} + +Test(rnr_msg_append_hybrid_mr_desc_byte_cntr, no_trunc_count_bytes_comp) +{ + msg_hybrid_append_test_runner(false, true, true); +} + +Test(rnr_msg_append_hybrid_mr_desc_byte_cntr, trunc_count_bytes_non_comp) +{ + msg_hybrid_append_test_runner(true, true, false); +} + +Test(rnr_msg_append_hybrid_mr_desc_byte_cntr, trunc_count_bytes_comp) +{ + struct cxip_ep *cxip_ep = container_of(&cxit_ep->fid, struct cxip_ep, + ep.fid); + + /* This test requires that experimental truncation a success + * is enabled. + */ + cxip_ep->ep_obj->rxc->trunc_ok = true; + + msg_hybrid_append_test_runner(true, true, true); +} diff --git a/prov/cxi/test/multinode/multinode_frmwk.c b/prov/cxi/test/multinode/multinode_frmwk.c index 94a847d3ba4..93f6616414e 100644 --- a/prov/cxi/test/multinode/multinode_frmwk.c +++ b/prov/cxi/test/multinode/multinode_frmwk.c @@ -69,7 +69,7 @@ #include "multinode_frmwk.h" /* If not compiled with DEBUG=1, this is a no-op */ -#define TRACE(fmt, ...) CXIP_TRACE(CXIP_TRC_TEST_CODE, fmt, ##__VA_ARGS__) +#define TRACE(fmt, ...) CXIP_COLL_TRACE(CXIP_TRC_TEST_CODE, fmt, ##__VA_ARGS__) #define RETURN_ERROR(ret, txt) \ if (ret != FI_SUCCESS) { \ @@ -80,14 +80,20 @@ #define CLOSE_OBJ(obj) do {if (obj) fi_close(&obj->fid); } while (0) /* Taken from SLURM environment variables */ -int frmwk_numranks; /* PMI_SIZE */ -int frmwk_rank; /* PMI_RANK */ -int frmwk_nics_per_rank; /* PMI_NUM_HSNS (defaults to 1) */ -int frmwk_numnics; -const char *frmwk_unique; /* PMI_SHARED_SECRET */ -const char *frmwk_nodename; /* SLURMD_NODENAME */ -const char frmwk_node0[32]; /* SLURMD_NODELIST (first name) */ -union nicaddr *frmwk_nics; /* array of NIC addresses plus rank and hsn */ +int frmwk_nics_per_rank; /* PMI_NUM_HSNS (defaults to 1) */ +int frmwk_numranks; /* PMI_SIZE */ +const char *frmwk_unique; /* PMI_SHARED_SECRET */ +int frmwk_rank; /* PMI_RANK */ +int frmwk_hwcoll_addrs_per_job; /* FI_CXI_HWCOLL_ADDRS_PER_JOB */ +int frmwk_hwcoll_min_nodes; /* FI_CXI_HWCOLL_MIN_NODES */ +const char *frmwk_jobid; /* FI_CXI_COLL_JOB_ID */ +const char *frmwk_jobstep; /* FI_CXI_COLL_JOB_STEP_ID */ +const char *frmwk_mcast_token; /* FI_CXI_COLL_MCAST_TOKEN */ +const char *frmwk_fabric_mgr_url; /* FI_CXI_COLL_FABRIC_MGR_URL */ +const char *frmwk_nodename; /* SLURMD_NODENAME */ +const char frmwk_node0[32]; /* SLURMD_NODELIST (first name) */ +union nicaddr *frmwk_nics; /* array of NIC addresses */ +int frmwk_numnics; /* number of NIC addresses */ int _frmwk_init; @@ -270,7 +276,7 @@ int frmwk_log(const char *fmt, ...) */ #define FAIL(cond, msg, label) \ if (cond) { \ - printf("FAIL socket %s=%d\n", msg, cond); \ + fprintf(stderr, "FAIL socket %s=%d\n", msg, cond); \ goto label; \ } @@ -692,8 +698,8 @@ int frmwk_populate_av(fi_addr_t **fiaddrp, size_t *sizep) * frmwk_populate_av() has successfully completed. * * @param ret : error code - * @param fmt : printf format - * @param ... : printf parameters + * @param fmt : fprintf format + * @param ... : fprintf parameters * @return int value of ret */ int frmwk_errmsg(int ret, const char *fmt, ...) 
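Returning briefly to the hybrid MR-descriptor message tests earlier in this patch: when completions are suppressed, progress is tracked entirely through counters bound to the endpoint, which is what the *_non_comp cases wait on. A minimal sketch of that pattern; the counter attributes, the 1000 ms timeout, the helper signature, and the simplified teardown are all illustrative:

#include <rdma/fabric.h>
#include <rdma/fi_domain.h>
#include <rdma/fi_endpoint.h>

/* Sketch: open a send counter, bind it for FI_SEND, and wait on it instead
 * of reading CQ completions. Binding must happen before fi_enable(). */
static int wait_sends_by_counter(struct fid_domain *dom, struct fid_ep *ep,
                                 uint64_t expected)
{
    struct fi_cntr_attr attr = { .events = FI_CNTR_EVENTS_COMP };
    struct fid_cntr *cntr;
    int ret;

    ret = fi_cntr_open(dom, &attr, &cntr, NULL);
    if (ret)
        return ret;

    ret = fi_ep_bind(ep, &cntr->fid, FI_SEND);
    if (ret)
        goto out;

    /* ... issue 'expected' sends without FI_COMPLETION ... */

    ret = fi_cntr_wait(cntr, expected, 1000);
out:
    fi_close(&cntr->fid);  /* teardown simplified for brevity */
    return ret;
}

The *_byte_cntr suites use the same wait calls, but their setup configures the counters to accumulate bytes, which is why those thresholds become trunc_byte_len rather than iters.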
@@ -836,26 +842,55 @@ void frmwk_init(bool quiet) char *s, *d; int ret = -1; - /* Values are provided by the WLM */ + /* Values are provided by the WLM + * Expected format "prefix[n1-n2,m1-m2,...],..." + * newton9-node-01,newton9-node-02,... + * newton9-node-[01-04,11-14],newton9-extra-[101-104]... + * We only want the first, e.g. newton9-node-01 + */ s = getenv("SLURM_NODELIST"); d = (char *)frmwk_node0; - while (s && *s && *s != '-' && *s != ',') { - if (*s == '[') + while (s && *s && *s != ',') { + if (*s == '[') { s++; - else - *d++ = *s++; + while (*s != '-' && *s != ']') + *d++ = *s++; + break; + } + *d++ = *s++; } *d = 0; frmwk_nodename = getenv("SLURMD_NODENAME"); frmwk_numranks = getenv_int("PMI_SIZE"); frmwk_rank = getenv_int("PMI_RANK"); - frmwk_unique = getenv("PMI_SHARED_SECRET"); - if (frmwk_numranks < 1 || frmwk_rank < 0 || !frmwk_unique) { + frmwk_jobid = getenv("FI_CXI_COLL_JOB_ID"); + frmwk_jobstep = getenv("FI_CXI_COLL_JOB_STEP_ID"); + frmwk_mcast_token = getenv("FI_CXI_COLL_MCAST_TOKEN"); + frmwk_fabric_mgr_url = getenv("FI_CXI_COLL_FABRIC_MGR_URL"); + frmwk_hwcoll_min_nodes = getenv_int("FI_CXI_HWCOLL_MIN_NODES"); + frmwk_hwcoll_addrs_per_job = getenv_int( + "FI_CXI_HWCOLL_ADDRS_PER_JOB"); + if (!frmwk_nodename || + frmwk_numranks < 1 || + frmwk_rank < 0 || + !frmwk_jobid || + !frmwk_jobstep || + frmwk_hwcoll_min_nodes < 1 || + frmwk_hwcoll_addrs_per_job < 1) { if (quiet) goto fail; - fprintf(stderr, "invalid PMI_SIZE=%d\n", frmwk_numranks); - fprintf(stderr, "invalid PMI_RANK=%d\n", frmwk_rank); - fprintf(stderr, "invalid PMI_SHARED_SECRET=%s\n", frmwk_unique); + fprintf(stderr, "frmwk_nodename=%s\n", frmwk_nodename); + fprintf(stderr, "frmwk_numranks=%d\n", frmwk_numranks); + fprintf(stderr, "frmwk_rank=%d\n", frmwk_rank); + fprintf(stderr, "frmwk_jobid=%s\n", frmwk_jobid); + fprintf(stderr, "frmwk_jobstep=%s\n", frmwk_jobstep); + fprintf(stderr, "frmwk_mcast_token=%s\n", frmwk_mcast_token); + fprintf(stderr, "frmwk_fabric_mgr_url=%s\n", + frmwk_fabric_mgr_url); + fprintf(stderr, "frmwk_hwcoll_min_nodes=%d\n", + frmwk_hwcoll_min_nodes); + fprintf(stderr, "frmwk_hwcoll_addrs_per_job=%d\n", + frmwk_hwcoll_addrs_per_job); fprintf(stderr, "Must be run under compatible WLM\n"); goto fail; } @@ -865,14 +900,6 @@ void frmwk_init(bool quiet) if (frmwk_nics_per_rank < 1) frmwk_nics_per_rank = 1; - /* Re-export these as libfabric equivalents */ - setenv("FI_CXI_COLL_JOB_ID", frmwk_unique, 1); - setenv("FI_CXI_COLL_JOB_STEP_ID", "0", 1); - setenv("FI_CXI_COLL_MCAST_TOKEN", "aaaaaa", 1); - setenv("FI_CXI_HWCOLL_MIN_NODES", "4", 1); - setenv("FI_CXI_HWCOLL_ADDRS_PER_JOB", "4", 1); - setenv("FI_CXI_COLL_FABRIC_MGR_URL", "what?", 1); - ret = 0; fail: _frmwk_init = (!ret); @@ -882,7 +909,6 @@ void frmwk_term(void) { free(frmwk_nics); frmwk_nics = NULL; - frmwk_unique = NULL; frmwk_nics_per_rank = 0; frmwk_numranks = 0; frmwk_rank = 0; diff --git a/prov/cxi/test/multinode/multinode_frmwk.h b/prov/cxi/test/multinode/multinode_frmwk.h index f20c89dd9dd..cc32bef3607 100644 --- a/prov/cxi/test/multinode/multinode_frmwk.h +++ b/prov/cxi/test/multinode/multinode_frmwk.h @@ -19,13 +19,20 @@ union nicaddr { #define NICSIZE (sizeof(union nicaddr)) /* These are initialized by frmwk_init() */ -extern int frmwk_nics_per_rank; -extern int frmwk_numranks; -extern int frmwk_numnics; -extern int frmwk_rank; - -/* This is initialized by frmwk_populate_av() */ -extern union nicaddr *frmwk_nics; +extern int frmwk_nics_per_rank; /* PMI_NUM_HSNS (defaults to 1) */ +extern int frmwk_numranks; /* PMI_SIZE 
*/ +extern const char *frmwk_unique; /* PMI_SHARED_SECRET */ +extern int frmwk_rank; /* PMI_RANK */ +extern int frmwk_hwcoll_addrs_per_job; /* FI_CXI_HWCOLL_ADDRS_PER_JOB */ +extern int frmwk_hwcoll_min_nodes; /* FI_CXI_HWCOLL_MIN_NODES */ +extern const char *frmwk_jobid; /* FI_CXI_COLL_JOB_ID */ +extern const char *frmwk_jobstep; /* FI_CXI_COLL_JOB_STEP_ID */ +extern const char *frmwk_mcast_token; /* FI_CXI_COLL_MCAST_TOKEN */ +extern const char *frmwk_fabric_mgr_url;/* FI_CXI_COLL_FABRIC_MGR_URL */ +extern const char *frmwk_nodename; /* SLURMD_NODENAME */ +extern const char frmwk_node0[32]; /* SLURMD_NODELIST (first name) */ +extern union nicaddr *frmwk_nics; /* array of NIC addresses */ +extern int frmwk_numnics; /* number of NIC addresses */ extern char *cxit_node; extern char *cxit_service; diff --git a/prov/cxi/test/multinode/test_coll.c b/prov/cxi/test/multinode/test_coll.c index 974f8c54bb6..3dc80b86278 100644 --- a/prov/cxi/test/multinode/test_coll.c +++ b/prov/cxi/test/multinode/test_coll.c @@ -30,14 +30,15 @@ #include "multinode_frmwk.h" /* If not compiled with DEBUG=1, this is a no-op */ -#define TRACE(fmt, ...) CXIP_TRACE(CXIP_TRC_TEST_CODE, fmt, ##__VA_ARGS__) +#define TRACE(fmt, ...) CXIP_COLL_TRACE(CXIP_TRC_TEST_CODE, fmt, ##__VA_ARGS__) /* convert delays to nsecs */ #define nUSEC(n) (n * 1000L) #define nMSEC(n) (n * 1000000L) #define nSEC(n) (n * 1000000000L) -int verbose = 0; +int verbose; +bool create_multicast; /* Signaling NaN generation, for testing. * Linux feature requires GNU_SOURCE. @@ -143,7 +144,6 @@ static void *_poll_cqs(void) } if (size != -FI_EAGAIN) TRACE("tx ERROR seen = %ld\n", size); - TRACE("%s return NULL\n", __func__); return NULL; } @@ -199,13 +199,15 @@ void avset_ary_destroy(struct avset_ary *setary) avset_ary_init(setary); } -/* create a single avset using fiaddrs, size, and append it to the setary */ +/* create a single avset using fiaddrs, size, and append it to the setary. + * mcast_addr and root_idx apply only to UNICAST model. + */ int avset_ary_append(fi_addr_t *fiaddrs, size_t size, int mcast_addr, int root_idx, struct avset_ary *setary) { struct cxip_comm_key comm_key = { - .keytype = (cxip_env.coll_fabric_mgr_url) ? + .keytype = (cxip_env.coll_fabric_mgr_url && create_multicast) ? 
COMM_KEY_NONE : COMM_KEY_UNICAST, .ucast.mcast_addr = mcast_addr, .ucast.hwroot_idx = root_idx @@ -223,8 +225,8 @@ int avset_ary_append(fi_addr_t *fiaddrs, size_t size, int i, ret; // expand accumulator list as necessary - TRACE("%s cnt=%d siz=%d\n", __func__, setary->avset_cnt, - setary->avset_siz); + TRACE("%s cnt=%d siz=%d multicast=%d\n", __func__, setary->avset_cnt, + setary->avset_siz, create_multicast); if (setary->avset_siz <= setary->avset_cnt) { void *ptr; int siz; @@ -421,8 +423,7 @@ int coll_multi_join(struct avset_ary *setary, struct dlist_entry *joinlist) dlist_init(&jctx->entry); TRACE("join %d of %d initiating\n", i, total); ret = fi_join_collective(cxit_ep, FI_ADDR_NOTAVAIL, - setary->avset[i], 0L, - &jctx->mc, jctx); + setary->avset[i], 0L, &jctx->mc, jctx); /* node is not participating in this join */ if (ret == -FI_ECONNREFUSED) { free(jctx); @@ -457,7 +458,7 @@ void coll_join_cleanup(struct avset_ary *setary, struct dlist_entry *joinlist) avset_ary_destroy(setary); } -struct join_item *coll_join_item(struct dlist_entry *joinlist, int index) +struct join_item *_get_join_jctx(struct dlist_entry *joinlist, int index) { struct join_item *jctx; @@ -468,8 +469,18 @@ struct join_item *coll_join_item(struct dlist_entry *joinlist, int index) return NULL; } +bool _is_hwroot(struct join_item *jctx) +{ + struct cxip_coll_mc *mc_obj; + + mc_obj = (struct cxip_coll_mc *)jctx->mc; + return (mc_obj->hwroot_idx == mc_obj->mynode_idx); +} -/* Utility function to create a single join with no errors */ +/* Utility function to create a single join with no errors. + * mcast_addr and root_idx apply only to UNICAST model. + * Allows join error conditions to be tested. + */ struct join_item *coll_single_join(fi_addr_t *fiaddrs, size_t size, int mcast_addr, int root_idx, int exp_retval, int exp_prov_errno, @@ -555,45 +566,57 @@ int _test_multi_barrier(struct avset_ary *setary, struct dlist_entry *joinlist, } #endif +int _simple_join(fi_addr_t *fiaddrs, size_t size, + struct avset_ary *setary, + struct dlist_entry *joinlist) +{ + int ret; + + avset_ary_init(setary); + ret = avset_ary_append(fiaddrs, size, 0, 1, setary); + if (ret) + return ret; + + dlist_init(joinlist); + ret = coll_multi_join(setary, joinlist); + if (ret) + return ret; + + return 0; +} + +uint64_t _simple_get_mc(struct dlist_entry *joinlist) +{ + struct join_item *jctx; + + jctx = dlist_first_entry_or_null(joinlist, struct join_item, entry); + return (uint64_t)jctx->mc; +} + +void _simple_join_release(struct avset_ary *setary, + struct dlist_entry *joinlist) +{ + coll_multi_release(joinlist); + avset_ary_destroy(setary); +} + /** - * @brief Simple test of join, returns a count of errors. + * @brief Simple test of join/delete returns a count of errors. * * This creates a single avset_ary from the supplied addresses, with hwroot * of zero, and performs a single join, tests errors, and cleans up. Used to * probe the basic error conditions. 
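For orientation, the join helpers above are thin wrappers over the standard libfabric collective calls; stripped of the test bookkeeping the flow looks roughly like this. The mc-to-fi_addr cast mirrors what these tests do, and the wait for join completion (handled by coll_multi_join()/_poll_cqs() in the tests) is only indicated by a comment:

#include <rdma/fabric.h>
#include <rdma/fi_eq.h>
#include <rdma/fi_collective.h>

/* Sketch: join a collective group over an AV set, then run one barrier. */
static int join_and_barrier(struct fid_ep *ep, struct fid_av_set *set,
                            struct fid_cq *txcq)
{
    struct fi_cq_tagged_entry cqe;
    struct fid_mc *mc = NULL;
    uint64_t context;
    ssize_t ret;

    ret = fi_join_collective(ep, FI_ADDR_NOTAVAIL, set, 0, &mc, NULL);
    if (ret)
        return (int)ret;

    /* ... wait here for the join-complete event before using mc ... */

    do {
        ret = fi_barrier(ep, (fi_addr_t)mc, &context);
    } while (ret == -FI_EAGAIN);
    if (ret)
        return (int)ret;

    do {
        ret = fi_cq_read(txcq, &cqe, 1);
    } while (ret == -FI_EAGAIN);

    return ret == 1 ? 0 : (int)ret;
}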
*/ -int _test_join(fi_addr_t *fiaddrs, size_t size, int exp_ret, - int exp_prov_errno) +int _test_join(fi_addr_t *fiaddrs, size_t size) { struct avset_ary setary; struct dlist_entry joinlist; - struct join_item *jctx; - int ret, errcnt; - - errcnt = 0; - avset_ary_init(&setary); - ret = avset_ary_append(fiaddrs, size, 0, 1, &setary); - errcnt += !!ret; - - dlist_init(&joinlist); - ret = coll_multi_join(&setary, &joinlist); - errcnt += !!ret; - - dlist_foreach_container(&joinlist, struct join_item, jctx, entry) { - if (jctx->retval != exp_ret || - jctx->prov_errno != exp_prov_errno) { - TRACE("exp_ret=%d retval=%d\n", - exp_ret, jctx->retval); - TRACE("exp_prov_errno=%d prov_errno=%d\n", - exp_prov_errno, jctx->prov_errno); - errcnt++; - } - } + int ret; - coll_multi_release(&joinlist); - avset_ary_destroy(&setary); + ret = _simple_join(fiaddrs, size, &setary, &joinlist); + _simple_join_release(&setary, &joinlist); - return errcnt; + return ret; } /* Simple test of barrier, returns a count of errors. */ @@ -601,38 +624,29 @@ int _test_barrier(fi_addr_t *fiaddrs, size_t size, int count) { struct avset_ary setary; struct dlist_entry joinlist; - struct join_item *jctx; uint64_t context; - int i, ret, total, errcnt; + uint64_t mc; + int i, ret, total; - errcnt = 0; - total = 0; - avset_ary_init(&setary); - ret = avset_ary_append(fiaddrs, size, 0, 1, &setary); - errcnt += !!ret; + TRACE("%s entry, create_mcast=%d\n", __func__, create_multicast); + ret = _simple_join(fiaddrs, size, &setary, &joinlist); if (ret) { - TRACE("BARRIER avset not created\n"); + TRACE("BARRIER JOIN failed\n"); goto quit; } + TRACE("BARRIER JOIN COMPLETE\n"); - dlist_init(&joinlist); - ret = coll_multi_join(&setary, &joinlist); - errcnt += !!ret; - if (ret) { - TRACE("BARRIER JOIN not initiated\n"); + mc = _simple_get_mc(&joinlist); + if (!mc) { + TRACE("BARRIER MC invalid\n"); goto quit; } - TRACE("BARRIER JOIN COMPLETE\n"); - - jctx = dlist_first_entry_or_null(&joinlist, struct join_item, entry); - TRACE("Barrier join complete, jctx = %p\n", jctx); for (i = 0; i < count; i++) { do { usleep(rand() % 100); - ret = fi_barrier(cxit_ep, (fi_addr_t )jctx->mc, - &context); - TRACE("barrier = %d\n", ret); + ret = fi_barrier(cxit_ep, mc, &context); } while (ret == -FI_EAGAIN); + TRACE("barrier = %d\n", ret); if (ret == FI_SUCCESS) { TRACE("spin 1...\n"); _wait_cqs(&context); @@ -640,15 +654,16 @@ int _test_barrier(fi_addr_t *fiaddrs, size_t size, int count) total++; } else { TRACE("BARRIER FAILED #%d, ret=%d\n", i, ret); - errcnt++; + goto quit; } } + ret = 0; quit: - frmwk_log0("Barrier errcnt=%d total=%d\n", errcnt, total); - coll_multi_release(&joinlist); - avset_ary_destroy(&setary); - return errcnt; + TRACE("BARRIER exit\n"); + frmwk_log0("Barrier total=%d\n", total); + _simple_join_release(&setary, &joinlist); + return ret; } /* Simple test of broadcast, returns a count of errors. 
*/ @@ -656,16 +671,23 @@ int _test_broadcast(fi_addr_t *fiaddrs, size_t size, int rootidx) { struct avset_ary setary; struct dlist_entry joinlist; - struct join_item *jctx; uint64_t data[4], rslt[4]; uint64_t context; - int i, ret, errcnt; + uint64_t mc; + int i, ret; - errcnt = 0; - jctx = coll_single_join(fiaddrs, size, 0, rootidx, 0, 0, - &setary, &joinlist, "BROADCAST"); - if (!jctx) { - TRACE("BROADCAST JOIN returned NULL\n"); + TRACE("%s entry, create_mcast=%d\n", __func__, create_multicast); + + ret = _simple_join(fiaddrs, size, &setary, &joinlist); + if (ret) { + TRACE("join failed\n"); + goto quit; + } + + mc = _simple_get_mc(&joinlist); + if (!mc) { + TRACE("BARRIER MC invalid\n"); + ret = -1; goto quit; } @@ -679,103 +701,93 @@ int _test_broadcast(fi_addr_t *fiaddrs, size_t size, int rootidx) do { _poll_cqs(); ret = fi_broadcast(cxit_ep, rslt, 4, NULL, - (fi_addr_t )jctx->mc, fiaddrs[rootidx], + mc, fiaddrs[rootidx], FI_UINT64, 0L, &context); } while (ret == -FI_EAGAIN); - errcnt += !!ret; - if (ret == FI_SUCCESS) { - TRACE("spin 1...\n"); - _wait_cqs(&context); - TRACE("BROADCAST COMPLETE\n"); - if (memcmp(rslt, data, sizeof(rslt))) { - for (i = 0; i < 4; i++) - TRACE("[%d] %016lx exp %016lx\n", - i, rslt[i], data[i]); - errcnt++; - } - } else { - TRACE("ret = %d\n", ret); - TRACE("BROADCAST FAILED\n"); - errcnt++; + if (ret) + goto quit; + + TRACE("spin 1...\n"); + _wait_cqs(&context); + TRACE("BROADCAST COMPLETE\n"); + if (memcmp(rslt, data, sizeof(rslt))) { + for (i = 0; i < 4; i++) + TRACE("[%d] %016lx exp %016lx\n", + i, rslt[i], data[i]); + ret = -1; } quit: - coll_multi_release(&joinlist); - avset_ary_destroy(&setary); - return errcnt; + TRACE("BROADCAST exit\n"); + _simple_join_release(&setary, &joinlist); + return ret; } +const struct timespec usec1 = {.tv_sec = 0, .tv_nsec = 10000}; + /* simple test of allreduce, returns a count of errors. 
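A quick reference for the deterministic data check in _test_allreduce() just below: each rank r contributes data[4*r + v] = 4*r + v, so with R ranks the FI_SUM result for element v is the sum over r of (4*r + v) = 2*R*(R-1) + R*v; for a four-rank run that is {24, 28, 32, 36}. This replaces the earlier rand()-based payloads, presumably so a mismatch trace is directly readable.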
*/ int _test_allreduce(fi_addr_t *fiaddrs, size_t size) { struct avset_ary setary; struct dlist_entry joinlist; - struct join_item *jctx; int64_t *data, *rslt, *comp; uint64_t context; - int i, j, ret, errcnt; + uint64_t mc; + int r, v, ret; + + TRACE("%s entry, create_mcast=%d\n", __func__, create_multicast); - errcnt = 0; - avset_ary_init(&setary); - ret = avset_ary_append(fiaddrs, size, 0, 1, &setary); - errcnt += !!ret; + ret = _simple_join(fiaddrs, size, &setary, &joinlist); if (ret) { - TRACE("ALLREDUCE avset not created\n"); + TRACE("join failed\n"); goto quit; } - dlist_init(&joinlist); - ret = coll_multi_join(&setary, &joinlist); - errcnt += !!ret; - if (ret) { - TRACE("ALLREDUCE JOIN not initiated\n"); + mc = _simple_get_mc(&joinlist); + if (!mc) { + TRACE("ALLREDUCE MC invalid\n"); + ret = -1; goto quit; } - - jctx = dlist_first_entry_or_null(&joinlist, struct join_item, entry); - TRACE("jctx = %p\n", jctx); - TRACE("mc = %p\n", jctx->mc); + if (_is_hwroot(_get_join_jctx(&joinlist, 0))) + nanosleep(&usec1, NULL); data = calloc(frmwk_numranks*4, sizeof(int64_t)); comp = calloc(4, sizeof(int64_t)); rslt = calloc(4, sizeof(int64_t)); - for (i = 0; i < frmwk_numranks; i++) { - for (j = 0; j < 4; j++) { - data[4*i + j] = ((int64_t)(rand() - RAND_MAX/2) << 32); - data[4*i + j] |= rand(); - comp[j] += data[4*i + j]; - } - } + for (v = 0; v < 4; v++) + for (r = 0; r < frmwk_numranks; r++) + data[4*r + v] = 4*r + v; + for (v = 0; v < 4; v++) + for (r = 0; r < frmwk_numranks; r++) + comp[v] += data[4*r + v]; do { _poll_cqs(); ret = fi_allreduce(cxit_ep, &data[frmwk_rank*4], 4, NULL, - rslt, NULL, (fi_addr_t )jctx->mc, FI_INT64, + rslt, NULL, mc, FI_INT64, FI_SUM, 0L, &context); } while (ret == -FI_EAGAIN); - errcnt += !!ret; - if (ret == FI_SUCCESS) { - TRACE("spin 1...\n"); - _wait_cqs(&context); - TRACE("ALLREDUCE COMPLETE\n"); - if (memcmp(rslt, comp, 4*sizeof(int64_t))) { - for (i = 0; i < 4; i++) - TRACE("[%d] %016lx exp %016lx\n", - i, rslt[i], comp[i]); - errcnt++; + if (ret) + goto quit; + + TRACE("spin...\n"); + _wait_cqs(&context); + TRACE("ALLREDUCE COMPLETE\n"); + for (v = 0; v < 4; v++) { + if (rslt[v] != comp[v]) { + TRACE("[%d] %016lx exp %016lx\n", + v, rslt[v], comp[v]); + ret = 1; } - } else { - TRACE("ret = %d\n", ret); - TRACE("ALLREDUCE FAILED\n"); - errcnt++; } free(rslt); free(comp); free(data); quit: - coll_multi_release(&joinlist); - avset_ary_destroy(&setary); - return errcnt; + TRACE("ALLREDUCE exit\n"); + _simple_join_release(&setary, &joinlist); + return ret; } /** @@ -813,7 +825,6 @@ static uint64_t testmask = 0L; int main(int argc, char **argv) { - bool trace_enabled = true; fi_addr_t *fiaddrs = NULL; fi_addr_t myaddr; struct cxip_addr mycaddr; @@ -824,12 +835,12 @@ int main(int argc, char **argv) int tstnum = 0; int ret = 0; int N = 0; + int S = 1; bool help = false; + bool trace_muted = true; struct join_item *jctx; struct avset_ary setary; struct dlist_entry joinlist; - - const char *testname; char opt; int i, j; @@ -838,8 +849,8 @@ int main(int argc, char **argv) testmask = -1L; testname = NULL; - TRACE("enter main\n"); - while ((opt = getopt(argc, argv, "hvVt:N:")) != -1) { + setvbuf(stdout, NULL, _IONBF, 0); + while ((opt = getopt(argc, argv, "hvVS:Mt:N:")) != -1) { char *str, *s, *p; switch (opt) { @@ -859,19 +870,30 @@ int main(int argc, char **argv) p = s; while (*p && *p != '-') p++; - i = atoi(s); - j = (*p) ? atoi(++p) : i; + if (*p) + *p++ = 0; + i = (*s) ? atoi(s) : 0; + j = (*p) ? 
atoi(p) : i; if (j > 63) j = 63; - while (i <= j) + while (i <= j) { testmask |= (1L << i++); + } } break; + case 'M': + create_multicast = true; + break; case 'N': N = atoi(optarg); break; + case 'S': + S = atoi(optarg); + printf("S = %d\n", S); + break; case 'V': - trace_enabled = true; + /* tracing is enabled below */ + trace_muted = false; break; case 'v': verbose = true; @@ -894,13 +916,16 @@ int main(int argc, char **argv) do { if (help) { frmwk_log0( - "Usage: test_coll [-hvV] -Ncount[-t testno[-testno][,...]]\n"); - frmwk_log0("\nTests:\n"); + "Usage: t est_coll [-hvV] -M -Ncount [-t testno[-testno][,...]]\n" + " -h generate help and quit.\n" + " -M use multicast model (default unicast model)\n" + " -N iterations (default 1)\n" + " -t test list (default all)\n"); break; } - /* Test requires a minimum of four nodes */ - if (frmwk_check_env(4)) + /* Test requires a minimum of two nodes */ + if (frmwk_check_env(2)) return -1; /* Initialize libfabric on this node */ @@ -909,8 +934,8 @@ int main(int argc, char **argv) if (frmwk_errmsg(ret, "frmwk_init_libfabric()\n")) goto done; - cxip_trace_enable(trace_enabled); - TRACE("==== tracing enabled offset %d\n", frmwk_rank); + /* mute or unmute tracing */ + cxip_coll_trace_muted = trace_muted; /* always start with FI_UNIVERSE */ ret = frmwk_populate_av(&fiaddrs, &size); @@ -993,8 +1018,10 @@ int main(int argc, char **argv) PREAMBLE(0, tstnum, "test join (simple)"); // Test single join over one array list TRACE("======= %s\n", testname); + TRACE("[%d] starting join\n", frmwk_rank); jctx = coll_single_join(fiaddrs, size, 0, 0, 0, 0, &setary, &joinlist, "simple"); + TRACE("[%d] jctx = %p\n", frmwk_rank, jctx); coll_join_cleanup(&setary, &joinlist); errcnt += !!!jctx; tstcnt += 1; @@ -1084,6 +1111,7 @@ int main(int argc, char **argv) } while (0); tstnum++; +#if 0 do { PREAMBLE(0, tstnum, "force -FI_EFAULT on PTE alloc"); // cause zbcoll root (rank 0) to simulate PTE alloc failure @@ -1096,9 +1124,91 @@ int main(int argc, char **argv) frmwk_barrier(); } while (0); tstnum++; +#endif + + do { + struct cxip_coll_mc *mc_obj; + struct cxip_coll_reduction *reduction; + struct cxip_coll_data coll_data; + int ret; + + PREAMBLE(0, tstnum, "test single packet send"); + // Create multicast and send packet through HWRoot + TRACE("======= %s\n", testname); + TRACE("starting join\n"); + + /* root is index 0, others are leaves */ + jctx = coll_single_join(fiaddrs, size, 0, 0, 0, 0, + &setary, &joinlist, "simple"); + TRACE("completed join jctx = %p\n", jctx); + mc_obj = (struct cxip_coll_mc *)jctx->mc; + mc_obj->arm_disable = true; + mc_obj->retry_disable = true; + TRACE("S=%d rank=%d hwroot=%d\n", S, frmwk_rank, + mc_obj->hwroot_idx); + reduction = &mc_obj->reduction[0]; + coll_data.red_cnt = 1; + coll_data.intval.ival[0] = 1234; + coll_data.intval.ival[1] = frmwk_rank; + memset(&reduction->accum, 0, sizeof(reduction->accum)); + if (frmwk_rank == S) { + TRACE("test starting send on %d\n", S); + do { + ret = cxip_coll_send_red_pkt( + reduction, &coll_data, + false, false); + TRACE("send result = %d\n", ret); + } while (ret == -FI_EAGAIN); + TRACE("completed send = %d\n", ret); + } + while (1) + _poll_cqs(); + + coll_join_cleanup(&setary, &joinlist); + errcnt += !!!jctx; + tstcnt += 1; + frmwk_log0("%4s\n", STDMSG(ret)); + frmwk_barrier(); + } while (0); + tstnum++; + +/*###############################################################*/ + do { + uint64_t context; + + PREAMBLE(0, tstnum, "test barrier (simple)"); + // Test single join over one array list + 
TRACE("======= %s\n", testname); + TRACE("[%d] starting join\n", frmwk_rank); + jctx = coll_single_join(fiaddrs, size, 0, 0, 0, 0, + &setary, &joinlist, "simple"); + TRACE("completed join jctx = %p\n", jctx); + TRACE("start barrier\n"); + do { + ret = fi_barrier(cxit_ep, (fi_addr_t )jctx->mc, + &context); + TRACE("barrier = %d\n", ret); + } while (ret == -FI_EAGAIN); + + if (ret == FI_SUCCESS) { + TRACE("spin 1...\n"); + _wait_cqs(&context); + TRACE("BARRIER COMPLETE #%d\n", i); + } else { + TRACE("BARRIER FAILED #%d, ret=%d\n", i, ret); + errcnt++; + } + coll_join_cleanup(&setary, &joinlist); + errcnt += !!!jctx; + tstcnt += 1; + frmwk_log0("%4s\n", STDMSG(ret)); + frmwk_barrier(); + } while (0); + tstnum++; do { PREAMBLE(0, tstnum, "perform barrier"); + TRACE("Starting barrier\n"); ret = _test_barrier(fiaddrs, size, 1); errcnt += !!ret; tstcnt += 1; @@ -1153,14 +1263,14 @@ int main(int argc, char **argv) ret = coll_multi_join(&setary, &joinlist); TRACE("join = %d\n", ret); - jctx = coll_join_item(&joinlist, 0); + jctx = _get_join_jctx(&joinlist, 0); TRACE("item 0 mc=%p retval=%d prov_errno=%d\n", jctx->mc, jctx->retval, jctx->prov_errno); if (jctx->retval || jctx->prov_errno) { TRACE("unexpected result on coll 0\n"); errcnt++; } - jctx = coll_join_item(&joinlist, 1); + jctx = _get_join_jctx(&joinlist, 1); TRACE("item 1 mc=%p retval=%d prov_errno=%d\n", jctx->mc, jctx->retval, jctx->prov_errno); if (jctx->retval != -FI_EAVAIL || @@ -1189,14 +1299,14 @@ int main(int argc, char **argv) ret = coll_multi_join(&setary, &joinlist); TRACE("join = %d\n", ret); - jctx = coll_join_item(&joinlist, 0); + jctx = _get_join_jctx(&joinlist, 0); TRACE("item 0 mc=%p retval=%d prov_errno=%d\n", jctx->mc, jctx->retval, jctx->prov_errno); if (jctx->retval || jctx->prov_errno) { TRACE("unexpected result on coll 0\n"); errcnt++; } - jctx = coll_join_item(&joinlist, 1); + jctx = _get_join_jctx(&joinlist, 1); TRACE("item 1 mc=%p retval=%d prov_errno=%d\n", jctx->mc, jctx->retval, jctx->prov_errno); if (jctx->retval != -FI_EAVAIL || @@ -1225,14 +1335,14 @@ int main(int argc, char **argv) ret = coll_multi_join(&setary, &joinlist); TRACE("join = %d\n", ret); - jctx = coll_join_item(&joinlist, 0); + jctx = _get_join_jctx(&joinlist, 0); TRACE("item 0 mc=%p retval=%d prov_errno=%d\n", jctx->mc, jctx->retval, jctx->prov_errno); if (jctx->retval || jctx->prov_errno) { TRACE("unexpected result on coll 0\n"); errcnt++; } - jctx = coll_join_item(&joinlist, 1); + jctx = _get_join_jctx(&joinlist, 1); TRACE("item 1 mc=%p retval=%d prov_errno=%d\n", jctx->mc, jctx->retval, jctx->prov_errno); if (jctx->retval != -FI_EAVAIL || @@ -1267,7 +1377,7 @@ int main(int argc, char **argv) int exp_errno = (i < size) ? 0 : CXIP_PROV_ERRNO_HWROOT_INUSE; int good; - jctx = coll_join_item(&joinlist, i); + jctx = _get_join_jctx(&joinlist, i); if (!jctx) { TRACE("no join item\n"); continue; @@ -1340,7 +1450,7 @@ int main(int argc, char **argv) int tree2 = (tree + frmwk_rank)%size; usleep(rand() % 100); - jctx = coll_join_item(&joinlist, tree2); + jctx = _get_join_jctx(&joinlist, tree2); ret = fi_broadcast(cxit_ep, datary[tree2], 4, NULL, (fi_addr_t )jctx->mc, fiaddrs[root], FI_UINT64, diff --git a/prov/cxi/test/multinode/test_frmwk.c b/prov/cxi/test/multinode/test_frmwk.c index 6f8fdd7a850..6be3722ca3f 100644 --- a/prov/cxi/test/multinode/test_frmwk.c +++ b/prov/cxi/test/multinode/test_frmwk.c @@ -29,7 +29,7 @@ #include #include "multinode_frmwk.h" -#define TRACE(fmt, ...) 
CXIP_TRACE(CXIP_TRC_TEST_CODE, fmt, ##__VA_ARGS__) +#define TRACE(fmt, ...) CXIP_COLL_TRACE(CXIP_TRC_TEST_CODE, fmt, ##__VA_ARGS__) int main(int argc, char **argv) { @@ -64,12 +64,12 @@ int main(int argc, char **argv) fiaddr[i]); } - cxip_trace_enable(true); + cxip_coll_trace_muted = false; TRACE("Trace message test %d\n", 0); TRACE("Trace message test %d\n", 1); - cxip_trace_enable(false); + cxip_coll_trace_muted = true; TRACE("This message should not appear\n"); - cxip_trace_enable(true); + cxip_coll_trace_muted = false; TRACE("This message should appear\n"); frmwk_free_libfabric(); diff --git a/prov/cxi/test/multinode/test_zbcoll.c b/prov/cxi/test/multinode/test_zbcoll.c index d19ff3aabd8..425c1feba27 100644 --- a/prov/cxi/test/multinode/test_zbcoll.c +++ b/prov/cxi/test/multinode/test_zbcoll.c @@ -28,7 +28,7 @@ #include #include "multinode_frmwk.h" -#define TRACE(fmt, ...) CXIP_TRACE(CXIP_TRC_TEST_CODE, fmt, ##__VA_ARGS__) +#define TRACE(fmt, ...) CXIP_COLL_TRACE(CXIP_TRC_TEST_CODE, fmt, ##__VA_ARGS__) /* convert delays to nsecs */ #define nUSEC(n) (n * 1000L) @@ -89,7 +89,7 @@ static void _idle_wait(struct cxip_ep_obj *ep_obj, int msec) continue; TRACE("ns=%ld dsc=%d err=%d ack=%d rcv=%d\n", nsecs, dsc, err, ack, rcv); - cxip_trace_flush(); + cxip_coll_trace_flush(); dsc0 = dsc; err0 = err; ack0 = ack; @@ -129,7 +129,7 @@ static int _send_wait(struct cxip_zbcoll_obj *zb, int sndcnt, int rcvcnt) TRACE("STATE FAILURE\n"); return 1; } - cxip_trace_flush(); + cxip_coll_trace_flush(); return 0; } @@ -885,7 +885,7 @@ static inline bool _istest(uint64_t mask, int test) int main(int argc, char **argv) { - bool trace_enabled = false; + bool trace_muted = true; char hostname[256]; fi_addr_t *fiaddrs = NULL; struct cxip_ep *cxip_ep; @@ -957,7 +957,7 @@ int main(int argc, char **argv) badnic = strtol(optarg, NULL, 16); break; case 'V': - trace_enabled = true; + trace_muted = false; break; case 'v': verbose = true; @@ -970,16 +970,16 @@ int main(int argc, char **argv) } frmwk_init(false); - if (frmwk_check_env(4)) + if (frmwk_check_env(2)) return -1; ret = frmwk_init_libfabric(); if (frmwk_errmsg(ret, "frmwk_init_libfabric()\n")) return ret; - cxip_trace_rank = frmwk_rank; - cxip_trace_numranks = frmwk_numranks; - cxip_trace_enable(trace_enabled); + cxip_coll_trace_rank = frmwk_rank; + cxip_coll_trace_numranks = frmwk_numranks; + cxip_coll_trace_muted = trace_muted; TRACE("==== tracing enabled offset %d\n", frmwk_rank); srand(seed); @@ -1259,7 +1259,7 @@ int main(int argc, char **argv) double time; TRACE("======= %s\n", testname); - trace_enabled = cxip_trace_enable(false); + cxip_coll_trace_muted = true; zb1 = NULL; ret = cxip_zbcoll_alloc(ep_obj, size, fiaddrs, ZB_NOSIM, &zb1); clock_gettime(CLOCK_MONOTONIC, &t0); @@ -1276,11 +1276,11 @@ int main(int argc, char **argv) time = _measure_nsecs(&t0); time /= 1.0*count; time /= 1000.0; - cxip_trace_enable(trace_enabled); + cxip_coll_trace_muted = trace_muted; cxip_zbcoll_free(zb1); errcnt += !!ret; _idle_wait(ep_obj, 100); - frmwk_log0("%4s %s \tcount=%ld time=%1.2fus\n", + frmwk_log0("%4s %s \tcount=%ld time=%1.2fus/op\n", ret ? 
"FAIL" : "ok", testname, count, time); frmwk_barrier(); } @@ -1291,7 +1291,7 @@ int main(int argc, char **argv) double time; TRACE("======= %s\n", testname); - trace_enabled = cxip_trace_enable(false); + cxip_coll_trace_muted = true; zb1 = NULL; ret = _getgroup(ep_obj, size, fiaddrs, &zb1); clock_gettime(CLOCK_MONOTONIC, &t0); @@ -1303,11 +1303,11 @@ int main(int argc, char **argv) time = _measure_nsecs(&t0); time /= 1.0*count; time /= 1000.0; - cxip_trace_enable(trace_enabled); + cxip_coll_trace_muted = trace_muted; cxip_zbcoll_free(zb1); errcnt += !!ret; _idle_wait(ep_obj, 100); - frmwk_log0("%4s %s \tcount=%ld time=%1.2fus\n", + frmwk_log0("%4s %s \tcount=%ld time=%1.2fus/op\n", ret ? "FAIL" : "ok", testname, count, time); frmwk_barrier(); } @@ -1319,7 +1319,7 @@ int main(int argc, char **argv) double time; TRACE("======= %s\n", testname); - trace_enabled = cxip_trace_enable(false); + cxip_coll_trace_muted = true; zb1 = NULL; ret = _getgroup(ep_obj, size, fiaddrs, &zb1); clock_gettime(CLOCK_MONOTONIC, &t0); @@ -1331,11 +1331,11 @@ int main(int argc, char **argv) time = _measure_nsecs(&t0); time /= 1.0*count; time /= 1000.0; - cxip_trace_enable(trace_enabled); + cxip_coll_trace_muted = trace_muted; cxip_zbcoll_free(zb1); errcnt += !!ret; _idle_wait(ep_obj, 100); - frmwk_log0("%4s %s \tcount=%ld time=%1.2fus\n", + frmwk_log0("%4s %s \tcount=%ld time=%1.2fus/op\n", ret ? "FAIL" : "ok", testname, count, time); frmwk_barrier(); } @@ -1347,7 +1347,7 @@ int main(int argc, char **argv) double time; TRACE("======= %s\n", testname); - trace_enabled = cxip_trace_enable(false); + cxip_coll_trace_muted = true; zb1 = NULL; ret = _getgroup(ep_obj, size, fiaddrs, &zb1); clock_gettime(CLOCK_MONOTONIC, &t0); @@ -1359,11 +1359,11 @@ int main(int argc, char **argv) time = _measure_nsecs(&t0); time /= 1.0*count; time /= 1000.0; - cxip_trace_enable(trace_enabled); + cxip_coll_trace_muted = trace_muted; cxip_zbcoll_free(zb1); errcnt += !!ret; _idle_wait(ep_obj, 100); - frmwk_log0("%4s %s \tcount=%ld time=%1.2fus\n", + frmwk_log0("%4s %s \tcount=%ld time=%1.2fus/op\n", ret ? "FAIL" : "ok", testname, count, time); frmwk_barrier(); } @@ -1396,7 +1396,7 @@ int main(int argc, char **argv) 1, 0, frmwk_rank); //ret = _getgroup(ep_obj, size, fiaddrs, &zb1); TRACE("listening forever....\n"); - cxip_trace_flush(); + cxip_coll_trace_flush(); _idle_wait(ep_obj, -1); frmwk_log0("%4s %s\n", ret ? "FAIL" : "ok", testname); } else { diff --git a/prov/cxi/test/rma.c b/prov/cxi/test/rma.c index 6a53fee3b07..27990b9a8f8 100644 --- a/prov/cxi/test/rma.c +++ b/prov/cxi/test/rma.c @@ -123,8 +123,7 @@ Test(rma, zero_byte_readmsg) mr_destroy(&mem_window); } -/* Test fi_write simple case. Test IDC sizes to multi-packe sizes. */ -Test(rma, simple_write) +static void simple_write(void) { int ret; uint8_t *send_buf; @@ -161,6 +160,12 @@ Test(rma, simple_write) free(send_buf); } +/* Test fi_write simple case. Test IDC sizes to multi-packe sizes. 
*/ +Test(rma, simple_write) +{ + simple_write(); +} + /* Test compatibility of client/provider keys */ Test(rma, key_compatibility) { @@ -803,8 +808,7 @@ Test(rma, simple_inject_write) free(send_buf); } -/* Test fi_read simple case */ -Test(rma, simple_read) +static void simple_read(void) { int ret; uint8_t *local; @@ -845,6 +849,12 @@ Test(rma, simple_read) free(local); } +/* Test fi_read simple case */ +Test(rma, simple_read) +{ + simple_read(); +} + /* Test fi_readv simple case */ Test(rma, simple_readv) { @@ -2234,3 +2244,20 @@ Test(rma_mr_event, stale_key) free(src_buf); free(src_buf2); } + +/* Note: the FI_PROTO_CXI_RNR should not alter the standard RMA + * and Atomic functions. Perform a limited set of writes and reads to + * verify this. + */ +TestSuite(rnr_rma, .init = cxit_setup_rnr_msg_ep, + .fini = cxit_teardown_msg, .timeout = CXIT_DEFAULT_TIMEOUT); + +Test(rnr_rma, simple_write) +{ + simple_write(); +} + +Test(rnr_rma, simple_read) +{ + simple_read(); +} diff --git a/prov/cxi/test/run.sh b/prov/cxi/test/run.sh old mode 100644 new mode 100755 diff --git a/prov/cxi/test/run_tests_vm.sh b/prov/cxi/test/run_tests_vm.sh old mode 100644 new mode 100755 diff --git a/prov/cxi/test/startvm-setup.sh b/prov/cxi/test/startvm-setup.sh old mode 100644 new mode 100755 index 06f19c8c958..0e79f611a86 --- a/prov/cxi/test/startvm-setup.sh +++ b/prov/cxi/test/startvm-setup.sh @@ -15,7 +15,7 @@ ulimit -l unlimited modprobe ptp modprobe iommu_v2 || modprobe amd_iommu_v2 -insmod $DBS_DIR/slingshot_base_link/sbl.ko +insmod $DBS_DIR/slingshot_base_link/cxi-sbl.ko insmod $DBS_DIR/sl-driver/knl/cxi-sl.ko insmod $DBS_DIR/cxi-driver/cxi/cxi-core.ko disable_default_svc=0 insmod $DBS_DIR/cxi-driver/cxi/cxi-user.ko diff --git a/prov/cxi/test/startvm.sh b/prov/cxi/test/startvm.sh old mode 100644 new mode 100755 diff --git a/prov/cxi/test/tagged.c b/prov/cxi/test/tagged.c index f730f5228e9..e711767f308 100644 --- a/prov/cxi/test/tagged.c +++ b/prov/cxi/test/tagged.c @@ -17,8 +17,7 @@ TestSuite(tagged, .init = cxit_setup_tagged, .fini = cxit_teardown_tagged, .timeout = CXIT_DEFAULT_TIMEOUT); -/* Test basic send/recv */ -Test(tagged, ping) +static void ping(void) { int i, ret; uint8_t *recv_buf, @@ -83,6 +82,220 @@ Test(tagged, ping) free(recv_buf); } +/* Test basic send/recv w/data */ +static void pingdata(void) +{ + int i, ret; + uint8_t *recv_buf, + *send_buf; + int recv_len = 64; + int send_len = 64; + struct fi_cq_tagged_entry tx_cqe, + rx_cqe; + int err = 0; + fi_addr_t from; + uint64_t data = 0xabcdabcdabcdabcd; + + recv_buf = aligned_alloc(s_page_size, recv_len); + cr_assert(recv_buf); + memset(recv_buf, 0, recv_len); + + send_buf = aligned_alloc(s_page_size, send_len); + cr_assert(send_buf); + + for (i = 0; i < send_len; i++) + send_buf[i] = i + 0xa0; + + /* Post RX buffer */ + ret = fi_trecv(cxit_ep, recv_buf, recv_len, NULL, FI_ADDR_UNSPEC, 0, + 0, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_trecv failed %d", ret); + + /* Send 64 bytes to self */ + ret = fi_tsenddata(cxit_ep, send_buf, send_len, NULL, data, + cxit_ep_fi_addr, 0, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsenddata failed %d", ret); + + /* Wait for async event indicating data has been received */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_rx_event(&rx_cqe, NULL, send_len, + FI_TAGGED | FI_RECV | FI_REMOTE_CQ_DATA, + NULL, data, 0); + cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); + + /* Wait for 
async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &tx_cqe); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_tx_event(&tx_cqe, FI_TAGGED | FI_SEND, NULL); + + /* Validate sent data */ + for (i = 0; i < send_len; i++) { + cr_expect_eq(recv_buf[i], send_buf[i], + "data mismatch, element[%d], exp=%d saw=%d, err=%d\n", + i, send_buf[i], recv_buf[i], err++); + } + cr_assert_eq(err, 0, "Data errors seen\n"); + + free(send_buf); + free(recv_buf); +} +/* Test basic sendv/recvv */ +static void vping(void) +{ + int i, ret; + uint8_t *recv_buf, + *send_buf; + int recv_len = 64; + int send_len = 64; + struct fi_cq_tagged_entry tx_cqe, + rx_cqe; + int err = 0; + fi_addr_t from; + struct iovec siovec; + struct iovec riovec; + + recv_buf = aligned_alloc(s_page_size, recv_len); + cr_assert(recv_buf); + memset(recv_buf, 0, recv_len); + + send_buf = aligned_alloc(s_page_size, send_len); + cr_assert(send_buf); + + for (i = 0; i < send_len; i++) + send_buf[i] = i + 0xa0; + + /* Post RX buffer */ + riovec.iov_base = recv_buf; + riovec.iov_len = recv_len; + ret = fi_trecvv(cxit_ep, &riovec, NULL, 1, FI_ADDR_UNSPEC, 0, 0, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_trecvv failed %d", ret); + + /* Send 64 bytes to self */ + siovec.iov_base = send_buf; + siovec.iov_len = send_len; + ret = fi_tsendv(cxit_ep, &siovec, NULL, 1, cxit_ep_fi_addr, 0, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsendv failed %d", ret); + + /* Wait for async event indicating data has been received */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_rx_event(&rx_cqe, NULL, send_len, FI_TAGGED | FI_RECV, NULL, + 0, 0); + cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &tx_cqe); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_tx_event(&tx_cqe, FI_TAGGED | FI_SEND, NULL); + + /* Validate sent data */ + for (i = 0; i < send_len; i++) { + cr_expect_eq(recv_buf[i], send_buf[i], + "data mismatch, element[%d], exp=%d saw=%d, err=%d\n", + i, send_buf[i], recv_buf[i], err++); + } + cr_assert_eq(err, 0, "Data errors seen\n"); + + free(send_buf); + free(recv_buf); +} + +/* Test basic sendmsg/recvmsg */ +static void msgping(void) +{ + int i, ret; + uint8_t *recv_buf, + *send_buf; + int recv_len = 64; + int send_len = 64; + struct fi_cq_tagged_entry tx_cqe, + rx_cqe; + int err = 0; + fi_addr_t from; + struct fi_msg_tagged rmsg = {}; + struct fi_msg_tagged smsg = {}; + struct iovec riovec; + struct iovec siovec; + + recv_buf = aligned_alloc(s_page_size, recv_len); + cr_assert(recv_buf); + memset(recv_buf, 0, recv_len); + + send_buf = aligned_alloc(s_page_size, send_len); + cr_assert(send_buf); + + for (i = 0; i < send_len; i++) + send_buf[i] = i + 0xa0; + + /* Post RX buffer */ + riovec.iov_base = recv_buf; + riovec.iov_len = recv_len; + rmsg.msg_iov = &riovec; + rmsg.iov_count = 1; + rmsg.addr = FI_ADDR_UNSPEC; + rmsg.tag = 0; + rmsg.ignore = 0; + rmsg.context = NULL; + + ret = fi_trecvmsg(cxit_ep, &rmsg, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_trecvmsg failed %d", ret); + + /* Send 64 bytes to self */ + siovec.iov_base = send_buf; + siovec.iov_len = send_len; + smsg.msg_iov = &siovec; + smsg.iov_count = 1; + smsg.addr = cxit_ep_fi_addr; + smsg.tag = 0; + smsg.ignore = 0; + smsg.context = NULL; + + ret = fi_tsendmsg(cxit_ep, &smsg, 
0); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsendmsg failed %d", ret); + + /* Wait for async event indicating data has been received */ + do { + ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_rx_event(&rx_cqe, NULL, send_len, FI_TAGGED | FI_RECV, NULL, + 0, 0); + cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); + + /* Wait for async event indicating data has been sent */ + ret = cxit_await_completion(cxit_tx_cq, &tx_cqe); + cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); + + validate_tx_event(&tx_cqe, FI_TAGGED | FI_SEND, NULL); + + /* Validate sent data */ + for (i = 0; i < send_len; i++) { + cr_expect_eq(recv_buf[i], send_buf[i], + "data mismatch, element[%d], exp=%d saw=%d, err=%d\n", + i, send_buf[i], recv_buf[i], err++); + } + cr_assert_eq(err, 0, "Data errors seen\n"); + + free(send_buf); + free(recv_buf); +} + +/* Test basic send/recv */ +Test(tagged, ping) +{ + ping(); +} + /* Test basic zero-byte send/recv */ Test(tagged, zbr) { @@ -304,6 +517,8 @@ Test(tagged, fail_alt_read_rdzv) int ret; struct cxip_ep *ep = container_of(&cxit_ep->fid, struct cxip_ep, ep.fid); + struct cxip_txc_hpc *txc = container_of(ep->ep_obj->txc, + struct cxip_txc_hpc, base); /* If not testing alt_read protocol skip */ rdzv_proto = getenv("FI_CXI_RDZV_PROTO"); @@ -315,7 +530,7 @@ Test(tagged, fail_alt_read_rdzv) /* Force error on allocation of hardware resources required * by alt_read rendezvous protocol. */ - ep->ep_obj->txc.force_err |= CXIP_TXC_FORCE_ERR_ALT_READ_PROTO_ALLOC; + txc->force_err |= CXIP_TXC_FORCE_ERR_ALT_READ_PROTO_ALLOC; ret = cxit_dom_read_cntr(C_CNTR_IXE_RX_PTL_RESTRICTED_PKT, &start_pkt_cnt, NULL, true); @@ -336,64 +551,7 @@ Test(tagged, fail_alt_read_rdzv) /* Test basic send/recv w/data */ Test(tagged, pingdata) { - int i, ret; - uint8_t *recv_buf, - *send_buf; - int recv_len = 64; - int send_len = 64; - struct fi_cq_tagged_entry tx_cqe, - rx_cqe; - int err = 0; - fi_addr_t from; - uint64_t data = 0xabcdabcdabcdabcd; - - recv_buf = aligned_alloc(s_page_size, recv_len); - cr_assert(recv_buf); - memset(recv_buf, 0, recv_len); - - send_buf = aligned_alloc(s_page_size, send_len); - cr_assert(send_buf); - - for (i = 0; i < send_len; i++) - send_buf[i] = i + 0xa0; - - /* Post RX buffer */ - ret = fi_trecv(cxit_ep, recv_buf, recv_len, NULL, FI_ADDR_UNSPEC, 0, - 0, NULL); - cr_assert_eq(ret, FI_SUCCESS, "fi_trecv failed %d", ret); - - /* Send 64 bytes to self */ - ret = fi_tsenddata(cxit_ep, send_buf, send_len, NULL, data, - cxit_ep_fi_addr, 0, NULL); - cr_assert_eq(ret, FI_SUCCESS, "fi_tsenddata failed %d", ret); - - /* Wait for async event indicating data has been received */ - do { - ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); - } while (ret == -FI_EAGAIN); - cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); - - validate_rx_event(&rx_cqe, NULL, send_len, - FI_TAGGED | FI_RECV | FI_REMOTE_CQ_DATA, - NULL, data, 0); - cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); - - /* Wait for async event indicating data has been sent */ - ret = cxit_await_completion(cxit_tx_cq, &tx_cqe); - cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); - - validate_tx_event(&tx_cqe, FI_TAGGED | FI_SEND, NULL); - - /* Validate sent data */ - for (i = 0; i < send_len; i++) { - cr_expect_eq(recv_buf[i], send_buf[i], - "data mismatch, element[%d], exp=%d saw=%d, err=%d\n", - i, send_buf[i], recv_buf[i], err++); - } - cr_assert_eq(err, 0, "Data 
errors seen\n"); - - free(send_buf); - free(recv_buf); + pingdata(); } /* Test basic inject send */ @@ -459,142 +617,16 @@ Test(tagged, inject_ping) cxit_ep_fi_addr, 0); cr_assert_eq(ret, -FI_EMSGSIZE, "fi_tinject failed %d", ret); - ret = fi_tinject(cxit_ep, send_buf, cxit_fi->ep_attr->max_msg_size+1, - cxit_ep_fi_addr, 0); - cr_assert_eq(ret, -FI_EMSGSIZE, "fi_tinject failed %d", ret); - - free(send_buf); - free(recv_buf); -} - -/* Test basic injectdata */ -Test(tagged, injectdata_ping) -{ - int i, ret; - uint8_t *recv_buf, - *send_buf; - int recv_len = 64; - int send_len = 64; - struct fi_cq_tagged_entry tx_cqe, - rx_cqe; - int err = 0; - fi_addr_t from; - uint64_t data = 0xabcdabcdabcdabcd; - - recv_buf = aligned_alloc(s_page_size, recv_len); - cr_assert(recv_buf); - memset(recv_buf, 0, recv_len); - - send_buf = aligned_alloc(s_page_size, send_len); - cr_assert(send_buf); - - for (i = 0; i < send_len; i++) - send_buf[i] = i + 0xa0; - - /* Post RX buffer */ - ret = fi_trecv(cxit_ep, recv_buf, recv_len, NULL, FI_ADDR_UNSPEC, 0, - 0, NULL); - cr_assert_eq(ret, FI_SUCCESS, "fi_trecv failed %d", ret); - - /* Send 64 bytes to self */ - ret = fi_tinjectdata(cxit_ep, send_buf, send_len, data, - cxit_ep_fi_addr, 0); - cr_assert_eq(ret, FI_SUCCESS, "fi_tinject failed %d", ret); - - /* Wait for async event indicating data has been received */ - do { - ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); - } while (ret == -FI_EAGAIN); - cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); - - validate_rx_event(&rx_cqe, NULL, send_len, - FI_TAGGED | FI_RECV | FI_REMOTE_CQ_DATA, - NULL, data, 0); - cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); - - /* Validate sent data */ - for (i = 0; i < send_len; i++) { - cr_expect_eq(recv_buf[i], send_buf[i], - "data mismatch, element[%d], exp=%d saw=%d, err=%d\n", - i, send_buf[i], recv_buf[i], err++); - } - cr_assert_eq(err, 0, "Data errors seen\n"); - - /* Make sure a TX event wasn't delivered */ - ret = fi_cq_read(cxit_tx_cq, &tx_cqe, 1); - cr_assert(ret == -FI_EAGAIN); - - free(send_buf); - free(recv_buf); -} - -/* Test basic sendv/recvv */ -Test(tagged, vping) -{ - int i, ret; - uint8_t *recv_buf, - *send_buf; - int recv_len = 64; - int send_len = 64; - struct fi_cq_tagged_entry tx_cqe, - rx_cqe; - int err = 0; - fi_addr_t from; - struct iovec siovec; - struct iovec riovec; - - recv_buf = aligned_alloc(s_page_size, recv_len); - cr_assert(recv_buf); - memset(recv_buf, 0, recv_len); - - send_buf = aligned_alloc(s_page_size, send_len); - cr_assert(send_buf); - - for (i = 0; i < send_len; i++) - send_buf[i] = i + 0xa0; - - /* Post RX buffer */ - riovec.iov_base = recv_buf; - riovec.iov_len = recv_len; - ret = fi_trecvv(cxit_ep, &riovec, NULL, 1, FI_ADDR_UNSPEC, 0, 0, NULL); - cr_assert_eq(ret, FI_SUCCESS, "fi_trecvv failed %d", ret); - - /* Send 64 bytes to self */ - siovec.iov_base = send_buf; - siovec.iov_len = send_len; - ret = fi_tsendv(cxit_ep, &siovec, NULL, 1, cxit_ep_fi_addr, 0, NULL); - cr_assert_eq(ret, FI_SUCCESS, "fi_tsendv failed %d", ret); - - /* Wait for async event indicating data has been received */ - do { - ret = fi_cq_readfrom(cxit_rx_cq, &rx_cqe, 1, &from); - } while (ret == -FI_EAGAIN); - cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); - - validate_rx_event(&rx_cqe, NULL, send_len, FI_TAGGED | FI_RECV, NULL, - 0, 0); - cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); - - /* Wait for async event indicating data has been sent */ - ret = cxit_await_completion(cxit_tx_cq, &tx_cqe); - 
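/*
 * [Editor's illustrative sketch -- not part of the patch.]  The inject tests
 * in this hunk rely on two libfabric properties: fi_tinject() buffers the
 * payload immediately and produces no TX completion event, and it rejects
 * lengths beyond the endpoint's inject limit with -FI_EMSGSIZE.  A minimal
 * guard illustrating that, using a hypothetical ep/info pair.
 */
#include <rdma/fi_tagged.h>
#include <rdma/fi_errno.h>

static ssize_t send_small_tagged(struct fid_ep *ep, struct fi_info *info,
				 const void *buf, size_t len,
				 fi_addr_t dest, uint64_t tag)
{
	if (len <= info->tx_attr->inject_size)
		return fi_tinject(ep, buf, len, dest, tag); /* no TX event */

	/* too large to inject; a full fi_tsend() with a completion is needed */
	return -FI_EMSGSIZE;
}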
cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); - - validate_tx_event(&tx_cqe, FI_TAGGED | FI_SEND, NULL); - - /* Validate sent data */ - for (i = 0; i < send_len; i++) { - cr_expect_eq(recv_buf[i], send_buf[i], - "data mismatch, element[%d], exp=%d saw=%d, err=%d\n", - i, send_buf[i], recv_buf[i], err++); - } - cr_assert_eq(err, 0, "Data errors seen\n"); - + ret = fi_tinject(cxit_ep, send_buf, cxit_fi->ep_attr->max_msg_size+1, + cxit_ep_fi_addr, 0); + cr_assert_eq(ret, -FI_EMSGSIZE, "fi_tinject failed %d", ret); + free(send_buf); free(recv_buf); } -/* Test basic sendmsg/recvmsg */ -Test(tagged, msgping) +/* Test basic injectdata */ +Test(tagged, injectdata_ping) { int i, ret; uint8_t *recv_buf, @@ -605,10 +637,7 @@ Test(tagged, msgping) rx_cqe; int err = 0; fi_addr_t from; - struct fi_msg_tagged rmsg = {}; - struct fi_msg_tagged smsg = {}; - struct iovec riovec; - struct iovec siovec; + uint64_t data = 0xabcdabcdabcdabcd; recv_buf = aligned_alloc(s_page_size, recv_len); cr_assert(recv_buf); @@ -621,30 +650,14 @@ Test(tagged, msgping) send_buf[i] = i + 0xa0; /* Post RX buffer */ - riovec.iov_base = recv_buf; - riovec.iov_len = recv_len; - rmsg.msg_iov = &riovec; - rmsg.iov_count = 1; - rmsg.addr = FI_ADDR_UNSPEC; - rmsg.tag = 0; - rmsg.ignore = 0; - rmsg.context = NULL; - - ret = fi_trecvmsg(cxit_ep, &rmsg, 0); - cr_assert_eq(ret, FI_SUCCESS, "fi_trecvmsg failed %d", ret); + ret = fi_trecv(cxit_ep, recv_buf, recv_len, NULL, FI_ADDR_UNSPEC, 0, + 0, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_trecv failed %d", ret); /* Send 64 bytes to self */ - siovec.iov_base = send_buf; - siovec.iov_len = send_len; - smsg.msg_iov = &siovec; - smsg.iov_count = 1; - smsg.addr = cxit_ep_fi_addr; - smsg.tag = 0; - smsg.ignore = 0; - smsg.context = NULL; - - ret = fi_tsendmsg(cxit_ep, &smsg, 0); - cr_assert_eq(ret, FI_SUCCESS, "fi_tsendmsg failed %d", ret); + ret = fi_tinjectdata(cxit_ep, send_buf, send_len, data, + cxit_ep_fi_addr, 0); + cr_assert_eq(ret, FI_SUCCESS, "fi_tinject failed %d", ret); /* Wait for async event indicating data has been received */ do { @@ -652,16 +665,11 @@ Test(tagged, msgping) } while (ret == -FI_EAGAIN); cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); - validate_rx_event(&rx_cqe, NULL, send_len, FI_TAGGED | FI_RECV, NULL, - 0, 0); + validate_rx_event(&rx_cqe, NULL, send_len, + FI_TAGGED | FI_RECV | FI_REMOTE_CQ_DATA, + NULL, data, 0); cr_assert(from == cxit_ep_fi_addr, "Invalid source address"); - /* Wait for async event indicating data has been sent */ - ret = cxit_await_completion(cxit_tx_cq, &tx_cqe); - cr_assert_eq(ret, 1, "fi_cq_read unexpected value %d", ret); - - validate_tx_event(&tx_cqe, FI_TAGGED | FI_SEND, NULL); - /* Validate sent data */ for (i = 0; i < send_len; i++) { cr_expect_eq(recv_buf[i], send_buf[i], @@ -670,10 +678,26 @@ Test(tagged, msgping) } cr_assert_eq(err, 0, "Data errors seen\n"); + /* Make sure a TX event wasn't delivered */ + ret = fi_cq_read(cxit_tx_cq, &tx_cqe, 1); + cr_assert(ret == -FI_EAGAIN); + free(send_buf); free(recv_buf); } +/* Test basic sendv/recvv */ +Test(tagged, vping) +{ + vping(); +} + +/* Test basic sendmsg/recvmsg */ +Test(tagged, msgping) +{ + msgping(); +} + /* Test FI_FENCE */ Test(tagged, fence) { @@ -4249,7 +4273,7 @@ Test(tagged, recv_more) struct cxip_ep *ep = container_of(cxit_ep, struct cxip_ep, ep.fid); /* FI_MORE has no meaning if receives are not offloaded */ - if (!ep->ep_obj->rxc.msg_offload) { + if (!ep->ep_obj->rxc->msg_offload) { cr_assert(1); return; } @@ -5775,3 +5799,261 @@ 
Test(tagged_tx_size, force_progress) free(send_buf); free(recv_buf); } + +/* Note: the FI_PROTO_CXI_RNR tagged message test suite uses rnr_tagged + * so that it will not be included in flow-control and software EP tests, + * which it does not support. + */ +TestSuite(rnr_tagged, .init = cxit_setup_rnr_msg_ep, + .fini = cxit_teardown_msg, .timeout = CXIT_DEFAULT_TIMEOUT); + +Test(rnr_tagged, ping) +{ + ping(); +} + +Test(rnr_tagged, pingdata) +{ + pingdata(); +} + +Test(rnr_tagged, vping) +{ + vping(); +} + +Test(rnr_tagged, msgping) +{ + msgping(); +} + +Test(rnr_tagged, peek) +{ + int ret; + ssize_t len = 4096; + uint8_t *send_buf; + uint8_t *recv_buf; + uint64_t tag = 11; + struct fi_cq_tagged_entry rx_cqe; + struct fi_cq_tagged_entry tx_cqe; + struct fi_context rx_ctxt; + + send_buf = aligned_alloc(s_page_size, len); + cr_assert_not_null(send_buf); + recv_buf = aligned_alloc(s_page_size, len); + cr_assert_not_null(recv_buf); + + memset(send_buf, 0xa5, len); + + /* Issue the Send it will be in retransmits */ + ret = fi_tsend(cxit_ep, send_buf, len, NULL, cxit_ep_fi_addr, + tag, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsend unexpected ret %d", ret); + + /* Just to make sure nothing completed unexpectedly in all modes */ + fi_cq_read(cxit_rx_cq, &rx_cqe, 0); + + /* Issue a FI_PEEK, should return -FI_ENOMSG since there is not + * an unexpected list. + */ + ret = try_peek(FI_ADDR_UNSPEC, tag, 0, len, NULL, false); + cr_assert_eq(ret, FI_ENOMSG, "peek of CS message succeeded"); + + /* Issue a FI_PEEK | FI_CLAIM, should return -FI_ENOMSG since + * there is not an unexpected list. + */ + ret = try_peek(FI_ADDR_UNSPEC, tag, 0, len, &rx_ctxt, true); + cr_assert_eq(ret, FI_ENOMSG, + "peek with claim of CS message succeeded"); + + /* Issue the Receive to recvieve the message that is being RNR + * retried. + */ + ret = fi_trecv(cxit_ep, recv_buf, len, NULL, FI_ADDR_UNSPEC, + tag, 0, NULL); + cr_assert_eq(ret, FI_SUCCESS, "fi_trecv unexpected ret %d", ret); + + do { + ret = fi_cq_read(cxit_rx_cq, &rx_cqe, 1); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "RX CQ error\n"); + + do { + ret = fi_cq_read(cxit_tx_cq, &tx_cqe, 1); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, 1, "TX CQ error\n"); + + /* Validate results */ + cr_expect_arr_eq(send_buf, recv_buf, len); + + free(send_buf); + free(recv_buf); +} + +/* CS multiple tagged received test. The last_to_first option is + * used to verify that a RNR for a specific tagged message will not + * keep other tagged messages from matching. 
+ */ +struct rnr_multitudes_params { + size_t length; + size_t num_ios; + bool last_to_first; +}; + +void do_rnr_multitudes(struct rnr_multitudes_params *param) +{ + int ret; + size_t rx_io; + int i; + size_t tx_io; + size_t buf_len = param->length; + struct fi_cq_tagged_entry *rx_cqe; + struct fi_cq_tagged_entry *tx_cqe; + struct tagged_thread_args *tx_args; + struct tagged_thread_args *rx_args; + struct fi_context *rx_ctxts; + pthread_t tx_thread; + pthread_t rx_thread; + pthread_attr_t attr; + struct tagged_event_args tx_evt_args = { + .cq = cxit_tx_cq, + .io_num = param->num_ios, + }; + struct tagged_event_args rx_evt_args = { + .cq = cxit_rx_cq, + .io_num = param->num_ios, + }; + + tx_cqe = calloc(param->num_ios, sizeof(struct fi_cq_tagged_entry)); + cr_assert_not_null(tx_cqe); + + rx_cqe = calloc(param->num_ios, sizeof(struct fi_cq_tagged_entry)); + cr_assert_not_null(rx_cqe); + + tx_args = calloc(param->num_ios, sizeof(struct tagged_thread_args)); + cr_assert_not_null(tx_args); + + rx_args = calloc(param->num_ios, sizeof(struct tagged_thread_args)); + cr_assert_not_null(rx_args); + + rx_ctxts = calloc(param->num_ios, sizeof(struct fi_context)); + cr_assert_not_null(rx_ctxts); + + pthread_attr_init(&attr); + pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); + + tx_evt_args.cqe = tx_cqe; + rx_evt_args.cqe = rx_cqe; + + /* Issue the Sends */ + for (tx_io = 0; tx_io < param->num_ios; tx_io++) { + tx_args[tx_io].len = buf_len; + tx_args[tx_io].tag = tx_io; + tx_args[tx_io].buf = aligned_alloc(s_page_size, buf_len); + cr_assert_not_null(tx_args[tx_io].buf); + for (size_t i = 0; i < buf_len; i++) + tx_args[tx_io].buf[i] = i + 0xa0 + tx_io; + + do { + ret = fi_tsend(cxit_ep, tx_args[tx_io].buf, + tx_args[tx_io].len, NULL, + cxit_ep_fi_addr, tx_args[tx_io].tag, + NULL); + if (ret == -FI_EAGAIN) { + fi_cq_read(cxit_tx_cq, NULL, 0); + fi_cq_read(cxit_rx_cq, NULL, 0); + } + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, FI_SUCCESS, "fi_tsend %ld: unexpected ret %d", + tx_io, ret); + } + + /* Start processing Send events progressing retry of RNR */ + ret = pthread_create(&tx_thread, &attr, tagged_evt_worker, + (void *)&tx_evt_args); + cr_assert_eq(ret, 0, "Send thread create failed %d", ret); + + /* Issue the Receives */ + for (i = 0; i < param->num_ios; i++) { + rx_io = param->last_to_first ? 
+ param->num_ios - 1 - i : i; + rx_args[rx_io].len = buf_len; + rx_args[rx_io].tag = rx_io; + rx_args[rx_io].buf = aligned_alloc(s_page_size, buf_len); + cr_assert_not_null(rx_args[rx_io].buf); + memset(rx_args[rx_io].buf, 0, buf_len); + + do { + ret = fi_trecv(cxit_ep, rx_args[rx_io].buf, + rx_args[rx_io].len, NULL, FI_ADDR_UNSPEC, + rx_args[rx_io].tag, 0, NULL); + if (ret == -FI_EAGAIN) + fi_cq_read(cxit_rx_cq, NULL, 0); + } while (ret == -FI_EAGAIN); + cr_assert_eq(ret, FI_SUCCESS, "fi_trecv %ld: unexpected ret %d", + rx_io, ret); + } + + /* Start processing Receive events */ + ret = pthread_create(&rx_thread, &attr, tagged_evt_worker, + (void *)&rx_evt_args); + cr_assert_eq(ret, 0, "Receive thread create failed %d", ret); + + /* Wait for the RX/TX event threads to complete */ + ret = pthread_join(tx_thread, NULL); + cr_assert_eq(ret, 0, "Send thread join failed %d", ret); + + ret = pthread_join(rx_thread, NULL); + cr_assert_eq(ret, 0, "Recv thread join failed %d", ret); + + /* Validate results */ + for (size_t io = 0; io < param->num_ios; io++) { + /* Validate sent data */ + cr_expect_arr_eq(rx_args[io].buf, tx_args[io].buf, buf_len); + + validate_tx_event(&tx_cqe[io], FI_TAGGED | FI_SEND, NULL); + validate_rx_event(&rx_cqe[io], NULL, + buf_len, FI_TAGGED | FI_RECV, NULL, + 0, tx_args[rx_cqe[io].tag].tag); + free(tx_args[io].buf); + free(rx_args[io].buf); + } + + pthread_attr_destroy(&attr); + free(rx_cqe); + free(tx_cqe); + free(tx_args); + free(rx_args); + free(rx_ctxts); +} + + +ParameterizedTestParameters(rnr_tagged, rnr_multitudes) +{ + size_t param_sz; + + static struct rnr_multitudes_params rnr_params[] = { + {.length = 1024, + .num_ios = 10, + .last_to_first = false}, + {.length = 1024, + .num_ios = 10, + .last_to_first = true}, + {.length = 8192, + .num_ios = 10, + .last_to_first = false}, + {.length = 8192, + .num_ios = 10, + .last_to_first = true}, + }; + + param_sz = ARRAY_SIZE(rnr_params); + return cr_make_param_array(struct rnr_multitudes_params, rnr_params, + param_sz); +} + +ParameterizedTest(struct rnr_multitudes_params *param, rnr_tagged, + rnr_multitudes, .timeout = 60) +{ + do_rnr_multitudes(param); +} diff --git a/prov/cxi/test/test_sw.sh b/prov/cxi/test/test_sw.sh old mode 100644 new mode 100755 diff --git a/prov/efa/Makefile.include b/prov/efa/Makefile.include index 268929211a8..bd53e7b9052 100644 --- a/prov/efa/Makefile.include +++ b/prov/efa/Makefile.include @@ -137,6 +137,7 @@ nodist_prov_efa_test_efa_unit_test_SOURCES = \ prov/efa/test/efa_unit_test_ep.c \ prov/efa/test/efa_unit_test_av.c \ prov/efa/test/efa_unit_test_cq.c \ + prov/efa/test/efa_unit_test_cntr.c \ prov/efa/test/efa_unit_test_device.c \ prov/efa/test/efa_unit_test_info.c \ prov/efa/test/efa_unit_test_hmem.c \ @@ -170,6 +171,10 @@ if HAVE_EFADV_QUERY_MR prov_efa_test_efa_unit_test_LDFLAGS += -Wl,--wrap=efadv_query_mr endif HAVE_EFADV_QUERY_MR +if HAVE_EFA_DATA_IN_ORDER_ALIGNED_128_BYTES +prov_efa_test_efa_unit_test_LDFLAGS += -Wl,--wrap=ibv_query_qp_data_in_order +endif + prov_efa_test_efa_unit_test_LIBS = $(efa_LIBS) $(linkback) endif ENABLE_EFA_UNIT_TEST diff --git a/prov/efa/configure.m4 b/prov/efa/configure.m4 index ba71f14820b..619e9d61c35 100644 --- a/prov/efa/configure.m4 +++ b/prov/efa/configure.m4 @@ -224,6 +224,7 @@ AC_DEFUN([FI_EFA_CONFIGURE],[ AM_CONDITIONAL([HAVE_EFADV_CQ_EX], [ test $efadv_support_extended_cq = 1]) AM_CONDITIONAL([HAVE_EFADV_QUERY_MR], [ test $have_efadv_query_mr = 1]) + AM_CONDITIONAL([HAVE_EFA_DATA_IN_ORDER_ALIGNED_128_BYTES], [ test 
$efa_support_data_in_order_aligned_128_byte = 1]) AM_CONDITIONAL([ENABLE_EFA_UNIT_TEST], [ test x"$enable_efa_unit_test" != xno]) AC_SUBST(efa_CPPFLAGS) diff --git a/prov/efa/docs/efa_rdm_protocol_v4.md b/prov/efa/docs/efa_rdm_protocol_v4.md index 0fada5145ee..def2873f9b4 100644 --- a/prov/efa/docs/efa_rdm_protocol_v4.md +++ b/prov/efa/docs/efa_rdm_protocol_v4.md @@ -571,8 +571,8 @@ Table: 3.2 Format of the REQ optional raw address header | Field | Type | Length | C language type | |---|---|---|---| -| `size` | integer | 4 | `uint32` | -| `addr` | array | `size` | `uint8[]` | +| `size` | integer | 4 | `uint32_t` | +| `addr` | array | `size` | `uint8_t[]` | As can be seen, the optional raw address consists of two fields `size` and `addr`. The field `size` describes the number of bytes in the `addr` array. The field `addr` contains the raw address. The `size` field is necessary because @@ -599,7 +599,7 @@ application data. For example, the RTR (Request To Read) packet type does not co ### 3.2 Baseline features for two-sided communication This section describes the 3 baseline features for two sided communication: eager message, medium message, and -long-CTS message.` Each of them corresponds to the same named subprotocol. When describing a subprotocol, we +long-CTS message. Each of them corresponds to the same named subprotocol. When describing a subprotocol, we always follow the same structure: workflow, packet format, and implementation tips. #### Eager message feature/subprotocol @@ -659,7 +659,7 @@ on this topic. total_packet_size is reported by the EFA device when a packet is received. The REQ optional header length can be derived from the `flags` field in the base header. The choice of not including data length in the header is to keep the header - length as compact as possible, since eager messagers are sensitive to header length. + length as compact as possible, since eager messages are sensitive to header length. #### Medium message feature/subprotocol @@ -698,7 +698,7 @@ refers to the segment of data in the packet) `seg_offset` seems redundant at first glance, as it can be deduced from the `seg_length` of other packets. However, because the EFA device does not guarantee ordered delivery, the MEDIUM_RTM packets of same message can -arrive in a different order. Therefore, the recipent of MEDIUM_RTM packets +arrive in a different order. Therefore, the recipient of MEDIUM_RTM packets needs `seg_offset` to put the data in the correct location in the receive buffer. @@ -706,7 +706,7 @@ When implementing the medium message protocol, please keep in mind that because the EFA device has a limited TX queue (e.g. it can only send a limited number of packets at a time), it is possible when sending multiple medium RTM packets for some packets to be sent successfully and others to -not be sent due to temporarily being out of reseources. Implementation needs +not be sent due to temporarily being out of resources. Implementation needs to be able to handle this case. Note, this "partial send" situation is unique to the medium message @@ -812,7 +812,7 @@ Table: 3.6 Format a CTS packet The 3 new fields in the header are `multiuse`, `recv_id` and `recv_length`. The field `multiuse` is a 4 byte integer. As the name indicates, it is a multi-purpose field. -Its exact usage is determined by the the `flags` field. +Its exact usage is determined by the `flags` field. If the CONNID_HDR universal flag is toggled in `flags`, this field is the sender's connection ID (connid). Otherwise, it is a padding space. 
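/*
 * [Editor's illustrative sketch -- not part of the patch.]  The paragraph
 * above describes the `multiuse` field of the CTS header: it carries the
 * sender's connection ID only when the CONNID_HDR universal flag is set in
 * `flags`, and is otherwise padding.  The struct layout and flag value below
 * are placeholders for illustration, not the provider's actual definitions.
 */
#include <stdint.h>
#include <stdbool.h>

#define EXAMPLE_CONNID_HDR_FLAG	(1u << 15)	/* placeholder bit position */

struct example_cts_hdr {
	uint8_t  type;		/* packet type */
	uint8_t  version;	/* protocol version */
	uint16_t flags;
	uint32_t multiuse;	/* connid or padding, depending on flags */
	/* ... recv_id, recv_length ... */
};

static bool example_cts_get_connid(const struct example_cts_hdr *hdr,
				   uint32_t *connid)
{
	if (!(hdr->flags & EXAMPLE_CONNID_HDR_FLAG))
		return false;	/* multiuse is only padding here */
	*connid = hdr->multiuse;
	return true;
}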
@@ -854,7 +854,7 @@ Table: 3.7 Format of the CTSDATA packet header | `padding` | 4 | integer | `uint32_t` | padding for connid, optional | The last two fields `connid` and `padding` was introduced with the extra request "connid header". -They are optional, which means an implemenation is not required to include them in the DATA of the +They are optional, which means an implementation is not required to include them in the DATA of the data packet. If an implementation does include them in the CTSDATA packet header, the implementation needs to toggle on the CONNID_DHR flag in the `flags` field (Table 1.4). @@ -873,7 +873,7 @@ Before getting into the details of each feature, we will discuss some topics rel There are 3 types of one-sided operations: write, read, and atomic. -Like in two-sided communcation, there are also two endpoints involved in one-sided communcation. +Like in two-sided communication, there are also two endpoints involved in one-sided communication. However, only on one side will the application call libfabric's one-sided API (such as `fi_write`, `fi_read` and `fi_atomic`). In protocol v4, this side is called the requester. @@ -1012,7 +1012,7 @@ Table: 3.11 Format of the READRSP packet's header The field `multiuse` has been introduced before when introducing the CTS packet (table 3.6). It is a multi-purpose field, which can be used to store `connid` or as a padding space, depending on whether the CONNID_HDR universal flag is toggled in `flags`. See section 4.4 for more -information about the field `connid. +information about the field `connid`. The workflow of the emulated long-CTS read subprotocol is illustrated in the following diagram: @@ -1054,7 +1054,7 @@ The workflow of emulated write atomic is illustrated in the following diagram: ![atomic_write](atomic_write.png) It is similar to the emulated eager write subprotocol, except a WRITE_RTA packet was -sent. Table 3.13 lists the binary structure of an WRITE_RTA packet's mandatory +sent. Table 3.13 lists the binary structure of a WRITE_RTA packet's mandatory header: Table: 3.13 Format of the WRITE_RTA packet's mandatory header @@ -1351,7 +1351,7 @@ buffer at a later time. However, if an application has the following set of requ 1. Does not need ordered send/receive (`FI_ORDER_SAS`) 2. Only sends/receives eager messages 3. Does not use tagged send - 4. Does not require FI_DIRECTED_RECV (the ability to receive only from certain addresses) + 4. Does not require `FI_DIRECTED_RECV` (the ability to receive only from certain addresses) it should be possible to receive data directly using the application buffer since, under such conditions, the receiver does not have special requirements on the data it is going to receive, and it will thus accept any @@ -1371,18 +1371,22 @@ to work, the receiver needs to: However, there is no guarantee in the base protocol that the packet header length of EAGER_MSGRTM will not change. -In fact, because of the existance of the handshake subprotocol, the packet header length of an EAGER_MSGRTM +In fact, because of the existence of the handshake subprotocol, the packet header length of an EAGER_MSGRTM will definitely change. Recall that the handshake subprotocol's workflow is: -Before receiving handshake packet, an endpoint will always include the optional raw address header in REQ packets. +1. Before receiving handshake packet, an endpoint will always include the + optional raw address header in REQ packets. +2. 
After receiving handshake packet, an endpoint will stop including the + optional raw address header in REQ packets. -After receiving handshake packet, an endpoint will stop including the optional raw address header in REQ packets. +Furthermore, a REQ packet may also contain the optional CQ data header (section +3.1) header (`REQ_OPT_CQ_DATA_HDR`) and is not mutually exclusive with other +optional REQ header components. Therefore, a compliant request of _constant +header length_ should include space for the CQ data header. -The extra feature "keep packet header length constant" (constant header length) is designed to solve this problem. - -When an endpoint toggles on this extra request, its peer will try to satisfy it by keeping the header length -constant. Exactly how to achieve that is up to the implementation to decide. The easiest way to do so is to keep -including the raw address header in the EAGER_MSGRTM even after receiving the handshake packet. +The extra feature "keep packet header length constant" (constant header length) +is designed to solve this problem. When an endpoint toggles on this extra +request, its peer will try to satisfy it by keeping the header length constant. Note, because this is an extra request, an endpoint cannot assume its peer will comply with the request. Therefore, the receiving endpoint must be able to handle the situation that a received packet does not have the expected header @@ -1394,7 +1398,7 @@ In that case, implementation will have two choices: 2. Move the application data to the right place Note, this extra request was initially introduced as an extra feature named "zero copy receive", but later it was realized -that this is not an feature because the peer does not do anything different. Rather, it is an expectation that the +that this is not a feature because the peer does not do anything different. Rather, it is an expectation that the receiving endpoint has for the sender. Therefore, it was re-interpreted as an extra request named "constant header length". This re-interpretation does not change the implementation, and thus, it does not cause backward incompatibility. @@ -1407,7 +1411,7 @@ This extra feature is designed to solve the "QP collision" problem, which is com client-server types of application. The "QP collision" problem arose from the fact that the EFA device uses the Device ID (GID) + QP number (QPN) -as the unique identifier of a peer. Recall that the raw address of the EFA endpoint consistsd of 3 parts: +as the unique identifier of a peer. Recall that the raw address of the EFA endpoint consists of 3 parts: GID + QPN + Connection ID (CONNID). The EFA device only recognizes GID and QPN. The connection ID was generated by the endpoint itself during its initialization. diff --git a/prov/efa/src/dgram/efa_dgram_cq.h b/prov/efa/src/dgram/efa_dgram_cq.h index 9e7312f2594..fbb986d3f72 100644 --- a/prov/efa/src/dgram/efa_dgram_cq.h +++ b/prov/efa/src/dgram/efa_dgram_cq.h @@ -1,34 +1,5 @@ -/* - * Copyright (c) 2018-2022 Amazon.com, Inc. or its affiliates. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. 
You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */ #ifndef EFA_DGRAM_CQ_H #define EFA_DGRAM_CQ_H diff --git a/prov/efa/src/dgram/efa_dgram_ep.c b/prov/efa/src/dgram/efa_dgram_ep.c index b3447bf0130..c02bf3556ae 100644 --- a/prov/efa/src/dgram/efa_dgram_ep.c +++ b/prov/efa/src/dgram/efa_dgram_ep.c @@ -1,35 +1,6 @@ -/* - * Copyright (c) 2013-2015 Intel Corporation, Inc. All rights reserved. - * Copyright (c) 2017-2022 Amazon.com, Inc. or its affiliates. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright (c) 2013-2015 Intel Corporation, Inc. All rights reserved. */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. 
*/ #include "config.h" #include "efa_dgram_ep.h" @@ -332,7 +303,7 @@ static void efa_dgram_ep_progress_internal(struct efa_dgram_ep *ep, struct efa_d if (OFI_UNLIKELY(ret < 0)) { if (OFI_UNLIKELY(ret != -FI_EAVAIL)) { EFA_WARN(FI_LOG_CQ, "no error available errno: %ld\n", ret); - efa_base_ep_write_eq_error(&ep->base_ep, FI_EIO, FI_EFA_ERR_DGRAM_CQ_READ); + efa_base_ep_write_eq_error(&ep->base_ep, -ret, FI_EFA_ERR_DGRAM_CQ_READ); return; } diff --git a/prov/efa/src/dgram/efa_dgram_ep.h b/prov/efa/src/dgram/efa_dgram_ep.h index 3b0b803c7cc..ecc8f1772dd 100644 --- a/prov/efa/src/dgram/efa_dgram_ep.h +++ b/prov/efa/src/dgram/efa_dgram_ep.h @@ -1,35 +1,5 @@ - -/* - * Copyright (c) 2018-2022 Amazon.com, Inc. or its affiliates. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */ #include "efa_base_ep.h" diff --git a/prov/efa/src/dgram/efa_dgram_msg.c b/prov/efa/src/dgram/efa_dgram_msg.c index 0dbf7303ede..f8a5010daf9 100644 --- a/prov/efa/src/dgram/efa_dgram_msg.c +++ b/prov/efa/src/dgram/efa_dgram_msg.c @@ -1,35 +1,6 @@ -/* - * Copyright (c) 2013-2015 Intel Corporation, Inc. All rights reserved. - * Copyright (c) 2017-2023 Amazon.com, Inc. or its affiliates. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright (c) 2013-2015 Intel Corporation, Inc. All rights reserved. */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */ #include "config.h" diff --git a/prov/efa/src/dgram/efa_dgram_rma.c b/prov/efa/src/dgram/efa_dgram_rma.c index 22366abcf5d..99f4c1a2929 100644 --- a/prov/efa/src/dgram/efa_dgram_rma.c +++ b/prov/efa/src/dgram/efa_dgram_rma.c @@ -1,35 +1,5 @@ -/* - * Copyright (c) 2020 Amazon.com, Inc. or its affiliates. - * All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */ #include #include @@ -50,7 +20,7 @@ * self_comm: indicate whether the read is toward * the end point itself. If self_comm is true, * caller must set msg->addr to FI_ADDR_NOTAVAIL. - * + * * On success return 0, * If read iov and rma_iov count out of device limit, return -FI_EINVAL * If read failed, return the error of read operation diff --git a/prov/efa/src/efa.h b/prov/efa/src/efa.h index ffb096a0db3..e8325330406 100644 --- a/prov/efa/src/efa.h +++ b/prov/efa/src/efa.h @@ -1,34 +1,5 @@ -/* - * Copyright (c) 2018-2022 Amazon.com, Inc. or its affiliates. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. 
You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */ #ifndef EFA_H #define EFA_H @@ -102,15 +73,12 @@ #define EFA_DEFAULT_RUNT_SIZE (307200) +#define EFA_NEURON_RUNT_SIZE (131072) #define EFA_DEFAULT_INTER_MAX_MEDIUM_MESSAGE_SIZE (65536) #define EFA_DEFAULT_INTER_MIN_READ_MESSAGE_SIZE (1048576) #define EFA_DEFAULT_INTER_MIN_READ_WRITE_SIZE (65536) #define EFA_DEFAULT_INTRA_MAX_GDRCOPY_FROM_DEV_SIZE (3072) -#define EFA_NEURON_RUNT_SIZE (131072) -#define EFA_NEURON_INTER_MAX_MEDIUM_MESSAGE_SIZE (49152) -#define EFA_NEURON_INTER_MIN_READ_MESSAGE_SIZE (49152) - /* * The default memory alignment */ diff --git a/prov/efa/src/efa_av.c b/prov/efa/src/efa_av.c index ca722b6113d..95f902d9794 100644 --- a/prov/efa/src/efa_av.c +++ b/prov/efa/src/efa_av.c @@ -1,36 +1,7 @@ -/* - * Copyright (c) 2016, Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2013-2015 Intel Corporation, Inc. All rights reserved. - * Copyright (c) 2017-2022 Amazon.com, Inc. or its affiliates. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright (c) 2016, Cisco Systems, Inc. All rights reserved. */ +/* SPDX-FileCopyrightText: Copyright (c) 2013-2015 Intel Corporation, Inc. All rights reserved. */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */ #include #include @@ -840,27 +811,24 @@ static void efa_av_close_reverse_av(struct efa_av *av) static int efa_av_close(struct fid *fid) { struct efa_av *av; - int ret = 0; int err = 0; av = container_of(fid, struct efa_av, util_av.av_fid.fid); efa_av_close_reverse_av(av); - ret = ofi_av_close(&av->util_av); - if (ret) { - err = ret; + err = ofi_av_close(&av->util_av); + if (OFI_UNLIKELY(err)) { EFA_WARN(FI_LOG_AV, "Failed to close av: %s\n", - fi_strerror(ret)); + fi_strerror(err)); } if (av->ep_type == FI_EP_RDM) { if (av->shm_rdm_av) { - ret = fi_close(&av->shm_rdm_av->fid); - if (ret) { - err = ret; + err = fi_close(&av->shm_rdm_av->fid); + if (OFI_UNLIKELY(err)) { EFA_WARN(FI_LOG_AV, "Failed to close shm av: %s\n", - fi_strerror(ret)); + fi_strerror(err)); } } } diff --git a/prov/efa/src/efa_av.h b/prov/efa/src/efa_av.h index f90c1944950..75acd87fdd7 100644 --- a/prov/efa/src/efa_av.h +++ b/prov/efa/src/efa_av.h @@ -1,34 +1,5 @@ -/* - * Copyright (c) 2018-2023 Amazon.com, Inc. or its affiliates. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. 
*/ #ifndef EFA_AV_H #define EFA_AV_H diff --git a/prov/efa/src/efa_base_ep.c b/prov/efa/src/efa_base_ep.c index 742841d30a6..aaf7fdf55a1 100644 --- a/prov/efa/src/efa_base_ep.c +++ b/prov/efa/src/efa_base_ep.c @@ -34,7 +34,7 @@ int efa_base_ep_bind_av(struct efa_base_ep *base_ep, struct efa_av *av) return 0; } -static int efa_base_ep_destruct_qp(struct efa_base_ep *base_ep) +int efa_base_ep_destruct_qp(struct efa_base_ep *base_ep) { struct efa_domain *domain; struct efa_qp *qp = base_ep->qp; @@ -48,23 +48,31 @@ static int efa_base_ep_destruct_qp(struct efa_base_ep *base_ep) EFA_INFO(FI_LOG_CORE, "destroy qp[%u] failed!\n", qp->qp_num); free(qp); + base_ep->qp = NULL; } return 0; } -int efa_base_ep_destruct(struct efa_base_ep *base_ep) +void efa_base_ep_close_util_ep(struct efa_base_ep *base_ep) { int err; - /* We need to free the util_ep first to avoid race conditions - * with other threads progressing the cq. */ if (base_ep->util_ep_initialized) { err = ofi_endpoint_close(&base_ep->util_ep); if (err) EFA_WARN(FI_LOG_EP_CTRL, "Unable to close util EP\n"); base_ep->util_ep_initialized = false; } +} + +int efa_base_ep_destruct(struct efa_base_ep *base_ep) +{ + int err; + + /* We need to free the util_ep first to avoid race conditions + * with other threads progressing the cq. */ + efa_base_ep_close_util_ep(base_ep); fi_freeinfo(base_ep->info); @@ -149,36 +157,53 @@ static int efa_base_ep_modify_qp_rst2rts(struct efa_base_ep *base_ep, IBV_QP_STATE | IBV_QP_SQ_PSN); } -int efa_base_ep_create_qp(struct efa_base_ep *base_ep, - struct ibv_qp_init_attr_ex *init_attr_ex) +/** + * @brief Create an efa_qp + * + * @param qp double pointer to a struct efa_qp + * @param init_attr_ex ibv_qp_init_attr_ex + * @return int 0 on success, negative integer on failure + */ +int efa_qp_create(struct efa_qp **qp, struct ibv_qp_init_attr_ex *init_attr_ex) { - struct efa_qp *qp; struct efadv_qp_init_attr efa_attr = { 0 }; - qp = calloc(1, sizeof(*qp)); - if (!qp) + *qp = calloc(1, sizeof(struct efa_qp)); + if (!*qp) return -FI_ENOMEM; if (init_attr_ex->qp_type == IBV_QPT_UD) { - qp->ibv_qp = ibv_create_qp_ex(init_attr_ex->pd->context, + (*qp)->ibv_qp = ibv_create_qp_ex(init_attr_ex->pd->context, init_attr_ex); } else { assert(init_attr_ex->qp_type == IBV_QPT_DRIVER); efa_attr.driver_qp_type = EFADV_QP_DRIVER_TYPE_SRD; - qp->ibv_qp = efadv_create_qp_ex( + (*qp)->ibv_qp = efadv_create_qp_ex( init_attr_ex->pd->context, init_attr_ex, &efa_attr, sizeof(struct efadv_qp_init_attr)); } - if (!qp->ibv_qp) { + if (!(*qp)->ibv_qp) { EFA_WARN(FI_LOG_EP_CTRL, "ibv_create_qp failed. errno: %d\n", errno); - free(qp); + free(*qp); + *qp = NULL; return -errno; } - qp->ibv_qp_ex = ibv_qp_to_qp_ex(qp->ibv_qp); - base_ep->qp = qp; - qp->base_ep = base_ep; + (*qp)->ibv_qp_ex = ibv_qp_to_qp_ex((*qp)->ibv_qp); + return FI_SUCCESS; +} + +int efa_base_ep_create_qp(struct efa_base_ep *base_ep, + struct ibv_qp_init_attr_ex *init_attr_ex) +{ + int ret; + + ret = efa_qp_create(&base_ep->qp, init_attr_ex); + if (ret) + return ret; + + base_ep->qp->base_ep = base_ep; return 0; } @@ -333,17 +358,17 @@ int efa_base_ep_getname(fid_t fid, void *addr, size_t *addrlen) * in-order operation.
*/ #if HAVE_EFA_DATA_IN_ORDER_ALIGNED_128_BYTES -bool efa_base_ep_support_op_in_order_aligned_128_bytes(struct efa_base_ep *base_ep, enum ibv_wr_opcode op) +bool efa_qp_support_op_in_order_aligned_128_bytes(struct efa_qp *qp, enum ibv_wr_opcode op) { int caps; - caps = ibv_query_qp_data_in_order(base_ep->qp->ibv_qp, op, + caps = ibv_query_qp_data_in_order(qp->ibv_qp, op, IBV_QUERY_QP_DATA_IN_ORDER_RETURN_CAPS); return !!(caps & IBV_QUERY_QP_DATA_IN_ORDER_ALIGNED_128_BYTES); } #else -bool efa_base_ep_support_op_in_order_aligned_128_bytes(struct efa_base_ep *base_ep, enum ibv_wr_opcode op) +bool efa_qp_support_op_in_order_aligned_128_bytes(struct efa_qp *qp, enum ibv_wr_opcode op) { return false; } diff --git a/prov/efa/src/efa_base_ep.h b/prov/efa/src/efa_base_ep.h index f5bf1e45616..5de64cf13e4 100644 --- a/prov/efa/src/efa_base_ep.h +++ b/prov/efa/src/efa_base_ep.h @@ -1,34 +1,5 @@ -/* - * Copyright (c) 2018-2022 Amazon.com, Inc. or its affiliates. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. 
*/ #ifndef EFA_BASE_EP_H #define EFA_BASE_EP_H @@ -59,9 +30,8 @@ struct efa_recv_wr { * * @details * EFA device supports a maximum of 2 iov/SGE - * For receive, we only use 1 SGE */ - struct ibv_sge sge[1]; + struct ibv_sge sge[2]; }; struct efa_base_ep { @@ -98,10 +68,16 @@ int efa_base_ep_construct(struct efa_base_ep *base_ep, int efa_base_ep_getname(fid_t fid, void *addr, size_t *addrlen); +int efa_qp_create(struct efa_qp **qp, struct ibv_qp_init_attr_ex *init_attr_ex); + int efa_base_ep_create_qp(struct efa_base_ep *base_ep, struct ibv_qp_init_attr_ex *init_attr_ex); -bool efa_base_ep_support_op_in_order_aligned_128_bytes(struct efa_base_ep *base_ep, +void efa_base_ep_close_util_ep(struct efa_base_ep *base_ep); + +int efa_base_ep_destruct_qp(struct efa_base_ep *base_ep); + +bool efa_qp_support_op_in_order_aligned_128_bytes(struct efa_qp *qp, enum ibv_wr_opcode op); void efa_base_ep_write_eq_error(struct efa_base_ep *ep, diff --git a/prov/efa/src/efa_cntr.c b/prov/efa/src/efa_cntr.c index 3f76d689614..efff358fdb8 100644 --- a/prov/efa/src/efa_cntr.c +++ b/prov/efa/src/efa_cntr.c @@ -1,40 +1,11 @@ -/* - * Copyright (c) 2013-2018 Intel Corporation. All rights reserved. - * Copyright (c) 2019 Amazon.com, Inc. or its affiliates. - * All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright (c) 2013-2018 Intel Corporation, Inc. All rights reserved. */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. 
*/ #include "ofi_util.h" #include "efa.h" #include "efa_cntr.h" +#include "rdm/efa_rdm_cq.h" static int efa_cntr_wait(struct fid_cntr *cntr_fid, uint64_t threshold, int timeout) { @@ -44,14 +15,13 @@ static int efa_cntr_wait(struct fid_cntr *cntr_fid, uint64_t threshold, int time int numtry = 5; int tryid = 0; int waitim = 1; - struct util_srx_ctx *srx_ctx; + struct efa_domain *domain; - srx_ctx = efa_cntr_get_srx_ctx(cntr_fid); + cntr = container_of(cntr_fid, struct util_cntr, cntr_fid); + domain = container_of(cntr->domain, struct efa_domain, util_domain); - if (srx_ctx) - ofi_genlock_lock(srx_ctx->lock); + ofi_genlock_lock(&domain->srx_lock); - cntr = container_of(cntr_fid, struct util_cntr, cntr_fid); assert(cntr->wait); errcnt = ofi_atomic_get64(&cntr->err); start = (timeout >= 0) ? ofi_gettime_ms() : 0; @@ -84,52 +54,47 @@ static int efa_cntr_wait(struct fid_cntr *cntr_fid, uint64_t threshold, int time } unlock: - if (srx_ctx) - ofi_genlock_unlock(srx_ctx->lock); + ofi_genlock_unlock(&domain->srx_lock); return ret; } static uint64_t efa_cntr_read(struct fid_cntr *cntr_fid) { - struct util_srx_ctx *srx_ctx; + struct efa_domain *domain; struct efa_cntr *efa_cntr; uint64_t ret; efa_cntr = container_of(cntr_fid, struct efa_cntr, util_cntr.cntr_fid); - srx_ctx = efa_cntr_get_srx_ctx(cntr_fid); + domain = container_of(efa_cntr->util_cntr.domain, struct efa_domain, util_domain); - if (srx_ctx) - ofi_genlock_lock(srx_ctx->lock); + ofi_genlock_lock(&domain->srx_lock); if (efa_cntr->shm_cntr) fi_cntr_read(efa_cntr->shm_cntr); ret = ofi_cntr_read(cntr_fid); - if (srx_ctx) - ofi_genlock_unlock(srx_ctx->lock); + ofi_genlock_unlock(&domain->srx_lock); return ret; } static uint64_t efa_cntr_readerr(struct fid_cntr *cntr_fid) { - struct util_srx_ctx *srx_ctx; + struct efa_domain *domain; struct efa_cntr *efa_cntr; uint64_t ret; efa_cntr = container_of(cntr_fid, struct efa_cntr, util_cntr.cntr_fid); - srx_ctx = efa_cntr_get_srx_ctx(cntr_fid); + domain = container_of(efa_cntr->util_cntr.domain, struct efa_domain, util_domain); - if (srx_ctx) - ofi_genlock_lock(srx_ctx->lock); + ofi_genlock_lock(&domain->srx_lock); if (efa_cntr->shm_cntr) fi_cntr_read(efa_cntr->shm_cntr); ret = ofi_cntr_readerr(cntr_fid); - if (srx_ctx) - ofi_genlock_unlock(srx_ctx->lock); + ofi_genlock_unlock(&domain->srx_lock); return ret; } @@ -176,6 +141,28 @@ static struct fi_ops efa_cntr_fi_ops = { .ops_open = fi_no_ops_open, }; +static void efa_rdm_cntr_progress(struct util_cntr *cntr) +{ + struct util_ep *ep; + struct fid_list_entry *fid_entry; + struct dlist_entry *item; + struct efa_cntr *efa_cntr; + struct efa_ibv_cq_poll_list_entry *poll_list_entry; + + ofi_genlock_lock(&cntr->ep_list_lock); + efa_cntr = container_of(cntr, struct efa_cntr, util_cntr); + dlist_foreach(&efa_cntr->ibv_cq_poll_list, item) { + poll_list_entry = container_of(item, struct efa_ibv_cq_poll_list_entry, entry); + efa_rdm_cq_poll_ibv_cq(efa_env.efa_cq_read_size, poll_list_entry->cq); + } + dlist_foreach(&cntr->ep_list, item) { + fid_entry = container_of(item, struct fid_list_entry, entry); + ep = container_of(fid_entry->fid, struct util_ep, ep_fid.fid); + ep->progress(ep); + } + ofi_genlock_unlock(&cntr->ep_list_lock); +} + int efa_cntr_open(struct fid_domain *domain, struct fi_cntr_attr *attr, struct fid_cntr **cntr_fid, void *context) { @@ -184,16 +171,22 @@ int efa_cntr_open(struct fid_domain *domain, struct fi_cntr_attr *attr, struct efa_domain *efa_domain; struct fi_cntr_attr shm_cntr_attr = {0}; struct fi_peer_cntr_context peer_cntr_context = 
{0}; + ofi_cntr_progress_func cntr_progress_func; cntr = calloc(1, sizeof(*cntr)); if (!cntr) return -FI_ENOMEM; + dlist_init(&cntr->ibv_cq_poll_list); efa_domain = container_of(domain, struct efa_domain, util_domain.domain_fid); + cntr_progress_func = efa_domain->info->ep_attr->type == FI_EP_RDM + ? efa_rdm_cntr_progress + : ofi_cntr_progress; ret = ofi_cntr_init(&efa_prov, domain, attr, &cntr->util_cntr, - &ofi_cntr_progress, context); + cntr_progress_func, context); + if (ret) goto free; diff --git a/prov/efa/src/efa_cntr.h b/prov/efa/src/efa_cntr.h index 4dd1eed800e..a4e1bb26997 100644 --- a/prov/efa/src/efa_cntr.h +++ b/prov/efa/src/efa_cntr.h @@ -1,35 +1,5 @@ -/* - * Copyright (c) Amazon.com, Inc. or its affiliates. - * All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */ #if HAVE_CONFIG_H #include @@ -41,6 +11,7 @@ struct efa_cntr { struct util_cntr util_cntr; struct fid_cntr *shm_cntr; + struct dlist_entry ibv_cq_poll_list; }; int efa_cntr_open(struct fid_domain *domain, struct fi_cntr_attr *attr, @@ -52,20 +23,5 @@ void efa_cntr_report_rx_completion(struct util_ep *ep, uint64_t flags); void efa_cntr_report_error(struct util_ep *ep, uint64_t flags); -static inline -void *efa_cntr_get_srx_ctx(struct fid_cntr *cntr_fid) -{ - struct efa_cntr *efa_cntr; - struct fid_peer_srx *srx = NULL; - - efa_cntr = container_of(cntr_fid, struct efa_cntr, util_cntr.cntr_fid); - - srx = efa_cntr->util_cntr.domain->srx; - if (!srx) - return NULL; - - return srx->ep_fid.fid.context; -} - #endif diff --git a/prov/efa/src/efa_cq.h b/prov/efa/src/efa_cq.h index f70f0074219..e616741d454 100644 --- a/prov/efa/src/efa_cq.h +++ b/prov/efa/src/efa_cq.h @@ -1,37 +1,77 @@ -/* - * Copyright (c) 2022 Amazon.com, Inc. or its affiliates. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. 
You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */ #include "efa.h" +enum ibv_cq_ex_type { + IBV_CQ, + EFADV_CQ +}; + +struct efa_ibv_cq { + struct ibv_cq_ex *ibv_cq_ex; + enum ibv_cq_ex_type ibv_cq_ex_type; +}; + +struct efa_ibv_cq_poll_list_entry { + struct dlist_entry entry; + struct efa_ibv_cq *cq; +}; + +static inline +int efa_ibv_cq_poll_list_match(struct dlist_entry *entry, const void *cq) +{ + struct efa_ibv_cq_poll_list_entry *item; + item = container_of(entry, struct efa_ibv_cq_poll_list_entry, entry); + return (item->cq == cq); +} + + +static inline +int efa_ibv_cq_poll_list_insert(struct dlist_entry *poll_list, struct ofi_genlock *lock, struct efa_ibv_cq *cq) +{ + int ret = 0; + struct dlist_entry *entry; + struct efa_ibv_cq_poll_list_entry *item; + + ofi_genlock_lock(lock); + entry = dlist_find_first_match(poll_list, efa_ibv_cq_poll_list_match, cq); + if (entry) { + ret = -FI_EALREADY; + goto out; + } + + item = calloc(1, sizeof(*item)); + if (!item) { + ret = -FI_ENOMEM; + goto out; + } + + item->cq = cq; + dlist_insert_tail(&item->entry, poll_list); + +out: + ofi_genlock_unlock(lock); + return (!ret || (ret == -FI_EALREADY)) ? 0 : ret; +} + +static inline +void efa_ibv_cq_poll_list_remove(struct dlist_entry *poll_list, struct ofi_genlock *lock, + struct efa_ibv_cq *cq) +{ + struct efa_ibv_cq_poll_list_entry *item; + struct dlist_entry *entry; + + ofi_genlock_lock(lock); + entry = dlist_remove_first_match(poll_list, efa_ibv_cq_poll_list_match, cq); + ofi_genlock_unlock(lock); + + if (entry) { + item = container_of(entry, struct efa_ibv_cq_poll_list_entry, entry); + free(item); + } +} + /** * @brief Create ibv_cq_ex by calling ibv_create_cq_ex * @@ -50,7 +90,7 @@ static inline int efa_cq_ibv_cq_ex_open_with_ibv_create_cq_ex( if (!*ibv_cq_ex) { EFA_WARN(FI_LOG_CQ, "Unable to create extended CQ: %s\n", strerror(errno)); - return -FI_ENOCQ; + return -FI_EINVAL; } *ibv_cq_ex_type = IBV_CQ; diff --git a/prov/efa/src/efa_device.c b/prov/efa/src/efa_device.c index 9ac9b0745bf..709948909fd 100644 --- a/prov/efa/src/efa_device.c +++ b/prov/efa/src/efa_device.c @@ -1,36 +1,7 @@ -/* - * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. 
- * Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2017-2020 Amazon.com, Inc. or its affiliates. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. */ +/* SPDX-FileCopyrightText: Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved. */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */ #if HAVE_CONFIG_H # include diff --git a/prov/efa/src/efa_device.h b/prov/efa/src/efa_device.h index 9bf22958c99..061b38d9775 100644 --- a/prov/efa/src/efa_device.h +++ b/prov/efa/src/efa_device.h @@ -1,34 +1,5 @@ -/* - * Copyright (c) 2022 Amazon.com, Inc. or its affiliates. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */ #ifndef EFA_DEVICE_H #define EFA_DEVICE_H diff --git a/prov/efa/src/efa_domain.c b/prov/efa/src/efa_domain.c index ddae5b9f4f4..f1a81c89780 100644 --- a/prov/efa/src/efa_domain.c +++ b/prov/efa/src/efa_domain.c @@ -1,35 +1,7 @@ -/* - * Copyright (c) 2013-2015 Intel Corporation, Inc. All rights reserved. - * Copyright (c) 2017-2020 Amazon.com, Inc. or its affiliates. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright (c) 2013-2015 Intel Corporation, Inc. All rights reserved. */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */ + #include #include @@ -187,6 +159,7 @@ static int efa_domain_init_rdm(struct efa_domain *efa_domain, struct fi_info *in efa_domain->addrlen = (info->src_addr) ? info->src_addrlen : info->dest_addrlen; efa_domain->rdm_cq_size = MAX(info->rx_attr->size + info->tx_attr->size, efa_env.cq_size); + efa_domain->num_read_msg_in_flight = 0; return 0; } diff --git a/prov/efa/src/efa_domain.h b/prov/efa/src/efa_domain.h index 6f09ccd582b..1d74a9aa2ed 100644 --- a/prov/efa/src/efa_domain.h +++ b/prov/efa/src/efa_domain.h @@ -1,34 +1,5 @@ -/* - * Copyright (c) 2022 Amazon.com, Inc. or its affiliates. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. 
- * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */ #ifndef EFA_DOMAIN_H #define EFA_DOMAIN_H @@ -59,6 +30,7 @@ struct efa_domain { size_t rdm_cq_size; struct dlist_entry list_entry; /* linked to g_efa_domain_list */ struct ofi_genlock srx_lock; /* shared among peer providers */ + uint64_t num_read_msg_in_flight; }; extern struct dlist_entry g_efa_domain_list; diff --git a/prov/efa/src/efa_env.c b/prov/efa/src/efa_env.c index 7cbec3a8de5..46229beee69 100644 --- a/prov/efa/src/efa_env.c +++ b/prov/efa/src/efa_env.c @@ -1,35 +1,5 @@ -/* - * Copyright (c) 2019 Amazon.com, Inc. or its affiliates. - * All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */ #include diff --git a/prov/efa/src/efa_env.h b/prov/efa/src/efa_env.h index 36db115a219..2d315d7bedb 100644 --- a/prov/efa/src/efa_env.h +++ b/prov/efa/src/efa_env.h @@ -1,35 +1,5 @@ -/* - * Copyright (c) 2022 Amazon.com, Inc. or its affiliates. - * All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. 
You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */ #ifndef _EFA_ENV_H #define _EFA_ENV_H diff --git a/prov/efa/src/efa_errno.h b/prov/efa/src/efa_errno.h index 0c3447b7821..4457c57e69e 100644 --- a/prov/efa/src/efa_errno.h +++ b/prov/efa/src/efa_errno.h @@ -4,6 +4,8 @@ #ifndef EFA_ERRNO_H #define EFA_ERRNO_H +#include + #define EFA_IO_COMP_STATUS_START 0 /** @@ -58,16 +60,17 @@ _(1, FLUSHED, Flushed during queue pair destroy) \ _(2, LOCAL_ERROR_QP_INTERNAL_ERROR, Internal queue pair error) \ _(3, LOCAL_ERROR_INVALID_OP_TYPE, Invalid operation type) \ - _(4, LOCAL_ERROR_INVALID_AH, Invalid address handle) \ + _(4, LOCAL_ERROR_INVALID_AH, Invalid address handle (local)) \ _(5, LOCAL_ERROR_INVALID_LKEY, Invalid local key (LKEY)) \ _(6, LOCAL_ERROR_BAD_LENGTH, Message too long) \ - _(7, REMOTE_ERROR_BAD_ADDRESS, Destination ENI is down or does not run EFA) \ + _(7, REMOTE_ERROR_BAD_ADDRESS, RKEY not registered or does not match remote IOVA) \ _(8, REMOTE_ERROR_ABORT, Receiver connection aborted) \ _(9, REMOTE_ERROR_BAD_DEST_QPN, Invalid receiver queue pair number (QPN)) \ _(10, REMOTE_ERROR_RNR, Receiver not ready) \ _(11, REMOTE_ERROR_BAD_LENGTH, Receiver scatter-gather list (SGL) too short) \ _(12, REMOTE_ERROR_BAD_STATUS, Unexpected status received from remote) \ - _(13, LOCAL_ERROR_UNRESP_REMOTE, Unresponsive receiver (connection never established or unknown)) + _(13, LOCAL_ERROR_UNRESP_REMOTE, Unresponsive receiver (connection never established or unknown)) \ + _(14, REMOTE_ERROR_UNKNOWN_PEER, Invalid address handle on remote) /** * @brief EFA provider proprietary error codes @@ -132,6 +135,48 @@ enum efa_errno { #undef EFA_IO_COMP_STATUS_ENUM #undef EFA_PROV_ERRNO_ENUM +/** + * @brief Convert an EFA error code into a common Libfabric error code + * + * @param[in] err An EFA-specific error code + * @return Analogous common Libfabric error code + * + * @sa fi_errno(3) + */ +static inline int to_fi_errno(enum efa_errno err) { + switch (err) { + case EFA_IO_COMP_STATUS_OK: + return FI_SUCCESS; + case EFA_IO_COMP_STATUS_FLUSHED: + return FI_EHOSTDOWN; + case EFA_IO_COMP_STATUS_LOCAL_ERROR_QP_INTERNAL_ERROR: + case EFA_IO_COMP_STATUS_LOCAL_ERROR_INVALID_AH: + case 
EFA_IO_COMP_STATUS_LOCAL_ERROR_INVALID_LKEY: + case EFA_IO_COMP_STATUS_LOCAL_ERROR_INVALID_OP_TYPE: + case EFA_IO_COMP_STATUS_REMOTE_ERROR_BAD_ADDRESS: + return FI_EINVAL; + case EFA_IO_COMP_STATUS_LOCAL_ERROR_UNRESP_REMOTE: + return FI_EHOSTUNREACH; + case EFA_IO_COMP_STATUS_LOCAL_ERROR_BAD_LENGTH: + case EFA_IO_COMP_STATUS_REMOTE_ERROR_BAD_LENGTH: + return FI_EMSGSIZE; + case EFA_IO_COMP_STATUS_REMOTE_ERROR_ABORT: + case FI_EFA_ERR_ESTABLISHED_RECV_UNRESP: + return FI_ECONNABORTED; + case EFA_IO_COMP_STATUS_REMOTE_ERROR_BAD_DEST_QPN: + case EFA_IO_COMP_STATUS_REMOTE_ERROR_UNKNOWN_PEER: + return FI_ENOTCONN; + case EFA_IO_COMP_STATUS_REMOTE_ERROR_RNR: + return FI_ENORX; + case EFA_IO_COMP_STATUS_REMOTE_ERROR_BAD_STATUS: + return FI_EREMOTEIO; + case FI_EFA_ERR_OOM: + return FI_ENOMEM; + default: + return FI_EOTHER; + } +} + const char *efa_strerror(enum efa_errno); void efa_show_help(enum efa_errno); diff --git a/prov/efa/src/efa_fabric.c b/prov/efa/src/efa_fabric.c index eabf68c128e..b9539b73dda 100644 --- a/prov/efa/src/efa_fabric.c +++ b/prov/efa/src/efa_fabric.c @@ -1,39 +1,6 @@ -/* - * Copyright (c) 2014-2016, Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2017-2020 Amazon.com, Inc. or its affiliates. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS - * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE - * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, - * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN - * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright (c) 2014-2016, Cisco Systems, Inc. All rights reserved. */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */ #include "config.h" diff --git a/prov/efa/src/efa_fork_support.c b/prov/efa/src/efa_fork_support.c index d6f53426871..82db3505987 100644 --- a/prov/efa/src/efa_fork_support.c +++ b/prov/efa/src/efa_fork_support.c @@ -1,34 +1,5 @@ -/* - * Copyright (c) 2022 Amazon.com, Inc. or its affiliates. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. 
You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */ #include "efa.h" diff --git a/prov/efa/src/efa_fork_support.h b/prov/efa/src/efa_fork_support.h index 93d302dd48c..ef16c23d577 100644 --- a/prov/efa/src/efa_fork_support.h +++ b/prov/efa/src/efa_fork_support.h @@ -1,34 +1,5 @@ -/* - * Copyright (c) 2022 Amazon.com, Inc. or its affiliates. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */ #ifndef EFA_FORK_SUPPORT_H #define EFA_FORK_SUPPORT_H diff --git a/prov/efa/src/efa_hmem.c b/prov/efa/src/efa_hmem.c index 894b4f648ef..666ed6f0305 100644 --- a/prov/efa/src/efa_hmem.c +++ b/prov/efa/src/efa_hmem.c @@ -1,34 +1,5 @@ -/* - * Copyright (c) 2022 Amazon.com, Inc. or its affiliates. All rights reserved. 
- * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */ #include "efa.h" #include "efa_hmem.h" @@ -91,8 +62,8 @@ static int efa_domain_hmem_info_init_protocol_thresholds(struct efa_domain *efa_ case FI_HMEM_NEURON: info->runt_size = EFA_NEURON_RUNT_SIZE; info->max_intra_eager_size = 0; - info->max_medium_msg_size = EFA_NEURON_INTER_MAX_MEDIUM_MESSAGE_SIZE; - info->min_read_msg_size = EFA_NEURON_INTER_MIN_READ_MESSAGE_SIZE; + info->max_medium_msg_size = 0; + info->min_read_msg_size = efa_max_eager_msg_size_with_largest_header(efa_domain) + 1; info->min_read_write_size = efa_max_eager_msg_size_with_largest_header(efa_domain) + 1; fi_param_get_size_t(&efa_prov, "runt_size", &info->runt_size); fi_param_get_size_t(&efa_prov, "inter_min_read_message_size", &info->min_read_msg_size); diff --git a/prov/efa/src/efa_hmem.h b/prov/efa/src/efa_hmem.h index 9236c2886c5..db376c1a2b4 100644 --- a/prov/efa/src/efa_hmem.h +++ b/prov/efa/src/efa_hmem.h @@ -1,34 +1,5 @@ -/* - * Copyright (c) 2022 Amazon.com, Inc. or its affiliates. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */ #ifndef EFA_HMEM_H #define EFA_HMEM_H diff --git a/prov/efa/src/efa_mr.c b/prov/efa/src/efa_mr.c index 4ec944cab2b..161707a7dab 100644 --- a/prov/efa/src/efa_mr.c +++ b/prov/efa/src/efa_mr.c @@ -1,34 +1,5 @@ -/* - * Copyright (c) 2017-2020 Amazon.com, Inc. or its affiliates. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */ #include "config.h" #include @@ -367,7 +338,7 @@ static int efa_mr_cache_regattr(struct fid *fid, const struct fi_mr_attr *attr, domain = container_of(fid, struct efa_domain, util_domain.domain_fid.fid); - assert(attr->iov_count == 1); + assert(attr->iov_count > 0 && attr->iov_count <= domain->info->domain_attr->mr_iov_limit); ofi_mr_info_get_iov_from_mr_attr(&info, attr, flags); info.iface = attr->iface; info.device = attr->device.reserved; diff --git a/prov/efa/src/efa_mr.h b/prov/efa/src/efa_mr.h index b2d95a1aff0..e4c0e2ca143 100644 --- a/prov/efa/src/efa_mr.h +++ b/prov/efa/src/efa_mr.h @@ -1,34 +1,5 @@ -/* - * Copyright (c) 2022 Amazon.com, Inc. or its affiliates. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. 
- * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */ #ifndef EFA_MR_H #define EFA_MR_H diff --git a/prov/efa/src/efa_prov.c b/prov/efa/src/efa_prov.c index 38ff4836d05..85a71aa2c41 100644 --- a/prov/efa/src/efa_prov.c +++ b/prov/efa/src/efa_prov.c @@ -1,35 +1,5 @@ -/* - * Copyright (c) 2019 Amazon.com, Inc. or its affiliates. - * All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */ #include #include "efa.h" diff --git a/prov/efa/src/efa_prov.h b/prov/efa/src/efa_prov.h index 456f02ed9cb..c807bd0de6c 100644 --- a/prov/efa/src/efa_prov.h +++ b/prov/efa/src/efa_prov.h @@ -1,38 +1,6 @@ -/* - * Copyright (c) 2017-2023 Amazon.com, Inc. or its affiliates. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. 
You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS - * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE - * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, - * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN - * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */ + #ifndef EFA_PROV_H #define EFA_PROV_H diff --git a/prov/efa/src/efa_prov_info.c b/prov/efa/src/efa_prov_info.c index f86a956d50a..06ee44b80cf 100644 --- a/prov/efa/src/efa_prov_info.c +++ b/prov/efa/src/efa_prov_info.c @@ -1,38 +1,5 @@ -/* - * Copyright (c) 2022 Amazon.com, Inc. or its affiliates. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS - * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE - * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, - * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN - * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */ #include "ofi_prov.h" #include diff --git a/prov/efa/src/efa_prov_info.h b/prov/efa/src/efa_prov_info.h index 5f16dcba8c0..c5b3ff93c4a 100644 --- a/prov/efa/src/efa_prov_info.h +++ b/prov/efa/src/efa_prov_info.h @@ -1,34 +1,5 @@ -/* - * Copyright (c) 2022 Amazon.com, Inc. or its affiliates. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */ #ifndef EFA_DEVICE_INFO_H #define EFA_DEVICE_INFO_H diff --git a/prov/efa/src/efa_shm.c b/prov/efa/src/efa_shm.c index 3b0f5dab9dd..624c8cfd10e 100644 --- a/prov/efa/src/efa_shm.c +++ b/prov/efa/src/efa_shm.c @@ -1,34 +1,6 @@ -/* - * Copyright (c) 2022 Amazon.com, Inc. or its affiliates. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. 
- * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */ + #include #include "efa.h" #include "efa_shm.h" @@ -114,7 +86,7 @@ void efa_shm_info_create(const struct fi_info *app_info, struct fi_info **shm_in * make this request to shm as well. */ shm_hints->domain_attr->mr_mode = FI_MR_VIRT_ADDR; - if (app_info && (app_info->caps & FI_HMEM)) { + if (app_info->caps & FI_HMEM) { shm_hints->domain_attr->mr_mode |= FI_MR_HMEM; } diff --git a/prov/efa/src/efa_shm.h b/prov/efa/src/efa_shm.h index 706ec025d81..1a2e9184800 100644 --- a/prov/efa/src/efa_shm.h +++ b/prov/efa/src/efa_shm.h @@ -1,34 +1,6 @@ -/* - * Copyright (c) 2022 Amazon.com, Inc. or its affiliates. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */ + #ifndef _EFA_SHM_H #define _EFA_SHM_H diff --git a/prov/efa/src/efa_tp.h b/prov/efa/src/efa_tp.h index f93e088fddb..ec3ce8ebc47 100644 --- a/prov/efa/src/efa_tp.h +++ b/prov/efa/src/efa_tp.h @@ -1,3 +1,6 @@ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. 
*/ + #ifndef _EFA_TP_H #define _EFA_TP_H diff --git a/prov/efa/src/efa_tp_def.c b/prov/efa/src/efa_tp_def.c index b6c9d5bb596..f102a1321a3 100644 --- a/prov/efa/src/efa_tp_def.c +++ b/prov/efa/src/efa_tp_def.c @@ -1,3 +1,6 @@ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */ + #define LTTNG_UST_TRACEPOINT_CREATE_PROBES #define LTTNG_UST_TRACEPOINT_DEFINE diff --git a/prov/efa/src/efa_tp_def.h b/prov/efa/src/efa_tp_def.h index 0e5493a9685..72e03988a56 100644 --- a/prov/efa/src/efa_tp_def.h +++ b/prov/efa/src/efa_tp_def.h @@ -1,3 +1,6 @@ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */ + #undef LTTNG_UST_TRACEPOINT_PROVIDER #define LTTNG_UST_TRACEPOINT_PROVIDER EFA_TP_PROV diff --git a/prov/efa/src/efa_user_info.c b/prov/efa/src/efa_user_info.c index 8eef343d9d0..1d56554ea0d 100644 --- a/prov/efa/src/efa_user_info.c +++ b/prov/efa/src/efa_user_info.c @@ -1,38 +1,6 @@ -/* - * Copyright (c) 2022 Amazon.com, Inc. or its affiliates. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS - * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE - * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, - * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN - * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */ + #include "efa.h" #include "efa_prov_info.h" @@ -440,6 +408,14 @@ int efa_user_info_alter_rdm(int version, struct fi_info *info, const struct fi_i */ if (hints) { if (hints->tx_attr) { + /* efa device doesn't have ordering (EFA_MSG_ORDER == FI_ORDER_NONE). + * if apps request an ordering that is relaxed than + * what provider supports, we should respect that. + * This is specially true for FI_ORDER_NONE: + * No ordering is specified. This value may be used as input in order to obtain + * the default message order supported by the provider. 
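[Editorial aside, not part of the patch] A caller-side illustration of the ordering mask described in this comment (the masking itself follows just below). This is a hypothetical sketch using only the public libfabric API, with error handling omitted:

    struct fi_info *hints, *info;

    hints = fi_allocinfo();
    hints->ep_attr->type = FI_EP_RDM;
    hints->fabric_attr->prov_name = strdup("efa");
    hints->tx_attr->msg_order = FI_ORDER_NONE;  /* 0: caller needs no tx ordering */
    hints->rx_attr->msg_order = FI_ORDER_SAS;   /* caller needs only send-after-send on rx */

    if (!fi_getinfo(FI_VERSION(1, 18), NULL, NULL, 0, hints, &info)) {
            /* with the change below, info->tx_attr->msg_order comes back as
             * FI_ORDER_NONE and info->rx_attr->msg_order as a subset of FI_ORDER_SAS */
            fi_freeinfo(info);
    }
    fi_freeinfo(hints);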
+ */ + info->tx_attr->msg_order &= hints->tx_attr->msg_order; atomic_ordering = FI_ORDER_ATOMIC_RAR | FI_ORDER_ATOMIC_RAW | FI_ORDER_ATOMIC_WAR | FI_ORDER_ATOMIC_WAW; if (!(hints->tx_attr->msg_order & atomic_ordering)) { @@ -447,6 +423,22 @@ int efa_user_info_alter_rdm(int version, struct fi_info *info, const struct fi_i } } + if (hints->rx_attr) { + /* efa device doesn't have ordering (EFA_MSG_ORDER == FI_ORDER_NONE). + * if apps request an ordering that is relaxed than + * what provider supports, we should respect that. + * This is specially true for FI_ORDER_NONE: + * No ordering is specified. This value may be used as input in order to obtain + * the default message order supported by the provider. + */ + info->rx_attr->msg_order &= hints->rx_attr->msg_order; + } + + if (info->tx_attr->msg_order != info->rx_attr->msg_order) + EFA_INFO(FI_LOG_EP_CTRL, "Inconsistent tx/rx msg order. Tx msg order: %lu, Rx msg order: %lu. " + "Libfabric can proceed but it is recommended to align the tx and rx msg order.\n", + info->tx_attr->msg_order, info->rx_attr->msg_order); + /* We only support manual progress for RMA operations */ if (hints->caps & FI_RMA) { info->domain_attr->control_progress = FI_PROGRESS_MANUAL; @@ -481,6 +473,15 @@ int efa_user_info_alter_rdm(int version, struct fi_info *info, const struct fi_i EFA_INFO(FI_LOG_CORE, "FI_MSG_PREFIX size = %ld\n", info->ep_attr->msg_prefix_size); } + + /* Handle other EP attributes */ + if (hints->ep_attr) { + if (hints->ep_attr->max_msg_size) { + info->ep_attr->max_msg_size = + MIN(info->ep_attr->max_msg_size, + hints->ep_attr->max_msg_size); + } + } } /* Use a table for AV if the app has no strong requirement */ diff --git a/prov/efa/src/efa_user_info.h b/prov/efa/src/efa_user_info.h index 5464f232be0..2b52ce2b58b 100644 --- a/prov/efa/src/efa_user_info.h +++ b/prov/efa/src/efa_user_info.h @@ -1,38 +1,5 @@ -/* - * Copyright (c) 2022 Amazon.com, Inc. or its affiliates. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS - * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE - * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, - * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN - * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */ #ifndef _EFA_USER_INFO_H #define _EFA_USER_INFO_H diff --git a/prov/efa/src/rdm/efa_rdm_atomic.c b/prov/efa/src/rdm/efa_rdm_atomic.c index 0fc7d774acf..b8b793cadb3 100644 --- a/prov/efa/src/rdm/efa_rdm_atomic.c +++ b/prov/efa/src/rdm/efa_rdm_atomic.c @@ -1,35 +1,5 @@ -/* - * Copyright (c) 2019-2020 Amazon.com, Inc. or its affiliates. - * All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. 
*/ #include #include @@ -153,7 +123,6 @@ ssize_t efa_rdm_atomic_generic_efa(struct efa_rdm_ep *efa_rdm_ep, txe = efa_rdm_atomic_alloc_txe(efa_rdm_ep, peer, msg, atomic_ex, op, flags); if (OFI_UNLIKELY(!txe)) { err = -FI_EAGAIN; - efa_rdm_ep_progress_internal(efa_rdm_ep); goto out; } @@ -206,7 +175,6 @@ ssize_t efa_rdm_atomic_generic_efa(struct efa_rdm_ep *efa_rdm_ep, } if (OFI_UNLIKELY(err)) { - efa_rdm_ep_progress_internal(efa_rdm_ep); efa_rdm_txe_release(txe); peer->next_msg_id--; } @@ -279,9 +247,8 @@ efa_rdm_atomic_writemsg(struct fid_ep *ep, void *shm_desc[EFA_RDM_IOV_LIMIT] = {NULL}; int err; - EFA_DBG(FI_LOG_EP_DATA, - "%s: iov_len: %lu flags: %lx\n", - __func__, ofi_total_ioc_cnt(msg->msg_iov, msg->iov_count), flags); + EFA_DBG(FI_LOG_EP_DATA, "iov_len: %lu flags: %lx\n", + ofi_total_ioc_cnt(msg->msg_iov, msg->iov_count), flags); efa_rdm_ep = container_of(ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid.fid); err = efa_rdm_ep_cap_check_atomic(efa_rdm_ep); @@ -327,8 +294,8 @@ efa_rdm_atomic_writev(struct fid_ep *ep, msg.context = context; msg.data = 0; - EFA_DBG(FI_LOG_EP_DATA, "%s total_count=%ld atomic_op=%d\n", __func__, - ofi_total_ioc_cnt(iov, count), msg.op); + EFA_DBG(FI_LOG_EP_DATA, "total_count=%ld atomic_op=%d\n", + ofi_total_ioc_cnt(iov, count), msg.op); return efa_rdm_atomic_writemsg(ep, &msg, 0); } @@ -369,8 +336,8 @@ efa_rdm_atomic_readwritemsg(struct fid_ep *ep, return -errno; } - EFA_DBG(FI_LOG_EP_DATA, "%s total_len=%ld atomic_op=%d\n", __func__, - ofi_total_ioc_cnt(msg->msg_iov, msg->iov_count), msg->op); + EFA_DBG(FI_LOG_EP_DATA, "total_len=%ld atomic_op=%d\n", + ofi_total_ioc_cnt(msg->msg_iov, msg->iov_count), msg->op); efa_rdm_ep = container_of(ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid.fid); err = efa_rdm_ep_cap_check_atomic(efa_rdm_ep); @@ -476,9 +443,8 @@ efa_rdm_atomic_compwritemsg(struct fid_ep *ep, return -errno; } - EFA_DBG(FI_LOG_EP_DATA, - "%s: iov_len: %lu flags: %lx\n", - __func__, ofi_total_ioc_cnt(msg->msg_iov, msg->iov_count), flags); + EFA_DBG(FI_LOG_EP_DATA, "iov_len: %lu flags: %lx\n", + ofi_total_ioc_cnt(msg->msg_iov, msg->iov_count), flags); efa_rdm_ep = container_of(ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid.fid); err = efa_rdm_ep_cap_check_atomic(efa_rdm_ep); diff --git a/prov/efa/src/rdm/efa_rdm_atomic.h b/prov/efa/src/rdm/efa_rdm_atomic.h index 72788577607..8e50e58aabb 100644 --- a/prov/efa/src/rdm/efa_rdm_atomic.h +++ b/prov/efa/src/rdm/efa_rdm_atomic.h @@ -1,35 +1,6 @@ -/* - * Copyright (c) Amazon.com, Inc. or its affiliates. - * All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */ + #if HAVE_CONFIG_H #include #endif /* HAVE_CONFIG_H */ diff --git a/prov/efa/src/rdm/efa_rdm_cq.c b/prov/efa/src/rdm/efa_rdm_cq.c index 2c6440739a8..fca5eae3f05 100644 --- a/prov/efa/src/rdm/efa_rdm_cq.c +++ b/prov/efa/src/rdm/efa_rdm_cq.c @@ -4,6 +4,12 @@ #include "efa.h" #include "efa_rdm_cq.h" #include "ofi_util.h" +#include "efa_av.h" +#include "efa_cntr.h" +#include "efa_rdm_pke_cmd.h" +#include "efa_rdm_pke_utils.h" +#include "efa_rdm_pke_nonreq.h" +#include "efa_rdm_tracepoint.h" static const char *efa_rdm_cq_strerror(struct fid_cq *cq_fid, int prov_errno, @@ -32,6 +38,16 @@ int efa_rdm_cq_close(struct fid *fid) cq = container_of(fid, struct efa_rdm_cq, util_cq.cq_fid.fid); + if (cq->ibv_cq.ibv_cq_ex) { + ret = -ibv_destroy_cq(ibv_cq_ex_to_cq(cq->ibv_cq.ibv_cq_ex)); + if (ret) { + EFA_WARN(FI_LOG_CQ, "Unable to close ibv cq: %s\n", + fi_strerror(-ret)); + return ret; + } + cq->ibv_cq.ibv_cq_ex = NULL; + } + if (cq->shm_cq) { ret = fi_close(&cq->shm_cq->fid); if (ret) { @@ -55,17 +71,349 @@ static struct fi_ops efa_rdm_cq_fi_ops = { .ops_open = fi_no_ops_open, }; +/** + * @brief handle rdma-core CQ completion resulted from IBV_WRITE_WITH_IMM + * + * This function handles hardware-assisted RDMA writes with immediate data at + * remote endpoint. These do not have a packet context, nor do they have a + * connid available. + * + * @param[in] ibv_cq_ex extended ibv cq + * @param[in] flags flags (such as FI_REMOTE_CQ_DATA) + * @param[in] ep efa_rdm_ep + * @param[in] pkt_entry packet entry + */ +static +void efa_rdm_cq_proc_ibv_recv_rdma_with_imm_completion( + struct ibv_cq_ex *ibv_cq_ex, + uint64_t flags, + struct efa_rdm_ep *ep, + struct efa_rdm_pke *pkt_entry + ) +{ + struct util_cq *target_cq; + int ret; + fi_addr_t src_addr; + struct efa_av *efa_av; + uint32_t imm_data = ibv_wc_read_imm_data(ibv_cq_ex); + uint32_t len = ibv_wc_read_byte_len(ibv_cq_ex); + + target_cq = ep->base_ep.util_ep.rx_cq; + efa_av = ep->base_ep.av; + + if (ep->base_ep.util_ep.caps & FI_SOURCE) { + src_addr = efa_av_reverse_lookup_rdm(efa_av, + ibv_wc_read_slid(ibv_cq_ex), + ibv_wc_read_src_qp(ibv_cq_ex), + NULL); + ret = ofi_cq_write_src(target_cq, NULL, flags, len, NULL, imm_data, 0, src_addr); + } else { + ret = ofi_cq_write(target_cq, NULL, flags, len, NULL, imm_data, 0); + } + + if (OFI_UNLIKELY(ret)) { + EFA_WARN(FI_LOG_CQ, + "Unable to write a cq entry for remote for RECV_RDMA operation: %s\n", + fi_strerror(-ret)); + efa_base_ep_write_eq_error(&ep->base_ep, -ret, FI_EFA_ERR_WRITE_SHM_CQ_ENTRY); + } + + efa_cntr_report_rx_completion(&ep->base_ep.util_ep, flags); + + /* Recv with immediate will consume a pkt_entry, but the pkt is not + filled, so free the pkt_entry and record we have one less posted + packet now. 
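[Editorial aside, not part of the patch] A hypothetical target-side snippet showing how an application would observe the completion written by this handler; fi_cq_read() and struct fi_cq_data_entry are standard libfabric API, rxcq and consume_imm_data() are placeholders, and error handling is omitted:

    struct fi_cq_data_entry comp;
    ssize_t n;

    n = fi_cq_read(rxcq, &comp, 1);              /* rxcq: CQ bound with FI_RECV */
    if (n == 1 && (comp.flags & FI_REMOTE_CQ_DATA)) {
            /* comp.data holds the initiator's immediate value,
             * comp.len the length of the remotely written payload;
             * comp.buf is NULL because no receive buffer was matched */
            consume_imm_data(comp.data, comp.len);   /* placeholder */
    }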
*/
+ assert(pkt_entry);
+ ep->efa_rx_pkts_posted--;
+ efa_rdm_pke_release_rx(pkt_entry);
+}
+
+#if HAVE_EFADV_CQ_EX
+/**
+ * @brief Read peer raw address from EFA device and look up the peer address in AV.
+ * This function should only be called if the peer AH is unknown.
+ * @return Peer address, or FI_ADDR_NOTAVAIL if unavailable.
+ */
+static inline
+fi_addr_t efa_rdm_cq_determine_peer_address_from_efadv(
+ struct ibv_cq_ex *ibv_cqx,
+ enum ibv_cq_ex_type ibv_cq_ex_type)
+{
+ struct efa_rdm_pke *pkt_entry;
+ struct efa_rdm_ep *ep;
+ struct efa_ep_addr efa_ep_addr = {0};
+ fi_addr_t addr;
+ union ibv_gid gid = {0};
+ uint32_t *connid = NULL;
+
+ if (ibv_cq_ex_type != EFADV_CQ) {
+ /* EFA DV CQ is not supported. This could be due to old EFA kernel module versions. */
+ return FI_ADDR_NOTAVAIL;
+ }
+
+ /* Attempt to read sgid from EFA firmware */
+ if (efadv_wc_read_sgid(efadv_cq_from_ibv_cq_ex(ibv_cqx), &gid) < 0) {
+ /* Return code is negative if the peer AH is known */
+ return FI_ADDR_NOTAVAIL;
+ }
+
+ pkt_entry = (void *)(uintptr_t)ibv_cqx->wr_id;
+ ep = pkt_entry->ep;
+ assert(ep);
+
+ connid = efa_rdm_pke_connid_ptr(pkt_entry);
+ if (!connid) {
+ return FI_ADDR_NOTAVAIL;
+ }
+
+ /*
+ * Use raw:qpn:connid as the key to lookup AV for peer's fi_addr
+ */
+ memcpy(efa_ep_addr.raw, gid.raw, sizeof(efa_ep_addr.raw));
+ efa_ep_addr.qpn = ibv_wc_read_src_qp(ibv_cqx);
+ efa_ep_addr.qkey = *connid;
+ addr = ofi_av_lookup_fi_addr(&ep->base_ep.av->util_av, &efa_ep_addr);
+ if (addr != FI_ADDR_NOTAVAIL) {
+ char gid_str_cdesc[INET6_ADDRSTRLEN];
+ inet_ntop(AF_INET6, gid.raw, gid_str_cdesc, INET6_ADDRSTRLEN);
+ EFA_WARN(FI_LOG_AV,
+ "Recovered peer fi_addr. [Raw]:[QPN]:[QKey] = [%s]:[%" PRIu16 "]:[%" PRIu32 "]\n",
+ gid_str_cdesc, efa_ep_addr.qpn, efa_ep_addr.qkey);
+ }
+
+ return addr;
+}
+
+/**
+ * @brief Determine peer address from ibv_cq_ex
+ * Attempt to inject or determine peer address if not available. This usually
+ * happens when the endpoint receives the first packet from a new peer.
+ * There is an edge case for EFA endpoint - the device might lose the address
+ * handle of a known peer due to a firmware bug and return FI_ADDR_NOTAVAIL.
+ * The provider needs to look up the address using Raw address:QPN:QKey.
+ * Note: This function introduces additional overhead. It should only be called if
+ * efa_av_lookup_address_rdm fails to find the peer address.
+ * @param ibv_cqx Pointer to CQ
+ * @param ibv_cq_ex_type Type of the extended CQ
+ * @returns Peer address, or FI_ADDR_NOTAVAIL if unsuccessful.
+ */
+static inline fi_addr_t efa_rdm_cq_determine_addr_from_ibv_cq(struct ibv_cq_ex *ibv_cqx, enum ibv_cq_ex_type ibv_cq_ex_type)
+{
+ struct efa_rdm_pke *pkt_entry;
+ fi_addr_t addr = FI_ADDR_NOTAVAIL;
+
+ pkt_entry = (void *)(uintptr_t)ibv_cqx->wr_id;
+
+ addr = efa_rdm_pke_determine_addr(pkt_entry);
+
+ if (addr == FI_ADDR_NOTAVAIL) {
+ addr = efa_rdm_cq_determine_peer_address_from_efadv(ibv_cqx, ibv_cq_ex_type);
+ }
+
+ return addr;
+}
+#else
+/**
+ * @brief Determine peer address from ibv_cq_ex
+ * Attempt to inject peer address if not available. This usually
+ * happens when the endpoint receives the first packet from a new peer.
+ * Note: This function introduces additional overhead. It should only be called if
+ * efa_av_lookup_address_rdm fails to find the peer address.
+ * @param ibv_cqx Pointer to CQ
+ * @param ibv_cq_ex_type Type of the extended CQ
+ * @returns Peer address, or FI_ADDR_NOTAVAIL if unsuccessful.
+ */
+static inline
+fi_addr_t efa_rdm_cq_determine_addr_from_ibv_cq(struct ibv_cq_ex *ibv_cqx, enum ibv_cq_ex_type ibv_cq_ex_type)
+{
+ struct efa_rdm_pke *pkt_entry;
+
+ pkt_entry = (void *)(uintptr_t)ibv_cqx->wr_id;
+
+ return efa_rdm_pke_determine_addr(pkt_entry);
+}
+#endif
+
+/**
+ * @brief Get the vendor error code for an endpoint's CQ
+ *
+ * This function is essentially a wrapper for `ibv_wc_read_vendor_err()`; making
+ * a best-effort attempt to promote the error code to a proprietary EFA
+ * provider error code.
+ *
+ * @param[in] ibv_cq_ex IBV CQ
+ * @return EFA-specific error code
+ * @sa #EFA_PROV_ERRNOS
+ *
+ * @todo Currently, this only checks for unresponsive receiver
+ * (#EFA_IO_COMP_STATUS_LOCAL_ERROR_UNRESP_REMOTE) and attempts to promote it to
+ * #FI_EFA_ERR_ESTABLISHED_RECV_UNRESP. This should be expanded to handle other
+ * RDMA Core error codes (#EFA_IO_COMP_STATUSES) for the sake of more accurate
+ * error reporting
+ */
+static int efa_rdm_cq_get_prov_errno(struct ibv_cq_ex *ibv_cq_ex) {
+ uint32_t vendor_err = ibv_wc_read_vendor_err(ibv_cq_ex);
+ struct efa_rdm_pke *pkt_entry = (void *) (uintptr_t) ibv_cq_ex->wr_id;
+ struct efa_rdm_peer *peer;
+ struct efa_rdm_ep *ep;
+
+ if (OFI_LIKELY(pkt_entry && pkt_entry->addr)) {
+ ep = pkt_entry->ep;
+ peer = efa_rdm_ep_get_peer(ep, pkt_entry->addr);
+ } else {
+ return vendor_err;
+ }
+
+ switch (vendor_err) {
+ case EFA_IO_COMP_STATUS_LOCAL_ERROR_UNRESP_REMOTE: {
+ if (peer->flags & EFA_RDM_PEER_HANDSHAKE_RECEIVED)
+ vendor_err = FI_EFA_ERR_ESTABLISHED_RECV_UNRESP;
+ break;
+ }
+ default:
+ break;
+ }
+
+ return vendor_err;
+}
+
+
+/**
+ * @brief poll rdma-core cq and process the cq entries
+ *
+ * @param[in] cqe_to_process Max number of cq entries to poll and process. A negative number means to poll until the cq is empty.
+ * @param[in] ibv_cq the efa ibv cq to poll. Note the endpoint polling this cq can be different
+ * from the endpoint that the completed packet entry was posted from (pkt_entry->ep).
+ */
+void efa_rdm_cq_poll_ibv_cq(ssize_t cqe_to_process, struct efa_ibv_cq *ibv_cq)
+{
+ bool should_end_poll = false;
+ /* Initialize an empty ibv_poll_cq_attr struct for ibv_start_poll.
+ * EFA expects .comp_mask = 0, or otherwise returns EINVAL.
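[Editorial aside, not part of the patch] A minimal standalone sketch of the rdma-core extended-CQ polling contract that the loop below follows; ibv_start_poll/ibv_next_poll/ibv_end_poll are standard verbs API, and <stdbool.h>/<infiniband/verbs.h> are assumed:

    static void drain_ibv_cq_sketch(struct ibv_cq_ex *cq)
    {
            struct ibv_poll_cq_attr attr = { .comp_mask = 0 };
            int err = ibv_start_poll(cq, &attr);   /* ENOENT when the CQ is empty */
            bool started = !err;

            while (!err) {
                    /* cq->status, cq->wr_id and the ibv_wc_read_*() accessors are
                     * only valid for the entry currently under the poll cursor */
                    err = ibv_next_poll(cq);
            }
            if (started)
                    ibv_end_poll(cq);              /* only after a successful start_poll */
    }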
+ */ + struct ibv_poll_cq_attr poll_cq_attr = {.comp_mask = 0}; + struct efa_av *efa_av; + struct efa_rdm_pke *pkt_entry; + ssize_t err; + int opcode; + size_t i = 0; + int prov_errno; + struct efa_rdm_ep *ep = NULL; + struct fi_cq_err_entry err_entry; + struct efa_rdm_cq *efa_rdm_cq; + struct efa_domain *efa_domain; + struct efa_qp *qp; + + efa_rdm_cq = container_of(ibv_cq, struct efa_rdm_cq, ibv_cq); + efa_domain = container_of(efa_rdm_cq->util_cq.domain, struct efa_domain, util_domain); + + /* Call ibv_start_poll only once */ + err = ibv_start_poll(ibv_cq->ibv_cq_ex, &poll_cq_attr); + should_end_poll = !err; + + while (!err) { + pkt_entry = (void *)(uintptr_t)ibv_cq->ibv_cq_ex->wr_id; + qp = efa_domain->qp_table[ibv_wc_read_qp_num(ibv_cq->ibv_cq_ex) & efa_domain->qp_table_sz_m1]; + ep = container_of(qp->base_ep, struct efa_rdm_ep, base_ep); + efa_av = ep->base_ep.av; + efa_rdm_tracepoint(poll_cq, (size_t) ibv_cq->ibv_cq_ex->wr_id); + opcode = ibv_wc_read_opcode(ibv_cq->ibv_cq_ex); + if (ibv_cq->ibv_cq_ex->status) { + prov_errno = efa_rdm_cq_get_prov_errno(ibv_cq->ibv_cq_ex); + switch (opcode) { + case IBV_WC_SEND: /* fall through */ + case IBV_WC_RDMA_WRITE: /* fall through */ + case IBV_WC_RDMA_READ: + efa_rdm_pke_handle_tx_error(pkt_entry, prov_errno); + break; + case IBV_WC_RECV: /* fall through */ + case IBV_WC_RECV_RDMA_WITH_IMM: + efa_rdm_pke_handle_rx_error(pkt_entry, prov_errno); + break; + default: + EFA_WARN(FI_LOG_EP_CTRL, "Unhandled op code %d\n", opcode); + assert(0 && "Unhandled op code"); + } + break; + } + switch (opcode) { + case IBV_WC_SEND: +#if ENABLE_DEBUG + ep->send_comps++; +#endif + efa_rdm_pke_handle_send_completion(pkt_entry); + break; + case IBV_WC_RECV: + pkt_entry->addr = efa_av_reverse_lookup_rdm(efa_av, ibv_wc_read_slid(ibv_cq->ibv_cq_ex), + ibv_wc_read_src_qp(ibv_cq->ibv_cq_ex), pkt_entry); + + if (pkt_entry->addr == FI_ADDR_NOTAVAIL) { + pkt_entry->addr = efa_rdm_cq_determine_addr_from_ibv_cq(ibv_cq->ibv_cq_ex, ibv_cq->ibv_cq_ex_type); + } + + pkt_entry->pkt_size = ibv_wc_read_byte_len(ibv_cq->ibv_cq_ex); + assert(pkt_entry->pkt_size > 0); + efa_rdm_pke_handle_recv_completion(pkt_entry); +#if ENABLE_DEBUG + ep->recv_comps++; +#endif + break; + case IBV_WC_RDMA_READ: + case IBV_WC_RDMA_WRITE: + efa_rdm_pke_handle_rma_completion(pkt_entry); + break; + case IBV_WC_RECV_RDMA_WITH_IMM: + efa_rdm_cq_proc_ibv_recv_rdma_with_imm_completion( + ibv_cq->ibv_cq_ex, + FI_REMOTE_CQ_DATA | FI_RMA | FI_REMOTE_WRITE, + ep, pkt_entry); + break; + default: + EFA_WARN(FI_LOG_EP_CTRL, + "Unhandled cq type\n"); + assert(0 && "Unhandled cq type"); + } + + i++; + if (i == cqe_to_process) { + break; + } + + /* + * ibv_next_poll MUST be call after the current WC is fully processed, + * which prevents later calls on ibv_cq_ex from reading the wrong WC. + */ + err = ibv_next_poll(ibv_cq->ibv_cq_ex); + } + + if (err && err != ENOENT) { + err = err > 0 ? 
err : -err; + prov_errno = ibv_wc_read_vendor_err(ibv_cq->ibv_cq_ex); + EFA_WARN(FI_LOG_CQ, "Unexpected error when polling ibv cq, err: %s (%zd) prov_errno: %s (%d)\n", fi_strerror(err), err, efa_strerror(prov_errno), prov_errno); + efa_show_help(prov_errno); + err_entry = (struct fi_cq_err_entry) { + .err = err, + .prov_errno = prov_errno, + .op_context = NULL + }; + ofi_cq_write_error(&efa_rdm_cq->util_cq, &err_entry); + } + + if (should_end_poll) + ibv_end_poll(ibv_cq->ibv_cq_ex); +} + static ssize_t efa_rdm_cq_readfrom(struct fid_cq *cq_fid, void *buf, size_t count, fi_addr_t *src_addr) { struct efa_rdm_cq *cq; ssize_t ret; - struct util_srx_ctx *srx_ctx; + struct efa_domain *domain; cq = container_of(cq_fid, struct efa_rdm_cq, util_cq.cq_fid.fid); - srx_ctx = cq->util_cq.domain->srx->ep_fid.fid.context; + domain = container_of(cq->util_cq.domain, struct efa_domain, util_domain); - ofi_genlock_lock(srx_ctx->lock); + ofi_genlock_lock(&domain->srx_lock); if (cq->shm_cq) { fi_cq_read(cq->shm_cq, NULL, 0); @@ -84,7 +432,7 @@ static ssize_t efa_rdm_cq_readfrom(struct fid_cq *cq_fid, void *buf, size_t coun ret = ofi_cq_readfrom(&cq->util_cq.cq_fid, buf, count, src_addr); out: - ofi_genlock_unlock(srx_ctx->lock); + ofi_genlock_unlock(&domain->srx_lock); return ret; } @@ -100,6 +448,30 @@ static struct fi_ops_cq efa_rdm_cq_ops = { .strerror = efa_rdm_cq_strerror, }; +static void efa_rdm_cq_progress(struct util_cq *cq) +{ + struct util_ep *ep; + struct fid_list_entry *fid_entry; + struct dlist_entry *item; + struct efa_rdm_cq *efa_rdm_cq; + struct efa_ibv_cq_poll_list_entry *poll_list_entry; + + ofi_genlock_lock(&cq->ep_list_lock); + efa_rdm_cq = container_of(cq, struct efa_rdm_cq, util_cq); + + dlist_foreach(&efa_rdm_cq->ibv_cq_poll_list, item) { + poll_list_entry = container_of(item, struct efa_ibv_cq_poll_list_entry, entry); + efa_rdm_cq_poll_ibv_cq(efa_env.efa_cq_read_size, poll_list_entry->cq); + } + + dlist_foreach(&cq->ep_list, item) { + fid_entry = container_of(item, struct fid_list_entry, entry); + ep = container_of(fid_entry->fid, struct util_ep, ep_fid.fid); + ep->progress(ep); + } + ofi_genlock_unlock(&cq->ep_list_lock); +} + /** * @brief create a CQ for EFA RDM provider * @@ -116,7 +488,7 @@ static struct fi_ops_cq efa_rdm_cq_ops = { int efa_rdm_cq_open(struct fid_domain *domain, struct fi_cq_attr *attr, struct fid_cq **cq_fid, void *context) { - int ret; + int ret, retv; struct efa_rdm_cq *cq; struct efa_domain *efa_domain; struct fi_cq_attr shm_cq_attr = {0}; @@ -134,12 +506,19 @@ int efa_rdm_cq_open(struct fid_domain *domain, struct fi_cq_attr *attr, /* Override user cq size if it's less than recommended cq size */ attr->size = MAX(efa_domain->rdm_cq_size, attr->size); + dlist_init(&cq->ibv_cq_poll_list); ret = ofi_cq_init(&efa_prov, domain, attr, &cq->util_cq, - &ofi_cq_progress, context); + &efa_rdm_cq_progress, context); if (ret) goto free; + ret = efa_cq_ibv_cq_ex_open(attr, efa_domain->device->ibv_ctx, &cq->ibv_cq.ibv_cq_ex, &cq->ibv_cq.ibv_cq_ex_type); + if (ret) { + EFA_WARN(FI_LOG_CQ, "Unable to create extended CQ: %s\n", fi_strerror(ret)); + goto close_util_cq; + } + *cq_fid = &cq->util_cq.cq_fid; (*cq_fid)->fid.ops = &efa_rdm_cq_fi_ops; (*cq_fid)->ops = &efa_rdm_cq_ops; @@ -155,11 +534,21 @@ int efa_rdm_cq_open(struct fid_domain *domain, struct fi_cq_attr *attr, &cq->shm_cq, &peer_cq_context); if (ret) { EFA_WARN(FI_LOG_CQ, "Unable to open shm cq: %s\n", fi_strerror(-ret)); - goto free; + goto destroy_ibv_cq; } } return 0; +destroy_ibv_cq: + retv = 
-ibv_destroy_cq(ibv_cq_ex_to_cq(cq->ibv_cq.ibv_cq_ex)); + if (retv) + EFA_WARN(FI_LOG_CQ, "Unable to close ibv cq: %s\n", + fi_strerror(-retv)); +close_util_cq: + retv = ofi_cq_cleanup(&cq->util_cq); + if (retv) + EFA_WARN(FI_LOG_CQ, "Unable to close util cq: %s\n", + fi_strerror(-retv)); free: free(cq); return ret; diff --git a/prov/efa/src/rdm/efa_rdm_cq.h b/prov/efa/src/rdm/efa_rdm_cq.h index 247a7956b91..23848b6cc1d 100644 --- a/prov/efa/src/rdm/efa_rdm_cq.h +++ b/prov/efa/src/rdm/efa_rdm_cq.h @@ -1,44 +1,17 @@ -/* - * Copyright (c) 2019-2023 Amazon.com, Inc. or its affiliates. - * All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */ #ifndef EFA_RDM_CQ_H #define EFA_RDM_CQ_H +#include "efa_cq.h" #include struct efa_rdm_cq { struct util_cq util_cq; struct fid_cq *shm_cq; + struct efa_ibv_cq ibv_cq; + struct dlist_entry ibv_cq_poll_list; }; /* @@ -49,4 +22,5 @@ struct efa_rdm_cq { int efa_rdm_cq_open(struct fid_domain *domain, struct fi_cq_attr *attr, struct fid_cq **cq_fid, void *context); -#endif \ No newline at end of file +void efa_rdm_cq_poll_ibv_cq(ssize_t cqe_to_process, struct efa_ibv_cq *ibv_cq); +#endif diff --git a/prov/efa/src/rdm/efa_rdm_ep.h b/prov/efa/src/rdm/efa_rdm_ep.h index 4a343e83582..206490fc0eb 100644 --- a/prov/efa/src/rdm/efa_rdm_ep.h +++ b/prov/efa/src/rdm/efa_rdm_ep.h @@ -12,11 +12,6 @@ #define EFA_RDM_ERROR_MSG_BUFFER_LENGTH 1024 -enum ibv_cq_ex_type { - IBV_CQ, - EFADV_CQ -}; - /** @brief Information of a queued copy. * * This struct is used when receiving buffer is on device. 
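[Editorial aside, not part of the patch] For orientation, the efa_ibv_cq member added to struct efa_rdm_cq above is assumed, based on how it is dereferenced throughout this patch, to bundle at least the extended CQ handle and the way it was opened; the actual definition lives in efa_cq.h and may contain more fields:

    struct efa_ibv_cq {
            struct ibv_cq_ex     *ibv_cq_ex;       /* CQ handle from ibv/efadv */
            enum ibv_cq_ex_type   ibv_cq_ex_type;  /* IBV_CQ or EFADV_CQ */
    };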
@@ -50,10 +45,6 @@ struct efa_rdm_ep { /* per-version extra feature/request flag */ uint64_t extra_info[EFA_RDM_MAX_NUM_EXINFO]; - struct ibv_cq_ex *ibv_cq_ex; - - enum ibv_cq_ex_type ibv_cq_ex_type; - /* shm provider fid */ struct fid_ep *shm_ep; @@ -84,9 +75,6 @@ struct efa_rdm_ep { /* Resource management flag */ uint64_t rm_full; - /* application's ordering requirements */ - uint64_t msg_order; - /* Application's maximum msg size hint */ size_t max_msg_size; @@ -137,8 +125,6 @@ struct efa_rdm_ep { /* tx/rx_entries used by long CTS msg/write/read protocol * which have data to be sent */ struct dlist_entry ope_longcts_send_list; - /* read entries with data to be read */ - struct dlist_entry read_pending_list; /* list of #efa_rdm_peer that are in backoff due to RNR */ struct dlist_entry peer_backoff_list; /* list of #efa_rdm_peer that will retry posting handshake pkt */ @@ -175,6 +161,12 @@ struct efa_rdm_ep { */ size_t efa_rx_pkts_to_post; + /* + * Number of RX packets that are held (not released) by progress engine + * due to queued hmem copy or local read. + */ + size_t efa_rx_pkts_held; + /* number of outstanding tx ops on efa device */ size_t efa_outstanding_tx_ops; @@ -247,7 +239,7 @@ static inline size_t efa_rdm_ep_get_tx_pool_size(struct efa_rdm_ep *ep) static inline int efa_rdm_ep_need_sas(struct efa_rdm_ep *ep) { - return ep->msg_order & FI_ORDER_SAS; + return ((ep->user_info->tx_attr->msg_order & FI_ORDER_SAS) || (ep->user_info->rx_attr->msg_order & FI_ORDER_SAS)); } @@ -274,8 +266,6 @@ ssize_t efa_rdm_ep_post_queued_pkts(struct efa_rdm_ep *ep, size_t efa_rdm_ep_get_memory_alignment(struct efa_rdm_ep *ep, enum fi_hmem_iface iface); -int efa_rdm_ep_get_prov_errno(struct efa_rdm_ep *ep); - static inline struct efa_domain *efa_rdm_ep_domain(struct efa_rdm_ep *ep) { diff --git a/prov/efa/src/rdm/efa_rdm_ep_fiops.c b/prov/efa/src/rdm/efa_rdm_ep_fiops.c index e7d0b860d43..98f053c9193 100644 --- a/prov/efa/src/rdm/efa_rdm_ep_fiops.c +++ b/prov/efa/src/rdm/efa_rdm_ep_fiops.c @@ -1,38 +1,7 @@ -/* - * Copyright (c) Amazon.com, Inc. or its affiliates. - * All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. 
or its affiliates. All rights reserved. */ #include "efa.h" -#include "efa_cq.h" #include "efa_av.h" #include "efa_rdm_ep.h" #include "efa_rdm_cq.h" @@ -45,6 +14,46 @@ #include "efa_rdm_pke_req.h" #include "efa_cntr.h" +static +void efa_rdm_ep_construct_ibv_qp_init_attr_ex(struct efa_rdm_ep *ep, + struct ibv_qp_init_attr_ex *attr_ex, + struct ibv_cq_ex *tx_cq, + struct ibv_cq_ex *rx_cq) +{ + attr_ex->cap.max_send_wr = ep->base_ep.domain->device->rdm_info->tx_attr->size; + attr_ex->cap.max_send_sge = ep->base_ep.domain->device->rdm_info->tx_attr->iov_limit; + attr_ex->cap.max_recv_wr = ep->base_ep.domain->device->rdm_info->rx_attr->size; + attr_ex->cap.max_recv_sge = ep->base_ep.domain->device->rdm_info->rx_attr->iov_limit; + attr_ex->cap.max_inline_data = ep->base_ep.domain->device->efa_attr.inline_buf_size; + attr_ex->qp_type = IBV_QPT_DRIVER; + attr_ex->comp_mask = IBV_QP_INIT_ATTR_PD | IBV_QP_INIT_ATTR_SEND_OPS_FLAGS; + attr_ex->send_ops_flags = IBV_QP_EX_WITH_SEND; + if (efa_device_support_rdma_read()) + attr_ex->send_ops_flags |= IBV_QP_EX_WITH_RDMA_READ; + if (efa_device_support_rdma_write()) { + attr_ex->send_ops_flags |= IBV_QP_EX_WITH_RDMA_WRITE; + attr_ex->send_ops_flags |= IBV_QP_EX_WITH_RDMA_WRITE_WITH_IMM; + } + attr_ex->pd = efa_rdm_ep_domain(ep)->ibv_pd; + attr_ex->qp_context = ep; + attr_ex->sq_sig_all = 1; + + attr_ex->send_cq = ibv_cq_ex_to_cq(tx_cq); + attr_ex->recv_cq = ibv_cq_ex_to_cq(rx_cq); +} + +static inline +struct efa_rdm_cq *efa_rdm_ep_get_tx_rdm_cq(struct efa_rdm_ep *ep) +{ + return ep->base_ep.util_ep.tx_cq ? container_of(ep->base_ep.util_ep.tx_cq, struct efa_rdm_cq, util_cq) : NULL; +} + +static inline +struct efa_rdm_cq *efa_rdm_ep_get_rx_rdm_cq(struct efa_rdm_ep *ep) +{ + return ep->base_ep.util_ep.rx_cq ? container_of(ep->base_ep.util_ep.rx_cq, struct efa_rdm_cq, util_cq) : NULL; +} + /** * @brief set the "efa_qp" field in the efa_rdm_ep->efa_base_ep * called by efa_rdm_ep_open() @@ -57,30 +66,34 @@ static int efa_rdm_ep_create_base_ep_ibv_qp(struct efa_rdm_ep *ep) { struct ibv_qp_init_attr_ex attr_ex = { 0 }; + struct efa_rdm_cq *tx_rdm_cq, *rx_rdm_cq; + struct ibv_cq_ex *tx_ibv_cq, *rx_ibv_cq; - attr_ex.cap.max_send_wr = ep->base_ep.domain->device->rdm_info->tx_attr->size; - attr_ex.cap.max_send_sge = ep->base_ep.domain->device->rdm_info->tx_attr->iov_limit; - attr_ex.send_cq = ibv_cq_ex_to_cq(ep->ibv_cq_ex); + tx_rdm_cq = efa_rdm_ep_get_tx_rdm_cq(ep); + rx_rdm_cq = efa_rdm_ep_get_rx_rdm_cq(ep); - attr_ex.cap.max_recv_wr = ep->base_ep.domain->device->rdm_info->rx_attr->size; - attr_ex.cap.max_recv_sge = ep->base_ep.domain->device->rdm_info->rx_attr->iov_limit; - attr_ex.recv_cq = ibv_cq_ex_to_cq(ep->ibv_cq_ex); + if (!tx_rdm_cq && !rx_rdm_cq) { + EFA_WARN(FI_LOG_EP_CTRL, + "Endpoint is not bound to a send or receive completion queue\n"); + return -FI_ENOCQ; + } - attr_ex.cap.max_inline_data = ep->base_ep.domain->device->efa_attr.inline_buf_size; + if (!tx_rdm_cq && ofi_needs_tx(ep->base_ep.info->caps)) { + EFA_WARN(FI_LOG_EP_CTRL, + "Endpoint is not bound to a send completion queue when it has transmit capabilities enabled (FI_SEND).\n"); + return -FI_ENOCQ; + } - attr_ex.qp_type = IBV_QPT_DRIVER; - attr_ex.comp_mask = IBV_QP_INIT_ATTR_PD | IBV_QP_INIT_ATTR_SEND_OPS_FLAGS; - attr_ex.send_ops_flags = IBV_QP_EX_WITH_SEND; - if (efa_device_support_rdma_read()) - attr_ex.send_ops_flags |= IBV_QP_EX_WITH_RDMA_READ; - if (efa_device_support_rdma_write()) { - attr_ex.send_ops_flags |= IBV_QP_EX_WITH_RDMA_WRITE; - attr_ex.send_ops_flags |= 
IBV_QP_EX_WITH_RDMA_WRITE_WITH_IMM; + if (!rx_rdm_cq && ofi_needs_rx(ep->base_ep.info->caps)) { + EFA_WARN(FI_LOG_EP_CTRL, + "Endpoint is not bound to a receive completion queue when it has receive capabilities enabled. (FI_RECV)\n"); + return -FI_ENOCQ; } - attr_ex.pd = efa_rdm_ep_domain(ep)->ibv_pd; - attr_ex.qp_context = ep; - attr_ex.sq_sig_all = 1; + tx_ibv_cq = tx_rdm_cq ? tx_rdm_cq->ibv_cq.ibv_cq_ex : rx_rdm_cq->ibv_cq.ibv_cq_ex; + rx_ibv_cq = rx_rdm_cq ? rx_rdm_cq->ibv_cq.ibv_cq_ex : tx_rdm_cq->ibv_cq.ibv_cq_ex; + + efa_rdm_ep_construct_ibv_qp_init_attr_ex(ep, &attr_ex, tx_ibv_cq, rx_ibv_cq); return efa_base_ep_create_qp(&ep->base_ep, &attr_ex); } @@ -310,7 +323,6 @@ void efa_rdm_ep_init_linked_lists(struct efa_rdm_ep *ep) dlist_init(&ep->ope_queued_ctrl_list); dlist_init(&ep->ope_queued_read_list); dlist_init(&ep->ope_longcts_send_list); - dlist_init(&ep->read_pending_list); dlist_init(&ep->peer_backoff_list); dlist_init(&ep->handshake_queued_peer_list); #if ENABLE_DEBUG @@ -414,7 +426,6 @@ int efa_rdm_ep_open(struct fid_domain *domain, struct fi_info *info, { struct efa_domain *efa_domain = NULL; struct efa_rdm_ep *efa_rdm_ep = NULL; - struct fi_cq_attr cq_attr; int ret, retv, i; efa_rdm_ep = calloc(1, sizeof(*efa_rdm_ep)); @@ -423,9 +434,6 @@ int efa_rdm_ep_open(struct fid_domain *domain, struct fi_info *info, efa_domain = container_of(domain, struct efa_domain, util_domain.domain_fid); - memset(&cq_attr, 0, sizeof(cq_attr)); - cq_attr.format = FI_CQ_FORMAT_DATA; - cq_attr.wait_obj = FI_WAIT_NONE; ret = efa_base_ep_construct(&efa_rdm_ep->base_ep, domain, info, efa_rdm_ep_progress, context); @@ -462,15 +470,6 @@ int efa_rdm_ep_open(struct fid_domain *domain, struct fi_info *info, efa_rdm_ep->efa_device_iov_limit = efa_domain->device->rdm_info->tx_attr->iov_limit; efa_rdm_ep->use_device_rdma = efa_rdm_get_use_device_rdma(info->fabric_attr->api_version); efa_rdm_ep->shm_permitted = true; - - cq_attr.size = MAX(efa_rdm_ep->rx_size + efa_rdm_ep->tx_size, - efa_env.cq_size); - - if (info->tx_attr->op_flags & FI_DELIVERY_COMPLETE) - EFA_INFO(FI_LOG_CQ, "FI_DELIVERY_COMPLETE unsupported\n"); - - assert(info->tx_attr->msg_order == info->rx_attr->msg_order); - efa_rdm_ep->msg_order = info->rx_attr->msg_order; efa_rdm_ep->max_msg_size = info->ep_attr->max_msg_size; efa_rdm_ep->msg_prefix_size = info->ep_attr->msg_prefix_size; efa_rdm_ep->max_proto_hdr_size = efa_rdm_pkt_type_get_max_hdr_size(); @@ -498,13 +497,9 @@ int efa_rdm_ep_open(struct fid_domain *domain, struct fi_info *info, efa_rdm_ep->efa_rx_pkts_posted = 0; efa_rdm_ep->efa_rx_pkts_to_post = 0; + efa_rdm_ep->efa_rx_pkts_held = 0; efa_rdm_ep->efa_outstanding_tx_ops = 0; - assert(!efa_rdm_ep->ibv_cq_ex); - - ret = efa_cq_ibv_cq_ex_open(&cq_attr, efa_domain->device->ibv_ctx, - &efa_rdm_ep->ibv_cq_ex, &efa_rdm_ep->ibv_cq_ex_type); - if (ret) { EFA_WARN(FI_LOG_CQ, "Unable to create extended CQ: %s\n", strerror(errno)); goto err_close_shm_ep; @@ -512,7 +507,7 @@ int efa_rdm_ep_open(struct fid_domain *domain, struct fi_info *info, ret = efa_rdm_ep_create_buffer_pools(efa_rdm_ep); if (ret) - goto err_close_core_cq; + goto err_close_shm_ep; efa_rdm_ep_init_linked_lists(efa_rdm_ep); @@ -538,15 +533,11 @@ int efa_rdm_ep_open(struct fid_domain *domain, struct fi_info *info, efa_rdm_ep->sendrecv_in_order_aligned_128_bytes = false; efa_rdm_ep->write_in_order_aligned_128_bytes = false; - ret = efa_rdm_ep_create_base_ep_ibv_qp(efa_rdm_ep); - if (ret) - goto err_close_core_cq; - efa_rdm_ep->pke_vec = calloc(sizeof(struct efa_rdm_pke *), 
EFA_RDM_EP_MAX_WR_PER_IBV_POST_RECV); if (!efa_rdm_ep->pke_vec) { EFA_WARN(FI_LOG_EP_CTRL, "cannot alloc memory for efa_rdm_ep->pke_vec!\n"); ret = -FI_ENOMEM; - goto err_close_core_cq; + goto err_close_shm_ep; } *ep = &efa_rdm_ep->base_ep.util_ep.ep_fid; @@ -559,11 +550,6 @@ int efa_rdm_ep_open(struct fid_domain *domain, struct fi_info *info, (*ep)->cm = &efa_rdm_ep_cm_ops; return 0; -err_close_core_cq: - retv = -ibv_destroy_cq(ibv_cq_ex_to_cq(efa_rdm_ep->ibv_cq_ex)); - if (retv) - EFA_WARN(FI_LOG_CQ, "Unable to close cq: %s\n", - fi_strerror(-retv)); err_close_shm_ep: if (efa_rdm_ep->shm_ep) { retv = fi_close(&efa_rdm_ep->shm_ep->fid); @@ -764,6 +750,9 @@ static void efa_rdm_ep_destroy_buffer_pools(struct efa_rdm_ep *efa_rdm_ep) if (efa_rdm_ep->efa_tx_pkt_pool) ofi_bufpool_destroy(efa_rdm_ep->efa_tx_pkt_pool); + + if (efa_rdm_ep->rx_atomrsp_pool) + ofi_bufpool_destroy(efa_rdm_ep->rx_atomrsp_pool); } /* @@ -796,18 +785,76 @@ static inline void efa_rdm_ep_wait_send(struct efa_rdm_ep *efa_rdm_ep) { struct util_srx_ctx *srx_ctx; + struct efa_rdm_cq *tx_cq, *rx_cq; /* peer srx should be initialized when ep is enabled */ assert(efa_rdm_ep->peer_srx_ep); srx_ctx = efa_rdm_ep_get_peer_srx_ctx(efa_rdm_ep); ofi_genlock_lock(srx_ctx->lock); + tx_cq = efa_rdm_ep_get_tx_rdm_cq(efa_rdm_ep); + rx_cq = efa_rdm_ep_get_rx_rdm_cq(efa_rdm_ep); + while (efa_rdm_ep_has_unfinished_send(efa_rdm_ep)) { + /* poll cq until empty */ + if (tx_cq) + efa_rdm_cq_poll_ibv_cq(-1, &tx_cq->ibv_cq); + if (rx_cq) + efa_rdm_cq_poll_ibv_cq(-1, &rx_cq->ibv_cq); efa_rdm_ep_progress_internal(efa_rdm_ep); } ofi_genlock_unlock(srx_ctx->lock); } +static inline +void efa_rdm_ep_remove_cntr_ibv_cq_poll_list(struct efa_rdm_ep *ep) +{ + int i; + struct efa_cntr *efa_cntr; + struct util_cntr *util_cntr; + struct efa_rdm_cq *tx_cq, *rx_cq; + + tx_cq = efa_rdm_ep_get_tx_rdm_cq(ep); + rx_cq = efa_rdm_ep_get_rx_rdm_cq(ep); + + for (i = 0; i< CNTR_CNT; i++) { + util_cntr = ep->base_ep.util_ep.cntrs[i]; + if (util_cntr) { + efa_cntr = container_of(util_cntr, struct efa_cntr, util_cntr); + if (tx_cq && !ofi_atomic_get32(&tx_cq->util_cq.ref)) + efa_ibv_cq_poll_list_remove(&efa_cntr->ibv_cq_poll_list, &efa_cntr->util_cntr.ep_list_lock, &tx_cq->ibv_cq); + + if (rx_cq && !ofi_atomic_get32(&rx_cq->util_cq.ref)) + efa_ibv_cq_poll_list_remove(&efa_cntr->ibv_cq_poll_list, &efa_cntr->util_cntr.ep_list_lock, &rx_cq->ibv_cq); + } + } +} + +static inline +void efa_rdm_ep_remove_cq_ibv_cq_poll_list(struct efa_rdm_ep *ep) +{ + struct efa_rdm_cq *tx_cq, *rx_cq; + + tx_cq = efa_rdm_ep_get_tx_rdm_cq(ep); + rx_cq = efa_rdm_ep_get_rx_rdm_cq(ep); + + /* Remove the cross referencing of the CQs. + * It must happen after ofi_endpoint_close + * so we have cq's reference counters updated. 
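[Editorial aside, not part of the patch] The poll-list bookkeeping here, like the -FI_ENOCQ checks added earlier, relies on the application having bound completion queues before enabling the endpoint. A minimal caller-side sketch using the standard libfabric API; ep and domain are placeholders, and AV binding and error handling are omitted:

    struct fi_cq_attr cq_attr = { .format = FI_CQ_FORMAT_DATA };
    struct fid_cq *txcq, *rxcq;

    fi_cq_open(domain, &cq_attr, &txcq, NULL);
    fi_cq_open(domain, &cq_attr, &rxcq, NULL);
    fi_ep_bind(ep, &txcq->fid, FI_TRANSMIT);
    fi_ep_bind(ep, &rxcq->fid, FI_RECV);
    fi_enable(ep);   /* with this patch, the EFA QP is created here against the bound CQs */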
+ */ + if (tx_cq && !ofi_atomic_get32(&tx_cq->util_cq.ref)) { + efa_ibv_cq_poll_list_remove(&tx_cq->ibv_cq_poll_list, &tx_cq->util_cq.ep_list_lock, &tx_cq->ibv_cq); + if (rx_cq) + efa_ibv_cq_poll_list_remove(&rx_cq->ibv_cq_poll_list, &rx_cq->util_cq.ep_list_lock, &tx_cq->ibv_cq); + } + + if (rx_cq && !ofi_atomic_get32(&rx_cq->util_cq.ref)) { + efa_ibv_cq_poll_list_remove(&rx_cq->ibv_cq_poll_list, &rx_cq->util_cq.ep_list_lock, &rx_cq->ibv_cq); + if (tx_cq) + efa_ibv_cq_poll_list_remove(&tx_cq->ibv_cq_poll_list, &tx_cq->util_cq.ep_list_lock, &rx_cq->ibv_cq); + } +} + /** * @brief implement the fi_close() API for the EFA RDM endpoint * @param[in,out] fid Endpoint to close @@ -822,15 +869,29 @@ static int efa_rdm_ep_close(struct fid *fid) if (efa_rdm_ep->base_ep.efa_qp_enabled) efa_rdm_ep_wait_send(efa_rdm_ep); - ret = efa_base_ep_destruct(&efa_rdm_ep->base_ep); - if (ret) { - EFA_WARN(FI_LOG_EP_CTRL, "Unable to close base endpoint\n"); - retv = ret; + /* + * util_srx_close will clean all efa_rdm_rxes that are + * associated with peer_rx_entries in unexp msg/tag lists. + * It also decrements the ref count of rx cq. So it must + * be called before we clean up the ibv cq poll list which + * relies on the correct ref count of tx/rx cq. + */ + if (efa_rdm_ep->peer_srx_ep) { + util_srx_close(&efa_rdm_ep->peer_srx_ep->fid); + efa_rdm_ep->peer_srx_ep = NULL; } - ret = -ibv_destroy_cq(ibv_cq_ex_to_cq(efa_rdm_ep->ibv_cq_ex)); + /* We need to free the util_ep first to avoid race conditions + * with other threads progressing the cq. */ + efa_base_ep_close_util_ep(&efa_rdm_ep->base_ep); + + efa_rdm_ep_remove_cntr_ibv_cq_poll_list(efa_rdm_ep); + + efa_rdm_ep_remove_cq_ibv_cq_poll_list(efa_rdm_ep); + + ret = efa_base_ep_destruct(&efa_rdm_ep->base_ep); if (ret) { - EFA_WARN(FI_LOG_EP_CTRL, "Unable to close ibv_cq_ex\n"); + EFA_WARN(FI_LOG_EP_CTRL, "Unable to close base endpoint\n"); retv = ret; } @@ -842,18 +903,14 @@ static int efa_rdm_ep_close(struct fid *fid) } } - /* - * util_srx_close will clean all efa_rdm_rxes that are - * associated with peer_rx_entries in unexp msg/tag lists. 
- */ - if (efa_rdm_ep->peer_srx_ep) { - util_srx_close(&efa_rdm_ep->peer_srx_ep->fid); - efa_rdm_ep->peer_srx_ep = NULL; - } efa_rdm_ep_destroy_buffer_pools(efa_rdm_ep); if (efa_rdm_ep->pke_vec) free(efa_rdm_ep->pke_vec); + + if (efa_rdm_ep->user_info) + fi_freeinfo(efa_rdm_ep->user_info); + free(efa_rdm_ep); return retv; } @@ -1011,6 +1068,71 @@ void efa_rdm_ep_update_shm(struct efa_rdm_ep *ep) efa_rdm_ep_close_shm_resources(ep); } +static inline +int efa_rdm_ep_insert_cntr_ibv_cq_poll_list(struct efa_rdm_ep *ep) +{ + int i, ret; + struct efa_cntr *efa_cntr; + struct util_cntr *util_cntr; + struct efa_rdm_cq *tx_cq, *rx_cq; + tx_cq = efa_rdm_ep_get_tx_rdm_cq(ep); + rx_cq = efa_rdm_ep_get_rx_rdm_cq(ep); + + for (i = 0; i < CNTR_CNT; i++) { + util_cntr = ep->base_ep.util_ep.cntrs[i]; + if (util_cntr) { + efa_cntr = container_of(util_cntr, struct efa_cntr, util_cntr); + if (tx_cq) { + ret = efa_ibv_cq_poll_list_insert(&efa_cntr->ibv_cq_poll_list, &efa_cntr->util_cntr.ep_list_lock, &tx_cq->ibv_cq); + if (ret) + return ret; + } + if (rx_cq) { + ret = efa_ibv_cq_poll_list_insert(&efa_cntr->ibv_cq_poll_list, &efa_cntr->util_cntr.ep_list_lock, &rx_cq->ibv_cq); + if (ret) + return ret; + } + } + } + + return FI_SUCCESS; +} + +static inline +int efa_rdm_ep_insert_cq_ibv_cq_poll_list(struct efa_rdm_ep *ep) +{ + int ret; + struct efa_rdm_cq *tx_cq, *rx_cq; + /* cross referencing */ + tx_cq = efa_rdm_ep_get_tx_rdm_cq(ep); + rx_cq = efa_rdm_ep_get_rx_rdm_cq(ep); + + if (tx_cq) { + ret = efa_ibv_cq_poll_list_insert(&tx_cq->ibv_cq_poll_list, &tx_cq->util_cq.ep_list_lock, &tx_cq->ibv_cq); + if (ret) + return ret; + + if (rx_cq) { + ret = efa_ibv_cq_poll_list_insert(&tx_cq->ibv_cq_poll_list, &tx_cq->util_cq.ep_list_lock, &rx_cq->ibv_cq); + if (ret) + return ret; + } + } + + if (rx_cq) { + ret = efa_ibv_cq_poll_list_insert(&rx_cq->ibv_cq_poll_list, &rx_cq->util_cq.ep_list_lock, &rx_cq->ibv_cq); + if (ret) + return ret; + + if (tx_cq) { + ret = efa_ibv_cq_poll_list_insert(&rx_cq->ibv_cq_poll_list, &rx_cq->util_cq.ep_list_lock, &tx_cq->ibv_cq); + if (ret) + return ret; + } + } + + return FI_SUCCESS; +} /** * @brief implement the fi_enable() API for EFA RDM endpoint @@ -1032,9 +1154,6 @@ static int efa_rdm_ep_ctrl(struct fid *fid, int command, void *arg) switch (command) { case FI_ENABLE: ep = container_of(fid, struct efa_rdm_ep, base_ep.util_ep.ep_fid.fid); - ret = efa_base_ep_enable(&ep->base_ep); - if (ret) - return ret; /* * efa uses util SRX no matter shm is enabled, so we need to initialize @@ -1044,6 +1163,23 @@ static int efa_rdm_ep_ctrl(struct fid *fid, int command, void *arg) if (ret) return ret; + ret = efa_rdm_ep_create_base_ep_ibv_qp(ep); + if (ret) + return ret; + + /* efa_base_ep_enable destroys qp in the error path */ + ret = efa_base_ep_enable(&ep->base_ep); + if (ret) + return ret; + + ret = efa_rdm_ep_insert_cq_ibv_cq_poll_list(ep); + if (ret) + return ret; + + ret = efa_rdm_ep_insert_cntr_ibv_cq_poll_list(ep); + if (ret) + return ret; + assert(ep->peer_srx_ep); srx_ctx = efa_rdm_ep_get_peer_srx_ctx(ep); ofi_genlock_lock(srx_ctx->lock); @@ -1070,23 +1206,22 @@ static int efa_rdm_ep_ctrl(struct fid *fid, int command, void *arg) ret = fi_srx_context(efa_rdm_ep_domain(ep)->shm_domain, &peer_srx_attr, &peer_srx_ep, &peer_srx_context); if (ret) - goto out; + goto err_unlock_and_destroy_qp; shm_ep_name_len = EFA_SHM_NAME_MAX; ret = efa_shm_ep_name_construct(shm_ep_name, &shm_ep_name_len, &ep->base_ep.src_addr); if (ret < 0) - goto out; + goto err_unlock_and_destroy_qp; fi_setname(&ep->shm_ep->fid, 
shm_ep_name, shm_ep_name_len); /* Bind srx to shm ep */ ret = fi_ep_bind(ep->shm_ep, &ep->peer_srx_ep->fid, 0); if (ret) - goto out; + goto err_unlock_and_destroy_qp; ret = fi_enable(ep->shm_ep); if (ret) - goto out; + goto err_unlock_and_destroy_qp; } -out: ofi_genlock_unlock(srx_ctx->lock); break; default: @@ -1095,6 +1230,11 @@ static int efa_rdm_ep_ctrl(struct fid *fid, int command, void *arg) } return ret; + +err_unlock_and_destroy_qp: + ofi_genlock_unlock(srx_ctx->lock); + efa_base_ep_destruct_qp(&ep->base_ep); + return ret; } /** @@ -1276,49 +1416,57 @@ static int efa_rdm_ep_set_use_device_rdma(struct efa_rdm_ep *ep, bool use_device } /** - * @brief set sendrecv_in_order_aligned_128_bytes flag in efa_rdm_ep - * called by efa_rdm_ep_setopt - * @param[in,out] ep endpoint - * @param[in] sendrecv_in_order_aligned_128_bytes whether to enable in_order send/recv - * for each 128 bytes aligned buffer - * @return 0 on success, -FI_EOPNOTSUPP if the option cannot be supported - * @related efa_rdm_ep + * @brief check the in order aligned 128 bytes support for a given ibv_wr_op code + * + * @param ep efa_rdm_ep + * @param op_code ibv wr op code + * @return int 0 if in order aligned 128 bytes is supported, -FI_EOPNOTSUPP if + * it is not supported. Other negative integer for other errors. */ static -int efa_rdm_ep_set_sendrecv_in_order_aligned_128_bytes(struct efa_rdm_ep *ep, - bool sendrecv_in_order_aligned_128_bytes) +int efa_rdm_ep_check_qp_in_order_aligned_128_bytes(struct efa_rdm_ep *ep, + enum ibv_wr_opcode op_code) { - /* - * RDMA read is used to copy data from host bounce buffer to the - * application buffer on device - */ - if (sendrecv_in_order_aligned_128_bytes && - !efa_base_ep_support_op_in_order_aligned_128_bytes(&ep->base_ep, IBV_WR_RDMA_READ)) - return -FI_EOPNOTSUPP; + struct efa_qp *qp = NULL; + struct ibv_qp_init_attr_ex attr_ex = {0}; + int ret, retv; + struct ibv_cq_ex *ibv_cq_ex = NULL; + enum ibv_cq_ex_type ibv_cq_ex_type; + struct fi_cq_attr cq_attr = {0}; + + ret = efa_cq_ibv_cq_ex_open(&cq_attr, efa_rdm_ep_domain(ep)->device->ibv_ctx, &ibv_cq_ex, &ibv_cq_ex_type); + if (ret) { + EFA_WARN(FI_LOG_CQ, "Unable to create extended CQ: %d\n", ret); + ret = -FI_EINVAL; + goto out; + } - ep->sendrecv_in_order_aligned_128_bytes = sendrecv_in_order_aligned_128_bytes; - return 0; -} + /* Create a dummy qp for query only */ + efa_rdm_ep_construct_ibv_qp_init_attr_ex(ep, &attr_ex, ibv_cq_ex, ibv_cq_ex); -/** - * @brief set write_in_order_aligned_128_bytes flag in efa_rdm_ep - * called by efa_rdm_ep_set_opt - * @param[in,out] ep endpoint - * @param[in] write_in_order_aligned_128_bytes whether to enable RDMA in order write - * for each 128 bytes aligned buffer. - * @return 0 on success, -FI_EOPNOTSUPP if the option cannot be supported. - * @related efa_rdm_ep - */ -static -int efa_rdm_ep_set_write_in_order_aligned_128_bytes(struct efa_rdm_ep *ep, - bool write_in_order_aligned_128_bytes) -{ - if (write_in_order_aligned_128_bytes && - !efa_base_ep_support_op_in_order_aligned_128_bytes(&ep->base_ep, IBV_WR_RDMA_WRITE)) - return -FI_EOPNOTSUPP; + ret = efa_qp_create(&qp, &attr_ex); + if (ret) + goto out; - ep->write_in_order_aligned_128_bytes = write_in_order_aligned_128_bytes; - return 0; + if (!efa_qp_support_op_in_order_aligned_128_bytes(qp, op_code)) + ret = -FI_EOPNOTSUPP; + +out: + if (qp) { + retv = ibv_destroy_qp(qp->ibv_qp); + if (retv) + EFA_WARN(FI_LOG_EP_CTRL, "destroy ibv qp failed! 
err: %s\n", + fi_strerror(-retv)); + free(qp); + } + + if (ibv_cq_ex) { + retv = -ibv_destroy_cq(ibv_cq_ex_to_cq(ibv_cq_ex)); + if (retv) + EFA_WARN(FI_LOG_EP_CTRL, "Unable to close ibv cq: %s\n", + fi_strerror(-retv)); + } + return ret; } /** @@ -1367,14 +1515,14 @@ static int efa_rdm_ep_setopt(fid_t fid, int level, int optname, */ if (efa_rdm_ep->base_ep.efa_qp_enabled) { EFA_WARN(FI_LOG_EP_CTRL, - "The option FI_OPT_EFA_RNR_RETRY is required \ - to be set before EP enabled %s\n", __func__); + "The option FI_OPT_EFA_RNR_RETRY is required " + "to be set before EP enabled\n"); return -FI_EINVAL; } if (!efa_domain_support_rnr_retry_modify(efa_rdm_ep_domain(efa_rdm_ep))) { EFA_WARN(FI_LOG_EP_CTRL, - "RNR capability is not supported %s\n", __func__); + "RNR capability is not supported\n"); return -FI_ENOSYS; } efa_rdm_ep->base_ep.rnr_retry = *(size_t *)optval; @@ -1413,20 +1561,29 @@ static int efa_rdm_ep_setopt(fid_t fid, int level, int optname, case FI_OPT_EFA_SENDRECV_IN_ORDER_ALIGNED_128_BYTES: if (optlen != sizeof(bool)) return -FI_EINVAL; - ret = efa_rdm_ep_set_sendrecv_in_order_aligned_128_bytes(efa_rdm_ep, *(bool *)optval); - if (ret) - return ret; + /* + * RDMA read is used to copy data from host bounce buffer to the + * application buffer on device + */ + if (*(bool *)optval) { + ret = efa_rdm_ep_check_qp_in_order_aligned_128_bytes(efa_rdm_ep, IBV_WR_RDMA_READ); + if (ret) + return ret; + } + efa_rdm_ep->sendrecv_in_order_aligned_128_bytes = *(bool *)optval; break; case FI_OPT_EFA_WRITE_IN_ORDER_ALIGNED_128_BYTES: if (optlen != sizeof(bool)) return -FI_EINVAL; - ret = efa_rdm_ep_set_write_in_order_aligned_128_bytes(efa_rdm_ep, *(bool *)optval); - if (ret) - return ret; + if (*(bool *)optval) { + ret = efa_rdm_ep_check_qp_in_order_aligned_128_bytes(efa_rdm_ep, IBV_WR_RDMA_WRITE); + if (ret) + return ret; + } + efa_rdm_ep->write_in_order_aligned_128_bytes = *(bool *)optval; break; default: - EFA_WARN(FI_LOG_EP_CTRL, - "Unknown endpoint option %s\n", __func__); + EFA_WARN(FI_LOG_EP_CTRL, "Unknown endpoint option\n"); return -FI_ENOPROTOOPT; } @@ -1507,8 +1664,7 @@ static int efa_rdm_ep_getopt(fid_t fid, int level, int optname, void *optval, *optlen = sizeof(bool); break; default: - EFA_WARN(FI_LOG_EP_CTRL, - "Unknown endpoint option %s\n", __func__); + EFA_WARN(FI_LOG_EP_CTRL, "Unknown endpoint option\n"); return -FI_ENOPROTOOPT; } diff --git a/prov/efa/src/rdm/efa_rdm_ep_progress.c b/prov/efa/src/rdm/efa_rdm_ep_progress.c index fef63ea87e1..ec0323418c8 100644 --- a/prov/efa/src/rdm/efa_rdm_ep_progress.c +++ b/prov/efa/src/rdm/efa_rdm_ep_progress.c @@ -180,8 +180,9 @@ void efa_rdm_ep_progress_post_internal_rx_pkts(struct efa_rdm_ep *ep) ep->efa_rx_pkts_to_post = 0; } } else { - if (ep->efa_rx_pkts_posted == 0 && ep->efa_rx_pkts_to_post == 0) { - /* Both efa_rx_pkts_posted and efa_rx_pkts_to_post equal to 0 means + if (ep->efa_rx_pkts_posted == 0 && ep->efa_rx_pkts_to_post == 0 && ep->efa_rx_pkts_held == 0) { + /* All of efa_rx_pkts_posted, efa_rx_pkts_to_post and + * efa_rx_pkts_held equal to 0 means * this is the first call of the progress engine on this endpoint. 
* * In this case, we explictly allocate the 1st chunk of memory @@ -219,6 +220,8 @@ void efa_rdm_ep_progress_post_internal_rx_pkts(struct efa_rdm_ep *ep) ep->efa_rx_pkts_to_post = efa_rdm_ep_get_rx_pool_size(ep); } + /* only valid for non-zero copy */ + assert(ep->efa_rx_pkts_to_post + ep->efa_rx_pkts_posted + ep->efa_rx_pkts_held == efa_rdm_ep_get_rx_pool_size(ep)); } err = efa_rdm_ep_bulk_post_internal_rx_pkts(ep); @@ -251,270 +254,6 @@ void efa_rdm_ep_check_peer_backoff_timer(struct efa_rdm_ep *ep) } } -/** - * @brief handle rdma-core CQ completion resulted from IBV_WRITE_WITH_IMM - * - * This function handles hardware-assisted RDMA writes with immediate data at - * remote endpoint. These do not have a packet context, nor do they have a - * connid available. - * - * @param[in,out] ep endpoint - * @param[in] imm_data Data provided in the IMMEDIATE value. - * @param[in] len Payload length - * @param[in] flags flags (such as FI_REMOTE_CQ_DATA) - */ -void efa_rdm_ep_proc_ibv_recv_rdma_with_imm_completion(struct efa_rdm_ep *ep, - uint32_t imm_data, - uint32_t len, - uint64_t flags, - struct efa_rdm_pke *pkt_entry) -{ - struct util_cq *target_cq; - int ret; - fi_addr_t src_addr; - struct efa_av *efa_av; - - target_cq = ep->base_ep.util_ep.rx_cq; - efa_av = ep->base_ep.av; - - if (ep->base_ep.util_ep.caps & FI_SOURCE) { - src_addr = efa_av_reverse_lookup_rdm(efa_av, - ibv_wc_read_slid(ep->ibv_cq_ex), - ibv_wc_read_src_qp(ep->ibv_cq_ex), - NULL); - ret = ofi_cq_write_src(target_cq, NULL, flags, len, NULL, imm_data, 0, src_addr); - } else { - ret = ofi_cq_write(target_cq, NULL, flags, len, NULL, imm_data, 0); - } - - if (OFI_UNLIKELY(ret)) { - EFA_WARN(FI_LOG_CQ, - "Unable to write a cq entry for remote for RECV_RDMA operation: %s\n", - fi_strerror(-ret)); - efa_base_ep_write_eq_error(&ep->base_ep, FI_EIO, FI_EFA_ERR_WRITE_SHM_CQ_ENTRY); - } - - efa_cntr_report_rx_completion(&ep->base_ep.util_ep, flags); - - /* Recv with immediate will consume a pkt_entry, but the pkt is not - filled, so free the pkt_entry and record we have one less posted - packet now. */ - ep->efa_rx_pkts_posted--; - efa_rdm_pke_release_rx(pkt_entry); -} - -#if HAVE_EFADV_CQ_EX -/** - * @brief Read peer raw address from EFA device and look up the peer address in AV. - * This function should only be called if the peer AH is unknown. - * @return Peer address, or FI_ADDR_NOTAVAIL if unavailable. - */ -static inline -fi_addr_t efa_rdm_ep_determine_peer_address_from_efadv(struct efa_rdm_ep *ep, - struct ibv_cq_ex *ibv_cqx) -{ - struct efa_rdm_pke *pkt_entry; - struct efa_ep_addr efa_ep_addr = {0}; - fi_addr_t addr; - union ibv_gid gid = {0}; - uint32_t *connid = NULL; - - if (ep->ibv_cq_ex_type != EFADV_CQ) { - /* EFA DV CQ is not supported. This could be due to old EFA kernel module versions. 
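/*
 * Illustrative sketch, not part of this patch: the application-level operation
 * that produces the IBV_WC_RECV_RDMA_WITH_IMM completions whose handler,
 * efa_rdm_ep_proc_ibv_recv_rdma_with_imm_completion(), is removed above. The
 * initiator posts an RMA write carrying remote CQ data (delivered by the
 * device as 32-bit immediate data, per that handler); the target sees a CQ
 * entry flagged FI_REMOTE_CQ_DATA | FI_RMA | FI_REMOTE_WRITE with that data
 * and no packet buffer. Endpoint, key, and address arguments below are
 * hypothetical.
 */
#include <rdma/fabric.h>
#include <rdma/fi_endpoint.h>
#include <rdma/fi_rma.h>

static ssize_t write_with_remote_cq_data(struct fid_ep *ep, const void *buf,
					 size_t len, void *desc, fi_addr_t dest,
					 uint64_t remote_addr, uint64_t rkey,
					 void *ctx)
{
	uint64_t data = 0x1234; /* shows up as 'data' in the target's completion */

	return fi_writedata(ep, buf, len, desc, data, dest, remote_addr, rkey, ctx);
}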
*/ - return FI_ADDR_NOTAVAIL; - } - - /* Attempt to read sgid from EFA firmware */ - if (efadv_wc_read_sgid(efadv_cq_from_ibv_cq_ex(ibv_cqx), &gid) < 0) { - /* Return code is negative if the peer AH is known */ - return FI_ADDR_NOTAVAIL; - } - - pkt_entry = (void *)(uintptr_t)ibv_cqx->wr_id; - - connid = efa_rdm_pke_connid_ptr(pkt_entry); - if (!connid) { - return FI_ADDR_NOTAVAIL; - } - - /* - * Use raw:qpn:connid as the key to lookup AV for peer's fi_addr - */ - memcpy(efa_ep_addr.raw, gid.raw, sizeof(efa_ep_addr.raw)); - efa_ep_addr.qpn = ibv_wc_read_src_qp(ibv_cqx); - efa_ep_addr.qkey = *connid; - addr = ofi_av_lookup_fi_addr(&ep->base_ep.av->util_av, &efa_ep_addr); - if (addr != FI_ADDR_NOTAVAIL) { - char gid_str_cdesc[INET6_ADDRSTRLEN]; - inet_ntop(AF_INET6, gid.raw, gid_str_cdesc, INET6_ADDRSTRLEN); - EFA_WARN(FI_LOG_AV, - "Recovered peer fi_addr. [Raw]:[QPN]:[QKey] = [%s]:[%" PRIu16 "]:[%" PRIu32 "]\n", - gid_str_cdesc, efa_ep_addr.qpn, efa_ep_addr.qkey); - } - - return addr; -} - -/** - * @brief Determine peer address from ibv_cq_ex - * Attempt to inject or determine peer address if not available. This usually - * happens when the endpoint receives the first packet from a new peer. - * There is an edge case for EFA endpoint - the device might lose the address - * handle of a known peer due to a firmware bug and return FI_ADDR_NOTAVAIL. - * The provider needs to look up the address using Raw address:QPN:QKey. - * Note: This function introduces addtional overhead. It should only be called if - * efa_av_lookup_address_rdm fails to find the peer address. - * @param ep Pointer to RDM endpoint - * @param ibv_cqx Pointer to CQ - * @returns Peer address, or FI_ADDR_NOTAVAIL if unsuccessful. - */ -static inline fi_addr_t efa_rdm_ep_determine_addr_from_ibv_cq(struct efa_rdm_ep *ep, struct ibv_cq_ex *ibv_cqx) -{ - struct efa_rdm_pke *pkt_entry; - fi_addr_t addr = FI_ADDR_NOTAVAIL; - - pkt_entry = (void *)(uintptr_t)ibv_cqx->wr_id; - - addr = efa_rdm_pke_determine_addr(pkt_entry); - - if (addr == FI_ADDR_NOTAVAIL) { - addr = efa_rdm_ep_determine_peer_address_from_efadv(ep, ibv_cqx); - } - - return addr; -} -#else -/** - * @brief Determine peer address from ibv_cq_ex - * Attempt to inject peer address if not available. This usually - * happens when the endpoint receives the first packet from a new peer. - * Note: This function introduces addtional overhead. It should only be called if - * efa_av_lookup_address_rdm fails to find the peer address. - * @param ep Pointer to RDM endpoint - * @param ibv_cqx Pointer to CQ - * @returns Peer address, or FI_ADDR_NOTAVAIL if unsuccessful. - */ -static inline -fi_addr_t efa_rdm_ep_determine_addr_from_ibv_cq(struct efa_rdm_ep *ep, struct ibv_cq_ex *ibv_cqx) -{ - struct efa_rdm_pke *pkt_entry; - - pkt_entry = (void *)(uintptr_t)ibv_cqx->wr_id; - - return efa_rdm_pke_determine_addr(pkt_entry); -} -#endif - -/** - * @brief poll rdma-core cq and process the cq entry - * - * @param[in] ep RDM endpoint - * @param[in] cqe_to_process Max number of cq entry to poll and process. Must be positive. - */ -static inline void efa_rdm_ep_poll_ibv_cq(struct efa_rdm_ep *ep, size_t cqe_to_process) -{ - bool should_end_poll = false; - /* Initialize an empty ibv_poll_cq_attr struct for ibv_start_poll. - * EFA expects .comp_mask = 0, or otherwise returns EINVAL. 
- */ - struct ibv_poll_cq_attr poll_cq_attr = {.comp_mask = 0}; - struct efa_av *efa_av; - struct efa_rdm_pke *pkt_entry; - ssize_t err; - int opcode; - size_t i = 0; - int prov_errno; - - assert(cqe_to_process > 0); - - efa_av = ep->base_ep.av; - - /* Call ibv_start_poll only once */ - err = ibv_start_poll(ep->ibv_cq_ex, &poll_cq_attr); - should_end_poll = !err; - - while (!err) { - pkt_entry = (void *)(uintptr_t)ep->ibv_cq_ex->wr_id; - efa_rdm_tracepoint(poll_cq, (size_t) ep->ibv_cq_ex->wr_id); - opcode = ibv_wc_read_opcode(ep->ibv_cq_ex); - if (ep->ibv_cq_ex->status) { - prov_errno = efa_rdm_ep_get_prov_errno(ep); - switch (opcode) { - case IBV_WC_SEND: /* fall through */ - case IBV_WC_RDMA_WRITE: /* fall through */ - case IBV_WC_RDMA_READ: - efa_rdm_pke_handle_tx_error(pkt_entry, FI_EIO, prov_errno); - break; - case IBV_WC_RECV: /* fall through */ - case IBV_WC_RECV_RDMA_WITH_IMM: - efa_rdm_pke_handle_rx_error(pkt_entry, FI_EIO, prov_errno); - break; - default: - EFA_WARN(FI_LOG_EP_CTRL, "Unhandled op code %d\n", opcode); - assert(0 && "Unhandled op code"); - } - break; - } - switch (opcode) { - case IBV_WC_SEND: -#if ENABLE_DEBUG - ep->send_comps++; -#endif - efa_rdm_pke_handle_send_completion(pkt_entry); - break; - case IBV_WC_RECV: - pkt_entry->addr = efa_av_reverse_lookup_rdm(efa_av, ibv_wc_read_slid(ep->ibv_cq_ex), - ibv_wc_read_src_qp(ep->ibv_cq_ex), pkt_entry); - - if (pkt_entry->addr == FI_ADDR_NOTAVAIL) { - pkt_entry->addr = efa_rdm_ep_determine_addr_from_ibv_cq(ep, ep->ibv_cq_ex); - } - - pkt_entry->pkt_size = ibv_wc_read_byte_len(ep->ibv_cq_ex); - assert(pkt_entry->pkt_size > 0); - efa_rdm_pke_handle_recv_completion(pkt_entry); -#if ENABLE_DEBUG - ep->recv_comps++; -#endif - break; - case IBV_WC_RDMA_READ: - case IBV_WC_RDMA_WRITE: - efa_rdm_pke_handle_rma_completion(pkt_entry); - break; - case IBV_WC_RECV_RDMA_WITH_IMM: - efa_rdm_ep_proc_ibv_recv_rdma_with_imm_completion(ep, - ibv_wc_read_imm_data(ep->ibv_cq_ex), - ibv_wc_read_byte_len(ep->ibv_cq_ex), - FI_REMOTE_CQ_DATA | FI_RMA | FI_REMOTE_WRITE, - pkt_entry ); - break; - default: - EFA_WARN(FI_LOG_EP_CTRL, - "Unhandled cq type\n"); - assert(0 && "Unhandled cq type"); - } - - i++; - if (i == cqe_to_process) { - break; - } - - /* - * ibv_next_poll MUST be call after the current WC is fully processed, - * which prevents later calls on ibv_cq_ex from reading the wrong WC. - */ - err = ibv_next_poll(ep->ibv_cq_ex); - } - - if (err && err != ENOENT) { - err = err > 0 ? err : -err; - prov_errno = efa_rdm_ep_get_prov_errno(ep); - efa_base_ep_write_eq_error(&ep->base_ep, err, prov_errno); - } - - if (should_end_poll) - ibv_end_poll(ep->ibv_cq_ex); -} /** @@ -584,10 +323,6 @@ void efa_rdm_ep_progress_internal(struct efa_rdm_ep *ep) assert(ofi_genlock_held(efa_rdm_ep_get_peer_srx_ctx(ep)->lock)); - /* Poll the EFA completion queue. Restrict poll size - * to avoid CQE flooding and thereby blocking user thread. 
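/*
 * Illustrative sketch, not part of this patch: the rdma-core extended-CQ
 * polling pattern that the removed efa_rdm_ep_poll_ibv_cq() above followed --
 * ibv_start_poll() exactly once, ibv_next_poll() only after the current entry
 * has been fully processed, and ibv_end_poll() only if the start succeeded.
 * The handler callback and the max_entries cap are placeholders.
 */
#include <stddef.h>
#include <infiniband/verbs.h>

static void drain_extended_cq(struct ibv_cq_ex *cq, size_t max_entries,
			      void (*handle_current_entry)(struct ibv_cq_ex *cq))
{
	struct ibv_poll_cq_attr attr = { .comp_mask = 0 }; /* EFA requires 0 */
	size_t n = 0;
	int err;

	err = ibv_start_poll(cq, &attr);
	if (err)
		return; /* ENOENT just means the CQ is empty; other values are errors */

	do {
		handle_current_entry(cq); /* e.g. inspect cq->status, ibv_wc_read_opcode(cq) */
		if (++n == max_entries)
			break; /* cap the batch so the caller's thread is not starved */
		err = ibv_next_poll(cq);
	} while (!err);

	ibv_end_poll(cq);
}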
*/ - efa_rdm_ep_poll_ibv_cq(ep, efa_env.efa_cq_read_size); - efa_rdm_ep_progress_post_internal_rx_pkts(ep); efa_rdm_ep_check_peer_backoff_timer(ep); @@ -609,7 +344,7 @@ void efa_rdm_ep_progress_internal(struct efa_rdm_ep *ep) EFA_WARN(FI_LOG_EP_CTRL, "Failed to post HANDSHAKE to peer %ld: %s\n", peer->efa_fiaddr, fi_strerror(-ret)); - efa_base_ep_write_eq_error(&ep->base_ep, FI_EIO, FI_EFA_ERR_PEER_HANDSHAKE); + efa_base_ep_write_eq_error(&ep->base_ep, -ret, FI_EFA_ERR_PEER_HANDSHAKE); return; } diff --git a/prov/efa/src/rdm/efa_rdm_ep_utils.c b/prov/efa/src/rdm/efa_rdm_ep_utils.c index bd77646a797..bc0f2067a13 100644 --- a/prov/efa/src/rdm/efa_rdm_ep_utils.c +++ b/prov/efa/src/rdm/efa_rdm_ep_utils.c @@ -9,7 +9,6 @@ #include #include "efa.h" #include "efa_av.h" -#include "efa_cq.h" #include "efa_rdm_msg.h" #include "efa_rdm_rma.h" #include "efa_rdm_atomic.h" @@ -190,7 +189,7 @@ struct efa_rdm_ope *efa_rdm_ep_alloc_rxe(struct efa_rdm_ep *ep, fi_addr_t addr, break; default: EFA_WARN(FI_LOG_EP_CTRL, - "Unknown operation while %s\n", __func__); + "Unknown operation for RX entry allocation\n"); assert(0 && "Unknown operation"); } @@ -210,9 +209,10 @@ int efa_rdm_ep_post_user_recv_buf(struct efa_rdm_ep *ep, struct efa_rdm_ope *rxe { struct efa_rdm_pke *pkt_entry; struct efa_mr *mr; - int err; + size_t rx_iov_offset = 0; + int err, rx_iov_index = 0; - assert(rxe->iov_count == 1); + assert(rxe->iov_count > 0 && rxe->iov_count <= ep->rx_iov_limit); assert(rxe->iov[0].iov_len >= ep->msg_prefix_size); pkt_entry = (struct efa_rdm_pke *)rxe->iov[0].iov_base; assert(pkt_entry); @@ -229,18 +229,33 @@ int efa_rdm_ep_post_user_recv_buf(struct efa_rdm_ep *ep, struct efa_rdm_ope *rxe pkt_entry->alloc_type = EFA_RDM_PKE_FROM_USER_BUFFER; pkt_entry->flags = EFA_RDM_PKE_IN_USE; pkt_entry->next = NULL; + pkt_entry->ep = ep; /* * The actual receiving buffer size (pkt_size) is - * rxe->total_len - sizeof(struct efa_rdm_pke) + * (total IOV length) - sizeof(struct efa_rdm_pke) * because the first part of user buffer was used to * construct pkt_entry. The actual receiving buffer * posted to device starts from pkt_entry->wiredata. 
*/ - pkt_entry->pkt_size = rxe->iov[0].iov_len - sizeof(struct efa_rdm_pke); - + pkt_entry->pkt_size = ofi_total_iov_len(rxe->iov, rxe->iov_count) - sizeof *pkt_entry; pkt_entry->ope = rxe; rxe->state = EFA_RDM_RXE_MATCHED; + err = ofi_iov_locate(rxe->iov, rxe->iov_count, ep->msg_prefix_size, &rx_iov_index, &rx_iov_offset); + if (OFI_UNLIKELY(err)) { + EFA_WARN(FI_LOG_CQ, "ofi_iov_locate failure: %s (%d)\n", fi_strerror(-err), -err); + return err; + } + assert(rx_iov_index < rxe->iov_count); + assert(rx_iov_offset < rxe->iov[rx_iov_index].iov_len); + + if (rx_iov_index > 0) { + assert(rxe->iov_count - rx_iov_index == 1); + pkt_entry->payload = (char *) rxe->iov[rx_iov_index].iov_base + rx_iov_offset; + pkt_entry->payload_mr = rxe->desc[rx_iov_index]; + pkt_entry->payload_size = ofi_total_iov_len(&rxe->iov[rx_iov_index], rxe->iov_count - rx_iov_index) - rx_iov_offset; + } + err = efa_rdm_pke_recvv(&pkt_entry, 1); if (OFI_UNLIKELY(err)) { efa_rdm_pke_release_rx(pkt_entry); @@ -328,8 +343,7 @@ void efa_rdm_ep_record_tx_op_submitted(struct efa_rdm_ep *ep, struct efa_rdm_pke if (peer) peer->efa_outstanding_tx_ops++; - if (ope) - ope->efa_outstanding_tx_ops++; + ope->efa_outstanding_tx_ops++; #if ENABLE_DEBUG ep->efa_total_posted_tx_ops++; #endif @@ -654,7 +668,7 @@ void efa_rdm_ep_post_handshake_or_queue(struct efa_rdm_ep *ep, struct efa_rdm_pe EFA_WARN(FI_LOG_EP_CTRL, "Failed to post HANDSHAKE to peer %ld: %s\n", peer->efa_fiaddr, fi_strerror(-err)); - efa_base_ep_write_eq_error(&ep->base_ep, FI_EIO, FI_EFA_ERR_PEER_HANDSHAKE); + efa_base_ep_write_eq_error(&ep->base_ep, err, FI_EFA_ERR_PEER_HANDSHAKE); return; } @@ -681,42 +695,3 @@ size_t efa_rdm_ep_get_memory_alignment(struct efa_rdm_ep *ep, enum fi_hmem_iface return memory_alignment; } -/** - * @brief Get the vendor error code for an endpoint's CQ - * - * This function is essentially a wrapper for `ibv_wc_read_vendor_err()`; making - * a best-effort attempt to promote the error code to a proprietary EFA - * provider error code. - * - * @param[in] ep EFA RDM endpoint - * @return EFA-specific error code - * @sa #EFA_PROV_ERRNOS - * - * @todo Currently, this only checks for unresponsive receiver - * (#EFA_IO_COMP_STATUS_LOCAL_ERROR_UNRESP_REMOTE) and attempts to promote it to - * #FI_EFA_ERR_ESTABLISHED_RECV_UNRESP. 
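/*
 * Illustrative sketch, not part of this patch: what the zero-copy receive path
 * above sets up at the verbs level. One scatter/gather entry covers the packet
 * header area (wiredata) and, when pkt_entry->payload has been filled in, a
 * second entry covers the user payload buffer; efa_rdm_pke_recvv(), further
 * down in this patch, populates the provider's work-request vector the same
 * way. The qp, buffer, and lkey arguments here are hypothetical.
 */
#include <stdint.h>
#include <infiniband/verbs.h>

static int post_recv_hdr_plus_payload(struct ibv_qp *qp, uint64_t wr_id,
				      void *hdr, uint32_t hdr_len, uint32_t hdr_lkey,
				      void *payload, uint32_t payload_len,
				      uint32_t payload_lkey)
{
	struct ibv_sge sge[2] = {
		{ .addr = (uintptr_t)hdr, .length = hdr_len, .lkey = hdr_lkey },
		{ .addr = (uintptr_t)payload, .length = payload_len, .lkey = payload_lkey },
	};
	struct ibv_recv_wr wr = {
		.wr_id = wr_id,
		.sg_list = sge,
		.num_sge = payload ? 2 : 1, /* second SGE only when a payload buffer exists */
		.next = NULL,
	};
	struct ibv_recv_wr *bad_wr;

	return ibv_post_recv(qp, &wr, &bad_wr);
}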
This should be expanded to handle other - * RDMA Core error codes (#EFA_IO_COMP_STATUSES) for the sake of more accurate - * error reporting - */ -int efa_rdm_ep_get_prov_errno(struct efa_rdm_ep *ep) { - uint32_t vendor_err = ibv_wc_read_vendor_err(ep->ibv_cq_ex); - struct efa_rdm_pke *pkt_entry = (void *) (uintptr_t) ep->ibv_cq_ex->wr_id; - struct efa_rdm_peer *peer; - - if (OFI_LIKELY(pkt_entry && pkt_entry->addr)) - peer = efa_rdm_ep_get_peer(ep, pkt_entry->addr); - else - return vendor_err; - - switch (vendor_err) { - case EFA_IO_COMP_STATUS_LOCAL_ERROR_UNRESP_REMOTE: { - if (peer->flags & EFA_RDM_PEER_HANDSHAKE_RECEIVED) - vendor_err = FI_EFA_ERR_ESTABLISHED_RECV_UNRESP; - break; - } - default: - break; - } - - return vendor_err; -} diff --git a/prov/efa/src/rdm/efa_rdm_msg.c b/prov/efa/src/rdm/efa_rdm_msg.c index 134ea9e068f..7ff4a116928 100644 --- a/prov/efa/src/rdm/efa_rdm_msg.c +++ b/prov/efa/src/rdm/efa_rdm_msg.c @@ -167,7 +167,6 @@ ssize_t efa_rdm_msg_generic_send(struct efa_rdm_ep *ep, struct efa_rdm_peer *pee txe = efa_rdm_ep_alloc_txe(ep, peer, msg, op, tag, flags); if (OFI_UNLIKELY(!txe)) { err = -FI_EAGAIN; - efa_rdm_ep_progress_internal(ep); goto out; } @@ -196,7 +195,6 @@ ssize_t efa_rdm_msg_generic_send(struct efa_rdm_ep *ep, struct efa_rdm_peer *pee err = efa_rdm_msg_post_rtm(ep, txe, use_p2p); if (OFI_UNLIKELY(err)) { - efa_rdm_ep_progress_internal(ep); efa_rdm_txe_release(txe); peer->next_msg_id--; } @@ -917,9 +915,9 @@ ssize_t efa_rdm_msg_generic_recv(struct efa_rdm_ep *ep, const struct fi_msg *msg efa_perfset_start(ep, perf_efa_recv); EFA_DBG(FI_LOG_EP_DATA, - "%s: iov_len: %lu tag: %lx ignore: %lx op: %x flags: %lx\n", - __func__, ofi_total_iov_len(msg->msg_iov, msg->iov_count), tag, ignore, - op, flags); + "iov_len: %lu tag: %lx ignore: %lx op: %x flags: %lx\n", + ofi_total_iov_len(msg->msg_iov, msg->iov_count), + tag, ignore, op, flags); efa_rdm_tracepoint(recv_begin_msg_context, (size_t) msg->context, (size_t) msg->addr); @@ -933,14 +931,11 @@ ssize_t efa_rdm_msg_generic_recv(struct efa_rdm_ep *ep, const struct fi_msg *msg rxe = efa_rdm_msg_alloc_rxe(ep, msg, op, flags, tag, ignore); if (OFI_UNLIKELY(!rxe)) { ret = -FI_EAGAIN; - efa_rdm_ep_progress_internal(ep); ofi_genlock_unlock(srx_ctx->lock); goto out; } ret = efa_rdm_ep_post_user_recv_buf(ep, rxe, flags); - if (ret == -FI_EAGAIN) - efa_rdm_ep_progress_internal(ep); ofi_genlock_unlock(srx_ctx->lock); } else if (op == ofi_op_tagged) { ret = util_srx_generic_trecv(ep->peer_srx_ep, msg->msg_iov, msg->desc, diff --git a/prov/efa/src/rdm/efa_rdm_msg.h b/prov/efa/src/rdm/efa_rdm_msg.h index f247494304b..ceac0845280 100644 --- a/prov/efa/src/rdm/efa_rdm_msg.h +++ b/prov/efa/src/rdm/efa_rdm_msg.h @@ -1,35 +1,5 @@ -/* - * Copyright (c) Amazon.com, Inc. or its affiliates. - * All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. 
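/*
 * Illustrative sketch, not part of this patch: with the internal
 * efa_rdm_ep_progress_internal() calls removed from the -FI_EAGAIN paths
 * above, back-pressure follows the usual libfabric contract -- the caller
 * drives progress by reading the completion queue and then retries. The
 * endpoint/CQ names are hypothetical, the entry format is assumed to match
 * how the CQ was opened, and a real application would consume any completion
 * returned by fi_cq_read() rather than discarding it.
 */
#include <rdma/fabric.h>
#include <rdma/fi_endpoint.h>
#include <rdma/fi_eq.h>

static ssize_t send_with_retry(struct fid_ep *ep, struct fid_cq *tx_cq,
			       const void *buf, size_t len, void *desc,
			       fi_addr_t dest, void *ctx)
{
	struct fi_cq_data_entry comp;
	ssize_t ret;

	do {
		ret = fi_send(ep, buf, len, desc, dest, ctx);
		if (ret == -FI_EAGAIN)
			(void) fi_cq_read(tx_cq, &comp, 1); /* drive provider progress */
	} while (ret == -FI_EAGAIN);

	return ret;
}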
- * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */ static inline void efa_rdm_msg_construct(struct fi_msg *msg, const struct iovec *iov, void **desc, diff --git a/prov/efa/src/rdm/efa_rdm_ope.c b/prov/efa/src/rdm/efa_rdm_ope.c index 596b3f66c15..e4765b52473 100644 --- a/prov/efa/src/rdm/efa_rdm_ope.c +++ b/prov/efa/src/rdm/efa_rdm_ope.c @@ -57,10 +57,8 @@ void efa_rdm_txe_construct(struct efa_rdm_ope *txe, txe->cq_entry.len = ofi_total_iov_len(txe->iov, txe->iov_count); txe->cq_entry.buf = OFI_LIKELY(txe->cq_entry.len > 0) ? txe->iov[0].iov_base : NULL; - if (ep->msg_prefix_size > 0) { - assert(txe->iov[0].iov_len >= ep->msg_prefix_size); - txe->iov[0].iov_base = (char *)txe->iov[0].iov_base + ep->msg_prefix_size; - txe->iov[0].iov_len -= ep->msg_prefix_size; + if (ep->user_info->mode & FI_MSG_PREFIX) { + ofi_consume_iov_desc(txe->iov, txe->desc, &txe->iov_count, ep->msg_prefix_size); } txe->total_len = ofi_total_iov_len(txe->iov, txe->iov_count); @@ -1206,7 +1204,7 @@ ssize_t efa_rdm_txe_prepare_local_read_pkt_entry(struct efa_rdm_ope *txe) struct efa_rdm_pke *pkt_entry_copy; assert(txe->type == EFA_RDM_TXE); - assert(txe->rma_iov_count == 1); + assert(txe->rma_iov_count > 0 && txe->rma_iov_count <= efa_rdm_ep_domain(txe->ep)->info->tx_attr->rma_iov_limit); pkt_entry = txe->local_read_pkt_entry; if (pkt_entry->mr && !(txe->ep->sendrecv_in_order_aligned_128_bytes)) @@ -1288,11 +1286,11 @@ int efa_rdm_ope_post_read(struct efa_rdm_ope *ope) struct efa_rdm_ep *ep; struct efa_rdm_pke *pkt_entry; - assert(ope->iov_count > 0); - assert(ope->rma_iov_count > 0); - ep = ope->ep; + assert(ope->iov_count > 0 && ope->iov_count <= efa_rdm_ep_domain(ep)->info->tx_attr->iov_limit); + assert(ope->rma_iov_count > 0 && ope->rma_iov_count <= efa_rdm_ep_domain(ep)->info->tx_attr->rma_iov_limit); + if (ope->bytes_read_total_len == 0) { /* According to libfabric document @@ -1433,9 +1431,11 @@ int efa_rdm_ope_post_remote_write(struct efa_rdm_ope *ope) struct efa_rdm_ep *ep; struct efa_rdm_pke *pkt_entry; - assert(ope->iov_count > 0); - assert(ope->rma_iov_count > 0); ep = ope->ep; + + assert(ope->iov_count > 0 && ope->iov_count <= efa_rdm_ep_domain(ep)->info->tx_attr->iov_limit); + assert(ope->rma_iov_count > 0 && ope->rma_iov_count <= efa_rdm_ep_domain(ep)->info->tx_attr->rma_iov_limit); + if (ope->bytes_write_total_len == 0) { /* According to libfabric document * https://ofiwg.github.io/libfabric/main/man/fi_rma.3.html @@ -1665,7 +1665,12 @@ int efa_rdm_rxe_post_local_read_or_queue(struct efa_rdm_ope *rxe, } txe->local_read_pkt_entry = pkt_entry; - return efa_rdm_ope_post_remote_read_or_queue(txe); + err = efa_rdm_ope_post_remote_read_or_queue(txe); + /* The rx pkts are held until the local 
read completes */ + if (txe->local_read_pkt_entry->alloc_type == EFA_RDM_PKE_FROM_EFA_RX_POOL && !err) + txe->ep->efa_rx_pkts_held++; + + return err; } /** diff --git a/prov/efa/src/rdm/efa_rdm_ope.h b/prov/efa/src/rdm/efa_rdm_ope.h index 6d554ac2f91..a9c2119c4d6 100644 --- a/prov/efa/src/rdm/efa_rdm_ope.h +++ b/prov/efa/src/rdm/efa_rdm_ope.h @@ -1,35 +1,5 @@ -/* - * Copyright (c) Amazon.com, Inc. or its affiliates. - * All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */ #ifndef _EFA_RDM_OPE_H #define _EFA_RDM_OPE_H diff --git a/prov/efa/src/rdm/efa_rdm_peer.c b/prov/efa/src/rdm/efa_rdm_peer.c index 34b810580ca..4a4d526ce4e 100644 --- a/prov/efa/src/rdm/efa_rdm_peer.c +++ b/prov/efa/src/rdm/efa_rdm_peer.c @@ -1,35 +1,5 @@ -/* - * Copyright (c) 2019-2023 Amazon.com, Inc. or its affiliates. - * All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */ #include "efa.h" #include "efa_av.h" @@ -54,7 +24,6 @@ void efa_rdm_peer_construct(struct efa_rdm_peer *peer, struct efa_rdm_ep *ep, st peer->efa_fiaddr = conn->fi_addr; peer->is_self = efa_is_same_addr(&ep->base_ep.src_addr, conn->ep_addr); peer->host_id = peer->is_self ? ep->host_id : 0; /* Peer host id is exchanged via handshake */ - peer->num_read_msg_in_flight = 0; peer->num_runt_bytes_in_flight = 0; ofi_recvwin_buf_alloc(&peer->robuf, efa_env.recvwin_size); dlist_init(&peer->outstanding_tx_pkts); @@ -302,7 +271,8 @@ int efa_rdm_peer_select_readbase_rtm(struct efa_rdm_peer *peer, int op = ope->op; assert(op == ofi_op_tagged || op == ofi_op_msg); - if (peer->num_read_msg_in_flight == 0 && + + if (efa_rdm_ep_domain(ep)->num_read_msg_in_flight == 0 && efa_rdm_peer_get_runt_size(peer, ep, ope) > 0 && !(ope->fi_flags & FI_DELIVERY_COMPLETE)) { return (op == ofi_op_tagged) ? EFA_RDM_RUNTREAD_TAGRTM_PKT diff --git a/prov/efa/src/rdm/efa_rdm_peer.h b/prov/efa/src/rdm/efa_rdm_peer.h index 35815074d84..26d07298a08 100644 --- a/prov/efa/src/rdm/efa_rdm_peer.h +++ b/prov/efa/src/rdm/efa_rdm_peer.h @@ -58,11 +58,6 @@ struct efa_rdm_peer { * @details this value is capped by efa_env.efa_runt_size */ int64_t num_runt_bytes_in_flight; - - /** - * @brief number of messages that are using read based protocol - */ - int64_t num_read_msg_in_flight; }; /** diff --git a/prov/efa/src/rdm/efa_rdm_pke.c b/prov/efa/src/rdm/efa_rdm_pke.c index aac99ef11de..1abce84fe59 100644 --- a/prov/efa/src/rdm/efa_rdm_pke.c +++ b/prov/efa/src/rdm/efa_rdm_pke.c @@ -1,35 +1,5 @@ -/* - * Copyright (c) Amazon.com, Inc. or its affiliates. - * All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. 
All rights reserved. */ #include #include @@ -475,11 +445,10 @@ ssize_t efa_rdm_pke_sendv(struct efa_rdm_pke **pkt_entry_vec, * This function posts one read request. * * @param[in] pkt_entry read_entry that has information of the read request. - * @param[in,out] ep endpoint * @param[in] local_buf local buffer, where data will be copied to. * @param[in] len read size. * @param[in] desc memory descriptor of local buffer. - * @param[in] remote_buff remote buffer, where data will be read from. + * @param[in] remote_buf remote buffer, where data will be read from. * @param[in] remote_key memory key of remote buffer. * @return On success, return 0 * On failure, return a negative error code. @@ -536,12 +505,7 @@ int efa_rdm_pke_read(struct efa_rdm_pke *pkt_entry, * * This function posts one write request. * - * @param[in] pkt_entry write_entry that has information of the write request. - * @param[in] local_buf local buffer, where data will be copied from. - * @param[in] len write size. - * @param[in] desc memory descriptor of local buffer. - * @param[in] remote_buff remote buffer, where data will be written to. - * @param[in] remote_key memory key of remote buffer. + * @param[in] pkt_entry write_entry that has information of the write request. * @return On success, return 0 * On failure, return a negative error code. */ @@ -622,13 +586,10 @@ int efa_rdm_pke_write(struct efa_rdm_pke *pkt_entry) /** * @brief Post receive requests to EFA device * - * @param[in] ep EFA rdm endpoint - * @param[in] pkt_entry packet entries that contains information of receive buffer - * @param[in] desc Memory registration key - * @param[in] flags flags to be applied to the receive operation + * @param[in] pke_vec packet entries that contains information of receive buffer + * @param[in] pke_cnt Number of packet entries to post receive requests for * @return 0 on success * On error, a negative value corresponding to fabric errno - * */ ssize_t efa_rdm_pke_recvv(struct efa_rdm_pke **pke_vec, int pke_cnt) @@ -644,13 +605,20 @@ ssize_t efa_rdm_pke_recvv(struct efa_rdm_pke **pke_vec, for (i = 0; i < pke_cnt; ++i) { ep->base_ep.efa_recv_wr_vec[i].wr.wr_id = (uintptr_t)pke_vec[i]; - ep->base_ep.efa_recv_wr_vec[i].wr.num_sge = 1; /* Always post one iov/SGE */ + ep->base_ep.efa_recv_wr_vec[i].wr.num_sge = 1; ep->base_ep.efa_recv_wr_vec[i].wr.sg_list = ep->base_ep.efa_recv_wr_vec[i].sge; assert(pke_vec[i]->pkt_size > 0); - ep->base_ep.efa_recv_wr_vec[i].wr.sg_list[0].length = pke_vec[i]->pkt_size; + ep->base_ep.efa_recv_wr_vec[i].wr.sg_list[0].length = pke_vec[i]->pkt_size - pke_vec[i]->payload_size; ep->base_ep.efa_recv_wr_vec[i].wr.sg_list[0].lkey = ((struct efa_mr *) pke_vec[i]->mr)->ibv_mr->lkey; ep->base_ep.efa_recv_wr_vec[i].wr.sg_list[0].addr = (uintptr_t)pke_vec[i]->wiredata; ep->base_ep.efa_recv_wr_vec[i].wr.next = NULL; + + if (pke_vec[i]->payload) { + ep->base_ep.efa_recv_wr_vec[i].wr.num_sge = 2; + ep->base_ep.efa_recv_wr_vec[i].wr.sg_list[1].addr = (uintptr_t) pke_vec[i]->payload; + ep->base_ep.efa_recv_wr_vec[i].wr.sg_list[1].length = pke_vec[i]->payload_size; + ep->base_ep.efa_recv_wr_vec[i].wr.sg_list[1].lkey = ((struct efa_mr *) pke_vec[i]->payload_mr)->ibv_mr->lkey; + } if (i > 0) ep->base_ep.efa_recv_wr_vec[i-1].wr.next = &ep->base_ep.efa_recv_wr_vec[i].wr; #if HAVE_LTTNG diff --git a/prov/efa/src/rdm/efa_rdm_pke_cmd.c b/prov/efa/src/rdm/efa_rdm_pke_cmd.c index b6d80e90ce8..0190dafa71c 100644 --- a/prov/efa/src/rdm/efa_rdm_pke_cmd.c +++ b/prov/efa/src/rdm/efa_rdm_pke_cmd.c @@ -374,16 +374,17 @@ void 
efa_rdm_pke_handle_data_copied(struct efa_rdm_pke *pkt_entry) * For other types of error, an error EQ entry is written. * * @param[in] pkt_entry pkt entry - * @param[in] err libfabric error code * @param[in] prov_errno provider specific error code */ -void efa_rdm_pke_handle_tx_error(struct efa_rdm_pke *pkt_entry, int err, int prov_errno) +void efa_rdm_pke_handle_tx_error(struct efa_rdm_pke *pkt_entry, int prov_errno) { struct efa_rdm_peer *peer; struct efa_rdm_ope *txe; struct efa_rdm_ope *rxe; struct efa_rdm_ep *ep; + int err = to_fi_errno(prov_errno); + assert(pkt_entry->alloc_type == EFA_RDM_PKE_FROM_EFA_TX_POOL); EFA_DBG(FI_LOG_CQ, "Packet send error: %s (%d)\n", @@ -459,7 +460,7 @@ void efa_rdm_pke_handle_tx_error(struct efa_rdm_pke *pkt_entry, int err, int pro */ if (!(txe->internal_flags & EFA_RDM_TXE_WRITTEN_RNR_CQ_ERR_ENTRY)) { txe->internal_flags |= EFA_RDM_TXE_WRITTEN_RNR_CQ_ERR_ENTRY; - efa_rdm_txe_handle_error(pkt_entry->ope, FI_ENORX, prov_errno); + efa_rdm_txe_handle_error(pkt_entry->ope, err, prov_errno); } efa_rdm_pke_release_tx(pkt_entry); @@ -504,9 +505,7 @@ void efa_rdm_pke_handle_tx_error(struct efa_rdm_pke *pkt_entry, int err, int pro } break; default: - EFA_WARN(FI_LOG_CQ, - "%s unknown x_entry type %d\n", - __func__, pkt_entry->ope->type); + EFA_WARN(FI_LOG_CQ, "Unknown x_entry type: %d\n", pkt_entry->ope->type); assert(0 && "unknown x_entry state"); efa_base_ep_write_eq_error(&ep->base_ep, err, prov_errno); efa_rdm_pke_release_tx(pkt_entry); @@ -655,26 +654,51 @@ void efa_rdm_pke_handle_send_completion(struct efa_rdm_pke *pkt_entry) * This function will write error cq or eq entry, then release the packet entry. * * @param[in] pkt_entry pkt entry - * @param[in] err libfabric error code * @param[in] prov_errno provider specific error code */ -void efa_rdm_pke_handle_rx_error(struct efa_rdm_pke *pkt_entry, int err, int prov_errno) +void efa_rdm_pke_handle_rx_error(struct efa_rdm_pke *pkt_entry, int prov_errno) { struct efa_rdm_ep *ep; + int err = to_fi_errno(prov_errno); ep = pkt_entry->ep; + /* + * we should still decrement the efa_rx_pkts_posted + * when getting a failed rx completion. + */ + assert(ep->efa_rx_pkts_posted > 0); + ep->efa_rx_pkts_posted--; EFA_DBG(FI_LOG_CQ, "Packet receive error: %s (%d)\n", efa_strerror(prov_errno), prov_errno); + /* + * pkes posted by efa_rdm_ep_bulk_post_internal_rx_pkts + * are not associated with ope before being progressed + */ + if (!pkt_entry->ope) { + char ep_addr_str[OFI_ADDRSTRLEN]; + size_t buflen=0; + + memset(&ep_addr_str, 0, sizeof(ep_addr_str)); + buflen = sizeof(ep_addr_str); + efa_rdm_ep_raw_addr_str(ep, ep_addr_str, &buflen); + EFA_WARN(FI_LOG_CQ, + "Packet receive error from non TX/RX packet. 
Our address: %s\n", + ep_addr_str); + + efa_base_ep_write_eq_error(&ep->base_ep, err, prov_errno); + efa_rdm_pke_release_rx(pkt_entry); + return; + } + if (pkt_entry->ope->type == EFA_RDM_TXE) { efa_rdm_txe_handle_error(pkt_entry->ope, err, prov_errno); } else if (pkt_entry->ope->type == EFA_RDM_RXE) { efa_rdm_rxe_handle_error(pkt_entry->ope, err, prov_errno); } else { - EFA_WARN(FI_LOG_CQ, - "%s unknown x_entry type %d\n", - __func__, pkt_entry->ope->type); + EFA_WARN(FI_LOG_CQ, "unknown RDM operation entry type encountered: %d\n", + pkt_entry->ope->type); assert(0 && "unknown x_entry state"); efa_base_ep_write_eq_error(&ep->base_ep, err, prov_errno); } @@ -721,7 +745,7 @@ fi_addr_t efa_rdm_pke_insert_addr(struct efa_rdm_pke *pkt_entry, void *raw_addr) ret = efa_av_insert_one(ep->base_ep.av, (struct efa_ep_addr *)raw_addr, &rdm_addr, 0, NULL, false); if (OFI_UNLIKELY(ret != 0)) { - efa_base_ep_write_eq_error(&ep->base_ep, FI_EINVAL, FI_EFA_ERR_AV_INSERT); + efa_base_ep_write_eq_error(&ep->base_ep, ret, FI_EFA_ERR_AV_INSERT); return -1; } @@ -886,7 +910,7 @@ void efa_rdm_pke_handle_recv_completion(struct efa_rdm_pke *pkt_entry) "Peer %d is requesting feature %d, which this EP does not support.\n", (int)pkt_entry->addr, base_hdr->type); - assert(0 && "invalid REQ packe type"); + assert(0 && "invalid REQ packet type"); efa_base_ep_write_eq_error(&ep->base_ep, FI_EIO, FI_EFA_ERR_INVALID_PKT_TYPE); efa_rdm_pke_release_rx(pkt_entry); return; diff --git a/prov/efa/src/rdm/efa_rdm_pke_cmd.h b/prov/efa/src/rdm/efa_rdm_pke_cmd.h index 5f4597cea1a..6ea4ea1238e 100644 --- a/prov/efa/src/rdm/efa_rdm_pke_cmd.h +++ b/prov/efa/src/rdm/efa_rdm_pke_cmd.h @@ -1,35 +1,5 @@ -/* - * Copyright (c) 2019-2022 Amazon.com, Inc. or its affiliates. - * All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. 
*/ #ifndef _efa_rdm_pke_CMD_H #define _efa_rdm_pke_CMD_H @@ -50,13 +20,11 @@ fi_addr_t efa_rdm_pke_determine_addr(struct efa_rdm_pke *pkt_entry); void efa_rdm_pke_handle_data_copied(struct efa_rdm_pke *pkt_entry); -void efa_rdm_pke_handle_tx_error(struct efa_rdm_pke *pkt_entry, - int err, int prov_errno); +void efa_rdm_pke_handle_tx_error(struct efa_rdm_pke *pkt_entry, int prov_errno); void efa_rdm_pke_handle_send_completion(struct efa_rdm_pke *pkt_entry); -void efa_rdm_pke_handle_rx_error(struct efa_rdm_pke *pkt_entry, - int err, int prov_errno); +void efa_rdm_pke_handle_rx_error(struct efa_rdm_pke *pkt_entry, int prov_errno); void efa_rdm_pke_handle_recv_completion(struct efa_rdm_pke *pkt_entry); diff --git a/prov/efa/src/rdm/efa_rdm_pke_nonreq.c b/prov/efa/src/rdm/efa_rdm_pke_nonreq.c index a3c85af56c8..e5d735eb28d 100644 --- a/prov/efa/src/rdm/efa_rdm_pke_nonreq.c +++ b/prov/efa/src/rdm/efa_rdm_pke_nonreq.c @@ -497,6 +497,11 @@ void efa_rdm_pke_handle_rma_read_completion(struct efa_rdm_pke *context_pkt_entr if (txe->addr == FI_ADDR_NOTAVAIL) { data_pkt_entry = txe->local_read_pkt_entry; assert(data_pkt_entry->payload_size > 0); + /* We were using a held rx pkt to post local read */ + if (data_pkt_entry->alloc_type == EFA_RDM_PKE_FROM_EFA_RX_POOL) { + assert(txe->ep->efa_rx_pkts_held > 0); + txe->ep->efa_rx_pkts_held--; + } efa_rdm_pke_handle_data_copied(data_pkt_entry); } else { assert(txe && txe->cq_entry.flags & FI_READ); @@ -641,11 +646,8 @@ void efa_rdm_pke_handle_eor_recv(struct efa_rdm_pke *pkt_entry) { struct efa_rdm_eor_hdr *eor_hdr; struct efa_rdm_ope *txe; - struct efa_rdm_peer *peer; - peer = efa_rdm_ep_get_peer(pkt_entry->ep, pkt_entry->addr); - assert(peer); - peer->num_read_msg_in_flight -= 1; + efa_rdm_ep_domain(pkt_entry->ep)->num_read_msg_in_flight -= 1; eor_hdr = (struct efa_rdm_eor_hdr *)pkt_entry->wiredata; @@ -669,11 +671,8 @@ void efa_rdm_pke_handle_read_nack_recv(struct efa_rdm_pke *pkt_entry) { struct efa_rdm_read_nack_hdr *nack_hdr; struct efa_rdm_ope *txe; - struct efa_rdm_peer *peer; - peer = efa_rdm_ep_get_peer(pkt_entry->ep, pkt_entry->addr); - assert(peer); - peer->num_read_msg_in_flight -= 1; + efa_rdm_ep_domain(pkt_entry->ep)->num_read_msg_in_flight -= 1; nack_hdr = (struct efa_rdm_read_nack_hdr *) pkt_entry->wiredata; @@ -801,7 +800,7 @@ void efa_rdm_pke_handle_atomrsp_recv(struct efa_rdm_pke *pkt_entry) txe->atomic_ex.resp_iov_count, atomrsp_pkt->data, atomrsp_hdr->seg_length); if (OFI_UNLIKELY(ret < 0)) { - efa_base_ep_write_eq_error(&pkt_entry->ep->base_ep, FI_EMSGSIZE, EFA_IO_COMP_STATUS_LOCAL_ERROR_BAD_LENGTH); + efa_base_ep_write_eq_error(&pkt_entry->ep->base_ep, -ret, EFA_IO_COMP_STATUS_LOCAL_ERROR_BAD_LENGTH); return; } diff --git a/prov/efa/src/rdm/efa_rdm_pke_req.c b/prov/efa/src/rdm/efa_rdm_pke_req.c index 3d2c2e7e0e5..53c29db815f 100644 --- a/prov/efa/src/rdm/efa_rdm_pke_req.c +++ b/prov/efa/src/rdm/efa_rdm_pke_req.c @@ -1,35 +1,5 @@ -/* - * Copyright (c) Amazon.com, Inc. or its affiliates. - * All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. 
You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */ #include #include @@ -200,7 +170,7 @@ uint32_t *efa_rdm_pke_get_req_connid_ptr(struct efa_rdm_pke *pkt_entry) * @return * an integer */ -int64_t efa_rdm_pke_get_req_cq_data(struct efa_rdm_pke *pkt_entry) +uint64_t efa_rdm_pke_get_req_cq_data(struct efa_rdm_pke *pkt_entry) { char *opt_hdr; struct efa_rdm_base_hdr *base_hdr; @@ -305,7 +275,7 @@ size_t efa_rdm_pke_get_req_hdr_size(struct efa_rdm_pke *pkt_entry) opt_hdr += sizeof(struct efa_rdm_req_opt_raw_addr_hdr) + raw_addr_hdr->addr_len; } - if (base_hdr->flags & EFA_RDM_REQ_OPT_CQ_DATA_HDR) + if (base_hdr->flags & EFA_RDM_REQ_OPT_CQ_DATA_HDR || pkt_entry->ep->use_zcpy_rx) opt_hdr += sizeof(struct efa_rdm_req_opt_cq_data_hdr); if (base_hdr->flags & EFA_RDM_PKT_CONNID_HDR) { diff --git a/prov/efa/src/rdm/efa_rdm_pke_req.h b/prov/efa/src/rdm/efa_rdm_pke_req.h index 33e5e857afe..070da30e384 100644 --- a/prov/efa/src/rdm/efa_rdm_pke_req.h +++ b/prov/efa/src/rdm/efa_rdm_pke_req.h @@ -1,35 +1,5 @@ -/* - * Copyright (c) Amazon.com, Inc. or its affiliates. - * All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */ #ifndef _EFA_RDM_PKE_REQ_H #define _EFA_RDM_PKE_REQ_H @@ -47,7 +17,7 @@ void efa_rdm_pke_init_req_hdr_common(struct efa_rdm_pke *pkt_entry, void *efa_rdm_pke_get_req_raw_addr(struct efa_rdm_pke *pkt_entry); -int64_t efa_rdm_pke_get_req_cq_data(struct efa_rdm_pke *pkt_entry); +uint64_t efa_rdm_pke_get_req_cq_data(struct efa_rdm_pke *pkt_entry); uint32_t *efa_rdm_pke_get_req_connid_ptr(struct efa_rdm_pke *pkt_entry); diff --git a/prov/efa/src/rdm/efa_rdm_pke_rta.c b/prov/efa/src/rdm/efa_rdm_pke_rta.c index 381caa99c6b..3fe95ab52f3 100644 --- a/prov/efa/src/rdm/efa_rdm_pke_rta.c +++ b/prov/efa/src/rdm/efa_rdm_pke_rta.c @@ -1,35 +1,6 @@ -/* - * Copyright (c) Amazon.com, Inc. or its affiliates. - * All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */ + #include "ofi_iov.h" #include "ofi_proto.h" #include "ofi_atomic.h" @@ -47,7 +18,7 @@ /** * @brief initialize the common elements of WRITE_RTA, FETCH_RTA and COMPARE_RTA - * + * * @param[in,out] pkt_entry packet entry * @param[in] pkt_type packet type. possible values are: * EFA_RDM_WRITE_RTA_PKT, EFA_RDM_FETCH_RTA_PKT and @@ -55,7 +26,7 @@ * @param[in] txe TX entry that has information of the * atomic operation * @retunrns - * + * * 0 on success. * negative libfabric error code on error. Possible error include: * -FI_ETRUNC user buffer is larger than maxium atomic message size @@ -107,13 +78,13 @@ ssize_t efa_rdm_pke_init_rta_common(struct efa_rdm_pke *pkt_entry, /** * @brief allocate a RX entry to process an incoming RTA packet - * + * * @param[in] pkt_entry received RTA packet * @param[in] op libfabric operation type. 
Possible values are: * ofi_op_atomic, ofi_op_atomic_fetch, ofi_op_atomic_compare * @return * pointer to efa_rdm_ope on success. - * NULL when rx entry pool is exhausted. + * NULL when rx entry pool is exhausted. */ struct efa_rdm_ope *efa_rdm_pke_alloc_rta_rxe(struct efa_rdm_pke *pkt_entry, int op) { @@ -165,12 +136,12 @@ struct efa_rdm_ope *efa_rdm_pke_alloc_rta_rxe(struct efa_rdm_pke *pkt_entry, int /** * @brief initialize a WRITE_RTA packet - * + * * @param[in,out] pkt_entry packet entry * @param[in] txe TX entry that has information of the * atomic operation * @returns - * + * * 0 on success. * negative libfabric error code on error. Possible error include: * -FI_ETRUNC user buffer is larger than maxium atomic message size @@ -184,7 +155,7 @@ ssize_t efa_rdm_pke_init_write_rta(struct efa_rdm_pke *pkt_entry, /** * @brief handle the send completion event of a WRITE RTA packet - * + * * @param[in,out] pkt_entry packet entry */ void efa_rdm_pke_handle_write_rta_send_completion(struct efa_rdm_pke *pkt_entry) @@ -219,7 +190,7 @@ int efa_rdm_write_atomic_hmem(struct efa_mr *efa_mr, struct iovec *dst, char *da /** * @brief process a received WRITE RTA packet - * + * * @param[in] pkt_entry received WRITE RTA packet */ int efa_rdm_pke_proc_write_rta(struct efa_rdm_pke *pkt_entry) @@ -278,7 +249,7 @@ int efa_rdm_pke_proc_write_rta(struct efa_rdm_pke *pkt_entry) * @param[in] txe TX entry that has information of the * atomic operation * @returns - * + * * 0 on success. * negative libfabric error code on error. Possible error include: * -FI_ETRUNC user buffer is larger than maxium atomic message size @@ -298,7 +269,7 @@ ssize_t efa_rdm_pke_init_dc_write_rta(struct efa_rdm_pke *pkt_entry, /** * @brief process a received DC WRITE RTA packet - * + * * @param[in] pkt_entry received DC WRITE RTA packet */ int efa_rdm_pke_proc_dc_write_rta(struct efa_rdm_pke *pkt_entry) @@ -345,7 +316,7 @@ int efa_rdm_pke_proc_dc_write_rta(struct efa_rdm_pke *pkt_entry) * @param[in] txe TX entry that has information of the * atomic operation * @returns - * + * * 0 on success. * negative libfabric error code on error. Possible error include: * -FI_ETRUNC user buffer is larger than maxium atomic message size @@ -390,7 +361,7 @@ int efa_rdm_fetch_atomic_hmem(struct efa_mr *efa_mr, struct iovec *dst, char *da /** * @brief process a received FETCH RTA packet - * + * * @param[in] pkt_entry received FETCH RTA packet */ int efa_rdm_pke_proc_fetch_rta(struct efa_rdm_pke *pkt_entry) @@ -459,7 +430,7 @@ int efa_rdm_pke_proc_fetch_rta(struct efa_rdm_pke *pkt_entry) * @param[in] txe TX entry that has information of the * atomic operation * @returns - * + * * 0 on success. * negative libfabric error code on error. 
Possible error include: * -FI_ETRUNC user buffer is larger than maxium atomic message size @@ -521,7 +492,7 @@ int efa_rdm_compare_atomic_hmem(struct efa_mr *efa_mr, struct iovec *dst, char * /** * @brief process a received COMPARE RTA packet - * + * * @param[in] pkt_entry packet entry */ int efa_rdm_pke_proc_compare_rta(struct efa_rdm_pke *pkt_entry) @@ -549,7 +520,7 @@ int efa_rdm_pke_proc_compare_rta(struct efa_rdm_pke *pkt_entry) dt = rxe->atomic_hdr.datatype; dtsize = ofi_datatype_size(rxe->atomic_hdr.datatype); if (OFI_UNLIKELY(!dtsize)) { - efa_base_ep_write_eq_error(&ep->base_ep, FI_EINVAL, FI_EFA_ERR_INVALID_DATATYPE); + efa_base_ep_write_eq_error(&ep->base_ep, errno, FI_EFA_ERR_INVALID_DATATYPE); efa_rdm_rxe_release(rxe); efa_rdm_pke_release_rx(pkt_entry); return -errno; @@ -580,7 +551,7 @@ int efa_rdm_pke_proc_compare_rta(struct efa_rdm_pke *pkt_entry) err = efa_rdm_ope_post_send_or_queue(rxe, EFA_RDM_ATOMRSP_PKT); if (OFI_UNLIKELY(err)) { - efa_base_ep_write_eq_error(&ep->base_ep, FI_EIO, FI_EFA_ERR_PKT_POST); + efa_base_ep_write_eq_error(&ep->base_ep, err, FI_EFA_ERR_PKT_POST); ofi_buf_free(rxe->atomrsp_data); efa_rdm_rxe_release(rxe); efa_rdm_pke_release_rx(pkt_entry); @@ -589,4 +560,4 @@ int efa_rdm_pke_proc_compare_rta(struct efa_rdm_pke *pkt_entry) efa_rdm_pke_release_rx(pkt_entry); return 0; -} \ No newline at end of file +} diff --git a/prov/efa/src/rdm/efa_rdm_pke_rta.h b/prov/efa/src/rdm/efa_rdm_pke_rta.h index 3bdf9be46b6..ad4e928bc4e 100644 --- a/prov/efa/src/rdm/efa_rdm_pke_rta.h +++ b/prov/efa/src/rdm/efa_rdm_pke_rta.h @@ -1,35 +1,5 @@ -/* - * Copyright (c) 2019 Amazon.com, Inc. or its affiliates. - * All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */ #ifndef EFA_RDM_PKE_RTA_H #define EFA_RDM_PKE_RTA_H diff --git a/prov/efa/src/rdm/efa_rdm_pke_rtm.c b/prov/efa/src/rdm/efa_rdm_pke_rtm.c index 2d86807aad7..99c242ac423 100644 --- a/prov/efa/src/rdm/efa_rdm_pke_rtm.c +++ b/prov/efa/src/rdm/efa_rdm_pke_rtm.c @@ -1,35 +1,5 @@ -/* - * Copyright (c) Amazon.com, Inc. or its affiliates. - * All rights reserved. 
- * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */ #include "ofi_iov.h" #include "ofi_proto.h" @@ -505,7 +475,7 @@ void efa_rdm_pke_handle_rtm_rta_recv(struct efa_rdm_pke *pkt_entry) "Invalid msg_id: %" PRIu32 " robuf->exp_msg_id: %" PRIu32 "\n", msg_id, peer->robuf.exp_msg_id); - efa_base_ep_write_eq_error(&ep->base_ep, FI_EIO, FI_EFA_ERR_PKT_ALREADY_PROCESSED); + efa_base_ep_write_eq_error(&ep->base_ep, ret, FI_EFA_ERR_PKT_ALREADY_PROCESSED); efa_rdm_pke_release_rx(pkt_entry); return; } @@ -519,7 +489,7 @@ void efa_rdm_pke_handle_rtm_rta_recv(struct efa_rdm_pke *pkt_entry) EFA_WARN(FI_LOG_EP_CTRL, "Unknown error %d processing REQ packet msg_id: %" PRIu32 "\n", ret, msg_id); - efa_base_ep_write_eq_error(&ep->base_ep, FI_EIO, FI_EFA_ERR_OTHER); + efa_base_ep_write_eq_error(&ep->base_ep, ret, FI_EFA_ERR_OTHER); return; } @@ -700,7 +670,6 @@ ssize_t efa_rdm_pke_proc_matched_eager_rtm(struct efa_rdm_pke *pkt_entry) */ rxe->cq_entry.len = 0; } else { - assert(rxe->cq_entry.buf == pkt_entry->wiredata - sizeof(struct efa_rdm_pke)); rxe->cq_entry.len = pkt_entry->pkt_size + sizeof(struct efa_rdm_pke); } @@ -1202,11 +1171,7 @@ ssize_t efa_rdm_pke_init_longread_tagrtm(struct efa_rdm_pke *pkt_entry, */ void efa_rdm_pke_handle_longread_rtm_sent(struct efa_rdm_pke *pkt_entry) { - struct efa_rdm_peer *peer; - - peer = efa_rdm_ep_get_peer(pkt_entry->ep, pkt_entry->addr); - assert(peer); - peer->num_read_msg_in_flight += 1; + efa_rdm_ep_domain(pkt_entry->ep)->num_read_msg_in_flight += 1; } /** @@ -1387,7 +1352,7 @@ void efa_rdm_pke_handle_runtread_rtm_sent(struct efa_rdm_pke *pkt_entry) if (efa_rdm_pke_get_runtread_rtm_base_hdr(pkt_entry)->seg_offset == 0 && txe->total_len > txe->bytes_runt) - peer->num_read_msg_in_flight += 1; + efa_rdm_ep_domain(pkt_entry->ep)->num_read_msg_in_flight += 1; } /** diff --git a/prov/efa/src/rdm/efa_rdm_pke_rtm.h b/prov/efa/src/rdm/efa_rdm_pke_rtm.h index 854b44708b0..4c55aad9ec0 100644 --- a/prov/efa/src/rdm/efa_rdm_pke_rtm.h +++ b/prov/efa/src/rdm/efa_rdm_pke_rtm.h @@ -1,35 +1,5 @@ -/* - * Copyright (c) Amazon.com, Inc. or its affiliates. - * All rights reserved. 
- * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */ #ifndef EFA_RDM_PKE_RTM_H #define EFA_RDM_PKE_RTM_H diff --git a/prov/efa/src/rdm/efa_rdm_pke_rtr.c b/prov/efa/src/rdm/efa_rdm_pke_rtr.c index 13e95fadca2..2ad5718865d 100644 --- a/prov/efa/src/rdm/efa_rdm_pke_rtr.c +++ b/prov/efa/src/rdm/efa_rdm_pke_rtr.c @@ -1,35 +1,6 @@ -/* - * Copyright (c) Amazon.com, Inc. or its affiliates. - * All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. 
*/ + #include "ofi_iov.h" #include "ofi_proto.h" #include "efa_errno.h" @@ -71,9 +42,9 @@ void efa_rdm_pke_init_rtr_common(struct efa_rdm_pke *pkt_entry, /** * @brief initialize a EFA_RDM_SHORT_RTR_PKT - * + * * @param[in] pkt_entry packet entry to be initialized - * + * */ ssize_t efa_rdm_pke_init_short_rtr(struct efa_rdm_pke *pkt_entry, struct efa_rdm_ope *txe) @@ -97,7 +68,7 @@ ssize_t efa_rdm_pke_init_longcts_rtr(struct efa_rdm_pke *pkt_entry, /** * @brief process an incoming RTR packet - * + * * This functions works for both EFA_RDM_SHORT_RTR_PKT and EFA_RDM_LONGCTS_RTR_PKT * @param[in] pkt_entry packet entry */ @@ -131,7 +102,7 @@ void efa_rdm_pke_handle_rtr_recv(struct efa_rdm_pke *pkt_entry) FI_REMOTE_READ, rxe->iov, rxe->desc); if (OFI_UNLIKELY(err)) { EFA_WARN(FI_LOG_CQ, "RMA address verification failed!\n"); - efa_base_ep_write_eq_error(&ep->base_ep, FI_EINVAL, FI_EFA_ERR_RMA_ADDR); + efa_base_ep_write_eq_error(&ep->base_ep, err, FI_EFA_ERR_RMA_ADDR); efa_rdm_rxe_release(rxe); efa_rdm_pke_release_rx(pkt_entry); return; @@ -145,11 +116,11 @@ void efa_rdm_pke_handle_rtr_recv(struct efa_rdm_pke *pkt_entry) err = efa_rdm_ope_post_send_or_queue(rxe, EFA_RDM_READRSP_PKT); if (OFI_UNLIKELY(err)) { EFA_WARN(FI_LOG_CQ, "Posting of readrsp packet failed! err=%ld\n", err); - efa_base_ep_write_eq_error(&ep->base_ep, FI_EIO, FI_EFA_ERR_PKT_POST); + efa_base_ep_write_eq_error(&ep->base_ep, err, FI_EFA_ERR_PKT_POST); efa_rdm_rxe_release(rxe); efa_rdm_pke_release_rx(pkt_entry); return; } efa_rdm_pke_release_rx(pkt_entry); -} \ No newline at end of file +} diff --git a/prov/efa/src/rdm/efa_rdm_pke_rtr.h b/prov/efa/src/rdm/efa_rdm_pke_rtr.h index 75d7b5ff3d3..afe31abf9c2 100644 --- a/prov/efa/src/rdm/efa_rdm_pke_rtr.h +++ b/prov/efa/src/rdm/efa_rdm_pke_rtr.h @@ -1,35 +1,5 @@ -/* - * Copyright (c) Amazon.com, Inc. or its affiliates. - * All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. 
*/ #ifndef EFA_RDM_PKE_RTR_H #define EFA_RDM_PKE_RTR_H diff --git a/prov/efa/src/rdm/efa_rdm_pke_rtw.c b/prov/efa/src/rdm/efa_rdm_pke_rtw.c index 3f9100a10e4..c7dc43f2490 100644 --- a/prov/efa/src/rdm/efa_rdm_pke_rtw.c +++ b/prov/efa/src/rdm/efa_rdm_pke_rtw.c @@ -1,35 +1,6 @@ -/* - * Copyright (c) Amazon.com, Inc. or its affiliates. - * All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */ + #include "ofi_iov.h" #include "ofi_proto.h" #include "efa_errno.h" @@ -52,7 +23,7 @@ * @param[in,out] pkt_entry RTW packet entry * @param[in] txe TX entry that has RMA write information * @param[in] rma_iov the "rma_iov" field in RTW packet header - * + * * @returns * 0 on success * negative libfabric error code on error. 
@@ -81,7 +52,7 @@ ssize_t efa_rdm_pke_init_rtw_common(struct efa_rdm_pke *pkt_entry, * @brief allcoate an RX entry for a incoming RTW packet * * The RX entry will be allocated from endpoint's OP entry - * pool + * pool * @param[in] pkt_entry received RTW packet * * @return @@ -176,7 +147,7 @@ void efa_rdm_pke_proc_eager_rtw(struct efa_rdm_pke *pkt_entry, if (OFI_UNLIKELY(err)) { EFA_WARN(FI_LOG_CQ, "RMA address verify failed!\n"); - efa_base_ep_write_eq_error(&ep->base_ep, FI_EIO, FI_EFA_ERR_RMA_ADDR); + efa_base_ep_write_eq_error(&ep->base_ep, err, FI_EFA_ERR_RMA_ADDR); efa_rdm_rxe_release(rxe); efa_rdm_pke_release_rx(pkt_entry); return; @@ -198,7 +169,7 @@ void efa_rdm_pke_proc_eager_rtw(struct efa_rdm_pke *pkt_entry, } else { err = efa_rdm_pke_copy_payload_to_ope(pkt_entry, rxe); if (OFI_UNLIKELY(err)) { - efa_base_ep_write_eq_error(&ep->base_ep, FI_EINVAL, FI_EFA_ERR_RXE_COPY); + efa_base_ep_write_eq_error(&ep->base_ep, err, FI_EFA_ERR_RXE_COPY); efa_rdm_pke_release_rx(pkt_entry); efa_rdm_rxe_release(rxe); } @@ -209,9 +180,9 @@ void efa_rdm_pke_proc_eager_rtw(struct efa_rdm_pke *pkt_entry, * @brief handle the event that an EFA_RDM_EAGER_RTW packet has been received * * Calls #efa_rdm_pke_proc_eager_rtw() - * + * * @param[in,out] pkt_entry received EFA_RDM_EAGER_RTW packet - * + * */ void efa_rdm_pke_handle_eager_rtw_recv(struct efa_rdm_pke *pkt_entry) { @@ -242,7 +213,7 @@ void efa_rdm_pke_handle_eager_rtw_recv(struct efa_rdm_pke *pkt_entry) * @brief initialize a EFA_RDM_DC_EAGER_RTW_PKT packet * * DC means delivery complete - * + * * @param[in,out] pkt_entry packet entry to be initialized * @param[in] txe TX entry that has RMA write information * @returns @@ -272,9 +243,9 @@ ssize_t efa_rdm_pke_init_dc_eager_rtw(struct efa_rdm_pke *pkt_entry, * * DC means delivery complete * Calls #efa_rdm_pke_proc_eager_rtw() - * + * * @param[in,out] pkt_entry received EFA_RDM_DC_EAGER_RTW packet - * + * */ void efa_rdm_pke_handle_dc_eager_rtw_recv(struct efa_rdm_pke *pkt_entry) { @@ -303,7 +274,7 @@ void efa_rdm_pke_handle_dc_eager_rtw_recv(struct efa_rdm_pke *pkt_entry) /** * @brief initialize the the header of a LONGCTS RTW packet - * + * * This function applies to both EFA_RDM_LONGCTS_RTW_PKT and * EFA_RDM_DC_LONGCTS_RTW_PKT */ @@ -325,7 +296,7 @@ void efa_rdm_pke_init_longcts_rtw_hdr(struct efa_rdm_pke *pkt_entry, /** * @brief initialize a EFA_RDM_LONGCTS_RTW packet * - * + * * @param[in,out] pkt_entry packet entry to be initialized * @param[in] txe TX entry that has RMA write information * @returns @@ -370,7 +341,7 @@ void efa_rdm_pke_handle_longcts_rtw_sent(struct efa_rdm_pke *pkt_entry) * @brief handle the "send completion" event of a LONGCTS RTW packet * * Apply to both EFA_RDM_LONGCTS_RTW and EFA_RDM_DC_LONGCTS_RTW - * + * * @param[in] pkt_entry LONGCTS RTW packet entry */ void efa_rdm_pke_handle_longcts_rtw_send_completion(struct efa_rdm_pke *pkt_entry) @@ -387,7 +358,7 @@ void efa_rdm_pke_handle_longcts_rtw_send_completion(struct efa_rdm_pke *pkt_entr * @brief handle the event that a LONGCTS RTW packet has been received * * applies to both EFA_RDM_LONGCTS_RTW_PKT and EFA_RDM_DC_LONGCTS_RTW_PKT - * + * * @param[in] pkt_entry received LONGCTS RTW paket entry */ void efa_rdm_pke_handle_longcts_rtw_recv(struct efa_rdm_pke *pkt_entry) @@ -420,7 +391,7 @@ void efa_rdm_pke_handle_longcts_rtw_recv(struct efa_rdm_pke *pkt_entry) FI_REMOTE_WRITE, rxe->iov, rxe->desc); if (OFI_UNLIKELY(err)) { EFA_WARN(FI_LOG_CQ, "RMA address verify failed!\n"); - efa_base_ep_write_eq_error(&ep->base_ep, FI_EIO, 
FI_EFA_ERR_RMA_ADDR); + efa_base_ep_write_eq_error(&ep->base_ep, err, FI_EFA_ERR_RMA_ADDR); efa_rdm_rxe_release(rxe); efa_rdm_pke_release_rx(pkt_entry); return; @@ -443,7 +414,7 @@ void efa_rdm_pke_handle_longcts_rtw_recv(struct efa_rdm_pke *pkt_entry) } else { err = efa_rdm_pke_copy_payload_to_ope(pkt_entry, rxe); if (OFI_UNLIKELY(err)) { - efa_base_ep_write_eq_error(&ep->base_ep, FI_EINVAL, FI_EFA_ERR_RXE_COPY); + efa_base_ep_write_eq_error(&ep->base_ep, err, FI_EFA_ERR_RXE_COPY); efa_rdm_rxe_release(rxe); efa_rdm_pke_release_rx(pkt_entry); return; @@ -536,7 +507,7 @@ ssize_t efa_rdm_pke_init_longread_rtw(struct efa_rdm_pke *pkt_entry, /** * @brief handle the event that a EFA_RDM_LONGREAD_RTA_PKE has been received - * + * * @param[in] pkt_entry received EFA_RDM_LONGREAD_RTA_PKT packet entry */ void efa_rdm_pke_handle_longread_rtw_recv(struct efa_rdm_pke *pkt_entry) @@ -568,7 +539,7 @@ void efa_rdm_pke_handle_longread_rtw_recv(struct efa_rdm_pke *pkt_entry) FI_REMOTE_WRITE, rxe->iov, rxe->desc); if (OFI_UNLIKELY(err)) { EFA_WARN(FI_LOG_CQ, "RMA address verify failed!\n"); - efa_base_ep_write_eq_error(&ep->base_ep, FI_EINVAL, FI_EFA_ERR_RMA_ADDR); + efa_base_ep_write_eq_error(&ep->base_ep, err, FI_EFA_ERR_RMA_ADDR); efa_rdm_rxe_release(rxe); efa_rdm_pke_release_rx(pkt_entry); return; @@ -596,4 +567,4 @@ void efa_rdm_pke_handle_longread_rtw_recv(struct efa_rdm_pke *pkt_entry) efa_rdm_rxe_release(rxe); efa_rdm_pke_release_rx(pkt_entry); } -} \ No newline at end of file +} diff --git a/prov/efa/src/rdm/efa_rdm_pke_rtw.h b/prov/efa/src/rdm/efa_rdm_pke_rtw.h index f31d6b982d4..32c8d29d664 100644 --- a/prov/efa/src/rdm/efa_rdm_pke_rtw.h +++ b/prov/efa/src/rdm/efa_rdm_pke_rtw.h @@ -1,35 +1,5 @@ -/* - * Copyright (c) Amazon.com, Inc. or its affiliates. - * All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. 
*/ #ifndef EFA_RDM_PKE_RTW_H #define EFA_RDM_PKE_RTW_H diff --git a/prov/efa/src/rdm/efa_rdm_pke_utils.c b/prov/efa/src/rdm/efa_rdm_pke_utils.c index 3b8f79195be..90410a2597a 100644 --- a/prov/efa/src/rdm/efa_rdm_pke_utils.c +++ b/prov/efa/src/rdm/efa_rdm_pke_utils.c @@ -1,35 +1,6 @@ -/* - * Copyright (c) Amazon.com, Inc. or its affiliates. - * All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */ + #include #include "ofi_iov.h" #include "efa.h" @@ -200,6 +171,10 @@ int efa_rdm_ep_flush_queued_blocking_copy_to_hmem(struct efa_rdm_ep *ep) pkt_entry = ep->queued_copy_vec[i].pkt_entry; segment_offset = ep->queued_copy_vec[i].data_offset; rxe = pkt_entry->ope; + if (pkt_entry->alloc_type == EFA_RDM_PKE_FROM_EFA_RX_POOL) { + assert(ep->efa_rx_pkts_held > 0); + ep->efa_rx_pkts_held--; + } if (bytes_copied[i] != MIN(pkt_entry->payload_size, rxe->cq_entry.len - segment_offset)) { @@ -245,6 +220,9 @@ int efa_rdm_pke_queued_copy_payload_to_hmem(struct efa_rdm_pke *pke, rxe->bytes_queued_blocking_copy += pke->payload_size; + if (pke->alloc_type == EFA_RDM_PKE_FROM_EFA_RX_POOL) + ep->efa_rx_pkts_held++; + if (ep->queued_copy_num < EFA_RDM_MAX_QUEUED_COPY && rxe->bytes_copied + rxe->bytes_queued_blocking_copy < rxe->total_len) { return 0; @@ -449,9 +427,9 @@ ssize_t efa_rdm_pke_copy_payload_to_ope(struct efa_rdm_pke *pke, * * 3. message size is 0, thus no data to copy. */ - if (OFI_UNLIKELY((ope->internal_flags & EFA_RDM_RXE_RECV_CANCEL)) || - OFI_UNLIKELY(segment_offset >= ope->cq_entry.len) || - OFI_UNLIKELY(pke->payload_size == 0)) { + if (OFI_UNLIKELY((ope->internal_flags & EFA_RDM_RXE_RECV_CANCEL) || + (segment_offset >= ope->cq_entry.len) || + (pke->payload_size == 0))) { efa_rdm_pke_handle_data_copied(pke); return 0; } diff --git a/prov/efa/src/rdm/efa_rdm_pke_utils.h b/prov/efa/src/rdm/efa_rdm_pke_utils.h index 3a18b2b2fe1..af01abf42e6 100644 --- a/prov/efa/src/rdm/efa_rdm_pke_utils.h +++ b/prov/efa/src/rdm/efa_rdm_pke_utils.h @@ -1,35 +1,5 @@ -/* - * Copyright (c) Amazon.com, Inc. or its affiliates. - * All rights reserved. 
- * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */ #ifndef _EFA_RDM_PKE_UTILS_H #define _EFA_RDM_PKE_UTILS_H @@ -40,7 +10,7 @@ /** * @brief get the base header of an pke - * + * * @param[in] pke packet entry * @returns base header */ @@ -91,8 +61,8 @@ size_t efa_rdm_pke_get_payload_offset(struct efa_rdm_pke *pkt_entry); ssize_t efa_rdm_pke_init_payload_from_ope(struct efa_rdm_pke *pke, struct efa_rdm_ope *ope, - size_t segment_offset, size_t payload_offset, + size_t segment_offset, size_t data_size); ssize_t efa_rdm_pke_copy_payload_to_ope(struct efa_rdm_pke *pke, diff --git a/prov/efa/src/rdm/efa_rdm_pkt_type.c b/prov/efa/src/rdm/efa_rdm_pkt_type.c index ce08e18b2aa..b918fac5416 100644 --- a/prov/efa/src/rdm/efa_rdm_pkt_type.c +++ b/prov/efa/src/rdm/efa_rdm_pkt_type.c @@ -1,3 +1,6 @@ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */ + #include #include "efa_mr.h" #include "efa_rdm_ope.h" diff --git a/prov/efa/src/rdm/efa_rdm_pkt_type.h b/prov/efa/src/rdm/efa_rdm_pkt_type.h index 99758fb1885..b395dc287a8 100644 --- a/prov/efa/src/rdm/efa_rdm_pkt_type.h +++ b/prov/efa/src/rdm/efa_rdm_pkt_type.h @@ -1,35 +1,5 @@ -/* - * Copyright (c) Amazon.com, Inc. or its affiliates. - * All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. 
- * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */ #ifndef _EFA_RDM_PKT_TYPE_H #define _EFA_RDM_PKT_TYPE_H diff --git a/prov/efa/src/rdm/efa_rdm_protocol.h b/prov/efa/src/rdm/efa_rdm_protocol.h index abb365024fb..220605c9dee 100644 --- a/prov/efa/src/rdm/efa_rdm_protocol.h +++ b/prov/efa/src/rdm/efa_rdm_protocol.h @@ -406,7 +406,7 @@ struct efa_rdm_req_opt_raw_addr_hdr { }; struct efa_rdm_req_opt_cq_data_hdr { - int64_t cq_data; + uint64_t cq_data; }; struct efa_rdm_req_opt_connid_hdr { diff --git a/prov/efa/src/rdm/efa_rdm_rma.c b/prov/efa/src/rdm/efa_rdm_rma.c index bb73d87877d..c4328e33a9e 100644 --- a/prov/efa/src/rdm/efa_rdm_rma.c +++ b/prov/efa/src/rdm/efa_rdm_rma.c @@ -33,7 +33,7 @@ int efa_rdm_rma_verified_copy_iov(struct efa_rdm_ep *ep, struct efa_rma_iov *rma EFA_WARN(FI_LOG_EP_CTRL, "MR verification failed (%s), addr: %lx key: %ld\n", fi_strerror(-ret), rma[i].addr, rma[i].key); - return -FI_EACCES; + return ret; } iov[i].iov_base = (void *)rma[i].addr; @@ -167,7 +167,6 @@ ssize_t efa_rdm_rma_readmsg(struct fid_ep *ep, const struct fi_msg_rma *msg, uin txe = efa_rdm_rma_alloc_txe(efa_rdm_ep, peer, msg, ofi_op_read_req, flags); if (OFI_UNLIKELY(!txe)) { - efa_rdm_ep_progress_internal(efa_rdm_ep); err = -FI_EAGAIN; goto out; } @@ -209,13 +208,10 @@ ssize_t efa_rdm_rma_readmsg(struct fid_ep *ep, const struct fi_msg_rma *msg, uin if (OFI_UNLIKELY(err)) { if (err == -FI_ENOBUFS) err = -FI_EAGAIN; - efa_rdm_ep_progress_internal(efa_rdm_ep); goto out; } } else { err = efa_rdm_rma_post_efa_emulated_read(efa_rdm_ep, txe); - if (OFI_UNLIKELY(err)) - efa_rdm_ep_progress_internal(efa_rdm_ep); } out: @@ -330,16 +326,6 @@ bool efa_rdm_rma_should_write_using_rdma(struct efa_rdm_ep *ep, struct efa_rdm_o (txe->iov_count > 1 || txe->rma_iov_count > 1)) return false; - /* - * For local write, handshake is not required and - * we just need to check the local ep caps - */ - if (peer->is_self) - return efa_rdm_ep_support_rdma_write(ep); - - /* Check for hardware support of RDMA write. - A handshake should have been made before the check. 
*/ - assert(peer->flags & EFA_RDM_PEER_HANDSHAKE_RECEIVED); return efa_both_support_rdma_write(ep, peer); } @@ -450,14 +436,12 @@ static inline ssize_t efa_rdm_generic_writemsg(struct efa_rdm_ep *efa_rdm_ep, txe = efa_rdm_rma_alloc_txe(efa_rdm_ep, peer, msg, ofi_op_write, flags); if (OFI_UNLIKELY(!txe)) { - efa_rdm_ep_progress_internal(efa_rdm_ep); err = -FI_EAGAIN; goto out; } err = efa_rdm_rma_post_write(efa_rdm_ep, txe); if (OFI_UNLIKELY(err)) { - efa_rdm_ep_progress_internal(efa_rdm_ep); efa_rdm_txe_release(txe); } out: diff --git a/prov/efa/src/rdm/efa_rdm_rma.h b/prov/efa/src/rdm/efa_rdm_rma.h index 8bf2d21076c..e56d6beb4cd 100644 --- a/prov/efa/src/rdm/efa_rdm_rma.h +++ b/prov/efa/src/rdm/efa_rdm_rma.h @@ -1,35 +1,6 @@ -/* - * Copyright (c) Amazon.com, Inc. or its affiliates. - * All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */ + #if HAVE_CONFIG_H #include #endif /* HAVE_CONFIG_H */ diff --git a/prov/efa/src/rdm/efa_rdm_rxe_map.c b/prov/efa/src/rdm/efa_rdm_rxe_map.c index baf31f4b556..009431e3f65 100644 --- a/prov/efa/src/rdm/efa_rdm_rxe_map.c +++ b/prov/efa/src/rdm/efa_rdm_rxe_map.c @@ -1,35 +1,6 @@ -/* - * Copyright (c) Amazon.com, Inc. or its affiliates. - * All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */ + #include "efa.h" #include "efa_rdm_pke.h" #include "efa_rdm_rxe_map.h" @@ -37,7 +8,7 @@ /** * @brief find an RX entry for a received RTM packet entry's sender address and msg_id - * + * * @param[in] rxe_map RX entry map * @param[in] pkt_entry received packet entry * @returns @@ -62,7 +33,7 @@ struct efa_rdm_ope *efa_rdm_rxe_map_lookup(struct efa_rdm_rxe_map *rxe_map, * @details * the insertion will use the combination of packet entry sender address and msg_id as key. * Caller is responsible to make sure the key does not exist in the map. - * + * * @param[in,out] rxe_map RX entry map * @param[in] pkt_entry received RTM packet * @param[in] rxe RX entry @@ -100,11 +71,11 @@ void efa_rdm_rxe_map_insert(struct efa_rdm_rxe_map *rxe_map, /** * @brief remove an RX entry from the RX entry map - * + * * @details * the removal will use the combination of packet entry sender address and msg_id as key. * Caller is responsible to make sure the key does exist in the map. - * + * * @param[in,out] rxe_map RX entry map * @param[in] msg_id message ID * @param[in] addr peer address diff --git a/prov/efa/src/rdm/efa_rdm_rxe_map.h b/prov/efa/src/rdm/efa_rdm_rxe_map.h index 83f80343466..1eb3d0cc7e0 100644 --- a/prov/efa/src/rdm/efa_rdm_rxe_map.h +++ b/prov/efa/src/rdm/efa_rdm_rxe_map.h @@ -1,35 +1,5 @@ -/* - * Copyright (c) Amazon.com, Inc. or its affiliates. - * All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. 
*/ #ifndef EFA_RDM_OPE_RECVMAP #define EFA_RDM_OPE_RECVMAP @@ -49,7 +19,7 @@ struct efa_rdm_rxe_map_entry; /** * @brief a hashmap between sender address + msg_id to RX entry - * + * * @details * This hash map is used on the receiver side to implement * medium and runting protocols. Such protocol will send diff --git a/prov/efa/src/rdm/efa_rdm_srx.c b/prov/efa/src/rdm/efa_rdm_srx.c index a61113653bf..47919dc5667 100644 --- a/prov/efa/src/rdm/efa_rdm_srx.c +++ b/prov/efa/src/rdm/efa_rdm_srx.c @@ -1,35 +1,5 @@ -/* - * Copyright (c) Amazon.com, Inc. or its affiliates. - * All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */ #include "efa.h" #include "efa_rdm_srx.h" diff --git a/prov/efa/src/rdm/efa_rdm_srx.h b/prov/efa/src/rdm/efa_rdm_srx.h index dee155114dd..d7c9b757d29 100644 --- a/prov/efa/src/rdm/efa_rdm_srx.h +++ b/prov/efa/src/rdm/efa_rdm_srx.h @@ -1,35 +1,5 @@ -/* - * Copyright (c) Amazon.com, Inc. or its affiliates. - * All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */ #ifndef _EFA_RDM_SRX_H #define _EFA_RDM_SRX_H diff --git a/prov/efa/src/rdm/efa_rdm_tracepoint.h b/prov/efa/src/rdm/efa_rdm_tracepoint.h index 19c7be0659f..3959bc8f2af 100644 --- a/prov/efa/src/rdm/efa_rdm_tracepoint.h +++ b/prov/efa/src/rdm/efa_rdm_tracepoint.h @@ -1,3 +1,6 @@ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */ + #ifndef _EFA_RDM_TRACEPOINT_H #define _EFA_RDM_TRACEPOINT_H diff --git a/prov/efa/src/rdm/efa_rdm_tracepoint_def.c b/prov/efa/src/rdm/efa_rdm_tracepoint_def.c index 5cdcceeffa8..ed3e2d5f6e4 100644 --- a/prov/efa/src/rdm/efa_rdm_tracepoint_def.c +++ b/prov/efa/src/rdm/efa_rdm_tracepoint_def.c @@ -1,3 +1,6 @@ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */ + #define LTTNG_UST_TRACEPOINT_CREATE_PROBES #define LTTNG_UST_TRACEPOINT_DEFINE diff --git a/prov/efa/src/rdm/efa_rdm_tracepoint_def.h b/prov/efa/src/rdm/efa_rdm_tracepoint_def.h index 56c08c15478..200a8e1a5a7 100644 --- a/prov/efa/src/rdm/efa_rdm_tracepoint_def.h +++ b/prov/efa/src/rdm/efa_rdm_tracepoint_def.h @@ -1,3 +1,6 @@ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */ + #undef LTTNG_UST_TRACEPOINT_PROVIDER #define LTTNG_UST_TRACEPOINT_PROVIDER EFA_RDM_TP_PROV @@ -21,7 +24,7 @@ #define X_ENTRY_FIELDS \ lttng_ust_field_integer(int, msg_id, msg_id) \ lttng_ust_field_integer_hex(size_t, ctx, ctx) \ - lttng_ust_field_integer(int, total_len, total_len) + lttng_ust_field_integer(int, total_len, total_len) LTTNG_UST_TRACEPOINT_EVENT_CLASS(EFA_RDM_TP_PROV, x_entry, LTTNG_UST_TP_ARGS(X_ENTRY_ARGS), diff --git a/prov/efa/src/rdm/efa_rdm_util.h b/prov/efa/src/rdm/efa_rdm_util.h index e9341bd3f08..eefbbd8d248 100644 --- a/prov/efa/src/rdm/efa_rdm_util.h +++ b/prov/efa/src/rdm/efa_rdm_util.h @@ -1,35 +1,5 @@ -/* - * Copyright (c) Amazon.com, Inc. or its affiliates. - * All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */ #ifndef _EFA_RDM_UTIL_H #define _EFA_RDM_UTIL_H @@ -38,7 +8,11 @@ #include "efa_rdm_protocol.h" #include "efa_rdm_pke.h" -#define EFA_RDM_MSG_PREFIX_SIZE (sizeof(struct efa_rdm_pke) + sizeof(struct efa_rdm_eager_msgrtm_hdr) + EFA_RDM_REQ_OPT_RAW_ADDR_HDR_SIZE) +#define EFA_RDM_MSG_PREFIX_SIZE ( \ + sizeof(struct efa_rdm_pke) + \ + sizeof(struct efa_rdm_eager_msgrtm_hdr) + \ + sizeof(struct efa_rdm_req_opt_cq_data_hdr) + \ + EFA_RDM_REQ_OPT_RAW_ADDR_HDR_SIZE) #if defined(static_assert) && defined(__x86_64__) static_assert(EFA_RDM_MSG_PREFIX_SIZE % 8 == 0, "message prefix size alignment check"); diff --git a/prov/efa/test/efa_unit_test_av.c b/prov/efa/test/efa_unit_test_av.c index c1c23e073e5..9ca730d0b6e 100644 --- a/prov/efa/test/efa_unit_test_av.c +++ b/prov/efa/test/efa_unit_test_av.c @@ -1,3 +1,6 @@ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */ + #include "efa_unit_tests.h" /** @@ -5,7 +8,7 @@ * This test calls fi_av_insert() twice with the same raw address, * and verifies that returned fi_addr is the same and * ibv_create_ah only gets called once. - * + * * @param[in] state struct efa_resource that is managed by the framework */ void test_av_insert_duplicate_raw_addr(struct efa_resource **state) @@ -40,7 +43,7 @@ void test_av_insert_duplicate_raw_addr(struct efa_resource **state) * This test calls fi_av_insert() twice with two difference raw address with same GID, * and verifies that returned fi_addr is different and ibv_create_ah only gets called once. * this is because libfabric EFA provider has a cache for address handle (AH). - * + * * @param[in] state struct efa_resource that is managed by the framework */ void test_av_insert_duplicate_gid(struct efa_resource **state) diff --git a/prov/efa/test/efa_unit_test_cntr.c b/prov/efa/test/efa_unit_test_cntr.c new file mode 100644 index 00000000000..dd2e69b205e --- /dev/null +++ b/prov/efa/test/efa_unit_test_cntr.c @@ -0,0 +1,98 @@ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */ +#include "efa_unit_tests.h" +#include "efa_cntr.h" + +/** + * @brief get the length of the ibv_cq_poll_list for a given efa_rdm_cq + * + * @param cq_fid cq fid + * @return int the length of the ibv_cq_poll_list + */ +static +int test_efa_rdm_cntr_get_ibv_cq_poll_list_length(struct fid_cntr *cntr_fid) +{ + int i = 0; + struct dlist_entry *item; + struct efa_cntr *cntr; + + cntr = container_of(cntr_fid, struct efa_cntr, util_cntr.cntr_fid.fid); + dlist_foreach(&cntr->ibv_cq_poll_list, item) { + i++; + } + + return i; +} + +/** + * @brief Check the length of ibv_cq_poll_list in cntr when 1 cq is bind to 1 ep + * as both tx/rx cq. 
+ * + * @param state struct efa_resource that is managed by the framework + */ +void test_efa_rdm_cntr_ibv_cq_poll_list_same_tx_rx_cq_single_ep(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + struct fid_cntr *cntr; + struct fi_cntr_attr cntr_attr = {0}; + + efa_unit_test_resource_construct_ep_not_enabled(resource, FI_EP_RDM); + + assert_int_equal(fi_cntr_open(resource->domain, &cntr_attr, &cntr, NULL), 0); + + /* TODO: expand this test to all flags */ + assert_int_equal(fi_ep_bind(resource->ep, &cntr->fid, FI_TRANSMIT), 0); + + assert_int_equal(fi_enable(resource->ep), 0); + + /* efa_unit_test_resource_construct binds single OFI CQ as both tx/rx cq of ep */ + assert_int_equal(test_efa_rdm_cntr_get_ibv_cq_poll_list_length(cntr), 1); + + /* ep must be closed before cq/av/eq... */ + fi_close(&resource->ep->fid); + resource->ep = NULL; + + fi_close(&cntr->fid); +} + +/** + * @brief Check the length of ibv_cq_poll_list in cntr when separate tx/rx cq is bind to 1 ep. + * + * @param state struct efa_resource that is managed by the framework + */ +void test_efa_rdm_cntr_ibv_cq_poll_list_separate_tx_rx_cq_single_ep(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + struct fid_cq *txcq, *rxcq; + struct fi_cq_attr cq_attr = {0}; + struct fid_cntr *cntr; + struct fi_cntr_attr cntr_attr = {0}; + + efa_unit_test_resource_construct_no_cq_and_ep_not_enabled(resource, FI_EP_RDM); + + assert_int_equal(fi_cq_open(resource->domain, &cq_attr, &txcq, NULL), 0); + + assert_int_equal(fi_ep_bind(resource->ep, &txcq->fid, FI_SEND), 0); + + assert_int_equal(fi_cq_open(resource->domain, &cq_attr, &rxcq, NULL), 0); + + assert_int_equal(fi_ep_bind(resource->ep, &rxcq->fid, FI_RECV), 0); + + assert_int_equal(fi_cntr_open(resource->domain, &cntr_attr, &cntr, NULL), 0); + + /* TODO: expand this test to all flags */ + assert_int_equal(fi_ep_bind(resource->ep, &cntr->fid, FI_TRANSMIT), 0); + + assert_int_equal(fi_enable(resource->ep), 0); + + assert_int_equal(test_efa_rdm_cntr_get_ibv_cq_poll_list_length(cntr), 2); + + /* ep must be closed before cq/av/eq... */ + fi_close(&resource->ep->fid); + resource->ep = NULL; + fi_close(&txcq->fid); + fi_close(&rxcq->fid); + fi_close(&cntr->fid); +} + + diff --git a/prov/efa/test/efa_unit_test_common.c b/prov/efa/test/efa_unit_test_common.c index d77248390f9..5776f52fa40 100644 --- a/prov/efa/test/efa_unit_test_common.c +++ b/prov/efa/test/efa_unit_test_common.c @@ -1,3 +1,6 @@ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. 
*/ + #include "efa_unit_tests.h" #include "efa_rdm_pke_utils.h" #include "efa_rdm_pke_nonreq.h" @@ -40,7 +43,7 @@ struct fi_info *efa_unit_test_alloc_hints(enum fi_ep_type ep_type) if (!hints) return NULL; - hints->fabric_attr->prov_name = "efa"; + hints->fabric_attr->prov_name = strdup("efa"); hints->ep_attr->type = ep_type; hints->domain_attr->mr_mode |= FI_MR_LOCAL | FI_MR_ALLOCATED; @@ -54,7 +57,7 @@ struct fi_info *efa_unit_test_alloc_hints(enum fi_ep_type ep_type) void efa_unit_test_resource_construct_with_hints(struct efa_resource *resource, enum fi_ep_type ep_type, struct fi_info *hints, - bool enable_ep) + bool enable_ep, bool open_cq) { int ret = 0; struct fi_av_attr av_attr = {0}; @@ -89,11 +92,13 @@ void efa_unit_test_resource_construct_with_hints(struct efa_resource *resource, fi_ep_bind(resource->ep, &resource->av->fid, 0); - ret = fi_cq_open(resource->domain, &cq_attr, &resource->cq, NULL); - if (ret) - goto err; + if (open_cq) { + ret = fi_cq_open(resource->domain, &cq_attr, &resource->cq, NULL); + if (ret) + goto err; - fi_ep_bind(resource->ep, &resource->cq->fid, FI_SEND | FI_RECV); + fi_ep_bind(resource->ep, &resource->cq->fid, FI_SEND | FI_RECV); + } if (enable_ep) { ret = fi_enable(resource->ep); @@ -116,7 +121,7 @@ void efa_unit_test_resource_construct(struct efa_resource *resource, enum fi_ep_ if (!resource->hints) goto err; efa_unit_test_resource_construct_with_hints(resource, ep_type, - resource->hints, true); + resource->hints, true, true); return; err: @@ -133,14 +138,31 @@ void efa_unit_test_resource_construct_ep_not_enabled(struct efa_resource *resour if (!resource->hints) goto err; efa_unit_test_resource_construct_with_hints(resource, ep_type, - resource->hints, false); + resource->hints, false, true); return; err: efa_unit_test_resource_destruct(resource); /* Fail test early if the resource struct fails to initialize */ - assert_int_equal(1, 0); + fail(); +} + +void efa_unit_test_resource_construct_no_cq_and_ep_not_enabled(struct efa_resource *resource, + enum fi_ep_type ep_type) +{ + resource->hints = efa_unit_test_alloc_hints(ep_type); + if (!resource->hints) + goto err; + efa_unit_test_resource_construct_with_hints(resource, ep_type, + resource->hints, false, false); + return; + +err: + efa_unit_test_resource_destruct(resource); + + /* Fail test early if the resource struct fails to initialize */ + fail(); } /** @@ -177,6 +199,10 @@ void efa_unit_test_resource_destruct(struct efa_resource *resource) if (resource->info) { fi_freeinfo(resource->info); } + + if (resource->hints) { + fi_freeinfo(resource->hints); + } } void efa_unit_test_buff_construct(struct efa_unit_test_buff *buff, struct efa_resource *resource, size_t buff_size) diff --git a/prov/efa/test/efa_unit_test_cq.c b/prov/efa/test/efa_unit_test_cq.c index 9649b090e7b..c59dde05f1c 100644 --- a/prov/efa/test/efa_unit_test_cq.c +++ b/prov/efa/test/efa_unit_test_cq.c @@ -4,6 +4,7 @@ #include "efa_unit_tests.h" #include "dgram/efa_dgram_ep.h" #include "dgram/efa_dgram_cq.h" +#include "rdm/efa_rdm_cq.h" /** * @brief implementation of test cases for fi_cq_read() works with empty device CQ for given endpoint type @@ -31,7 +32,8 @@ void test_impl_cq_read_empty_cq(struct efa_resource *resource, enum fi_ep_type e struct efa_rdm_ep *efa_rdm_ep; efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); - ibv_cqx = efa_rdm_ep->ibv_cq_ex; + assert(efa_rdm_ep->base_ep.util_ep.rx_cq); + ibv_cqx = container_of(efa_rdm_ep->base_ep.util_ep.rx_cq, struct efa_rdm_cq, 
util_cq)->ibv_cq.ibv_cq_ex; } ibv_cqx->start_poll = &efa_mock_ibv_start_poll_return_mock; @@ -98,6 +100,7 @@ static void test_rdm_cq_read_bad_send_status(struct efa_resource *resource, struct ibv_qp_ex *ibv_qpx; struct efa_rdm_ep *efa_rdm_ep; struct efa_rdm_peer *peer; + struct efa_rdm_cq *efa_rdm_cq; efa_unit_test_resource_construct(resource, FI_EP_RDM); efa_unit_test_buff_construct(&send_buff, resource, 4096 /* buff_size */); @@ -105,7 +108,9 @@ static void test_rdm_cq_read_bad_send_status(struct efa_resource *resource, efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); efa_rdm_ep->host_id = local_host_id; ibv_qpx = efa_rdm_ep->base_ep.qp->ibv_qp_ex; - ibv_cqx = efa_rdm_ep->ibv_cq_ex; + + efa_rdm_cq = container_of(resource->cq, struct efa_rdm_cq, util_cq.cq_fid.fid); + ibv_cqx = efa_rdm_cq->ibv_cq.ibv_cq_ex; /* close shm_ep to force efa_rdm_ep to use efa device to send */ if (efa_rdm_ep->shm_ep) { err = fi_close(&efa_rdm_ep->shm_ep->fid); @@ -143,10 +148,12 @@ static void test_rdm_cq_read_bad_send_status(struct efa_resource *resource, ibv_cqx->end_poll = &efa_mock_ibv_end_poll_check_mock; ibv_cqx->read_opcode = &efa_mock_ibv_read_opcode_return_mock; ibv_cqx->read_vendor_err = &efa_mock_ibv_read_vendor_err_return_mock; + ibv_cqx->read_qp_num = &efa_mock_ibv_read_qp_num_return_mock; will_return(efa_mock_ibv_start_poll_use_saved_send_wr_with_mock_status, IBV_WC_GENERAL_ERR); will_return(efa_mock_ibv_end_poll_check_mock, NULL); will_return(efa_mock_ibv_read_opcode_return_mock, IBV_WC_SEND); will_return(efa_mock_ibv_read_vendor_err_return_mock, vendor_error); + will_return(efa_mock_ibv_read_qp_num_return_mock, 0); ret = fi_cq_read(resource->cq, &cq_entry, 1); /* fi_cq_read() called efa_mock_ibv_start_poll_use_saved_send_wr(), which pulled one send_wr from g_ibv_submitted_wr_idv=_vec */ assert_int_equal(g_ibv_submitted_wr_id_cnt, 0); @@ -162,7 +169,7 @@ static void test_rdm_cq_read_bad_send_status(struct efa_resource *resource, strerror = fi_cq_strerror(resource->cq, cq_err_entry.prov_errno, cq_err_entry.err_data, NULL, 0); assert_int_equal(ret, 1); - assert_int_equal(cq_err_entry.err, FI_EIO); + assert_int_not_equal(cq_err_entry.err, FI_SUCCESS); assert_int_equal(cq_err_entry.prov_errno, vendor_error); /* Reset value */ @@ -274,36 +281,35 @@ void test_ibv_cq_ex_read_bad_recv_status(struct efa_resource **state) struct efa_resource *resource = *state; struct efa_rdm_pke *pkt_entry; struct fi_cq_data_entry cq_entry; - struct fi_cq_err_entry cq_err_entry = {0}; - struct efa_ep_addr raw_addr = {0}; - size_t raw_addr_len = sizeof(struct efa_ep_addr); - fi_addr_t peer_addr; - int ret, err, numaddr, err_data_size = 1024; + struct fi_eq_err_entry eq_err_entry; + int ret; + struct efa_rdm_cq *efa_rdm_cq; efa_unit_test_resource_construct(resource, FI_EP_RDM); efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); + /* + * The rx pkt entry should only be allocated and posted by the progress engine. + * However, to mock a receive completion, we have to allocate an rx entry + * and modify it out of band. The proess engine grow the rx pool in the first + * call and set efa_rdm_ep->efa_rx_pkts_posted as the rx pool size. Here we + * follow the progress engine to set the efa_rx_pkts_posted counter manually + * TODO: modify the rx pkt as part of the ibv cq poll mock so we don't have to + * allocate pkt entry and hack the pkt counters. 
+ */ pkt_entry = efa_rdm_pke_alloc(efa_rdm_ep, efa_rdm_ep->efa_rx_pkt_pool, EFA_RDM_PKE_FROM_EFA_RX_POOL); + assert_non_null(pkt_entry); + efa_rdm_ep->efa_rx_pkts_posted = efa_rdm_ep_get_rx_pool_size(efa_rdm_ep); - /* create a fake peer */ - err = fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len); - assert_int_equal(err, 0); - raw_addr.qpn = 1; - raw_addr.qkey = 0x1234; - numaddr = fi_av_insert(resource->av, &raw_addr, 1, &peer_addr, 0, NULL); - assert_int_equal(numaddr, 1); - - pkt_entry->ope = efa_rdm_ep_alloc_rxe(efa_rdm_ep, peer_addr, ofi_op_msg); - /* A receive completion requires efa rx pkts are posted */ - efa_rdm_ep->efa_rx_pkts_posted++; - assert_non_null(pkt_entry); + efa_rdm_cq = container_of(resource->cq, struct efa_rdm_cq, util_cq.cq_fid.fid); - efa_rdm_ep->ibv_cq_ex->start_poll = &efa_mock_ibv_start_poll_return_mock; - efa_rdm_ep->ibv_cq_ex->end_poll = &efa_mock_ibv_end_poll_check_mock; - efa_rdm_ep->ibv_cq_ex->read_opcode = &efa_mock_ibv_read_opcode_return_mock; - efa_rdm_ep->ibv_cq_ex->read_vendor_err = &efa_mock_ibv_read_vendor_err_return_mock; + efa_rdm_cq->ibv_cq.ibv_cq_ex->start_poll = &efa_mock_ibv_start_poll_return_mock; + efa_rdm_cq->ibv_cq.ibv_cq_ex->end_poll = &efa_mock_ibv_end_poll_check_mock; + efa_rdm_cq->ibv_cq.ibv_cq_ex->read_opcode = &efa_mock_ibv_read_opcode_return_mock; + efa_rdm_cq->ibv_cq.ibv_cq_ex->read_vendor_err = &efa_mock_ibv_read_vendor_err_return_mock; + efa_rdm_cq->ibv_cq.ibv_cq_ex->read_qp_num = &efa_mock_ibv_read_qp_num_return_mock; will_return(efa_mock_ibv_start_poll_return_mock, 0); will_return(efa_mock_ibv_end_poll_check_mock, NULL); @@ -312,19 +318,20 @@ void test_ibv_cq_ex_read_bad_recv_status(struct efa_resource **state) * therefore use will_return_always() */ will_return_always(efa_mock_ibv_read_opcode_return_mock, IBV_WC_RECV); + will_return_always(efa_mock_ibv_read_qp_num_return_mock, 0); will_return(efa_mock_ibv_read_vendor_err_return_mock, EFA_IO_COMP_STATUS_LOCAL_ERROR_UNRESP_REMOTE); - efa_rdm_ep->ibv_cq_ex->wr_id = (uintptr_t)pkt_entry; - efa_rdm_ep->ibv_cq_ex->status = IBV_WC_GENERAL_ERR; + /* the recv error will not populate to application cq because it's an EFA internal error and + * and not related to any application recv. Currently we can only read the error from eq. 
+ */ + efa_rdm_cq->ibv_cq.ibv_cq_ex->wr_id = (uintptr_t)pkt_entry; + efa_rdm_cq->ibv_cq.ibv_cq_ex->status = IBV_WC_GENERAL_ERR; ret = fi_cq_read(resource->cq, &cq_entry, 1); - assert_int_equal(ret, -FI_EAVAIL); + assert_int_equal(ret, -FI_EAGAIN); - cq_err_entry.err_data = malloc(err_data_size); - cq_err_entry.err_data_size = err_data_size; - ret = fi_cq_readerr(resource->cq, &cq_err_entry, 0); - assert_int_equal(ret, 1); - assert_int_equal(cq_err_entry.err, FI_EIO); - assert_int_equal(cq_err_entry.prov_errno, EFA_IO_COMP_STATUS_LOCAL_ERROR_UNRESP_REMOTE); - free(cq_err_entry.err_data); + ret = fi_eq_readerr(resource->eq, &eq_err_entry, 0); + assert_int_equal(ret, sizeof(eq_err_entry)); + assert_int_not_equal(eq_err_entry.err, FI_SUCCESS); + assert_int_equal(eq_err_entry.prov_errno, EFA_IO_COMP_STATUS_LOCAL_ERROR_UNRESP_REMOTE); } /** @@ -336,33 +343,145 @@ void test_ibv_cq_ex_read_bad_recv_status(struct efa_resource **state) */ void test_ibv_cq_ex_read_failed_poll(struct efa_resource **state) { - struct efa_rdm_ep *efa_rdm_ep; struct efa_resource *resource = *state; struct fi_cq_data_entry cq_entry; - struct fi_eq_err_entry eq_err_entry; + struct fi_cq_err_entry cq_err_entry; int ret; + struct efa_rdm_cq *efa_rdm_cq; efa_unit_test_resource_construct(resource, FI_EP_RDM); - efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); - efa_rdm_ep->ibv_cq_ex->start_poll = &efa_mock_ibv_start_poll_return_mock; - efa_rdm_ep->ibv_cq_ex->end_poll = &efa_mock_ibv_end_poll_check_mock; - efa_rdm_ep->ibv_cq_ex->read_vendor_err = &efa_mock_ibv_read_vendor_err_return_mock; + efa_rdm_cq = container_of(resource->cq, struct efa_rdm_cq, util_cq.cq_fid.fid); + efa_rdm_cq->ibv_cq.ibv_cq_ex->start_poll = &efa_mock_ibv_start_poll_return_mock; + efa_rdm_cq->ibv_cq.ibv_cq_ex->end_poll = &efa_mock_ibv_end_poll_check_mock; + efa_rdm_cq->ibv_cq.ibv_cq_ex->read_vendor_err = &efa_mock_ibv_read_vendor_err_return_mock; will_return(efa_mock_ibv_start_poll_return_mock, EFAULT); will_return(efa_mock_ibv_read_vendor_err_return_mock, EFA_IO_COMP_STATUS_LOCAL_ERROR_UNRESP_REMOTE); ret = fi_cq_read(resource->cq, &cq_entry, 1); - /* TODO: - * Our current behavior is to return -FI_EAGAIN, but it is not right. - * We need to fix the behaivor in the provider and update the test case. 
- */ - assert_int_equal(ret, -FI_EAGAIN); + assert_int_equal(ret, -FI_EAVAIL); - ret = fi_eq_readerr(resource->eq, &eq_err_entry, 0); - assert_int_equal(ret, sizeof(eq_err_entry)); - assert_int_not_equal(eq_err_entry.err, FI_ENOENT); - assert_int_equal(eq_err_entry.prov_errno, EFA_IO_COMP_STATUS_LOCAL_ERROR_UNRESP_REMOTE); + ret = fi_cq_readerr(resource->cq, &cq_err_entry, 0); + assert_int_equal(ret, 1); + assert_int_not_equal(cq_err_entry.err, FI_ENOENT); + assert_int_equal(cq_err_entry.prov_errno, EFA_IO_COMP_STATUS_LOCAL_ERROR_UNRESP_REMOTE); +} + +/** + * @brief Test efa_rdm_cq_open() handles rdma-core CQ creation failure gracefully + * + * @param[in] state struct efa_resource that is managed by the framework + */ +void test_rdm_cq_create_error_handling(struct efa_resource **state) +{ + + struct efa_resource *resource = *state; + struct ibv_device **ibv_device_list; + struct efa_device efa_device = {0}; + struct efa_domain *efa_domain = NULL; + struct verbs_context *vctx = NULL; + struct fi_cq_attr cq_attr = {0}; + + ibv_device_list = ibv_get_device_list(&g_device_cnt); + if (ibv_device_list == NULL) { + skip(); + return; + } + efa_device_construct(&efa_device, 0, ibv_device_list[0]); + + resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM); + assert_non_null(resource->hints); + assert_int_equal(fi_getinfo(FI_VERSION(1, 14), NULL, NULL, 0ULL, resource->hints, &resource->info), 0); + assert_int_equal(fi_fabric(resource->info->fabric_attr, &resource->fabric, NULL), 0); + assert_int_equal(fi_domain(resource->fabric, resource->info, &resource->domain, NULL), 0); + + vctx = verbs_get_ctx_op(efa_device.ibv_ctx, create_cq_ex); +#if HAVE_EFADV_CQ_EX + g_efa_unit_test_mocks.efadv_create_cq = &efa_mock_efadv_create_cq_set_eopnotsupp_and_return_null; + expect_function_call(efa_mock_efadv_create_cq_set_eopnotsupp_and_return_null); +#endif + /* Mock out the create_cq_ex function pointer which is called by ibv_create_cq_ex */ + vctx->create_cq_ex = &efa_mock_create_cq_ex_return_null; + expect_function_call(efa_mock_create_cq_ex_return_null); + + efa_domain = container_of(resource->domain, struct efa_domain, util_domain.domain_fid); + efa_domain->device = &efa_device; + + assert_int_not_equal(fi_cq_open(resource->domain, &cq_attr, &resource->cq, NULL), 0); + /* set cq as NULL to avoid double free by fi_close in cleanup stage */ + resource->cq = NULL; +} + +/** + * @brief get the length of the ibv_cq_poll_list for a given efa_rdm_cq + * + * @param cq_fid cq fid + * @return int the length of the ibv_cq_poll_list + */ +static +int test_efa_rdm_cq_get_ibv_cq_poll_list_length(struct fid_cq *cq_fid) +{ + int i = 0; + struct dlist_entry *item; + struct efa_rdm_cq *cq; + + cq = container_of(cq_fid, struct efa_rdm_cq, util_cq.cq_fid.fid); + dlist_foreach(&cq->ibv_cq_poll_list, item) { + i++; + } + + return i; +} + +/** + * @brief Check the length of ibv_cq_poll_list when 1 cq is bound to 1 ep + * as both tx/rx cq. + * + * @param state struct efa_resource that is managed by the framework + */ +void test_efa_rdm_cq_ibv_cq_poll_list_same_tx_rx_cq_single_ep(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + + efa_unit_test_resource_construct(resource, FI_EP_RDM); + + /* efa_unit_test_resource_construct binds single OFI CQ as both tx/rx cq of ep */ + assert_int_equal(test_efa_rdm_cq_get_ibv_cq_poll_list_length(resource->cq), 1); +} + +/** + * @brief Check the length of ibv_cq_poll_list when separate tx/rx cqs are bound to 1 ep.
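+ * (With separate tx and rx CQs bound to one ep, each efa_rdm_cq's ibv_cq_poll_list is expected to + * cover both ibv_cqs, its own and the peer CQ's, which is why the assertions below check a length of 2.)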
+ * + * @param state struct efa_resource that is managed by the framework + */ +void test_efa_rdm_cq_ibv_cq_poll_list_separate_tx_rx_cq_single_ep(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + struct fid_cq *txcq, *rxcq; + struct fi_cq_attr cq_attr = {0}; + + efa_unit_test_resource_construct_no_cq_and_ep_not_enabled(resource, FI_EP_RDM); + + assert_int_equal(fi_cq_open(resource->domain, &cq_attr, &txcq, NULL), 0); + + assert_int_equal(fi_ep_bind(resource->ep, &txcq->fid, FI_SEND), 0); + + assert_int_equal(fi_cq_open(resource->domain, &cq_attr, &rxcq, NULL), 0); + + assert_int_equal(fi_ep_bind(resource->ep, &rxcq->fid, FI_RECV), 0); + + assert_int_equal(fi_enable(resource->ep), 0); + + assert_int_equal(test_efa_rdm_cq_get_ibv_cq_poll_list_length(txcq), 2); + + assert_int_equal(test_efa_rdm_cq_get_ibv_cq_poll_list_length(rxcq), 2); + + /* ep must be closed before cq/av/eq... */ + fi_close(&resource->ep->fid); + resource->ep = NULL; + fi_close(&txcq->fid); + fi_close(&rxcq->fid); } #if HAVE_EFADV_CQ_EX @@ -388,6 +507,7 @@ static void test_impl_ibv_cq_ex_read_unknow_peer_ah(struct efa_resource *resourc struct efadv_cq *efadv_cq; struct efa_unit_test_buff recv_buff; int ret; + struct efa_rdm_cq *efa_rdm_cq; /* * Always use mocked efadv_create_cq instead of the real one. @@ -406,6 +526,7 @@ static void test_impl_ibv_cq_ex_read_unknow_peer_ah(struct efa_resource *resourc efa_unit_test_resource_construct(resource, FI_EP_RDM); efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); + efa_rdm_cq = container_of(resource->cq, struct efa_rdm_cq, util_cq.cq_fid.fid); /* Construct a minimal recv buffer */ efa_unit_test_buff_construct(&recv_buff, resource, efa_rdm_ep->min_multi_recv_size); @@ -425,27 +546,37 @@ static void test_impl_ibv_cq_ex_read_unknow_peer_ah(struct efa_resource *resourc assert_non_null(peer); peer->flags |= EFA_RDM_PEER_HANDSHAKE_SENT; - /* Setup packet entry */ + /* + * The rx pkt entry should only be allocated and posted by the progress engine. + * However, to mock a receive completion, we have to allocate an rx entry + * and modify it out of band. The progress engine grows the rx pool in the first + * call and sets efa_rdm_ep->efa_rx_pkts_posted to the rx pool size. Here we + * follow the progress engine and set the efa_rx_pkts_posted counter manually. + * TODO: modify the rx pkt as part of the ibv cq poll mock so we don't have to + * allocate a pkt entry and hack the pkt counters.
+ */ pkt_entry = efa_rdm_pke_alloc(efa_rdm_ep, efa_rdm_ep->efa_rx_pkt_pool, EFA_RDM_PKE_FROM_EFA_RX_POOL); - /* A receive completion requires efa rx pkts are posted */ - efa_rdm_ep->efa_rx_pkts_posted++; + assert_non_null(pkt_entry); + efa_rdm_ep->efa_rx_pkts_posted = efa_rdm_ep_get_rx_pool_size(efa_rdm_ep); + pkt_attr.msg_id = 0; pkt_attr.connid = raw_addr.qkey; /* Packet type must be in [EFA_RDM_REQ_PKT_BEGIN, EFA_RDM_EXTRA_REQ_PKT_END) */ efa_unit_test_eager_msgrtm_pkt_construct(pkt_entry, &pkt_attr); /* Setup CQ */ - efa_rdm_ep->ibv_cq_ex->wr_id = (uintptr_t)pkt_entry; - efa_rdm_ep->ibv_cq_ex->start_poll = &efa_mock_ibv_start_poll_return_mock; - efa_rdm_ep->ibv_cq_ex->next_poll = &efa_mock_ibv_next_poll_check_function_called_and_return_mock; - efa_rdm_ep->ibv_cq_ex->end_poll = &efa_mock_ibv_end_poll_check_mock; - efa_rdm_ep->ibv_cq_ex->read_slid = &efa_mock_ibv_read_slid_return_mock; - efa_rdm_ep->ibv_cq_ex->read_byte_len = &efa_mock_ibv_read_byte_len_return_mock; - efa_rdm_ep->ibv_cq_ex->read_opcode = &efa_mock_ibv_read_opcode_return_mock; - efa_rdm_ep->ibv_cq_ex->read_src_qp = &efa_mock_ibv_read_src_qp_return_mock; + efa_rdm_cq->ibv_cq.ibv_cq_ex->wr_id = (uintptr_t)pkt_entry; + efa_rdm_cq->ibv_cq.ibv_cq_ex->start_poll = &efa_mock_ibv_start_poll_return_mock; + efa_rdm_cq->ibv_cq.ibv_cq_ex->next_poll = &efa_mock_ibv_next_poll_check_function_called_and_return_mock; + efa_rdm_cq->ibv_cq.ibv_cq_ex->end_poll = &efa_mock_ibv_end_poll_check_mock; + efa_rdm_cq->ibv_cq.ibv_cq_ex->read_slid = &efa_mock_ibv_read_slid_return_mock; + efa_rdm_cq->ibv_cq.ibv_cq_ex->read_byte_len = &efa_mock_ibv_read_byte_len_return_mock; + efa_rdm_cq->ibv_cq.ibv_cq_ex->read_opcode = &efa_mock_ibv_read_opcode_return_mock; + efa_rdm_cq->ibv_cq.ibv_cq_ex->read_qp_num = &efa_mock_ibv_read_qp_num_return_mock; + efa_rdm_cq->ibv_cq.ibv_cq_ex->read_src_qp = &efa_mock_ibv_read_src_qp_return_mock; if (support_efadv_cq) { - efadv_cq = efadv_cq_from_ibv_cq_ex(efa_rdm_ep->ibv_cq_ex); + efadv_cq = efadv_cq_from_ibv_cq_ex(efa_rdm_cq->ibv_cq.ibv_cq_ex); assert_non_null(efadv_cq); efadv_cq->wc_read_sgid = &efa_mock_efadv_wc_read_sgid_return_zero_code_and_expect_next_poll_and_set_gid; @@ -462,6 +593,7 @@ static void test_impl_ibv_cq_ex_read_unknow_peer_ah(struct efa_resource *resourc will_return(efa_mock_ibv_read_slid_return_mock, 0xffff); // slid=0xffff(-1) indicates an unknown AH will_return(efa_mock_ibv_read_byte_len_return_mock, pkt_entry->pkt_size); will_return_maybe(efa_mock_ibv_read_opcode_return_mock, IBV_WC_RECV); + will_return_maybe(efa_mock_ibv_read_qp_num_return_mock, 0); will_return_maybe(efa_mock_ibv_read_src_qp_return_mock, raw_addr.qpn); /* Post receive buffer */ diff --git a/prov/efa/test/efa_unit_test_device.c b/prov/efa/test/efa_unit_test_device.c index 98452a9999b..319212da06a 100644 --- a/prov/efa/test/efa_unit_test_device.c +++ b/prov/efa/test/efa_unit_test_device.c @@ -1,3 +1,6 @@ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */ + #include "efa_unit_tests.h" /* diff --git a/prov/efa/test/efa_unit_test_domain.c b/prov/efa/test/efa_unit_test_domain.c index 77c3eb1509c..ccfa1c53149 100644 --- a/prov/efa/test/efa_unit_test_domain.c +++ b/prov/efa/test/efa_unit_test_domain.c @@ -1,3 +1,6 @@ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. 
*/ + #include "efa_unit_tests.h" /* test fi_open_ops with a wrong name */ diff --git a/prov/efa/test/efa_unit_test_ep.c b/prov/efa/test/efa_unit_test_ep.c index bd188c4120d..88358d644d3 100644 --- a/prov/efa/test/efa_unit_test_ep.c +++ b/prov/efa/test/efa_unit_test_ep.c @@ -1,4 +1,8 @@ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */ + #include "efa_unit_tests.h" +#include "rdm/efa_rdm_cq.h" /** * @brief Verify the EFA RDM endpoint correctly parses the host id string @@ -105,6 +109,7 @@ void test_efa_rdm_ep_handshake_exchange_host_id(struct efa_resource **state, uin struct efa_rdm_pke *pkt_entry; uint64_t actual_peer_host_id = UINT64_MAX; int ret; + struct efa_rdm_cq *efa_rdm_cq; g_efa_unit_test_mocks.local_host_id = local_host_id; g_efa_unit_test_mocks.peer_host_id = peer_host_id; @@ -114,6 +119,8 @@ void test_efa_rdm_ep_handshake_exchange_host_id(struct efa_resource **state, uin efa_unit_test_resource_construct(resource, FI_EP_RDM); efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); + efa_rdm_cq = container_of(resource->cq, struct efa_rdm_cq, util_cq.cq_fid.fid); + efa_rdm_ep->host_id = g_efa_unit_test_mocks.local_host_id; /* close shm_ep to force efa_rdm_ep to use efa device to send */ if (efa_rdm_ep->shm_ep) { @@ -136,9 +143,18 @@ void test_efa_rdm_ep_handshake_exchange_host_id(struct efa_resource **state, uin assert_int_equal(peer->host_id, 0); assert_int_not_equal(peer->flags & EFA_RDM_PEER_HANDSHAKE_SENT, EFA_RDM_PEER_HANDSHAKE_SENT); - /* Setup rx packet entry. Manually increase counter to avoid underflow */ + /* + * The rx pkt entry should only be allocated and posted by the progress engine. + * However, to mock a receive completion, we have to allocate an rx entry + * and modify it out of band. The progress engine grows the rx pool in the first + * call and sets efa_rdm_ep->efa_rx_pkts_posted to the rx pool size. Here we + * follow the progress engine and set the efa_rx_pkts_posted counter manually. + * TODO: modify the rx pkt as part of the ibv cq poll mock so we don't have to + * allocate a pkt entry and hack the pkt counters. + */ pkt_entry = efa_rdm_pke_alloc(efa_rdm_ep, efa_rdm_ep->efa_rx_pkt_pool, EFA_RDM_PKE_FROM_EFA_RX_POOL); - efa_rdm_ep->efa_rx_pkts_posted++; + assert_non_null(pkt_entry); + efa_rdm_ep->efa_rx_pkts_posted = efa_rdm_ep_get_rx_pool_size(efa_rdm_ep); pkt_attr.connid = include_connid ?
raw_addr.qkey : 0; pkt_attr.host_id = g_efa_unit_test_mocks.peer_host_id; @@ -156,16 +172,17 @@ void test_efa_rdm_ep_handshake_exchange_host_id(struct efa_resource **state, uin expect_function_call(efa_mock_ibv_wr_send_verify_handshake_pkt_local_host_id_and_save_wr); /* Setup CQ */ - efa_rdm_ep->ibv_cq_ex->end_poll = &efa_mock_ibv_end_poll_check_mock; - efa_rdm_ep->ibv_cq_ex->next_poll = &efa_mock_ibv_next_poll_check_function_called_and_return_mock; - efa_rdm_ep->ibv_cq_ex->read_byte_len = &efa_mock_ibv_read_byte_len_return_mock; - efa_rdm_ep->ibv_cq_ex->read_opcode = &efa_mock_ibv_read_opcode_return_mock; - efa_rdm_ep->ibv_cq_ex->read_slid = &efa_mock_ibv_read_slid_return_mock; - efa_rdm_ep->ibv_cq_ex->read_src_qp = &efa_mock_ibv_read_src_qp_return_mock; - efa_rdm_ep->ibv_cq_ex->read_vendor_err = &efa_mock_ibv_read_vendor_err_return_mock; - efa_rdm_ep->ibv_cq_ex->start_poll = &efa_mock_ibv_start_poll_return_mock; - efa_rdm_ep->ibv_cq_ex->status = IBV_WC_SUCCESS; - efa_rdm_ep->ibv_cq_ex->wr_id = (uintptr_t)pkt_entry; + efa_rdm_cq->ibv_cq.ibv_cq_ex->end_poll = &efa_mock_ibv_end_poll_check_mock; + efa_rdm_cq->ibv_cq.ibv_cq_ex->next_poll = &efa_mock_ibv_next_poll_check_function_called_and_return_mock; + efa_rdm_cq->ibv_cq.ibv_cq_ex->read_byte_len = &efa_mock_ibv_read_byte_len_return_mock; + efa_rdm_cq->ibv_cq.ibv_cq_ex->read_opcode = &efa_mock_ibv_read_opcode_return_mock; + efa_rdm_cq->ibv_cq.ibv_cq_ex->read_slid = &efa_mock_ibv_read_slid_return_mock; + efa_rdm_cq->ibv_cq.ibv_cq_ex->read_src_qp = &efa_mock_ibv_read_src_qp_return_mock; + efa_rdm_cq->ibv_cq.ibv_cq_ex->read_qp_num = &efa_mock_ibv_read_qp_num_return_mock; + efa_rdm_cq->ibv_cq.ibv_cq_ex->read_vendor_err = &efa_mock_ibv_read_vendor_err_return_mock; + efa_rdm_cq->ibv_cq.ibv_cq_ex->start_poll = &efa_mock_ibv_start_poll_return_mock; + efa_rdm_cq->ibv_cq.ibv_cq_ex->status = IBV_WC_SUCCESS; + efa_rdm_cq->ibv_cq.ibv_cq_ex->wr_id = (uintptr_t)pkt_entry; expect_function_call(efa_mock_ibv_next_poll_check_function_called_and_return_mock); /* Receive handshake packet */ @@ -173,6 +190,7 @@ void test_efa_rdm_ep_handshake_exchange_host_id(struct efa_resource **state, uin will_return(efa_mock_ibv_next_poll_check_function_called_and_return_mock, ENOENT); will_return(efa_mock_ibv_read_byte_len_return_mock, pkt_entry->pkt_size); will_return(efa_mock_ibv_read_opcode_return_mock, IBV_WC_RECV); + will_return(efa_mock_ibv_read_qp_num_return_mock, 0); will_return(efa_mock_ibv_read_slid_return_mock, efa_rdm_ep_get_peer_ahn(efa_rdm_ep, peer_addr)); will_return(efa_mock_ibv_read_src_qp_return_mock, raw_addr.qpn); will_return(efa_mock_ibv_start_poll_return_mock, IBV_WC_SUCCESS); @@ -183,6 +201,7 @@ void test_efa_rdm_ep_handshake_exchange_host_id(struct efa_resource **state, uin */ will_return(efa_mock_ibv_end_poll_check_mock, NULL); will_return(efa_mock_ibv_read_opcode_return_mock, IBV_WC_SEND); + will_return(efa_mock_ibv_read_qp_num_return_mock, 0); will_return(efa_mock_ibv_read_vendor_err_return_mock, FI_EFA_ERR_OTHER); will_return(efa_mock_ibv_start_poll_return_mock, IBV_WC_SUCCESS); @@ -195,8 +214,8 @@ void test_efa_rdm_ep_handshake_exchange_host_id(struct efa_resource **state, uin * We need to poll the CQ twice explicitly to point the CQE * to the saved send wr in handshake */ - efa_rdm_ep->ibv_cq_ex->status = IBV_WC_GENERAL_ERR; - efa_rdm_ep->ibv_cq_ex->wr_id = (uintptr_t)g_ibv_submitted_wr_id_vec[0]; + efa_rdm_cq->ibv_cq.ibv_cq_ex->status = IBV_WC_GENERAL_ERR; + efa_rdm_cq->ibv_cq.ibv_cq_ex->wr_id = (uintptr_t)g_ibv_submitted_wr_id_vec[0]; /* 
Progress the send wr to clean up outstanding tx ops */ cq_read_send_ret = fi_cq_read(resource->cq, &cq_entry, 1); @@ -237,48 +256,6 @@ void test_efa_rdm_ep_handshake_receive_without_peer_host_id_and_do_not_send_loca test_efa_rdm_ep_handshake_exchange_host_id(state, 0x0, 0x0, true); } -/** - * @brief Test efa_rdm_ep_open() handles rdma-core CQ creation failure gracefully - * - * @param[in] state struct efa_resource that is managed by the framework - */ -void test_efa_rdm_ep_cq_create_error_handling(struct efa_resource **state) -{ - - struct efa_resource *resource = *state; - struct ibv_device **ibv_device_list; - struct efa_device efa_device = {0}; - struct efa_domain *efa_domain = NULL; - struct verbs_context *vctx = NULL; - - ibv_device_list = ibv_get_device_list(&g_device_cnt); - if (ibv_device_list == NULL) { - skip(); - return; - } - efa_device_construct(&efa_device, 0, ibv_device_list[0]); - - resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM); - assert_non_null(resource->hints); - assert_int_equal(fi_getinfo(FI_VERSION(1, 14), NULL, NULL, 0ULL, resource->hints, &resource->info), 0); - assert_int_equal(fi_fabric(resource->info->fabric_attr, &resource->fabric, NULL), 0); - assert_int_equal(fi_domain(resource->fabric, resource->info, &resource->domain, NULL), 0); - - vctx = verbs_get_ctx_op(efa_device.ibv_ctx, create_cq_ex); -#if HAVE_EFADV_CQ_EX - g_efa_unit_test_mocks.efadv_create_cq = &efa_mock_efadv_create_cq_set_eopnotsupp_and_return_null; - expect_function_call(efa_mock_efadv_create_cq_set_eopnotsupp_and_return_null); -#endif - /* Mock out the create_cq_ex function pointer which is called by ibv_create_cq_ex */ - vctx->create_cq_ex = &efa_mock_create_cq_ex_return_null; - expect_function_call(efa_mock_create_cq_ex_return_null); - - efa_domain = container_of(resource->domain, struct efa_domain, util_domain.domain_fid); - efa_domain->device = &efa_device; - - assert_int_not_equal(fi_endpoint(resource->domain, resource->info, &resource->ep, NULL), 0); -} - static void check_ep_pkt_pool_flags(struct fid_ep *ep, int expected_flags) { struct efa_rdm_ep *efa_rdm_ep; @@ -458,7 +435,7 @@ void test_efa_rdm_ep_rma_without_caps(struct efa_resource **state) resource->hints->caps |= FI_MSG | FI_TAGGED; resource->hints->caps &= ~FI_RMA; resource->hints->domain_attr->mr_mode = FI_MR_BASIC; - efa_unit_test_resource_construct_with_hints(resource, FI_EP_RDM, resource->hints, true); + efa_unit_test_resource_construct_with_hints(resource, FI_EP_RDM, resource->hints, true, true); /* ensure we don't have RMA capability. */ efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); @@ -508,7 +485,7 @@ void test_efa_rdm_ep_atomic_without_caps(struct efa_resource **state) resource->hints->caps |= FI_MSG | FI_TAGGED; resource->hints->caps &= ~FI_ATOMIC; resource->hints->domain_attr->mr_mode = FI_MR_BASIC; - efa_unit_test_resource_construct_with_hints(resource, FI_EP_RDM, resource->hints, true); + efa_unit_test_resource_construct_with_hints(resource, FI_EP_RDM, resource->hints, true, true); /* ensure we don't have ATOMIC capability. */ efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); @@ -602,3 +579,137 @@ void test_efa_rdm_ep_setopt_shared_memory_permitted(struct efa_resource **state) assert_null(ep->shm_ep); } + +/** + * @brief Test fi_enable with different optval of fi_setopt for + * FI_OPT_EFA_WRITE_IN_ORDER_ALIGNED_128_BYTES optname. 
+ * @param state struct efa_resource that is managed by the framework + * @param expected_status expected return status of the fi_setopt call + * @param optval the optval passed to fi_setopt + */ +void test_efa_rdm_ep_enable_qp_in_order_aligned_128_bytes_common(struct efa_resource **state, int expected_status, bool optval) +{ + struct efa_resource *resource = *state; + + efa_unit_test_resource_construct_ep_not_enabled(resource, FI_EP_RDM); + + /* fi_setopt succeeds only when the requested in-order mode is supported */ + assert_int_equal(fi_setopt(&resource->ep->fid, FI_OPT_ENDPOINT, + FI_OPT_EFA_WRITE_IN_ORDER_ALIGNED_128_BYTES, &optval, + sizeof(optval)), expected_status); +} + +#if HAVE_EFA_DATA_IN_ORDER_ALIGNED_128_BYTES +/** + * @brief Test the case where fi_enable should return success + * + * @param state struct efa_resource that is managed by the framework + */ +void test_efa_rdm_ep_enable_qp_in_order_aligned_128_bytes_good(struct efa_resource **state) +{ + /* mock ibv_query_qp_data_in_order to return required capability */ + g_efa_unit_test_mocks.ibv_query_qp_data_in_order = &efa_mock_ibv_query_qp_data_in_order_return_in_order_aligned_128_bytes; + test_efa_rdm_ep_enable_qp_in_order_aligned_128_bytes_common(state, FI_SUCCESS, true); +} + +/** + * @brief Test the case where fi_enable should return -FI_EOPNOTSUPP + * + * @param state struct efa_resource that is managed by the framework + */ +void test_efa_rdm_ep_enable_qp_in_order_aligned_128_bytes_bad(struct efa_resource **state) +{ + /* mock ibv_query_qp_data_in_order to return zero capability */ + g_efa_unit_test_mocks.ibv_query_qp_data_in_order = &efa_mock_ibv_query_qp_data_in_order_return_0; + test_efa_rdm_ep_enable_qp_in_order_aligned_128_bytes_common(state, -FI_EOPNOTSUPP, true); +} + +#else + +void test_efa_rdm_ep_enable_qp_in_order_aligned_128_bytes_good(struct efa_resource **state) +{ + test_efa_rdm_ep_enable_qp_in_order_aligned_128_bytes_common(state, FI_SUCCESS, false); +} + +void test_efa_rdm_ep_enable_qp_in_order_aligned_128_bytes_bad(struct efa_resource **state) +{ + test_efa_rdm_ep_enable_qp_in_order_aligned_128_bytes_common(state, -FI_EOPNOTSUPP, true); +} + +#endif + +static void +test_efa_rdm_ep_use_zcpy_rx_impl(struct efa_resource *resource, bool expected_use_zcpy_rx) { + struct efa_rdm_ep *ep; + + efa_unit_test_resource_construct_with_hints(resource, FI_EP_RDM, resource->hints, false, false); + + ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); + + assert_true(ep->use_zcpy_rx == expected_use_zcpy_rx); +} + +/** + * @brief Verify zcpy_rx is enabled when the following requirements are met: + * 1. app doesn't require FI_ORDER_SAS in tx or rx's msg_order + * 2. app uses FI_MSG_PREFIX mode + * 3. app's max msg size is smaller than mtu_size - prefix_size + * 4. app doesn't use FI_DIRECTED_RECV, FI_TAGGED, FI_ATOMIC capability + */ +void test_efa_rdm_ep_user_zcpy_rx_happy(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + + resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM); + assert_non_null(resource->hints); + + /* Just use a small enough size */ + resource->hints->ep_attr->max_msg_size = 1000; + resource->hints->tx_attr->msg_order = FI_ORDER_NONE; + resource->hints->rx_attr->msg_order = FI_ORDER_NONE; + resource->hints->mode = FI_MSG_PREFIX; + resource->hints->caps = FI_MSG; + + test_efa_rdm_ep_use_zcpy_rx_impl(resource, true); +} + +/** + * @brief When sas is requested for either tx or rx,
zcpy will be disabled + */ +void test_efa_rdm_ep_user_zcpy_rx_unhappy_due_to_sas(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + + resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM); + assert_non_null(resource->hints); + + /* Just use a small enough size */ + resource->hints->ep_attr->max_msg_size = 1000; + + resource->hints->tx_attr->msg_order = FI_ORDER_SAS; + resource->hints->rx_attr->msg_order = FI_ORDER_NONE; + resource->hints->mode = FI_MSG_PREFIX; + resource->hints->caps = FI_MSG; + + test_efa_rdm_ep_use_zcpy_rx_impl(resource, false); +} + +/** + * @brief zcpy will be disabled if app doesn't use FI_MSG_PREFIX mode. + */ +void test_efa_rdm_ep_user_zcpy_rx_unhappy_due_to_no_prefix(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + + resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM); + assert_non_null(resource->hints); + + /* Just use a small enough size */ + resource->hints->ep_attr->max_msg_size = 1000; + + resource->hints->tx_attr->msg_order = FI_ORDER_NONE; + resource->hints->rx_attr->msg_order = FI_ORDER_NONE; + resource->hints->caps = FI_MSG; + + test_efa_rdm_ep_use_zcpy_rx_impl(resource, false); +} diff --git a/prov/efa/test/efa_unit_test_fork_support.c b/prov/efa/test/efa_unit_test_fork_support.c index e401bfd1a30..ab1e9e01a00 100644 --- a/prov/efa/test/efa_unit_test_fork_support.c +++ b/prov/efa/test/efa_unit_test_fork_support.c @@ -1,3 +1,6 @@ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */ + #include "efa_unit_tests.h" /** diff --git a/prov/efa/test/efa_unit_test_hmem.c b/prov/efa/test/efa_unit_test_hmem.c index a21242b234d..febef0354d2 100644 --- a/prov/efa/test/efa_unit_test_hmem.c +++ b/prov/efa/test/efa_unit_test_hmem.c @@ -1,3 +1,6 @@ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */ + #include "efa_unit_tests.h" @@ -8,7 +11,7 @@ * when HAVE_NEURON=1, will still return 0 but leave * efa_hmem_info[FI_HMEM_NEURON].initialized and * efa_hmem_info[FI_HMEM_NEURON].p2p_supported_by_device as false. - * + * * @param[in] state struct efa_resource that is managed by the framework */ void test_efa_hmem_info_update_neuron(struct efa_resource **state) diff --git a/prov/efa/test/efa_unit_test_info.c b/prov/efa/test/efa_unit_test_info.c index a0fdfaaf22c..b48232aedcc 100644 --- a/prov/efa/test/efa_unit_test_info.c +++ b/prov/efa/test/efa_unit_test_info.c @@ -1,3 +1,6 @@ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */ + #include "efa_unit_tests.h" /** @@ -99,6 +102,169 @@ void test_info_open_ep_with_api_1_1_info() assert_int_equal(err, 0); } +/** + * @brief Verify info->ep_attr is set according to hints. + * + */ +void test_info_ep_attr() +{ + struct fi_info *hints, *info; + int err; + + hints = efa_unit_test_alloc_hints(FI_EP_RDM); + assert_non_null(hints); + + hints->ep_attr->max_msg_size = 1024; + + err = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), NULL, NULL, 0ULL, hints, &info); + + assert_int_equal(err, 0); + assert_int_equal(hints->ep_attr->max_msg_size, info->ep_attr->max_msg_size); + + fi_freeinfo(info); +} + +/** + * @brief Verify info->tx/rx_attr->msg_order is set according to hints. 
+ * + */ +static void +test_info_tx_rx_msg_order_from_hints(struct fi_info *hints, int expected_ret) +{ + struct fi_info *info; + int err; + + err = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), NULL, NULL, 0ULL, hints, &info); + + assert_int_equal(err, expected_ret); + + if (expected_ret == FI_SUCCESS) { + assert_true(hints->tx_attr->msg_order == info->tx_attr->msg_order); + assert_true(hints->rx_attr->msg_order == info->rx_attr->msg_order); + } + + fi_freeinfo(info); +} + +/** + * @brief Verify info->tx/rx_attr->op_flags is set according to hints. + * + */ +static void +test_info_tx_rx_op_flags_from_hints(struct fi_info *hints, int expected_ret) +{ + struct fi_info *info; + int err; + + err = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), NULL, NULL, 0ULL, hints, &info); + + assert_int_equal(err, expected_ret); + + if (expected_ret == FI_SUCCESS) { + assert_true(hints->tx_attr->op_flags == info->tx_attr->op_flags); + assert_true(hints->rx_attr->op_flags == info->rx_attr->op_flags); + } + + fi_freeinfo(info); +} + +/** + * @brief Verify info->tx/rx_attr->size is set according to hints. + * + */ +static void test_info_tx_rx_size_from_hints(struct fi_info *hints, int expected_ret) +{ + struct fi_info *info; + int err; + + err = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), NULL, NULL, 0ULL, hints, &info); + + assert_int_equal(err, expected_ret); + + if (expected_ret == FI_SUCCESS) { + assert_true(hints->tx_attr->size == info->tx_attr->size); + assert_true(hints->rx_attr->size == info->rx_attr->size); + } + + fi_freeinfo(info); +} + +void test_info_tx_rx_msg_order_rdm_order_none(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + + resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM); + assert_non_null(resource->hints); + + resource->hints->tx_attr->msg_order = FI_ORDER_NONE; + resource->hints->rx_attr->msg_order = FI_ORDER_NONE; + test_info_tx_rx_msg_order_from_hints(resource->hints, 0); +} + +void test_info_tx_rx_msg_order_rdm_order_sas(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + + resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM); + assert_non_null(resource->hints); + + resource->hints->tx_attr->msg_order = FI_ORDER_SAS; + resource->hints->rx_attr->msg_order = FI_ORDER_SAS; + test_info_tx_rx_msg_order_from_hints(resource->hints, 0); +} + +void test_info_tx_rx_msg_order_dgram_order_none(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + + resource->hints = efa_unit_test_alloc_hints(FI_EP_DGRAM); + assert_non_null(resource->hints); + + resource->hints->tx_attr->msg_order = FI_ORDER_NONE; + resource->hints->rx_attr->msg_order = FI_ORDER_NONE; + test_info_tx_rx_msg_order_from_hints(resource->hints, 0); +} + +/** + * @brief dgram endpoint doesn't support any ordering, so fi_getinfo + * should return -FI_ENODATA if hints requests sas + */ +void test_info_tx_rx_msg_order_dgram_order_sas(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + + resource->hints = efa_unit_test_alloc_hints(FI_EP_DGRAM); + assert_non_null(resource->hints); + + resource->hints->tx_attr->msg_order = FI_ORDER_SAS; + resource->hints->rx_attr->msg_order = FI_ORDER_SAS; + test_info_tx_rx_msg_order_from_hints(resource->hints, -FI_ENODATA); +} + +void test_info_tx_rx_op_flags_rdm(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + + resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM); + assert_non_null(resource->hints); + + 
resource->hints->tx_attr->op_flags = FI_DELIVERY_COMPLETE; + resource->hints->rx_attr->op_flags = FI_COMPLETION; + test_info_tx_rx_op_flags_from_hints(resource->hints, 0); +} + +void test_info_tx_rx_size_rdm(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + + resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM); + assert_non_null(resource->hints); + + resource->hints->tx_attr->size = 16; + resource->hints->rx_attr->size = 16; + test_info_tx_rx_size_from_hints(resource->hints, 0); +} + static void test_info_check_shm_info_from_hints(struct fi_info *hints) { struct fi_info *info; diff --git a/prov/efa/test/efa_unit_test_mocks.c b/prov/efa/test/efa_unit_test_mocks.c index 927ec7d771a..af9f2671b10 100644 --- a/prov/efa/test/efa_unit_test_mocks.c +++ b/prov/efa/test/efa_unit_test_mocks.c @@ -1,3 +1,6 @@ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */ + #define _GNU_SOURCE #include #include @@ -169,6 +172,11 @@ uint32_t efa_mock_ibv_read_vendor_err_return_mock(struct ibv_cq_ex *current) return mock(); } +uint32_t efa_mock_ibv_read_qp_num_return_mock(struct ibv_cq_ex *current) +{ + return mock(); +} + int g_ofi_copy_from_hmem_iov_call_counter; ssize_t efa_mock_ofi_copy_from_hmem_iov_inc_counter(void *dest, size_t size, enum fi_hmem_iface hmem_iface, uint64_t device, @@ -195,6 +203,9 @@ struct efa_unit_test_mocks g_efa_unit_test_mocks = { #if HAVE_EFADV_QUERY_MR .efadv_query_mr = __real_efadv_query_mr, #endif +#if HAVE_EFA_DATA_IN_ORDER_ALIGNED_128_BYTES + .ibv_query_qp_data_in_order = __real_ibv_query_qp_data_in_order, +#endif }; struct ibv_ah *__wrap_ibv_create_ah(struct ibv_pd *pd, struct ibv_ah_attr *attr) @@ -349,3 +360,20 @@ int efa_mock_efadv_query_mr_recv_and_rdma_read_ic_id_0_1(struct ibv_mr *ibv_mr, } #endif /* HAVE_EFADV_QUERY_MR */ + +#if HAVE_EFA_DATA_IN_ORDER_ALIGNED_128_BYTES +int __wrap_ibv_query_qp_data_in_order(struct ibv_qp *qp, enum ibv_wr_opcode op, uint32_t flags) +{ + return g_efa_unit_test_mocks.ibv_query_qp_data_in_order(qp, op, flags); +} + +int efa_mock_ibv_query_qp_data_in_order_return_0(struct ibv_qp *qp, enum ibv_wr_opcode op, uint32_t flags) +{ + return 0; +} + +int efa_mock_ibv_query_qp_data_in_order_return_in_order_aligned_128_bytes(struct ibv_qp *qp, enum ibv_wr_opcode op, uint32_t flags) +{ + return IBV_QUERY_QP_DATA_IN_ORDER_ALIGNED_128_BYTES; +} +#endif diff --git a/prov/efa/test/efa_unit_test_mocks.h b/prov/efa/test/efa_unit_test_mocks.h index 7bf37b0c0a6..a23864a38b8 100644 --- a/prov/efa/test/efa_unit_test_mocks.h +++ b/prov/efa/test/efa_unit_test_mocks.h @@ -1,3 +1,6 @@ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. 
*/ + #ifndef EFA_UNIT_TEST_RDMA_CORE_MOCKS_H #define EFA_UNIT_TEST_RDMA_CORE_MOCKS_H @@ -65,6 +68,8 @@ uint32_t efa_mock_ibv_read_opcode_return_mock(struct ibv_cq_ex *current); uint32_t efa_mock_ibv_read_vendor_err_return_mock(struct ibv_cq_ex *current); +uint32_t efa_mock_ibv_read_qp_num_return_mock(struct ibv_cq_ex *current); + ssize_t __real_ofi_copy_from_hmem_iov(void *dest, size_t size, enum fi_hmem_iface hmem_iface, uint64_t device, const struct iovec *hmem_iov, @@ -106,6 +111,10 @@ struct efa_unit_test_mocks #if HAVE_EFADV_QUERY_MR int (*efadv_query_mr)(struct ibv_mr *ibv_mr, struct efadv_mr_attr *attr, uint32_t inlen); #endif + +#if HAVE_EFA_DATA_IN_ORDER_ALIGNED_128_BYTES + int (*ibv_query_qp_data_in_order)(struct ibv_qp *qp, enum ibv_wr_opcode op, uint32_t flags); +#endif }; struct ibv_cq_ex *efa_mock_create_cq_ex_return_null(struct ibv_context *context, struct ibv_cq_init_attr_ex *init_attr); @@ -146,6 +155,12 @@ int efa_mock_efadv_query_mr_rdma_recv_ic_id_2(struct ibv_mr *ibv_mr, struct efad int efa_mock_efadv_query_mr_recv_and_rdma_read_ic_id_0_1(struct ibv_mr *ibv_mr, struct efadv_mr_attr *attr, uint32_t inlen); #endif +#if HAVE_EFA_DATA_IN_ORDER_ALIGNED_128_BYTES +int __real_ibv_query_qp_data_in_order(struct ibv_qp *qp, enum ibv_wr_opcode op, uint32_t flags); +int efa_mock_ibv_query_qp_data_in_order_return_0(struct ibv_qp *qp, enum ibv_wr_opcode op, uint32_t flags); +int efa_mock_ibv_query_qp_data_in_order_return_in_order_aligned_128_bytes(struct ibv_qp *qp, enum ibv_wr_opcode op, uint32_t flags); +#endif + enum ibv_fork_status __real_ibv_is_fork_initialized(void); enum ibv_fork_status efa_mock_ibv_is_fork_initialized_return_mock(void); diff --git a/prov/efa/test/efa_unit_test_ope.c b/prov/efa/test/efa_unit_test_ope.c index 7e8fc897377..c9a4d23ae26 100644 --- a/prov/efa/test/efa_unit_test_ope.c +++ b/prov/efa/test/efa_unit_test_ope.c @@ -1,3 +1,6 @@ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */ + #include "efa_unit_tests.h" void test_efa_rdm_ope_prepare_to_post_send_impl(struct efa_resource *resource, diff --git a/prov/efa/test/efa_unit_test_rnr.c b/prov/efa/test/efa_unit_test_rnr.c index 5c38a05051c..411cc030dd2 100644 --- a/prov/efa/test/efa_unit_test_rnr.c +++ b/prov/efa/test/efa_unit_test_rnr.c @@ -1,3 +1,6 @@ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */ + #include "efa_unit_tests.h" #include "efa_rdm_pke_cmd.h" diff --git a/prov/efa/test/efa_unit_test_runt.c b/prov/efa/test/efa_unit_test_runt.c index 0769fe48cf5..9bb23a2d7bd 100644 --- a/prov/efa/test/efa_unit_test_runt.c +++ b/prov/efa/test/efa_unit_test_runt.c @@ -1,3 +1,6 @@ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */ + #include "efa_unit_tests.h" /** diff --git a/prov/efa/test/efa_unit_test_send.c b/prov/efa/test/efa_unit_test_send.c index 9ca2c60bebc..b3ed1a7873c 100644 --- a/prov/efa/test/efa_unit_test_send.c +++ b/prov/efa/test/efa_unit_test_send.c @@ -1,34 +1,5 @@ -/* - * Copyright (c) Amazon.com, Inc. or its affiliates. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. 
You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */ #include "efa_unit_tests.h" #include "ofi_util.h" diff --git a/prov/efa/test/efa_unit_test_srx.c b/prov/efa/test/efa_unit_test_srx.c index 494040233e3..733faa67d57 100644 --- a/prov/efa/test/efa_unit_test_srx.c +++ b/prov/efa/test/efa_unit_test_srx.c @@ -1,34 +1,5 @@ -/* - * Copyright (c) Amazon.com, Inc. or its affiliates. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. 
*/ #include "efa_unit_tests.h" #include "ofi_util.h" @@ -38,7 +9,7 @@ * @brief This test validates whether the default min_multi_recv size is correctly * passed from ep to srx, and whether is correctly modified when application * change it via fi_setopt - * + * */ void test_efa_srx_min_multi_recv_size(struct efa_resource **state) { diff --git a/prov/efa/test/efa_unit_tests.c b/prov/efa/test/efa_unit_tests.c index ab7810e70f7..fe075e57919 100644 --- a/prov/efa/test/efa_unit_tests.c +++ b/prov/efa/test/efa_unit_tests.c @@ -1,3 +1,6 @@ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */ + #include "efa_unit_tests.h" struct efa_env orig_efa_env = {0}; @@ -84,16 +87,21 @@ int main(void) cmocka_unit_test_setup_teardown(test_efa_rdm_ep_getopt_undersized_optlen, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_rdm_ep_getopt_oversized_optlen, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_rdm_ep_setopt_shared_memory_permitted, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), - cmocka_unit_test_setup_teardown(test_efa_rdm_ep_cq_create_error_handling, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_rdm_ep_enable_qp_in_order_aligned_128_bytes_good, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_rdm_ep_enable_qp_in_order_aligned_128_bytes_bad, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_rdm_ep_pkt_pool_flags, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_rdm_ep_pkt_pool_page_alignment, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_rdm_ep_dc_atomic_error_handling, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_rdm_ep_send_with_shm_no_copy, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_rdm_ep_rma_without_caps, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_rdm_ep_atomic_without_caps, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_rdm_ep_user_zcpy_rx_happy, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_rdm_ep_user_zcpy_rx_unhappy_due_to_sas, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_rdm_ep_user_zcpy_rx_unhappy_due_to_no_prefix, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_dgram_cq_read_empty_cq, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_ibv_cq_ex_read_empty_cq, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_ibv_cq_ex_read_failed_poll, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_rdm_cq_create_error_handling, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_rdm_cq_read_bad_send_status_unresponsive_receiver, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), 
cmocka_unit_test_setup_teardown(test_rdm_cq_read_bad_send_status_unresponsive_receiver_missing_peer_host_id, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_rdm_cq_read_bad_send_status_invalid_qpn, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), @@ -104,6 +112,13 @@ int main(void) cmocka_unit_test_setup_teardown(test_rdm_fallback_to_ibv_create_cq_ex_cq_read_ignore_forgotton_peer, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_info_open_ep_with_wrong_info, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_info_open_ep_with_api_1_1_info, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_info_ep_attr, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_info_tx_rx_msg_order_rdm_order_none, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_info_tx_rx_msg_order_rdm_order_sas, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_info_tx_rx_msg_order_dgram_order_none, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_info_tx_rx_msg_order_dgram_order_sas, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_info_tx_rx_op_flags_rdm, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_info_tx_rx_size_rdm, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_info_check_shm_info_hmem, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_info_check_shm_info_op_flags, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_info_check_shm_info_threading, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), @@ -144,6 +159,10 @@ int main(void) cmocka_unit_test_setup_teardown(test_efa_rdm_peer_select_readbase_rtm_do_runt, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_domain_open_ops_wrong_name, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_domain_open_ops_mr_query, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_rdm_cq_ibv_cq_poll_list_same_tx_rx_cq_single_ep, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_rdm_cq_ibv_cq_poll_list_separate_tx_rx_cq_single_ep, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_rdm_cntr_ibv_cq_poll_list_same_tx_rx_cq_single_ep, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_rdm_cntr_ibv_cq_poll_list_separate_tx_rx_cq_single_ep, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), }; cmocka_set_message_output(CM_OUTPUT_XML); diff --git a/prov/efa/test/efa_unit_tests.h b/prov/efa/test/efa_unit_tests.h index be47a9f7ab8..07259f285ed 100644 --- a/prov/efa/test/efa_unit_tests.h +++ b/prov/efa/test/efa_unit_tests.h @@ -1,3 +1,6 @@ +/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */ +/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. 
*/ + #ifndef EFA_UNIT_TESTS_H #define EFA_UNIT_TESTS_H @@ -32,10 +35,12 @@ struct fi_info *efa_unit_test_alloc_hints(enum fi_ep_type ep_type); void efa_unit_test_resource_construct(struct efa_resource *resource, enum fi_ep_type ep_type); void efa_unit_test_resource_construct_ep_not_enabled( struct efa_resource *resource, enum fi_ep_type ep_type); +void efa_unit_test_resource_construct_no_cq_and_ep_not_enabled( + struct efa_resource *resource, enum fi_ep_type ep_type); void efa_unit_test_resource_construct_with_hints(struct efa_resource *resource, enum fi_ep_type ep_type, struct fi_info *hints, - bool enable_ep); + bool enable_ep, bool open_cq); void efa_unit_test_resource_destruct(struct efa_resource *resource); @@ -95,7 +100,6 @@ void test_efa_rdm_ep_handshake_receive_valid_peer_host_id_and_do_not_send_local_ void test_efa_rdm_ep_handshake_receive_without_peer_host_id_and_do_not_send_local_host_id(); void test_efa_rdm_ep_getopt_undersized_optlen(); void test_efa_rdm_ep_getopt_oversized_optlen(); -void test_efa_rdm_ep_cq_create_error_handling(); void test_efa_rdm_ep_pkt_pool_flags(); void test_efa_rdm_ep_pkt_pool_page_alignment(); void test_efa_rdm_ep_dc_atomic_error_handling(); @@ -103,9 +107,15 @@ void test_efa_rdm_ep_send_with_shm_no_copy(); void test_efa_rdm_ep_rma_without_caps(); void test_efa_rdm_ep_atomic_without_caps(); void test_efa_rdm_ep_setopt_shared_memory_permitted(); +void test_efa_rdm_ep_enable_qp_in_order_aligned_128_bytes_good(); +void test_efa_rdm_ep_enable_qp_in_order_aligned_128_bytes_bad(); +void test_efa_rdm_ep_user_zcpy_rx_happy(); +void test_efa_rdm_ep_user_zcpy_rx_unhappy_due_to_sas(); +void test_efa_rdm_ep_user_zcpy_rx_unhappy_due_to_no_prefix(); void test_dgram_cq_read_empty_cq(); void test_ibv_cq_ex_read_empty_cq(); void test_ibv_cq_ex_read_failed_poll(); +void test_rdm_cq_create_error_handling(); void test_rdm_cq_read_bad_send_status_unresponsive_receiver(); void test_rdm_cq_read_bad_send_status_unresponsive_receiver_missing_peer_host_id(); void test_rdm_cq_read_bad_send_status_invalid_qpn(); @@ -116,6 +126,13 @@ void test_rdm_fallback_to_ibv_create_cq_ex_cq_read_ignore_forgotton_peer(); void test_ibv_cq_ex_read_ignore_removed_peer(); void test_info_open_ep_with_wrong_info(); void test_info_open_ep_with_api_1_1_info(); +void test_info_ep_attr(); +void test_info_tx_rx_msg_order_rdm_order_none(); +void test_info_tx_rx_msg_order_rdm_order_sas(); +void test_info_tx_rx_msg_order_dgram_order_none(); +void test_info_tx_rx_msg_order_dgram_order_sas(); +void test_info_tx_rx_op_flags_rdm(); +void test_info_tx_rx_size_rdm(); void test_info_check_shm_info_hmem(); void test_info_check_shm_info_op_flags(); void test_info_check_shm_info_threading(); @@ -156,5 +173,8 @@ void test_efa_rdm_peer_select_readbase_rtm_no_runt(); void test_efa_rdm_peer_select_readbase_rtm_do_runt(); void test_efa_domain_open_ops_wrong_name(); void test_efa_domain_open_ops_mr_query(); - +void test_efa_rdm_cq_ibv_cq_poll_list_same_tx_rx_cq_single_ep(); +void test_efa_rdm_cq_ibv_cq_poll_list_separate_tx_rx_cq_single_ep(); +void test_efa_rdm_cntr_ibv_cq_poll_list_same_tx_rx_cq_single_ep(); +void test_efa_rdm_cntr_ibv_cq_poll_list_separate_tx_rx_cq_single_ep(); #endif diff --git a/prov/opx/configure.m4 b/prov/opx/configure.m4 index e5c34a59a09..4fa5ebd43f4 100644 --- a/prov/opx/configure.m4 +++ b/prov/opx/configure.m4 @@ -49,7 +49,7 @@ AC_DEFUN([FI_OPX_CONFIGURE],[ dnl and is not supported for non-x86 processors. 
AS_IF([test "x$macos" = "x1"],[opx_happy=0], [test "x$freebsd" = "x1"],[opx_happy=0], - [test x$host_cpu != xx86_64],[opx_happy=0], + [test x$host_cpu != xx86_64 && test x$host_cpu != xriscv && test x$host_cpu != xriscv64],[opx_happy=0], [test x"$enable_opx" != x"no"],[ AC_MSG_CHECKING([for opx provider]) diff --git a/prov/opx/include/fi_opx_tid.h b/prov/opx/include/fi_opx_tid.h index 29d1e46e4f5..436c640209d 100644 --- a/prov/opx/include/fi_opx_tid.h +++ b/prov/opx/include/fi_opx_tid.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2022-2023 Cornelis Networks. + * Copyright (C) 2022-2024 Cornelis Networks. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -111,20 +111,11 @@ ctx->__hfi_tidexpcnt) */ #define OPX_MAX_TID_COUNT 2048 -#define OPX_TID_VADDR(tid_reuse_cache) (tid_reuse_cache->tid_vaddr) -#define OPX_TID_LENGTH(tid_reuse_cache) (tid_reuse_cache->tid_length) -#define OPX_TID_NINFO(tid_reuse_cache) (tid_reuse_cache->ninfo) -#define OPX_TID_INFO(tid_reuse_cache, idx) (tid_reuse_cache->info[idx]) -#define OPX_TID_NPAIRS(tid_reuse_cache) (tid_reuse_cache->npairs) -#define OPX_TID_PAIR(tid_reuse_cache, idx) (tid_reuse_cache->pairs[idx]) -#define OPX_TID_IS_INVALID(tid_reuse_cache) (tid_reuse_cache->invalid) -#define OPX_TID_INVALID(tid_reuse_cache) (tid_reuse_cache->invalid = 1) -#define OPX_TID_VALID(tid_reuse_cache) (tid_reuse_cache->invalid = 0) #define OPX_TID_NPAGES(tid_reuse_cache, npages) \ do { \ npages = 0; \ - const uint32_t *tids = &OPX_TID_INFO(tid_reuse_cache, 0); \ - const uint32_t ntids = OPX_TID_NINFO(tid_reuse_cache); \ + const uint32_t *tids = &tid_reuse_cache->info[0]; \ + const uint32_t ntids = tid_reuse_cache->ninfo; \ for (int i = 0; i < ntids; ++i) { \ npages += (int)FI_OPX_EXP_TID_GET(tids[i], LEN); \ FI_DBG(fi_opx_global.prov, FI_LOG_MR, \ @@ -211,10 +202,10 @@ static inline void OPX_TID_CACHE_DEBUG_FPRINTF(const char *format, ...) __func__, __LINE__, \ string, \ tid_vaddr, tid_vaddr + tid_length, tid_length, \ - OPX_TID_VADDR(tid_reuse_cache), \ - OPX_TID_VADDR(tid_reuse_cache) + \ - OPX_TID_LENGTH(tid_reuse_cache), \ - OPX_TID_LENGTH(tid_reuse_cache), count); \ + tid_reuse_cache->tid_vaddr, \ + tid_reuse_cache->tid_vaddr + \ + tid_reuse_cache->tid_length, \ + tid_reuse_cache->tid_length, count); \ last_vaddr = tid_vaddr; \ last_length = tid_length; \ count = 0; \ @@ -226,10 +217,10 @@ static inline void OPX_TID_CACHE_DEBUG_FPRINTF(const char *format, ...) "tid vaddr [%#lx - %#lx] length %lu\n", \ string, tid_vaddr, \ tid_vaddr + tid_length, tid_length, \ - OPX_TID_VADDR(tid_reuse_cache), \ - OPX_TID_VADDR(tid_reuse_cache) + \ - OPX_TID_LENGTH(tid_reuse_cache), \ - OPX_TID_LENGTH(tid_reuse_cache)); \ + tid_reuse_cache->tid_vaddr, \ + tid_reuse_cache->tid_vaddr + \ + tid_reuse_cache->tid_length, \ + tid_reuse_cache->tid_length); \ } while (0) #else /* noisier regular debug logging */ @@ -240,10 +231,10 @@ static inline void OPX_TID_CACHE_DEBUG_FPRINTF(const char *format, ...) 
"tid vaddr [%#lx - %#lx] length %lu\n", \ string, tid_vaddr, \ tid_vaddr + tid_length, tid_length, \ - OPX_TID_VADDR(tid_reuse_cache), \ - OPX_TID_VADDR(tid_reuse_cache) + \ - OPX_TID_LENGTH(tid_reuse_cache), \ - OPX_TID_LENGTH(tid_reuse_cache)); + tid_reuse_cache->tid_vaddr, \ + tid_reuse_cache->tid_vaddr + \ + tid_reuse_cache->tid_length, \ + tid_reuse_cache->tid_length); #endif /* Special debug for expected receive data ONLY */ @@ -253,8 +244,8 @@ static inline void OPX_TID_CACHE_DEBUG_FPRINTF(const char *format, ...) static int count = 0; \ static uint64_t last_vaddr = 0UL; \ static int32_t last_length = 0; \ - if ((last_vaddr != OPX_TID_VADDR(tid_reuse_cache)) || \ - (last_length != OPX_TID_LENGTH(tid_reuse_cache))) { \ + if ((last_vaddr != tid_reuse_cache->tid_vaddr) || \ + (last_length != tid_reuse_cache->tid_length)) { \ fprintf(stderr, \ "## %s:%u OPX_TID_CACHE_VERBOSE_DEBUG %s TIDs " \ "input vaddr [%#lx - %#lx] length %lu, " \ @@ -262,13 +253,13 @@ static inline void OPX_TID_CACHE_DEBUG_FPRINTF(const char *format, ...) "last count %u\n", \ __func__, __LINE__, \ string, \ - OPX_TID_VADDR(tid_reuse_cache), \ - OPX_TID_VADDR(tid_reuse_cache) + \ - OPX_TID_LENGTH(tid_reuse_cache), \ - OPX_TID_LENGTH(tid_reuse_cache), last_vaddr, \ + tid_reuse_cache->tid_vaddr, \ + tid_reuse_cache->tid_vaddr + \ + tid_reuse_cache->tid_length, \ + tid_reuse_cache->tid_length, last_vaddr, \ last_vaddr + last_length, last_length, count); \ - last_vaddr = OPX_TID_VADDR(tid_reuse_cache); \ - last_length = OPX_TID_LENGTH(tid_reuse_cache); \ + last_vaddr = tid_reuse_cache->tid_vaddr; \ + last_length = tid_reuse_cache->tid_length; \ count = 0; \ } \ ++count; \ @@ -279,10 +270,10 @@ static inline void OPX_TID_CACHE_DEBUG_FPRINTF(const char *format, ...) "OPX_TID_CACHE_VERBOSE_DEBUG %s TIDs " \ "tid vaddr [%#lx - %#lx] length %lu\n", \ string, \ - OPX_TID_VADDR(tid_reuse_cache), \ - OPX_TID_VADDR(tid_reuse_cache) + \ - OPX_TID_LENGTH(tid_reuse_cache), \ - OPX_TID_LENGTH(tid_reuse_cache)) + tid_reuse_cache->tid_vaddr, \ + tid_reuse_cache->tid_vaddr + \ + tid_reuse_cache->tid_length, \ + tid_reuse_cache->tid_length) #endif #endif /* _FI_PROV_OPX_TID_H_ */ diff --git a/prov/opx/include/fi_opx_tid_cache.h b/prov/opx/include/fi_opx_tid_cache.h index 92e8603a647..3c365f620f5 100644 --- a/prov/opx/include/fi_opx_tid_cache.h +++ b/prov/opx/include/fi_opx_tid_cache.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2022-2023 Cornelis Networks. + * Copyright (C) 2022-2024 Cornelis Networks. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -56,8 +56,26 @@ void opx_tid_cache_delete_abort(); #define OPX_ENTRY_NOT_FOUND 2 #define OPX_ENTRY_IN_USE 3 -/* Flush cache entries internal entry point */ -bool opx_tid_cache_flush(struct ofi_mr_cache *cache, bool flush_lru); +/* Flush cache entries */ +void opx_tid_cache_flush_all(struct ofi_mr_cache *cache,const bool flush_lru,const bool flush_all); + +__OPX_FORCE_INLINE__ +void opx_tid_cache_flush(struct ofi_mr_cache *cache, const bool flush_lru) +{ + /* Nothing to do, early exit */ + if (dlist_empty(&cache->dead_region_list) && + (!flush_lru || + dlist_empty(&cache->lru_list))) return; + + pthread_mutex_unlock(&mm_lock); + + /* Flush dead list or lru (one-time) */ + opx_tid_cache_flush_all(cache, flush_lru, false);/* one time */ + + pthread_mutex_lock(&mm_lock); + return; + +} /* Purge all entries for the specified endpoint */ void opx_tid_cache_purge_ep(struct ofi_mr_cache *cache, struct fi_opx_ep *opx_ep); @@ -68,7 +86,7 @@ void opx_tid_cache_cleanup(struct ofi_mr_cache *cache); /* De-register (lazy, unless force is true) a memory region on TID rendezvous completion */ void opx_deregister_for_rzv(struct fi_opx_ep *opx_ep, const uint64_t tid_vaddr, const int64_t tid_length, - bool force); + bool invalidate); /* forward declaration of parameter structure */ struct fi_opx_hfi1_rx_rzv_rts_params; @@ -78,6 +96,8 @@ struct fi_opx_hfi1_rx_rzv_rts_params; * returns non-zero on failure (fallback to Eager rendezvous) */ int opx_register_for_rzv(struct fi_opx_hfi1_rx_rzv_rts_params *params, - const uint64_t tid_vaddr, const uint64_t tid_length); + const uint64_t tid_vaddr, const uint64_t tid_length, + const enum fi_hmem_iface tid_iface, + const uint64_t tid_device); #endif /* _FI_PROV_OPX_TID_CACHE_H_ */ diff --git a/prov/opx/include/opa_service.h b/prov/opx/include/opa_service.h index 0030804be82..4ae6044035e 100644 --- a/prov/opx/include/opa_service.h +++ b/prov/opx/include/opa_service.h @@ -6,7 +6,7 @@ GPL LICENSE SUMMARY Copyright(c) 2015 Intel Corporation. - Copyright(c) 2021 Cornelis Networks. + Copyright(c) 2021-2024 Cornelis Networks. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as @@ -23,7 +23,7 @@ BSD LICENSE Copyright(c) 2015 Intel Corporation. - Copyright(c) 2021 Cornelis Networks. + Copyright(c) 2021-2024 Cornelis Networks. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions @@ -93,6 +93,7 @@ /* base name of path (without unit #) for opa driver */ #define OPX_DEVICE_PATH "/dev/hfi1" #define OPX_CLASS_PATH "/sys/class/infiniband/hfi1" +#define OPX_CLASS_DIR_PATH "/sys/class/infiniband" /* Commands used to communicate with driver. */ enum OPX_HFI_CMD { @@ -110,8 +111,8 @@ enum OPX_HFI_CMD { OPX_HFI_CMD_TID_INVAL_READ, /* read TID cache invalidations */ OPX_HFI_CMD_GET_VERS, /* get the version of the user cdev */ -#ifdef PSM_CUDA - OPX_HFI_CMD_TID_UPDATE_V2 = 28, +#ifdef OPX_HMEM + OPX_HFI_CMD_TID_UPDATE_V3, #endif OPX_HFI_CMD_LAST, }; @@ -187,8 +188,9 @@ int opx_hfi_get_num_units(); returns -1 when an error occurred. */ int opx_hfi_get_unit_active(int unit); -/* get the number of contexts from the unit id. */ -int opx_hfi_get_num_contexts(int unit); +/* Get the number of free contexts from the unit id. */ +/* Returns 0 if no unit or no match. */ +int opx_hfi_get_num_free_contexts(int unit); /* Open hfi device file, return -1 on error. 
*/ int opx_hfi_context_open(int unit, int port, uint64_t open_timeout); diff --git a/prov/opx/include/opa_user_gen1.h b/prov/opx/include/opa_user_gen1.h index 5321d996581..06167be9d8e 100644 --- a/prov/opx/include/opa_user_gen1.h +++ b/prov/opx/include/opa_user_gen1.h @@ -6,7 +6,7 @@ GPL LICENSE SUMMARY Copyright(c) 2015 Intel Corporation. - Copyright(c) 2021-2023 Cornelis Networks. + Copyright(c) 2021-2024 Cornelis Networks. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as @@ -23,7 +23,7 @@ BSD LICENSE Copyright(c) 2015 Intel Corporation. - Copyright(c) 2021-2022 Cornelis Networks. + Copyright(c) 2021-2024 Cornelis Networks. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions @@ -542,13 +542,17 @@ static __inline__ void opx_hfi_hdrset_seq(__le32 *rbuf, uint32_t val) See full description at declaration */ static __inline__ int32_t opx_hfi_update_tid(struct _hfi_ctrl *ctrl, uint64_t vaddr, uint32_t *length, - uint64_t tidlist, uint32_t *tidcnt, uint16_t flags) + uint64_t tidlist, uint32_t *tidcnt, + uint64_t flags) { struct hfi1_cmd cmd; + +#ifdef OPX_HMEM + struct hfi1_tid_info_v3 tidinfo; +#else struct hfi1_tid_info tidinfo; -#ifdef PSM_CUDA - struct hfi1_tid_info_v2 tidinfov2; #endif + int err; tidinfo.vaddr = vaddr; /* base address for this send to map */ @@ -557,25 +561,20 @@ static __inline__ int32_t opx_hfi_update_tid(struct _hfi_ctrl *ctrl, tidinfo.tidlist = tidlist; /* driver copies tids back directly */ tidinfo.tidcnt = 0; /* clear to zero */ - FI_DBG(&fi_opx_provider, FI_LOG_MR,"OPX_DEBUG_ENTRY update [%p - %p], length %u (pages %u)\n", (void*)vaddr,(void*)(vaddr + *length), *length, (*length)/4096); - +#ifdef OPX_HMEM + cmd.type = OPX_HFI_CMD_TID_UPDATE_V3; + tidinfo.flags = flags; + tidinfo.context = 0ull; +#else cmd.type = OPX_HFI_CMD_TID_UPDATE; /* HFI1_IOCTL_TID_UPDATE */ +#endif + FI_DBG(&fi_opx_provider, FI_LOG_MR, + "OPX_DEBUG_ENTRY update [%p - %p], length %u (pages %u)\n", + (void*)vaddr, (void*) (vaddr + *length), *length, (*length) / 4096); + cmd.len = sizeof(tidinfo); cmd.addr = (__u64) &tidinfo; -#ifdef PSM_CUDA - if (PSMI_IS_DRIVER_GPUDIRECT_ENABLED) { - /* Copy values to v2 struct */ - tidinfov2.vaddr = tidinfo.vaddr; - tidinfov2.length = tidinfo.length; - tidinfov2.tidlist = tidinfo.tidlist; - tidinfov2.tidcnt = tidinfo.tidcnt; - tidinfov2.flags = flags; - - cmd.type = OPX_HFI_CMD_TID_UPDATE_V2; - cmd.len = sizeof(tidinfov2); - cmd.addr = (__u64) &tidinfov2; - } -#endif + errno = 0; err = opx_hfi_cmd_write(ctrl->fd, &cmd, sizeof(cmd)); __attribute__((__unused__)) int saved_errno = errno; @@ -584,15 +583,25 @@ static __inline__ int32_t opx_hfi_update_tid(struct _hfi_ctrl *ctrl, struct hfi1_tid_info *rettidinfo = (struct hfi1_tid_info *)cmd.addr; if ((rettidinfo->length != *length) || (rettidinfo->tidcnt == 0) ) { - FI_WARN(&fi_opx_provider, FI_LOG_MR,"PARTIAL UPDATE errno %d \"%s\" INPUTS vaddr [%p - %p] length %u (pages %u), OUTPUTS vaddr [%p - %p] length %u (pages %u), tidcnt %u\n", saved_errno, strerror(saved_errno), (void*)vaddr,(void*)(vaddr + *length), *length, (*length)/4096, (void*)vaddr,(void*)(vaddr + rettidinfo->length), rettidinfo->length, rettidinfo->length/4096, rettidinfo->tidcnt); + FI_WARN(&fi_opx_provider, FI_LOG_MR, + "PARTIAL UPDATE errno %d \"%s\" INPUTS vaddr [%p - %p] length %u (pages %u), OUTPUTS vaddr [%p - %p] length %u (pages %u), tidcnt %u\n", + saved_errno, 
strerror(saved_errno), (void*)vaddr, + (void*)(vaddr + *length), *length, (*length)/4096, + (void*)rettidinfo->vaddr,(void*)(rettidinfo->vaddr + rettidinfo->length), + rettidinfo->length, rettidinfo->length/4096, + rettidinfo->tidcnt); } /* Always update outputs, even on soft errors */ *length = rettidinfo->length; *tidcnt = rettidinfo->tidcnt; - FI_DBG(&fi_opx_provider, FI_LOG_MR,"OPX_DEBUG_EXIT OUTPUTS errno %d \"%s\" vaddr [%p - %p] length %u (pages %u), tidcnt %u\n", saved_errno, strerror(saved_errno), (void*)vaddr,(void*)(vaddr + *length), *length, (*length)/4096, *tidcnt); - + FI_DBG(&fi_opx_provider, FI_LOG_MR, + "OPX_DEBUG_EXIT OUTPUTS errno %d \"%s\" vaddr [%p - %p] length %u (pages %u), tidcnt %u\n", + saved_errno, strerror(saved_errno), (void*)vaddr, + (void*)(vaddr + *length), *length, (*length)/4096, *tidcnt); } else { - FI_WARN(&fi_opx_provider, FI_LOG_MR,"FAILED ERR %d errno %d \"%s\"\n", err, saved_errno, strerror(saved_errno)); + FI_WARN(&fi_opx_provider, FI_LOG_MR, + "FAILED ERR %d errno %d \"%s\"\n", + err, saved_errno, strerror(saved_errno)); /* Hard error, we can't trust these */ *length = 0; *tidcnt = 0; diff --git a/prov/opx/include/opx_shm.h b/prov/opx/include/opx_shm.h index e74eb065891..09564830331 100644 --- a/prov/opx/include/opx_shm.h +++ b/prov/opx/include/opx_shm.h @@ -1,6 +1,6 @@ /* * Copyright (c) 2016-2018 Intel Corporation. All rights reserved. - * Copyright (c) 2021-2023 Cornelis Networks. + * Copyright (c) 2021-2024 Cornelis Networks. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -59,15 +59,17 @@ #include #include -#ifdef OPX_DAOS_SUPPORT +#ifdef OPX_DAOS #define OPX_SHM_MAX_CONN_NUM 0xffff #else /* FI_OPX_MAX_HFIS * 256 */ -#define OPX_SHM_MAX_CONN_NUM 0x1000 +#define OPX_SHM_MAX_CONN_NUM (0x1000) +#define OPX_SHM_MAX_CONN_MASK (OPX_SHM_MAX_CONN_NUM - 1) #endif +static_assert((OPX_SHM_MAX_CONN_NUM & OPX_SHM_MAX_CONN_MASK) == 0, + "OPX_SHM_MAX_CONN_NUM must be a power of 2!"); #define OPX_SHM_SEGMENT_NAME_MAX_LENGTH (512) -#define OPX_SHM_TX_CONNECT_MAX_WAIT (5000) // 5 seconds #define OPX_SHM_SEGMENT_NAME_PREFIX "/opx.shm." 
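Note on the new OPX_SHM_MAX_CONN_MASK: it is only a valid substitute for a modulo if OPX_SHM_MAX_CONN_NUM is a power of two, which is exactly what the added static_assert enforces (for a power of two N, N & (N - 1) == 0). Later in this patch, fi_opx_shm_dynamic_tx_connect reduces rx_id with `rx_id & OPX_SHM_MAX_CONN_MASK` instead of a range check. A minimal standalone sketch of the same pattern, with illustrative names rather than the provider's:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define MAX_CONN_NUM  (0x1000)              /* must be a power of two */
#define MAX_CONN_MASK (MAX_CONN_NUM - 1)

/* Compile-time guard: a power of two shares no set bits with (N - 1). */
static_assert((MAX_CONN_NUM & MAX_CONN_MASK) == 0,
              "MAX_CONN_NUM must be a power of 2");

/* Clamp an arbitrary rx id into the connection table; equivalent to
 * 'rx_id % MAX_CONN_NUM' but without a divide. */
static inline uint32_t conn_index(uint32_t rx_id)
{
    return rx_id & MAX_CONN_MASK;
}

int main(void)
{
    printf("%#x -> %#x\n", 0x1234u, conn_index(0x1234u)); /* prints 0x234 */
    return 0;
}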
#define OPX_SHM_FILE_NAME_PREFIX_FORMAT "%s-%02hhX.%d" @@ -80,14 +82,14 @@ struct opx_shm_connection { void *segment_ptr; size_t segment_size; bool inuse; - char segment_key[OPX_SHM_SEGMENT_NAME_MAX_LENGTH]; + char segment_key[OPX_SHM_SEGMENT_NAME_MAX_LENGTH]; }; struct opx_shm_tx { + struct opx_shm_tx *next; // for signal handler + struct fi_provider *prov; struct opx_shm_fifo_segment *fifo_segment[OPX_SHM_MAX_CONN_NUM]; struct opx_shm_connection connection[OPX_SHM_MAX_CONN_NUM]; - struct fi_provider *prov; - struct opx_shm_tx *next; // for signal handler uint32_t rank; uint32_t rank_inst; }; @@ -98,12 +100,12 @@ struct opx_shm_resynch { }; struct opx_shm_rx { + struct opx_shm_rx *next; // for signal handler + struct fi_provider *prov; struct opx_shm_fifo_segment *fifo_segment; void *segment_ptr; size_t segment_size; char segment_key[OPX_SHM_SEGMENT_NAME_MAX_LENGTH]; - struct fi_provider *prov; - struct opx_shm_rx *next; // for signal handler struct opx_shm_resynch resynch_connection[OPX_SHM_MAX_CONN_NUM]; }; @@ -112,42 +114,42 @@ extern struct opx_shm_rx *shm_rx_head; struct opx_shm_packet { - ofi_atomic64_t sequence_; - uint32_t origin_rank; - uint32_t origin_rank_inst; + ofi_atomic64_t sequence_; + uint32_t origin_rank; + uint32_t origin_rank_inst; - // TODO: Figure out why using pad_next_cacheline causes a segfault due to alignment w/ movaps instruction - // but the other one below does not, even though in both cases the struct size is the - // same, and data starts at a 16-byte aligned offset into the struct. + // TODO: Figure out why using pad_next_cacheline causes a segfault due to alignment w/ movaps instruction + // but the other one below does not, even though in both cases the struct size is the + // same, and data starts at a 16-byte aligned offset into the struct. 
- // sizeof(opx_shm_packet) == 8320, data starts at offset 0x40 (64) - // uint8_t pad_next_cacheline[FI_OPX_CACHE_LINE_SIZE - sizeof(ofi_atomic64_t) - sizeof(uint32_t) - sizeof(uint32_t)]; + // sizeof(opx_shm_packet) == 8320, data starts at offset 0x40 (64) + // uint8_t pad_next_cacheline[FI_OPX_CACHE_LINE_SIZE - sizeof(ofi_atomic64_t) - sizeof(uint32_t) - sizeof(uint32_t)]; - // sizeof(opx_shm_packet) == 8320, data starts at offset 0x20 (32) - uint64_t pad; + // sizeof(opx_shm_packet) == 8320, data starts at offset 0x20 (32) + uint64_t pad; - uint8_t data[FI_OPX_SHM_PACKET_SIZE]; + uint8_t data[FI_OPX_SHM_PACKET_SIZE]; }__attribute__((__aligned__(64))); struct opx_shm_fifo { - ofi_atomic64_t enqueue_pos_; - uint8_t pad0_[FI_OPX_CACHE_LINE_SIZE - sizeof(ofi_atomic64_t)]; - ofi_atomic64_t dequeue_pos_; - uint8_t pad1_[FI_OPX_CACHE_LINE_SIZE - sizeof(ofi_atomic64_t)]; - struct opx_shm_packet buffer_[FI_OPX_SHM_FIFO_SIZE]; + ofi_atomic64_t enqueue_pos_; + uint8_t pad0_[FI_OPX_CACHE_LINE_SIZE - sizeof(ofi_atomic64_t)]; + ofi_atomic64_t dequeue_pos_; + uint8_t pad1_[FI_OPX_CACHE_LINE_SIZE - sizeof(ofi_atomic64_t)]; + struct opx_shm_packet buffer_[FI_OPX_SHM_FIFO_SIZE]; } __attribute__((__aligned__(64))); static_assert((offsetof(struct opx_shm_fifo, enqueue_pos_) & 0x3fUL) == 0, - "struct opx_shm_fifo->enqueue_pos_ needs to be 64-byte aligned!"); + "struct opx_shm_fifo->enqueue_pos_ needs to be 64-byte aligned!"); static_assert((offsetof(struct opx_shm_fifo, dequeue_pos_) & 0x3fUL) == 0, - "struct opx_shm_fifo->dequeue_pos_ needs to be 64-byte aligned!"); + "struct opx_shm_fifo->dequeue_pos_ needs to be 64-byte aligned!"); static_assert(offsetof(struct opx_shm_fifo, buffer_) == (FI_OPX_CACHE_LINE_SIZE * 2), - "struct opx_shm_fifo->buffer_ should be 2 cachelines into struct"); + "struct opx_shm_fifo->buffer_ should be 2 cachelines into struct"); struct opx_shm_fifo_segment { - ofi_atomic64_t initialized_; - uint8_t pad1_[FI_OPX_CACHE_LINE_SIZE - sizeof(ofi_atomic64_t)]; - struct opx_shm_fifo fifo; + ofi_atomic64_t initialized_; + uint8_t pad1_[FI_OPX_CACHE_LINE_SIZE - sizeof(ofi_atomic64_t)]; + struct opx_shm_fifo fifo; } __attribute__((__aligned__(64))); static inline @@ -232,11 +234,11 @@ ssize_t opx_shm_rx_init (struct opx_shm_rx *rx, rx->segment_ptr = segment_ptr; rx->segment_size = segment_size; - // TODO: MHEINZ we probably need a lock here. - rx->next = shm_rx_head; shm_rx_head = rx; // add to signal handler list. + rx->next = shm_rx_head; + shm_rx_head = rx; // add to signal handler list. ofi_atomic_set64(&rx->fifo_segment->initialized_, 1); - + close(segment_fd); /* safe to close now */ FI_LOG(prov, FI_LOG_INFO, FI_LOG_FABRIC, @@ -282,8 +284,8 @@ ssize_t opx_shm_tx_init (struct opx_shm_tx *tx, tx->rank = hfi_rank; tx->rank_inst = hfi_rank_inst; - // TODO: MHEINZ we probably need a lock here. - tx->next = shm_tx_head; shm_tx_head = tx; // add to signal handler list. + tx->next = shm_tx_head; + shm_tx_head = tx; // add to signal handler list. 
return FI_SUCCESS; } @@ -299,83 +301,57 @@ ssize_t opx_shm_tx_connect (struct opx_shm_tx *tx, assert(segment_index < OPX_SHM_MAX_CONN_NUM); int err = 0; - char segment_key[OPX_SHM_SEGMENT_NAME_MAX_LENGTH]; - memset(segment_key, 0, OPX_SHM_SEGMENT_NAME_MAX_LENGTH); + void *segment_ptr = tx->connection[segment_index].segment_ptr; + if (segment_ptr == NULL) { + char segment_key[OPX_SHM_SEGMENT_NAME_MAX_LENGTH]; + snprintf(segment_key, OPX_SHM_SEGMENT_NAME_MAX_LENGTH, + OPX_SHM_SEGMENT_NAME_PREFIX "%s.%d", + unique_job_key, rx_id); - snprintf(segment_key, OPX_SHM_SEGMENT_NAME_MAX_LENGTH, - OPX_SHM_SEGMENT_NAME_PREFIX "%s.%d", - unique_job_key, rx_id); + int segment_fd = shm_open(segment_key, O_RDWR, 0600); + if (segment_fd == -1) { + FI_DBG(tx->prov, FI_LOG_FABRIC, + "Unable to create shm object '%s'; errno = '%s'\n", + segment_key, strerror(errno)); + return -FI_EAGAIN; + } - if (segment_index >= OPX_SHM_MAX_CONN_NUM) { - FI_LOG(tx->prov, FI_LOG_WARN, FI_LOG_FABRIC, - "Unable to create shm object '%s'; segment_index %u (rx %u) too large\n", - segment_key, segment_index, rx_id); - return -FI_E2BIG; - } + size_t segment_size = sizeof(struct opx_shm_fifo_segment) + 64; - int segment_fd; - unsigned loop = 0; - for (;;) { - segment_fd = shm_open(segment_key, O_RDWR, 0600); - if (segment_fd == -1) { - if (loop++ > OPX_SHM_TX_CONNECT_MAX_WAIT) { - FI_LOG(tx->prov, FI_LOG_WARN, FI_LOG_FABRIC, - "Unable to create shm object '%s'; errno = '%s'\n", - segment_key, strerror(errno)); - return -FI_EAGAIN; - } - usleep(1000); - } else { - break; + segment_ptr = mmap(NULL, segment_size, PROT_READ | PROT_WRITE, + MAP_SHARED, segment_fd, 0); + if (segment_ptr == MAP_FAILED) { + FI_LOG(tx->prov, FI_LOG_WARN, FI_LOG_FABRIC, + "mmap failed: '%s'\n", strerror(errno)); + err = errno; + goto error_return; } - } - size_t segment_size = sizeof(struct opx_shm_fifo_segment) + 64; + close(segment_fd); /* safe to close now */ - void *segment_ptr = mmap(NULL, segment_size, PROT_READ | PROT_WRITE, - MAP_SHARED, segment_fd, 0); - if (segment_ptr == MAP_FAILED) { - FI_LOG(tx->prov, FI_LOG_WARN, FI_LOG_FABRIC, - "mmap failed: '%s'\n", strerror(errno)); - err = errno; - goto error_return; + tx->connection[segment_index].segment_ptr = segment_ptr; + tx->connection[segment_index].segment_size = segment_size; + tx->connection[segment_index].inuse = false; + strcpy(tx->connection[segment_index].segment_key, segment_key); } - close(segment_fd); /* safe to close now */ - - /* - * Wait for completion of the initialization of the SHM segment before using - * it. 
- */ - loop = 0; struct opx_shm_fifo_segment *fifo_segment = (struct opx_shm_fifo_segment *)(((uintptr_t)segment_ptr + 64) & (~0x03Full)); - for (;;) { - uint64_t init = - atomic_load_explicit(&fifo_segment->initialized_.val, memory_order_acquire); - - if (init == 0) { - if (loop++ > OPX_SHM_TX_CONNECT_MAX_WAIT) { - FI_LOG(tx->prov, FI_LOG_WARN, FI_LOG_FABRIC, - "SHM object '%s' still initializing.\n", - segment_key); - return -FI_EAGAIN; - } - usleep(1000); - } else { - break; - } + uint64_t init = atomic_load_explicit(&fifo_segment->initialized_.val, + memory_order_acquire); + if (init == 0) { + FI_DBG(tx->prov, FI_LOG_FABRIC, + "SHM object '%s' still initializing.\n", + tx->connection[segment_index].segment_key); + return -FI_EAGAIN; } - tx->connection[segment_index].segment_ptr = segment_ptr; - tx->connection[segment_index].segment_size = segment_size; - tx->connection[segment_index].inuse = false; tx->fifo_segment[segment_index] = fifo_segment; - strcpy(tx->connection[segment_index].segment_key, segment_key); FI_LOG(tx->prov, FI_LOG_INFO, FI_LOG_FABRIC, - "SHM connection to %u context passed. Segment (%s), %d, segment (%p) size %zu segment_index %u\n", - rx_id, segment_key, segment_fd, segment_ptr, segment_size, segment_index); + "SHM connection to %u context passed. Segment (%s), segment (%p) size %zu segment_index %u\n", + rx_id, tx->connection[segment_index].segment_key, segment_ptr, + tx->connection[segment_index].segment_size, segment_index); return FI_SUCCESS; @@ -436,19 +412,25 @@ static inline void * opx_shm_tx_next (struct opx_shm_tx *tx, uint8_t peer_hfi_unit, uint8_t peer_rx_index, uint64_t *pos, bool use_rank, unsigned rank, unsigned rank_inst, ssize_t *rc) { +#ifdef OPX_DAOS /* HFI Rank Support: Used HFI rank index instead of HFI index. */ unsigned segment_index = (!use_rank) ? 
OPX_SHM_SEGMENT_INDEX(peer_hfi_unit, peer_rx_index) : opx_shm_daos_rank_index(rank, rank_inst); - +#else + unsigned segment_index = OPX_SHM_SEGMENT_INDEX(peer_hfi_unit, peer_rx_index); +#endif assert(segment_index < OPX_SHM_MAX_CONN_NUM); + +#ifndef NDEBUG if (segment_index >= OPX_SHM_MAX_CONN_NUM) { *rc = -FI_EIO; FI_LOG(tx->prov, FI_LOG_WARN, FI_LOG_FABRIC, "SHM %u context exceeds maximum contexts supported.\n", segment_index); return NULL; } +#endif - if (tx->fifo_segment[segment_index] == NULL) { + if (OFI_UNLIKELY(tx->fifo_segment[segment_index] == NULL)) { *rc = -FI_EIO; FI_LOG(tx->prov, FI_LOG_WARN, FI_LOG_FABRIC, "SHM %u context FIFO not initialized.\n", segment_index); diff --git a/prov/opx/include/rdma/opx/fi_opx_compiler.h b/prov/opx/include/rdma/opx/fi_opx_compiler.h index 73c105f6d7f..7fadf792768 100644 --- a/prov/opx/include/rdma/opx/fi_opx_compiler.h +++ b/prov/opx/include/rdma/opx/fi_opx_compiler.h @@ -59,14 +59,33 @@ static inline void fi_opx_compiler_msync_writes() { - asm volatile ("sfence" : : : "memory"); +#if defined(__riscv) + +#if defined(__riscv_xlen) && (__riscv_xlen == 64) + asm volatile ("fence ow,ow" : : : "memory"); +#else +#error "Unsupported CPU type" +#endif + +#else + asm volatile ("sfence" : : : "memory"); +#endif } static inline void fi_opx_compiler_msync_reads() { - asm volatile ("lfence" : : : "memory"); -} +#if defined(__riscv) +#if defined(__riscv_xlen) && (__riscv_xlen == 64) + asm volatile ("fence ir,ir" : : : "memory"); +#else +#error "Unsupported CPU type" +#endif + +#else + asm volatile ("lfence" : : : "memory"); +#endif +} #define fi_opx_compiler_barrier() __asm__ __volatile__ ( "" ::: "memory" ) @@ -87,9 +106,19 @@ void fi_opx_compiler_store_u64(volatile uint64_t * const variable, const uint64_ static inline void fi_opx_compiler_inc_u64(volatile uint64_t * const variable) { +#if defined(__riscv) + +#if defined(__riscv_xlen) && (__riscv_xlen == 64) + (*variable) += 1; +#else +#error "Unsupported CPU type" +#endif + +#else __asm__ __volatile__ ("lock ; incq %0" : "=m" (*variable) : "m" (*variable)); +#endif return; } @@ -97,18 +126,41 @@ static inline uint64_t fi_opx_compiler_fetch_and_inc_u64(volatile uint64_t * const variable) { uint64_t value = 1; +#if defined(__riscv) + +#if defined(__riscv_xlen) && (__riscv_xlen == 64) + const uint64_t tmp = (*variable); + (*variable) = value; + value = tmp; + (*variable) = (*variable) + value; +#else +#error "Unsupported CPU type" +#endif + +#else __asm__ __volatile__ ("lock ; xadd %0,%1" : "=r" (value), "=m" (*variable) : "0" (value), "m" (*variable)); +#endif return value; } static inline void fi_opx_compiler_dec_u64(volatile uint64_t * const variable) { +#if defined(__riscv) + +#if defined(__riscv_xlen) && (__riscv_xlen == 64) + (*variable) -= 1; +#else +#error "Unsupported CPU type" +#endif + +#else __asm__ __volatile__ ("lock ; decq %0" : "=m" (*variable) : "m" (*variable)); +#endif return; } diff --git a/prov/opx/include/rdma/opx/fi_opx_debug_counters.h b/prov/opx/include/rdma/opx/fi_opx_debug_counters.h index 401f69ddb9e..78fd7678d67 100644 --- a/prov/opx/include/rdma/opx/fi_opx_debug_counters.h +++ b/prov/opx/include/rdma/opx/fi_opx_debug_counters.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2021-2023 Cornelis Networks. + * Copyright (C) 2021-2024 Cornelis Networks. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -169,12 +169,22 @@ struct fi_opx_debug_counters { } sdma; struct { - uint64_t total_requests; uint64_t tid_updates; uint64_t tid_resource_limit; + uint64_t tid_resource_limit_length_chunk_short; + uint64_t tid_resource_limit_length_chunk_long; + uint64_t tid_resource_limit_tidcnt_chunk_zero; uint64_t tid_invalidate_needed; - uint64_t tid_replays; - uint64_t rts_fallback_eager; + uint64_t tid_rcv_pkts; + uint64_t tid_rcv_pkts_replays; + uint64_t rts_tid_ineligible; + uint64_t rts_tid_eligible; + uint64_t rts_fallback_eager_immediate; + uint64_t rts_fallback_eager_misaligned_thrsh; + uint64_t rts_fallback_eager_reg_rzv; + uint64_t rts_tid_setup_retries; + uint64_t rts_tid_setup_retry_success; + uint64_t rts_tid_setup_success; uint64_t tid_buckets[4]; uint64_t first_tidpair_minlen; uint64_t first_tidpair_maxlen; @@ -245,6 +255,9 @@ struct fi_opx_debug_counters { uint64_t rma_atomic_fetch_intranode; uint64_t rma_atomic_cmp_fetch_hfi; uint64_t rma_atomic_cmp_fetch_intranode; + + uint64_t tid_update; + uint64_t tid_recv; } hmem; }; @@ -331,12 +344,31 @@ void fi_opx_debug_counters_print(struct fi_opx_debug_counters *counters) #endif #ifdef OPX_DEBUG_COUNTERS_EXPECTED_RECEIVE - FI_OPX_DEBUG_COUNTERS_PRINT_COUNTER(pid, expected_receive.total_requests); FI_OPX_DEBUG_COUNTERS_PRINT_COUNTER(pid, expected_receive.tid_updates); FI_OPX_DEBUG_COUNTERS_PRINT_COUNTER(pid, expected_receive.tid_resource_limit); + FI_OPX_DEBUG_COUNTERS_PRINT_COUNTER(pid, expected_receive.tid_resource_limit_length_chunk_short); + FI_OPX_DEBUG_COUNTERS_PRINT_COUNTER(pid, expected_receive.tid_resource_limit_tidcnt_chunk_zero); FI_OPX_DEBUG_COUNTERS_PRINT_COUNTER(pid, expected_receive.tid_invalidate_needed); - FI_OPX_DEBUG_COUNTERS_PRINT_COUNTER(pid, expected_receive.tid_replays); - FI_OPX_DEBUG_COUNTERS_PRINT_COUNTER(pid, expected_receive.rts_fallback_eager); + FI_OPX_DEBUG_COUNTERS_PRINT_COUNTER(pid, expected_receive.tid_rcv_pkts); + FI_OPX_DEBUG_COUNTERS_PRINT_COUNTER(pid, expected_receive.tid_rcv_pkts_replays); + FI_OPX_DEBUG_COUNTERS_PRINT_COUNTER(pid, expected_receive.rts_tid_ineligible); + FI_OPX_DEBUG_COUNTERS_PRINT_COUNTER(pid, expected_receive.rts_tid_eligible); + FI_OPX_DEBUG_COUNTERS_PRINT_COUNTER(pid, expected_receive.rts_fallback_eager_immediate); + FI_OPX_DEBUG_COUNTERS_PRINT_COUNTER(pid, expected_receive.rts_fallback_eager_misaligned_thrsh); + FI_OPX_DEBUG_COUNTERS_PRINT_COUNTER(pid, expected_receive.rts_fallback_eager_reg_rzv); + FI_OPX_DEBUG_COUNTERS_PRINT_COUNTER(pid, expected_receive.rts_tid_setup_retries); + FI_OPX_DEBUG_COUNTERS_PRINT_COUNTER(pid, expected_receive.rts_tid_setup_retry_success); + FI_OPX_DEBUG_COUNTERS_PRINT_COUNTER(pid, expected_receive.rts_tid_setup_success); + uint64_t rts_sum = counters->expected_receive.rts_fallback_eager_immediate + + counters->expected_receive.rts_fallback_eager_misaligned_thrsh + + counters->expected_receive.rts_fallback_eager_reg_rzv + + counters->expected_receive.rts_tid_setup_success; + if (rts_sum != counters->expected_receive.rts_tid_eligible) { + fprintf(stderr, + "(%d) ### WARN: rts_tid_eligible (%lu) != SUM(rts_tid_setup_success + rts_fallback*) (%lu)! 
Accounting error?\n\n", + pid, + counters->expected_receive.rts_tid_eligible, rts_sum); + } FI_OPX_DEBUG_COUNTERS_PRINT_COUNTER_ARR(pid, expected_receive.tid_buckets, 4); FI_OPX_DEBUG_COUNTERS_PRINT_COUNTER(pid, expected_receive.first_tidpair_minlen); FI_OPX_DEBUG_COUNTERS_PRINT_COUNTER(pid, expected_receive.first_tidpair_maxlen); @@ -433,6 +465,9 @@ void fi_opx_debug_counters_print(struct fi_opx_debug_counters *counters) FI_OPX_DEBUG_COUNTERS_PRINT_COUNTER(pid, hmem.rma_atomic_fetch_hfi); FI_OPX_DEBUG_COUNTERS_PRINT_COUNTER(pid, hmem.rma_atomic_cmp_fetch_intranode); FI_OPX_DEBUG_COUNTERS_PRINT_COUNTER(pid, hmem.rma_atomic_cmp_fetch_hfi); + + FI_OPX_DEBUG_COUNTERS_PRINT_COUNTER(pid, hmem.tid_update); + FI_OPX_DEBUG_COUNTERS_PRINT_COUNTER(pid, hmem.tid_recv); #endif } diff --git a/prov/opx/include/rdma/opx/fi_opx_domain.h b/prov/opx/include/rdma/opx/fi_opx_domain.h index 03cf1a9c3de..f5388f2f286 100644 --- a/prov/opx/include/rdma/opx/fi_opx_domain.h +++ b/prov/opx/include/rdma/opx/fi_opx_domain.h @@ -1,6 +1,6 @@ /* * Copyright (C) 2016 by Argonne National Laboratory. - * Copyright (C) 2021-2022 Cornelis Networks. + * Copyright (C) 2021-2024 Cornelis Networks. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -93,10 +93,9 @@ struct fi_opx_node { #define OPX_DEFAULT_JOB_KEY_STR "00112233445566778899aabbccddeeff" #define OPX_DEFAULT_PROG_AFFINITY_STR "0:3:1" - -#define OPX_MIN_DCOMP_THRESHOLD FI_OPX_SDMA_MIN_LENGTH -#define OPX_DEFAULT_DCOMP_THRESHOLD FI_OPX_SDMA_DC_MIN -#define OPX_MAX_DCOMP_THRESHOLD (INT_MAX - 1) +#define OPX_SDMA_BOUNCE_BUF_MIN FI_OPX_SDMA_MIN_LENGTH +#define OPX_SDMA_BOUNCE_BUF_THRESHOLD FI_OPX_SDMA_DC_MIN +#define OPX_SDMA_BOUNCE_BUF_MAX (INT_MAX - 1) struct fi_opx_domain { struct fid_domain domain_fid; diff --git a/prov/opx/include/rdma/opx/fi_opx_endpoint.h b/prov/opx/include/rdma/opx/fi_opx_endpoint.h index d651fd015a7..8bd3317e9e4 100644 --- a/prov/opx/include/rdma/opx/fi_opx_endpoint.h +++ b/prov/opx/include/rdma/opx/fi_opx_endpoint.h @@ -1,6 +1,6 @@ /* * Copyright (C) 2016 by Argonne National Laboratory. - * Copyright (C) 2021-2023 Cornelis Networks. + * Copyright (C) 2021-2024 Cornelis Networks. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -240,7 +240,7 @@ struct fi_opx_ep_tx { volatile union fi_opx_hfi1_pio_state *pio_state; /* 1 qw = 8 bytes */ volatile uint64_t * pio_scb_sop_first; - uint32_t dcomp_threshold; /* const; messages over this size will always force delivery completition */ + uint32_t sdma_bounce_buf_threshold; uint16_t pio_max_eager_tx_bytes; uint16_t pio_flow_eager_tx_bytes; @@ -523,10 +523,7 @@ struct fi_opx_ep { bool is_rx_cq_bound; bool use_expected_tid_rzv; uint8_t unused_cacheline5[3]; - - uint32_t unused_cacheline5_u32[1]; - uint32_t mcache_flush_counter; - uint32_t unused_cacheline5b; + uint32_t unused_cacheline5_u32[3]; ofi_spin_t lock; /* lock size varies based on ENABLE_DEBUG*/ @@ -690,7 +687,7 @@ static void fi_opx_dump_daos_av_addr_rank(struct fi_opx_ep *opx_ep, if (cur_av_rank) { union fi_opx_addr addr; addr.fi = cur_av_rank->fi_addr; - + if ((addr.uid.lid == find_addr.uid.lid) && (cur_av_rank->key.rank == opx_ep->daos_info.rank)) { found = 1; FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "Dump av_rank_hashmap[%d] = rank:%d LID:0x%x fi_addr:0x%08lx - Found.\n", @@ -751,7 +748,7 @@ static struct fi_opx_daos_av_rank * fi_opx_get_daos_av_rank(struct fi_opx_ep *op if (cur_av_rank) { union fi_opx_addr addr; addr.fi = cur_av_rank->fi_addr; - + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "GET Dump av_rank_hashmap[%d] = rank:%d LID:0x%x fi_addr:0x%08lx\n", i++, cur_av_rank->key.rank, addr.uid.lid, addr.fi); @@ -1060,7 +1057,7 @@ void complete_receive_operation_internal (struct fid_ep *ep, break; } } - + FI_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "INJECT send_len %lu <= recv_len %lu; enqueue cq (completed)\n", send_len, recv_len); @@ -1538,6 +1535,7 @@ void complete_receive_operation_internal (struct fid_ep *ep, }; const uint64_t immediate_byte_count = immediate_info.byte_count; const uint64_t immediate_qw_count = immediate_info.qw_count; + const uint64_t immediate_fragment = ((immediate_byte_count + immediate_qw_count + 63) >> 6); const uint64_t immediate_block_count = immediate_info.block_count; const uint64_t immediate_total = immediate_byte_count + immediate_qw_count * sizeof(uint64_t) + @@ -1595,7 +1593,7 @@ void complete_receive_operation_internal (struct fid_ep *ep, } if (immediate_block_count) { - const union cacheline * const immediate_block = p->rendezvous.contiguous.immediate_block; + const union cacheline * const immediate_block = &p->rendezvous.contiguous.cache_line_1 + immediate_fragment; union cacheline * rbuf_block = (union cacheline *)rbuf; for (i=0; irendezvous.contiguous.immediate_block; + const union cacheline * const immediate_block = &p->rendezvous.contiguous.cache_line_1 + immediate_fragment; uint8_t *rbuf_start = (uint8_t *)recv_buf; rbuf_start += xfer_len - (immediate_end_block_count << 6); memcpy(rbuf_start, immediate_block[immediate_block_count].qw, @@ -1666,6 +1664,7 @@ void complete_receive_operation_internal (struct fid_ep *ep, }; const uint64_t immediate_byte_count = immediate_info.byte_count; const uint64_t immediate_qw_count = immediate_info.qw_count; + const uint64_t immediate_fragment = ((immediate_byte_count + immediate_qw_count + 63) >> 6); const uint64_t immediate_block_count = immediate_info.block_count; const uint64_t immediate_total = immediate_byte_count + immediate_qw_count * sizeof(uint64_t) + @@ -1724,7 +1723,7 @@ void complete_receive_operation_internal (struct fid_ep *ep, } if (immediate_block_count) { - const union cacheline * const immediate_block = 
p->rendezvous.contiguous.immediate_block; + const union cacheline * const immediate_block = &p->rendezvous.contiguous.cache_line_1 + immediate_fragment; union cacheline * rbuf_block = (union cacheline *)rbuf; for (i=0; irendezvous.contiguous.immediate_block; + const union cacheline * const immediate_block = &p->rendezvous.contiguous.cache_line_1 + immediate_fragment; uint8_t *rbuf_start = (uint8_t *)recv_buf; rbuf_start += xfer_len - (immediate_end_block_count << 6); if (!is_hmem) { @@ -1816,8 +1815,8 @@ void complete_receive_operation_internal (struct fid_ep *ep, if (lock_required) { fprintf(stderr, "%s:%s():%d\n", __FILE__, __func__, __LINE__); abort(); } fi_opx_context_slist_insert_tail(context, rx->cq_pending_ptr); - /* Post a E_TRUNC to our local RX error queue because a client called receive - with too small a buffer. Tell them about it via the error cq */ + /* Post a E_TRUNC to our local RX error queue because a client called receive + with too small a buffer. Tell them about it via the error cq */ struct fi_opx_context_ext * ext = NULL; if (is_context_ext) { @@ -1897,27 +1896,27 @@ ssize_t fi_opx_shm_dynamic_tx_connect(const unsigned is_intranode, const unsigned rx_id, const uint8_t hfi1_unit) { - assert(hfi1_unit < FI_OPX_MAX_HFIS); - if (!is_intranode) { return FI_SUCCESS; } + assert(hfi1_unit < FI_OPX_MAX_HFIS); + assert(rx_id < OPX_SHM_MAX_CONN_NUM); + +#ifdef OPX_DAOS uint32_t segment_index; if (!opx_ep->daos_info.hfi_rank_enabled) { assert(rx_id < 256); segment_index = OPX_SHM_SEGMENT_INDEX(hfi1_unit, rx_id); } else { - segment_index = rx_id; + segment_index = rx_id & OPX_SHM_MAX_CONN_MASK; } +#else + uint32_t segment_index = rx_id & OPX_SHM_MAX_CONN_MASK; +#endif - if (OFI_UNLIKELY(segment_index >= OPX_SHM_MAX_CONN_NUM)) { - FI_LOG(opx_ep->tx->shm.prov, FI_LOG_WARN, FI_LOG_FABRIC, - "Unable to connect shm object hfi_unit=%hhu, rx_id=%u, segment_index=%u (too large)\n", - hfi1_unit, rx_id, segment_index); - return -FI_E2BIG; - } else if (OFI_LIKELY(opx_ep->tx->shm.fifo_segment[segment_index] != NULL)) { + if (OFI_LIKELY(opx_ep->tx->shm.fifo_segment[segment_index] != NULL)) { /* Connection already established */ return FI_SUCCESS; } @@ -1926,9 +1925,11 @@ ssize_t fi_opx_shm_dynamic_tx_connect(const unsigned is_intranode, char buffer[OPX_JOB_KEY_STR_SIZE + 32]; int inst = 0; +#ifdef OPX_DAOS if (opx_ep->daos_info.hfi_rank_enabled) { inst = opx_ep->daos_info.rank_inst; } +#endif snprintf(buffer, sizeof(buffer), OPX_SHM_FILE_NAME_PREFIX_FORMAT, opx_ep->domain->unique_job_key_str, hfi1_unit, inst); @@ -2137,6 +2138,7 @@ void fi_opx_ep_rx_process_header_rzv_data(struct fi_opx_ep * opx_ep, break; case FI_OPX_HFI_DPUT_OPCODE_RZV_TID: { + FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.expected_receive.tid_rcv_pkts); struct fi_opx_rzv_completion * rzv_comp = (struct fi_opx_rzv_completion *)(hdr->dput.target.rzv.completion_vaddr); union fi_opx_context *target_context = rzv_comp->context; assert(target_context); @@ -2183,7 +2185,7 @@ void fi_opx_ep_rx_process_header_rzv_data(struct fi_opx_ep * opx_ep, } else { memcpy(rbuf_qws, sbuf_qws, bytes); } - FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.expected_receive.tid_replays); + FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.expected_receive.tid_rcv_pkts_replays); } #ifndef NDEBUG else { /* Debug, tracking where the TID wrote even though we don't memcpy here */ @@ -2990,7 +2992,7 @@ void fi_opx_ep_do_pending_work(struct fi_opx_ep *opx_ep) } } -static inline __attribute__((always_inline)) +static inline void fi_opx_ep_rx_poll (struct 
fid_ep *ep, const uint64_t caps, const enum ofi_reliability_kind reliability, @@ -3576,7 +3578,7 @@ ssize_t fi_opx_ep_rx_recvmsg_internal (struct fi_opx_ep *opx_ep, FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA,"===================================== POST RECVMSG RETURN FI_ENOMEM\n"); return -FI_ENOMEM; } - + ext->opx_context.flags = flags | FI_OPX_CQ_CONTEXT_EXT; ext->opx_context.byte_counter = (uint64_t)-1; ext->opx_context.src_addr = fi_opx_ep_get_src_addr(opx_ep, av_type, msg->addr); @@ -4003,7 +4005,7 @@ ssize_t fi_opx_ep_tx_send_internal (struct fid_ep *ep, FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "===================================== SEND -- Eager send failed, trying next method\n"); } - + #ifndef FI_OPX_MP_EGR_DISABLE if (is_contiguous && total_len <= FI_OPX_MP_EGR_MAX_PAYLOAD_BYTES && diff --git a/prov/opx/include/rdma/opx/fi_opx_eq.h b/prov/opx/include/rdma/opx/fi_opx_eq.h index 1c3380a2109..d77289f1de6 100644 --- a/prov/opx/include/rdma/opx/fi_opx_eq.h +++ b/prov/opx/include/rdma/opx/fi_opx_eq.h @@ -389,9 +389,8 @@ static ssize_t fi_opx_cq_poll_noinline (struct fi_opx_cq *opx_cq, return num_entries; } -static inline void __attribute__((always_inline)) fi_opx_ep_rx_poll (struct fid_ep *ep, const uint64_t caps, const enum ofi_reliability_kind reliability, const uint64_t hdrq_mask); - __OPX_FORCE_INLINE__ +__attribute__ ((flatten)) ssize_t fi_opx_cq_poll_inline(struct fid_cq *cq, void *buf, size_t count, fi_addr_t *src_addr, const enum fi_cq_format format, const int lock_required, diff --git a/prov/opx/include/rdma/opx/fi_opx_hfi1.h b/prov/opx/include/rdma/opx/fi_opx_hfi1.h index 7e2562cea1c..721d714f19c 100644 --- a/prov/opx/include/rdma/opx/fi_opx_hfi1.h +++ b/prov/opx/include/rdma/opx/fi_opx_hfi1.h @@ -1,6 +1,6 @@ /* * Copyright (C) 2016 by Argonne National Laboratory. - * Copyright (C) 2021-2023 Cornelis Networks. + * Copyright (C) 2021-2024 Cornelis Networks. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -203,6 +203,21 @@ static_assert(FI_OPX_HFI1_SDMA_MAX_WE >= FI_OPX_HFI1_SDMA_MAX_COMP_INDEX, "FI_OP //Version 1, SDMA replays - EAGER opcode (1)(byte 0), 2 iovectors (byte 1) #define FI_OPX_HFI1_SDMA_REQ_HEADER_REPLAY_EAGER_FIXEDBITS (0x0211) +#ifndef OPX_RTS_TID_SETUP_MAX_TRIES +#define OPX_RTS_TID_SETUP_MAX_TRIES (1) +#endif + +/* + * Minimum page sizes to use for different memory types. + * The array is indexed by the values defined in + * enum fi_hmem_iface. Some values are not supported. + */ +static const uint64_t OPX_TID_PAGE_SIZE[4] = { + PAGE_SIZE, /* FI_HMEM_SYSTEM */ + 64 * 1024, /* FI_HMEM_CUDA */ + PAGE_SIZE, /* FI_HMEM_ROCR */ + PAGE_SIZE /* FI_HMEM_ZE */ +}; static inline uint32_t fi_opx_addr_calculate_base_rx (const uint32_t process_id, const uint32_t processes_per_node) { diff --git a/prov/opx/include/rdma/opx/fi_opx_hfi1_packet.h b/prov/opx/include/rdma/opx/fi_opx_hfi1_packet.h index 4951172f799..40915d91416 100644 --- a/prov/opx/include/rdma/opx/fi_opx_hfi1_packet.h +++ b/prov/opx/include/rdma/opx/fi_opx_hfi1_packet.h @@ -1,6 +1,6 @@ /* * Copyright (C) 2016 by Argonne National Laboratory. - * Copyright (C) 2021-2023 Cornelis Networks. + * Copyright (C) 2021-2024 Cornelis Networks. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -43,7 +43,12 @@ #include "rdma/fabric.h" /* only for 'fi_addr_t' ... 
which is a typedef to uint64_t */ #include "rdma/opx/fi_opx_addr.h" - +#if defined(__riscv) && defined(__riscv_xlen) && (__riscv_xlen == 64) +#ifndef PAGE_SIZE +/* 4K pages default */ +#define PAGE_SIZE 4096 +#endif +#endif #define FI_OPX_ADDR_SEP_RX_MAX (4) #define FI_OPX_HFI1_PACKET_MTU (8192) @@ -816,9 +821,14 @@ union fi_opx_hfi1_packet_payload { uint64_t unused[2]; /* ==== CACHE LINE 1 ==== */ - - uint8_t immediate_byte[8]; - uint64_t immediate_qw[7]; + union { + struct { + uint8_t immediate_byte[8]; + uint64_t immediate_qw[7]; + }; + + union cacheline cache_line_1; + }; /* ==== CACHE LINE 2-127 ==== */ @@ -828,11 +838,11 @@ union fi_opx_hfi1_packet_payload { struct { /* ==== CACHE LINE 0 ==== */ - uintptr_t origin_byte_counter_vaddr; - struct fi_opx_hmem_iov iov[2]; + uintptr_t origin_byte_counter_vaddr; + struct fi_opx_hmem_iov iov[2]; /* ==== CACHE LINE 1-127 (for 8k mtu) ==== */ - struct fi_opx_hmem_iov iov_ext[FI_OPX_MAX_HMEM_IOV - 2]; + struct fi_opx_hmem_iov iov_ext[FI_OPX_MAX_HMEM_IOV - 2]; size_t unused; } noncontiguous; diff --git a/prov/opx/include/rdma/opx/fi_opx_hfi1_sdma.h b/prov/opx/include/rdma/opx/fi_opx_hfi1_sdma.h index 12d194b8c24..a3f4ea0d70b 100644 --- a/prov/opx/include/rdma/opx/fi_opx_hfi1_sdma.h +++ b/prov/opx/include/rdma/opx/fi_opx_hfi1_sdma.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2022-2023 by Cornelis Networks. + * Copyright (C) 2022-2024 by Cornelis Networks. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -59,15 +59,6 @@ static const uint16_t OPX_SDMA_REQ_SET_MEMINFO[2] = {0, static const size_t OPX_SDMA_REQ_HDR_SIZE[2] = {FI_OPX_HFI1_SDMA_HDR_SIZE, FI_OPX_HFI1_SDMA_HDR_SIZE + OPX_SDMA_MEMINFO_SIZE}; -static const unsigned OPX_SDMA_OFI_TO_KERN_MEM_TYPE[4] = { - #ifdef OPX_HMEM - HFI1_MEMINFO_TYPE_SYSTEM, - HFI1_MEMINFO_TYPE_NVIDIA, - 2, /* HFI1_MEMINFO_TYPE_AMD */ - 1 /* HFI1_MEMINFO_TYPE_DMABUF */ - #endif - }; - struct fi_opx_hfi1_sdma_header_vec { union { struct { @@ -212,14 +203,14 @@ void fi_opx_hfi1_dput_sdma_init(struct fi_opx_ep *opx_ep, return; } - params->delivery_completion = (length >= opx_ep->tx->dcomp_threshold) || - (is_hmem) || - (params->opcode == FI_OPX_HFI_DPUT_OPCODE_GET) || - (params->opcode == FI_OPX_HFI_DPUT_OPCODE_PUT) || - (params->opcode == FI_OPX_HFI_DPUT_OPCODE_ATOMIC_FETCH) || - (params->opcode == FI_OPX_HFI_DPUT_OPCODE_ATOMIC_COMPARE_FETCH); + params->sdma_no_bounce_buf = (length >= opx_ep->tx->sdma_bounce_buf_threshold) || + (is_hmem) || + (params->opcode == FI_OPX_HFI_DPUT_OPCODE_GET) || + (params->opcode == FI_OPX_HFI_DPUT_OPCODE_PUT) || + (params->opcode == FI_OPX_HFI_DPUT_OPCODE_ATOMIC_FETCH) || + (params->opcode == FI_OPX_HFI_DPUT_OPCODE_ATOMIC_COMPARE_FETCH); - if (!params->delivery_completion) { + if (!params->sdma_no_bounce_buf) { assert(params->origin_byte_counter); } params->user_cc = params->cc; @@ -515,7 +506,7 @@ void fi_opx_hfi1_sdma_set_meminfo(struct sdma_req_info *req_info, // setting meminfo, and it will be the fist one: // index 0 (the first payload IOV, or iov[1]). 
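The rename from delivery_completion to sdma_no_bounce_buf in fi_opx_hfi1_dput_sdma_init() makes the intent of the flag clearer: SDMA skips the replay bounce buffer when the payload is at or above the configured threshold, when the source is device (HMEM) memory, or for opcodes that never copy, and instead holds the sender's buffer until all ACKs return. A simplified restatement of that predicate, using hypothetical stand-in opcode names rather than the provider's enums:

#include <stdbool.h>
#include <stdint.h>

/* Hypothetical stand-ins for the provider's DPUT opcodes. */
enum dput_opcode { OP_RZV = 0, OP_GET, OP_PUT, OP_ATOMIC_FETCH, OP_ATOMIC_COMPARE_FETCH };

/* Mirrors the sdma_no_bounce_buf decision: large sends, device-memory sends,
 * and RMA/atomic opcodes keep the user buffer pinned until the ACKs arrive
 * instead of copying the payload into replay bounce buffers. */
static bool sdma_no_bounce_buf(uint64_t length, uint64_t threshold,
                               bool is_hmem, enum dput_opcode op)
{
    return (length >= threshold) || is_hmem ||
           op == OP_GET || op == OP_PUT ||
           op == OP_ATOMIC_FETCH || op == OP_ATOMIC_COMPARE_FETCH;
}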
const unsigned meminfo_idx = 0; - const unsigned type = OPX_SDMA_OFI_TO_KERN_MEM_TYPE[iface]; + const unsigned type = OPX_HMEM_KERN_MEM_TYPE[iface]; struct sdma_req_meminfo *meminfo = (struct sdma_req_meminfo *) (req_info + 1); meminfo->types = 0; HFI1_MEMINFO_TYPE_ENTRY_SET(meminfo->types, meminfo_idx, type); @@ -759,10 +750,15 @@ void opx_hfi1_sdma_do_sdma_tid(struct fi_opx_ep *opx_ep, unsigned int tidiovec_idx = 2; /* tid info iovec*/ uint32_t *tidpairs = NULL; - // TODO: GPU support for TID - assert(we->hmem.iface == FI_HMEM_SYSTEM); - const uint64_t set_meminfo = 0; + const uint64_t set_meminfo = + #ifdef OPX_HMEM + (we->hmem.iface > FI_HMEM_SYSTEM) ? 1 : 0; + #else + 0; + #endif + struct sdma_req_info *req_info = OPX_SDMA_REQ_INFO_PTR(&we->header_vec, set_meminfo); + fi_opx_hfi1_sdma_set_meminfo(req_info, set_meminfo, we->hmem.iface, we->hmem.device); /* Since we already verified that enough PSNs were available for the send we're about to do, we shouldn't need to check the @@ -775,7 +771,7 @@ void opx_hfi1_sdma_do_sdma_tid(struct fi_opx_ep *opx_ep, we->num_iovs = 3; /* request and data and tids*/ /* no padding for tid, should have been aligned.*/ assert(we->total_payload == ((we->total_payload) & -4)); - ; + we->iovecs[1].iov_len = (we->total_payload + 3) & -4; we->iovecs[1].iov_base = we->packets[0].replay->iov[0].iov_base; @@ -800,8 +796,8 @@ void opx_hfi1_sdma_do_sdma_tid(struct fi_opx_ep *opx_ep, we->packets[i].replay->scb.hdr.qw[2] |= (uint64_t)htonl((uint32_t)psn); we->packets[i].replay->sdma_we_use_count = we->bounce_buf.use_count; we->packets[i].replay->sdma_we = replay_back_ptr; - we->packets[i].replay->hmem_iface = FI_HMEM_SYSTEM; - we->packets[i].replay->hmem_device = 0; + we->packets[i].replay->hmem_iface = we->hmem.iface; + we->packets[i].replay->hmem_device = we->hmem.device; fi_opx_reliability_client_replay_register_with_update( &opx_ep->reliability->state, we->dlid, we->rs, we->rx, we->psn_ptr, we->packets[i].replay, cc, @@ -816,7 +812,8 @@ void opx_hfi1_sdma_do_sdma_tid(struct fi_opx_ep *opx_ep, we->iovecs[tidiovec_idx].iov_len = tid_iov->iov_len - (tid_idx * sizeof(uint32_t)); we->iovecs[tidiovec_idx].iov_base = &tidpairs[tid_idx]; req_info->ctrl = FI_OPX_HFI1_SDMA_REQ_HEADER_EXPECTED_FIXEDBITS | - (((uint16_t)we->num_iovs) << HFI1_SDMA_REQ_IOVCNT_SHIFT); + (((uint16_t)we->num_iovs) << HFI1_SDMA_REQ_IOVCNT_SHIFT) | + OPX_SDMA_REQ_SET_MEMINFO[set_meminfo]; uint32_t tidpair = tidpairs[tid_idx]; uint32_t kdeth = (FI_OPX_HFI1_KDETH_TIDCTRL & FI_OPX_EXP_TID_GET((tidpair), CTRL)) @@ -856,8 +853,7 @@ void opx_hfi1_sdma_do_sdma_tid(struct fi_opx_ep *opx_ep, *fill_index = ((*fill_index) + 1) % (opx_ep->hfi->info.sdma.queue_size); --opx_ep->hfi->info.sdma.available_counter; - FI_OPX_DEBUG_COUNTERS_INC( - opx_ep->debug_counters.sdma.writev_calls[we->num_packets]); + FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.sdma.writev_calls[we->num_packets]); ssize_t rc = writev(opx_ep->hfi->fd, we->iovecs, we->num_iovs); FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, "===================================== SDMA_WE -- called writev rc=%ld Params were: fd=%d iovecs=%p num_iovs=%d \n", diff --git a/prov/opx/include/rdma/opx/fi_opx_hfi1_transport.h b/prov/opx/include/rdma/opx/fi_opx_hfi1_transport.h index f0bcca537a3..1d320ced2d9 100644 --- a/prov/opx/include/rdma/opx/fi_opx_hfi1_transport.h +++ b/prov/opx/include/rdma/opx/fi_opx_hfi1_transport.h @@ -1,6 +1,6 @@ /* * Copyright (C) 2016 by Argonne National Laboratory. - * Copyright (C) 2021-2023 Cornelis Networks. 
+ * Copyright (C) 2021-2024 Cornelis Networks. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -397,7 +397,7 @@ struct fi_opx_hfi1_dput_params { uint16_t sdma_reqs_used; bool is_intranode; - bool delivery_completion; + bool sdma_no_bounce_buf; bool use_expected_opcode; uint8_t u8_rx; uint8_t dt; @@ -429,9 +429,13 @@ struct fi_opx_hfi1_rx_rzv_rts_params { uintptr_t origin_byte_counter_vaddr; struct fi_opx_rzv_completion* rzv_comp; uintptr_t dst_vaddr; /* bumped past immediate data */ - uint64_t immediate_data; - uint64_t immediate_end_block_count; + uint64_t tid_pending_vaddr; + uint64_t tid_pending_tid_vaddr; + int64_t tid_pending_length; + int64_t tid_pending_tid_length; + int64_t tid_pending_alignment_adjustment; + uint32_t tid_setup_retries; uint32_t ntidpairs; uint32_t tid_offset; uint32_t u32_extended_rx; @@ -442,13 +446,13 @@ struct fi_opx_hfi1_rx_rzv_rts_params { uint16_t origin_rx; uint8_t opcode; - uint8_t fallback_opcode; uint8_t u8_rx; uint8_t target_hfi_unit; + /* Either FI_OPX_MAX_DPUT_IOV iov's or 1 iov and FI_OPX_MAX_DPUT_TIDPAIRS tidpairs */ union { - union fi_opx_hfi1_dput_iov src_iov[FI_OPX_MAX_DPUT_IOV]; + union fi_opx_hfi1_dput_iov dput_iov[FI_OPX_MAX_DPUT_IOV]; struct { union fi_opx_hfi1_dput_iov reserved;/* skip 1 iov */ uint32_t tidpairs[FI_OPX_MAX_DPUT_TIDPAIRS]; diff --git a/prov/opx/include/rdma/opx/fi_opx_hmem.h b/prov/opx/include/rdma/opx/fi_opx_hmem.h index 43da5dcc6ab..bf4b7d89566 100644 --- a/prov/opx/include/rdma/opx/fi_opx_hmem.h +++ b/prov/opx/include/rdma/opx/fi_opx_hmem.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2023 by Cornelis Networks. + * Copyright (C) 2023-2024 by Cornelis Networks. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -33,6 +33,7 @@ #define _FI_PROV_OPX_HMEM_H_ #include +#include #include "rdma/opx/fi_opx_compiler.h" #include "rdma/opx/fi_opx_rma_ops.h" #include "ofi_hmem.h" @@ -46,23 +47,6 @@ struct fi_opx_hmem_info { OPX_COMPILE_TIME_ASSERT((sizeof(struct fi_opx_hmem_info) & 0x7) == 0, "sizeof(fi_opx_hmem_info) should be a multiple of 8"); -__OPX_FORCE_INLINE__ -unsigned fi_opx_hmem_is_managed(const void *ptr, const enum fi_hmem_iface iface) -{ -#if defined(OPX_HMEM) && HAVE_CUDA - if (iface == FI_HMEM_CUDA) { - unsigned is_hmem_managed; - CUresult __attribute__((unused)) cu_result = - ofi_cuPointerGetAttribute(&is_hmem_managed, - CU_POINTER_ATTRIBUTE_IS_MANAGED, - (CUdeviceptr)ptr); - assert(cu_result == CUDA_SUCCESS); - return is_hmem_managed; - } -#endif - return 0; -} - __OPX_FORCE_INLINE__ enum fi_hmem_iface fi_opx_hmem_get_iface(const void *ptr, const struct fi_opx_mr *desc, @@ -83,16 +67,44 @@ enum fi_hmem_iface fi_opx_hmem_get_iface(const void *ptr, return desc->attr.iface; } - enum fi_hmem_iface iface = ofi_get_hmem_iface(ptr, device, NULL); - if (iface == FI_HMEM_CUDA && fi_opx_hmem_is_managed(ptr, iface)) { - *device = 0ul; - return FI_HMEM_SYSTEM; - } - return iface; -#else + #if HAVE_CUDA + unsigned mem_type; + unsigned is_managed; + unsigned device_ordinal; + + /* Each pointer in 'data' needs to have the same array index + as the corresponding attribute in 'cuda_attributes' */ + void *data[] = {&mem_type, &is_managed, &device_ordinal}; + + enum CUpointer_attribute_enum cuda_attributes[] = { + CU_POINTER_ATTRIBUTE_MEMORY_TYPE, + CU_POINTER_ATTRIBUTE_IS_MANAGED, + CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL + }; + + CUresult cuda_rc = cuPointerGetAttributes(ARRAY_SIZE(cuda_attributes), + cuda_attributes, data, + (CUdeviceptr) ptr); + + if (cuda_rc == CUDA_SUCCESS) { + + if (mem_type == CU_MEMORYTYPE_DEVICE && !is_managed) { + *device = device_ordinal; + return FI_HMEM_CUDA; + } + } else if (cuda_rc != CUDA_ERROR_INVALID_CONTEXT) { + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, + "Bad return code %hu from cuPointerGetAttributes()", + cuda_rc); + } + #else + enum fi_hmem_iface iface = ofi_get_hmem_iface(ptr, device, NULL); + return iface; + #endif +#endif + *device = 0ul; return FI_HMEM_SYSTEM; -#endif } __OPX_FORCE_INLINE__ @@ -116,6 +128,15 @@ unsigned fi_opx_hmem_iov_init(const void *buf, #endif } +static const unsigned OPX_HMEM_KERN_MEM_TYPE[4] = { + #ifdef OPX_HMEM + HFI1_MEMINFO_TYPE_SYSTEM, + HFI1_MEMINFO_TYPE_NVIDIA, + 2, /* HFI1_MEMINFO_TYPE_AMD */ + 1 /* HFI1_MEMINFO_TYPE_DMABUF */ + #endif +}; + #ifdef OPX_HMEM #define OPX_HMEM_COPY_FROM(dst, src, len, src_iface, src_device) \ do { \ diff --git a/prov/opx/include/rdma/opx/fi_opx_timer.h b/prov/opx/include/rdma/opx/fi_opx_timer.h index a1c9876c1b8..6738c9d2d10 100644 --- a/prov/opx/include/rdma/opx/fi_opx_timer.h +++ b/prov/opx/include/rdma/opx/fi_opx_timer.h @@ -62,6 +62,14 @@ static inline uint64_t fi_opx_timer_get_cycles() cycles = ((uint64_t)a) | (((uint64_t)d) << 32); return cycles; } +#elif defined(__riscv) && defined(__riscv_xlen) && (__riscv_xlen == 64) +__attribute__((always_inline)) +static inline uint64_t fi_opx_timer_get_cycles() +{ + uint64_t dst = 0; + asm volatile ("rdcycle %0" : "=r" (dst) ); + return dst; +} #else #error "Cycle timer not defined for this platform" #endif diff --git a/prov/opx/src/fi_opx_ep.c b/prov/opx/src/fi_opx_ep.c index 7ded6c5ec5b..543812bd94a 100644 --- a/prov/opx/src/fi_opx_ep.c +++ b/prov/opx/src/fi_opx_ep.c @@ -1,6 +1,6 @@ /* * 
Copyright (C) 2016 by Argonne National Laboratory. - * Copyright (C) 2021-2023 Cornelis Networks. + * Copyright (C) 2021-2024 Cornelis Networks. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -819,25 +819,28 @@ static int fi_opx_ep_tx_init (struct fi_opx_ep *opx_ep, OPX_LOG_OBSERVABLE(FI_LOG_EP_DATA, "Set pio_flow_eager_tx_bytes to %d \n", opx_ep->tx->pio_flow_eager_tx_bytes); - /* Set delivery completion max threshold. Any messages larger than this value in bytes will not be copied to + /* Set SDMA bounce buffer threshold. Any messages larger than this value in bytes will not be copied to * replay bounce buffers. Instead, hold the sender's large message buffer until we get all ACKs back from the Rx * side of the message. Since no copy of the message is made, it will need to be used to handle NAKs. */ - int l_dcomp_threshold; - ssize_t rc = fi_param_get_int(fi_opx_global.prov, "delivery_completion_threshold", &l_dcomp_threshold); + int l_sdma_bounce_buf_threshold; + ssize_t rc = fi_param_get_int(fi_opx_global.prov, "sdma_bounce_buf_threshold", &l_sdma_bounce_buf_threshold); if (rc != FI_SUCCESS) { - opx_ep->tx->dcomp_threshold = OPX_DEFAULT_DCOMP_THRESHOLD; - OPX_LOG_OBSERVABLE(FI_LOG_EP_DATA, "FI_OPX_DELIVERY_COMPLETION_THRESHOLD not set. Using default setting of %d\n", - opx_ep->tx->dcomp_threshold); - } else if (l_dcomp_threshold < OPX_MIN_DCOMP_THRESHOLD || l_dcomp_threshold > (OPX_MAX_DCOMP_THRESHOLD)) { - opx_ep->tx->dcomp_threshold = OPX_DEFAULT_DCOMP_THRESHOLD; + rc = fi_param_get_int(fi_opx_global.prov, "delivery_completion_threshold", &l_sdma_bounce_buf_threshold); + } + if (rc != FI_SUCCESS) { + opx_ep->tx->sdma_bounce_buf_threshold = OPX_SDMA_BOUNCE_BUF_THRESHOLD; + OPX_LOG_OBSERVABLE(FI_LOG_EP_DATA, "FI_OPX_SDMA_BOUNCE_BUF_THRESHOLD not set. Using default setting of %d\n", + opx_ep->tx->sdma_bounce_buf_threshold); + } else if (l_sdma_bounce_buf_threshold < OPX_SDMA_BOUNCE_BUF_MIN || l_sdma_bounce_buf_threshold > (OPX_SDMA_BOUNCE_BUF_MAX)) { + opx_ep->tx->sdma_bounce_buf_threshold = OPX_SDMA_BOUNCE_BUF_THRESHOLD; FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, - "Error: FI_OPX_DELIVERY_COMPLETION_THRESHOLD was set but is outside of MIN/MAX thresholds. Using default setting of %d\n", - opx_ep->tx->dcomp_threshold); + "Error: FI_OPX_SDMA_BOUNCE_BUF_THRESHOLD was set but is outside of MIN/MAX thresholds. Using default setting of %d\n", + opx_ep->tx->sdma_bounce_buf_threshold); } else { - opx_ep->tx->dcomp_threshold = l_dcomp_threshold; - OPX_LOG_OBSERVABLE(FI_LOG_EP_DATA, "FI_OPX_DELIVERY_COMPLETION_THRESHOLD was specified. Set to %d\n", - opx_ep->tx->dcomp_threshold); + opx_ep->tx->sdma_bounce_buf_threshold = l_sdma_bounce_buf_threshold; + OPX_LOG_OBSERVABLE(FI_LOG_EP_DATA, "FI_OPX_SDMA_BOUNCE_BUF_THRESHOLD was specified. Set to %d\n", + opx_ep->tx->sdma_bounce_buf_threshold); } OPX_LOG_OBSERVABLE(FI_LOG_EP_DATA, "Multi-packet eager max message length is %d, chunk-size is %d.\n", diff --git a/prov/opx/src/fi_opx_hfi1.c b/prov/opx/src/fi_opx_hfi1.c index 6bba39beb53..a24b45851cc 100644 --- a/prov/opx/src/fi_opx_hfi1.c +++ b/prov/opx/src/fi_opx_hfi1.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2021-2023 by Cornelis Networks. + * Copyright (C) 2021-2024 by Cornelis Networks. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -34,6 +34,9 @@ #include #include #include +#include +#include +#include #include "rdma/fabric.h" // only for 'fi_addr_t' ... which is a typedef to uint64_t #include "rdma/opx/fi_opx_hfi1.h" @@ -156,7 +159,7 @@ static enum opx_hfi1_type opx_hfi1_check_hwversion (const uint32_t hw_version) { // Used by fi_opx_hfi1_context_open as a convenience. static int opx_open_hfi_and_context(struct _hfi_ctrl **ctrl, struct fi_opx_hfi1_context_internal *internal, - uuid_t unique_job_key, + uuid_t unique_job_key, int hfi_unit_number) { int fd; @@ -187,6 +190,7 @@ static int opx_open_hfi_and_context(struct _hfi_ctrl **ctrl, hfi_unit_number); fd = -1; } + assert((*ctrl)->__hfi_pg_sz == OPX_HFI1_TID_PAGESIZE); } return fd; } @@ -275,7 +279,7 @@ void fi_opx_init_hfi_lookup() int rc __attribute__ ((unused)); rc = posix_memalign((void **)&hfi_lookup, 32, sizeof(*hfi_lookup)); assert(rc==0); - + if (!hfi_lookup) { FI_WARN(&fi_opx_provider, FI_LOG_EP_DATA, "Unable to allocate HFI lookup entry.\n"); @@ -315,13 +319,25 @@ struct fi_opx_hfi1_context *fi_opx_hfi1_context_open(struct fid_ep *ep, uuid_t u const int hfi_count = opx_hfi_get_num_units(); int hfi_candidates[FI_OPX_MAX_HFIS]; int hfi_distances[FI_OPX_MAX_HFIS]; + int hfi_freectxs[FI_OPX_MAX_HFIS]; int hfi_candidates_count = 0; int hfi_candidate_index = -1; struct _hfi_ctrl *ctrl = NULL; bool use_default_logic = true; + int dirfd = -1; + + memset(hfi_candidates, 0, sizeof(*hfi_candidates) * FI_OPX_MAX_HFIS); + memset(hfi_distances, 0, sizeof(*hfi_distances) * FI_OPX_MAX_HFIS); + memset(hfi_freectxs, 0, sizeof(*hfi_freectxs) * FI_OPX_MAX_HFIS); struct fi_opx_hfi1_context_internal *internal = calloc(1, sizeof(struct fi_opx_hfi1_context_internal)); + if (!internal) + { + FI_WARN(&fi_opx_provider, FI_LOG_FABRIC, + "Error: Memory allocation failure for fi_opx_hfi_context_internal.\n"); + return NULL; + } struct fi_opx_hfi1_context *context = &internal->context; @@ -494,7 +510,7 @@ struct fi_opx_hfi1_context *fi_opx_hfi1_context_open(struct fid_ep *ep, uuid_t u if (hfi_context_rank != -1) { hfi_context_rank_inst = fi_opx_get_daos_hfi_rank_inst(hfi_unit_number, hfi_context_rank); - + FI_WARN(&fi_opx_provider, FI_LOG_FABRIC, "Application-specified HFI selection set to %d rank %d.%d. Skipping HFI selection algorithm\n", hfi_unit_number, hfi_context_rank, hfi_context_rank_inst); @@ -538,34 +554,59 @@ struct fi_opx_hfi1_context *fi_opx_hfi1_context_open(struct fid_ep *ep, uuid_t u } } else { + + // Lock on the opx class directory path so that HFI selection based on distance and + // number of free credits available is atomic. This is to avoid the situation where several + // processes go to read the number of free contexts available in each HFI at the same time + // and choose the same HFi with the smallest load as well as closest to the corresponding process. + // If the processes of selection and then context openning is atomic here, this situation is avoided + // and hfi selection should be evenly balanced. + if ((dirfd = open(OPX_CLASS_DIR_PATH, O_RDONLY)) == -1) { + FI_WARN(&fi_opx_provider, FI_LOG_FABRIC, + "Failed to open %s: %s for flock use.\n", OPX_CLASS_DIR_PATH, strerror(errno)); + free(internal); + return NULL; + } + + if (flock(dirfd, LOCK_EX) == -1) { + FI_WARN(&fi_opx_provider, FI_LOG_FABRIC, + "Flock exclusive lock failure: %s\n", strerror(errno)); + close(dirfd); + free(internal); + return NULL; + } + // The system has multiple HFIs. Sort them by distance from - // this process. 
- int hfi_n, hfi_d; + // this process. HFIs with same distance are sorted by number of + // free contexts available. + int hfi_n, hfi_d, hfi_f; for (int i = 0; i < hfi_count; i++) { if (opx_hfi_get_unit_active(i) > 0) { hfi_n = opx_hfi_sysfs_unit_read_node_s64(i); hfi_d = numa_distance(hfi_n, numa_node_id); + hfi_f = opx_hfi_get_num_free_contexts(i); FI_INFO(&fi_opx_provider, FI_LOG_FABRIC, - "HFI unit %d in numa node %d has a distance of %d from this pid.\n", - i, hfi_n, hfi_d); + "HFI unit %d in numa node %d has a distance of %d from this pid with" + " %d free contexts available.\n", i, hfi_n, hfi_d, hfi_f); hfi_candidates[hfi_candidates_count] = i; hfi_distances[hfi_candidates_count] = hfi_d; + hfi_freectxs[hfi_candidates_count] = hfi_f; int j = hfi_candidates_count; - // Bubble the new HFI up till the list is sorted. - // Yes, this is lame but the practical matter is that - // there will never be so many HFIs on a single system - // that a real insertion sort is justified. Also, doing it - // this way results in a deterministic result - HFIs will - // be implicitly sorted by their unit number as well as - // by distance ensuring that all processes in a NUMA node - // will see the HFIs in the same order. - while (j > 0 && hfi_distances[j - 1] > hfi_distances[j]) { + // Bubble the new HFI up till the list is sorted by distance + // and then by number of free contexts. Yes, this is lame but + // the practical matter is that there will never be so many HFIs + // on a single system that a real insertion sort is justified. + while (j > 0 && ((hfi_distances[j - 1] > hfi_distances[j]) || + ( (hfi_distances[j - 1] == hfi_distances[j]) && (hfi_freectxs[j - 1] < hfi_freectxs[j])))){ int t1 = hfi_distances[j - 1]; int t2 = hfi_candidates[j - 1]; + int t3 = hfi_freectxs[j - 1]; hfi_distances[j - 1] = hfi_distances[j]; hfi_candidates[j - 1] = hfi_candidates[j]; + hfi_freectxs[j - 1] = hfi_freectxs[j]; hfi_distances[j] = t1; hfi_candidates[j] = t2; + hfi_freectxs[j] = t3; j--; } hfi_candidates_count++; @@ -573,11 +614,11 @@ struct fi_opx_hfi1_context *fi_opx_hfi1_context_open(struct fid_ep *ep, uuid_t u } } - // At this point we have a list of HFIs, sorted by distance from this - // pid (and by unit # as an implied key). Pick from the closest HFIs - // based on the modulo of the pid. If we fail to open that HFI, try - // another one at the same distance. If that fails, we will try HFIs - // that are further away. + // At this point we have a list of HFIs, sorted by distance from this pid (and by unit # as an implied key). + // HFIs that have the same distance are sorted by number of free contexts available. + // Pick the closest HFI that has the smallest load (largest number of free contexts). + // If we fail to open that HFI, try another one at the same distance but potentially + // under a heavier load. If that fails, we will try HFIs that are further away. int lower = 0; int higher = 0; do { @@ -589,16 +630,13 @@ struct fi_opx_hfi1_context *fi_opx_hfi1_context_open(struct fid_ep *ep, uuid_t u higher++; } - // Use the modulo of the pid to select an HFI. The intent - // is to use HFIs evenly rather than have many pids open - // the 1st HFi then have many select the next HFI, etc... + // Select the hfi that is under the smallest load. All + // hfis from [lower, higher) are sorted by number of free contexts + // available with lower having the most contexts free. 
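The selection change in fi_opx_hfi1_context_open is easiest to see in isolation: candidates stay sorted primarily by NUMA distance (ascending) and, within equal distance, by free contexts (descending), so the first index of each distance band is the closest, least-loaded unit. A standalone sketch of that insertion step, with illustrative arrays rather than the provider's:

#include <stdio.h>

/* Insert candidate 'unit' with the given distance and free-context count,
 * keeping the arrays sorted by (distance ascending, free contexts descending),
 * as the HFI selection loop does.  'n' is the current candidate count. */
static void insert_candidate(int *units, int *dist, int *freectx, int n,
                             int unit, int d, int f)
{
    units[n] = unit;
    dist[n] = d;
    freectx[n] = f;

    int j = n;
    while (j > 0 && (dist[j - 1] > dist[j] ||
                     (dist[j - 1] == dist[j] && freectx[j - 1] < freectx[j]))) {
        int t;
        t = dist[j - 1];    dist[j - 1] = dist[j];       dist[j] = t;
        t = units[j - 1];   units[j - 1] = units[j];     units[j] = t;
        t = freectx[j - 1]; freectx[j - 1] = freectx[j]; freectx[j] = t;
        j--;
    }
}

int main(void)
{
    int units[3], dist[3], freectx[3];
    insert_candidate(units, dist, freectx, 0, 0, 10, 8);   /* unit 0: near, 8 free  */
    insert_candidate(units, dist, freectx, 1, 10, 32, 16); /* unit 10: far          */
    insert_candidate(units, dist, freectx, 2, 1, 10, 12);  /* unit 1: near, 12 free */
    /* Closest, least-loaded unit ends up at index 0. */
    printf("best unit: %d\n", units[0]); /* prints 1 */
    return 0;
}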
int range = higher - lower; - hfi_candidate_index = getpid() % range + lower; + hfi_candidate_index = lower; hfi_unit_number = hfi_candidates[hfi_candidate_index]; - // Try to open the HFI. If we fail, try the other HFIs - // at that distance until we run out of HFIs at that - // distance. fd = opx_open_hfi_and_context(&ctrl, internal, unique_job_key, hfi_unit_number); int t = range; @@ -616,6 +654,20 @@ struct fi_opx_hfi1_context *fi_opx_hfi1_context_open(struct fid_ep *ep, uuid_t u lower = higher; } while (fd < 0 && lower < hfi_candidates_count); + if (dirfd != -1) { + if (flock(dirfd, LOCK_UN) == -1) { + FI_WARN(&fi_opx_provider, FI_LOG_FABRIC, "Flock unlock failure: %s\n", strerror(errno)); + close(dirfd); + + if (fd >=0) { + opx_hfi_context_close(fd); + } + free(internal); + return NULL; + } + close(dirfd); + } + if (fd < 0) { FI_WARN(&fi_opx_provider, FI_LOG_FABRIC, "FATAL: Found %d active HFI device%s, unable to open %s.\n", @@ -670,7 +722,7 @@ struct fi_opx_hfi1_context *fi_opx_hfi1_context_open(struct fid_ep *ep, uuid_t u FI_INFO(&fi_opx_provider, FI_LOG_FABRIC, "Detected user specfied ENV FI_OPX_SL, so set the service level to %d\n", user_sl); } else { - FI_WARN(&fi_opx_provider, FI_LOG_FABRIC, "Error: User specfied an env FI_OPX_SL. Valid data is an positive integer 0 - 31 (Default is 0). User specified %d. Using default value of %d instead\n", + FI_WARN(&fi_opx_provider, FI_LOG_FABRIC, "Error: User specfied an env FI_OPX_SL. Valid data is an positive integer 0 - 31 (Default is 0). User specified %d. Using default value of %d instead\n", user_sl, FI_OPX_HFI1_SL_DEFAULT); context->sl = FI_OPX_HFI1_SL_DEFAULT; } @@ -691,7 +743,7 @@ struct fi_opx_hfi1_context *fi_opx_hfi1_context_open(struct fid_ep *ep, uuid_t u context->vl = rc; if(context->sc == FI_OPX_HFI1_SC_ADMIN || context->vl == FI_OPX_HFI1_VL_ADMIN) { - FI_WARN(&fi_opx_provider, FI_LOG_FABRIC, "Detected user set ENV FI_OPX_SL of %ld, which has translated to admin-level Service class (SC=%ld) and/or admin-level Virtual Lane(VL=%ld), which is invalid for user traffic. Using default values instead\n", + FI_WARN(&fi_opx_provider, FI_LOG_FABRIC, "Detected user set ENV FI_OPX_SL of %ld, which has translated to admin-level Service class (SC=%ld) and/or admin-level Virtual Lane(VL=%ld), which is invalid for user traffic. 
Using default values instead\n", context->sl, context->sc, context->vl); context->sl = FI_OPX_HFI1_SL_DEFAULT; context->sc = FI_OPX_HFI1_SC_DEFAULT; @@ -893,6 +945,7 @@ ssize_t fi_opx_hfi1_tx_connect (struct fi_opx_ep *opx_ep, fi_addr_t peer) uint32_t segment_index = OPX_SHM_SEGMENT_INDEX(hfi_unit, rx_index); assert(segment_index < OPX_SHM_MAX_CONN_NUM); +#ifdef OPX_DAOS /* HFI Rank Support: Rank and PID included in the SHM file name */ if (opx_ep->daos_info.hfi_rank_enabled) { rx_index = opx_shm_daos_rank_index(opx_ep->daos_info.rank, @@ -900,8 +953,9 @@ ssize_t fi_opx_hfi1_tx_connect (struct fi_opx_ep *opx_ep, fi_addr_t peer) inst = opx_ep->daos_info.rank_inst; segment_index = rx_index; } +#endif - snprintf(buffer,sizeof(buffer), OPX_SHM_FILE_NAME_PREFIX_FORMAT, + snprintf(buffer, sizeof(buffer), OPX_SHM_FILE_NAME_PREFIX_FORMAT, opx_ep->domain->unique_job_key_str, hfi_unit, inst); rc = opx_shm_tx_connect(&opx_ep->tx->shm, (const char * const)buffer, @@ -912,7 +966,7 @@ ssize_t fi_opx_hfi1_tx_connect (struct fi_opx_ep *opx_ep, fi_addr_t peer) return rc; } -int fi_opx_hfi1_do_rx_rzv_rts_intranode (union fi_opx_hfi1_deferred_work *work) +int opx_hfi1_rx_rzv_rts_send_cts_intranode(union fi_opx_hfi1_deferred_work *work) { struct fi_opx_hfi1_rx_rzv_rts_params *params = &work->rx_rzv_rts; @@ -926,7 +980,7 @@ int fi_opx_hfi1_do_rx_rzv_rts_intranode (union fi_opx_hfi1_deferred_work *work) /* Possible SHM connections required for certain applications (i.e., DAOS) * exceeds the max value of the legacy u8_rx field. Use u32_extended field. */ - ssize_t rc = fi_opx_shm_dynamic_tx_connect(1, opx_ep, + ssize_t rc = fi_opx_shm_dynamic_tx_connect(OPX_INTRANODE_TRUE, opx_ep, params->u32_extended_rx, params->target_hfi_unit); if (OFI_UNLIKELY(rc)) { @@ -957,13 +1011,13 @@ int fi_opx_hfi1_do_rx_rzv_rts_intranode (union fi_opx_hfi1_deferred_work *work) uintptr_t vaddr_with_offset = params->dst_vaddr; /* receive buffer virtual address */ for(int i = 0; i < params->niov; i++) { tx_payload->cts.iov[i].rbuf = vaddr_with_offset; - tx_payload->cts.iov[i].sbuf = (uintptr_t)params->src_iov[i].sbuf; - tx_payload->cts.iov[i].bytes = params->src_iov[i].bytes; - tx_payload->cts.iov[i].rbuf_device = params->src_iov[i].rbuf_device; - tx_payload->cts.iov[i].sbuf_device = params->src_iov[i].sbuf_device; - tx_payload->cts.iov[i].rbuf_iface = params->src_iov[i].rbuf_iface; - tx_payload->cts.iov[i].sbuf_iface = params->src_iov[i].sbuf_iface; - vaddr_with_offset += params->src_iov[i].bytes; + tx_payload->cts.iov[i].sbuf = (uintptr_t)params->dput_iov[i].sbuf; + tx_payload->cts.iov[i].bytes = params->dput_iov[i].bytes; + tx_payload->cts.iov[i].rbuf_device = params->dput_iov[i].rbuf_device; + tx_payload->cts.iov[i].sbuf_device = params->dput_iov[i].sbuf_device; + tx_payload->cts.iov[i].rbuf_iface = params->dput_iov[i].rbuf_iface; + tx_payload->cts.iov[i].sbuf_iface = params->dput_iov[i].sbuf_iface; + vaddr_with_offset += params->dput_iov[i].bytes; } opx_shm_tx_advance(&opx_ep->tx->shm, (void*)tx_hdr, pos); @@ -974,8 +1028,7 @@ int fi_opx_hfi1_do_rx_rzv_rts_intranode (union fi_opx_hfi1_deferred_work *work) return FI_SUCCESS; } -/* Rendezvous to eager ring buffers (not directly to user buffers) */ -int fi_opx_hfi1_do_rx_rzv_rts_eager_ring(union fi_opx_hfi1_deferred_work *work) +int opx_hfi1_rx_rzv_rts_send_cts(union fi_opx_hfi1_deferred_work *work) { struct fi_opx_hfi1_rx_rzv_rts_params *params = &work->rx_rzv_rts; struct fi_opx_ep *opx_ep = params->opx_ep; @@ -983,8 +1036,10 @@ int fi_opx_hfi1_do_rx_rzv_rts_eager_ring(union 
fi_opx_hfi1_deferred_work *work) const uint64_t bth_rx = ((uint64_t)params->u8_rx) << 56; FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, - "===================================== RECV, HFI -- RENDEZVOUS EAGER RTS (begin)\n"); - const uint64_t payload_bytes = params->niov * sizeof(union fi_opx_hfi1_dput_iov); + "===================================== RECV, HFI -- RENDEZVOUS %s RTS (begin)\n", + params->ntidpairs ? "EXPECTED TID" : "EAGER"); + const uint64_t tid_payload = params->ntidpairs ? ((params->ntidpairs + 2) * sizeof(uint32_t)) : 0; + const uint64_t payload_bytes = (params->niov * sizeof(union fi_opx_hfi1_dput_iov)) + tid_payload; const uint64_t pbc_dws = 2 + /* pbc */ 2 + /* lrh */ @@ -995,14 +1050,14 @@ int fi_opx_hfi1_do_rx_rzv_rts_eager_ring(union fi_opx_hfi1_deferred_work *work) union fi_opx_hfi1_pio_state pio_state = *opx_ep->tx->pio_state; const uint16_t total_credits_needed = 1 + /* packet header */ ((payload_bytes + 63) >> 6); /* payload blocks needed */ - uint64_t total_credits_available = FI_OPX_HFI1_AVAILABLE_CREDITS(pio_state, + uint64_t total_credits_available = FI_OPX_HFI1_AVAILABLE_CREDITS(pio_state, &opx_ep->tx->force_credit_return, total_credits_needed); if (OFI_UNLIKELY(total_credits_available < total_credits_needed)) { fi_opx_compiler_msync_writes(); FI_OPX_HFI1_UPDATE_CREDITS(pio_state, opx_ep->tx->pio_credits_addr); - total_credits_available = FI_OPX_HFI1_AVAILABLE_CREDITS(pio_state, + total_credits_available = FI_OPX_HFI1_AVAILABLE_CREDITS(pio_state, &opx_ep->tx->force_credit_return, total_credits_needed); opx_ep->tx->pio_state->qw0 = pio_state.qw0; @@ -1015,302 +1070,242 @@ int fi_opx_hfi1_do_rx_rzv_rts_eager_ring(union fi_opx_hfi1_deferred_work *work) union fi_opx_reliability_tx_psn *psn_ptr; int64_t psn; - psn = fi_opx_reliability_get_replay(&opx_ep->ep_fid, &opx_ep->reliability->state, params->slid, - params->u8_rx, params->origin_rs, &psn_ptr, &replay, params->reliability); + psn = fi_opx_reliability_get_replay(&opx_ep->ep_fid, + &opx_ep->reliability->state, + params->slid, + params->u8_rx, + params->origin_rs, + &psn_ptr, + &replay, + params->reliability); if(OFI_UNLIKELY(psn == -1)) { FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "FI_EAGAIN\n"); return -FI_EAGAIN; } assert(payload_bytes <= FI_OPX_HFI1_PACKET_MTU); + // The "memcopy first" code is here as an alternative to the more complicated // direct write to pio followed by memory copy of the reliability buffer - replay->scb.qw0 = opx_ep->rx->tx.cts.qw0 | pbc_dws | - ((opx_ep->tx->force_credit_return & FI_OPX_HFI1_PBC_CR_MASK) << FI_OPX_HFI1_PBC_CR_SHIFT); + replay->scb.qw0 = opx_ep->rx->tx.cts.qw0 | pbc_dws; replay->scb.hdr.qw[0] = opx_ep->rx->tx.cts.hdr.qw[0] | lrh_dlid | - ((uint64_t)lrh_dws << 32); + ((uint64_t) lrh_dws << 32); replay->scb.hdr.qw[1] = opx_ep->rx->tx.cts.hdr.qw[1] | bth_rx; replay->scb.hdr.qw[2] = opx_ep->rx->tx.cts.hdr.qw[2] | psn; replay->scb.hdr.qw[3] = opx_ep->rx->tx.cts.hdr.qw[3]; replay->scb.hdr.qw[4] = opx_ep->rx->tx.cts.hdr.qw[4] | + ((uint64_t) params->ntidpairs << 32) | (params->niov << 48) | params->opcode; replay->scb.hdr.qw[5] = params->origin_byte_counter_vaddr; - replay->scb.hdr.qw[6] = (uint64_t)params->rzv_comp; + replay->scb.hdr.qw[6] = (uint64_t) params->rzv_comp; union fi_opx_hfi1_packet_payload *const tx_payload = - (union fi_opx_hfi1_packet_payload *)replay->payload; + (union fi_opx_hfi1_packet_payload *) replay->payload; assert(((uint8_t *)tx_payload) == ((uint8_t *)&replay->data)); - uintptr_t vaddr_with_offset = params->dst_vaddr; /* receive buffer virtual 
address */ + uintptr_t vaddr_with_offset = params->ntidpairs ? + ((uint64_t)params->dst_vaddr & -64) : + params->dst_vaddr; /* receive buffer virtual address */ + for (int i = 0; i < params->niov; i++) { tx_payload->cts.iov[i].rbuf = vaddr_with_offset; - tx_payload->cts.iov[i].sbuf = (uintptr_t)params->src_iov[i].sbuf; - tx_payload->cts.iov[i].bytes = params->src_iov[i].bytes; - tx_payload->cts.iov[i].sbuf_device = params->src_iov[i].sbuf_device; - tx_payload->cts.iov[i].rbuf_device = params->src_iov[i].rbuf_device; - tx_payload->cts.iov[i].sbuf_iface = params->src_iov[i].sbuf_iface; - tx_payload->cts.iov[i].rbuf_iface = params->src_iov[i].rbuf_iface; - vaddr_with_offset += params->src_iov[i].bytes; + tx_payload->cts.iov[i].sbuf = (uintptr_t)params->dput_iov[i].sbuf; + tx_payload->cts.iov[i].bytes = params->dput_iov[i].bytes; + tx_payload->cts.iov[i].sbuf_device = params->dput_iov[i].sbuf_device; + tx_payload->cts.iov[i].rbuf_device = params->dput_iov[i].rbuf_device; + tx_payload->cts.iov[i].sbuf_iface = params->dput_iov[i].sbuf_iface; + tx_payload->cts.iov[i].rbuf_iface = params->dput_iov[i].rbuf_iface; + vaddr_with_offset += params->dput_iov[i].bytes; } - FI_OPX_HFI1_CLEAR_CREDIT_RETURN(opx_ep); + /* copy tidpairs to packet */ + if (params->ntidpairs) { + assert(params->niov == 1); + + /* coverity[missing_lock] */ + tx_payload->tid_cts.tid_offset = params->tid_offset; + tx_payload->tid_cts.ntidpairs = params->ntidpairs; + assert(params->tidpairs[0] != 0); + memcpy(&tx_payload->tid_cts.tidpairs, params->tidpairs, + params->ntidpairs * sizeof(uint32_t)); + } fi_opx_reliability_service_do_replay(&opx_ep->reliability->service,replay); - fi_opx_reliability_client_replay_register_no_update(&opx_ep->reliability->state, - params->slid, + fi_opx_reliability_client_replay_register_no_update(&opx_ep->reliability->state, + params->slid, params->origin_rs, - params->origin_rx, - psn_ptr, - replay, + params->origin_rx, + psn_ptr, + replay, params->reliability); - FI_DBG( - fi_opx_global.prov, FI_LOG_EP_DATA, - "===================================== RECV, HFI -- RENDEZVOUS EAGER RTS (end)\n"); - + FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, + "===================================== RECV, HFI -- RENDEZVOUS %s RTS (end)\n", + params->ntidpairs ? 
"EXPECTED TID" : "EAGER"); return FI_SUCCESS; } -/* RTS TID falling back to RTS eager ring */ __OPX_FORCE_INLINE__ -int opx_fallback_eager_ring(union fi_opx_hfi1_deferred_work *work, int line) -{ - struct fi_opx_hfi1_rx_rzv_rts_params *params = &work->rx_rzv_rts; - - FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, - "RENDEZVOUS EXPECTED TID CTS fallback to EAGER CTS (%u)\n", - line); -#ifdef OPX_TID_FALLBACK_DEBUG - fprintf(stderr, - "## OPX_TID_FALLBACK_DEBUG: RENDEZVOUS EXPECTED TID CTS fallback to EAGER CTS (%u)\n", - line); -#endif - params->ntidpairs = 0; - params->opcode = params->fallback_opcode; /* fallback */ - params->work_elem.work_fn = fi_opx_hfi1_do_rx_rzv_rts_eager_ring; - FI_OPX_DEBUG_COUNTERS_INC(params->opx_ep->debug_counters - .expected_receive.rts_fallback_eager); - return params->work_elem.work_fn(work); -} - -/* Rendezvous directly to user buffers (using TID) (not to eager buffers) */ -int fi_opx_hfi1_do_rx_rzv_rts_tid(union fi_opx_hfi1_deferred_work *work) +int opx_hfi1_rx_rzv_rts_tid_eligible(struct fi_opx_ep *opx_ep, + struct fi_opx_hfi1_rx_rzv_rts_params *params, + const uint64_t niov, + const uint64_t immediate_data, + const uint64_t immediate_end_block_count, + const uint64_t is_hmem, + const uint64_t is_intranode, + const enum fi_hmem_iface iface, + uint8_t opcode) { - struct fi_opx_hfi1_rx_rzv_rts_params *params = &work->rx_rzv_rts; - struct fi_opx_ep *opx_ep = params->opx_ep; - const uint64_t lrh_dlid = params->lrh_dlid; - const uint64_t bth_rx = ((uint64_t)params->u8_rx) << 56; - - FI_DBG( - fi_opx_global.prov, FI_LOG_EP_DATA, - "===================================== RECV, HFI -- RENDEZVOUS EXPECTED TID RTS (begin)\n"); - - /* If tidpairs is set, this is FI_EAGAIN so skip TID processing as we're committed to TID (not eager) */ - if (!params->ntidpairs) { - /*******************************************************************************/ - /* If there's no immediate data, the peer must have - * dynamically disabled expected receive tid so fallback. - */ - const uint64_t immediate_data = params->immediate_data; - const uint64_t immediate_end_block_count = params->immediate_end_block_count; - if ((immediate_data == 0) || (immediate_end_block_count == 0)) { - return opx_fallback_eager_ring(work, __LINE__); - } - - /* Caller adjusted pointers and lengths past the immediate data. - * Now align the destination buffer to be page aligned for expected TID writes - * This should point/overlap into the immediate data area. - * Then realign source buffer and lengths appropriately. - */ - const uint64_t page_alignment_mask = -(int64_t)OPX_HFI1_TID_PAGESIZE; - /* TID writes must start on 64 byte boundaries */ - const uint64_t vaddr = ((uint64_t)params->dst_vaddr) & -64; - /* TID updates require page alignment*/ - const uint64_t tid_vaddr = (uint64_t)vaddr & (uint64_t)page_alignment_mask; - - /* If adjusted pointer doesn't fall into the immediate data region, can't - * continue with TID. Fallback to eager. 
- */ - if (!((vaddr >= ((uint64_t)params->dst_vaddr -params->immediate_data)) && - (vaddr <= ((uint64_t)params->dst_vaddr)))) { - return opx_fallback_eager_ring(work, __LINE__); - } + if (is_intranode + || !opx_ep->use_expected_tid_rzv + || (niov != 1) + || (opcode != FI_OPX_HFI_DPUT_OPCODE_RZV && + opcode != FI_OPX_HFI_DPUT_OPCODE_RZV_NONCONTIG) + || !fi_opx_hfi1_sdma_use_sdma(opx_ep, params->dput_iov[0].bytes, + opcode, is_hmem, OPX_INTRANODE_FALSE) + || (immediate_data == 0) + || (immediate_end_block_count == 0)) { + + FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.expected_receive.rts_tid_ineligible); + return 0; + } - /* First adjust for the start page alignment, using immediate data that was sent.*/ - const int64_t alignment_adjustment = (uint64_t)params->dst_vaddr - vaddr; + FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.expected_receive.rts_tid_eligible); - /* Adjust length for aligning the buffer and adjust again for total length, - aligning to SDMA header auto-generation payload requirements. */ - const int64_t length = (params->src_iov[0].bytes + alignment_adjustment) & -64; + /* Caller adjusted pointers and lengths past the immediate data. + * Now align the destination buffer to be page aligned for expected TID writes + * This should point/overlap into the immediate data area. + * Then realign source buffer and lengths appropriately. + */ + const uint64_t page_alignment_mask = -(int64_t)OPX_TID_PAGE_SIZE[iface]; + /* TID writes must start on 64 byte boundaries */ + const uint64_t vaddr = ((uint64_t)params->dst_vaddr) & -64; + /* TID updates require page alignment*/ + const uint64_t tid_vaddr = (uint64_t)vaddr & (uint64_t)page_alignment_mask; + + /* If adjusted pointer doesn't fall into the immediate data region, can't + * continue with TID. Fallback to eager. + */ + if (!((vaddr >= ((uint64_t)params->dst_vaddr - immediate_data)) && + (vaddr <= ((uint64_t)params->dst_vaddr)))) { + FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.expected_receive.rts_fallback_eager_immediate); + return 0; + } - /* Tune for unaligned buffers. Buffers misaligned more than the threshold on - * message sizes under the MSG threshold will fallback to eager. - */ - if ((length < FI_OPX_TID_MSG_MISALIGNED_THRESHOLD) && - ((vaddr - tid_vaddr) > FI_OPX_TID_MISALIGNED_THRESHOLD)) { - return opx_fallback_eager_ring(work, __LINE__); - } + /* First adjust for the start page alignment, using immediate data that was sent.*/ + const int64_t alignment_adjustment = (uint64_t)params->dst_vaddr - vaddr; - /* The tid length much account for starting at a page boundary and will be page aligned */ - const int64_t tid_length = (uint64_t)(((vaddr + length) - tid_vaddr) + - (OPX_HFI1_TID_PAGESIZE - 1)) & (uint64_t)page_alignment_mask; - FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, - "iov_len %#lX, length %#lX, tid_length %#lX, " - "params->dst_vaddr %p, iov_base %p, vaddr [%p - %p], tid_vaddr [%p - %p]\n", - params->src_iov[0].bytes, length, tid_length, - (void *)params->dst_vaddr, (void *) params->src_iov[0].sbuf, - (void *)vaddr, (void *)(vaddr + length), - (void *)tid_vaddr, (void *)(tid_vaddr + tid_length)); - - /* New params were checked above but - * DO NOT CHANGE params->xxx or opx_ep->xxx until we know we will NOT fallback to eager rts */ - if (opx_register_for_rzv(params, tid_vaddr, tid_length)) - return opx_fallback_eager_ring(work, __LINE__); - - /* Register was done based on tid_vaddr and the offset should be set to the page - * offset into the TID now. 
- * This was done under the mm_lock, but that lock is not required. - * Stop the MISSING_LOCK false positives. */ - /* coverity[missing_lock] */ - FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, - "vaddr %p, tid_vaddr %p, diff %#X, registered tid_offset %u/%#X, buffer tid_offset %u/%#X, tid_length %lu/%#lX \n", - (void *)vaddr, (void *)tid_vaddr, - (uint32_t)(vaddr - tid_vaddr), params->tid_offset, - params->tid_offset, - params->tid_offset + (uint32_t)(vaddr - tid_vaddr), - params->tid_offset + (uint32_t)(vaddr - tid_vaddr), - tid_length, tid_length); + /* Adjust length for aligning the buffer and adjust again for total length, + aligning to SDMA header auto-generation payload requirements. */ + const int64_t length = (params->dput_iov[0].bytes + alignment_adjustment) & -64; + /* Tune for unaligned buffers. Buffers misaligned more than the threshold on + * message sizes under the MSG threshold will fallback to eager. + */ + if ((length < FI_OPX_TID_MSG_MISALIGNED_THRESHOLD) && + ((vaddr - tid_vaddr) > FI_OPX_TID_MISALIGNED_THRESHOLD)) { + FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.expected_receive.rts_fallback_eager_misaligned_thrsh); + return 0; + } - /* Adjust the offset for vaddr byte offset into the tid. */ - /* coverity[missing_lock] */ - params->tid_offset += (uint32_t)(vaddr - tid_vaddr); - - /* Now there is no fallback to eager so we can change params in case of FI_EAGAIN */ - const uint64_t iov_adj = ((uint64_t)params->dst_vaddr - vaddr); - FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, - " ==== iov[%u].base %p len %zu/%#lX iov_adj %lu/%#lX alignment_adjustment %lu/%#lX\n", - 0, (void *) params->src_iov[0].sbuf, - params->src_iov[0].bytes, params->src_iov[0].bytes, - iov_adj, iov_adj, alignment_adjustment, alignment_adjustment); + /* The tid length must account for starting at a page boundary and will be page aligned */ + const int64_t tid_length = (uint64_t)(((vaddr + length) - tid_vaddr) + + (OPX_TID_PAGE_SIZE[iface] - 1)) & (uint64_t)page_alignment_mask; + FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, + "iov_len %#lX, length %#lX, tid_length %#lX, " + "params->dst_vaddr %p, iov_base %p, vaddr [%p - %p], tid_vaddr [%p - %p]\n", + params->dput_iov[0].bytes, length, tid_length, + (void *)params->dst_vaddr, (void *) params->dput_iov[0].sbuf, + (void *)vaddr, (void *)(vaddr + length), + (void *)tid_vaddr, (void *)(tid_vaddr + tid_length)); - params->src_iov[0].sbuf -= iov_adj; - params->src_iov[0].bytes = (params->src_iov[0].bytes + iov_adj) & -64; - FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, - " ==== iov[%u].base %p len %zu/%#lX alignment_adjustment %lu/%#lX\n", - 0, (void *) params->src_iov[0].sbuf, - params->src_iov[0].bytes, params->src_iov[0].bytes, - alignment_adjustment, alignment_adjustment); - /* Adjust the (context) counter with the new length ... 
*/ - params->rzv_comp->context->byte_counter = length; - params->rzv_comp->tid_length = tid_length; - params->rzv_comp->tid_vaddr = tid_vaddr; - } else { - FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, "RETRY FI_EAGAIN\n"); - OPX_DEBUG_TIDS("RTS retry tidpairs", params->ntidpairs, params->tidpairs); - } + params->tid_pending_vaddr = vaddr; + params->tid_pending_length = length; + params->tid_pending_tid_vaddr = tid_vaddr; + params->tid_pending_tid_length = tid_length; + params->tid_pending_alignment_adjustment = alignment_adjustment; - /*******************************************************************************************************************/ - /* Committed to expected receive (TID) but might FI_EAGAIN out and retry */ - /*******************************************************************************************************************/ - FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, "ntidpairs %u\n", - params->ntidpairs); - const uint64_t payload_bytes = - params->niov * sizeof(union fi_opx_hfi1_dput_iov) + - sizeof(uint32_t) /* tid_offset */ + - sizeof(uint32_t) /* ntidpairs */ + - params->ntidpairs * sizeof(uint32_t) /* tidpairs[]*/; + return 1; +} - const uint64_t pbc_dws = - 2 + /* pbc */ - 2 + /* lrh */ - 3 + /* bth */ - 9 + /* kdeth; from "RcvHdrSize[i].HdrSize" CSR */ - ((payload_bytes + 3) >> 2); - const uint16_t lrh_dws = htons(pbc_dws - 1); - union fi_opx_hfi1_pio_state pio_state = *opx_ep->tx->pio_state; - const uint16_t total_credits_needed = 1 + /* packet header */ - ((payload_bytes + 63) >> 6); /* payload blocks needed */ - uint64_t total_credits_available = FI_OPX_HFI1_AVAILABLE_CREDITS( - pio_state, &opx_ep->tx->force_credit_return, - total_credits_needed); +int opx_hfi1_rx_rzv_rts_tid_setup(union fi_opx_hfi1_deferred_work *work) +{ + struct fi_opx_hfi1_rx_rzv_rts_params *params = &work->rx_rzv_rts; - if (OFI_UNLIKELY(total_credits_available < total_credits_needed)) { - fi_opx_compiler_msync_writes(); - FI_OPX_HFI1_UPDATE_CREDITS(pio_state, - opx_ep->tx->pio_credits_addr); - total_credits_available = FI_OPX_HFI1_AVAILABLE_CREDITS(pio_state, - &opx_ep->tx->force_credit_return, - total_credits_needed); - opx_ep->tx->pio_state->qw0 = pio_state.qw0; - if (total_credits_available < total_credits_needed) { - FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, "FI_EAGAIN\n"); + if (opx_register_for_rzv(params, params->tid_pending_tid_vaddr, + params->tid_pending_tid_length, + params->dput_iov[0].rbuf_iface, + params->dput_iov[0].rbuf_device)) { + /* Retry TID setup */ + if (++params->tid_setup_retries < OPX_RTS_TID_SETUP_MAX_TRIES) { + FI_OPX_DEBUG_COUNTERS_INC(params->opx_ep->debug_counters + .expected_receive.rts_tid_setup_retries); return -FI_EAGAIN; } - } - struct fi_opx_reliability_tx_replay *replay; - union fi_opx_reliability_tx_psn *psn_ptr; - int64_t psn; - - psn = fi_opx_reliability_get_replay(&opx_ep->ep_fid, &opx_ep->reliability->state, params->slid, - params->u8_rx, params->origin_rs, &psn_ptr, &replay, params->reliability); - if(OFI_UNLIKELY(psn == -1)) { - FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "FI_EAGAIN\n"); - return -FI_EAGAIN; + // Give up and fall back to non-TID + FI_OPX_DEBUG_COUNTERS_INC(params->opx_ep->debug_counters + .expected_receive.rts_fallback_eager_reg_rzv); + params->work_elem.work_fn = opx_hfi1_rx_rzv_rts_send_cts; + return opx_hfi1_rx_rzv_rts_send_cts(work); } - assert(payload_bytes <= FI_OPX_HFI1_PACKET_MTU); - const uint64_t force_credit_return = (opx_ep->tx->force_credit_return & FI_OPX_HFI1_PBC_CR_MASK) - << FI_OPX_HFI1_PBC_CR_SHIFT; - 
FI_OPX_HFI1_CLEAR_CREDIT_RETURN(opx_ep); - - // The "memcopy first" code is here as an alternative to the more complicated - // direct write to pio followed by memory copy of the reliability buffer - replay->scb.qw0 = opx_ep->rx->tx.cts.qw0 | pbc_dws | force_credit_return; + assert(params->ntidpairs); - replay->scb.hdr.qw[0] = opx_ep->rx->tx.cts.hdr.qw[0] | lrh_dlid | - ((uint64_t)lrh_dws << 32); - replay->scb.hdr.qw[1] = opx_ep->rx->tx.cts.hdr.qw[1] | bth_rx; - replay->scb.hdr.qw[2] = opx_ep->rx->tx.cts.hdr.qw[2] | psn; - replay->scb.hdr.qw[3] = opx_ep->rx->tx.cts.hdr.qw[3]; - replay->scb.hdr.qw[4] = opx_ep->rx->tx.cts.hdr.qw[4] | - (uint64_t)params->ntidpairs << 32 | - (params->niov << 48) | params->opcode; - replay->scb.hdr.qw[5] = params->origin_byte_counter_vaddr; - replay->scb.hdr.qw[6] = (uint64_t)params->rzv_comp; - - union fi_opx_hfi1_packet_payload *const tx_payload = - (union fi_opx_hfi1_packet_payload *)replay->payload; - assert(((uint8_t *)tx_payload) == ((uint8_t *)&replay->data)); - - uintptr_t vaddr_with_offset = ((uint64_t)params->dst_vaddr & -64); - - assert(params->niov == 1); - - tx_payload->tid_cts.iov[0].rbuf = vaddr_with_offset; /* receive buffer virtual address */ - tx_payload->tid_cts.iov[0].sbuf = (uintptr_t)params->src_iov[0].sbuf; /* send buffer virtual address */ - tx_payload->tid_cts.iov[0].bytes = params->src_iov[0].bytes; /* number of bytes to transfer */ - tx_payload->tid_cts.iov[0].rbuf_device = params->src_iov[0].rbuf_device; - tx_payload->tid_cts.iov[0].sbuf_device = params->src_iov[0].sbuf_device; - tx_payload->tid_cts.iov[0].rbuf_iface = params->src_iov[0].rbuf_iface; - tx_payload->tid_cts.iov[0].sbuf_iface = params->src_iov[0].sbuf_iface; - - /* copy tidpairs to packet */ + const uint64_t vaddr = params->tid_pending_vaddr; + const uint64_t tid_vaddr = params->tid_pending_tid_vaddr; + const int64_t tid_length = params->tid_pending_tid_length; + const int64_t length = params->tid_pending_length; + /* Register was done based on tid_vaddr and the offset should be set to the page + * offset into the TID now. + * This was done under the mm_lock, but that lock is not required. + * Stop the MISSING_LOCK false positives. */ /* coverity[missing_lock] */ - tx_payload->tid_cts.tid_offset = params->tid_offset; - tx_payload->tid_cts.ntidpairs = params->ntidpairs; - assert(params->tidpairs[0] != 0); - memcpy(&tx_payload->tid_cts.tidpairs, params->tidpairs, - params->ntidpairs * sizeof(uint32_t)); - - fi_opx_reliability_service_do_replay(&opx_ep->reliability->service, replay); - fi_opx_reliability_client_replay_register_no_update( - &opx_ep->reliability->state, params->slid, params->origin_rs, - params->origin_rx, psn_ptr, replay, params->reliability); FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, - "===================================== RECV, HFI -- RENDEZVOUS EXPECTED TID RTS (end)\n"); - return FI_SUCCESS; + "vaddr %p, tid_vaddr %p, diff %#X, registered tid_offset %u/%#X, buffer tid_offset %u/%#X, tid_length %lu/%#lX \n", + (void *)vaddr, (void *)tid_vaddr, + (uint32_t)(vaddr - tid_vaddr), params->tid_offset, + params->tid_offset, + params->tid_offset + (uint32_t)(vaddr - tid_vaddr), + params->tid_offset + (uint32_t)(vaddr - tid_vaddr), + tid_length, tid_length); + + /* Adjust the offset for vaddr byte offset into the tid. 
*/ + /* coverity[missing_lock] */ + params->tid_offset += (uint32_t)(vaddr - tid_vaddr); + + const uint64_t iov_adj = ((uint64_t)params->dst_vaddr - vaddr); + FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, + " ==== iov[%u].base %p len %zu/%#lX iov_adj %lu/%#lX alignment_adjustment %lu/%#lX\n", + 0, (void *) params->dput_iov[0].sbuf, + params->dput_iov[0].bytes, params->dput_iov[0].bytes, + iov_adj, iov_adj, + params->tid_pending_alignment_adjustment, + params->tid_pending_alignment_adjustment); + + params->dput_iov[0].sbuf -= iov_adj; + params->dput_iov[0].bytes = (params->dput_iov[0].bytes + iov_adj) & -64; + FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, + " ==== iov[%u].base %p len %zu/%#lX alignment_adjustment %lu/%#lX\n", + 0, (void *) params->dput_iov[0].sbuf, + params->dput_iov[0].bytes, params->dput_iov[0].bytes, + params->tid_pending_alignment_adjustment, + params->tid_pending_alignment_adjustment); + /* Adjust the (context) counter with the new length ... */ + params->rzv_comp->context->byte_counter = length; + params->rzv_comp->tid_length = tid_length; + params->rzv_comp->tid_vaddr = tid_vaddr; + params->opcode = FI_OPX_HFI_DPUT_OPCODE_RZV_TID; + + FI_OPX_DEBUG_COUNTERS_INC(params->opx_ep->debug_counters + .expected_receive.rts_tid_setup_success); + FI_OPX_DEBUG_COUNTERS_INC_COND(params->dput_iov[0].rbuf_iface, + params->opx_ep->debug_counters.hmem.tid_recv); + FI_OPX_DEBUG_COUNTERS_INC_COND(params->tid_setup_retries > 0, + params->opx_ep->debug_counters.expected_receive.rts_tid_setup_retry_success); + + params->work_elem.work_fn = opx_hfi1_rx_rzv_rts_send_cts; + return opx_hfi1_rx_rzv_rts_send_cts(work); } void fi_opx_hfi1_rx_rzv_rts (struct fi_opx_ep *opx_ep, @@ -1338,9 +1333,6 @@ void fi_opx_hfi1_rx_rzv_rts (struct fi_opx_ep *opx_ep, params->opx_ep = opx_ep; params->work_elem.slist_entry.next = NULL; - params->opcode = opcode; - params->fallback_opcode = opcode; - assert(niov <= MIN(FI_OPX_MAX_HMEM_IOV, FI_OPX_MAX_DPUT_IOV)); const struct fi_opx_hmem_iov *src_iov = src_iovs; @@ -1350,20 +1342,20 @@ void fi_opx_hfi1_rx_rzv_rts (struct fi_opx_ep *opx_ep, #ifdef OPX_HMEM is_hmem |= src_iov->iface; #endif - params->src_iov[i].sbuf = src_iov->buf; - params->src_iov[i].sbuf_iface = src_iov->iface; - params->src_iov[i].sbuf_device = src_iov->device; - params->src_iov[i].rbuf = dst_vaddr + rbuf_offset; - params->src_iov[i].rbuf_iface = dst_iface; - params->src_iov[i].rbuf_device = dst_device; - params->src_iov[i].bytes = src_iov->len; + params->dput_iov[i].sbuf = src_iov->buf; + params->dput_iov[i].sbuf_iface = src_iov->iface; + params->dput_iov[i].sbuf_device = src_iov->device; + params->dput_iov[i].rbuf = dst_vaddr + rbuf_offset; + params->dput_iov[i].rbuf_iface = dst_iface; + params->dput_iov[i].rbuf_device = dst_device; + params->dput_iov[i].bytes = src_iov->len; rbuf_offset += src_iov->len; ++src_iov; } if (is_intranode) { FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, "is_intranode %u\n",is_intranode ); - params->work_elem.work_fn = fi_opx_hfi1_do_rx_rzv_rts_intranode; + params->work_elem.work_fn = opx_hfi1_rx_rzv_rts_send_cts_intranode; if (hfi1_hdr->stl.lrh.slid == opx_ep->rx->self.uid.lid) { params->target_hfi_unit = opx_ep->rx->self.hfi1_unit; } else { @@ -1371,35 +1363,12 @@ void fi_opx_hfi1_rx_rzv_rts (struct fi_opx_ep *opx_ep, assert(hfi_lookup); params->target_hfi_unit = hfi_lookup->hfi_unit; } - } else if (is_hmem) { - FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, "is_hmem %lu\n",is_hmem); - params->work_elem.work_fn = fi_opx_hfi1_do_rx_rzv_rts_eager_ring; - } else if 
(opx_ep->use_expected_tid_rzv) { - /* further checks on whether TID rts is supported */ - if(niov != 1) { - /* TID rts only supports 1 iov, use eager rts */ - FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, "niov %lu\n", niov); - params->work_elem.work_fn = fi_opx_hfi1_do_rx_rzv_rts_eager_ring; - } else if (!fi_opx_hfi1_sdma_use_sdma(opx_ep, params->src_iov[0].bytes, opcode, is_hmem, is_intranode)) { - /* TID rts requires SDMA, use eager rts */ - FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, - "src_iov[0].bytes %zu, opcode %u, is_hmem %lu is_intranode %u\n", - params->src_iov[0].bytes, opcode, is_hmem, is_intranode); - params->work_elem.work_fn = fi_opx_hfi1_do_rx_rzv_rts_eager_ring; - } else { - params->opcode = FI_OPX_HFI_DPUT_OPCODE_RZV_TID; - FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, - "opx_ep->use_expected_tid_rzv %u, opcode %u, fallback opcode %u\n", - opx_ep->use_expected_tid_rzv, params->opcode, params->fallback_opcode); - FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.expected_receive.total_requests); - params->work_elem.work_fn = fi_opx_hfi1_do_rx_rzv_rts_tid; - } - params->target_hfi_unit = 0xFF; } else { FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, - "opx_ep->use_expected_tid_rzv %u, opcode %u\n", - opx_ep->use_expected_tid_rzv, params->opcode); - params->work_elem.work_fn = fi_opx_hfi1_do_rx_rzv_rts_eager_ring; + "opx_ep->use_expected_tid_rzv=%u niov=%lu opcode=%u\n", + opx_ep->use_expected_tid_rzv, niov, params->opcode); + + params->work_elem.work_fn = opx_hfi1_rx_rzv_rts_send_cts; params->target_hfi_unit = 0xFF; } params->work_elem.completion_action = NULL; @@ -1421,11 +1390,23 @@ void fi_opx_hfi1_rx_rzv_rts (struct fi_opx_ep *opx_ep, params->rzv_comp->context = target_context; params->rzv_comp->invalidate_needed = false; params->dst_vaddr = dst_vaddr; - params->immediate_data = immediate_data; - params->immediate_end_block_count = immediate_end_block_count, params->is_intranode = is_intranode; params->reliability = reliability; + params->tid_pending_vaddr = 0UL; + params->tid_pending_tid_vaddr = 0UL; + params->tid_pending_length = 0L; + params->tid_pending_tid_length = 0L; + params->tid_setup_retries = 0; params->ntidpairs = 0; + params->opcode = opcode; + + if (opx_hfi1_rx_rzv_rts_tid_eligible(opx_ep, params, niov, + immediate_data, + immediate_end_block_count, + is_hmem, is_intranode, + dst_iface, opcode)) { + params->work_elem.work_fn = opx_hfi1_rx_rzv_rts_tid_setup; + } int rc = params->work_elem.work_fn(work); if(rc == FI_SUCCESS) { @@ -1817,7 +1798,7 @@ int fi_opx_hfi1_do_dput_sdma (union fi_opx_hfi1_deferred_work * work) assert ((opx_ep->tx->pio_max_eager_tx_bytes & 0x3fu) == 0); unsigned i; const void* sbuf_start = (opx_mr == NULL) ? 
0 : opx_mr->iov.iov_base; - const bool delivery_completion = params->delivery_completion; + const bool sdma_no_bounce_buf = params->sdma_no_bounce_buf; /* Note that lrh_dlid is just the version of params->slid shifted so that it can be OR'd into the correct position in the packet header */ @@ -1835,8 +1816,8 @@ int fi_opx_hfi1_do_dput_sdma (union fi_opx_hfi1_deferred_work * work) opcode != FI_OPX_HFI_DPUT_OPCODE_ATOMIC_COMPARE_FETCH && params->payload_bytes_for_iovec == 0)); - assert((opcode == FI_OPX_HFI_DPUT_OPCODE_PUT && params->delivery_completion) || - (opcode == FI_OPX_HFI_DPUT_OPCODE_GET && params->delivery_completion) || + assert((opcode == FI_OPX_HFI_DPUT_OPCODE_PUT && params->sdma_no_bounce_buf) || + (opcode == FI_OPX_HFI_DPUT_OPCODE_GET && params->sdma_no_bounce_buf) || (opcode != FI_OPX_HFI_DPUT_OPCODE_PUT && opcode != FI_OPX_HFI_DPUT_OPCODE_GET)); uint64_t max_eager_bytes = opx_ep->tx->pio_max_eager_tx_bytes; @@ -1926,7 +1907,7 @@ int fi_opx_hfi1_do_dput_sdma (union fi_opx_hfi1_deferred_work * work) * which will still be set correctly. */ bool need_padding = (packet_count == 1 && (sdma_we_bytes & 0x3ul)); - params->sdma_we->use_bounce_buf = (!delivery_completion || + params->sdma_we->use_bounce_buf = (!sdma_no_bounce_buf || opcode == FI_OPX_HFI_DPUT_OPCODE_ATOMIC_FETCH || opcode == FI_OPX_HFI_DPUT_OPCODE_ATOMIC_COMPARE_FETCH || need_padding); @@ -2045,7 +2026,7 @@ int fi_opx_hfi1_do_dput_sdma (union fi_opx_hfi1_deferred_work * work) // been copied to bounce buffer(s), so at this point, it should be safe // for the user to alter the send buffer even though the send may still // be in progress. - if (!params->delivery_completion) { + if (!params->sdma_no_bounce_buf) { assert(params->origin_byte_counter); *params->origin_byte_counter = 0; params->origin_byte_counter = NULL; @@ -2076,8 +2057,9 @@ int fi_opx_hfi1_do_dput_sdma_tid (union fi_opx_hfi1_deferred_work * work) const uint64_t bth_rx = ((uint64_t)u8_rx) << 56; unsigned i; const void* sbuf_start = (opx_mr == NULL) ? 
0 : opx_mr->iov.iov_base; - const bool delivery_completion = params->delivery_completion; + const bool sdma_no_bounce_buf = params->sdma_no_bounce_buf; assert(params->ntidpairs != 0); + assert(niov == 1); /* Note that lrh_dlid is just the version of params->slid shifted so that it can be OR'd into the correct position in the packet header */ @@ -2097,7 +2079,8 @@ int fi_opx_hfi1_do_dput_sdma_tid (union fi_opx_hfi1_deferred_work * work) const uint64_t max_dput_bytes = max_eager_bytes; FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, - "%p:===================================== SEND DPUT SDMA TID, opcode %X -- (begin)\n", params, opcode); + "%p:===================================== SEND DPUT SDMA TID, opcode %X -- (begin)\n", + params, opcode); for (i=params->cur_iov; idebug_counters.expected_receive.first_tidpair_minoffset == 0), params->tidoffset, opx_ep->debug_counters.expected_receive.first_tidpair_minoffset); - FI_OPX_DEBUG_COUNTERS_MIN_OF(opx_ep->debug_counters.expected_receive.first_tidpair_minoffset, params->tidoffset); - FI_OPX_DEBUG_COUNTERS_MAX_OF(opx_ep->debug_counters.expected_receive.first_tidpair_maxoffset, params->tidoffset); + FI_OPX_DEBUG_COUNTERS_INC_COND_N((opx_ep->debug_counters.expected_receive.first_tidpair_minoffset == 0), + params->tidoffset, + opx_ep->debug_counters.expected_receive.first_tidpair_minoffset); + FI_OPX_DEBUG_COUNTERS_MIN_OF(opx_ep->debug_counters.expected_receive.first_tidpair_minoffset, + params->tidoffset); + FI_OPX_DEBUG_COUNTERS_MAX_OF(opx_ep->debug_counters.expected_receive.first_tidpair_maxoffset, + params->tidoffset); tididx = 0; tidpairs = (uint32_t *)params->tid_iov.iov_base; @@ -2127,7 +2114,11 @@ int fi_opx_hfi1_do_dput_sdma_tid (union fi_opx_hfi1_deferred_work * work) tidlen_consumed = params->tidoffset / OPX_HFI1_TID_PAGESIZE ; tidlen_remaining -= tidlen_consumed; if (tidlen_consumed) { - FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, "params->tidoffset %u, tidlen_consumed %u, tidlen_remaining %u, length %llu\n", params->tidoffset, tidlen_consumed, tidlen_remaining, FI_OPX_EXP_TID_GET(tidpairs[0],LEN)); + FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, + "params->tidoffset %u, tidlen_consumed %u, tidlen_remaining %u, length %llu\n", + params->tidoffset, tidlen_consumed, + tidlen_remaining, + FI_OPX_EXP_TID_GET(tidpairs[0],LEN)); } } else { /* eagain retry, restore previous TID state */ tidpairs = (uint32_t *)params->tid_iov.iov_base; @@ -2139,15 +2130,25 @@ int fi_opx_hfi1_do_dput_sdma_tid (union fi_opx_hfi1_deferred_work * work) uint32_t starting_tid_idx = tididx; - assert(i == 0); uint8_t * sbuf = (uint8_t*)((uintptr_t)sbuf_start + (uintptr_t)dput_iov[i].sbuf + params->bytes_sent); uintptr_t rbuf = dput_iov[i].rbuf + params->bytes_sent; uint64_t bytes_to_send = dput_iov[i].bytes - params->bytes_sent; - FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, " sbuf %p, sbuf_start %p, dput_iov[%u].sbuf %p, dput_iov[i].bytes %lu/%#lX, bytes sent %lu/%#lX, bytes_to_send %lu/%#lX, origin_byte_counter %ld\n", - sbuf, sbuf_start, i, (void*)dput_iov[i].sbuf, dput_iov[i].bytes, dput_iov[i].bytes, params->bytes_sent, params->bytes_sent, bytes_to_send, bytes_to_send, params->origin_byte_counter? 
*(params->origin_byte_counter):-1UL); - FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, " rbuf %p, dput_iov[%u].rbuf %p, dput_iov[i].bytes %lu/%#lX, bytes sent %lu/%#lX, bytes_to_send %lu/%#lX, first_tidoffset %u/%#X first_tidoffset_page_adj %u/%#X \n", - (void*)rbuf, i, (void *)dput_iov[i].rbuf, dput_iov[i].bytes, dput_iov[i].bytes, params->bytes_sent, params->bytes_sent, bytes_to_send, bytes_to_send, first_tidoffset, first_tidoffset, first_tidoffset_page_adj, first_tidoffset_page_adj); + FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, + " sbuf %p, sbuf_start %p, dput_iov[%u].sbuf %p, dput_iov[i].bytes %lu/%#lX, bytes sent %lu/%#lX, bytes_to_send %lu/%#lX, origin_byte_counter %ld\n", + sbuf, sbuf_start, i, (void*)dput_iov[i].sbuf, + dput_iov[i].bytes, dput_iov[i].bytes, + params->bytes_sent, params->bytes_sent, + bytes_to_send, bytes_to_send, + params->origin_byte_counter ? *(params->origin_byte_counter) : -1UL); + FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, + " rbuf %p, dput_iov[%u].rbuf %p, dput_iov[i].bytes %lu/%#lX, bytes sent %lu/%#lX, bytes_to_send %lu/%#lX, first_tidoffset %u/%#X first_tidoffset_page_adj %u/%#X \n", + (void*)rbuf, i, (void *)dput_iov[i].rbuf, + dput_iov[i].bytes, dput_iov[i].bytes, + params->bytes_sent, params->bytes_sent, + bytes_to_send, bytes_to_send, + first_tidoffset, first_tidoffset, + first_tidoffset_page_adj, first_tidoffset_page_adj); while (bytes_to_send > 0) { fi_opx_hfi1_sdma_poll_completion(opx_ep); if (!params->sdma_we) { @@ -2171,7 +2172,8 @@ int fi_opx_hfi1_do_dput_sdma_tid (union fi_opx_hfi1_deferred_work * work) params->slid, params->origin_rs, params->u8_rx, - FI_HMEM_SYSTEM, 0); + dput_iov[i].sbuf_iface, + (int) dput_iov[i].sbuf_device); } assert(!fi_opx_hfi1_sdma_has_unsent_packets(params->sdma_we)); @@ -2205,7 +2207,8 @@ int fi_opx_hfi1_do_dput_sdma_tid (union fi_opx_hfi1_deferred_work * work) if (psns_avail < (int64_t) packet_count) { FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.sdma.eagain_psn); FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, - "%p:===================================== SEND DPUT SDMA TID, !PSN FI_EAGAIN\n",params); + "%p:===================================== SEND DPUT SDMA TID, !PSN FI_EAGAIN\n", + params); return -FI_EAGAIN; } #ifndef OPX_RELIABILITY_TEST /* defining this will force reliability replay of some packets */ @@ -2234,14 +2237,15 @@ int fi_opx_hfi1_do_dput_sdma_tid (union fi_opx_hfi1_deferred_work * work) * are used when not DC or fetch, not for "padding". 
*/ assert(!(packet_count == 1 && (bytes_to_send & 0x3ul))); - params->sdma_we->use_bounce_buf = !delivery_completion; + params->sdma_we->use_bounce_buf = !sdma_no_bounce_buf; uint8_t *sbuf_tmp; if (params->sdma_we->use_bounce_buf) { OPX_HMEM_COPY_FROM(params->sdma_we->bounce_buf.buf, sbuf, MIN((packet_count * max_dput_bytes), bytes_to_send), - FI_HMEM_SYSTEM, 0ul); + dput_iov[i].sbuf_iface, + dput_iov[i].sbuf_device); sbuf_tmp = params->sdma_we->bounce_buf.buf; } else { sbuf_tmp = sbuf; @@ -2259,18 +2263,26 @@ int fi_opx_hfi1_do_dput_sdma_tid (union fi_opx_hfi1_deferred_work * work) assert(packet_bytes <= FI_OPX_HFI1_PACKET_MTU); if (p == 0) { /* First packet header is user's responsibility even with SDMA header auto-generation*/ /* set fields for first header */ + unsigned offset_shift; starting_tid_idx = tididx; /* first tid this write() */ - if ((FI_OPX_EXP_TID_GET(tidpairs[tididx],LEN)) >= KDETH_OM_MAX_SIZE/OPX_HFI1_TID_PAGESIZE) { + if ((FI_OPX_EXP_TID_GET(tidpairs[tididx],LEN)) >= + (KDETH_OM_MAX_SIZE / OPX_HFI1_TID_PAGESIZE)) { tidOMshift = (1 << HFI_KHDR_OM_SHIFT); - tidoffset = ((tidlen_consumed * OPX_HFI1_TID_PAGESIZE) + first_tidoffset_page_adj) >> KDETH_OM_LARGE_SHIFT; - FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, "%p:tidoffset %#X/%#X, first_tid_offset %#X, first_tidoffset_page_adj %#X\n",params,tidoffset, tidoffset << KDETH_OM_LARGE_SHIFT, first_tidoffset, first_tidoffset_page_adj); + offset_shift = KDETH_OM_LARGE_SHIFT; } else { tidOMshift = 0; - tidoffset = ((tidlen_consumed * OPX_HFI1_TID_PAGESIZE) + first_tidoffset_page_adj) >> KDETH_OM_SMALL_SHIFT; - FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, "%p:tidoffset %#X/%#X, first_tid_offset %#X, first_tidoffset_page_adj %#X\n",params,tidoffset, tidoffset << KDETH_OM_SMALL_SHIFT, first_tidoffset, first_tidoffset_page_adj); + offset_shift = KDETH_OM_SMALL_SHIFT; } + tidoffset = ((tidlen_consumed * OPX_HFI1_TID_PAGESIZE) + + first_tidoffset_page_adj) + >> offset_shift; + FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, + "%p:tidoffset %#X/%#X, first_tid_offset %#X, first_tidoffset_page_adj %#X\n", + params, tidoffset, + tidoffset << offset_shift, + first_tidoffset, + first_tidoffset_page_adj); } - FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, "%p:tid[%u], tidlen_remaining %u, packet_bytes %#lX, first_tid_offset %#X, first_tidoffset_page_adj %#X, packet_count %lu\n",params,tididx,tidlen_remaining, packet_bytes, first_tidoffset, first_tidoffset_page_adj, packet_count); /* Save current values in case we can't process this packet (!REPLAY) and need to restore state */ @@ -2288,11 +2300,17 @@ int fi_opx_hfi1_do_dput_sdma_tid (union fi_opx_hfi1_deferred_work * work) } else { packet_bytes = MIN(packet_bytes, FI_OPX_HFI1_PACKET_MTU-first_tidoffset_page_adj); } - FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, "%p:tid[%u], tidlen_remaining %u, packet_bytes %#lX, first_tid_offset %#X, first_tidoffset_page_adj %#X, packet_count %lu\n",params,tididx,tidlen_remaining, packet_bytes, first_tidoffset, first_tidoffset_page_adj, packet_count); assert(tididx == 0); first_tidoffset = 0; /* offset ONLY for first tid from cts*/ first_tidoffset_page_adj = 0; } + FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, + "%p:tid[%u], tidlen_remaining %u, packet_bytes %#lX, first_tid_offset %#X, first_tidoffset_page_adj %#X, packet_count %lu\n", + params, tididx, tidlen_remaining, + packet_bytes, first_tidoffset, + first_tidoffset_page_adj, + packet_count); + /* Check tid for each packet and determine if SDMA header auto-generation will use 4k or 8k packet */ /* Assume any CTRL 3 
tidpair optimizations were already done, or are not wanted, @@ -2313,12 +2331,16 @@ int fi_opx_hfi1_do_dput_sdma_tid (union fi_opx_hfi1_deferred_work * work) if(tididx == 0) first_tid_last_packet = true;/* First tid even though tididx ++*/ #endif tididx++; - FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, "%p:tid[%u/%u], tidlen_remaining %u, packet_bytes %#lX, first_tid_offset %#X, first_tidoffset_page_adj %#X, packet_count %lu\n",params,tididx,params->ntidpairs, tidlen_remaining, packet_bytes, first_tidoffset, first_tidoffset_page_adj, packet_count); tidlen_remaining = FI_OPX_EXP_TID_GET(tidpairs[tididx],LEN); tidlen_consumed = 0; - } else { - FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, "%p:tid[%u], tidlen_remaining %u, packet_bytes %#lX, first_tid_offset %#X, first_tidoffset_page_adj %#X, packet_count %lu\n",params,tididx,tidlen_remaining, packet_bytes, first_tidoffset, first_tidoffset_page_adj, packet_count); } + FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, + "%p:tid[%u/%u], tidlen_remaining %u, packet_bytes %#lX, first_tid_offset %#X, first_tidoffset_page_adj %#X, packet_count %lu\n", + params, tididx, params->ntidpairs, + tidlen_remaining, packet_bytes, + first_tidoffset, + first_tidoffset_page_adj, + packet_count); struct fi_opx_reliability_tx_replay *replay; replay = fi_opx_reliability_client_replay_allocate( @@ -2330,8 +2352,9 @@ int fi_opx_hfi1_do_dput_sdma_tid (union fi_opx_hfi1_deferred_work * work) tidlen_consumed = prev_tidlen_consumed; tidlen_remaining = prev_tidlen_remaining; FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, - "%p:!REPLAY on packet %u out of %lu, params->sdma_we->num_packets %u\n", - params, p, packet_count, params->sdma_we->num_packets); + "%p:!REPLAY on packet %u out of %lu, params->sdma_we->num_packets %u\n", + params, p, packet_count, + params->sdma_we->num_packets); break; } replay->use_sdma = true; /* Always replay TID packets with SDMA */ @@ -2379,7 +2402,8 @@ int fi_opx_hfi1_do_dput_sdma_tid (union fi_opx_hfi1_deferred_work * work) if (OFI_UNLIKELY(params->sdma_we->num_packets == 0)) { FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.sdma.eagain_replay); FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, - "%p:===================================== SEND DPUT SDMA TID, !REPLAY FI_EAGAIN\n",params); + "%p:===================================== SEND DPUT SDMA TID, !REPLAY FI_EAGAIN\n", + params); return -FI_EAGAIN; } @@ -2405,7 +2429,7 @@ int fi_opx_hfi1_do_dput_sdma_tid (union fi_opx_hfi1_deferred_work * work) FI_DBG(fi_opx_global.prov, FI_LOG_EP_DATA, "%p:===================================== SEND DPUT SDMA TID, finished IOV=%d(%d) bytes_sent=%ld\n", - params,params->cur_iov, niov, params->bytes_sent); + params,params->cur_iov, niov, params->bytes_sent); params->bytes_sent = 0; params->cur_iov++; @@ -2421,7 +2445,7 @@ int fi_opx_hfi1_do_dput_sdma_tid (union fi_opx_hfi1_deferred_work * work) // been copied to bounce buffer(s), so at this point, it should be safe // for the user to alter the send buffer even though the send may still // be in progress. 
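	/*
	 * A minimal sketch of the completion rule the comment above relies on;
	 * opx_sketch_release_origin() is a hypothetical name used only for
	 * illustration, not part of this change.  When the payload was staged
	 * through an SDMA bounce buffer (sdma_no_bounce_buf == false), nothing
	 * still references the user's buffer, so the origin byte counter can be
	 * zeroed right away; otherwise it is left for the ACK path to clear:
	 *
	 *	static inline void opx_sketch_release_origin(bool sdma_no_bounce_buf,
	 *						     uint64_t *origin_byte_counter)
	 *	{
	 *		if (!sdma_no_bounce_buf && origin_byte_counter)
	 *			*origin_byte_counter = 0;	// user buffer reusable now
	 *	}
	 */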
- if (!params->delivery_completion) { + if (!params->sdma_no_bounce_buf) { assert(params->origin_byte_counter); *params->origin_byte_counter = 0; params->origin_byte_counter = NULL; @@ -2477,7 +2501,7 @@ union fi_opx_hfi1_deferred_work* fi_opx_hfi1_rx_rzv_cts (struct fi_opx_ep * opx_ params->cc = NULL; params->user_cc = NULL; params->payload_bytes_for_iovec = 0; - params->delivery_completion = false; + params->sdma_no_bounce_buf = false; params->target_byte_counter_vaddr = target_byte_counter_vaddr; params->rma_request_vaddr = rma_request_vaddr; @@ -2515,7 +2539,6 @@ union fi_opx_hfi1_deferred_work* fi_opx_hfi1_rx_rzv_cts (struct fi_opx_ep * opx_ uint32_t *tidpairs = NULL; if (hfi1_hdr->cts.target.vaddr.opcode == FI_OPX_HFI_DPUT_OPCODE_RZV_TID) { - assert(!is_hmem); ntidpairs = hfi1_hdr->cts.target.vaddr.ntidpairs; if (ntidpairs) { tidpairs = ((union fi_opx_hfi1_packet_payload *)payload)->tid_cts.tidpairs; @@ -2854,15 +2877,15 @@ ssize_t fi_opx_hfi1_tx_send_rzv (struct fid_ep *ep, #endif /* Expected tid needs to send a leading data block and a trailing * data block for alignment. Limit this to SDMA (8K+) for now */ - const bool use_immediate_blocks = len > FI_OPX_SDMA_MIN_LENGTH ? (opx_ep->use_expected_tid_rzv ? 1 : 0) : 0; + + const uint64_t immediate_block_count = (len > FI_OPX_SDMA_MIN_LENGTH && opx_ep->use_expected_tid_rzv) ? 1 : 0; FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, - "use_immediate_blocks %u *origin_byte_counter_value %#lX, origin_byte_counter_vaddr %p, " + "immediate_block_count %#lX *origin_byte_counter_value %#lX, origin_byte_counter_vaddr %p, " "*origin_byte_counter_vaddr %lu/%#lX, len %lu/%#lX\n", - use_immediate_blocks, *origin_byte_counter_value, (uint64_t*)origin_byte_counter_vaddr, + immediate_block_count, *origin_byte_counter_value, (uint64_t*)origin_byte_counter_vaddr, origin_byte_counter_vaddr ? *(uint64_t*)origin_byte_counter_vaddr : -1UL, origin_byte_counter_vaddr ? *(uint64_t*)origin_byte_counter_vaddr : -1UL, len, len ); - const uint64_t immediate_block_count = use_immediate_blocks ? 
1 : 0; const uint64_t immediate_end_block_count = immediate_block_count; assert((immediate_block_count + immediate_end_block_count) <= max_immediate_block_count); @@ -2872,6 +2895,7 @@ ssize_t fi_opx_hfi1_tx_send_rzv (struct fid_ep *ep, const uint64_t immediate_byte_count = len & 0x0007ul; const uint64_t immediate_qw_count = (len >> 3) & 0x0007ul; + const uint64_t immediate_fragment = (((len & 0x003Ful) + 63) >> 6); /* Immediate total does not include trailing block */ const uint64_t immediate_total = immediate_byte_count + immediate_qw_count * sizeof(uint64_t) + @@ -2904,7 +2928,7 @@ ssize_t fi_opx_hfi1_tx_send_rzv (struct fid_ep *ep, const uint64_t payload_blocks_total = 1 + /* rzv metadata */ - 1 + /* immediate data tail */ + immediate_fragment + immediate_block_count + immediate_end_block_count; @@ -2995,11 +3019,10 @@ ssize_t fi_opx_hfi1_tx_send_rzv (struct fid_ep *ep, for (i=0; irendezvous.contiguous.immediate_qw[i] = sbuf_qw[i]; } - sbuf_qw += immediate_qw_count; - memcpy((void*)payload->rendezvous.contiguous.immediate_block, - (const void *)sbuf_qw, immediate_block_count * 64); /* immediate_end_block_count */ + memcpy((void*)(&payload->rendezvous.contiguous.cache_line_1 + immediate_fragment), + (const void *)sbuf_qw, immediate_block_count << 6); /* immediate_end_block_count */ } opx_shm_tx_advance(&opx_ep->tx->shm, (void*)hdr, pos); @@ -3051,7 +3074,7 @@ ssize_t fi_opx_hfi1_tx_send_rzv (struct fid_ep *ep, union fi_opx_reliability_tx_psn *psn_ptr; int64_t psn; - psn = fi_opx_reliability_get_replay(&opx_ep->ep_fid, &opx_ep->reliability->state, addr.uid.lid, + psn = fi_opx_reliability_get_replay(&opx_ep->ep_fid, &opx_ep->reliability->state, addr.uid.lid, dest_rx, addr.reliability_rx, &psn_ptr, &replay, reliability); if(OFI_UNLIKELY(psn == -1)) { FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "FI_EAGAIN\n"); @@ -3152,34 +3175,34 @@ ssize_t fi_opx_hfi1_tx_send_rzv (struct fid_ep *ep, /* This would lead to more efficient packing on both sides at the expense of */ /* wasting space of a common 0 byte immediate */ /* tmp_payload_t represents the second cache line of the rts packet */ - /* fi_opx_hfi1_packet_payload -> rendezvous -> contiguous */ + /* fi_opx_hfi1_packet_payload -> rendezvous -> contiguous */ struct tmp_payload_t { uint8_t immediate_byte[8]; uint64_t immediate_qw[7]; } __attribute__((packed)); - struct tmp_payload_t *tmp_payload = (void*)tmp; - if (immediate_byte_count > 0) { - memcpy((void*)tmp_payload->immediate_byte, (const void*)sbuf, immediate_byte_count); - sbuf += immediate_byte_count; - } + uint64_t * sbuf_qw = (uint64_t *)(sbuf + immediate_byte_count); + if (immediate_fragment) { + struct tmp_payload_t *tmp_payload = (void*)tmp; + if (immediate_byte_count > 0) { + memcpy((void*)tmp_payload->immediate_byte, (const void*)sbuf, immediate_byte_count); + } - uint64_t * sbuf_qw = (uint64_t *)sbuf; - int i=0; - for (i=0; iimmediate_qw[i] = sbuf_qw[i]; - } - fi_opx_copy_scb(scb_payload, tmp); - sbuf_qw += immediate_qw_count; + for (int i=0; iimmediate_qw[i] = sbuf_qw[i]; + } + fi_opx_copy_scb(scb_payload, tmp); + sbuf_qw += immediate_qw_count; - fi_opx_copy_scb(replay_payload, tmp); - replay_payload += 8; + fi_opx_copy_scb(replay_payload, tmp); + replay_payload += 8; - /* consume one credit for the rendezvous payload immediate data */ - FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(pio_state); + /* consume one credit for the rendezvous payload immediate data */ + FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(pio_state); #ifndef NDEBUG - ++credits_consumed; + ++credits_consumed; #endif + } 
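	/*
	 * Worked example of the immediate_fragment math used above, assuming
	 * len = 44 bytes (0x2C):
	 *
	 *	immediate_byte_count = 44 & 0x7                = 4
	 *	immediate_qw_count   = (44 >> 3) & 0x7         = 5
	 *	immediate_fragment   = ((44 & 0x3F) + 63) >> 6 = 1
	 *
	 * One 64-byte payload block (and its PIO credit) carries the leading
	 * immediate data; a 64-byte-aligned length gives immediate_fragment = 0
	 * and that block is skipped entirely.
	 */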
if(immediate_block_count) { #ifndef NDEBUG @@ -3206,6 +3229,7 @@ ssize_t fi_opx_hfi1_tx_send_rzv (struct fid_ep *ep, #endif } + if(immediate_end_block_count) { char* sbuf_end = (char *)buf + len - (immediate_end_block_count << 6); FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA,"IMMEDIATE SEND RZV buf %p, buf end %p, sbuf immediate end block %p\n",(char *)buf, (char *)buf+len, sbuf_end); diff --git a/prov/opx/src/fi_opx_hfi1_sdma.c b/prov/opx/src/fi_opx_hfi1_sdma.c index 0b21c9efeb6..e5318a9481b 100644 --- a/prov/opx/src/fi_opx_hfi1_sdma.c +++ b/prov/opx/src/fi_opx_hfi1_sdma.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2022-2023 by Cornelis Networks. + * Copyright (C) 2022-2024 by Cornelis Networks. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -100,14 +100,14 @@ int fi_opx_hfi1_dput_sdma_pending_completion(union fi_opx_hfi1_deferred_work *wo slist_remove_head(¶ms->sdma_reqs); we->next = NULL; - fi_opx_hfi1_sdma_return_we(params->opx_ep, we); + fi_opx_hfi1_sdma_return_we(opx_ep, we); we = (struct fi_opx_hfi1_sdma_work_entry *) params->sdma_reqs.head; } assert(slist_empty(¶ms->sdma_reqs)); if (!params->work_elem.complete) { - assert(params->delivery_completion); + assert(params->sdma_no_bounce_buf); FI_OPX_DEBUG_COUNTERS_INC(work->dput.opx_ep->debug_counters.sdma.eagain_pending_dc); return -FI_EAGAIN; } @@ -115,7 +115,7 @@ int fi_opx_hfi1_dput_sdma_pending_completion(union fi_opx_hfi1_deferred_work *wo if (params->origin_byte_counter) { // If we're not doing delivery_competion, then origin_byte_counter // should have already been zero'd and NULL'd at the end of do_dput_sdma(...) - assert(params->delivery_completion); + assert(params->sdma_no_bounce_buf); *params->origin_byte_counter = 0; params->origin_byte_counter = NULL; } diff --git a/prov/opx/src/fi_opx_init.c b/prov/opx/src/fi_opx_init.c index 4f15dec2cdc..1d1b91f7b42 100644 --- a/prov/opx/src/fi_opx_init.c +++ b/prov/opx/src/fi_opx_init.c @@ -1,6 +1,6 @@ /* * Copyright (C) 2016 by Argonne National Laboratory. - * Copyright (C) 2021-2023 Cornelis Networks. + * Copyright (C) 2021-2024 Cornelis Networks. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -645,6 +645,8 @@ static void do_static_assert_tests() "sizeof(fi_opx_context_ext) should be a multiple of 32") ; OPX_COMPILE_TIME_ASSERT((sizeof(struct fi_opx_hmem_info) >> 3) == OPX_HMEM_SIZE_QWS, "sizeof(fi_opx_hmem_info) >> 3 != OPX_HMEM_SIZE_QWS") ; + OPX_COMPILE_TIME_ASSERT(OPX_HFI1_TID_PAGESIZE == 4096, + "OPX_HFI1_TID_PAGESIZE must be 4K!"); } #pragma GCC diagnostic pop @@ -678,7 +680,8 @@ OPX_INI fi_param_define(&fi_opx_provider, "reliability_service_pre_ack_rate", FI_PARAM_INT, "The number of packets to receive from a particular sender before preemptively acknowledging them without waiting for a ping. Valid values are powers of 2 in the range of 0-32,768, where 0 indicates no preemptive acking. Defaults to 64."); fi_param_define(&fi_opx_provider, "selinux", FI_PARAM_BOOL, "Set to true if you're running a security-enhanced Linux. This enables updating the Jkey used based on system settings. Defaults to \"No\""); fi_param_define(&fi_opx_provider, "hfi_select", FI_PARAM_STRING, "Overrides the normal algorithm used to choose which HFI a process will use. 
See the documentation for more information."); - fi_param_define(&fi_opx_provider, "delivery_completion_threshold", FI_PARAM_INT, "The minimum message length in bytes to force delivery completion. Value must be between %d and %d. Defaults to %d.", OPX_MIN_DCOMP_THRESHOLD, OPX_MAX_DCOMP_THRESHOLD, OPX_DEFAULT_DCOMP_THRESHOLD); + fi_param_define(&fi_opx_provider, "delivery_completion_threshold", FI_PARAM_INT, "Will be deprecated. Please use FI_OPX_SDMA_BOUNCE_BUF_THRESHOLD"); + fi_param_define(&fi_opx_provider, "sdma_bounce_buf_threshold", FI_PARAM_INT, "The maximum message length in bytes that will be copied to the SDMA bounce buffer. For messages larger than this threshold, the send will not be completed until receiver has ACKed. Value must be between %d and %d. Defaults to %d.", OPX_SDMA_BOUNCE_BUF_MIN, OPX_SDMA_BOUNCE_BUF_MAX, OPX_SDMA_BOUNCE_BUF_THRESHOLD); fi_param_define(&fi_opx_provider, "sdma_disable", FI_PARAM_INT, "Disables SDMA offload hardware. Default is 0"); fi_param_define(&fi_opx_provider, "expected_receive_enable", FI_PARAM_BOOL, "Enables expected receive rendezvous using Token ID (TID). Defaults to \"No\". This feature is not currently supported."); fi_param_define(&fi_opx_provider, "prog_affinity", FI_PARAM_STRING, diff --git a/prov/opx/src/fi_opx_mr.c b/prov/opx/src/fi_opx_mr.c index 6a25f32b724..4a5c0ba009e 100644 --- a/prov/opx/src/fi_opx_mr.c +++ b/prov/opx/src/fi_opx_mr.c @@ -180,20 +180,14 @@ static inline int fi_opx_mr_reg_internal(struct fid *fid, hmem_iface = fi_opx_hmem_get_iface(iov->iov_base, NULL, &hmem_device); } - if (hmem_iface == FI_HMEM_CUDA) { - if (fi_opx_hmem_is_managed(iov->iov_base, FI_HMEM_CUDA)) { - opx_mr->attr.iface = FI_HMEM_SYSTEM; - opx_mr->attr.device.reserved = 0ul; - } else { - opx_mr->attr.iface = FI_HMEM_CUDA; + opx_mr->attr.iface = (enum fi_hmem_iface) hmem_iface; + switch (hmem_iface) { + case FI_HMEM_CUDA: + case FI_HMEM_ZE: opx_mr->attr.device.cuda = (int) hmem_device; - } - } else if (hmem_iface == FI_HMEM_ZE) { - opx_mr->attr.iface = FI_HMEM_ZE; - opx_mr->attr.device.ze = (int) hmem_device; - } else { - opx_mr->attr.iface = (enum fi_hmem_iface) hmem_iface; - opx_mr->attr.device.reserved = hmem_device; + break; + default: + opx_mr->attr.device.reserved = hmem_device; } #else opx_mr->attr.iface = FI_HMEM_SYSTEM; diff --git a/prov/opx/src/fi_opx_reliability.c b/prov/opx/src/fi_opx_reliability.c index ccaeecaa9ba..862e97a6b14 100644 --- a/prov/opx/src/fi_opx_reliability.c +++ b/prov/opx/src/fi_opx_reliability.c @@ -1,6 +1,6 @@ /* * Copyright (C) 2016 by Argonne National Laboratory. - * Copyright (C) 2021-2023 Cornelis Networks. + * Copyright (C) 2021-2024 Cornelis Networks. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -450,7 +450,7 @@ ssize_t fi_opx_hfi1_tx_reliability_inject_ud_opcode (struct fid_ep *ep, OPX_HFI1_BAR_STORE(&scb[5], 0UL); OPX_HFI1_BAR_STORE(&scb[6], 0UL); OPX_HFI1_BAR_STORE(&scb[7], key); - + /* consume one credit for the packet header */ FI_OPX_HFI1_CONSUME_SINGLE_CREDIT(pio_state); @@ -519,7 +519,7 @@ ssize_t fi_opx_hfi1_tx_reliability_inject_ud_init(struct fid_ep *ep, #endif return -FI_EAGAIN; } - + #ifdef OPX_RELIABILITY_DEBUG if (opcode == FI_OPX_HFI_UD_OPCODE_RELIABILITY_INIT) { fprintf(stderr, "(tx) flow__ %016lx 0x%lx inj init\n", key, reliability_rx); @@ -552,7 +552,7 @@ ssize_t fi_opx_hfi1_tx_reliability_inject_ud_resynch(struct fid_ep *ep, #endif return -FI_EAGAIN; } - + #ifdef OPX_RELIABILITY_DEBUG if (opcode == FI_OPX_HFI_UD_OPCODE_RELIABILITY_RESYNCH) { fprintf(stderr, "(tx) Client flow__ %016lx 0x%lx inj resynch\n", key, reliability_rx); @@ -675,7 +675,7 @@ void fi_opx_hfi1_rx_reliability_send_pre_acks(struct fid_ep *ep, const uint64_t const uint64_t slid = hdr->stl.lrh.slid; const union fi_opx_reliability_service_flow_key key = { - .slid = slid, + .slid = slid, .tx = FI_OPX_HFI1_PACKET_ORIGIN_TX(hdr), .dlid = dlid, .rx = reliability_rx }; @@ -1506,7 +1506,7 @@ ssize_t fi_opx_reliability_service_do_replay (struct fi_opx_reliability_service OPX_HFI1_BAR_STORE(&scb[5], replay->scb.hdr.qw[4]); OPX_HFI1_BAR_STORE(&scb[6], replay->scb.hdr.qw[5]); OPX_HFI1_BAR_STORE(&scb[7], replay->scb.hdr.qw[6]); - + FI_OPX_HFI1_CHECK_CREDITS_FOR_ERROR((service->tx.hfi1.pio_credits_addr)); @@ -1566,14 +1566,14 @@ ssize_t fi_opx_reliability_service_do_replay (struct fi_opx_reliability_service uint16_t i; for (i=0; ipsn_stop)?MAX_PSN:psn_stop) - psn_start + 1; // Send one ping to cover the entire replay range. @@ -1965,7 +1965,7 @@ void fi_opx_reliability_service_process_pending (struct fi_opx_reliability_servi } #if 0 -/* +/* * Prototype code for off-loading the reliability service. */ static inline @@ -2287,14 +2287,14 @@ uint8_t fi_opx_reliability_service_init (struct fi_opx_reliability_service * ser if (env) { unsigned long period = strtoul(env, NULL, 10); OPX_LOG_REL(FI_LOG_DEBUG, FI_LOG_EP_DATA,"FI_OPX_RELIABILITY_SERVICE_BACKOFF_PERIOD = '%s' (%lu)\n", env, period); - + service->is_backoff_enabled = 1; service->backoff_period=(uint64_t)period; } /* * How often to preemptively acknowledge packets. - * The default is 64, which indicates send a + * The default is 64, which indicates send a * preemptive, non-solicited ACK after 64 packets received. * Must be a power of 2, so that we can create an AND mask * that will quickly tell us whether or not to ack @@ -2482,7 +2482,7 @@ uint8_t fi_opx_reliability_service_init (struct fi_opx_reliability_service * ser for (i = 0; i < cpu_mask_chunk_num; i++) { for (j = 0; j < cpu_mask_chunk_bits_size; j++) { - cpu_mask[i] |= CPU_ISSET(i * cpu_mask_chunk_bits_size + j, &cpu_set) << j; + cpu_mask[i] |= ((uint64_t) CPU_ISSET(i * cpu_mask_chunk_bits_size + j, &cpu_set)) << j; } } @@ -2623,13 +2623,13 @@ void fi_opx_reliability_client_init (struct fi_opx_reliability_client_state * st * if a receiver is dropping packets, we should throttle the sender * by returning an EAGAIN until the # of outstanding packets falls. 
*/ - (void)ofi_bufpool_create(&(state->replay_pool), + (void)ofi_bufpool_create(&(state->replay_pool), OPX_RELIABILITY_TX_REPLAY_SIZE, // element size sizeof(void *), // byte alignment FI_OPX_RELIABILITY_TX_REPLAY_BLOCKS, // max # of elements FI_OPX_RELIABILITY_TX_REPLAY_BLOCKS, // # of elements to allocate at once OFI_BUFPOOL_NO_TRACK); // flags - (void)ofi_bufpool_create(&(state->replay_iov_pool), + (void)ofi_bufpool_create(&(state->replay_iov_pool), OPX_RELIABILITY_TX_REPLAY_IOV_SIZE, // element size sizeof(void *), // byte alignment FI_OPX_RELIABILITY_TX_REPLAY_IOV_BLOCKS, // max # of elements @@ -2690,19 +2690,20 @@ void fi_opx_reliability_client_fini (struct fi_opx_reliability_client_state * st } - /* TODO - delete rbtree and flows, but first have to notify - * reliability service of the tear-down */ - /*if (state->flow_rbtree_resynch) { + if (state->flow_rbtree_resynch) { rbtDelete(state->flow_rbtree_resynch); + state->flow_rbtree_resynch = NULL; } if (state->rx_flow_rbtree) { rbtDelete(state->rx_flow_rbtree); + state->rx_flow_rbtree = NULL; } if(state->tx_flow_rbtree) { rbtDelete(state->tx_flow_rbtree); - }*/ + state->tx_flow_rbtree = NULL; + } } __OPX_FORCE_INLINE__ @@ -3053,13 +3054,13 @@ void fi_opx_reliability_rx_exception (struct fi_opx_reliability_client_state * s * coalesced PING requests. An ACK/NAK will be sent as a response to * the requests processed. We might not make it thru the entire hashmap, * so don't deallocate any requests that cannot be sent. - * + * * This function is capable to handle an incomplete run thru the loop - * + * * This function is optimized to only do pings, but it can easily be modfied * to handle all reliablity events. If you see lots of duplicate ACK/NAK, * then adding those ops would be a good idea. - */ + */ // TODO: Should add some feedback from the amount of PIO send credits available // Each op processed takes one credit to send @@ -3083,20 +3084,24 @@ void fi_opx_hfi_rx_reliablity_process_requests(struct fid_ep *ep, int max_to_sen // Detect if we Coalesced any packets since responding to the first ping, then respond to them here if (cur_op->psn_count < cur_op->psn_count_coalesce) { - FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "Processing Rx Ping, psn=%lu count=%lu key=%lu\n", - cur_op->key.psn_start, cur_op->psn_count, cur_op->key.key); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, + "Processing Rx Ping, psn=%lu count=%lu key=%lu\n", + cur_op->key.psn_start, cur_op->psn_count, cur_op->key.key); + + fi_opx_hfi1_rx_reliability_ping(ep, service, + cur_op->key.key, cur_op->psn_count_coalesce, + cur_op->key.psn_start, + cur_op->slid, cur_op->rx); + } - fi_opx_hfi1_rx_reliability_ping(ep, service, - cur_op->key.key, cur_op->psn_count_coalesce, cur_op->key.psn_start, - cur_op->slid, cur_op->rx); - } - HASH_DEL(service->pending_rx_reliability_ops_hashmap, cur_op); OPX_BUF_FREE(cur_op); pending_op_count++; if (OFI_UNLIKELY(pending_op_count >= max_to_send)) { - FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "WARNING: Should not break here pending_op_count=%i\n", pending_op_count); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, + "WARNING: Should not break here pending_op_count=%i\n", + pending_op_count); break; } } @@ -3129,7 +3134,7 @@ ssize_t fi_opx_hfi1_tx_reliability_inject_shm (struct fid_ep *ep, #endif /* Make sure the connection to remote EP exists. 
*/ - rc = fi_opx_shm_dynamic_tx_connect(1, opx_ep, (unsigned)u32_reliability_rx, hfi1_unit); + rc = fi_opx_shm_dynamic_tx_connect(OPX_INTRANODE_TRUE, opx_ep, (unsigned)u32_reliability_rx, hfi1_unit); if (OFI_UNLIKELY(rc)) { return -FI_EAGAIN; } @@ -3160,7 +3165,7 @@ ssize_t fi_opx_hfi1_tx_reliability_inject_shm (struct fid_ep *ep, hdr->qw[2] = model.hdr.qw[2]; - hdr->qw[3] = model.hdr.qw[3]; + hdr->qw[3] = model.hdr.qw[3]; hdr->qw[4] = model.hdr.qw[4]; @@ -3229,7 +3234,7 @@ struct fi_opx_reliability_resynch_flow * fi_opx_reliability_resynch_flow_init ( __OPX_FORCE_INLINE__ void fi_opx_reliability_resynch_tx_flow_reset (struct fi_opx_ep *opx_ep, - struct fi_opx_reliability_service * service, + struct fi_opx_reliability_service * service, struct fi_opx_reliability_client_state * state, union fi_opx_reliability_service_flow_key tx_key) { @@ -3242,7 +3247,7 @@ void fi_opx_reliability_resynch_tx_flow_reset (struct fi_opx_ep *opx_ep, itr = fi_opx_rbt_find(state->tx_flow_rbtree, (void*)tx_key.value); if (itr) { /* When the Server does its first transmit, this will cause the Server to */ - /* initiate a handshake with the Client. */ + /* initiate a handshake with the Client. */ rbtErase(state->tx_flow_rbtree, itr); FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, @@ -3313,7 +3318,7 @@ void fi_opx_hfi1_rx_reliability_resynch (struct fid_ep *ep, .rx = origin_reliability_rx }; - /* + /* * INTRA-NODE: * Reset all SHM related reliability protocol data retained by this * Server EP about the remote Client EP. @@ -3343,11 +3348,11 @@ void fi_opx_hfi1_rx_reliability_resynch (struct fid_ep *ep, rx_key.value, tx_key.dlid, hdr->service.origin_reliability_rx, (uint8_t)hfi_unit, origin_reliability_rx, FI_OPX_HFI_UD_OPCODE_RELIABILITY_RESYNCH_ACK); - + return; } - /* + /* * INTER-NODE: * Reset all rx/tx related reliability protocol data retained by this * Server EP about the remote Client EP. @@ -3409,7 +3414,7 @@ void fi_opx_hfi1_rx_reliability_resynch (struct fid_ep *ep, /* Reset all (tx) related reliability protocol data */ fi_opx_reliability_resynch_tx_flow_reset(opx_ep, service, state, tx_key); - /* + /* * When a DAOS Server is configured for multiple Engine instances, each Engine * instance can act as both a server and client EP. If this is so, then set * the resynch_client_ep flag to indicate that a reset of all (tx) related @@ -3432,7 +3437,7 @@ void fi_opx_hfi1_rx_reliability_resynch (struct fid_ep *ep, tx_local_client_key.value); } - /* + /* * Create record of the RESYNCH operation being completed for all (rx) & (tx) * related reliability protocol data. */ @@ -3551,7 +3556,7 @@ ssize_t fi_opx_reliability_do_remote_ep_resynch(struct fid_ep *ep, opx_shm_daos_rank_index(opx_ep->daos_info.rank, opx_ep->daos_info.rank_inst) : dest_addr.hfi1_rx; - /* + /* * Check whether RESYNCH request has been received from the remote EP. * If so, then this is a Server EP amd there is nothing to be done. */ @@ -3565,7 +3570,7 @@ ssize_t fi_opx_reliability_do_remote_ep_resynch(struct fid_ep *ep, return FI_SUCCESS; } - /* + /* * Check whether a RESYNCH request was already received from the remote EP. * If so, then this is a Server EP; otherwise this is a Client EP. 
*/ @@ -3578,7 +3583,7 @@ ssize_t fi_opx_reliability_do_remote_ep_resynch(struct fid_ep *ep, (struct fi_opx_reliability_resynch_flow **) fi_opx_rbt_value_ptr( opx_ep->reliability->state.flow_rbtree_resynch, rx_itr ); - + struct fi_opx_reliability_resynch_flow * resynch_flow = *value_ptr; if (resynch_flow->client_ep && resynch_flow->remote_ep_resynch_completed) { FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, @@ -3592,7 +3597,7 @@ ssize_t fi_opx_reliability_do_remote_ep_resynch(struct fid_ep *ep, } } - /* + /* * Check whether packets have already been sent to the dest EP. If not, * then send RESYNCH request to the dest Server EP. This causes the dest * Server EP to resynch all SHM related data that it maintains associated @@ -3601,7 +3606,7 @@ ssize_t fi_opx_reliability_do_remote_ep_resynch(struct fid_ep *ep, if (!resynch_rcvd || !opx_ep->tx->shm.fifo_segment[rx_index] || !opx_ep->tx->shm.connection[rx_index].inuse) { - rc = fi_opx_shm_dynamic_tx_connect(1, opx_ep, rx_index, dest_addr.hfi1_unit); + rc = fi_opx_shm_dynamic_tx_connect(OPX_INTRANODE_TRUE, opx_ep, rx_index, dest_addr.hfi1_unit); if (OFI_UNLIKELY(rc)) { return -FI_EAGAIN; } @@ -3622,7 +3627,7 @@ ssize_t fi_opx_reliability_do_remote_ep_resynch(struct fid_ep *ep, } else { /* INTER-NODE */ - /* + /* * Check whether a RESYNCH request was already received from the remote EP. * If so, then this is a Server EP; otherwise this is a Client EP. */ @@ -3642,7 +3647,7 @@ ssize_t fi_opx_reliability_do_remote_ep_resynch(struct fid_ep *ep, tx_key.value, resynch_flow->resynch_counter); return FI_SUCCESS; } else { - /* + /* * When a DAOS Server is configured for multiple Engine instances, each * Engine instance can act as both a server and client EP. If this is so, * then the resynch_client_ep flag will indicate whether a reset of all diff --git a/prov/opx/src/fi_opx_service.c b/prov/opx/src/fi_opx_service.c index 0fce4dd4f9d..e6ccfc2f4bf 100644 --- a/prov/opx/src/fi_opx_service.c +++ b/prov/opx/src/fi_opx_service.c @@ -6,7 +6,7 @@ GPL LICENSE SUMMARY Copyright(c) 2015 Intel Corporation. - Copyright(c) 2021-2022 Cornelis Networks. + Copyright(c) 2021-2024 Cornelis Networks. This program is free software; you can redistribute it and/or modify it under the terms of version 2 of the GNU General Public License as @@ -23,7 +23,7 @@ BSD LICENSE Copyright(c) 2015 Intel Corporation. - Copyright(c) 2021-2022 Cornelis Networks. + Copyright(c) 2021-2024 Cornelis Networks. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions @@ -345,8 +345,8 @@ int _hfi_cmd_ioctl(int fd, struct hfi1_cmd *cmd, size_t count) [OPX_HFI_CMD_CTXT_RESET] = {HFI1_IOCTL_CTXT_RESET , 1}, [OPX_HFI_CMD_TID_INVAL_READ] = {HFI1_IOCTL_TID_INVAL_READ, 0}, [OPX_HFI_CMD_GET_VERS] = {HFI1_IOCTL_GET_VERS , 1}, -#ifdef PSM_CUDA - [OPX_HFI_CMD_TID_UPDATE_V2] = {HFI1_IOCTL_TID_UPDATE_V2 , 0}, +#ifdef OPX_HMEM + [OPX_HFI_CMD_TID_UPDATE_V3] = {HFI1_IOCTL_TID_UPDATE_V3 , 0}, #endif }; _HFI_INFO("command OPX_HFI_CMD %#X, HFI1_IOCTL %#X\n",cmd->type, cmdTypeToIoctlNum[cmd->type].ioctlCmd); @@ -425,43 +425,23 @@ int opx_hfi_get_unit_active(int unit) return (rv>0); } -/* get the number of contexts from the unit id. */ +/* Get the number of free contexts from the unit id. */ /* Returns 0 if no unit or no match. 
*/ -int opx_hfi_get_num_contexts(int unit_id) +int opx_hfi_get_num_free_contexts(int unit_id) { - int n = 0; - int units; int64_t val; uint32_t p = OPX_MIN_PORT; - units = opx_hfi_get_num_units(); - - if_pf(units <= 0) - return 0; - - if (unit_id == OPX_UNIT_ID_ANY) { - uint32_t u; - - for (u = 0; u < units; u++) { - for (p = OPX_MIN_PORT; p <= OPX_MAX_PORT; p++) - if (opx_hfi_get_port_lid(u, p) > 0) - break; - - if (p <= OPX_MAX_PORT && - !opx_sysfs_unit_read_s64(u, "nctxts", &val, 0)) - n += (uint32_t) val; - } - } else { - for (; p <= OPX_MAX_PORT; p++) - if (opx_hfi_get_port_lid(unit_id, p) > 0) - break; + for (; p <= OPX_MAX_PORT; p++) + if (opx_hfi_get_port_lid(unit_id, p) > 0) + break; - if (p <= OPX_MAX_PORT && - !opx_sysfs_unit_read_s64(unit_id, "nctxts", &val, 0)) - n += (uint32_t) val; + if (p <= OPX_MAX_PORT && + !opx_sysfs_unit_read_s64(unit_id, "nfreectxts", &val, 0)) { + return (uint32_t) val; } - return n; + return 0; } /* Given a unit number and port number, returns 1 if the unit and port are active. diff --git a/prov/opx/src/fi_opx_tid_cache.c b/prov/opx/src/fi_opx_tid_cache.c index 8854b86d9d2..667c77a9bd3 100644 --- a/prov/opx/src/fi_opx_tid_cache.c +++ b/prov/opx/src/fi_opx_tid_cache.c @@ -1,6 +1,6 @@ /* * Copyright (c) 2017-2020 Amazon.com, Inc. or its affiliates. All rights reserved. - * Copyright (C) 2022-2023 Cornelis Networks. + * Copyright (C) 2022-2024 Cornelis Networks. * * Copyright (c) 2016-2017 Cray Inc. All rights reserved. * Copyright (c) 2017-2019 Intel Corporation, Inc. All rights reserved. @@ -83,7 +83,7 @@ * struct opx_tid_mr *opx_mr = (struct opx_tid_mr *)entry->data; * * The TID memory region (mr) has TID info for that mr that is - * registered/ioctl(update) and deregisered/ioctl(free) + * registered/ioctl(update) and deregistered/ioctl(free) * * struct opx_mr_tid_info * tid_info = &opx_mr->tid_info; * @@ -243,15 +243,15 @@ static int opx_util_mr_find_within(struct ofi_rbmap *map, void *key, void *data) void opx_regen_tidpairs(struct fi_opx_ep *opx_ep, struct opx_mr_tid_info *const tid_reuse_cache) { - uint32_t *tidinfo = (uint32_t *)&OPX_TID_INFO(tid_reuse_cache, 0); - uint32_t ntidinfo = OPX_TID_NINFO(tid_reuse_cache); - uint32_t *tidpairs = &OPX_TID_PAIR(tid_reuse_cache, 0); - OPX_TID_NPAIRS(tid_reuse_cache) = 0; + uint32_t *tidinfo = (uint32_t *)&tid_reuse_cache->info[0]; + uint32_t ntidinfo = tid_reuse_cache->ninfo; + uint32_t *tidpairs = &tid_reuse_cache->pairs[0]; + tid_reuse_cache->npairs = 0; size_t accumulated_len = 0; int32_t tid_idx = 0, pair_idx = -1; unsigned int npages = 0; OPX_DEBUG_TIDS("Input tidinfo", ntidinfo, tidinfo); - uint32_t tid_length = OPX_TID_LENGTH(tid_reuse_cache); + uint32_t tid_length = tid_reuse_cache->tid_length; FI_DBG(fi_opx_global.prov, FI_LOG_MR, "OPX_DEBUG_ENTRY tid_idx %u, ntidinfo %u, accumulated_len %zu, length_pages %u\n", tid_idx, ntidinfo, accumulated_len, tid_length); @@ -272,73 +272,47 @@ void opx_regen_tidpairs(struct fi_opx_ep *opx_ep, (len >= 128), opx_ep->debug_counters.expected_receive.tid_buckets[3]); #endif - if (FI_OPX_EXP_TID_GET(tidinfo[tid_idx], CTRL) == 1) { - npages += - (int)FI_OPX_EXP_TID_GET(tidinfo[tid_idx], LEN); - accumulated_len += - FI_OPX_EXP_TID_GET(tidinfo[tid_idx], LEN) * - OPX_HFI1_TID_PAGESIZE; + size_t tid_pages = FI_OPX_EXP_TID_GET(tidinfo[tid_idx], LEN); + size_t tid_pages_len = tid_pages * OPX_HFI1_TID_PAGESIZE; + uint64_t tid_ctrl = FI_OPX_EXP_TID_GET(tidinfo[tid_idx], CTRL); + /* Starts with CTRL 1 *or* it's the first entry (tid_idx == 0) + and starts with ONLY CTRL 2, 
just accumulate it, no previous + CTRL 1 to pair */ + if (tid_idx == 0 || tid_ctrl == 1) { + npages += (int) tid_pages; + accumulated_len += tid_pages_len; pair_idx++; tidpairs[pair_idx] = tidinfo[tid_idx]; - } else { - if (tid_idx == 0) { - /* Starts with ONLY CTRL 2, just accumulate it - - no previous CTRL 1 to pair */ - npages += (int)FI_OPX_EXP_TID_GET( - tidinfo[tid_idx], LEN); - accumulated_len += - FI_OPX_EXP_TID_GET(tidinfo[tid_idx], - LEN) * - OPX_HFI1_TID_PAGESIZE; + } else { /* possible CTRL 1/2 tid pair */ + assert(tid_ctrl == 2); + npages += tid_pages; + accumulated_len += tid_pages_len; + if ((FI_OPX_EXP_TID_GET(tidinfo[tid_idx - 1], IDX) != + FI_OPX_EXP_TID_GET(tidinfo[tid_idx], IDX)) + || (FI_OPX_EXP_TID_GET(tidinfo[tid_idx - 1], CTRL) != 1) + || ((FI_OPX_EXP_TID_GET(tidinfo[tid_idx - 1], LEN) + + tid_pages) > 512)) { + /* Can't combine into CTRL 3 if : + - not the same IDX or + - previous was not CTRL 1 or + - combined LEN > 512 + + Offset field (OFFSET): For expected receive packets this offset is added to the address field + associated with the specified TID to determine a physical address. This physical address is then + used to DMA the data portion of the received packet to system memory. If OM is 0 the 15-bit + OFFSET can address a 128KB mapping in DW multiples. If OM is 1 the 15-bit OFFSET can address a + 2MB mapping in 64B multiples. + + 512 pages is 2MB. So even if a "tid pair" *seems* to be available, it won't work over 512 pages + so keep ctrl 1 tid and ctrl 2 tid separate, do not optimize into ctrl 3 tidpair + */ pair_idx++; tidpairs[pair_idx] = tidinfo[tid_idx]; - } else { /* possible CTRL 1/2 tid pair */ - assert(FI_OPX_EXP_TID_GET(tidinfo[tid_idx], - CTRL) == 2); - npages += (int)FI_OPX_EXP_TID_GET( - tidinfo[tid_idx], LEN); - accumulated_len += - FI_OPX_EXP_TID_GET(tidinfo[tid_idx], - LEN) * - OPX_HFI1_TID_PAGESIZE; - if ((FI_OPX_EXP_TID_GET(tidinfo[tid_idx - 1], - IDX) != - FI_OPX_EXP_TID_GET(tidinfo[tid_idx], - IDX)) || - (FI_OPX_EXP_TID_GET(tidinfo[tid_idx - 1], - CTRL) != 1) || - ((FI_OPX_EXP_TID_GET(tidinfo[tid_idx - 1], - LEN) + - FI_OPX_EXP_TID_GET(tidinfo[tid_idx], - LEN)) > 512)) { - /* Can't combine into CTRL 3 if : - - not the same IDX or - - previous was not CTRL 1 or - - combined LEN > 512 - - Offset field (OFFSET): For expected receive packets this offset is added to the address field - associated with the specified TID to determine a physical address. This physical address is then - used to DMA the data portion of the received packet to system memory. If OM is 0 the 15-bit - OFFSET can address a 128KB mapping in DW multiples. If OM is 1 the 15-bit OFFSET can address a - 2MB mapping in 64B multiples. - - 512 pages is 2MB. 
So even if a "tid pair" *seems* to be available, it won't work over 512 pages - so keep ctrl 1 tid and ctrl 2 tid separate, do not optimize into ctrl 3 tidpair - */ - pair_idx++; - tidpairs[pair_idx] = tidinfo[tid_idx]; - } else { - FI_OPX_EXP_TID_RESET(tidpairs[pair_idx], - CTRL, 0x3); - int32_t len = - FI_OPX_EXP_TID_GET( - tidinfo[tid_idx - 1], - LEN) + - FI_OPX_EXP_TID_GET( - tidinfo[tid_idx], LEN); - FI_OPX_EXP_TID_RESET(tidpairs[pair_idx], - LEN, len); - } + } else { + FI_OPX_EXP_TID_RESET(tidpairs[pair_idx], CTRL, 0x3); + int32_t len = tid_pages + + FI_OPX_EXP_TID_GET(tidinfo[tid_idx - 1], LEN); + FI_OPX_EXP_TID_RESET(tidpairs[pair_idx], LEN, len); } } tid_idx++; @@ -360,9 +334,9 @@ void opx_regen_tidpairs(struct fi_opx_ep *opx_ep, opx_ep->debug_counters.expected_receive.first_tidpair_maxlen, first_pair_len); #endif - OPX_TID_NPAIRS(tid_reuse_cache) = pair_idx + 1; - OPX_DEBUG_TIDS("Regen tidpairs", OPX_TID_NPAIRS(tid_reuse_cache), - &OPX_TID_PAIR(tid_reuse_cache, 0)); + tid_reuse_cache->npairs = pair_idx + 1; + OPX_DEBUG_TIDS("Regen tidpairs", tid_reuse_cache->npairs, + &tid_reuse_cache->pairs[0]); (void) npages; } @@ -371,82 +345,75 @@ void opx_regen_tidpairs(struct fi_opx_ep *opx_ep, * * Hold the cache->lock across registering the TIDs */ int opx_register_tid_region(uint64_t tid_vaddr, uint64_t tid_length, + enum fi_hmem_iface tid_iface, + uint64_t tid_device, struct fi_opx_ep *opx_ep, struct opx_mr_tid_info *const tid_reuse_cache) { + uint64_t flags = (uint64_t) OPX_HMEM_KERN_MEM_TYPE[tid_iface]; + /* Parameters must be aligned for expected receive */ - assert(tid_length == (tid_length & -64)); - assert(tid_vaddr == (tid_vaddr & -(int64_t)OPX_HFI1_TID_PAGESIZE)); - assert(tid_length == (tid_length & -(int64_t)OPX_HFI1_TID_PAGESIZE)); + assert(tid_vaddr == (tid_vaddr & -(int64_t)OPX_TID_PAGE_SIZE[tid_iface])); + assert(tid_length == (tid_length & -(int64_t)OPX_TID_PAGE_SIZE[tid_iface])); /* Assert precondition that the lock is held with a trylock assert */ assert(pthread_mutex_trylock(&opx_ep->tid_domain->tid_cache->lock) == EBUSY); - FI_DBG(fi_opx_global.prov, FI_LOG_MR, "vaddr %p, length %lu/%lu\n", - (void *)tid_vaddr, tid_length, - (tid_length + (OPX_HFI1_TID_PAGESIZE - 1)) & - -OPX_HFI1_TID_PAGESIZE); - tid_length = (tid_length + (OPX_HFI1_TID_PAGESIZE - 1)) & - -OPX_HFI1_TID_PAGESIZE; + FI_DBG(fi_opx_global.prov, FI_LOG_MR, "vaddr %p, length %lu\n", + (void *)tid_vaddr, tid_length); /* TODO: Remove opx_ep - we aren't registering for an ep, it's domain-wide */ struct _hfi_ctrl *ctx = opx_ep->hfi->ctrl; -#ifndef NDEBUG - /* switching to use the #define more consistently, but assert it's correct - with respect to hfi configuration */ - const uint32_t pg_sz = ctx->__hfi_pg_sz; - assert(pg_sz == OPX_HFI1_TID_PAGESIZE); - assert(sysconf(_SC_PAGESIZE) == OPX_HFI1_TID_PAGESIZE); - /* Unfortunately, for now, we assume 2 TID pages per 8K packet */ - assert(OPX_HFI1_TID_PAGESIZE == 4096); -#endif const uint32_t max_tidcnt = ctx->__hfi_tidexpcnt; - assert(ctx->__hfi_tidexpcnt <= OPX_MAX_TID_COUNT); - if (OFI_UNLIKELY(tid_length > - (max_tidcnt * OPX_HFI1_TID_PAGESIZE))) { + assert(max_tidcnt <= OPX_MAX_TID_COUNT); + + const uint64_t max_tidlen = max_tidcnt * OPX_TID_PAGE_SIZE[tid_iface]; + if (OFI_UNLIKELY(tid_length > max_tidlen)) { /* This is somewhat arbitrary - if we "chunk" the TID updates we might be able * to do larger buffers using multiple update calls. 
*/ FI_WARN(fi_opx_global.prov, FI_LOG_MR, - "OPX_DEBUG_EXIT Max length exceeded, %lu\n", - tid_length); - OPX_TID_CACHE_RZV_RTS(tid_reuse_cache, - "UPDATE LENGTH EXCEEDED"); + "OPX_DEBUG_EXIT Max TID length exceeded, %lu > %lu\n", + tid_length, max_tidlen); + OPX_TID_CACHE_RZV_RTS(tid_reuse_cache, "UPDATE LENGTH EXCEEDED"); return -1; } - uint32_t tidcnt = - (uint32_t)((tid_length + (OPX_HFI1_TID_PAGESIZE - 1)) >> 12); - /* Eventually we might need to "chunk" updates, thus the naming here */ - uint32_t tidcnt_chunk = tidcnt; - uint32_t length_chunk = OPX_HFI1_TID_PAGESIZE * tidcnt; /* tid update takes uint32_t, not uint64_t length */ + + uint32_t tidcnt = (uint32_t) (tid_length / OPX_TID_PAGE_SIZE[tid_iface]); if (OFI_UNLIKELY(tidcnt > max_tidcnt)) { FI_WARN(fi_opx_global.prov, FI_LOG_MR, - "OPX_DEBUG_EXIT Max TIDs exceeded, %u > %u\n", tidcnt, - max_tidcnt); + "OPX_DEBUG_EXIT Max TIDs exceeded, %u > %u\n", + tidcnt, max_tidcnt); OPX_TID_CACHE_RZV_RTS(tid_reuse_cache, "UPDATE NTIDS EXCEEDED"); OPX_TID_CACHE_DEBUG_FPRINTF("## %s:%u OPX_TID_CACHE_DEBUG Update number of TIDs (%u) exceeded\n", __func__, __LINE__, tidcnt); return -1; } + + /* Eventually we might need to "chunk" updates, thus the naming here */ + uint32_t length_chunk = (uint32_t) tid_length; /* new (cumulative) vaddr/length of this operation*/ uint64_t new_vaddr = tid_vaddr; - uint64_t new_length = length_chunk; /* page aligned length */ - assert((OPX_TID_LENGTH(tid_reuse_cache) == 0) && - (OPX_TID_VADDR(tid_reuse_cache) == 0)); + assert((tid_reuse_cache->tid_length == 0) && + (tid_reuse_cache->tid_vaddr == 0)); - uint64_t *tidlist = (uint64_t *)&OPX_TID_INFO(tid_reuse_cache, 0); + uint64_t *tidlist = (uint64_t *)&tid_reuse_cache->info[0]; FI_DBG(fi_opx_global.prov, FI_LOG_MR, - "OPX_DEBUG_ENTRY buffer range [%#lx - %#lx] length %lu %u, new range [%#lx - %#lx] length %lu %u, tidcnt %u, tidlist %p\n", + "OPX_DEBUG_ENTRY buffer range [%#lx - %#lx] length %lu %u, new range [%#lx - %#lx] length %u, tidcnt %u, tidlist %p iface %u flags %#lx\n", tid_vaddr, tid_vaddr + tid_length, tid_length, length_chunk, - new_vaddr, new_vaddr + new_length, new_length, length_chunk, - tidcnt, tidlist); - FI_DBG(fi_opx_global.prov, FI_LOG_MR, - "update tid length %#X, pages (tidcnt) %u\n", length_chunk, - tidcnt); - assert(tid_vaddr + tid_length <= - tid_vaddr + (tidcnt * OPX_HFI1_TID_PAGESIZE)); - FI_DBG(fi_opx_global.prov, FI_LOG_MR, - "opx_hfi_update_tid vaddr [%#lx - %#lx], length %u\n", tid_vaddr, - tid_vaddr + length_chunk, length_chunk); + new_vaddr, new_vaddr + length_chunk, length_chunk, + tidcnt, tidlist, tid_iface, flags); + + if (tid_iface == FI_HMEM_CUDA) { + int err = cuda_set_sync_memops((void *) tid_vaddr); + if (OFI_UNLIKELY(err != 0)) { + FI_WARN(fi_opx_global.prov, FI_LOG_MR, + "cuda_set_sync_memops(%p) FAILED (returned %d)\n", + (void *) tid_vaddr, err); + return -1; + } + } + + uint32_t tidcnt_chunk; /* return code is ignored in favor of length/tidcnt checks * because the ioctl can "succeed" (return code 0) within * resource limitations and the updated length/tidcnt will @@ -457,9 +424,9 @@ int opx_register_tid_region(uint64_t tid_vaddr, uint64_t tid_length, &length_chunk, /* input/output*/ (uint64_t)tidlist, /* input/output ptr cast as uint64_t */ &tidcnt_chunk, /* output */ - 0); - FI_OPX_DEBUG_COUNTERS_INC( - opx_ep->debug_counters.expected_receive.tid_updates); + flags); + FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.expected_receive.tid_updates); + FI_OPX_DEBUG_COUNTERS_INC_COND(tid_iface > FI_HMEM_SYSTEM, 
opx_ep->debug_counters.hmem.tid_update); FI_DBG(fi_opx_global.prov, FI_LOG_MR, "opx_hfi_update_tid return length %u, tidcnt %u\n", length_chunk, tidcnt_chunk); @@ -469,72 +436,59 @@ int opx_register_tid_region(uint64_t tid_vaddr, uint64_t tid_length, if (OFI_UNLIKELY(((uint64_t)length_chunk < tid_length) || (tidcnt_chunk == 0))) { /* errors generally mean we hit the TID resource limit */ FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.expected_receive.tid_resource_limit); + FI_OPX_DEBUG_COUNTERS_INC_COND(length_chunk < tid_length, + opx_ep->debug_counters.expected_receive.tid_resource_limit_length_chunk_short); + FI_OPX_DEBUG_COUNTERS_INC_COND(tidcnt_chunk == 0, + opx_ep->debug_counters.expected_receive.tid_resource_limit_tidcnt_chunk_zero); FI_WARN(fi_opx_global.prov, FI_LOG_MR, "OPX_DEBUG_EXIT opx_hfi_update_tid failed on vaddr %#lX, length %lu/%u, tidcnt %u\n", tid_vaddr, tid_length, length_chunk, tidcnt_chunk); - if(tidcnt_chunk == 0) { + if (tidcnt_chunk == 0) { /* The return length is untouched, so update it for the recovery calculations below */ length_chunk = 0; } - /* flush the cache to recover resources until - * we've flushed tids or no more to flush. - This assumes worst case 1 page tids. */ - int npages = (tid_length - (uint64_t)length_chunk) / - OPX_HFI1_TID_PAGESIZE; - uint32_t flush_counter = opx_ep->mcache_flush_counter = 0; - uint32_t ncounter = 0; - do { - flush_counter = opx_ep->mcache_flush_counter; - pthread_mutex_unlock(&opx_ep->tid_domain->tid_cache->lock); - opx_tid_cache_flush(opx_ep->tid_domain->tid_cache, true); - pthread_mutex_lock(&opx_ep->tid_domain->tid_cache->lock); - FI_DBG(fi_opx_global.prov, FI_LOG_MR,"npages %d, flush_counter %u/%u\n", - npages, flush_counter, opx_ep->mcache_flush_counter); - ncounter++; - } while (((npages - opx_ep->mcache_flush_counter) > 0) && (flush_counter != opx_ep->mcache_flush_counter)); -#ifdef OPX_IOCTL_DEBUG - if ((npages - (int)opx_ep->mcache_flush_counter) > 0) { - fprintf(stderr, - "## FAILED RECOVERY FLUSHES, npages %d, npages left %d, nflushes(%u) %u/%u\n",npages,(npages - opx_ep->mcache_flush_counter), ncounter, flush_counter, opx_ep->mcache_flush_counter); - } -#else - (void) ncounter; -#endif - FI_DBG(fi_opx_global.prov, FI_LOG_MR, "npages %d, npages left %d, nflushes(%u) %u/%u\n",npages,(npages - opx_ep->mcache_flush_counter), ncounter, flush_counter, opx_ep->mcache_flush_counter); + /* flush the cache to recover resources */ + pthread_mutex_unlock(&opx_ep->tid_domain->tid_cache->lock); + opx_tid_cache_flush_all(opx_ep->tid_domain->tid_cache, true, true); + pthread_mutex_lock(&opx_ep->tid_domain->tid_cache->lock); /* Attempt one recovery ioctl()*/ - uint32_t new_length_chunk = (OPX_HFI1_TID_PAGESIZE * tidcnt) - length_chunk; + uint32_t new_length_chunk = tid_length - length_chunk; uint32_t new_tidcnt_chunk = 0; /* Frustrating mix of uint32_t/uint64_t*/ - uint32_t *new_tidinfo = &OPX_TID_INFO(tid_reuse_cache, tidcnt_chunk); + uint32_t *new_tidinfo = &tid_reuse_cache->info[tidcnt_chunk]; opx_hfi_update_tid( ctx, (tid_vaddr + length_chunk), /* input */ &new_length_chunk, /* input/output*/ (uint64_t)new_tidinfo, /* input/output ptr cast as uint64_t */ &new_tidcnt_chunk, /* output */ - 0); + flags); FI_OPX_DEBUG_COUNTERS_INC( opx_ep->debug_counters.expected_receive.tid_updates); + FI_OPX_DEBUG_COUNTERS_INC_COND(tid_iface > FI_HMEM_SYSTEM, + opx_ep->debug_counters.hmem.tid_update); FI_DBG(fi_opx_global.prov, FI_LOG_MR, - "opx_hfi_update_tid return length %u, tidcnt %u\n", new_length_chunk, - new_tidcnt_chunk); - if 
(OFI_UNLIKELY(((uint64_t)length_chunk + (uint64_t)new_length_chunk) < tid_length) || - (new_tidcnt_chunk == 0)) { + "opx_hfi_update_tid return length %u, tidcnt %u\n", + new_length_chunk, new_tidcnt_chunk); + if (OFI_UNLIKELY((length_chunk + new_length_chunk) < tid_length) || + (new_tidcnt_chunk == 0)) { #ifdef OPX_IOCTL_DEBUG fprintf(stderr, "## FAILED RECOVERY opx_hfi_update_tid failed on vaddr %#lX, length %lu/%u, tidcnt %u\n", tid_vaddr, tid_length, length_chunk, tidcnt_chunk); fprintf(stderr, "## FAILED RECOVERY opx_hfi_update_tid failed on vaddr %#lX, length %lu/%u, tidcnt %u\n", - (tid_vaddr + length_chunk),(OPX_HFI1_TID_PAGESIZE * tidcnt) - length_chunk, new_length_chunk, new_tidcnt_chunk); + (tid_vaddr + length_chunk), + (OPX_TID_PAGE_SIZE[tid_iface] * tidcnt) - length_chunk, + new_length_chunk, new_tidcnt_chunk); #endif OPX_TID_CACHE_RZV_RTS(tid_reuse_cache, "UPDATE/NEW FAILED"); /* free first partial update, it's not useful */ if (length_chunk) { OPX_FPRINTF_TIDS("Partially updated tidinfo", (tidcnt_chunk + new_tidcnt_chunk), - &OPX_TID_INFO(tid_reuse_cache, 0)); + &tid_reuse_cache->info[0]); opx_hfi_free_tid(ctx, (uint64_t)tidlist, tidcnt_chunk); } OPX_TID_CACHE_DEBUG_FPRINTF( @@ -557,23 +511,31 @@ int opx_register_tid_region(uint64_t tid_vaddr, uint64_t tid_length, tid_vaddr, tid_length, length_chunk, tidcnt_chunk); fprintf(stderr, "## SUCCESS RECOVERY opx_hfi_update_tid on vaddr %#lX, length %lu/%u, tidcnt %u\n", - (tid_vaddr + length_chunk),(OPX_HFI1_TID_PAGESIZE * tidcnt) - length_chunk, new_length_chunk, new_tidcnt_chunk); + (tid_vaddr + length_chunk), + (OPX_TID_PAGE_SIZE[tid_iface] * tidcnt) - length_chunk, + new_length_chunk, new_tidcnt_chunk); } #endif /* Successfully recovered */ tidcnt_chunk += new_tidcnt_chunk; + length_chunk += new_length_chunk; OPX_FPRINTF_TIDS("Recovered partially updated tidinfo", tidcnt_chunk, - &OPX_TID_INFO(tid_reuse_cache, 0)); + &tid_reuse_cache->info[0]); + } else if (length_chunk > tid_length) { + FI_DBG(fi_opx_global.prov, FI_LOG_MR, + "opx_hfi_update_tid gave larger than requested range! 
requested length %lu, return length %u, tidcnt %u\n", + tid_length, length_chunk, tidcnt_chunk); + FI_OPX_DEBUG_COUNTERS_INC(opx_ep->debug_counters.expected_receive + .tid_resource_limit_length_chunk_long); } assert(tidcnt_chunk <= FI_OPX_MAX_DPUT_TIDPAIRS); OPX_DEBUG_TIDS("Updated tidinfo", tidcnt_chunk, - (&(OPX_TID_INFO(tid_reuse_cache, 0)))); - OPX_TID_VADDR(tid_reuse_cache) = new_vaddr; - OPX_TID_LENGTH(tid_reuse_cache) = new_length; - OPX_TID_NINFO(tid_reuse_cache) += - tidcnt_chunk; /* appended or replaced */ - OPX_TID_VALID(tid_reuse_cache); + (&(tid_reuse_cache->info[0]))); + tid_reuse_cache->tid_vaddr = new_vaddr; + tid_reuse_cache->tid_length = length_chunk; + tid_reuse_cache->ninfo += tidcnt_chunk; /* appended or replaced */ + tid_reuse_cache->invalid = 0; OPX_TID_CACHE_RZV_RTS(tid_reuse_cache, "UPDATE/NEW"); @@ -587,9 +549,9 @@ int opx_register_tid_region(uint64_t tid_vaddr, uint64_t tid_length, "OPX_DEBUG_EXIT UPDATED TIDs vaddr [%#lx - %#lx] length %lu, tid vaddr [%#lx - %#lx] , tid length %lu, number of TIDs %u\n", tid_vaddr, tid_vaddr + tid_length, tid_length, - OPX_TID_VADDR(tid_reuse_cache), - OPX_TID_VADDR(tid_reuse_cache) + OPX_TID_LENGTH(tid_reuse_cache), - OPX_TID_LENGTH(tid_reuse_cache), OPX_TID_NINFO(tid_reuse_cache)); + tid_reuse_cache->tid_vaddr, + tid_reuse_cache->tid_vaddr + tid_reuse_cache->tid_length, + tid_reuse_cache->tid_length, tid_reuse_cache->ninfo); opx_regen_tidpairs(opx_ep, tid_reuse_cache); return 0; @@ -603,8 +565,8 @@ void opx_deregister_tid_region(struct fi_opx_ep *opx_ep, struct opx_mr_tid_info *const tid_reuse_cache) { struct _hfi_ctrl *ctx = opx_ep->hfi->ctrl; - uint32_t old_ntidinfo = OPX_TID_NINFO(tid_reuse_cache); - uint64_t *old_tidlist = (uint64_t *)&OPX_TID_INFO(tid_reuse_cache, 0); + uint32_t old_ntidinfo = tid_reuse_cache->ninfo; + uint64_t *old_tidlist = (uint64_t *)&tid_reuse_cache->info[0]; FI_DBG(fi_opx_global.prov, FI_LOG_MR, "OPX_DEBUG_ENTRY vaddr %p, length %lu, opx_hfi_free_tid %u tidpairs\n", (void *)tid_reuse_cache->tid_vaddr, tid_reuse_cache->tid_length, @@ -679,20 +641,17 @@ void opx_tid_cache_delete_region(struct ofi_mr_cache *cache, const size_t iov_len = entry->info.iov.iov_len; assert(entry->use_cnt == 0); /* Is this region current? 
deregister it */ - if (!OPX_TID_IS_INVALID(tid_reuse_cache) && - (OPX_TID_LENGTH(tid_reuse_cache) == iov_len) && - (OPX_TID_VADDR(tid_reuse_cache) == (uint64_t)iov_base)) { + if (!tid_reuse_cache->invalid && + (tid_reuse_cache->tid_length == iov_len) && + (tid_reuse_cache->tid_vaddr == (uint64_t)iov_base)) { FI_DBG(cache->domain->prov, FI_LOG_MR, "ENTRY cache %p, entry %p, data %p, iov_base %p, iov_len %zu\n", cache, entry, opx_mr, iov_base, iov_len); - - /* count the tid's flushed */ - opx_ep->mcache_flush_counter += OPX_TID_NPAIRS(tid_reuse_cache); opx_deregister_tid_region(opx_ep, tid_reuse_cache); } else { FI_DBG(cache->domain->prov, FI_LOG_MR, "ENTRY OPX_TID_IS_INVALID==%u cache %p, entry %p, data %p, iov_base %p, iov_len %zu\n", - OPX_TID_IS_INVALID(tid_reuse_cache), cache, entry, opx_mr, iov_base, iov_len); + tid_reuse_cache->invalid, cache, entry, opx_mr, iov_base, iov_len); } memset(opx_mr, 0x00, sizeof(*opx_mr)); @@ -737,7 +696,7 @@ int opx_tid_dec_use_cnt(struct ofi_mr_entry *entry) } -/* Copied from util_mr_cache_full */ +/* Copied from ofi_mr_cache_full */ __OPX_FORCE_INLINE__ bool opx_tid_cache_full(struct ofi_mr_cache *cache) { @@ -822,7 +781,7 @@ int opx_tid_cache_init(struct util_domain *domain, __OPX_FORCE_INLINE__ struct ofi_mr_entry *opx_mr_rbt_find(struct ofi_rbmap *tree, - const struct ofi_mr_info *key) + const struct ofi_mr_info *key) { struct ofi_rbnode *node; @@ -951,7 +910,6 @@ int opx_tid_cache_crte(struct ofi_mr_cache *cache, /* drop the mm lock across alloc/register */ pthread_mutex_unlock(&mm_lock); *entry = opx_mr_entry_alloc(cache); - assert((*entry)->use_cnt == 0); if (!*entry) { FI_DBG(cache->domain->prov, FI_LOG_MR, "OPX_DEBUG_ENTRY FI_NOMEM [%p - %p] (len: %zu/%#lX) \n", @@ -962,6 +920,7 @@ int opx_tid_cache_crte(struct ofi_mr_cache *cache, pthread_mutex_lock(&mm_lock); return -FI_ENOMEM; } + assert((*entry)->use_cnt == 0); (*entry)->node = NULL; (*entry)->info = *info; @@ -988,15 +947,15 @@ int opx_tid_cache_crte(struct ofi_mr_cache *cache, struct opx_mr_tid_info *tid_info = &opx_mr->tid_info; - OPX_TID_NINFO(tid_info) = 0; - OPX_TID_NPAIRS(tid_info) = 0; - OPX_TID_VADDR(tid_info) = 0UL; - OPX_TID_LENGTH(tid_info) = 0UL; + tid_info->ninfo = 0; + tid_info->npairs = 0; + tid_info->tid_vaddr = 0UL; + tid_info->tid_length = 0UL; #ifndef NDEBUG for (int i = 0; i < FI_OPX_MAX_DPUT_TIDPAIRS; ++i) { - OPX_TID_INFO(tid_info, i) = -1U; - OPX_TID_PAIR(tid_info, i) = -1U; + tid_info->info[i] = -1U; + tid_info->pairs[i] = -1U; } #endif @@ -1006,11 +965,12 @@ int opx_tid_cache_crte(struct ofi_mr_cache *cache, /* Hold the cache->lock across registering the TIDs */ pthread_mutex_lock(&cache->lock); if (opx_register_tid_region((uint64_t)info->iov.iov_base, - (uint64_t)info->iov.iov_len, opx_ep, - tid_info)) { + (uint64_t)info->iov.iov_len, + info->iface, info->device, + opx_ep, tid_info)) { FI_DBG(fi_opx_global.prov, FI_LOG_MR, "opx_register_tid_region failed\n"); - /* Failed, OPX_TID_NINFO(tid_info) will be zero */ + /* Failed, tid_info->ninfo will be zero */ FI_DBG(fi_opx_global.prov, FI_LOG_MR, "FREE node %p\n", (*entry)->node); pthread_mutex_unlock(&cache->lock); @@ -1026,15 +986,12 @@ int opx_tid_cache_crte(struct ofi_mr_cache *cache, "NEW vaddr [%#lx - %#lx] length %lu, tid vaddr [%#lx - %#lx] , tid length %lu\n", (uint64_t)info->iov.iov_base, (uint64_t)info->iov.iov_base + (uint64_t)info->iov.iov_len, - (uint64_t)info->iov.iov_len, OPX_TID_VADDR(tid_info), - OPX_TID_VADDR(tid_info) + OPX_TID_LENGTH(tid_info), - OPX_TID_LENGTH(tid_info)); + 
(uint64_t)info->iov.iov_len, tid_info->tid_vaddr, + tid_info->tid_vaddr + tid_info->tid_length, + tid_info->tid_length); if (opx_tid_cache_full(cache)) { - pthread_mutex_unlock(&mm_lock); - opx_tid_cache_flush(cache, 1); - /* re-acquire mm_lock */ - pthread_mutex_lock(&mm_lock); + opx_tid_cache_flush_all(cache, true, true); FI_DBG(fi_opx_global.prov, FI_LOG_MR, "CACHE FULL flushed\n"); } if (opx_tid_cache_full(cache)) { @@ -1076,7 +1033,7 @@ int opx_tid_cache_crte(struct ofi_mr_cache *cache, info->iov.iov_len, info->iov.iov_len); assert((*entry)->use_cnt == 1); opx_tid_dec_use_cnt(*entry);/* traceable */ - OPX_TID_NINFO(tid_info) = 0; /* error == no tid pairs */ + tid_info->ninfo = 0; /* error == no tid pairs */ OPX_DEBUG_EXIT((*entry), 2); return 0; //TODO - handle case for free } @@ -1104,7 +1061,7 @@ int opx_tid_cache_find(struct fi_opx_ep *opx_ep, goto in_use; } ret = OPX_ENTRY_FOUND; - } else if (!ofi_iov_within(&info->iov, &(*entry)->info.iov)) { + } else { if (opx_mr->opx_ep != opx_ep) { FI_DBG(fi_opx_global.prov, FI_LOG_MR,"OPX_ENTRY_IN_USE %p/%p\n",opx_mr? opx_mr->opx_ep:NULL, opx_ep); goto in_use; @@ -1250,7 +1207,7 @@ int opx_return_offset_for_new_cache_entry( __OPX_FORCE_INLINE__ int opx_tid_cache_close_region(struct ofi_mr_cache *tid_cache, struct ofi_mr_entry *entry, - bool force) + bool invalidate) { /* TODO ... fix? */ OPX_DEBUG_ENTRY2(entry, OPX_ENTRY_FOUND); @@ -1274,7 +1231,7 @@ int opx_tid_cache_close_region(struct ofi_mr_cache *tid_cache, struct opx_tid_mr *opx_mr = (struct opx_tid_mr *)entry->data; struct opx_mr_tid_info *const tid_info = &opx_mr->tid_info; - if(force) { + if(invalidate) { /* Invalidate and deregister it. * Any ongoing RDMA will fail. * Any new RDMA will not use it and will fallback. @@ -1282,7 +1239,7 @@ int opx_tid_cache_close_region(struct ofi_mr_cache *tid_cache, struct fi_opx_ep *const opx_ep = opx_mr->opx_ep; FI_DBG(tid_cache->domain->prov, FI_LOG_MR, "OPX_TID_IS_INVALID %u->1, (%p/%p) insert lru [%p - %p] (len: %zu,%#lX) use_cnt %x\n", - OPX_TID_IS_INVALID(tid_info), + tid_info->invalid, entry, entry->data, entry->info.iov.iov_base, (char*)entry->info.iov.iov_base + entry->info.iov.iov_len, @@ -1293,13 +1250,10 @@ int opx_tid_cache_close_region(struct ofi_mr_cache *tid_cache, /* drop mm_lock */ pthread_mutex_unlock(&mm_lock); - /* count the tid's flushed */ - opx_ep->mcache_flush_counter += OPX_TID_NPAIRS(tid_info); - /* Hold the cache->lock across de-registering the TIDs */ pthread_mutex_lock(&tid_cache->lock); opx_deregister_tid_region(opx_ep, tid_info); - OPX_TID_INVALID(tid_info); /* prevent double deregister later */ + tid_info->invalid = 1; /* prevent double deregister later */ pthread_mutex_unlock(&tid_cache->lock); /* re-acquire mm_lock */ @@ -1309,8 +1263,8 @@ int opx_tid_cache_close_region(struct ofi_mr_cache *tid_cache, if (use_cnt == 0) { OPX_DEBUG_UCNT(entry); - FI_DBG(tid_cache->domain->prov, FI_LOG_MR, "force %u, invalid %u, node %p, (%p/%p) insert lru [%p - %p] (len: %zu,%#lX) use_cnt %x\n", - force, OPX_TID_IS_INVALID(tid_info), entry->node, + FI_DBG(tid_cache->domain->prov, FI_LOG_MR, "invalidate %u, invalid %u, node %p, (%p/%p) insert lru [%p - %p] (len: %zu,%#lX) use_cnt %x\n", + invalidate, tid_info->invalid, entry->node, entry, entry->data, entry->info.iov.iov_base, (char*)entry->info.iov.iov_base + entry->info.iov.iov_len, @@ -1326,7 +1280,7 @@ int opx_tid_cache_close_region(struct ofi_mr_cache *tid_cache, pthread_mutex_lock(&mm_lock); return 0; } - if(OPX_TID_IS_INVALID(tid_info)) { /* it's dead, not just "least recently 
used */ + if(tid_info->invalid) { /* it's dead, not just "least recently used */ FI_DBG(tid_cache->domain->prov, FI_LOG_MR, "DEAD entry %p\n",entry); opx_mr_uncache_entry_storage(tid_cache, entry); dlist_insert_tail(&entry->list_entry, &tid_cache->dead_region_list); @@ -1508,9 +1462,9 @@ int opx_tid_cache_setup(struct ofi_mr_cache **cache, return 0; } -/* De-register (lazy, unless force is true) a memory region on TID rendezvous completion */ +/* De-register (lazy, unless invalidate is true) a memory region on TID rendezvous completion */ void opx_deregister_for_rzv(struct fi_opx_ep *opx_ep, const uint64_t tid_vaddr, - const int64_t tid_length, bool force) + const int64_t tid_length, bool invalidate) { struct opx_tid_domain *tid_domain = opx_ep->domain->tid_domain; struct ofi_mr_cache *tid_cache = tid_domain->tid_cache; @@ -1588,7 +1542,7 @@ void opx_deregister_for_rzv(struct fi_opx_ep *opx_ep, const uint64_t tid_vaddr, (uint64_t)info.iov.iov_base)); ncache_entries++; /* Force the invalidation and put it on the dead list */ - opx_tid_cache_close_region(tid_cache, entry, force); + opx_tid_cache_close_region(tid_cache, entry, invalidate); /* increment past found region for next find */ remaining_length -= adj; info.iov.iov_base = (char *)info.iov.iov_base + adj; @@ -1598,14 +1552,12 @@ void opx_deregister_for_rzv(struct fi_opx_ep *opx_ep, const uint64_t tid_vaddr, (char *)(info.iov.iov_base) + remaining_length, remaining_length, remaining_length); } + /* Flush the dead list, don't flush the lru list (false) */ + opx_tid_cache_flush(tid_cache, false); FI_DBG(fi_opx_global.prov, FI_LOG_MR, "OPX_DEBUG_EXIT %u entries closed\n", ncache_entries); pthread_mutex_unlock(&mm_lock); - if (force) { - /* Flush the dead list, don't flush the lru list (false) */ - opx_tid_cache_flush(tid_cache, false); - } } /* opx_process_entry() @@ -1706,7 +1658,7 @@ int opx_process_entry(struct fi_opx_ep *opx_ep, int find, found_tid_entry->tid_length) - *vaddr)); - */ + */ assert(inout_info->iov.iov_base == (void *)(input_tid_info->tid_vaddr)); assert(inout_info->iov.iov_base == (void *)*vaddr); const uint64_t adj = *length; @@ -1849,7 +1801,9 @@ int opx_process_entry(struct fi_opx_ep *opx_ep, int find, } int opx_register_for_rzv(struct fi_opx_hfi1_rx_rzv_rts_params *params, - const uint64_t tid_vaddr, const uint64_t tid_length) + const uint64_t tid_vaddr, const uint64_t tid_length, + const enum fi_hmem_iface tid_iface, + const uint64_t tid_device) { struct fi_opx_ep *opx_ep = params->opx_ep; struct opx_tid_domain *tid_domain = opx_ep->domain->tid_domain; @@ -1858,13 +1812,15 @@ int opx_register_for_rzv(struct fi_opx_hfi1_rx_rzv_rts_params *params, struct ofi_mr_info find_info = {0}; int first_tid_index = -1, last_tid_index = -1, page_offset_in_tid = -1; - assert(tid_vaddr == (tid_vaddr & -(int64_t)OPX_HFI1_TID_PAGESIZE)); - assert(tid_length == (tid_length & -(int64_t)OPX_HFI1_TID_PAGESIZE)); + assert(tid_vaddr == (tid_vaddr & -(int64_t)OPX_TID_PAGE_SIZE[tid_iface])); + assert(tid_length == (tid_length & -(int64_t)OPX_TID_PAGE_SIZE[tid_iface])); pthread_mutex_lock(&mm_lock); find_info.iov.iov_base = (void *)tid_vaddr; find_info.iov.iov_len = tid_length; + find_info.iface = tid_iface; + find_info.device = tid_device; FI_DBG(fi_opx_global.prov, FI_LOG_MR, "OPX_DEBUG_ENTRY tid vaddr [%#lx - %#lx] , tid length %lu/%#lX\n", @@ -1897,6 +1853,8 @@ int opx_register_for_rzv(struct fi_opx_hfi1_rx_rzv_rts_params *params, * - multple entries */ if (find == OPX_ENTRY_NOT_FOUND) { + /* Flush the dead list, don't flush the lru list 
(false) */ + opx_tid_cache_flush(tid_cache, false); /* No entry found, create it. */ FI_DBG(fi_opx_global.prov, FI_LOG_MR, "OPX_ENTRY_NOT_FOUND\n"); opx_tid_cache_crte(tid_cache, &find_info, &entry, opx_ep); @@ -1904,7 +1862,7 @@ int opx_register_for_rzv(struct fi_opx_hfi1_rx_rzv_rts_params *params, &((struct opx_tid_mr *)entry->data)->tid_info; /* opx_register_tid_region was done in add region, check result */ - if (OPX_TID_NINFO(cached_tid_entry) == 0) { /* failed */ + if (cached_tid_entry->ninfo == 0) { /* failed */ OPX_TID_CACHE_DEBUG_FPRINTF("## %s:%u return -FI_EFAULT\n", __func__, __LINE__); /*crte returns an entry even if tid update failed */ @@ -1922,11 +1880,11 @@ int opx_register_for_rzv(struct fi_opx_hfi1_rx_rzv_rts_params *params, } /* Copy the tid info to params list for further modifications */ - params->ntidpairs = OPX_TID_NPAIRS(cached_tid_entry); + params->ntidpairs = cached_tid_entry->npairs; assert(params->ntidpairs != 0); - memcpy(params->tidpairs, &OPX_TID_PAIR(cached_tid_entry, 0), - (OPX_TID_NPAIRS(cached_tid_entry) * - sizeof(OPX_TID_PAIR(cached_tid_entry, 0)))); + memcpy(params->tidpairs, &cached_tid_entry->pairs[0], + (cached_tid_entry->npairs * + sizeof(cached_tid_entry->pairs[0]))); params->tid_offset = 0; FI_DBG(fi_opx_global.prov, FI_LOG_MR, "tid_offset %u/%#X\n", params->tid_offset, params->tid_offset); @@ -1937,7 +1895,7 @@ int opx_register_for_rzv(struct fi_opx_hfi1_rx_rzv_rts_params *params, &((struct opx_tid_mr *)entry->data)->tid_info; assert(cached_tid_entry->tid_length != 0); - if (OPX_TID_IS_INVALID(cached_tid_entry)) { + if (cached_tid_entry->invalid) { /* TID was invalidated while still in use and not deleted, can't user or re-register it until it's dead. */ /* Unlock for failed return */ @@ -1948,10 +1906,10 @@ int opx_register_for_rzv(struct fi_opx_hfi1_rx_rzv_rts_params *params, "found [%#lx - %#lx] length %lu/%#lX\n", tid_vaddr, tid_vaddr + tid_length, tid_length, tid_length, - OPX_TID_VADDR(cached_tid_entry), - OPX_TID_VADDR(cached_tid_entry) + OPX_TID_LENGTH(cached_tid_entry), - OPX_TID_LENGTH(cached_tid_entry), - OPX_TID_LENGTH(cached_tid_entry)); + cached_tid_entry->tid_vaddr, + cached_tid_entry->tid_vaddr + cached_tid_entry->tid_length, + cached_tid_entry->tid_length, + cached_tid_entry->tid_length); return -FI_EINVAL; } /* Entry was found. 
Our search is completely contained in this region */ @@ -1959,22 +1917,21 @@ int opx_register_for_rzv(struct fi_opx_hfi1_rx_rzv_rts_params *params, opx_tid_inc_use_cnt(entry); OPX_DEBUG_TIDS("REUSE FULL LIST", - OPX_TID_NPAIRS(cached_tid_entry), - &OPX_TID_PAIR(cached_tid_entry, 0)); + cached_tid_entry->npairs, + &cached_tid_entry->pairs[0]); opx_return_offset_for_new_cache_entry( (uint64_t)find_info.iov.iov_base, (uint64_t)find_info.iov.iov_len, cached_tid_entry, &first_tid_index, &page_offset_in_tid, &last_tid_index); OPX_DEBUG_TIDS("REUSE SUBSET LIST", (last_tid_index - first_tid_index + 1), - &OPX_TID_PAIR(cached_tid_entry, - first_tid_index)); + &cached_tid_entry->pairs[first_tid_index]); /* Copy the tid info to params list for further modifications */ params->ntidpairs = last_tid_index - first_tid_index + 1; assert(params->ntidpairs != 0); memcpy(params->tidpairs, - &OPX_TID_PAIR(cached_tid_entry, first_tid_index), + &cached_tid_entry->pairs[first_tid_index], params->ntidpairs * sizeof(params->tidpairs[0])); params->tid_offset = page_offset_in_tid * OPX_HFI1_TID_PAGESIZE; @@ -1990,7 +1947,7 @@ int opx_register_for_rzv(struct fi_opx_hfi1_rx_rzv_rts_params *params, &((struct opx_tid_mr *)entry->data)->tid_info; assert(overlap_tid_entry->tid_length != 0); - if (OPX_TID_IS_INVALID(overlap_tid_entry)) { + if (overlap_tid_entry->invalid) { /* TID was invalidated while still in use and not deleted, can't user or re-register it until it's dead. */ /* Unlock for failed return */ @@ -2001,10 +1958,10 @@ int opx_register_for_rzv(struct fi_opx_hfi1_rx_rzv_rts_params *params, "found [%#lx - %#lx] length %lu/%#lX\n", tid_vaddr, tid_vaddr + tid_length, tid_length, tid_length, - OPX_TID_VADDR(overlap_tid_entry), - OPX_TID_VADDR(overlap_tid_entry) + OPX_TID_LENGTH(overlap_tid_entry), - OPX_TID_LENGTH(overlap_tid_entry), - OPX_TID_LENGTH(overlap_tid_entry)); + overlap_tid_entry->tid_vaddr, + overlap_tid_entry->tid_vaddr + overlap_tid_entry->tid_length, + overlap_tid_entry->tid_length, + overlap_tid_entry->tid_length); return -FI_EINVAL; } /* Partial/overlapping memory region found */ @@ -2014,16 +1971,17 @@ int opx_register_for_rzv(struct fi_opx_hfi1_rx_rzv_rts_params *params, "overlap vaddr [%#lx - %#lx] , tid length %lu/%#lX\n", tid_vaddr, tid_vaddr + tid_length, tid_length, tid_length, - OPX_TID_VADDR(overlap_tid_entry), - OPX_TID_VADDR(overlap_tid_entry) + OPX_TID_LENGTH(overlap_tid_entry), - OPX_TID_LENGTH(overlap_tid_entry), - OPX_TID_LENGTH(overlap_tid_entry)); + overlap_tid_entry->tid_vaddr, + overlap_tid_entry->tid_vaddr + overlap_tid_entry->tid_length, + overlap_tid_entry->tid_length, + overlap_tid_entry->tid_length); uint64_t remaining_vaddr = tid_vaddr; int64_t remaining_length = tid_length; uint32_t ntidpairs = 0; /* This loop handles the more complicated combinations of holes and overlap */ + bool once = true; while (remaining_length) { /* process previos find results */ find = opx_process_entry(opx_ep, find, @@ -2055,6 +2013,11 @@ int opx_register_for_rzv(struct fi_opx_hfi1_rx_rzv_rts_params *params, return -FI_EPERM; } if (find == OPX_ENTRY_NOT_FOUND) { + /* Flush the dead list, don't flush the lru list (false) */ + if(once) { + once = false; + opx_tid_cache_flush(tid_cache, false); + } FI_DBG(fi_opx_global.prov, FI_LOG_MR, "NEXT OPX_ENTRY_NOT_FOUND TIDs " "remaining vaddr [%#lx - %#lx] length %lu/%#lX, " @@ -2137,7 +2100,7 @@ int opx_register_for_rzv(struct fi_opx_hfi1_rx_rzv_rts_params *params, &((struct opx_tid_mr *) create_entry->data) ->tid_info; - if 
(OPX_TID_NINFO(create_tid_entry) == + if (create_tid_entry->ninfo == 0) { /* failed */ OPX_TID_CACHE_DEBUG_FPRINTF("## %s:%u return -FI_EFAULT\n", __func__, __LINE__); @@ -2189,9 +2152,9 @@ int opx_register_for_rzv(struct fi_opx_hfi1_rx_rzv_rts_params *params, /* Copy the tid info to params list for further modifications */ const uint32_t created_ntidpairs = - (int)OPX_TID_NPAIRS(create_tid_entry); + (int)create_tid_entry->npairs; const uint32_t *created_tidpairs = - &OPX_TID_PAIR(create_tid_entry, 0); + &create_tid_entry->pairs[0]; OPX_DEBUG_TIDS("Created tidpairs", created_ntidpairs, created_tidpairs); @@ -2210,7 +2173,7 @@ int opx_register_for_rzv(struct fi_opx_hfi1_rx_rzv_rts_params *params, &((struct opx_tid_mr *) entry->data) ->tid_info; - if (OPX_TID_IS_INVALID(found_tid_entry)) { + if (found_tid_entry->invalid) { /* TID was invalidated while still in use and not deleted, can't user or re-register it until it's dead. */ /* Unlock for failed return */ @@ -2220,10 +2183,10 @@ int opx_register_for_rzv(struct fi_opx_hfi1_rx_rzv_rts_params *params, "vaddr [%#lx - %#lx] length %lu/%#lX " "found [%#lx - %#lx] length %lu/%#lX\n", tid_vaddr, tid_vaddr + tid_length, tid_length, tid_length, - OPX_TID_VADDR(found_tid_entry), - OPX_TID_VADDR(found_tid_entry) + OPX_TID_LENGTH(found_tid_entry), - OPX_TID_LENGTH(found_tid_entry), - OPX_TID_LENGTH(found_tid_entry)); + found_tid_entry->tid_vaddr, + found_tid_entry->tid_vaddr + found_tid_entry->tid_length, + found_tid_entry->tid_length, + found_tid_entry->tid_length); return -FI_EINVAL; } opx_tid_inc_use_cnt(entry); @@ -2233,19 +2196,18 @@ int opx_register_for_rzv(struct fi_opx_hfi1_rx_rzv_rts_params *params, "found vaddr [%#lx - %#lx] %lu/%#lX\n", remaining_vaddr, remaining_vaddr + remaining_length, remaining_length, remaining_length, - OPX_TID_VADDR(found_tid_entry), - OPX_TID_VADDR(found_tid_entry) + - OPX_TID_LENGTH(found_tid_entry), - OPX_TID_LENGTH(found_tid_entry), - OPX_TID_LENGTH(found_tid_entry)); + found_tid_entry->tid_vaddr, + found_tid_entry->tid_vaddr + + found_tid_entry->tid_length, + found_tid_entry->tid_length, + found_tid_entry->tid_length); first_tid_index = 0; last_tid_index = - (int)OPX_TID_NPAIRS(found_tid_entry); + (int)found_tid_entry->npairs; page_offset_in_tid = 0; OPX_DEBUG_TIDS("OVERLAP REUSE FULL LIST", - OPX_TID_NPAIRS(found_tid_entry), - &OPX_TID_PAIR(found_tid_entry, - 0)); + found_tid_entry->npairs, + &found_tid_entry->pairs[0]); if ((found_tid_entry->tid_vaddr < remaining_vaddr) || (remaining_length < @@ -2262,12 +2224,10 @@ int opx_register_for_rzv(struct fi_opx_hfi1_rx_rzv_rts_params *params, OPX_DEBUG_TIDS( "OVERLAP REUSE SUBSET LIST", (last_tid_index - first_tid_index + 1), - &OPX_TID_PAIR(found_tid_entry, - first_tid_index)); + &found_tid_entry->pairs[first_tid_index]); const uint32_t found_ntidpairs = last_tid_index - first_tid_index + 1; - const uint32_t *found_tidpairs = &OPX_TID_PAIR( - found_tid_entry, first_tid_index); + const uint32_t *found_tidpairs = &found_tid_entry->pairs[first_tid_index]; assert(found_ntidpairs && found_tidpairs && params && params->tidpairs); @@ -2361,14 +2321,13 @@ int opx_register_for_rzv(struct fi_opx_hfi1_rx_rzv_rts_params *params, return 0; } -bool opx_tid_cache_flush(struct ofi_mr_cache *cache, bool flush_lru) +void opx_tid_cache_flush_all(struct ofi_mr_cache *cache,const bool flush_lru,const bool flush_all) { struct dlist_entry free_list; struct ofi_mr_entry *entry; - bool entries_freed; - FI_DBG(cache->domain->prov, FI_LOG_MR, "OPX_DEBUG_ENTRY (%u)\n", - flush_lru); + 
FI_DBG(cache->domain->prov, FI_LOG_MR, "OPX_DEBUG_ENTRY (%u/%u)\n", + flush_lru, flush_all); dlist_init(&free_list); @@ -2410,12 +2369,14 @@ bool opx_tid_cache_flush(struct ofi_mr_cache *cache, bool flush_lru) } } #endif + /* Always free the dead list */ dlist_splice_tail(&free_list, &cache->dead_region_list); /* lru is a list of regions that are still active, optionally * free one, or more if the cache is full. */ - while (flush_lru && !dlist_empty(&cache->lru_list)) { + bool flush_once = flush_lru; + while ((flush_all || flush_once) && !dlist_empty(&cache->lru_list)) { dlist_pop_front(&cache->lru_list, struct ofi_mr_entry, entry, list_entry); FI_DBG(cache->domain->prov, FI_LOG_MR, @@ -2431,13 +2392,11 @@ bool opx_tid_cache_flush(struct ofi_mr_cache *cache, bool flush_lru) opx_mr_uncache_entry_storage(cache, entry); dlist_insert_tail(&entry->list_entry, &free_list); - flush_lru = opx_tid_cache_full(cache); + flush_once = opx_tid_cache_full(cache); } pthread_mutex_unlock(&mm_lock); - entries_freed = !dlist_empty(&free_list); - /* Free dead and selected lru entries */ while (!dlist_empty(&free_list)) { dlist_pop_front(&free_list, struct ofi_mr_entry, entry, @@ -2453,7 +2412,7 @@ bool opx_tid_cache_flush(struct ofi_mr_cache *cache, bool flush_lru) opx_cache_free_entry(cache, entry); } - return entries_freed; + return ; } /* Copied from opx_tid_cache_flush @@ -2609,8 +2568,7 @@ void opx_tid_cache_cleanup(struct ofi_mr_cache *cache) cache->notify_cnt); /* Try the nice flush */ - while (opx_tid_cache_flush(cache, true)) - ; + opx_tid_cache_flush_all(cache, true, true); /* Try forcing it (fini abnormal exit) for all eps (NULL) */ opx_tid_cache_purge_ep(cache, NULL); diff --git a/prov/psm2/build-psm2.sh b/prov/psm2/build-psm2.sh index 6883959b3f6..a3333c484b6 100755 --- a/prov/psm2/build-psm2.sh +++ b/prov/psm2/build-psm2.sh @@ -10,7 +10,7 @@ # # Please run the script from the top level directory of the source repo. # -# The "verbs" providers are disabled to reduce the +# The "psm", "usnic", and "verbs" providers are disabled to reduce the # building time. They can be enabled as needed. 
# # Please check that the following variables are either set to appropriate @@ -56,6 +56,8 @@ eval ../configure \ $cflags $ldflags $options \ --prefix=${PREFIX:-$HOME/install/ofi} \ --enable-psm2=${PSM2_HOME:-yes} \ + --disable-psm \ + --disable-usnic \ --disable-verbs && \ make && make install diff --git a/prov/psm2/include/fi_ext_psm2.h b/prov/psm2/include/fi_ext_psm2.h index 3a48d83e17f..804fbf37ecc 100644 --- a/prov/psm2/include/fi_ext_psm2.h +++ b/prov/psm2/include/fi_ext_psm2.h @@ -38,7 +38,7 @@ extern "C" { #endif /* Provider specific name for fi_set_val() / fi_get_val() */ -#define FI_PSM2_DISCONNECT (1U | FI_PROV_SPECIFIC) +#define FI_PSM2_DISCONNECT (1U | (1UL << 31)) #ifdef __cplusplus } diff --git a/prov/psm3/Makefile.am b/prov/psm3/Makefile.am index a6d3fbc68ed..cec9bddede3 100644 --- a/prov/psm3/Makefile.am +++ b/prov/psm3/Makefile.am @@ -30,9 +30,9 @@ ACLOCAL_AMFLAGS = -I config AM_CFLAGS = -Wall if HAVE_LD_VERSION_SCRIPT - libpsm3_fi_version_script = -Wl,--version-script=$(builddir)/libpsm3-fi.map + libpsm3_fi_version_script = -Wl,--version-script=$(builddir)/libpsm3-fi.map else !HAVE_LD_VERSION_SCRIPT - libpsm3_fi_version_script = + libpsm3_fi_version_script = endif !HAVE_LD_VERSION_SCRIPT # rdmaincludedir = $(includedir)/rdma @@ -51,6 +51,8 @@ common_srcs = \ shared/hmem_neuron.c \ shared/hmem_synapseai.c \ shared/hmem_ipc_cache.c \ + shared/xpmem.c \ + shared/xpmem_cache.c \ shared/common.c \ shared/enosys.c \ shared/rbtree.c \ @@ -78,13 +80,22 @@ common_srcs = \ util/src/util_ns.c \ util/src/util_pep.c \ util/src/util_poll.c \ + util/src/util_profile.c \ + util/src/util_srx.c \ util/src/util_wait.c \ util/src/rxm_av.c \ util/src/cuda_mem_monitor.c \ util/src/cuda_ipc_monitor.c \ util/src/rocr_mem_monitor.c \ util/src/rocr_ipc_monitor.c \ - util/src/ze_mem_monitor.c + util/src/ze_mem_monitor.c \ + util/src/xpmem_monitor.c \ + shared/fabric.c \ + shared/fi_tostr.c \ + shared/perf.c \ + shared/log.c \ + shared/var.c \ + shared/abi_1_0.c if MACOS common_srcs += shared/osx/osd.c @@ -103,9 +114,7 @@ if LINUX common_srcs += shared/unix/osd.c common_srcs += shared/linux/osd.c if HAVE_LINUX_PERF_RDPMC -if !HAVE_PSM3_SRC -common_srcs += shared/linux/rdpmc.c #seems to be a copy of psm3/psm_perf.c -endif +common_srcs += shared/linux/rdpmc.c endif common_srcs += inc/linux/rdpmc.h common_srcs += inc/linux/osd.h @@ -120,6 +129,8 @@ bin_SCRIPTS = nodist_src_libpsm3_fi_la_SOURCES = src_libpsm3_fi_la_SOURCES = \ inc/ofi_hmem.h \ + inc/ofi_cma.h \ + inc/ofi_xpmem.h \ inc/ofi.h \ inc/ofi_abi.h \ inc/ofi_atom.h \ @@ -137,7 +148,7 @@ src_libpsm3_fi_la_SOURCES = \ inc/ofi_proto.h \ inc/ofi_recvwin.h \ inc/ofi_rbuf.h \ - inc/ofi_shm.h \ + inc/ofi_shm_p2p.h \ inc/ofi_signal.h \ inc/ofi_epoll.h \ inc/ofi_tree.h \ @@ -148,10 +159,12 @@ src_libpsm3_fi_la_SOURCES = \ inc/ofi_net.h \ inc/ofi_perf.h \ inc/ofi_coll.h \ + inc/ofi_mb.h \ inc/fasthash.h \ inc/rbtree.h \ inc/uthash.h \ inc/ofi_prov.h \ + inc/ofi_profile.h \ inc/rdma/providers/fi_log.h \ inc/rdma/providers/fi_prov.h \ inc/rdma/providers/fi_peer.h \ @@ -167,6 +180,7 @@ src_libpsm3_fi_la_SOURCES = \ inc/rdma/fi_errno.h \ inc/rdma/fi_tagged.h \ inc/rdma/fi_trigger.h \ + inc/rdma/fi_profile.h \ src/psmx3.h \ src/psmx3_am.c \ src/psmx3_atomic.c \ @@ -216,7 +230,7 @@ src_libpsm3_fi_la_LDFLAGS += -lpsm2 endif !HAVE_PSM3_SRC if !EMBEDDED -src_libpsm3_fi_la_LDFLAGS += -version-info 22:0:21 +src_libpsm3_fi_la_LDFLAGS += -version-info 24:0:23 endif prov_install_man_pages = man/man7/fi_psm3.7 @@ -249,8 +263,8 @@ src/psm3_src_chksum.h: Makefile 
$(chksum_srcs) nroff: @for file in $(prov_install_man_pages); do \ - source=`echo $$file | sed -e 's@/man[0-9]@@'`; \ - perl $(top_srcdir)/config/md2nroff.pl --source=$(top_srcdir)/$$source.md; \ + source=`echo $$file | sed -e 's@/man[0-9]@@'`; \ + perl $(top_srcdir)/config/md2nroff.pl --source=$(top_srcdir)/$$source.md; \ done dist-hook: libpsm3-fi.spec diff --git a/prov/psm3/Makefile.include b/prov/psm3/Makefile.include index 4716706b0e0..47424fc2caf 100644 --- a/prov/psm3/Makefile.include +++ b/prov/psm3/Makefile.include @@ -220,6 +220,8 @@ prov_psm3_psm3_libpsm3i_la_SOURCES = \ prov/psm3/psm3/psm_mq_recv.c \ prov/psm3/psm3/psm_mq_utils.c \ prov/psm3/psm3/psm_netutils.h \ + prov/psm3/psm3/psm_nic_select.c \ + prov/psm3/psm3/psm_nic_select.h \ prov/psm3/psm3/psm_oneapi_ze.c \ prov/psm3/psm3/psm_perf.c \ prov/psm3/psm3/psm_perf.h \ diff --git a/prov/psm3/VERSION b/prov/psm3/VERSION index ef63cfba3ce..144229f3d51 100644 --- a/prov/psm3/VERSION +++ b/prov/psm3/VERSION @@ -1 +1 @@ -3_5_1_1 +3_6_0_1 diff --git a/prov/psm3/configure.ac b/prov/psm3/configure.ac index 1dfc2dfc012..a985fc05b85 100644 --- a/prov/psm3/configure.ac +++ b/prov/psm3/configure.ac @@ -58,7 +58,7 @@ AC_DEFINE_UNQUOTED([BUILD_ID],["$with_build_id"], # Override autoconf default CFLAG settings (e.g. "-g -O2") while still # allowing the user to explicitly set CFLAGS="" -: ${CFLAGS="-fvisibility=hidden ${base_c_warn_flags}"} +: ${CFLAGS="${base_c_warn_flags}"} # AM_PROG_AS would set CFLAGS="-g -O2" by default if not set already so it # should not be called earlier @@ -242,6 +242,35 @@ AS_IF([test x"$enable_debug" != x"no"], AC_DEFINE_UNQUOTED([ENABLE_DEBUG],[$dbg], [defined to 1 if libfabric was configured with --enable-debug, 0 otherwise]) +AC_ARG_ENABLE([profile], + [AS_HELP_STRING([--enable-profile], + [Enable profiling @<:@default=no@:>@])], + [], + [enable_profile=no]) + +AS_IF([test x"$enable_profile" != x"no"], + [AC_DEFINE([HAVE_FABRIC_PROFILE], [1], + [defined to 1 if libfabric was configured with --enable-profile, 0 otherwise]) +]) + +AC_DEFUN([FI_ARG_ENABLE_SANITIZER],[ + AC_ARG_ENABLE([$1], + [AS_HELP_STRING([--enable-$1], + [Enable $3Sanitizer @<:@default=no@:>@]) + ], + [], + [enable_$1=no]) + AS_IF([test x"$enable_$1" != x"no"], + [CFLAGS="-fsanitize=$2 $CFLAGS"]) +]) + +m4_map([FI_ARG_ENABLE_SANITIZER],[ + [asan, address, Address], + [lsan, leak, Leak], + [tsan, thread, Thread], + [ubsan, undefined, UndefinedBehavior] +]) + dnl Checks for header files. 
AC_HEADER_STDC @@ -463,7 +492,9 @@ AC_LINK_IFELSE([AC_LANG_PROGRAM([[__asm__(".symver main_, main@ABIVER_1.0");]], ]) dnl AS_IF icc_symver_hack -AC_DEFINE_UNQUOTED([HAVE_SYMVER_SUPPORT], [$ac_asm_symver_support], +dnl Disable in psm3 to include all symbols without symver +dnl AC_DEFINE_UNQUOTED([HAVE_SYMVER_SUPPORT], [$ac_asm_symver_support], +AC_DEFINE_UNQUOTED([HAVE_SYMVER_SUPPORT], [0], [Define to 1 if compiler/linker support symbol versioning.]) AC_MSG_CHECKING(for __alias__ attribute support) @@ -478,8 +509,9 @@ AC_LINK_IFELSE([AC_LANG_PROGRAM([[ AC_MSG_RESULT(no) ac_prog_cc_alias_symbols=0 ]) - -AC_DEFINE_UNQUOTED([HAVE_ALIAS_ATTRIBUTE], [$ac_prog_cc_alias_symbols], +dnl Disable in psm3 to include all symbols without symver +dnl AC_DEFINE_UNQUOTED([HAVE_ALIAS_ATTRIBUTE], [$ac_prog_cc_alias_symbols], +AC_DEFINE_UNQUOTED([HAVE_ALIAS_ATTRIBUTE], [0], [Define to 1 if the linker supports alias attribute.]) AC_CHECK_FUNCS([getifaddrs]) @@ -772,6 +804,37 @@ AS_IF([test "x$enable_psm3_umr_cache" != "xno"], ]) ]) +dnl ------------- hwloc +AC_ARG_ENABLE([psm3-hwloc], + [AS_HELP_STRING([--enable-psm3-hwloc], + [Enable PSM3 use of hwloc for NIC affinity selections @<:@default=check@:>@])], + [], [enable_psm3_hwloc=check]) +psm3_hwloc_happy=0 +AS_IF([test "x$enable_psm3_hwloc" != "xno"], + [ + FI_CHECK_PACKAGE([psm3_hwloc], + [hwloc.h], + [hwloc], + [hwloc_topology_init], + [], + [$psm3_PREFIX], + [$psm3_LIBDIR], + [psm3_hwloc_found=1], + [psm3_hwloc_found=0]) + AS_IF([test $psm3_hwloc_found -ne 1 && test "x$enable_psm3_hwloc" == "xyes"], + [ + psm3_happy=0 + AC_MSG_ERROR([hwloc Support requested but hwloc headers and/or library not found.]) + ]) + AS_IF([test "$psm3_hwloc_found" -eq 1], + [ + psm3_hwloc_happy=1 + CPPFLAGS="$CPPFLAGS $psm3_hwloc_CPPFLAGS -DPSM_USE_HWLOC" + LDFLAGS="$LDFLAGS $psm3_hwloc_LDFLAGS" + LIBS="$LIBS $psm3_hwloc_LIBS" + ]) + ]) + dnl ------------- Driver Modules psm3_rv_happy=0 AC_ARG_WITH([psm3-rv], @@ -852,6 +915,9 @@ AC_DEFINE_UNQUOTED([PSM3_MARCH], ["$PSM3_MARCH"], [PSM3 built with instruction s AS_IF([test ! -z "$PSM_CPPFLAGS"], [CPPFLAGS="$CPPFLAGS $PSM_CPPFLAGS"], []) AS_IF([test ! 
-z "$PSM_CFLAGS"], [CFLAGS="$CFLAGS $PSM_CFLAGS"], []) +dnl Workaround for including fabric.c +AC_DEFINE([HOOK_NOOP_INIT], NULL, [Ignore HOOK_NOOP_INIT]) +AC_DEFINE([COLL_INIT], NULL, [Ignore COLL_INIT]) dnl Defines not used in PSM3 provider AC_DEFINE([HAVE_DMABUF_PEER_MEM], 0, [Ignore HAVE_DMABUF_PEER_MEM]) AC_DEFINE([HAVE_GDRCOPY], 0, [Ignore HAVE_GDRCOPY]) @@ -862,10 +928,16 @@ AC_DEFINE([HAVE_NEURON], 0, [Ignore HAVE_NEURON]) AC_DEFINE([HAVE_ROCR], 0, [Ignore HAVE_ROCR]) AC_DEFINE([HAVE_SYNAPSEAI], 0, [Ignore HAVE_SYNAPSEAI]) AC_DEFINE([HAVE_UFFD_MONITOR], 0, [Ignore HAVE_UFFD_MONITOR]) +AC_DEFINE([HAVE_XPMEM], 0, [Ignore HAVE_XPMEM]) + dnl Provider-specific checks dnl FI_PROVIDER_INIT +AC_DEFINE([HAVE_BGQ], 0, [Ignore HAVE_BGQ]) +AC_DEFINE([HAVE_BGQ_DL], 0, [Ignore HAVE_BGQ_DL]) AC_DEFINE([HAVE_EFA], 0, [Ignore HAVE_EFA]) AC_DEFINE([HAVE_EFA_DL], 0, [Ignore HAVE_EFA_DL]) +AC_DEFINE([HAVE_GNI], 0, [Ignore HAVE_GNI]) +AC_DEFINE([HAVE_GNI_DL], 0, [Ignore HAVE_GNI_DL]) AC_DEFINE([HAVE_MRAIL], 0, [Ignore HAVE_MRAIL]) AC_DEFINE([HAVE_MRAIL_DL], 0, [Ignore HAVE_MRAIL_DL]) AC_DEFINE([HAVE_NET], 0, [Ignore HAVE_NET]) @@ -878,6 +950,8 @@ AC_DEFINE([HAVE_PSM2_DL], 0, [Ignore HAVE_PSM2_DL]) dnl FI_PROVIDER_SETUP([psm3]) AC_DEFINE([HAVE_OPX], 0, [Ignore HAVE_OPX]) AC_DEFINE([HAVE_OPX_DL], 0, [Ignore HAVE_OPX_DL]) +AC_DEFINE([HAVE_RSTREAM], 0, [Ignore HAVE_RSTREAM]) +AC_DEFINE([HAVE_RSTREAM_DL], 0, [Ignore HAVE_RSTREAM_DL]) AC_DEFINE([HAVE_RXD], 0, [Ignore HAVE_RXD]) AC_DEFINE([HAVE_RXD_DL], 0, [Ignore HAVE_RXD_DL]) AC_DEFINE([HAVE_RXM], 0, [Ignore HAVE_RXM]) @@ -896,6 +970,8 @@ AC_DEFINE([HAVE_UCX], 0, [Ignore HAVE_UCX]) AC_DEFINE([HAVE_UCX_DL], 0, [Ignore HAVE_UCX_DL]) AC_DEFINE([HAVE_UDP], 0, [Ignore HAVE_UDP]) AC_DEFINE([HAVE_UDP_DL], 0, [Ignore HAVE_UDP_DL]) +AC_DEFINE([HAVE_USNIC], 0, [Ignore HAVE_USNIC]) +AC_DEFINE([HAVE_USNIC_DL], 0, [Ignore HAVE_USNIC_DL]) AC_DEFINE([HAVE_VERBS], 0, [Ignore HAVE_VERBS]) AC_DEFINE([HAVE_VERBS_DL], 0, [Ignore HAVE_VERBS_DL]) dnl FI_PROVIDER_FINI @@ -978,6 +1054,9 @@ fi if test $psm3_dsa_happy -eq 1; then afeatures="$afeatures, Intel DSA" fi +if test $psm3_hwloc_happy -eq 1; then + afeatures="$afeatures, hwloc" +fi if test "x$enable_psm3_udp" = "xyes"; then afeatures="$afeatures, UDP" fi diff --git a/prov/psm3/configure.m4 b/prov/psm3/configure.m4 index 6ae917558e8..25aea136db6 100644 --- a/prov/psm3/configure.m4 +++ b/prov/psm3/configure.m4 @@ -371,6 +371,28 @@ AC_DEFUN([FI_PSM3_CONFIGURE],[ ]) ]) + AS_IF([test "x$enable_psm3_hwloc" != "xno"], + [ + FI_CHECK_PACKAGE([psm3_hwloc], + [hwloc.h], + [hwloc], + [hwloc_topology_init], + [], + [$psm3_PREFIX], + [$psm3_LIBDIR], + [psm3_hwloc_found=1], + [psm3_hwloc_found=0]) + AS_IF([test $psm3_hwloc_found -ne 1 && test "x$enable_psm3_hwloc" == "xyes"], + [ + psm3_happy=0 + AC_MSG_ERROR([hwloc Support requested but hwloc headers and/or library not found.]) + ]) + AS_IF([test "$psm3_hwloc_found" -eq 1], + [ + psm3_CPPFLAGS="$psm3_CPPFLAGS -DPSM_USE_HWLOC" + ]) + ]) + AS_IF([test $psm3_happy -eq 1], [ AC_CONFIG_FILES([prov/psm3/psm3/psm2_hal_inlines_i.h \ prov/psm3/psm3/psm2_hal_inlines_d.h \ @@ -381,9 +403,9 @@ AC_DEFUN([FI_PSM3_CONFIGURE],[ AS_IF([test $psm3_happy -eq 1], [$1], [$2]) psm3_ARCH_CFLAGS="$PSM3_ARCH_CFLAGS" - psm3_CPPFLAGS="$psm3_CPPFLAGS $psm3_rt_CPPFLAGS $psm3_dl_CPPFLAGS $psm3_numa_CPPFLAGS $psm3_ibv_CPPFLAGS $psm3_uuid_CPPFLAGS" - psm3_LDFLAGS="$psm3_LDFLAGS $psm3_rt_LDFLAGS $psm3_dl_LDFLAGS $psm3_numa_LDFLAGS $psm3_ibv_LDFLAGS $psm3_uuid_LDFLAGS" - psm3_LIBS="$psm3_LIBS $psm3_rt_LIBS $psm3_dl_LIBS 
$psm3_numa_LIBS $psm3_ibv_LIBS $psm3_uuid_LIBS" + psm3_CPPFLAGS="$psm3_CPPFLAGS $psm3_rt_CPPFLAGS $psm3_dl_CPPFLAGS $psm3_numa_CPPFLAGS $psm3_ibv_CPPFLAGS $psm3_uuid_CPPFLAGS $psm3_hwloc_CPPFLAGS" + psm3_LDFLAGS="$psm3_LDFLAGS $psm3_rt_LDFLAGS $psm3_dl_LDFLAGS $psm3_numa_LDFLAGS $psm3_ibv_LDFLAGS $psm3_uuid_LDFLAGS $psm3_hwloc_LDFLAGS" + psm3_LIBS="$psm3_LIBS $psm3_rt_LIBS $psm3_dl_LIBS $psm3_numa_LIBS $psm3_ibv_LIBS $psm3_uuid_LIBS $psm3_hwloc_LIBS" AC_SUBST(psm3_CFLAGS) AC_SUBST(psm3_ARCH_CFLAGS) AC_SUBST(psm3_CPPFLAGS) @@ -448,4 +470,9 @@ AC_ARG_ENABLE([psm3-umr-cache], [Enable support for Userspace Memory Region (UMR) Caching @<:@default=check@:>@])], [], [enable_psm3_umr_cache=check]) +AC_ARG_ENABLE([psm3-hwloc], + [AS_HELP_STRING([--enable-psm3-hwloc], + [Enable PSM3 use of hwloc for NIC affinity selections @<:@default=check@:>@])], + [], + [enable_psm3_hwloc=check]) dnl vim: set ts=4 sw=4 tw=0 et : diff --git a/prov/psm3/debian/changelog b/prov/psm3/debian/changelog index 7eaab218a3a..0b1b356686f 100644 --- a/prov/psm3/debian/changelog +++ b/prov/psm3/debian/changelog @@ -1,4 +1,4 @@ -libpsm3-fi (11.5.1.1-1) unstable; urgency=medium +libpsm3-fi (11.6.0.0-231) unstable; urgency=medium * Initial release diff --git a/prov/psm3/debian/control b/prov/psm3/debian/control index 40dd0224032..43e38c07d02 100644 --- a/prov/psm3/debian/control +++ b/prov/psm3/debian/control @@ -2,7 +2,7 @@ Source: libpsm3-fi Section: libs Priority: optional Maintainer: https://www.intel.com/content/www/us/en/support.html -Build-Depends: debhelper (>= 12~), uuid-dev, libnuma-dev, libibverbs-dev, librdmacm-dev +Build-Depends: debhelper (>= 12~), uuid-dev, libnuma-dev, libibverbs-dev, librdmacm-dev, libhwloc-dev Standards-Version: 4.5.1 Rules-Requires-Root: no diff --git a/prov/psm3/libpsm3-fi.spec.in b/prov/psm3/libpsm3-fi.spec.in index a5cbce1be15..b24d4c13a63 100644 --- a/prov/psm3/libpsm3-fi.spec.in +++ b/prov/psm3/libpsm3-fi.spec.in @@ -1,6 +1,8 @@ %{!?configopts: %global configopts LDFLAGS=-Wl,--build-id} %{!?provider: %define provider psm3} %{!?provider_formal: %define provider_formal PSM3} +# Disable setting SOURCE_DATE_EPOCH from changelog +%define source_date_epoch_from_changelog 0 Name: lib%{provider}-fi Version: @VERSION@ @@ -18,6 +20,7 @@ Provides: lib${provider}-fi1 = %{version}-%{release} BuildRequires: libuuid-devel BuildRequires: rdma-core-devel +BuildRequires: hwloc-devel %if 0%{?suse_version} >= 1 BuildRequires: glibc-devel BuildRequires: libnuma-devel diff --git a/prov/psm3/psm3/Makefile.include b/prov/psm3/psm3/Makefile.include index 3cd1eff52ff..cc52b8f1868 100644 --- a/prov/psm3/psm3/Makefile.include +++ b/prov/psm3/psm3/Makefile.include @@ -185,6 +185,8 @@ psm3_libpsm3i_la_SOURCES = \ psm3/psm_mq_recv.c \ psm3/psm_mq_utils.c \ psm3/psm_netutils.h \ + psm3/psm_nic_select.c \ + psm3/psm_nic_select.h \ psm3/psm_oneapi_ze.c \ psm3/psm_perf.c \ psm3/psm_perf.h \ @@ -196,13 +198,13 @@ psm3_libpsm3i_la_SOURCES = \ psm3/psm_sysbuf.h \ psm3/psm_timer.c \ psm3/psm_timer.h \ + psm3/psm_uffd.c \ + psm3/psm_uffd.h \ psm3/psm_user.h \ psm3/psm_utils.c \ psm3/psm_utils.h \ psm3/psm_verbs_mr.c \ psm3/psm_verbs_mr.h \ - psm3/psm_verbs_umrc.c \ - psm3/psm_verbs_umrc.h \ psm3/psmi_wrappers.c \ psm3/psmi_wrappers.h \ psm3/psm2.h \ diff --git a/prov/psm3/psm3/hal_sockets/sockets_ep.c b/prov/psm3/psm3/hal_sockets/sockets_ep.c index 8e095b71315..27b98631508 100755 --- a/prov/psm3/psm3/hal_sockets/sockets_ep.c +++ b/prov/psm3/psm3/hal_sockets/sockets_ep.c @@ -159,11 +159,16 @@ psm3_ep_open_udp_internal(psm2_ep_t 
ep, int unit, int port, } if (!is_aux) { - psm3_getenv("PSM3_UDP_GSO", - "Enable UDP GSO Segmentation Offload (0 disables GSO)", - PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_INT, - (union psmi_envvar_val)1, &env_gso); - ep->sockets_ep.udp_gso = env_gso.e_int; + psm3_getenv_range("PSM3_UDP_GSO", + "Enable UDP GSO Segmentation Offload", + "(0 disables GSO, 1 sets max chunk to 65536, >1 specifies max chunk)", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val)UINT16_MAX, + (union psmi_envvar_val)0, (union psmi_envvar_val)UINT16_MAX, + NULL, NULL, &env_gso); + ep->sockets_ep.udp_gso = env_gso.e_uint; + if (ep->sockets_ep.udp_gso == 1) + ep->sockets_ep.udp_gso = UINT16_MAX; if (ep->sockets_ep.udp_gso) { int gso; socklen_t optlen = sizeof(gso); @@ -553,6 +558,57 @@ psm2_error_t psm3_tune_tcp_socket(const char *sck_name, psm2_ep_t ep, int fd) return PSM2_INTERNAL_ERR; } +/* parse TCP port range for PSM3_TCP_PORT_RANGE + * format is low:high + * low must be <= high and each must be < UINT16_MAX. + * Either field can be omitted in which case default (input tvals) is used + * for given field. + * 0 - successfully parsed, tvals updated + * -1 - str empty, tvals unchanged + * -2 - syntax error, tvals may have been changed + */ +static int parse_tcp_port_range(const char *str, + size_t errstr_size, char errstr[], + int tvals[2]) +{ + psmi_assert(tvals); + int ret = psm3_parse_str_tuples(str, 2, tvals); + if (ret < 0) + return ret; + if (tvals[0] > UINT16_MAX || tvals[1] > UINT16_MAX) { + if (errstr_size) + snprintf(errstr, errstr_size, " Max allowed is %u", UINT16_MAX); + return -2; + } + if (tvals[0] < 0 || tvals[1] < 0) { + if (errstr_size) + snprintf(errstr, errstr_size, " Negative values not allowed"); + return -2; + } + if ((tvals[0] == TCP_PORT_AUTODETECT && tvals[1] != TCP_PORT_AUTODETECT) + || (tvals[0] != TCP_PORT_AUTODETECT && tvals[1] == TCP_PORT_AUTODETECT)) { + if (errstr_size) + snprintf(errstr, errstr_size, " low of %d only allowed with high of %d", TCP_PORT_AUTODETECT, TCP_PORT_AUTODETECT); + return -2; + } + if (tvals[0] > tvals[1]) { + if (errstr_size) + snprintf(errstr, errstr_size, " low (%d) > high (%d)", tvals[0], tvals[1]); + return -2; + } + return 0; +} + +static int parse_check_tcp_port_range(int type, + const union psmi_envvar_val val, void *ptr, + size_t errstr_size, char errstr[]) +{ + // parser will set tvals to result, use a copy to protect input of defaults + int tvals[2] = { ((int*)ptr)[0], ((int*)ptr)[1] }; + psmi_assert(type == PSMI_ENVVAR_TYPE_STR_TUPLES); + return parse_tcp_port_range(val.e_str, errstr_size, errstr, tvals); +} + static __inline__ psm2_error_t listen_to_port(psm2_ep_t ep, int sockfd, psm3_sockaddr_in_t *addr, @@ -567,12 +623,16 @@ psm2_error_t listen_to_port(psm2_ep_t ep, int sockfd, char range_def[32]; snprintf(range_def, sizeof(range_def), "%d:%d", tvals[0], tvals[1]); - if (!psm3_getenv("PSM3_TCP_PORT_RANGE", - "Set the TCP listener port range . The listener will bind to a random port in the range. '0:0'=let OS pick.", + (void)psm3_getenv_range("PSM3_TCP_PORT_RANGE", + "Set the TCP listener port range .", + "The listener will bind to a random port in the range. 
'0:0'=let OS pick.", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR_TUPLES, - (union psmi_envvar_val) range_def, &env_val)) { - /* not using default values */ - (void)psm3_parse_str_tuples(env_val.e_str, 2, tvals); + (union psmi_envvar_val) range_def, + (union psmi_envvar_val)NULL, (union psmi_envvar_val)NULL, + parse_check_tcp_port_range, tvals, &env_val); + if (parse_tcp_port_range(env_val.e_str, 0, NULL, tvals) < 0) { + // already checked, shouldn't get parse errors nor empty strings + psmi_assert(0); } _HFI_DBG("PSM3_TCP_PORT_RANGE = %d:%d\n", tvals[0], tvals[1]); @@ -583,17 +643,14 @@ psm2_error_t listen_to_port(psm2_ep_t ep, int sockfd, start = 0; end = 0; _HFI_DBG("Binding to OS provided port\n"); - } else if (tvals[0] > 0 && tvals[0] <= tvals[1] && tvals[1] <= UINT16_MAX) { + } else { + psmi_assert(tvals[0] > 0); // start with a random port, find the first available one. port = psm3_rand((long int) getpid()); port = port % (tvals[1] + 1 - tvals[0]) + tvals[0]; start = (uint16_t)tvals[0]; end = (uint16_t)tvals[1]; _HFI_DBG("Binding to port in range [%" PRIu16 ":%" PRIu16 "], starting from %ld\n", start, end, port); - } else { - // high < low or only set one - _HFI_ERROR("Invalid TCP port range [%d:%d]\n", tvals[0], tvals[1]); - return PSM2_INTERNAL_ERR; } psm3_getenv("PSM3_TCP_BACKLOG", @@ -637,6 +694,46 @@ psm2_error_t listen_to_port(psm2_ep_t ep, int sockfd, return PSM2_INTERNAL_ERR; } +/* parse TCP skip poll counts for PSM3_TCP_SKIPPOLL_COUNT + * format is inactive_polls:active_polls + * inactive_polls must be >= active_polls + * Either field can be omitted in which case default (input tvals) is used + * for given field. + * 0 - successfully parsed, tvals updated + * -1 - str empty, tvals unchanged + * -2 - syntax error, tvals may have been changed + */ +static int parse_tcp_skippoll_count(const char *str, + size_t errstr_size, char errstr[], + int tvals[2]) +{ + psmi_assert(tvals); + int ret = psm3_parse_str_tuples(str, 2, tvals); + if (ret < 0) + return ret; + if (tvals[0] < 0 || tvals[1] < 0) { + if (errstr_size) + snprintf(errstr, errstr_size, " Negative values not allowed"); + return -2; + } + if (tvals[0] < tvals[1]) { + if (errstr_size) + snprintf(errstr, errstr_size, " inactive_polls (%d) must be >= active_polls (%d)", tvals[0], tvals[1]); + return -2; + } + return 0; +} + +static int parse_check_tcp_skippoll_count(int type, + const union psmi_envvar_val val, void *ptr, + size_t errstr_size, char errstr[]) +{ + // parser will set tvals to result, use a copy to protect input of defaults + int tvals[2] = { ((int*)ptr)[0], ((int*)ptr)[1] }; + psmi_assert(type == PSMI_ENVVAR_TYPE_STR_TUPLES); + return parse_tcp_skippoll_count(val.e_str, errstr_size, errstr, tvals); +} + psm2_error_t psm3_ep_open_tcp_internal(psm2_ep_t ep, int unit, int port, psm2_uuid_t const job_key) @@ -772,21 +869,16 @@ psm3_ep_open_tcp_internal(psm2_ep_t ep, int unit, int port, char buf[32]; snprintf(buf, sizeof(buf), "%d:%d", TCP_INACT_SKIP_POLLS, TCP_ACT_SKIP_POLLS); int tvals[2] = {TCP_INACT_SKIP_POLLS, TCP_ACT_SKIP_POLLS}; - if (!psm3_getenv("PSM3_TCP_SKIPPOLL_COUNT", - "Polls to skip under inactive and active connections " + (void)psm3_getenv_range("PSM3_TCP_SKIPPOLL_COUNT", + "Polls to skip under inactive and active connections ", "where inactive_polls >= active_polls.", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR_TUPLES, - (union psmi_envvar_val) buf, &env_val)) { - (void)psm3_parse_str_tuples(env_val.e_str, 2, tvals); - if (tvals[0] < 0) { - tvals[0] = TCP_INACT_SKIP_POLLS; - } - if (tvals[1] < 0) 
{ - tvals[1] = TCP_ACT_SKIP_POLLS; - } - if (tvals[1] > tvals[0]) { - tvals[1] = tvals[0]; - } + (union psmi_envvar_val) buf, + (union psmi_envvar_val)NULL, (union psmi_envvar_val)NULL, + parse_check_tcp_skippoll_count, tvals, &env_val); + if (parse_tcp_skippoll_count(env_val.e_str, 0, NULL, tvals) < 0) { + // already checked, shouldn't get parse errors nor empty strings + psmi_assert(0); } ep->sockets_ep.inactive_skip_polls = tvals[0]; ep->sockets_ep.active_skip_polls_offset = tvals[0] - tvals[1]; @@ -1084,10 +1176,11 @@ psm3_sockets_ips_proto_init(struct ips_proto *proto, uint32_t cksum_sz) if (ep->sockets_ep.udp_gso) { // set upper bounds for GSO segmentation - // OS limitation of 64K (UINT16_MAX) + // OS limitation of 64K (UINT16_MAX) and UDP_MAX_SEGMENTS (64) ep->chunk_max_segs = min(UINT16_MAX / (ep->mtu + sizeof(struct ips_message_header)), UDP_MAX_SEGMENTS); - ep->chunk_max_size = ep->mq->hfi_base_window_rv; - // for acks to pipeline well need to limit max_nsegs to + ep->chunk_max_size = ep->sockets_ep.udp_gso; + + // for acks to pipeline we'll need to limit max_nsegs to // < flow_credits/2 and max_size to < flow_credit_bytes/2 // (ideally 1/4, but that makes GSO too small and is worse) ep->chunk_max_segs = min(ep->chunk_max_segs, proto->flow_credits/2); diff --git a/prov/psm3/psm3/hal_sockets/sockets_ep.h b/prov/psm3/psm3/hal_sockets/sockets_ep.h index 5bfc3ffdb82..51fcd06f792 100644 --- a/prov/psm3/psm3/hal_sockets/sockets_ep.h +++ b/prov/psm3/psm3/hal_sockets/sockets_ep.h @@ -185,7 +185,7 @@ struct psm3_sockets_ep { int active_skip_polls_offset; // tailored for internal use. it's inactive_skip_polls - active_skip_polls struct msghdr snd_msg; // struct used for sendmsg /* fields specific to UDP */ - int udp_gso; // is GSO enabled for UDP + unsigned udp_gso; // is GSO enabled for UDP, max chunk_size uint8_t *sbuf_udp_gso; // buffer to compose UDP GSO packet sequence int udp_gso_zerocopy; // is UDP GSO Zero copy option enabled int udp_gro; // will be used later diff --git a/prov/psm3/psm3/hal_sockets/sockets_hal.c b/prov/psm3/psm3/hal_sockets/sockets_hal.c index 0c8087450b3..8d4527bdd64 100644 --- a/prov/psm3/psm3/hal_sockets/sockets_hal.c +++ b/prov/psm3/psm3/hal_sockets/sockets_hal.c @@ -175,15 +175,15 @@ static void psm3_hfp_sockets_mq_init_defaults(struct psm2_mq *mq) * corresponding PSM3_* env variables. * Otherwise these defaults are used. */ - mq->hfi_thresh_rv = 64000; - mq->hfi_base_window_rv = 131072; + mq->hfi_thresh_rv = PSM_MQ_NIC_RNDV_THRESH; + mq->ips_cpu_window_rv_str = PSM_CPU_NIC_RNDV_WINDOW_STR; // Even without RDMA do we want to disable rendezvous? // even without RDMA, the receiver controlled pacing helps scalability mq->hfi_thresh_rv = (~(uint32_t)0); // disable rendezvous mq->hfi_thresh_tiny = PSM_MQ_NIC_MAX_TINY; #if defined(PSM_CUDA) || defined(PSM_ONEAPI) if (PSMI_IS_GPU_ENABLED) - mq->hfi_base_window_rv = 2097152; + mq->ips_gpu_window_rv_str = PSM_GPU_NIC_RNDV_WINDOW_STR; #endif // we parse inet and rv_gpu_cache_size here so we can cache it // once per EP open, even if multi-rail or multi-QP diff --git a/prov/psm3/psm3/hal_verbs/verbs_ep.c b/prov/psm3/psm3/hal_verbs/verbs_ep.c index 979787b7af6..10a4e845e4b 100644 --- a/prov/psm3/psm3/hal_verbs/verbs_ep.c +++ b/prov/psm3/psm3/hal_verbs/verbs_ep.c @@ -113,7 +113,7 @@ psm3_ep_open_verbs(psm2_ep_t ep, int unit, int port, int addr_index, psm2_uuid_t // make sure all fields are empty. 
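/* A side note on the Q_Key derivation below (an illustrative sketch, not an
 * additional code path): only the low 31 bits of the job key are used because
 * the most significant Q_Key bit is reserved in the IB/verbs spec for
 * "controlled" Q_Keys, which unprivileged consumers cannot freely use in a
 * work request. Assuming job_key is the 16-byte job UUID passed into this
 * function, the derivation amounts to:
 *
 *     uint32_t qkey = (*(uint32_t *)job_key) & 0x7FFFFFFF;  // drop reserved MSB
 */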
memset(&ep->verbs_ep,0,sizeof(ep->verbs_ep)); - ep->verbs_ep.qkey = *(uint32_t*)job_key; // use 1st 32 bits of job_key + ep->verbs_ep.qkey = (*(uint32_t*)job_key) & 0x7FFFFFFF; // use 1st 31 bits of job_key (MSB is reserved) if (_HFI_PRDBG_ON) { char uuid_str[64]; @@ -180,12 +180,48 @@ psm3_ep_open_verbs(psm2_ep_t ep, int unit, int port, int addr_index, psm2_uuid_t ep->dev_name, strerror(errno)); goto fail; } - // this gets done by psm3_verbs_poll_type - //if (ibv_req_notify_cq(ep->verbs_ep.recv_cq, 0)) { - // _HFI_ERROR("Can't request RQ events from %s: %s\n", - // ep->dev_name, strerror(errno)); - // goto fail; - //} + +#ifdef USE_RC + if (IPS_PROTOEXP_FLAG_USER_RC_QP(ep->rdmamode)) { + // SRQ improves scalability + struct ibv_device_attr dev_attr; + union psmi_envvar_val envvar_val; + + // get RDMA capabilities of device + if (ibv_query_device(ep->verbs_ep.context, &dev_attr)) { + _HFI_ERROR("Unable to query device %s: %s\n", ep->dev_name, + strerror(errno)); + goto fail; + } + _HFI_DBG("max_srq=%d\n", dev_attr.max_srq); + if (dev_attr.max_srq) { + psm3_getenv("PSM3_USE_SRQ", + "If device supports SRQ, use it [1=yes, 0=no) [1]", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val)1, &envvar_val); + if (envvar_val.e_uint) { + struct ibv_srq_init_attr attr = { 0 }; + attr.srq_context = ep; // our own pointer + attr.attr.max_wr = ep->verbs_ep.hfi_num_recv_wqes; + attr.attr.max_sge = 1; + + ep->verbs_ep.srq = ibv_create_srq(ep->verbs_ep.pd, &attr); + if (ep->verbs_ep.srq == NULL) { + _HFI_ERROR( "Unable to create SRQ on %s: %s\n", + ep->dev_name, strerror(errno)); + if (errno == ENOMEM) { + _HFI_ERROR( "Requested SRQ size might be too big. Try reducing TX depth and/or inline size.\n"); + _HFI_ERROR( "Requested RX depth was %u .\n", + ep->verbs_ep.hfi_num_recv_wqes); + } + goto fail; + } + _HFI_DBG("created SRQ\n"); + ep->addl_nic_info = " SRQ"; + } + } + } +#endif /* USE_RC */ // TBD - should we pick an EQ number // we use ep as the cq_context (would be in callbacks if any) @@ -194,13 +230,20 @@ psm3_ep_open_verbs(psm2_ep_t ep, int unit, int port, int addr_index, psm2_uuid_t // so CQ only needs a little headroom to be safe (1000) // HFI_TF_NFLOWS (32) limits receiver side concurrent tidflows (aka inbound // RDMA w/immed). - // For USER RC Eager we can have num_recv_wqes/FRACTION per QP - // in which case theoretical need could be huge. We add 4000 as a + // For USER RC Eager without SRQ we can have num_recv_wqes/FRACTION per + // QP in which case theoretical need could be huge. We add 4000 as a // swag to cover most cases and user can always tune higher as needed + // For USER RC Eager with SRQ worse case is num_recv_wqes so we + // add that to allow up to num_recv_wqes on UD QP and SRQ each and keep + // the HFI_TF_NFLOWS+1000 as headroom. if (! 
ep->verbs_ep.hfi_num_recv_cqes) { ep->verbs_ep.hfi_num_recv_cqes = ep->verbs_ep.hfi_num_recv_wqes+HFI_TF_NFLOWS+1000; - if ((ep->rdmamode&IPS_PROTOEXP_FLAG_RDMA_MASK) == IPS_PROTOEXP_FLAG_RDMA_USER_RC) - ep->verbs_ep.hfi_num_recv_cqes += 4000; + if ((ep->rdmamode&IPS_PROTOEXP_FLAG_RDMA_MASK) == IPS_PROTOEXP_FLAG_RDMA_USER_RC) { + if (ep->verbs_ep.srq) + ep->verbs_ep.hfi_num_recv_cqes += ep->verbs_ep.hfi_num_recv_wqes; + else + ep->verbs_ep.hfi_num_recv_cqes += 4000; + } } ep->verbs_ep.recv_cq = ibv_create_cq(ep->verbs_ep.context, ep->verbs_ep.hfi_num_recv_cqes, @@ -211,12 +254,16 @@ psm3_ep_open_verbs(psm2_ep_t ep, int unit, int port, int addr_index, psm2_uuid_t strerror(errno)); goto fail; } + // this gets done by psm3_verbs_poll_type + //if (ibv_req_notify_cq(ep->verbs_ep.recv_cq, 0)) { + // _HFI_ERROR("Can't request RQ events from %s: %s\n", + // ep->dev_name, strerror(errno)); + // goto fail; + //} ep->verbs_ep.qp = ud_qp_create(ep); - if (! ep->verbs_ep.qp) { - _HFI_ERROR( "Unable to create UD QP on %s\n", ep->dev_name); + if (! ep->verbs_ep.qp) goto fail; - } psmi_assert_always (ep->verbs_ep.context); @@ -306,7 +353,8 @@ psm3_verbs_parse_params(psm2_ep_t ep) psm3_getenv("PSM3_NUM_RECV_CQES", "Number of recv CQEs to allocate\n" "(0 will calculate as PSM3_NUM_RECV_WQES+1032 for PSM3_RDMA=0-2\n" - "and 4000 more than that for PSM3_RDMA=3]) [0]", + "for PSM3_RDMA=3 with SRQ, allow an additional PSM3_NUM_RECV_WQES\n" + "for PSM3_RDMA=3 without SRQ, allow an additional 4000) [0]", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, (union psmi_envvar_val)0, &envvar_val); @@ -343,11 +391,12 @@ psm3_verbs_parse_params(psm2_ep_t ep) * otherwise ignored */ // RV defaults are sufficient for default PSM parameters - // but if user adjusts ep->hfi_num_send_rdma or mq->hfi_base_window_rv + // but if user adjusts ep->hfi_num_send_rdma or mq->ips_cpu_window_rv // they also need to increase the cache size. psm3_verbs_alloc_mr_cache // will verify cache size is sufficient. // min size is (HFI_TF_NFLOWS + ep->hfi_num_send_rdma) * - // chunk size (mq->hfi_base_window_rv after psm3_mq_initialize_params) + // chunk size (psm3_mq_max_window_rv(mq, 0) after + // psm3_mq_initialize_params) // for OPA native, actual window_rv may be smaller, but for UD it // is not reduced psm3_getenv("PSM3_RV_MR_CACHE_SIZE", @@ -358,12 +407,14 @@ psm3_verbs_parse_params(psm2_ep_t ep) (union psmi_envvar_val)0, &envvar_val); ep->rv_mr_cache_size = envvar_val.e_uint; // TBD - we could check cache_size >= minimum based on: - // (HFI_TF_NFLOWS + ep->hfi_num_send_rdma) * mq->hfi_base_window_rv + // (HFI_TF_NFLOWS + ep->hfi_num_send_rdma) + // * psm3_mq_max_window_rv(mq, 0) // and automatically increase with warning if not? #if defined(PSM_CUDA) || defined(PSM_ONEAPI) ep->rv_gpu_cache_size = psmi_parse_gpudirect_rv_gpu_cache_size(0); // TBD - we could check gpu_cache_size >= minimum based on: - // (HFI_TF_NFLOWS + ep->hfi_num_send_rdma) * mq->hfi_base_window_rv + // (HFI_TF_NFLOWS + ep->hfi_num_send_rdma) + // * psm3_mq_max_window_rv(mq, 1) // and automatically increase with warning if not? 
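/* Back-of-envelope sizing implied by the comments above (a sketch only, not
 * code from this patch): the MR cache should cover at least one rendezvous
 * window per concurrent inbound tidflow plus one per outstanding send RDMA,
 * where the window size now comes from psm3_mq_max_window_rv() (is_gpu = 0
 * for the CPU cache, 1 for the GPU cache):
 *
 *     uint64_t min_cache_bytes =
 *         (uint64_t)(HFI_TF_NFLOWS + ep->hfi_num_send_rdma)
 *             * psm3_mq_max_window_rv(mq, is_gpu);
 */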
#endif @@ -464,7 +515,7 @@ psm3_verbs_ips_proto_init(struct ips_proto *proto, uint32_t cksum_sz) ep->verbs_ep.send_reap_thresh = min(ep->verbs_ep.hfi_send_reap_thresh, ep->verbs_ep.send_pool.send_total/2); _HFI_PRDBG("reaping when %u posted.\n", ep->verbs_ep.send_reap_thresh); - if (PSM2_OK != psm_verbs_alloc_recv_pool(ep, ep->verbs_ep.qp, &ep->verbs_ep.recv_pool, + if (PSM2_OK != psm_verbs_alloc_recv_pool(ep, 0, ep->verbs_ep.qp, &ep->verbs_ep.recv_pool, min(ep->verbs_ep.hfi_num_recv_wqes, ep->verbs_ep.qp_cap.max_recv_wr), // want to end up with multiple of cache line (64) // ep->mtu+MAX_PSM_HEADERS will be power of 2 verbs MTU @@ -474,6 +525,25 @@ psm3_verbs_ips_proto_init(struct ips_proto *proto, uint32_t cksum_sz) _HFI_ERROR( "Unable to allocate UD recv buffer pool\n"); goto fail; } +#ifdef USE_RC + if (ep->verbs_ep.srq) { + if (PSM2_OK != psm_verbs_alloc_recv_pool(ep, 1, ep->verbs_ep.srq, &ep->verbs_ep.srq_recv_pool, + ep->verbs_ep.hfi_num_recv_wqes, + (proto->ep->rdmamode == IPS_PROTOEXP_FLAG_RDMA_USER)? 0 + // want to end up with multiple of cache line (64) + // ep->mtu+MAX_PSM_HEADERS will be power of 2 verbs MTU + // be conservative (+BUFFER_HEADROOM) + : (ep->mtu + MAX_PSM_HEADER + BUFFER_HEADROOM) + )) { + _HFI_ERROR( "Unable to allocate SRQ recv buffer pool\n"); + goto fail; + } + if (PSM2_OK != psm3_ep_verbs_prepost_recv(&ep->verbs_ep.srq_recv_pool)) { + _HFI_ERROR( "Unable to prepost recv buffers on SRQ for %s port %u\n", ep->dev_name, ep->portnum); + goto fail; + } + } +#endif /* USE_RC */ // no send segmentation, max_segs will constrain ep->chunk_max_segs = 1; @@ -515,6 +585,9 @@ psm3_verbs_ips_proto_init(struct ips_proto *proto, uint32_t cksum_sz) return PSM2_OK; fail: +#ifdef USE_RC + psm_verbs_free_recv_pool(&ep->verbs_ep.srq_recv_pool); +#endif psm_verbs_free_send_pool(&ep->verbs_ep.send_pool); psm_verbs_free_recv_pool(&ep->verbs_ep.recv_pool); return PSM2_INTERNAL_ERR; @@ -756,6 +829,13 @@ void psm3_ep_free_verbs(psm2_ep_t ep) psm3_rv_close(ep->rv); ep->rv = NULL; } +#endif +#ifdef USE_RC + if (ep->verbs_ep.srq) { + ibv_destroy_srq(ep->verbs_ep.srq); + ep->verbs_ep.srq = NULL; + } + psm_verbs_free_recv_pool(&ep->verbs_ep.srq_recv_pool); #endif if (ep->verbs_ep.pd) { ibv_dealloc_pd(ep->verbs_ep.pd); @@ -796,6 +876,16 @@ psm2_error_t psm_verbs_alloc_send_pool(psm2_ep_t ep, struct ibv_pd *pd, _HFI_ERROR( "can't alloc send buffers"); goto fail; } +#if defined(PSM_CUDA) && !defined(PSM3_NO_CUDA_REGISTER) + // By registering memory with Cuda, we make + // cuMemcpy run faster for copies from + // GPU to the send buffer. + if (PSMI_IS_GPU_ENABLED && check_have_cuda_ctxt()) + PSMI_CUDA_CALL(cuMemHostRegister, + pool->send_buffers, + pool->send_total*pool->send_buffer_size, + CU_MEMHOSTALLOC_PORTABLE); +#endif #if defined(PSM_ONEAPI) && !defined(PSM3_NO_ONEAPI_IMPORT) // By registering memory with Level Zero, we make // zeCommandListAppendMemoryCopy run faster for copies from @@ -860,13 +950,22 @@ extern psm2_error_t psm_verbs_init_send_allocator( // which are tracked in other structures but still part of the ep's memory stats // For RC QPs receiving only RDMA Write with immediate, no buffer space is // needed. Caller will specify recv_buffer_size==0 with a recv_total. 
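/* Usage sketch for the reworked allocator (it mirrors the calls added
 * elsewhere in this patch rather than introducing a new code path): for_srq
 * selects whether the untyped qp_srq handle is an ibv_qp or an ibv_srq, so
 * the same pool code can feed either a per-endpoint QP or the new shared
 * receive queue:
 *
 *     // UD QP pool, sized to the QP's receive depth
 *     psm_verbs_alloc_recv_pool(ep, 0, ep->verbs_ep.qp, &ep->verbs_ep.recv_pool,
 *             min(ep->verbs_ep.hfi_num_recv_wqes, ep->verbs_ep.qp_cap.max_recv_wr),
 *             ep->mtu + MAX_PSM_HEADER + BUFFER_HEADROOM);
 *
 *     // SRQ pool; buffer size 0 when only RDMA Write w/immediate arrives on it
 *     psm_verbs_alloc_recv_pool(ep, 1, ep->verbs_ep.srq, &ep->verbs_ep.srq_recv_pool,
 *             ep->verbs_ep.hfi_num_recv_wqes,
 *             (ep->rdmamode == IPS_PROTOEXP_FLAG_RDMA_USER) ? 0
 *                 : ep->mtu + MAX_PSM_HEADER + BUFFER_HEADROOM);
 *
 * The SRQ pool is then preposted with psm3_ep_verbs_prepost_recv(), which
 * routes to ibv_post_srq_recv() when pool->for_srq is set.
 */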
-psm2_error_t psm_verbs_alloc_recv_pool(psm2_ep_t ep, struct ibv_qp *qp, - psm3_verbs_recv_pool_t pool, +psm2_error_t psm_verbs_alloc_recv_pool(psm2_ep_t ep, uint32_t for_srq, + void *qp_srq, psm3_verbs_recv_pool_t pool, uint32_t recv_total, uint32_t recv_buffer_size) { memset(pool,0,sizeof(*pool)); - pool->qp = qp; // save a reference +#ifdef USE_RC + pool->for_srq = for_srq; + if (for_srq) + pool->srq = (struct ibv_srq *)qp_srq; // save a reference + else +#endif + pool->qp = (struct ibv_qp *)qp_srq; // save a reference +#ifndef USE_RC + psmi_assert(! for_srq); +#endif pool->ep = ep; pool->recv_total = recv_total; @@ -878,7 +977,11 @@ psm2_error_t psm_verbs_alloc_recv_pool(psm2_ep_t ep, struct ibv_qp *qp, // allocate recv buffers pool->recv_buffer_size = recv_buffer_size; // beginning of UD QP Recv Buf always consumed with space for IB GRH - if (qp->qp_type == IBV_QPT_UD) { + if ( +#ifdef USE_RC + ! pool->for_srq && +#endif + pool->qp->qp_type == IBV_QPT_UD) { // round up UD_ADDITION (40) to multiple of 64 for better // cache alignment of buffers pool->recv_buffer_size += ROUNDUP(UD_ADDITION, 64); @@ -892,6 +995,16 @@ psm2_error_t psm_verbs_alloc_recv_pool(psm2_ep_t ep, struct ibv_qp *qp, _HFI_ERROR( "can't alloc recv buffers"); goto fail; } +#if defined(PSM_CUDA) && !defined(PSM3_NO_CUDA_REGISTER) + // By registering memory with Cuda, we make + // cuMemcpy run faster for copies from + // recv buffer to GPU + if (PSMI_IS_GPU_ENABLED && check_have_cuda_ctxt()) + PSMI_CUDA_CALL(cuMemHostRegister, + pool->recv_buffers, + pool->recv_total*pool->recv_buffer_size, + CU_MEMHOSTALLOC_PORTABLE); +#endif #if defined(PSM_ONEAPI) && !defined(PSM3_NO_ONEAPI_IMPORT) // By registering memory with Level Zero, we make // zeCommandListAppendMemoryCopy run faster for copies from @@ -921,7 +1034,11 @@ psm2_error_t psm_verbs_alloc_recv_pool(psm2_ep_t ep, struct ibv_qp *qp, // UD doesn't support RDMA, so we just need local NIC to be able to // access our buffers with kernel bypass (IBV_ACCESS_LOCAL_WRITE) pool->recv_buffer_mr = ibv_reg_mr( - qp->pd, pool->recv_buffers, +#ifdef USE_RC + for_srq?pool->srq->pd: +#endif + pool->qp->pd, + pool->recv_buffers, pool->recv_total*pool->recv_buffer_size, IBV_ACCESS_LOCAL_WRITE); if (! pool->recv_buffer_mr) { @@ -932,7 +1049,7 @@ psm2_error_t psm_verbs_alloc_recv_pool(psm2_ep_t ep, struct ibv_qp *qp, } else { #ifdef USE_RC // we want a pool for RDMA Write w/immediate recv. 
No buffers - psmi_assert(qp->qp_type != IBV_QPT_UD); + psmi_assert(for_srq || pool->qp->qp_type != IBV_QPT_UD); // we use exactly 1 rbuf so wr_id can lead us to pool and qp pool->recv_bufs = (struct verbs_rbuf *)psmi_calloc(ep, NETWORK_BUFFERS, sizeof(struct verbs_rbuf), 1); @@ -989,10 +1106,37 @@ void psm_verbs_free_send_pool(psm3_verbs_send_pool_t pool) pool->send_bufs = NULL; } if (pool->send_buffers) { +#if defined(PSM_CUDA) && !defined(PSM3_NO_CUDA_REGISTER) + if (PSMI_IS_GPU_ENABLED && cu_ctxt) { + /* ignore NOT_REGISTERED in case cuda initialized late */ + /* ignore other errors as context could be destroyed before this */ + CUresult cudaerr; + //PSMI_CUDA_CALL_EXCEPT(CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED, + // cuMemHostUnregister, pool->send_buffers); + psmi_count_cuMemHostUnregister++; + cudaerr = psmi_cuMemHostUnregister(pool->send_buffers); + if (cudaerr) { + const char *pStr = NULL; + psmi_count_cuGetErrorString++; + psmi_cuGetErrorString(cudaerr, &pStr); + _HFI_DBG("CUDA failure: cuMemHostUnregister returned %d: %s\n", + cudaerr, pStr?pStr:"Unknown"); + } + + } +#endif #if defined(PSM_ONEAPI) && !defined(PSM3_NO_ONEAPI_IMPORT) - if (PSMI_IS_GPU_ENABLED) - PSMI_ONEAPI_ZE_CALL(zexDriverReleaseImportedPointer, - ze_driver, pool->send_buffers); + if (PSMI_IS_GPU_ENABLED) { + ze_result_t result; + //PSMI_ONEAPI_ZE_CALL(zexDriverReleaseImportedPointer, + // ze_driver, pool->send_buffers); + psmi_count_zexDriverReleaseImportedPointer++; + result = psmi_zexDriverReleaseImportedPointer(ze_driver, + pool->send_buffers); + if (result != ZE_RESULT_SUCCESS) { + _HFI_DBG("OneAPI Level Zero failure: zexDriverReleaseImportedPointer returned %d: %s\n", result, psmi_oneapi_ze_result_to_string(result)); + } + } #endif psmi_free(pool->send_buffers); pool->send_buffers = NULL; @@ -1014,10 +1158,36 @@ void psm_verbs_free_recv_pool(psm3_verbs_recv_pool_t pool) } #endif if (pool->recv_buffers) { +#if defined(PSM_CUDA) && !defined(PSM3_NO_CUDA_REGISTER) + if (PSMI_IS_GPU_ENABLED && cu_ctxt) { + /* ignore NOT_REGISTERED in case cuda initialized late */ + /* ignore other errors as context could be destroyed before this */ + CUresult cudaerr; + //PSMI_CUDA_CALL_EXCEPT(CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED, + // cuMemHostUnregister, pool->recv_buffers); + psmi_count_cuMemHostUnregister++; + cudaerr = psmi_cuMemHostUnregister(pool->recv_buffers); + if (cudaerr) { + const char *pStr = NULL; + psmi_count_cuGetErrorString++; + psmi_cuGetErrorString(cudaerr, &pStr); + _HFI_DBG("CUDA failure: cuMemHostUnregister returned %d: %s\n", + cudaerr, pStr?pStr:"Unknown"); + } + } +#endif #if defined(PSM_ONEAPI) && !defined(PSM3_NO_ONEAPI_IMPORT) - if (PSMI_IS_GPU_ENABLED) - PSMI_ONEAPI_ZE_CALL(zexDriverReleaseImportedPointer, - ze_driver, pool->recv_buffers); + if (PSMI_IS_GPU_ENABLED) { + ze_result_t result; + //PSMI_ONEAPI_ZE_CALL(zexDriverReleaseImportedPointer, + // ze_driver, pool->recv_buffers); + psmi_count_zexDriverReleaseImportedPointer++; + result = psmi_zexDriverReleaseImportedPointer(ze_driver, + pool->recv_buffers); + if (result != ZE_RESULT_SUCCESS) { + _HFI_DBG("OneAPI Level Zero failure: zexDriverReleaseImportedPointer returned %d: %s\n", result, psmi_oneapi_ze_result_to_string(result)); + } + } #endif psmi_free(pool->recv_buffers); pool->recv_buffers = NULL; @@ -1181,27 +1351,44 @@ psm2_error_t psm3_ep_verbs_post_recv( PSM3_FAULTINJ_STATIC_DECL(fi_rq_lkey, "rq_lkey", "post UD " #ifdef USE_RC - "or RC " + "or RC or SRQ " #endif "RQ WQE with bad lkey", 0, IPS_FAULTINJ_RQ_LKEY); - 
if_pf(PSM3_FAULTINJ_IS_FAULT(fi_rq_lkey, pool->ep, " QP %u", pool->qp->qp_num)) + // SRQ has no number but need consistency in fmt and number of args + if_pf(PSM3_FAULTINJ_IS_FAULT(fi_rq_lkey, pool->ep, +#ifdef USE_RC + "%s %u", pool->for_srq?"SRQ":"QP", pool->for_srq?0:pool->qp->qp_num)) +#else + " QP %u", pool->qp->qp_num)) +#endif wr->sg_list->lkey = 55; } else wr->sg_list->lkey = pool->recv_buffer_mr->lkey; #endif // PSM_FI if_pf (++pool->next_recv_wqe >= VERBS_RECV_QP_COALLESCE) { // we have a batch ready to post - if_pf (ibv_post_recv(pool->qp, pool->recv_wr_list, &bad_wr)) { - _HFI_ERROR("failed to post RQ on %s port %u: %s", pool->ep->dev_name, pool->ep->portnum, strerror(errno)); - return PSM2_INTERNAL_ERR; +#ifdef USE_RC + if (pool->for_srq) { + if_pf (ibv_post_srq_recv(pool->srq, pool->recv_wr_list, &bad_wr)) { + _HFI_ERROR("failed to post SRQ on %s port %u: %s", pool->ep->dev_name, pool->ep->portnum, strerror(errno)); + return PSM2_INTERNAL_ERR; + } + //_HFI_VDBG("posted SRQ, including buffer %u\n", index); + } else +#endif + { + if_pf (ibv_post_recv(pool->qp, pool->recv_wr_list, &bad_wr)) { + _HFI_ERROR("failed to post RQ on %s port %u: %s", pool->ep->dev_name, pool->ep->portnum, strerror(errno)); + return PSM2_INTERNAL_ERR; + } + //_HFI_VDBG("posted RQ, including buffer %u\n", index); } - //_HFI_VDBG("posted RQ, including buffer %u\n", index); pool->next_recv_wqe = 0; } else { //_HFI_VDBG("preped RQE, buffer %u\n", index); } -#else +#else /* VERBS_RECV_QP_COALLESCE > 1 */ list.addr = (uintptr_t)rbuf_to_buffer(buf); list.length = pool->recv_buffer_size; list.lkey = pool->recv_buffer_mr->lkey; @@ -1210,11 +1397,17 @@ psm2_error_t psm3_ep_verbs_post_recv( PSM3_FAULTINJ_STATIC_DECL(fi_rq_lkey, "rq_lkey", "post UD " #ifdef USE_RC - "or RC " + "or RC or SRQ" #endif "RQ WQE with bad lkey", 0, IPS_FAULTINJ_RQ_LKEY); - if_pf(PSM3_FAULTINJ_IS_FAULT(fi_rq_lkey, pool->ep, " QP %u", pool->qp->qp_num)) + // SRQ has no number but need consistency in fmt and number of args + if_pf(PSM3_FAULTINJ_IS_FAULT(fi_rq_lkey, pool->ep, +#ifdef USE_RC + "%s %u", pool->for_srq?"SRQ":"QP", pool->for_srq?0:pool->qp->qp_num)) +#else + " QP %u", pool->qp->qp_num)) +#endif list.lkey = 55; } #endif // PSM_FI @@ -1223,12 +1416,23 @@ psm2_error_t psm3_ep_verbs_post_recv( wr.sg_list = &list; wr.num_sge = 1; // size of sg_list - if_pf (ibv_post_recv(pool->qp, &wr, &bad_wr)) { - _HFI_ERROR("failed to post RQ on %s port %u: %s", pool->ep->dev_name, pool->ep->portnum, strerror(errno)); - return PSM2_INTERNAL_ERR; - } - //_HFI_VDBG("posted RQ, buffer %u\n", index); +#ifdef USE_RC + if (pool->for_srq) { + if_pf (ibv_post_srq_recv(pool->srq, &wr, &bad_wr)) { + _HFI_ERROR("failed to post SRQ on %s port %u: %s", pool->ep->dev_name, pool->ep->portnum, strerror(errno)); + return PSM2_INTERNAL_ERR; + } + //_HFI_VDBG("posted SRQ, buffer %u\n", index); + } else #endif + { + if_pf (ibv_post_recv(pool->qp, &wr, &bad_wr)) { + _HFI_ERROR("failed to post RQ on %s port %u: %s", pool->ep->dev_name, pool->ep->portnum, strerror(errno)); + return PSM2_INTERNAL_ERR; + } + //_HFI_VDBG("posted RQ, buffer %u\n", index); + } +#endif /* VERBS_RECV_QP_COALLESCE > 1 */ #ifdef USE_RC } else { #if VERBS_RECV_QP_COALLESCE > 1 @@ -1238,27 +1442,43 @@ psm2_error_t psm3_ep_verbs_post_recv( wr->wr_id = (uintptr_t)buf; // we'll get this back in completion if_pf (++pool->next_recv_wqe >= VERBS_RECV_QP_COALLESCE) { // we have a batch ready to post - if_pf (ibv_post_recv(pool->qp, pool->recv_wr_list, &bad_wr)) { - _HFI_ERROR("failed to post RQ on %s on 
port %u: %s", pool->ep->dev_name, pool->ep->portnum, strerror(errno)); - return PSM2_INTERNAL_ERR; + if (pool->for_srq) { + if_pf (ibv_post_srq_recv(pool->srq, pool->recv_wr_list, &bad_wr)) { + _HFI_ERROR("failed to post SRQ on %s port %u: %s", pool->ep->dev_name, pool->ep->portnum, strerror(errno)); + return PSM2_INTERNAL_ERR; + } + //_HFI_VDBG("posted SRQ\n"); + } else { + if_pf (ibv_post_recv(pool->qp, pool->recv_wr_list, &bad_wr)) { + _HFI_ERROR("failed to post RQ on %s on port %u: %s", pool->ep->dev_name, pool->ep->portnum, strerror(errno)); + return PSM2_INTERNAL_ERR; + } + //_HFI_VDBG("posted RQ\n"); } - //_HFI_VDBG("posted RQ\n"); pool->next_recv_wqe = 0; } else { //_HFI_VDBG("preped RQE\n"); } -#else +#else /* VERBS_RECV_QP_COALLESCE > 1 */ wr.next = NULL; // just post 1 wr.wr_id = (uintptr_t)buf; // we'll get this back in completion wr.sg_list = NULL; wr.num_sge = 0; // size of sg_list - if_pf (ibv_post_recv(pool->qp, &wr, &bad_wr)) { - _HFI_ERROR("failed to post RQ on %s on port %u: %s", pool->ep->dev_name, pool->ep->portnum, strerror(errno)); - return PSM2_INTERNAL_ERR; + if (pool->for_srq) { + if_pf (ibv_post_srq_recv(pool->srq, &wr, &bad_wr)) { + _HFI_ERROR("failed to post SRQ on %s port %u: %s", pool->ep->dev_name, pool->ep->portnum, strerror(errno)); + return PSM2_INTERNAL_ERR; + } + //_HFI_VDBG("posted SRQ\n"); + } else { + if_pf (ibv_post_recv(pool->qp, &wr, &bad_wr)) { + _HFI_ERROR("failed to post RQ on %s on port %u: %s", pool->ep->dev_name, pool->ep->portnum, strerror(errno)); + return PSM2_INTERNAL_ERR; + } + //_HFI_VDBG("posted RQ\n"); } - //_HFI_VDBG("posted RQ\n"); -#endif +#endif /* VERBS_RECV_QP_COALLESCE > 1 */ #endif // USE_RC } return PSM2_OK; @@ -2333,12 +2553,15 @@ static struct ibv_qp* ud_qp_create(psm2_ep_t ep) attr.qp_type = IBV_QPT_UD; qp = ibv_create_qp(ep->verbs_ep.pd, &attr); - if (qp == NULL && errno == ENOMEM) { + if (qp == NULL) { _HFI_ERROR( "Unable to create UD QP on %s: %s\n", ep->dev_name, strerror(errno)); - _HFI_ERROR( "Requested QP size might be too big. Try reducing TX depth and/or inline size.\n"); - _HFI_ERROR( "Requested TX depth was %u and RX depth was %u .\n", + if (errno == ENOMEM) { + _HFI_ERROR( "Requested QP size might be too big. 
Try reducing TX depth and/or inline size.\n"); + _HFI_ERROR( "Requested TX depth was %u and RX depth was %u .\n", ep->verbs_ep.hfi_num_send_wqes+1, ep->verbs_ep.hfi_num_recv_wqes); + } + return NULL; } // attr reports what we got, double check and react in case @@ -2437,7 +2660,7 @@ struct ibv_qp* rc_qp_create(psm2_ep_t ep, void *context, struct ibv_qp_cap *cap) attr.qp_context = context; attr.send_cq = ep->verbs_ep.send_cq; attr.recv_cq = ep->verbs_ep.recv_cq; - attr.srq = NULL; + attr.srq = ep->verbs_ep.srq; // one extra WQE to be safe in case verbs needs a spare WQE if ((ep->rdmamode&IPS_PROTOEXP_FLAG_RDMA_MASK) == IPS_PROTOEXP_FLAG_RDMA_USER_RC) { // need to be prepared in case all sends posted to same RC QP, so @@ -2445,10 +2668,9 @@ struct ibv_qp* rc_qp_create(psm2_ep_t ep, void *context, struct ibv_qp_cap *cap) attr.cap.max_send_wr = ep->verbs_ep.hfi_num_send_wqes+ep->hfi_num_send_rdma+1; attr.cap.max_send_sge = 2; // inline data helps latency and message rate for small sends - // Later we may explore use of - // send SGEs pointing to application buffers, somewhat like WFR send DMA attr.cap.max_inline_data = ep->hfi_imm_size; - attr.cap.max_recv_wr = ep->verbs_ep.hfi_num_recv_wqes/VERBS_RECV_QP_FRACTION;// TBD + attr.cap.max_recv_wr = ep->verbs_ep.srq?0 + :(ep->verbs_ep.hfi_num_recv_wqes/VERBS_RECV_QP_FRACTION);// TBD attr.cap.max_recv_sge = 1; } else { // only RDMA Write w/immediate @@ -2456,7 +2678,7 @@ struct ibv_qp* rc_qp_create(psm2_ep_t ep, void *context, struct ibv_qp_cap *cap) attr.cap.max_send_sge = 1; attr.cap.max_inline_data = 0; // incoming Write w/immediate consumes a RQ WQE but no buffer needed - attr.cap.max_recv_wr = HFI_TF_NFLOWS+1; + attr.cap.max_recv_wr = ep->verbs_ep.srq?0:(HFI_TF_NFLOWS+1); attr.cap.max_recv_sge = 0; } @@ -2467,9 +2689,16 @@ struct ibv_qp* rc_qp_create(psm2_ep_t ep, void *context, struct ibv_qp_cap *cap) _HFI_ERROR( "Unable to create RC QP on %s: %s\n", ep->dev_name, strerror(errno)); _HFI_ERROR( "Requested QP size might be too big. Try reducing TX depth and/or inline size.\n"); - _HFI_ERROR( "Requested TX depth was %u and RX depth was %u .\n", - ep->verbs_ep.hfi_num_send_wqes+1, - ep->verbs_ep.hfi_num_recv_wqes/VERBS_RECV_QP_FRACTION); + if ((ep->rdmamode&IPS_PROTOEXP_FLAG_RDMA_MASK) == IPS_PROTOEXP_FLAG_RDMA_USER_RC) { + _HFI_ERROR( "Requested TX depth was %u and RX depth was %u .\n", + ep->verbs_ep.hfi_num_send_wqes+ep->hfi_num_send_rdma+1, + ep->verbs_ep.srq?0 + :(ep->verbs_ep.hfi_num_recv_wqes/VERBS_RECV_QP_FRACTION)); + } else { + _HFI_ERROR( "Requested TX depth was %u and RX depth was %u .\n", + ep->hfi_num_send_rdma+1, + ep->verbs_ep.srq?0:(HFI_TF_NFLOWS+1)); + } return NULL; } @@ -2492,7 +2721,8 @@ struct ibv_qp* rc_qp_create(psm2_ep_t ep, void *context, struct ibv_qp_cap *cap) _HFI_PRDBG( "Limited to %d SQ SGEs\n", attr.cap.max_send_sge); } - if (ep->verbs_ep.hfi_num_recv_wqes/VERBS_RECV_QP_FRACTION > attr.cap.max_recv_wr) { + if (! ep->verbs_ep.srq + && ep->verbs_ep.hfi_num_recv_wqes/VERBS_RECV_QP_FRACTION > attr.cap.max_recv_wr) { _HFI_PRDBG( "Limited to %d RQ WQEs, requested %u\n", attr.cap.max_recv_wr, ep->verbs_ep.hfi_num_recv_wqes/VERBS_RECV_QP_FRACTION); } else { @@ -2514,7 +2744,8 @@ struct ibv_qp* rc_qp_create(psm2_ep_t ep, void *context, struct ibv_qp_cap *cap) _HFI_PRDBG( "Limited to %d SQ SGEs\n", attr.cap.max_send_sge); } - if (HFI_TF_NFLOWS+1 > attr.cap.max_recv_wr) { + if (! 
ep->verbs_ep.srq + && HFI_TF_NFLOWS+1 > attr.cap.max_recv_wr) { _HFI_PRDBG( "Limited to %d RQ WQEs, requested %u\n", attr.cap.max_recv_wr, HFI_TF_NFLOWS+1); } else { @@ -2848,7 +3079,7 @@ psm3_dump_verbs_qp(struct ibv_qp *qp) printf("QP %p (%u), type %u state %u PkeyIndx %u Port %u draining %u\n", qp, qp->qp_num, qp->qp_type, attr.qp_state, attr.pkey_index, attr.port_num, attr.sq_draining); - printf(" send: wr %u sge %u inline %u recv: wr %u sqe %u\n", + printf(" send: wr %u sge %u inline %u recv: wr %u sge %u\n", attr.cap.max_send_wr, attr.cap.max_send_sge, attr.cap.max_inline_data, attr.cap.max_recv_wr, attr.cap.max_recv_sge); printf(" context %p send_cq %p recv_cq %p srq %p sg_sig_all %u\n", @@ -2906,6 +3137,7 @@ static enum psm3_ibv_rate verbs_get_rate(uint8_t width, uint8_t speed) case 16: return PSM3_IBV_RATE_14_GBPS; case 32: return PSM3_IBV_RATE_25_GBPS; case 64: return PSM3_IBV_RATE_50_GBPS; + case 128: return PSM3_IBV_RATE_100_GBPS; default: _HFI_ERROR( "unknown link speed 0x%x\n", speed); return PSM3_IBV_RATE_100_GBPS; @@ -2919,6 +3151,7 @@ static enum psm3_ibv_rate verbs_get_rate(uint8_t width, uint8_t speed) case 16: return PSM3_IBV_RATE_56_GBPS; case 32: return PSM3_IBV_RATE_100_GBPS; case 64: return PSM3_IBV_RATE_200_GBPS; + case 128: return PSM3_IBV_RATE_400_GBPS; default: _HFI_ERROR( "unknown link speed 0x%x\n", speed); return PSM3_IBV_RATE_100_GBPS; @@ -2932,6 +3165,7 @@ static enum psm3_ibv_rate verbs_get_rate(uint8_t width, uint8_t speed) case 16: return PSM3_IBV_RATE_112_GBPS; case 32: return PSM3_IBV_RATE_200_GBPS; case 64: return PSM3_IBV_RATE_400_GBPS; + case 128: return PSM3_IBV_RATE_800_GBPS; default: _HFI_ERROR( "unknown link speed 0x%x\n", speed); return PSM3_IBV_RATE_100_GBPS; @@ -2945,6 +3179,7 @@ static enum psm3_ibv_rate verbs_get_rate(uint8_t width, uint8_t speed) case 16: return PSM3_IBV_RATE_168_GBPS; case 32: return PSM3_IBV_RATE_300_GBPS; case 64: return PSM3_IBV_RATE_600_GBPS; + case 128: return PSM3_IBV_RATE_1200_GBPS; default: _HFI_ERROR( "unknown link speed 0x%x\n", speed); return PSM3_IBV_RATE_100_GBPS; @@ -2958,6 +3193,7 @@ static enum psm3_ibv_rate verbs_get_rate(uint8_t width, uint8_t speed) case 16: return PSM3_IBV_RATE_28_GBPS; case 32: return PSM3_IBV_RATE_50_GBPS; case 64: return PSM3_IBV_RATE_100_GBPS; + case 128: return PSM3_IBV_RATE_200_GBPS; default: _HFI_ERROR( "unknown link speed 0x%x\n", speed); return PSM3_IBV_RATE_100_GBPS; diff --git a/prov/psm3/psm3/hal_verbs/verbs_ep.h b/prov/psm3/psm3/hal_verbs/verbs_ep.h index 8874831f3b5..c1da6b73e53 100644 --- a/prov/psm3/psm3/hal_verbs/verbs_ep.h +++ b/prov/psm3/psm3/hal_verbs/verbs_ep.h @@ -161,12 +161,14 @@ struct verbs_rbuf { typedef struct verbs_rbuf *rbuf_t; #define rbuf_to_buffer(buf) ((buf)->buffer) #define rbuf_addition(buf) ((buf)->pool->addition) -#define rbuf_qp(ep, buf) ((buf)->pool->qp) +#define rbuf_qp_context(ep, buf) ((buf)->pool->for_srq?NULL:(buf)->pool->qp->qp_context) +#define rbuf_qp_type_str(ep, buf) ((buf)->pool->for_srq?"SRQ":qp_type_str((buf)->pool->qp)) #else typedef uint8_t *rbuf_t; #define rbuf_to_buffer(buf) (buf) #define rbuf_addition(buf) (UD_ADDITION) -#define rbuf_qp(ep, buf) ((ep)->verbs_ep.recv_pool.qp) +#define rbuf_qp_context(ep, buf) ((ep)->verbs_ep.recv_pool.qp->qp_context) +#define rbuf_qp_type_str(ep, buf) (qp_type_str((ep)->verbs_ep.recv_pool.qp)) #endif static inline const char*qp_type_str(struct ibv_qp *qp) { @@ -255,7 +257,12 @@ typedef struct psm3_verbs_send_allocator *psm3_verbs_send_allocator_t; // but sizes may differ // when USE_RC, we need a 
separate recv pool per QP so we can prepost bufs. struct psm3_verbs_recv_pool { - struct ibv_qp *qp; // secondary reference to QP these buffers are for + union { // secondary reference to QP or SRQ these buffers are for + struct ibv_qp *qp; // when ! for_srq +#ifdef USE_RC + struct ibv_srq *srq; // when for_srq +#endif + }; psm2_ep_t ep; // our preregistered recv buffers uint32_t recv_buffer_size; @@ -264,6 +271,7 @@ struct psm3_verbs_recv_pool { struct ibv_mr *recv_buffer_mr; #ifdef USE_RC uint32_t addition; // UD_ADDITION for UD QP, 0 for RC QP + uint32_t for_srq; // if this for an SRQ or QP? #endif #if VERBS_RECV_QP_COALLESCE > 1 // list of ready to post WQEs and SGEs @@ -296,6 +304,9 @@ struct psm3_verbs_ep { struct ibv_cq *recv_cq; struct ibv_qp *qp; struct ibv_qp_cap qp_cap; // capabilities of QP we got +#ifdef USE_RC + struct ibv_srq *srq; +#endif uint32_t qkey; //uint8_t link_layer; // IBV_LINK_LAYER_ETHERNET or other uint8_t active_rate; @@ -309,6 +320,9 @@ struct psm3_verbs_ep { int recv_wc_count; // number left in recv_wc_list int recv_wc_next; // next index #else +#ifdef USE_RC + struct psm3_verbs_recv_pool srq_recv_pool; +#endif // if asked to revisit a packet we save it here rbuf_t revisit_buf; uint32_t revisit_payload_size; @@ -385,8 +399,8 @@ extern psm2_error_t psm_verbs_alloc_send_pool(psm2_ep_t ep, struct ibv_pd *pd, extern psm2_error_t psm_verbs_init_send_allocator( psm3_verbs_send_allocator_t allocator, psm3_verbs_send_pool_t pool); -extern psm2_error_t psm_verbs_alloc_recv_pool(psm2_ep_t ep, struct ibv_qp *qp, - psm3_verbs_recv_pool_t pool, +extern psm2_error_t psm_verbs_alloc_recv_pool(psm2_ep_t ep, uint32_t for_srq, + void *qp_srq, psm3_verbs_recv_pool_t pool, uint32_t recv_total, uint32_t recv_buffer_size); extern void psm_verbs_free_send_pool(psm3_verbs_send_pool_t pool); extern void psm_verbs_free_recv_pool(psm3_verbs_recv_pool_t pool); diff --git a/prov/psm3/psm3/hal_verbs/verbs_hal.c b/prov/psm3/psm3/hal_verbs/verbs_hal.c index 4f6bfb742ef..9575b316ff2 100644 --- a/prov/psm3/psm3/hal_verbs/verbs_hal.c +++ b/prov/psm3/psm3/hal_verbs/verbs_hal.c @@ -166,21 +166,17 @@ static void psm3_hfp_verbs_mq_init_defaults(struct psm2_mq *mq) * Otherwise these defaults are used. */ unsigned rdmamode = psm3_verbs_parse_rdmamode(1); - mq->hfi_thresh_rv = 64000; - mq->hfi_base_window_rv = 131072; + mq->hfi_thresh_rv = PSM_MQ_NIC_RNDV_THRESH; + mq->ips_cpu_window_rv_str = PSM_CPU_NIC_RNDV_WINDOW_STR; if (! (rdmamode & IPS_PROTOEXP_FLAG_ENABLED)) { // TBD - when RDMA is disabled do we want to disable rendezvous? 
// even without RDMA, the receiver controlled pacing helps scalability mq->hfi_thresh_rv = (~(uint32_t)0); // disable rendezvous } mq->hfi_thresh_tiny = PSM_MQ_NIC_MAX_TINY; -#ifdef PSM_CUDA - if (PSMI_IS_GPU_ENABLED) - mq->hfi_base_window_rv = 2097152; -#endif -#ifdef PSM_ONEAPI +#if defined(PSM_CUDA) || defined(PSM_ONEAPI) if (PSMI_IS_GPU_ENABLED) - mq->hfi_base_window_rv = 512*1024; + mq->ips_gpu_window_rv_str = PSM_GPU_NIC_RNDV_WINDOW_STR; #endif // we parse mr_cache_mode and rv_gpu_cache_size here so we can cache it // once per EP open, even if multi-rail or multi-QP diff --git a/prov/psm3/psm3/hal_verbs/verbs_hal_inline_i.h b/prov/psm3/psm3/hal_verbs/verbs_hal_inline_i.h index 4f2df710571..2ba92503e9f 100644 --- a/prov/psm3/psm3/hal_verbs/verbs_hal_inline_i.h +++ b/prov/psm3/psm3/hal_verbs/verbs_hal_inline_i.h @@ -287,29 +287,33 @@ static PSMI_HAL_INLINE psm2_error_t psm3_hfp_verbs_ips_ipsaddr_set_req_params( //ipsaddr->verbs.rc_qp = NULL; } else { // we got a REQ or a REP, we can move to RTR - // if we are only doing RDMA, we don't need any buffers, but we need a - // pool object for RQ coallesce, so we create a pool with 0 size buffers - if (PSM2_OK != psm_verbs_alloc_recv_pool(proto->ep, ipsaddr->verbs.rc_qp, &ipsaddr->verbs.recv_pool, - min(proto->ep->verbs_ep.hfi_num_recv_wqes/VERBS_RECV_QP_FRACTION, ipsaddr->verbs.rc_qp_max_recv_wr), - (proto->ep->rdmamode == IPS_PROTOEXP_FLAG_RDMA_USER)? 0 - // want to end up with multiple of cache line (64) - // pr_mtu is negotiated max PSM payload, not including hdrs - // pr_mtu+MAX_PSM_HEADERS will be power of 2 verbs MTU - // be conservative (+BUFFER_HEADROOM) - : ipsaddr->pathgrp->pg_path[0][IPS_PATH_LOW_PRIORITY]->pr_mtu - + MAX_PSM_HEADER + BUFFER_HEADROOM - )) { - _HFI_ERROR("failed to alloc RC recv buffers\n"); - return PSM2_INTERNAL_ERR; + if (! proto->ep->verbs_ep.srq) { + // if we are only doing RDMA, we don't need any buffers, but we need a + // pool object for RQ coallesce, so we create a pool with 0 size buffers + if (PSM2_OK != psm_verbs_alloc_recv_pool(proto->ep, 0, ipsaddr->verbs.rc_qp, &ipsaddr->verbs.recv_pool, + min(proto->ep->verbs_ep.hfi_num_recv_wqes/VERBS_RECV_QP_FRACTION, ipsaddr->verbs.rc_qp_max_recv_wr), + (proto->ep->rdmamode == IPS_PROTOEXP_FLAG_RDMA_USER)? 0 + // want to end up with multiple of cache line (64) + // pr_mtu is negotiated max PSM payload, not including hdrs + // pr_mtu+MAX_PSM_HEADERS will be power of 2 verbs MTU + // be conservative (+BUFFER_HEADROOM) + : ipsaddr->pathgrp->pg_path[0][IPS_PATH_LOW_PRIORITY]->pr_mtu + + MAX_PSM_HEADER + BUFFER_HEADROOM + )) { + _HFI_ERROR("failed to alloc RC recv buffers\n"); + return PSM2_INTERNAL_ERR; + } } if (modify_rc_qp_to_init(proto->ep, ipsaddr->verbs.rc_qp)) { _HFI_ERROR("qp_to_init failed\n"); return PSM2_INTERNAL_ERR; } - if (PSM2_OK != psm3_ep_verbs_prepost_recv(&ipsaddr->verbs.recv_pool)) { - _HFI_ERROR("prepost failed\n"); - return PSM2_INTERNAL_ERR; + if (! 
proto->ep->verbs_ep.srq) { + if (PSM2_OK != psm3_ep_verbs_prepost_recv(&ipsaddr->verbs.recv_pool)) { + _HFI_ERROR("prepost failed\n"); + return PSM2_INTERNAL_ERR; + } } // RC QP MTU will be set to min of req->verbs.qp_attr and pr_mtu // TBD - we already factored in req vs pr to update pr no need diff --git a/prov/psm3/psm3/hal_verbs/verbs_recvhdrq.c b/prov/psm3/psm3/hal_verbs/verbs_recvhdrq.c index eebcac2e5da..f38aa505fc8 100644 --- a/prov/psm3/psm3/hal_verbs/verbs_recvhdrq.c +++ b/prov/psm3/psm3/hal_verbs/verbs_recvhdrq.c @@ -278,7 +278,7 @@ psm2_error_t psm3_verbs_recvhdrq_progress(struct ips_recvhdrq *recvq) // wc.byte_len is len of inbound rdma write not including immed // wc.qp_num - local QP ips_protoexp_handle_immed_data(rcv_ev.proto, - (uint64_t)(rbuf_qp(ep, buf)->qp_context), + (uint64_t)(rbuf_qp_context(ep, buf)), RDMA_IMMED_USER_RC, WC(imm_data), WC(byte_len)); goto repost; break; @@ -310,7 +310,7 @@ psm2_error_t psm3_verbs_recvhdrq_progress(struct ips_recvhdrq *recvq) } rcv_ev.p_hdr = (struct ips_message_header *)(rbuf_to_buffer(buf)+rbuf_addition(buf)); rcv_ev.payload = (rbuf_to_buffer(buf) + rbuf_addition(buf) + sizeof(struct ips_message_header)); - _HFI_VDBG("%s receive - opcode %x\n", qp_type_str(rbuf_qp(ep, buf)), + _HFI_VDBG("%s receive - opcode %x\n", rbuf_qp_type_str(ep, buf), _get_proto_hfi_opcode(rcv_ev.p_hdr)); PSM2_LOG_PKT_STRM(PSM2_LOG_RX,rcv_ev.p_hdr,"PKT_STRM:"); diff --git a/prov/psm3/psm3/include/utils_debug.h b/prov/psm3/psm3/include/utils_debug.h index 499f1a41699..b7b6655f2e6 100644 --- a/prov/psm3/psm3/include/utils_debug.h +++ b/prov/psm3/psm3/include/utils_debug.h @@ -202,6 +202,14 @@ extern void psm3_dump_gpu_buf(uint8_t *buf, uint32_t len); } \ } while (0) +#define _HFI_ENV_ERROR(fmt, ...) \ + do { \ + _Pragma_unlikely \ + if (unlikely(psm3_dbgmask&__HFI_INFO)) { \ + printf("%s: env " fmt, psm3_mylabel, ##__VA_ARGS__); \ + } \ + } while (0) + #define __HFI_PKTDBG_ON unlikely(psm3_dbgmask & __HFI_PKTDBG) #define __HFI_DBG_WHICH(which, fmt, ...) \ @@ -218,8 +226,7 @@ extern void psm3_dump_gpu_buf(uint8_t *buf, uint32_t len); do { \ _Pragma_unlikely \ if (unlikely(psm3_dbgmask&(which))) { \ - PSM3_GETTIME \ - fprintf(psm3_dbgout, PSM3_TIME_FMT "%s: " fmt, PSM3_TIME_ARG, psm3_mylabel, \ + fprintf(psm3_dbgout, "%s: " fmt, psm3_mylabel, \ ##__VA_ARGS__); \ } \ } while (0) @@ -291,6 +298,8 @@ extern void psm3_dump_gpu_buf(uint8_t *buf, uint32_t len); #define _HFI_INFO(fmt, ...) +#define _HFI_ENV_ERROR(fmt, ...) + #define __HFI_PKTDBG_ON 0 #define _HFI_DBG(fmt, ...) diff --git a/prov/psm3/psm3/include/utils_env.h b/prov/psm3/psm3/include/utils_env.h index 5e18975a36b..d95660f6e01 100644 --- a/prov/psm3/psm3/include/utils_env.h +++ b/prov/psm3/psm3/include/utils_env.h @@ -55,6 +55,7 @@ #define UTILS_ENV_H #include "psm2_mock_testing.h" +#include "fnmatch.h" /* we can only include low level headers here because this is * #included by utils_sysfs.c. 
Can't pull in HAL headers or heap debug macros @@ -81,21 +82,37 @@ union psmi_envvar_val { unsigned long long e_ulonglong; }; -#define PSMI_ENVVAR_LEVEL_USER 1 -#define PSMI_ENVVAR_LEVEL_HIDDEN 2 -#define PSMI_ENVVAR_LEVEL_NEVER_PRINT 4 - -#define PSMI_ENVVAR_TYPE_YESNO 0 -#define PSMI_ENVVAR_TYPE_STR 1 -#define PSMI_ENVVAR_TYPE_INT 2 -#define PSMI_ENVVAR_TYPE_UINT 3 -#define PSMI_ENVVAR_TYPE_UINT_FLAGS 4 -#define PSMI_ENVVAR_TYPE_LONG 5 -#define PSMI_ENVVAR_TYPE_ULONG 6 -#define PSMI_ENVVAR_TYPE_ULONG_FLAGS 7 -#define PSMI_ENVVAR_TYPE_ULONG_ULONG 8 -#define PSMI_ENVVAR_TYPE_STR_VAL_PAT 9 -#define PSMI_ENVVAR_TYPE_STR_TUPLES 10 +// psm3_getenv only expects LEVEL +// psm3_getenv_range accepts LEVEL and FLAGs +// MIN/MAX N/A to TYPEs: YESNO, STR, STR_VAL_PAT_*, STR_TUPLES +// 'min' and 'max' only allowed as input when corresponding +// range check enabled +// FLAG_FATAL will cause a fatal error on invalid input +// (syntax, range or check function detected). When FLAG_FATAL is not +// set an invalid input will fallback to the default with a message. +#define PSMI_ENVVAR_LEVEL_USER 1 // show in user help +#define PSMI_ENVVAR_LEVEL_HIDDEN 2 // hidden from user help +#define PSMI_ENVVAR_LEVEL_NEVER_PRINT 4 // a bit flag, never show in help +#define PSMI_ENVVAR_LEVEL_MASK 0x07 // mask for getting level +#define PSMI_ENVVAR_FLAG_NOMIN 0x10 // no min check +#define PSMI_ENVVAR_FLAG_NOMAX 0x20 // no max check +#define PSMI_ENVVAR_FLAG_NOABBREV 0x40 // no 'min' or 'max' as input +#define PSMI_ENVVAR_FLAG_NOMIN_NOMAX 0x70 // no min, no max, no abbrev +#define PSMI_ENVVAR_FLAG_FATAL 0x80 // invalid input is fatal + +#define PSMI_ENVVAR_TYPE_YESNO 0 +#define PSMI_ENVVAR_TYPE_STR 1 +#define PSMI_ENVVAR_TYPE_INT 2 +#define PSMI_ENVVAR_TYPE_UINT 3 +#define PSMI_ENVVAR_TYPE_UINT_FLAGS 4 +#define PSMI_ENVVAR_TYPE_LONG 5 +#define PSMI_ENVVAR_TYPE_ULONG 6 +#define PSMI_ENVVAR_TYPE_ULONG_FLAGS 7 +#define PSMI_ENVVAR_TYPE_ULONG_ULONG 8 +#define PSMI_ENVVAR_TYPE_STR_VAL_PAT_INT 9 +#define PSMI_ENVVAR_TYPE_STR_VAL_PAT_UINT 10 +#define PSMI_ENVVAR_TYPE_STR_VAL_PAT_UINT_FLAGS 11 +#define PSMI_ENVVAR_TYPE_STR_TUPLES 12 #define PSMI_ENVVAR_VAL_YES ((union psmi_envvar_val) 1) #define PSMI_ENVVAR_VAL_NO ((union psmi_envvar_val) 0) @@ -105,43 +122,82 @@ void psm3_env_print_val(FILE *f, const char *name, int type, int psm3_env_snprint_val(char *buf, size_t size, const char *name, int type, union psmi_envvar_val val); +// psm3_getenv_check_t is optional in psm3_getenv_range +// to confirm the resulting value is valid (return of 0). +// On error (return != 0), errstr[errstr_size] is filled in with +// '\0' terminated string with more information about the error. +// +// This may be used for any envvar type to do further checks of the value +// such as integers which may need to be power of 2 or parse checking +// of strings. +// For strings parsed value(s) is not returned, so caller will need to parse +// again, but this allows better error reporting during env variable get. +// +// ptr is caller specific and can pass additional input information which may +// assist in verification of values. ptr should be used as input only +// because the check function is only called by psm3_getenv_range when +// otherwise valid input is supplied. 
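/* Illustrative sketch, not part of this patch: a check callback matching the
 * psm3_getenv_check_t shape declared below. It rejects values that are not a
 * power of 2; a non-zero return makes psm3_getenv_range() fall back to the
 * default with a message (or treat the input as fatal when
 * PSMI_ENVVAR_FLAG_FATAL is set). The function name is made up for this example.
 */
static inline int example_check_pow2(int type, const union psmi_envvar_val val,
				void *ptr, size_t errstr_size, char errstr[])
{
	(void)ptr;				/* caller context unused in this sketch */
	psmi_assert(type == PSMI_ENVVAR_TYPE_UINT);
	if (val.e_uint == 0 || (val.e_uint & (val.e_uint - 1))) {
		if (errstr_size)
			snprintf(errstr, errstr_size, " must be a power of 2");
		return -2;			/* non-zero -> invalid value */
	}
	return 0;				/* valid */
}
/* example_check_pow2 would be supplied as the 'check' argument of
 * psm3_getenv_range() declared below, with 'ptr' forwarded to it unchanged. */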
+typedef int (*psm3_getenv_check_t)(int type, const union psmi_envvar_val val, + void *ptr, size_t errstr_size, char errstr[]); + int MOCKABLE(psm3_getenv)(const char *name, const char *descr, int level, int type, union psmi_envvar_val defval, union psmi_envvar_val *newval); MOCK_DCL_EPILOGUE(psm3_getenv); -/* - * Parsing int and unsigned int parameters - * 0 -> ok, *val updated - * -1 -> empty string - * -2 -> parse error - */ -int psm3_parse_str_int(const char *string, int *val); -int psm3_parse_str_uint(const char *string, unsigned int *val); +int MOCKABLE(psm3_getenv_range)(const char *name, const char *descr, + const char *help, unsigned level_flags, + int type, union psmi_envvar_val defval, union psmi_envvar_val min, + union psmi_envvar_val max, psm3_getenv_check_t check, void *ptr, + union psmi_envvar_val *newval); +MOCK_DCL_EPILOGUE(psm3_getenv_range); /* - * Parse long parameters - * -1 -> empty string - * -2 -> parse error + * Parsing int, unsigned int and long parameters + * 0 -> ok, *val updated + * -1 -> empty string, *val not updated + * -2 -> parse error, *val not updated */ -long psm3_parse_str_long(const char *str); +int psm3_parse_str_int(const char *string, int *val, int min, int max); +int psm3_parse_str_uint(const char *string, unsigned int *val, + unsigned int min, unsigned int max); +int psm3_parse_str_long(const char *str, long *val, long min, long max); /* * Parsing yesno parameters * allows: yes/no, true/false, on/off, 1/0 - * -1 -> empty string - * -2 -> parse error + * 0 -> ok, *val updated + * -1 -> empty string, *val not updated + * -2 -> parse error, *val not updated */ -int psm3_parse_str_yesno(const char *str); +int psm3_parse_str_yesno(const char *str, int *val); /* * Parsing int parameters set in string tuples. + * Returns: + * 0 - parsed with no errors, vals[] updated + * -1 - empty or NULL string, vals[] unchanged + * -2 - syntax error in one of more of the parameters + * parameters with syntax errors are unchanged, others without + * syntax errors are updated in vals[] */ int psm3_parse_str_tuples(const char *str, int ntup, int *vals); -/* parse env of the form 'val' or 'val:' or 'val:pattern' */ -int psm3_parse_val_pattern(const char *env, unsigned def, unsigned *val); +/* parse env of the form 'val' or 'val:' or 'val:pattern' + * Returns: + * 0 - parsed and matches current process, *val set to parsed val + * 0 - parsed and doesn't match current process, *val set to def + * -1 - nothing provided, *val set to def + * -2 - syntax error, *val set to def + * flags PSMI_ENVVAR_FLAG_NOMIN, PSMI_ENVVAR_FLAG_NOMAX and + * PSMI_ENVVAR_FLAG_NOABBREV control if 'min', 'minimum', 'max' or 'maximum' + * allowed as input and indicate if min and/or max supplied. 
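 * Illustrative examples (values assumed): an env of "4" applies 4 to every
 * process, while "4:pattern" applies 4 only to processes the glob pattern
 * selects; non-matching processes keep def.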
+ */ +int psm3_parse_val_pattern_int(const char *env, int def, int *val, + unsigned flags, int min, int max); +int psm3_parse_val_pattern_uint(const char *env, unsigned def, unsigned *val, + unsigned flags, unsigned min, unsigned max); #if defined(PSM_VERBS) || defined(PSM_SOCKETS) // return forced speed in mbps or 0 if not forced diff --git a/prov/psm3/psm3/psm.c b/prov/psm3/psm3/psm.c index 40826d38c1c..df138dd8a2f 100644 --- a/prov/psm3/psm3/psm.c +++ b/prov/psm3/psm3/psm.c @@ -97,6 +97,7 @@ sem_t *psm3_sem_affinity_shm_rw = NULL; int psm3_affinity_shared_file_opened = 0; char *psm3_affinity_shm_name; uint64_t *psm3_shared_affinity_ptr; +uint64_t *psm3_shared_affinity_nic_refcount_ptr; uint32_t psm3_cpu_model; @@ -164,6 +165,8 @@ CUresult (*psmi_cuEventRecord)(CUevent hEvent, CUstream hStream); CUresult (*psmi_cuEventSynchronize)(CUevent hEvent); CUresult (*psmi_cuMemHostAlloc)(void** pp, size_t bytesize, unsigned int Flags); CUresult (*psmi_cuMemFreeHost)(void* p); +CUresult (*psmi_cuMemHostRegister)(void* p, size_t bytesize, unsigned int Flags); +CUresult (*psmi_cuMemHostUnregister)(void* p); CUresult (*psmi_cuMemcpy)(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount); CUresult (*psmi_cuMemcpyDtoD)(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount); CUresult (*psmi_cuMemcpyDtoH)(void* dstHost, CUdeviceptr srcDevice, size_t ByteCount); @@ -202,6 +205,8 @@ uint64_t psmi_count_cuEventRecord; uint64_t psmi_count_cuEventSynchronize; uint64_t psmi_count_cuMemHostAlloc; uint64_t psmi_count_cuMemFreeHost; +uint64_t psmi_count_cuMemHostRegister; +uint64_t psmi_count_cuMemHostUnregister; uint64_t psmi_count_cuMemcpy; uint64_t psmi_count_cuMemcpyDtoD; uint64_t psmi_count_cuMemcpyDtoH; @@ -225,7 +230,7 @@ int psmi_cuda_lib_load() char *dlerr; PSM2_LOG_MSG("entering"); - _HFI_VDBG("Loading CUDA library.\n"); + _HFI_DBG("Loading CUDA library.\n"); psmi_cuda_lib = dlopen("libcuda.so.1", RTLD_LAZY); if (!psmi_cuda_lib) { @@ -270,6 +275,8 @@ int psmi_cuda_lib_load() PSMI_CUDA_DLSYM(psmi_cuda_lib, cuEventSynchronize); PSMI_CUDA_DLSYM(psmi_cuda_lib, cuMemHostAlloc); PSMI_CUDA_DLSYM(psmi_cuda_lib, cuMemFreeHost); + PSMI_CUDA_DLSYM(psmi_cuda_lib, cuMemHostRegister); + PSMI_CUDA_DLSYM(psmi_cuda_lib, cuMemHostUnregister); PSMI_CUDA_DLSYM(psmi_cuda_lib, cuMemcpy); PSMI_CUDA_DLSYM(psmi_cuda_lib, cuMemcpyDtoD); PSMI_CUDA_DLSYM(psmi_cuda_lib, cuMemcpyDtoH); @@ -333,6 +340,8 @@ static void psmi_cuda_stats_register() PSMI_CUDA_COUNT_DECLU64(cuEventSynchronize), PSMI_CUDA_COUNT_DECLU64(cuMemHostAlloc), PSMI_CUDA_COUNT_DECLU64(cuMemFreeHost), + PSMI_CUDA_COUNT_DECLU64(cuMemHostRegister), + PSMI_CUDA_COUNT_DECLU64(cuMemHostUnregister), PSMI_CUDA_COUNT_DECLU64(cuMemcpy), PSMI_CUDA_COUNT_DECLU64(cuMemcpyDtoD), PSMI_CUDA_COUNT_DECLU64(cuMemcpyDtoH), @@ -366,6 +375,7 @@ static void psmi_cuda_stats_register() ze_result_t (*psmi_zeInit)(ze_init_flags_t flags); ze_result_t (*psmi_zeDriverGet)(uint32_t *pCount, ze_driver_handle_t *phDrivers); ze_result_t (*psmi_zeDeviceGet)(ze_driver_handle_t hDriver, uint32_t *pCount, ze_device_handle_t *phDevices); +ze_result_t (*psmi_zeDevicePciGetPropertiesExt)(ze_device_handle_t hDevice, ze_pci_ext_properties_t *pPciProperties); #ifndef PSM3_NO_ONEAPI_IMPORT ze_result_t (*psmi_zeDriverGetExtensionFunctionAddress)(ze_driver_handle_t hDriver, const char *name, void **ppFunctionAddress); ze_result_t (*psmi_zexDriverImportExternalPointer)(ze_driver_handle_t hDriver, void *ptr, size_t size); @@ -411,6 +421,7 @@ ze_result_t (*psmi_zelLoaderGetVersions)(size_t *num_elems, 
zel_component_versio uint64_t psmi_count_zeInit; uint64_t psmi_count_zeDriverGet; uint64_t psmi_count_zeDeviceGet; +uint64_t psmi_count_zeDevicePciGetPropertiesExt; #ifndef PSM3_NO_ONEAPI_IMPORT uint64_t psmi_count_zeDriverGetExtensionFunctionAddress; uint64_t psmi_count_zexDriverImportExternalPointer; @@ -473,6 +484,7 @@ int psmi_oneapi_ze_load() PSMI_ONEAPI_ZE_DLSYM(psmi_oneapi_ze_lib, zeInit); PSMI_ONEAPI_ZE_DLSYM(psmi_oneapi_ze_lib, zeDriverGet); PSMI_ONEAPI_ZE_DLSYM(psmi_oneapi_ze_lib, zeDeviceGet); + PSMI_ONEAPI_ZE_DLSYM(psmi_oneapi_ze_lib, zeDevicePciGetPropertiesExt); #ifndef PSM3_NO_ONEAPI_IMPORT PSMI_ONEAPI_ZE_DLSYM(psmi_oneapi_ze_lib, zeDriverGetExtensionFunctionAddress); #endif @@ -535,6 +547,7 @@ static void psmi_oneapi_ze_stats_register() PSMI_ONEAPI_ZE_COUNT_DECLU64(zeInit), PSMI_ONEAPI_ZE_COUNT_DECLU64(zeDriverGet), PSMI_ONEAPI_ZE_COUNT_DECLU64(zeDeviceGet), + PSMI_ONEAPI_ZE_COUNT_DECLU64(zeDevicePciGetPropertiesExt), #ifndef PSM3_NO_ONEAPI_IMPORT PSMI_ONEAPI_ZE_COUNT_DECLU64(zeDriverGetExtensionFunctionAddress), PSMI_ONEAPI_ZE_COUNT_DECLU64(zexDriverImportExternalPointer), @@ -637,11 +650,13 @@ static void psmi_gpu_init(void) is_gdr_copy_enabled = env_enable_gdr_copy.e_int; union psmi_envvar_val env_gpu_thresh_rndv; - ret = psm3_getenv("PSM3_GPU_THRESH_RNDV", + ret = psm3_getenv_range("PSM3_GPU_THRESH_RNDV", "RNDV protocol is used for GPU send message sizes greater than the threshold", - PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, - (union psmi_envvar_val)gpu_thresh_rndv, &env_gpu_thresh_rndv); - if (ret) + NULL, PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val)gpu_thresh_rndv, + (union psmi_envvar_val)0, (union psmi_envvar_val)UINT32_MAX, + NULL, NULL, &env_gpu_thresh_rndv); + if (ret > 0) /* * For backward compatibility, check if the old variable name is set. * Priority order: New name > old name > default value. @@ -693,7 +708,7 @@ int psmi_cuda_initialize() psm2_error_t err = PSM2_OK; PSM2_LOG_MSG("entering"); - _HFI_VDBG("Enabling CUDA support.\n"); + _HFI_DBG("Enabling CUDA support.\n"); psmi_cuda_stats_register(); @@ -727,6 +742,7 @@ static void psmi_oneapi_find_copy_only_engine(ze_device_handle_t dev, uint32_t count = 0; ze_command_queue_group_properties_t *props = NULL; int i; + int done = 0; /* Set the default */ ctxt->ordinal = 0; @@ -742,15 +758,27 @@ static void psmi_oneapi_find_copy_only_engine(ze_device_handle_t dev, PSMI_ONEAPI_ZE_CALL(zeDeviceGetCommandQueueGroupProperties, dev, &count, props); - /* Select the first copy-only engine group if possible */ + // pick the last command queue group which supports copy but not compute. + // For PVC this will be the xeLink copy engine which will also + // have numQueues >1 (TBD - perhaps only select if it has numQueues>1). + // This ordinal is then supplied to create Command Queues and Command Lists. for (i = count - 1; i >= 0; i--) { - if ((props[i].flags & + _HFI_DBG("GPU Queue Group %d: copy=%d Compute=%d num_queues=%d\n", i, + (props[i].flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COPY) != 0, + (props[i].flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE) != 0, + (int)props[i].numQueues); + if (! 
done && (props[i].flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COPY) && !(props[i].flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE)) { ctxt->ordinal = i; ctxt->num_queues = props[i].numQueues; - break; + done = 1; + if (_HFI_DBG_ON) { + _HFI_DBG_ALWAYS("Selected GPU copy engine %d\n", i); + } else { + break; + } } } psmi_free(props); @@ -789,6 +817,35 @@ static void psmi_oneapi_cmd_create(ze_device_handle_t dev, struct ze_dev_ctxt *c dev, &ze_cl_desc, &ctxt->cl); } ctxt->dev = dev; + + if (psm3_oneapi_parallel_dtod_copy_thresh < UINT_MAX) { + // create resources for dual copy mechanism + ze_event_pool_desc_t pool_desc = { + .stype = ZE_STRUCTURE_TYPE_EVENT_POOL_DESC, + .flags = ZE_EVENT_POOL_FLAG_HOST_VISIBLE, + .count = 2 + }; + ze_event_desc_t event_desc = { + .stype = ZE_STRUCTURE_TYPE_EVENT_DESC, + .signal = ZE_EVENT_SCOPE_FLAG_HOST, + .wait = ZE_EVENT_SCOPE_FLAG_HOST, + }; + PSMI_ONEAPI_ZE_CALL(zeEventPoolCreate, + ze_context, &pool_desc, 0, NULL, &ctxt->event_pool); + + event_desc.index = 0; + PSMI_ONEAPI_ZE_CALL(zeEventCreate, ctxt->event_pool, &event_desc, + &ctxt->copy_status0); + + event_desc.index = 1; + PSMI_ONEAPI_ZE_CALL(zeEventCreate, ctxt->event_pool, &event_desc, + &ctxt->copy_status1); + + psmi_oneapi_async_cmd_create(ctxt, &ctxt->async_cq0, + &ctxt->async_cl0); + psmi_oneapi_async_cmd_create(ctxt, &ctxt->async_cq1, + &ctxt->async_cl1); + } } void psmi_oneapi_cmd_create_all(void) @@ -804,8 +861,11 @@ void psmi_oneapi_cmd_create_all(void) for (i = 0; i < num_ze_devices; i++) { ctxt = &ze_devices[i]; - if (!ctxt->cl) + if (!ctxt->cl) { psmi_oneapi_cmd_create(ctxt->dev, ctxt); + _HFI_DBG("Initialized cmd queues for ze_device[%d] %p\n", + i, ctxt->dev); + } } if (num_ze_devices > 0) cur_ze_dev = &ze_devices[0]; @@ -819,6 +879,34 @@ void psmi_oneapi_cmd_destroy_all(void) for (i = 0; i < num_ze_devices; i++) { ctxt = &ze_devices[i]; + if (ctxt->async_cl1 != NULL) { + PSMI_ONEAPI_ZE_CALL(zeCommandListDestroy, ctxt->async_cl1); + ctxt->async_cl1 = NULL; + } + if (ctxt->async_cq1 != NULL) { + PSMI_ONEAPI_ZE_CALL(zeCommandQueueDestroy, ctxt->async_cq1); + ctxt->async_cq1 = NULL; + } + if (ctxt->async_cl0 != NULL) { + PSMI_ONEAPI_ZE_CALL(zeCommandListDestroy, ctxt->async_cl0); + ctxt->async_cl0 = NULL; + } + if (ctxt->async_cq0 != NULL) { + PSMI_ONEAPI_ZE_CALL(zeCommandQueueDestroy, ctxt->async_cq0); + ctxt->async_cq0 = NULL; + } + if (ctxt->copy_status1 != NULL) { + PSMI_ONEAPI_ZE_CALL(zeEventDestroy, ctxt->copy_status1); + ctxt->copy_status1 = NULL; + } + if (ctxt->copy_status0 != NULL) { + PSMI_ONEAPI_ZE_CALL(zeEventDestroy, ctxt->copy_status0); + ctxt->copy_status0 = NULL; + } + if (ctxt->event_pool != NULL) { + PSMI_ONEAPI_ZE_CALL(zeEventPoolDestroy, ctxt->event_pool); + ctxt->event_pool = NULL; + } if (ctxt->cl) { PSMI_ONEAPI_ZE_CALL(zeCommandListDestroy, ctxt->cl); ctxt->cl = NULL; @@ -849,7 +937,7 @@ int psmi_oneapi_ze_initialize() union psmi_envvar_val env; PSM2_LOG_MSG("entering"); - _HFI_VDBG("Init Level Zero library.\n"); + _HFI_DBG("Init Level Zero library.\n"); psmi_oneapi_ze_stats_register(); err = psmi_oneapi_ze_load(); @@ -868,6 +956,13 @@ int psmi_oneapi_ze_initialize() (union psmi_envvar_val)1, &env); psm3_oneapi_immed_async_copy = env.e_int; + psm3_getenv("PSM3_ONEAPI_PARALLEL_DTOD_COPY_THRESH", + "Use parallel CommandLists for GPU to GPU copy larger than threshold", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val)(256*1024-1), &env); + // no benefit below 128K-1, plus the copy is spilt at a 64K boundary + 
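	// clamp (descriptive note): the effective threshold is max(env, 128K-1),
	// so with the 256K-1 default above the dual command-list path only
	// engages for device-to-device copies above roughly 256 KiB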
psm3_oneapi_parallel_dtod_copy_thresh = max(128*1024-1, env.e_uint); + PSMI_ONEAPI_ZE_CALL(zeInit, ZE_INIT_FLAG_GPU_ONLY); @@ -911,11 +1006,15 @@ int psmi_oneapi_ze_initialize() ze_context_desc_t ctxtDesc = { ZE_STRUCTURE_TYPE_CONTEXT_DESC, NULL, 0 }; PSMI_ONEAPI_ZE_CALL(zeContextCreate, ze_driver, &ctxtDesc, &ze_context); - _HFI_VDBG("ze_driver %p first device %p ze_context %p\n", - ze_driver, &devices[0], ze_context); + _HFI_DBG("ze_driver %p %u devices first device %p ze_context %p\n", + ze_driver, ze_device_count, devices[0], ze_context); - for (i = 0; i < ze_device_count; i++) + for (i = 0; i < ze_device_count; i++) { + ze_devices[i].dev_index = i; psmi_oneapi_cmd_create(devices[i], &ze_devices[i]); + _HFI_DBG("Initialized cmd queues for ze_device[%d] %p\n", + i, ze_devices[i].dev); + } num_ze_devices = ze_device_count; if (num_ze_devices > 0) @@ -1014,7 +1113,11 @@ void psmi_parse_nic_var() { union psmi_envvar_val env_nic; psm3_getenv("PSM3_NIC", - "Device Unit number or name or wildcard (-1 or 'any' autodetects)", + "Device(s) to consider for use. By name (" +#ifdef FNM_EXTMATCH + "extended " +#endif + "glob pattern), unit number or 'any'", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR, (union psmi_envvar_val)"any", &env_nic); //autodetect @@ -1064,6 +1167,11 @@ static int psm3_parse_no_warn(void) } #endif +int init_cache_on = 1; +void psm3_turn_off_init_cache() { + init_cache_on = 0; +} + psm2_error_t psm3_init(int *major, int *minor) { psm2_error_t err = PSM2_OK; @@ -1177,10 +1285,10 @@ psm2_error_t psm3_init(int *major, int *minor) psm3_getenv("PSM3_TRACEMASK", "Mask flags for tracing", PSMI_ENVVAR_LEVEL_USER, - PSMI_ENVVAR_TYPE_STR_VAL_PAT, + PSMI_ENVVAR_TYPE_STR_VAL_PAT_UINT_FLAGS, (union psmi_envvar_val)__HFI_DEBUG_DEFAULT_STR, &env_tmask); - (void)psm3_parse_val_pattern(env_tmask.e_str, __HFI_DEBUG_DEFAULT, - &psm3_dbgmask); + (void)psm3_parse_val_pattern_uint(env_tmask.e_str, __HFI_DEBUG_DEFAULT, + &psm3_dbgmask, PSMI_ENVVAR_FLAG_NOMIN_NOMAX, 0, UINT_MAX); /* The "real thing" is done in utils_mallopt.c as constructor function, but * we getenv it here to report what we're doing with the setting */ @@ -1319,6 +1427,10 @@ psm2_error_t psm3_init(int *major, int *minor) goto fail_epid; } + if (psm3_device_is_enabled(devid_enabled, PTL_DEVID_IPS)) { + psm3_hwloc_topology_init(); + } + #ifdef PSM_DSA if (psm3_device_is_enabled(devid_enabled, PTL_DEVID_AMSH)) { if (psm3_dsa_init()) { @@ -1352,7 +1464,8 @@ psm2_error_t psm3_init(int *major, int *minor) * want it to appear in PSM3_VERBOSE_ENV help text */ int enable_cuda = 0; - if (psm3_parse_str_int(psm3_env_get("PSM3_CUDA"), &enable_cuda) == -2 + if (psm3_parse_str_int(psm3_env_get("PSM3_CUDA"), &enable_cuda, + INT_MIN, INT_MAX) == -2 || enable_cuda) { _HFI_INFO("WARNING: PSM built without CUDA enabled, PSM3_CUDA unavailable\n"); } @@ -1382,7 +1495,8 @@ psm2_error_t psm3_init(int *major, int *minor) * want it to appear in PSM3_VERBOSE_ENV help text */ int enable_oneapi = 0; - if (psm3_parse_str_int(psm3_env_get("PSM3_ONEAPI_ZE"), &enable_oneapi) == -2 + if (psm3_parse_str_int(psm3_env_get("PSM3_ONEAPI_ZE"), &enable_oneapi, + INT_MIN, INT_MAX) == -2 || enable_oneapi) { _HFI_INFO("WARNING: PSM built without ONEAPI_ZE enabled, PSM3_ONEAPI_ZE unavailable\n"); } @@ -1399,7 +1513,8 @@ psm2_error_t psm3_init(int *major, int *minor) * get the behavior they expected */ unsigned int gpudirect = 0; - if (psm3_parse_str_uint(psm3_env_get("PSM3_GPUDIRECT"), &gpudirect) == -2 + if (psm3_parse_str_uint(psm3_env_get("PSM3_GPUDIRECT"), &gpudirect, + 
0, UINT_MAX) == -2 || gpudirect) { _HFI_INFO("WARNING: PSM built with neither ONEAPI_ZE nor CUDA enabled, PSM3_GPUDIRECT unavailable\n"); } @@ -1420,6 +1535,7 @@ psm2_error_t psm3_init(int *major, int *minor) #endif #if defined(PSM_DSA) || defined(PSM_CUDA) || defined(PSM_ONEAPI) fail_hal: + psm3_hwloc_topology_destroy(); // always safe to call psm3_hal_finalize(); #endif fail_epid: @@ -1450,6 +1566,7 @@ static inline psm2_error_t unit_query_ret_to_err(int ret) } } +static uint64_t nics_max_speed; psm2_error_t psm3_info_query(psm2_info_query_t q, void *out, size_t nargs, psm2_info_query_arg_t args[]) { @@ -1606,6 +1723,11 @@ psm2_error_t psm3_info_query(psm2_info_query_t q, void *out, if (port == 0) port = 1; /* VERBS_PORT */ if (unit == -1) { + if (init_cache_on && nics_max_speed) { + *speed = nics_max_speed; + rv = PSM2_OK; + break; + } // query for unit -1 returns max speed of all candidate NICs *speed = 0; for (unit = 0; unit < psmi_hal_get_num_units_(); unit++) { @@ -1615,7 +1737,12 @@ psm2_error_t psm3_info_query(psm2_info_query_t q, void *out, if (0 <= psmi_hal_get_port_speed(unit, port, &unit_speed)) *speed = max(*speed, unit_speed); } - rv = (*speed) ? PSM2_OK : PSM2_EP_NO_DEVICE; + if (*speed) { + nics_max_speed = *speed; + rv = PSM2_OK; + } else { + rv = PSM2_EP_NO_DEVICE; + } } else { if (psmi_hal_get_port_active(unit, port) <= 0) break; @@ -1749,7 +1876,9 @@ psm2_error_t psm3_finalize(void) * Start critical section to decrement ref count and unlink * affinity shm file. */ - psmi_sem_timedwait(psm3_sem_affinity_shm_rw, psm3_sem_affinity_shm_rw_name); + if (psmi_sem_timedwait(psm3_sem_affinity_shm_rw, psm3_sem_affinity_shm_rw_name)) { + _HFI_ERROR("unable to get NIC affinity semaphone, proceeding anyway\n"); + } psm3_shared_affinity_ptr[AFFINITY_SHM_REF_COUNT_LOCATION] -= 1; if (psm3_shared_affinity_ptr[AFFINITY_SHM_REF_COUNT_LOCATION] <= 0) { @@ -1767,6 +1896,7 @@ psm2_error_t psm3_finalize(void) munmap(psm3_shared_affinity_ptr, PSMI_PAGESIZE); psm3_shared_affinity_ptr = NULL; + psm3_shared_affinity_nic_refcount_ptr = NULL; psmi_free(psm3_affinity_shm_name); psm3_affinity_shm_name = NULL; psm3_affinity_shared_file_opened = 0; @@ -1782,6 +1912,7 @@ psm2_error_t psm3_finalize(void) psm3_affinity_semaphore_open = 0; } + psm3_hwloc_topology_destroy(); // always safe to call psm3_hal_finalize(); #ifdef PSM_CUDA if (PSMI_IS_GPU_ENABLED) diff --git a/prov/psm3/psm3/psm2.h b/prov/psm3/psm3/psm2.h index fe76b5fe4b8..b9ff1c598d1 100644 --- a/prov/psm3/psm3/psm2.h +++ b/prov/psm3/psm3/psm2.h @@ -1376,6 +1376,16 @@ void *psm3_epaddr_getctxt(psm2_epaddr_t epaddr); * option value: Deprecated; this option has no effect. */ +#define PSM2_MQ_OPT_GPU_RNDV_SHM_SZ 0x304 +#define PSM2_MQ_GPU_RNDV_SHM_SZ PSM2_MQ_OPT_GPU_RNDV_SHM_SZ + /**< [@b uint32_t ] Size at which to start enabling + * rendezvous messaging for shared memory (intra-node) GPU messages (If + * unset, defaults to 127 bytes for Intel GPU, 127 for NVIDIA GPU). + * + * component object: PSM2 Matched Queue (@ref psm2_mq_t). + * option value: Size at which to switch to rendezvous protocol for GPU send. 
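 * Illustrative usage sketch (the size is an example value only):
 *   uint32_t gpu_shm_rndv_sz = 32*1024;
 *   psm3_mq_setopt(mq, PSM2_MQ_GPU_RNDV_SHM_SZ, &gpu_shm_rndv_sz);
 *   psm3_mq_getopt(mq, PSM2_MQ_GPU_RNDV_SHM_SZ, &gpu_shm_rndv_sz);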
+ */ + /* PSM2_COMPONENT_AM options */ #define PSM2_AM_OPT_FRAG_SZ 0x401 #define PSM2_AM_MAX_FRAG_SZ PSM2_AM_OPT_FRAG_SZ @@ -1802,10 +1812,10 @@ char* psm3_env_get(const char *name); * * @param[in] const char *str parameter value * @retval 0 The string was valid, *val has value - * -1 The string was empty or NULL - * -2 The string had invalid syntax + * -1 The string was empty or NULL, *val not updated + * -2 The string had invalid syntax, *val not updated */ -int psm3_parse_str_int(const char *string, int *val); +int psm3_parse_str_int(const char *string, int *val, int min, int max); /** @brief PSM2 unsigned int parameter parsing * @@ -1813,22 +1823,56 @@ int psm3_parse_str_int(const char *string, int *val); * * @param[in] const char *str parameter value * @retval 0 The string was valid, *val has value - * -1 The string was empty or NULL - * -2 The string had invalid syntax + * -1 The string was empty or NULL, *val not updated + * -2 The string had invalid syntax, *val not updated */ -int psm3_parse_str_uint(const char *string, unsigned int *val); +int psm3_parse_str_uint(const char *string, unsigned int *val, + unsigned int min, unsigned int max); /** @brief PSM2 yesno parameter parsing * * Function that parses a string yesno parameter * * @param[in] const char *str parameter value - * @retval -1 The string was empty or NULL - * -2 The string had invalid syntax + * @retval 0 The string was valid, *val has value + * -1 The string was empty or NULL, *val not updated + * -2 The string had invalid syntax, *val not updated + * @param[out] int *val * 0 The string was No, False, Off or 0 * 1 The string was Yes, True, On or 1 */ -int psm3_parse_str_yesno(const char *str); +int psm3_parse_str_yesno(const char *str, int *val); + +// for the purposes of psmx3 accessing PSM3_DEVICES config, these +// interfaces are defined here. Not for general consumption +/* We currently have 3 PTLs, 0 is reserved. */ +#define PTL_DEVID_IPS 1 // ips aka nic, network inter-node +#define PTL_DEVID_AMSH 2 // shm, intra-node, scale-up +#define PTL_DEVID_SELF 3 // self + +/* We can currently initialize up to 3 PTLs */ +#define PTL_MAX_INIT 3 + +/** @brief PSM2 devices parameter parsing + * + * Function that gets and parses the PSM3_DEVICES string parameter + * + * @param[out] array of devices + * @retval PSM2_OK - devices successfully returned + * other (PSM2_PARAM_ERR) - error parsing devices + */ +psm2_error_t psm3_parse_devices(int devices[PTL_MAX_INIT]); + +/** @brief PSM2 devices list search + * + * Function that searches devid_enabled for a specific device + * + * @param[in] array of devices from psm3_parse_devices + * @param[in] devid: PTL_DEVID_IPS, PTL_DEVID_AMSH, or PTL_DEVID_SELF + * @retval 1 - given devid is enabled in devices[] + * 0 Given devid is disabled in devices[] + */ +int psm3_device_is_enabled(const int devices[PTL_MAX_INIT], int devid); /** @brief PSM2 env finalize * @@ -1872,6 +1916,8 @@ void psm3_memcpy(void *dest, const void *src, uint32_t len); /*! @} */ +void psm3_turn_off_init_cache(); + #ifdef __cplusplus } /* extern "C" */ #endif diff --git a/prov/psm3/psm3/psm2_hal.c b/prov/psm3/psm3/psm2_hal.c index 058a6c26034..0c347ce2160 100644 --- a/prov/psm3/psm3/psm2_hal.c +++ b/prov/psm3/psm3/psm2_hal.c @@ -230,6 +230,69 @@ int psm3_hal_pre_init_cache_func(enum psmi_hal_pre_init_cache_func_krnls k, ...) 
rv = -1; } break; + case psmi_hal_pre_init_cache_func_get_port_speed: + { + int unit = va_arg(ap,int); + + if ((unit >= 0) && (unit < p->params.num_units)) + { + int port = va_arg(ap,int); + if ((port >= 1) && (port <= p->params.num_ports)) + { + int i = unit * (p->params.num_ports+1) + port; + // only cache during PSM3 init + if (!init_cache_on || !p->params.port_speed_valid[i]) { + rv = p->hfp_get_port_speed(unit,port,&p->params.port_speed[i]); + p->params.port_speed_valid[i] = rv == 0 ? 1 : -1; + } + rv = (p->params.port_subnet_valid[i] ==1)? 0: -1; + if (rv == 0) { + uint64_t *speed = va_arg(ap, uint64_t*); + if (speed) *speed = p->params.port_speed[i]; + } + } + else + rv = -1; + } + else + rv = -1; + } + break; + case psmi_hal_pre_init_cache_func_get_port_lid: + { + int unit = va_arg(ap,int); + + if ((unit >= 0) && (unit < p->params.num_units)) + { + int port = va_arg(ap,int); + if ((port >= 1) && (port <= p->params.num_ports)) + { + int addr_index = va_arg(ap,int); + if (addr_index >= 0 && addr_index < psm3_addr_per_nic) + { + int i = unit * ((p->params.num_ports+1) * psm3_addr_per_nic) + port * psm3_addr_per_nic + addr_index; + // only cache during PSM3 init + if (!init_cache_on || !p->params.port_lid_valid[i]) { + rv = p->hfp_get_port_lid(unit,port,addr_index); + if (rv > 0) { + p->params.port_lid_valid[i] = 1; + p->params.port_lid[i] = rv; + } else { + p->params.port_lid_valid[i] = -1; + rv = -1; + } + break; + } + rv = p->params.port_lid_valid[i] == -1 ? -1 : p->params.port_lid[i]; + } + } + else + rv = -1; + } + else + rv = -1; + } + break; case psmi_hal_pre_init_cache_func_get_num_contexts: { int unit = va_arg(ap,int); @@ -310,6 +373,51 @@ int psm3_hal_pre_init_cache_func(enum psmi_hal_pre_init_cache_func_krnls k, ...) rv = -1; } break; + case psmi_hal_pre_init_cache_func_get_port_subnet_name: + { + int unit = va_arg(ap,int); + + if ((unit >= 0) && (unit < p->params.num_units)) + { + int port = va_arg(ap,int); + if ((port >= 1) && (port <= p->params.num_ports)) + { + int addr_index = va_arg(ap,int); + if (addr_index >= 0 && addr_index < psm3_addr_per_nic) + { + int i = unit * ((p->params.num_ports+1) * psm3_addr_per_nic) + port * psm3_addr_per_nic + addr_index; + // only cache during PSM3 init + if (!init_cache_on || !p->params.port_subnet_name[i]) { + char buffer[PATH_MAX] = {}; + rv = p->hfp_get_port_subnet_name(unit, port, addr_index, buffer, sizeof(buffer)); + if (p->params.port_subnet_name[i]) { + psmi_free(p->params.port_subnet_name[i]); + } + if (rv == 0) { + p->params.port_subnet_name[i] = psmi_strdup(PSMI_EP_NONE, buffer); + } else { + p->params.port_subnet_name[i] = NULL; + rv = -1; + break; + } + } + char *buf = va_arg(ap, char*); + size_t bufsize = va_arg(ap, size_t); + rv = p->params.port_subnet_name[i] ? 
0 : -1; + if (rv == 0 && buf) { + (void)snprintf(buf, bufsize, "%s", p->params.port_subnet_name[i]); + } + } + else + rv = -1; + } + else + rv = -1; + } + else + rv = -1; + } + break; case psmi_hal_pre_init_cache_func_get_unit_pci_bus: { int unit = va_arg(ap,int); @@ -469,6 +577,10 @@ static void psm3_hal_free_cache(struct _psmi_hal_instance *p) FREE_HAL_CACHE(unit_active_valid); FREE_HAL_CACHE(port_active); FREE_HAL_CACHE(port_active_valid); + FREE_HAL_CACHE(port_speed); + FREE_HAL_CACHE(port_speed_valid); + FREE_HAL_CACHE(port_lid); + FREE_HAL_CACHE(port_lid_valid); FREE_HAL_CACHE(num_contexts); FREE_HAL_CACHE(num_contexts_valid); FREE_HAL_CACHE(num_free_contexts); @@ -478,6 +590,7 @@ static void psm3_hal_free_cache(struct _psmi_hal_instance *p) FREE_HAL_CACHE(port_subnet_addr); FREE_HAL_CACHE(port_subnet_idx); FREE_HAL_CACHE(port_subnet_gid); + FREE_HAL_CACHE_ARRAY(port_subnet_name, p->params.num_units * p->params.num_ports * psm3_addr_per_nic); FREE_HAL_CACHE(unit_pci_bus_valid); FREE_HAL_CACHE(unit_pci_bus_domain); @@ -521,6 +634,10 @@ static psmi_hal_instance_t *psm3_hal_select_hal(psmi_hal_instance_t *p, ALLOC_HAL_CACHE(unit_active_valid, int8_t, nunits); ALLOC_HAL_CACHE(port_active, int8_t, nunits*(nports+1)); ALLOC_HAL_CACHE(port_active_valid, int8_t, nunits*(nports+1)); + ALLOC_HAL_CACHE(port_speed, uint64_t, nunits*(nports+1)); + ALLOC_HAL_CACHE(port_speed_valid, int8_t, nunits*(nports+1)); + ALLOC_HAL_CACHE(port_lid, int, nunits*(nports+1)*psm3_addr_per_nic); + ALLOC_HAL_CACHE(port_lid_valid, int8_t, nunits*(nports+1)*psm3_addr_per_nic); ALLOC_HAL_CACHE(num_contexts, uint16_t, nunits); ALLOC_HAL_CACHE(num_contexts_valid, uint16_t, nunits); ALLOC_HAL_CACHE(num_free_contexts, uint16_t, nunits); @@ -530,6 +647,7 @@ static psmi_hal_instance_t *psm3_hal_select_hal(psmi_hal_instance_t *p, ALLOC_HAL_CACHE(port_subnet_addr, psmi_naddr128_t, nunits*(nports+1)*psm3_addr_per_nic); ALLOC_HAL_CACHE(port_subnet_idx, int, nunits*(nports+1)*psm3_addr_per_nic); ALLOC_HAL_CACHE(port_subnet_gid, psmi_gid128_t, nunits*(nports+1)*psm3_addr_per_nic); + ALLOC_HAL_CACHE_ARRAY(port_subnet_name, char, nunits*(nports+1)*psm3_addr_per_nic); ALLOC_HAL_CACHE(unit_pci_bus_valid, int8_t, nunits); ALLOC_HAL_CACHE(unit_pci_bus_domain, uint32_t, nunits); @@ -557,6 +675,72 @@ static psmi_hal_instance_t *psm3_hal_select_hal(psmi_hal_instance_t *p, return NULL; } +/* check syntax of pattern. and confirm it matches at least 1 HAL + * returns: + * 0 - valid + * -1 - empty string + * -2 - invalid syntax + */ +static int parse_check_hal(int type, const union psmi_envvar_val val, void *ptr, + size_t errstr_size, char errstr[]) +{ + int i; + int ret; + + psmi_assert(type == PSMI_ENVVAR_TYPE_STR); + if (! val.e_str || ! 
*val.e_str) + return -1; + // use fnmatch to check syntax of pattern + // reviewing fnmatch source it only returns 0 or FNM_NOMATCH, but be + // safe and match fnmatch documentation that other values indicate error + ret = fnmatch(val.e_str, "dontcare", 0 +#ifdef FNM_EXTMATCH + | FNM_EXTMATCH +#endif + ); + if (ret && ret != FNM_NOMATCH) { + if (errstr_size) + snprintf(errstr, errstr_size, " invalid " +#ifdef FNM_EXTMATCH + "extended " +#endif + "glob pattern"); + return -2; + } + // we check for at least 1 matching HAL, but purposely do + // not check for active NICs within the HAL + // We allow any valid HAL, even if not included in the build + // This avoids surprises if user or middleware uses PSM3_HAL to limit + // PSM3 to a specific HAL, but the PSM3 build found lacks that HAL + ret = -2; // assume no matching HAL found + for (i=0; i <= PSM_HAL_INDEX_MAX; i++) + { + if (i == PSM_HAL_INDEX_LOOPBACK) + continue; + if (0 == strcmp("unknown", psm3_hal_index_to_str(i))) + continue; + + if (0 == strcmp(val.e_str, "any") || + 0 == fnmatch(val.e_str, psm3_hal_index_to_str(i), 0 +#ifdef FNM_EXTMATCH + | FNM_EXTMATCH +#endif + )) + { + ret = 0; + break; + } + } + if (ret == -2) { + if (errstr_size) + snprintf(errstr, errstr_size, " no matching HAL found"); + return -2; + } + return 0; +} + +static char hal_help[512] = ""; + static struct _psmi_hal_instance *psm3_hal_get_pi_inst(void) { int i; @@ -584,11 +768,12 @@ static struct _psmi_hal_instance *psm3_hal_get_pi_inst(void) */ union psmi_envvar_val env_hal; /* HAL instance preference */ - psm3_getenv("PSM3_HAL", - "Hardware Abstraction Layer to use (Default is first HAL" - " to find a valid, unfiltered NIC [any])", + psm3_getenv_range("PSM3_HAL", + "Hardware Abstraction Layer to use", hal_help, PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR, - (union psmi_envvar_val)"any", &env_hal); + (union psmi_envvar_val)"any", + (union psmi_envvar_val)NULL, (union psmi_envvar_val)NULL, + parse_check_hal, NULL, &env_hal); for (i=0; i <= PSM_HAL_INDEX_MAX; i++) { @@ -651,6 +836,36 @@ int psm3_hal_initialize(int devid_enabled[PTL_MAX_INIT]) PSMI_HAL_INI(); if (! psm3_hal_current_hal_instance) { + int i; + char valid_hal_list[80]; + int valid_len = 0; + char avail_hal_list[80]; + int avail_len = 0; + + valid_hal_list[0] = '\0'; + avail_hal_list[0] = '\0'; + for (i=0; i <= PSM_HAL_INDEX_MAX; i++) + { + if (i == PSM_HAL_INDEX_LOOPBACK) + continue; + if (0 == strcmp("unknown", psm3_hal_index_to_str(i))) + continue; + + snprintf(&valid_hal_list[valid_len], + sizeof(valid_hal_list)-valid_len, "%s'%s'", + valid_hal_list[0]?", ":"", psm3_hal_index_to_str(i)); + valid_len = strlen(valid_hal_list); + if (psm3_hal_table[i]) { + snprintf(&avail_hal_list[avail_len], + sizeof(avail_hal_list)-avail_len, "%s'%s'", + avail_hal_list[0]?", ":"", psm3_hal_index_to_str(i)); + avail_len = strlen(avail_hal_list); + } + } + snprintf(hal_help, sizeof(hal_help), + " 'any' - use first HAL which finds a valid, unfiltered NIC (default)\n" + " valid HALs: %s\n" + " available HALs: %s", valid_hal_list, avail_hal_list); if (! psm3_device_is_enabled(devid_enabled, PTL_DEVID_IPS)) { // register the loopback HAL and select it. 
Unlike normal HALs // we don't call psm3_hal_register_instance because it would enforce diff --git a/prov/psm3/psm3/psm2_hal.h b/prov/psm3/psm3/psm2_hal.h index d5658221c0c..055261da6c4 100644 --- a/prov/psm3/psm3/psm2_hal.h +++ b/prov/psm3/psm3/psm2_hal.h @@ -228,6 +228,10 @@ typedef struct _psmi_hal_params uint16_t default_pkey; int8_t *unit_active,*unit_active_valid; int8_t *port_active,*port_active_valid; + uint64_t *port_speed; + int8_t *port_speed_valid; + int *port_lid; + int8_t *port_lid_valid; uint16_t *num_contexts,*num_contexts_valid; uint16_t *num_free_contexts,*num_free_contexts_valid; // information from port_get_subnet @@ -237,6 +241,7 @@ typedef struct _psmi_hal_params psmi_naddr128_t *port_subnet_addr; int *port_subnet_idx; psmi_gid128_t *port_subnet_gid; + char **port_subnet_name; int8_t *unit_pci_bus_valid; uint32_t *unit_pci_bus_domain; @@ -254,6 +259,10 @@ typedef struct _psmi_hal_params #define PSM_HAL_ALG_ACROSS 0 #define PSM_HAL_ALG_WITHIN 1 #define PSM_HAL_ALG_ACROSS_ALL 2 +#define PSM_HAL_ALG_CPU_CENTRIC 3 +#ifdef PSM_HAVE_GPU_CENTRIC_AFFINITY +#define PSM_HAL_ALG_GPU_CENTRIC 4 +#endif typedef enum { @@ -499,16 +508,22 @@ int psm3_hal_initialize(int devid_enabled[PTL_MAX_INIT]); int psm3_hal_finalize(void); +// indicate whether we cache data during PSM3 init +extern int init_cache_on; + enum psmi_hal_pre_init_cache_func_krnls { psmi_hal_pre_init_cache_func_get_num_units, psmi_hal_pre_init_cache_func_get_num_ports, psmi_hal_pre_init_cache_func_get_unit_active, psmi_hal_pre_init_cache_func_get_port_active, + psmi_hal_pre_init_cache_func_get_port_speed, + psmi_hal_pre_init_cache_func_get_port_lid, psmi_hal_pre_init_cache_func_get_num_contexts, psmi_hal_pre_init_cache_func_get_num_free_contexts, psmi_hal_pre_init_cache_func_get_default_pkey, psmi_hal_pre_init_cache_func_get_port_subnet, + psmi_hal_pre_init_cache_func_get_port_subnet_name, psmi_hal_pre_init_cache_func_get_unit_pci_bus, psmi_hal_pre_init_cache_func_get_unit_device_id, psmi_hal_pre_init_cache_func_get_unit_device_version, @@ -549,9 +564,6 @@ int psm3_hal_pre_init_cache_func(enum psmi_hal_pre_init_cache_func_krnls k, ...) /* DISPATCH_FUNC */ #define psmi_hal_get_unit_name(...) PSMI_HAL_DISPATCH_FUNC(get_unit_name,__VA_ARGS__) -#define psmi_hal_get_port_subnet_name(...) PSMI_HAL_DISPATCH_FUNC(get_port_subnet_name,__VA_ARGS__) -#define psmi_hal_get_port_speed(...) PSMI_HAL_DISPATCH_FUNC(get_port_speed,__VA_ARGS__) -#define psmi_hal_get_port_lid(...) PSMI_HAL_DISPATCH_FUNC(get_port_lid,__VA_ARGS__) #define psmi_hal_mq_init_defaults(...) PSMI_HAL_DISPATCH_FUNC(mq_init_defaults,__VA_ARGS__) #define psmi_hal_ep_open_opts_get_defaults(...) PSMI_HAL_DISPATCH_FUNC(ep_open_opts_get_defaults,__VA_ARGS__) #define psmi_hal_context_initstats(...) PSMI_HAL_DISPATCH_FUNC(context_initstats,__VA_ARGS__) @@ -566,10 +578,13 @@ int psm3_hal_pre_init_cache_func(enum psmi_hal_pre_init_cache_func_krnls k, ...) #define psmi_hal_get_num_ports_(...) PSMI_HAL_DISPATCH_PI(get_num_ports,##__VA_ARGS__) #define psmi_hal_get_unit_active(...) PSMI_HAL_DISPATCH_PI(get_unit_active,__VA_ARGS__) #define psmi_hal_get_port_active(...) PSMI_HAL_DISPATCH_PI(get_port_active,__VA_ARGS__) +#define psmi_hal_get_port_speed(...) PSMI_HAL_DISPATCH_PI(get_port_speed,__VA_ARGS__) +#define psmi_hal_get_port_lid(...) PSMI_HAL_DISPATCH_PI(get_port_lid,__VA_ARGS__) #define psmi_hal_get_num_contexts(...) PSMI_HAL_DISPATCH_PI(get_num_contexts,__VA_ARGS__) #define psmi_hal_get_num_free_contexts(...) 
PSMI_HAL_DISPATCH_PI(get_num_free_contexts,__VA_ARGS__) #define psmi_hal_get_default_pkey(...) PSMI_HAL_DISPATCH_PI(get_default_pkey,##__VA_ARGS__) #define psmi_hal_get_port_subnet(...) PSMI_HAL_DISPATCH_PI(get_port_subnet,__VA_ARGS__) +#define psmi_hal_get_port_subnet_name(...) PSMI_HAL_DISPATCH_PI(get_port_subnet_name,__VA_ARGS__) #define psmi_hal_get_unit_pci_bus(...) PSMI_HAL_DISPATCH_PI(get_unit_pci_bus,__VA_ARGS__) #define psmi_hal_get_unit_device_id(...) PSMI_HAL_DISPATCH_PI(get_unit_device_id,__VA_ARGS__) #define psmi_hal_get_unit_device_version(...) PSMI_HAL_DISPATCH_PI(get_unit_device_version,__VA_ARGS__) diff --git a/prov/psm3/psm3/psm2_hal_loopback.c b/prov/psm3/psm3/psm2_hal_loopback.c index cf78a99b2ee..913a45dec78 100644 --- a/prov/psm3/psm3/psm2_hal_loopback.c +++ b/prov/psm3/psm3/psm2_hal_loopback.c @@ -209,8 +209,10 @@ static int psm3_hfp_loopback_get_port_lid(int unit, int port, int addr_index) // also prior to the EP being opened static void psm3_hfp_loopback_mq_init_defaults(struct psm2_mq *mq) { - /* these are only used by ptl_ips */ - mq->hfi_base_window_rv = (~(uint32_t)0); // no rendezvous + mq->ips_cpu_window_rv_str = NULL; // no rendezvous +#if defined(PSM_CUDA) || defined(PSM_ONEAPI) + mq->ips_gpu_window_rv_str = NULL; // no rendezvous +#endif mq->hfi_thresh_rv = (~(uint32_t)0); // disable rendezvous mq->hfi_thresh_tiny = PSM_MQ_NIC_MAX_TINY; // RDMA and MR cache N/A, leave ep->rdmamode, ep->mr_cache_mode and diff --git a/prov/psm3/psm3/psm2_mq.h b/prov/psm3/psm3/psm2_mq.h index b32c5126ba8..517b4802d5b 100644 --- a/prov/psm3/psm3/psm2_mq.h +++ b/prov/psm3/psm3/psm2_mq.h @@ -173,7 +173,8 @@ extern "C" { * @li If and when possible, receive buffers should be posted as early as * possible and ideally before calling into the progress engine. * @li Use of rendezvous messaging that can be controlled with - * @ref PSM2_MQ_RNDV_HFI_SZ and @ref PSM2_MQ_RNDV_SHM_SZ options. These + * @ref PSM2_MQ_RNDV_HFI_SZ, @ref PSM2_MQ_RNDV_SHM_SZ and + * PSM2_MQ_GPU_RNDV_SHM_SZ options. These * options default to values determined to make effective use of * bandwidth and are hence not advisable for all communication message * sizes, but rendezvous messages inherently prevent unexpected @@ -477,6 +478,7 @@ struct psm2_mq_req_user { * @param[in] option Index of option to retrieve. Possible values are: * @li @ref PSM2_MQ_RNDV_HFI_SZ * @li @ref PSM2_MQ_RNDV_SHM_SZ + * @li @ref PSM2_MQ_GPU_RNDV_SHM_SZ * @li @ref PSM2_MQ_MAX_SYSBUF_MBYTES * * @param[in] value Pointer to storage that can be used to store the value of @@ -498,6 +500,7 @@ psm2_error_t psm3_mq_getopt(psm2_mq_t mq, int option, void *value); * @param[in] option Index of option to retrieve. Possible values are: * @li @ref PSM2_MQ_RNDV_HFI_SZ * @li @ref PSM2_MQ_RNDV_SHM_SZ + * @li @ref PSM2_MQ_GPU_RNDV_SHM_SZ * @li @ref PSM2_MQ_MAX_SYSBUF_MBYTES * * @param[in] value Pointer to storage that contains the value to be updated @@ -519,6 +522,9 @@ psm2_error_t psm3_mq_setopt(psm2_mq_t mq, int option, const void *value); #define PSM2_MQ_FLAG_SENDSYNC 0x01 /**< MQ Send Force synchronous send */ +#define PSM2_MQ_FLAG_INJECT 0x02 + /**< MQ Send Force bounce buffer for */ + /* FI_INJECT/fi_inject behavior */ #define PSM2_MQ_REQINVALID ((psm2_mq_req_t)(NULL)) /**< MQ request completion value */ @@ -710,6 +716,9 @@ psm3_mq_imrecv(psm2_mq_t mq, uint32_t flags, void *buf, uint32_t len, * synchronously, meaning that the message will not be sent until * the receiver acknowledges that it has matched the send with a * receive buffer. 
+ * @li PSM2_MQ_FLAG_INJECT tells PSM to consume the send buffer + * immediately to comply with FI_INJECT/fi_inject behavior, + * cannot be used in conjunction with PSM2_MQ_FLAG_SENDSYNC. * @param[in] stag Message Send Tag * @param[in] buf Source buffer pointer * @param[in] len Length of message starting at @c buf. @@ -742,6 +751,9 @@ psm3_mq_send(psm2_mq_t mq, psm2_epaddr_t dest, uint32_t flags, uint64_t stag, * synchronously, meaning that the message will not be sent until * the receiver acknowledges that it has matched the send with a * receive buffer. + * @li PSM2_MQ_FLAG_INJECT tells PSM to consume the send buffer + * immediately to comply with FI_INJECT/fi_inject behavior, + * cannot be used in conjunction with PSM2_MQ_FLAG_SENDSYNC. * @param[in] stag Message Send Tag * @param[in] buf Source buffer pointer * @param[in] len Length of message starting at @c buf. @@ -776,6 +788,9 @@ psm3_mq_send2(psm2_mq_t mq, psm2_epaddr_t dest, uint32_t flags, * synchronously, meaning that the message will not be sent until * the receiver acknowledges that it has matched the send with a * receive buffer. + * @li PSM2_MQ_FLAG_INJECT tells PSM to consume the send buffer + * immediately to comply with FI_INJECT/fi_inject behavior, + * cannot be used in conjunction with PSM2_MQ_FLAG_SENDSYNC. * @param[in] stag Message Send Tag * @param[in] buf Source buffer pointer * @param[in] len Length of message starting at @c buf. @@ -841,6 +856,9 @@ psm3_mq_isend(psm2_mq_t mq, psm2_epaddr_t dest, uint32_t flags, uint64_t stag, * synchronously, meaning that the message will not be sent until * the receiver acknowledges that it has matched the send with a * receive buffer. + * @li PSM2_MQ_FLAG_INJECT tells PSM to consume the send buffer + * immediately to comply with FI_INJECT/fi_inject behavior, + * cannot be used in conjunction with PSM2_MQ_FLAG_SENDSYNC. * @param[in] stag Message Send Tag, array of three 32-bit values. * @param[in] buf Source buffer pointer * @param[in] len Length of message starting at @c buf. diff --git a/prov/psm3/psm3/psm_config.h b/prov/psm3/psm3/psm_config.h index 479b6f9d732..4ce7de78157 100644 --- a/prov/psm3/psm3/psm_config.h +++ b/prov/psm3/psm3/psm_config.h @@ -99,6 +99,11 @@ /* #define PSM_PROFILE */ #endif +// If defined, for FI_INJECT Send DMA will be avoided +#ifndef PSM_INJECT_NOSDMA +/* #define PSM_INJECT_NOSDMA */ +#endif + #define PSMI_MIN_EP_CONNECT_TIMEOUT (2 * SEC_ULL) #define PSMI_MIN_EP_CLOSE_TIMEOUT (1 * SEC_ULL) #define PSMI_MAX_EP_CLOSE_TIMEOUT (2 * SEC_ULL) @@ -174,9 +179,21 @@ #define PSM_MQ_NIC_MAX_TINY 8 /* max TINY payload allowed */ +#define PSM_MQ_NIC_RNDV_THRESH 64000 +#define PSM_CPU_NIC_RNDV_WINDOW_STR "131072" +#ifdef PSM_CUDA +#define PSM_GPU_NIC_RNDV_WINDOW_STR "2097152" +#elif defined(PSM_ONEAPI) +#define PSM_GPU_NIC_RNDV_WINDOW_STR "131072:524287,262144:1048575,524288" +#endif #define PSM_MQ_NIC_MAX_RNDV_WINDOW (4 * 1024 * 1024) /* max rndv window */ #define MQ_SHM_THRESH_RNDV 16000 +#if defined(PSM_CUDA) +#define MQ_SHM_GPU_THRESH_RNDV 127 +#elif defined(PSM_ONEAPI) +#define MQ_SHM_GPU_THRESH_RNDV 127 +#endif // LEARN_HASH_SELECTOR has PSM3 dynamically learn the combinations // of src_addr presence and tagsel used by a given middleware. 
This diff --git a/prov/psm3/psm3/psm_context.c b/prov/psm3/psm3/psm_context.c index 047cfbc38a3..35477d69f2f 100644 --- a/prov/psm3/psm3/psm_context.c +++ b/prov/psm3/psm3/psm_context.c @@ -58,7 +58,6 @@ #include "psm_user.h" #include "psm2_hal.h" -static int psmi_parse_nic_selection_algorithm(void); static psm2_error_t psm3_ep_verify_pkey(psm2_ep_t ep, uint16_t pkey, uint16_t *opkey, uint16_t* oindex); @@ -92,481 +91,6 @@ int psm3_context_interrupt_isenabled(psm2_ep_t ep) return psmi_hal_has_sw_status(PSM_HAL_PSMI_RUNTIME_INTR_ENABLED); } - -/* returns the 8-bit hash value of an uuid. */ -static inline -uint8_t -psm3_get_uuid_hash(psm2_uuid_t const uuid) -{ - int i; - uint8_t hashed_uuid = 0; - - for (i=0; i < sizeof(psm2_uuid_t); ++i) - hashed_uuid ^= *((uint8_t const *)uuid + i); - - return hashed_uuid; -} - -int psm3_get_current_proc_location() -{ - int core_id, node_id; - - core_id = sched_getcpu(); - if (core_id < 0) - return -EINVAL; - - node_id = numa_node_of_cpu(core_id); - if (node_id < 0) - return -EINVAL; - - return node_id; -} - -// print a bitmask in condensed form at _HFI_VBG level -// condensed form consolidates sequential numbers such as: "0-43,88-131" -static void vdbg_print_bitmask(const char* prefix, struct bitmask *bmp) -{ - if (_HFI_VDBG_ON) { - int i, len; - char buf[1024]; - int last=-1; - int first=-1; - int max = numa_num_possible_nodes(); - - snprintf(buf, sizeof(buf), "%s", prefix); - len = strlen(buf); - for (i=0; i 1) { - if (first == last) { - // first in a possible sequence - snprintf(&buf[len], sizeof(buf)-len, ",%d", i); - } else { - // complete prior sequence, first in a new sequence - snprintf(&buf[len], sizeof(buf)-len, "-%d,%d", last, i); - } - first = i; - last = first; - } else { - last = i; - } - len = strlen(buf); - } - // complete prior sequence as needed - if (first>=0 && first != last) - snprintf(&buf[len], sizeof(buf)-len, "-%d", last); - _HFI_VDBG("%s\n", buf); - } -} - -// return the largest possible numa ID of a CPU in this system -int psm3_get_max_cpu_numa() -{ - static int max_cpu_numa = -1; - struct bitmask *cpumask, *empty_cpumask; - int i; - - if (max_cpu_numa >= 0) - return max_cpu_numa; - - // we don't depend on numa_num_configured_nodes since in theory there - // could be non-CPU memory NUMA nodes. We only need to know the - // largest possible value for a CPU numa node ID - - // numa_max_node - largest NUMA node which is not disabled - // numa_node_to_cpus - given a NUMA node, create list of CPUs - // numa_node_of_cpu - cpu ID to NUMA (or error if invalid CPU) - // numa_node_to_cpus - cpumask of CPUs on given NUMA node - - max_cpu_numa = -1; - empty_cpumask = numa_allocate_cpumask(); - numa_bitmask_clearall(empty_cpumask); - //vdbg_print_bitmask("empty_cpumask: ", empty_cpumask); - - cpumask = numa_allocate_cpumask(); - _HFI_VDBG("numa_max_node=%d\n", numa_max_node()); - for (i=numa_max_node(); i >= 0; i--) { - numa_bitmask_clearall(cpumask); - int ret = numa_node_to_cpus(i, cpumask); - _HFI_VDBG("i=%d node_to_cpus ret=%d\n", i, ret); - vdbg_print_bitmask("cpumask: ", cpumask); - if (ret >= 0 && ! numa_bitmask_equal(cpumask, empty_cpumask)) { - max_cpu_numa = i; - break; - } - } - numa_free_cpumask(cpumask); - numa_free_cpumask(empty_cpumask); - psmi_assert_always(max_cpu_numa >= 0); - return max_cpu_numa; -} - -/* search the list of all units for those which are active - * and optionally match the given NUMA node_id (when node_id >= 0) - * returns the number of active units found. 
- * Note get_unit_active tests for active ports, valid addresses and - * performs filtering as done in get_port_subnets - */ -static int -hfi_find_active_hfis(int nunits, int node_id, int *saved_hfis) -{ - int found = 0, unit_id; - - for (unit_id = 0; unit_id < nunits; unit_id++) { - int node_id_i; - - if (psmi_hal_get_unit_active(unit_id) <= 0) - continue; - - if (node_id < 0) { - saved_hfis[found++] = unit_id; - _HFI_DBG("RoundRobinAll Found NIC unit= %d, local rank=%d.\n", - unit_id, psm3_get_mylocalrank()); - } else if (!psmi_hal_get_node_id(unit_id, &node_id_i) - && node_id_i == node_id) { - saved_hfis[found++] = unit_id; - _HFI_DBG("RoundRobin Found NIC unit= %d, node = %d, local rank=%d.\n", - unit_id, node_id, psm3_get_mylocalrank()); - } - } - return found; -} - -static void -psmi_spread_nic_selection(psm2_uuid_t const job_key, long *unit_start, - long *unit_end, int nunits) -{ - { - int found, saved_hfis[nunits]; - - /* else, we are going to look at: - (a hash of the job key plus the local rank id) mod nunits. */ - found = hfi_find_active_hfis(nunits, -1, saved_hfis); - if (found) - *unit_start = saved_hfis[((psm3_get_mylocalrank()+1) + - psm3_get_uuid_hash(job_key)) % found]; - else - *unit_start = 0; // caller will fail - /* just in case, caller will check all other units, with wrap */ - if (*unit_start > 0) - *unit_end = *unit_start - 1; - else - *unit_end = nunits-1; - } - _HFI_DBG("RoundRobinAll Will select 1st viable NIC unit= %ld to %ld.\n", - *unit_start, *unit_end); -} - -static int -psm3_create_and_open_affinity_shm(psm2_uuid_t const job_key) -{ - int shm_fd, ret; - int first_to_create = 0; - size_t shm_name_len = 256; - - psmi_assert_always(psm3_affinity_semaphore_open); - if (psm3_affinity_shared_file_opened) { - /* opened and have our reference counted in shm */ - psmi_assert_always(psm3_affinity_shm_name != NULL); - psmi_assert_always(psm3_shared_affinity_ptr != NULL); - return 0; - } - - psm3_shared_affinity_ptr = NULL; - psm3_affinity_shm_name = (char *) psmi_malloc(PSMI_EP_NONE, UNDEFINED, shm_name_len); - - psmi_assert_always(psm3_affinity_shm_name != NULL); - snprintf(psm3_affinity_shm_name, shm_name_len, - AFFINITY_SHM_BASENAME".%d", - psm3_get_uuid_hash(job_key)); - shm_fd = shm_open(psm3_affinity_shm_name, O_RDWR | O_CREAT | O_EXCL, - S_IRUSR | S_IWUSR); - if ((shm_fd < 0) && (errno == EEXIST)) { - shm_fd = shm_open(psm3_affinity_shm_name, O_RDWR, S_IRUSR | S_IWUSR); - if (shm_fd < 0) { - _HFI_VDBG("Cannot open affinity shared mem fd:%s, errno=%d\n", - psm3_affinity_shm_name, errno); - goto free_name; - } - } else if (shm_fd >= 0) { - first_to_create = 1; - } else { - _HFI_VDBG("Cannot create affinity shared mem fd:%s, errno=%d\n", - psm3_affinity_shm_name, errno); - goto free_name; - } - - ret = ftruncate(shm_fd, PSMI_PAGESIZE); - if ( ret < 0 ) { - _HFI_VDBG("Cannot truncate affinity shared mem fd:%s, errno=%d\n", - psm3_affinity_shm_name, errno); - goto close_shm; - } - - psm3_shared_affinity_ptr = (uint64_t *) mmap(NULL, PSMI_PAGESIZE, PROT_READ | PROT_WRITE, - MAP_SHARED, shm_fd, 0); - if (psm3_shared_affinity_ptr == MAP_FAILED) { - _HFI_VDBG("Cannot mmap affinity shared memory: %s, errno=%d\n", - psm3_affinity_shm_name, errno); - goto close_shm; - } - close(shm_fd); - shm_fd = -1; - - if (first_to_create) { - _HFI_VDBG("Initializing shm to store NIC affinity per socket: %s\n", psm3_affinity_shm_name); - - memset(psm3_shared_affinity_ptr, 0, PSMI_PAGESIZE); - - /* - * Once shm object is initialized, unlock others to be able to - * use it. 
- */ - psmi_sem_post(psm3_sem_affinity_shm_rw, psm3_sem_affinity_shm_rw_name); - } else { - _HFI_VDBG("Opened shm object to read/write NIC affinity per socket: %s\n", psm3_affinity_shm_name); - } - - /* - * Start critical section to increment reference count when creating - * or opening shm object. Decrement of ref count will be done before - * closing the shm. - */ - if (psmi_sem_timedwait(psm3_sem_affinity_shm_rw, psm3_sem_affinity_shm_rw_name)) { - _HFI_VDBG("Could not enter critical section to update shm refcount\n"); - goto unmap_shm; - } - - psm3_shared_affinity_ptr[AFFINITY_SHM_REF_COUNT_LOCATION] += 1; - _HFI_VDBG("shm refcount = %"PRId64"\n", psm3_shared_affinity_ptr[AFFINITY_SHM_REF_COUNT_LOCATION]); - - /* End critical section */ - psmi_sem_post(psm3_sem_affinity_shm_rw, psm3_sem_affinity_shm_rw_name); - - psm3_affinity_shared_file_opened = 1; - - return 0; - -unmap_shm: - munmap(psm3_shared_affinity_ptr, PSMI_PAGESIZE); - psm3_shared_affinity_ptr = NULL; -close_shm: - if (shm_fd >= 0) close(shm_fd); -free_name: - psmi_free(psm3_affinity_shm_name); - psm3_affinity_shm_name = NULL; - return -1; -} - -/* - * Spread HFI selection between units if we find more than one within a socket. - */ -static void -psmi_spread_hfi_within_socket(long *unit_start, long *unit_end, int node_id, - int *saved_hfis, int found, psm2_uuid_t const job_key) -{ - int ret, shm_location; - - /* - * Take affinity lock and open shared memory region to be able to - * accurately determine which HFI to pick for this process. If any - * issues, bail by picking first known HFI. - */ - if (!psm3_affinity_semaphore_open) - goto spread_hfi_fallback; - - ret = psm3_create_and_open_affinity_shm(job_key); - if (ret < 0) - goto spread_hfi_fallback; - - shm_location = AFFINITY_SHM_HFI_INDEX_LOCATION + node_id; - if (shm_location > PSMI_PAGESIZE) - goto spread_hfi_fallback; - - /* Start critical section to read/write shm object */ - if (psmi_sem_timedwait(psm3_sem_affinity_shm_rw, psm3_sem_affinity_shm_rw_name)) { - _HFI_VDBG("Could not enter critical section to update NIC index\n"); - goto spread_hfi_fallback; - } - - *unit_start = *unit_end = saved_hfis[psm3_shared_affinity_ptr[shm_location]]; - psm3_shared_affinity_ptr[shm_location] = - (psm3_shared_affinity_ptr[shm_location] + 1) % found; - _HFI_DBG("RoundRobin Selected NIC unit= %ld, Next NIC=%ld, node = %d, local rank=%d, found=%d.\n", - *unit_start, psm3_shared_affinity_ptr[shm_location], node_id, - psm3_get_mylocalrank(), found); - - /* End Critical Section */ - psmi_sem_post(psm3_sem_affinity_shm_rw, psm3_sem_affinity_shm_rw_name); - - return; - -spread_hfi_fallback: - *unit_start = *unit_end = saved_hfis[0]; -} - -static void -psm3_create_affinity_semaphores(psm2_uuid_t const job_key) -{ - int ret; - size_t sem_len = 256; - - /* - * If already opened, no need to do anything else. - * This could be true for Multi-EP cases where a different thread has - * already created the semaphores. 
We don't need separate locks here as - * we are protected by the overall "psm3_creation_lock" which each - * thread will take in psm3_ep_open() - */ - if (psm3_affinity_semaphore_open) - return; - - psm3_sem_affinity_shm_rw_name = (char *) psmi_malloc(PSMI_EP_NONE, UNDEFINED, sem_len); - psmi_assert_always(psm3_sem_affinity_shm_rw_name != NULL); - snprintf(psm3_sem_affinity_shm_rw_name, sem_len, - SEM_AFFINITY_SHM_RW_BASENAME".%d", - psm3_get_uuid_hash(job_key)); - - ret = psmi_init_semaphore(&psm3_sem_affinity_shm_rw, psm3_sem_affinity_shm_rw_name, - S_IRUSR | S_IWUSR, 0); - if (ret) { - _HFI_VDBG("Cannot initialize semaphore: %s for read-write access to shm object.\n", - psm3_sem_affinity_shm_rw_name); - if (psm3_sem_affinity_shm_rw) - sem_close(psm3_sem_affinity_shm_rw); - psmi_free(psm3_sem_affinity_shm_rw_name); - psm3_sem_affinity_shm_rw_name = NULL; - return; - } - - _HFI_VDBG("Semaphore: %s created for read-write access to shm object.\n", - psm3_sem_affinity_shm_rw_name); - - psm3_affinity_semaphore_open = 1; - - return; -} - -// return set of units to consider and which to start at. -// caller will use 1st active unit which can be opened. -// caller will wrap around so it's valid for start > end -// Note: When using multiple rails per PSM process, higher level code will -// walk through desired units and unit_param will specify a specific unit -static -psm2_error_t -psmi_compute_start_and_end_unit(long unit_param, long addr_index, - int nunitsactive,int nunits, - psm2_uuid_t const job_key, - long *unit_start,long *unit_end) -{ - unsigned short nic_sel_alg = PSMI_UNIT_SEL_ALG_ACROSS; - int node_id, found = 0; - int saved_hfis[nunits]; - - /* if the user did not set PSM3_NIC then ... */ - if (unit_param == PSM3_NIC_ANY) - { - if (nunitsactive > 1) { - // if NICs are on different planes (non-routed subnets) - // we need to have all ranks default to the same plane - // so force 1st active NIC in that case - int have_subnet = 0, unit_id; - psmi_subnet128_t got_subnet = { }; - for (unit_id = 0; unit_id < nunits; unit_id++) { - psmi_subnet128_t subnet; - if (psmi_hal_get_unit_active(unit_id) <= 0) - continue; - if (0 != psmi_hal_get_port_subnet(unit_id, 1 /* VERBS_PORT*/, - addr_index>0?addr_index:0, - &subnet, NULL, NULL, NULL)) - continue; // can't access NIC - if (! have_subnet) { - have_subnet = 1; - got_subnet = subnet; - } else if (! psm3_subnets_match(got_subnet, - subnet)) { - // active units have different tech - // (IB/OPA vs Eth) or different subnets - // caller will pick 1st active unit - *unit_start = 0; - *unit_end = nunits - 1; - _HFI_DBG("Multi-Plane config: Will select 1st viable NIC unit= %ld to %ld.\n", - *unit_start, *unit_end); - return PSM2_OK; - } - } - } - - /* Get the actual selection algorithm from the environment: */ - nic_sel_alg = psmi_parse_nic_selection_algorithm(); - /* If round-robin is selection algorithm and ... */ - if ((nic_sel_alg == PSMI_UNIT_SEL_ALG_ACROSS) && - /* there are more than 1 active units then ... */ - (nunitsactive > 1)) - { - /* - * Pick first HFI we find on same root complex - * as current task. If none found, fall back to - * RoundRobinAll load-balancing algorithm. 
- */ - node_id = psm3_get_current_proc_location(); - if (node_id >= 0) { - found = hfi_find_active_hfis(nunits, node_id, - saved_hfis); - if (found > 1) { - psm3_create_affinity_semaphores(job_key); - psmi_spread_hfi_within_socket(unit_start, unit_end, - node_id, saved_hfis, - found, job_key); - } else if (found == 1) { - *unit_start = *unit_end = saved_hfis[0]; - _HFI_DBG("RoundRobin Selected NIC unit= %ld, node = %d, local rank=%d, found=%d.\n", - *unit_start, node_id, - psm3_get_mylocalrank(), found); - } - } - - if (node_id < 0 || !found) { - _HFI_DBG("RoundRobin No local NIC found, using RoundRobinAll, node = %d, local rank=%d, found=%d.\n", - node_id, - psm3_get_mylocalrank(), found); - psmi_spread_nic_selection(job_key, unit_start, - unit_end, nunits); - } - } else if ((nic_sel_alg == PSMI_UNIT_SEL_ALG_ACROSS_ALL) && - (nunitsactive > 1)) { - psmi_spread_nic_selection(job_key, unit_start, - unit_end, nunits); - } - else { // PSMI_UNIT_SEL_ALG_WITHIN or only 1 active unit - // caller will pick 1st active unit - *unit_start = 0; - *unit_end = nunits - 1; - _HFI_DBG("%s: Will select 1st viable NIC unit= %ld to %ld.\n", - (nic_sel_alg == PSMI_UNIT_SEL_ALG_WITHIN) - ?"Packed":"Only 1 viable NIC", - *unit_start, *unit_end); - } - } else if (unit_param >= 0) { - /* the user specified PSM3_NIC, we use it. */ - *unit_start = *unit_end = unit_param; - _HFI_DBG("Caller selected NIC %ld.\n", *unit_start); - } else { - psm3_handle_error(NULL, PSM2_EP_DEVICE_FAILURE, - "PSM3 can't open unit: %ld for reading and writing", - unit_param); - return PSM2_EP_DEVICE_FAILURE; - } - - return PSM2_OK; -} - static int psmi_hash_addr_index(long unit, long port, long addr_index) { /* if the user did not set addr_index, then use a hash */ @@ -578,6 +102,9 @@ static int psmi_hash_addr_index(long unit, long port, long addr_index) return addr_index; } +// Open a single NIC. +// if unit_param is PSM3_NIC_ANY, the chosen PSM3_NIC_SELECTION_ALG will be +// used to pick a single active NIC psm2_error_t psm3_context_open(const psm2_ep_t ep, long unit_param, long port, long addr_index, psm2_uuid_t const job_key, uint16_t network_pkey, @@ -620,15 +147,15 @@ psm3_context_open(const psm2_ep_t ep, long unit_param, long port, long addr_inde unit_start = 0; unit_end = nunits - 1; - err = psmi_compute_start_and_end_unit(unit_param, addr_index, + err = psm3_compute_start_and_end_unit(unit_param, addr_index, nunitsactive, nunits, job_key, &unit_start, &unit_end); if (err != PSM2_OK) goto ret; - /* this is the start of a loop that starts at unit_start and goes to unit_end. - but note that the way the loop computes the loop control variable is by - an expression involving the mod operator. */ + /* Loop from unit_start to unit_end inclusive and pick 1st active found + * As needed wrap, so it's valid for unit_start >= unit_end + */ int success = 0; unit_id_prev = unit_id = unit_start; do @@ -645,6 +172,10 @@ psm3_context_open(const psm2_ep_t ep, long unit_param, long port, long addr_inde psmi_hash_addr_index(unit_id, port, addr_index), open_timeout, ep, job_key, HAL_CONTEXT_OPEN_RETRY_MAX)) { + // in modes where we refcount NIC use, + // psm3_compute_start_and_end_unit will have returned exactly + // 1 NIC and refcount'ed it, so we dec refcount here + psm3_dec_nic_refcount(unit_id); /* go to next unit if failed to open. 
*/
 			unit_id_prev = unit_id;
 			unit_id = (unit_id + 1) % nunits;
@@ -709,6 +240,7 @@ psm3_context_open(const psm2_ep_t ep, long unit_param, long port, long addr_inde
 close:
 	psmi_hal_close_context(ep);
+	psm3_dec_nic_refcount(ep->unit_id);
 bail:
 	_HFI_PRDBG("open failed: unit_id: %ld, err: %d (%s)\n", unit_id, err, strerror(errno));
 ret:
@@ -720,16 +252,21 @@ psm3_context_open(const psm2_ep_t ep, long unit_param, long port, long addr_inde
 psm2_error_t psm3_context_close(psm2_ep_t ep)
 {
 	psmi_hal_close_context(ep);
+	psm3_dec_nic_refcount(ep->unit_id);
 	return PSM2_OK;
 }
 
+// up to 4 digits per CPU number, plus a comma or dash
+#define MAX_CPU_AFFINITY_STRING (CPU_SETSIZE * 5)
+
 static inline char * _dump_cpu_affinity(char *buf, size_t buf_size, cpu_set_t * cpuset)
 {
 	int i;
-	int isfirst = 1;
-	char tmp[25]; //%d = 10 :: 10 + '-' + 10 + ',' + '\0' = 23
+	char tmp[25]; //%d, = 10+','+\0 or %d-%d, = 10 + '-' + 10 + ',' + '\0' = 23
 	int first = -1, last = -1;
+	int len = 0;
+	*buf = '\0';
 	for (i = 0; i < CPU_SETSIZE; i++) {
 		if (CPU_ISSET(i, cpuset)) {
 			if (first == -1) {
@@ -745,13 +282,8 @@ static inline char * _dump_cpu_affinity(char *buf, size_t buf_size, cpu_set_t *
 				}
 
 				first = last = -1;
-				if (isfirst) {
-					strncpy(buf, tmp, buf_size-1);
-					isfirst=0;
-				} else {
-					strncat(buf, tmp, buf_size-1);
-				}
-				buf[buf_size-1] = '\0';
+				snprintf(&buf[len], buf_size-len,"%s", tmp);
+				len = strlen(buf);
 			}
 		}
 	}
@@ -761,26 +293,48 @@ static inline char * _dump_cpu_affinity(char *buf, size_t buf_size, cpu_set_t *
 		} else {
 			snprintf(tmp, sizeof(tmp), "%d-%d,", first, last);
 		}
-		if (isfirst) {
-			strncpy(buf, tmp, buf_size-1);
-		} else {
-			strncat(buf, tmp, buf_size-1);
-		}
-		buf[buf_size-1] = '\0';
+		snprintf(&buf[len], buf_size-len,"%s", tmp);
+		len = strlen(buf);
 	}
-	char *comma = strrchr(buf, ',');
-	if (comma) comma[0] = '\0';
+	if (len)
+		buf[len-1] = '\0'; // eliminate trailing comma
 
 	return buf;
 }
 
-// called by HAL context_open to set affinity consistent with
-// NIC NUMA location when NIC NUMA location is a superset of thread CPU set
-// TBD unclear when this provides value.
+// called by HAL context_open to set CPU affinity narrower consistent with
+// NIC NUMA location
+// Intel MPI sets PSM3_NO_CPUAFFINITY to disable this function
+// Suspect this is not effective or has bugs.  For Omni-Path the NIC
+// driver set affinity before this was called, and this was thus likely a noop.
+// This is a noop if:
+// - if NIC is not NUMA local to any of the CPUs in existing affinity
+// - if existing affinity selects more cores than those local to NIC
+//   even if that set incompletely overlaps the NIC local core set
+//   suspect this is a bug and test should be opposite or just test
+//   for overlap.
+// if NIC is NUMA local to CPU, and NIC core list is larger than existing
+// affinity, will limit scope of affinity to cores NUMA local to NIC
+// - does not consider the full set of selected NICs when multirail enabled
+// - may only provide value if CPU set from caller is small but > 1 CPU NUMA
+//   domain in which case this will reduce it to a single CPU NUMA domain
+//   matching the NIC's NUMA location.
+//
+// By default this is enabled, but two undocumented variables
+// PSM3_FORCE_CPUAFFINITY and PSM3_NO_CPUAFFINITY can control this
+// as well as the ep_open skip_affinity flag.
+// // May be better if we analyzed NIC NUMA location and various other // process and thread locations when NIC NUMA is a subset of CPU affinity // and guide a good choice for CPU affinity, but that would require // intra-node process coordination to avoid duplicate CPU selections +// +// TBD for GPU affinity this may not make sense. Also PSM3 can't force a GPU +// selection for an app. +// +// TBD when PSM3 is using multiple NICs (PSM3_MULTIRAIL > 0) this should +// be enhanced to attempt to select a CPU based on location of all NICs being +// used, not just a single NIC. int psm3_context_set_affinity(psm2_ep_t ep, int unit) { @@ -796,8 +350,9 @@ psm3_context_set_affinity(psm2_ep_t ep, int unit) } if (_HFI_DBG_ON) { - char cpu_buf[128] = {0}; - _HFI_DBG_ALWAYS( "CPU affinity Before set: %s\n", _dump_cpu_affinity(cpu_buf, 128, &cpuset)); + char cpu_buf[MAX_CPU_AFFINITY_STRING] = {0}; + _HFI_DBG_ALWAYS( "CPU affinity Before set: %s\n", + _dump_cpu_affinity(cpu_buf, MAX_CPU_AFFINITY_STRING, &cpuset)); } /* @@ -837,10 +392,11 @@ psm3_context_set_affinity(psm2_ep_t ep, int unit) //err = -PSM_HAL_ERROR_GENERAL_ERROR; goto bail; } else if (cpu_and_count == 0 && _HFI_DBG_ON) { - char buf1[128] = {0}; - char buf2[128] = {0}; + char buf1[MAX_CPU_AFFINITY_STRING] = {0}; + char buf2[MAX_CPU_AFFINITY_STRING] = {0}; _HFI_DBG_ALWAYS( "CPU affinity not set, NIC selected is not on the same socket as thread (\"%s\" & \"%s\" == 0).\n", - _dump_cpu_affinity(buf1, 128, &nic_cpuset), _dump_cpu_affinity(buf2, 128, &cpuset)); + _dump_cpu_affinity(buf1, MAX_CPU_AFFINITY_STRING, &nic_cpuset), + _dump_cpu_affinity(buf2, MAX_CPU_AFFINITY_STRING, &cpuset)); } } skip_affinity: @@ -852,8 +408,9 @@ psm3_context_set_affinity(psm2_ep_t ep, int unit) "Can't get CPU affinity: %s\n", strerror(errno)); goto bail; } - char cpu_buf[128] = {0}; - _HFI_DBG_ALWAYS( "CPU affinity After set: %s\n", _dump_cpu_affinity(cpu_buf, 128, &cpuset)); + char cpu_buf[MAX_CPU_AFFINITY_STRING] = {0}; + _HFI_DBG_ALWAYS( "CPU affinity After set: %s\n", + _dump_cpu_affinity(cpu_buf, MAX_CPU_AFFINITY_STRING, &cpuset)); } return 0; @@ -904,39 +461,3 @@ psm3_ep_verify_pkey(psm2_ep_t ep, uint16_t pkey, uint16_t *opkey, uint16_t* oind return PSM2_OK; } - -static -int psmi_parse_nic_selection_algorithm(void) -{ - union psmi_envvar_val env_nic_alg; - int nic_alg = PSMI_UNIT_SEL_ALG_ACROSS; - - const char* PSM3_NIC_SELECTION_ALG_HELP = - "NIC Device Selection Algorithm to use. Round Robin[RoundRobin or rr] (Default) " - ", Packed[p] or Round Robin All[RoundRobinAll or rra]."; - - /* If a specific unit is set in the environment, use that one. 
*/ - psm3_getenv("PSM3_NIC_SELECTION_ALG", PSM3_NIC_SELECTION_ALG_HELP, - PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR, - (union psmi_envvar_val)"rr", &env_nic_alg); - - if (!strcasecmp(env_nic_alg.e_str, "Round Robin") - || !strcasecmp(env_nic_alg.e_str, "RoundRobin") - || !strcasecmp(env_nic_alg.e_str, "rr")) - nic_alg = PSMI_UNIT_SEL_ALG_ACROSS; - else if (!strcasecmp(env_nic_alg.e_str, "Packed") - || !strcasecmp(env_nic_alg.e_str, "p")) - nic_alg = PSMI_UNIT_SEL_ALG_WITHIN; - else if (!strcasecmp(env_nic_alg.e_str, "Round Robin All") - || !strcasecmp(env_nic_alg.e_str, "RoundRobinAll") - || !strcasecmp(env_nic_alg.e_str, "rra")) - nic_alg = PSMI_UNIT_SEL_ALG_ACROSS_ALL; - else { - _HFI_INFO( - "Invalid value for PSM3_NIC_SELECTION_ALG ('%s') %-40s Using: %s\n", - env_nic_alg.e_str, PSM3_NIC_SELECTION_ALG_HELP, "RoundRobin"); - nic_alg = PSMI_UNIT_SEL_ALG_ACROSS; - } - - return nic_alg; -} diff --git a/prov/psm3/psm3/psm_context.h b/prov/psm3/psm3/psm_context.h index 188e1284cc4..28339284bcf 100644 --- a/prov/psm3/psm3/psm_context.h +++ b/prov/psm3/psm3/psm_context.h @@ -76,21 +76,4 @@ psm3_context_set_affinity(psm2_ep_t ep, int unit); psm2_error_t psm3_context_interrupt_set(psm2_ep_t ep, int enable); int psm3_context_interrupt_isenabled(psm2_ep_t ep); -/* - * round robin contexts across HFIs, then - * ports; this is the default. - * This option spreads the HFI selection within the local socket. - * If it is preferred to spread job over over entire set of - * HFIs within the system, see ALG_ACROSS_ALL below. - */ -#define PSMI_UNIT_SEL_ALG_ACROSS PSM_HAL_ALG_ACROSS - -#define PSMI_UNIT_SEL_ALG_ACROSS_ALL PSM_HAL_ALG_ACROSS_ALL - -/* - * use all contexts on an HFI (round robin - * active ports within), then next HFI - */ -#define PSMI_UNIT_SEL_ALG_WITHIN PSM_HAL_ALG_WITHIN - #endif /* PSM_CONTEXT_H */ diff --git a/prov/psm3/psm3/psm_ep.c b/prov/psm3/psm3/psm_ep.c index 9e31af3e65c..36dbf40abfa 100644 --- a/prov/psm3/psm3/psm_ep.c +++ b/prov/psm3/psm3/psm_ep.c @@ -119,385 +119,6 @@ psm2_error_t psm3_ep_num_devunits(uint32_t *num_units_o) return PSM2_OK; } -struct rail_info { - psmi_subnet128_t subnet; - unsigned unit; - unsigned port; - unsigned addr_index; -}; - -static int cmpfunc(const void *p1, const void *p2) -{ - struct rail_info *a = ((struct rail_info *) p1); - struct rail_info *b = ((struct rail_info *) p2); - int ret; - - ret = psmi_subnet128_cmp(a->subnet, b->subnet); - if (ret == 0) { - if (a->addr_index < b->addr_index) - return -1; - else if (a->addr_index > b->addr_index) - return 1; - } - return ret; -} - -// process PSM3_MULTIRAIL and PSM3_MULTIRAIL_MAP and return the -// list of unit/port/addr_index in unit[0-(*num_rails-1)], -// port[0-(*num_rails-1)] and addr_index[0-(*num_rails-1)] -// When *num_rails is returned as 0, multirail is not enabled and -// other mechanisms (PSM3_NIC, PSM3_NIC_SELECTION_ALG) must be -// used by the caller to select a single NIC for the process -static psm2_error_t -psm3_ep_multirail(int *num_rails, uint32_t *unit, uint16_t *port, int *addr_index) -{ - uint32_t num_units = 0; - psmi_subnet128_t subnet; - unsigned i, j, k, count = 0; - int ret; - psm2_error_t err = PSM2_OK; - struct rail_info rail_info[PSMI_MAX_RAILS]; - union psmi_envvar_val env_multirail; - union psmi_envvar_val env_multirail_map; - int multirail_within_socket_used = 0; - int node_id = -1, found = 0; - - psm3_getenv("PSM3_MULTIRAIL", - "Use all available NICs in the system for communication.\n" - "-1: No NIC autoselection,\n" - "0: Disabled (default),\n" - "1: Enable 
multirail across all available NICs,\n" - "2: Enable multirail within socket.\n" - "\t For multirail within a socket, we try to find at\n" - "\t least one NIC on the same socket as current task.\n" - "\t If none found, we continue to use other NICs within\n" - "\t the system.", - PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_INT, - (union psmi_envvar_val)0, - &env_multirail); - if (env_multirail.e_int <= 0) { - *num_rails = 0; - return PSM2_OK; - } - - if (env_multirail.e_int == 2) - multirail_within_socket_used = 1; - -/* - * map is in format: unit:port-addr_index,unit:port-addr_index,... - * where :port is optional (default of 1) and unit can be name or number - * -addr_index is also optionall and defaults to "all" - * addr_index can be an integer between 0 and PSM3_ADDR_PER_NIC-1 - * or "any" or "all". "any" selects a single address using the hash and - * "all" setups a rail for each address. - */ -#define MAX_MAP_LEN (PSMI_MAX_RAILS*128) - if (!psm3_getenv("PSM3_MULTIRAIL_MAP", - "NIC selections for each rail in format:\n" - " rail,rail,...\n" -#if 0 - "Where rail can be: unit:port-addr_index or unit\n" -#else - "Where rail can be: unit-addr_index or unit\n" -#endif - "unit can be device name or unit number\n" -#if 0 - "where :port is optional (default of 1)\n" -#endif - "addr_index can be 0 to PSM3_ADDR_PER_NIC-1, or 'any' or 'all'\n" - "When addr_index is omitted, it defaults to 'all'\n" - "default autoselects", - PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR, - (union psmi_envvar_val)"", &env_multirail_map)) { - - char temp[MAX_MAP_LEN+1]; - char *s; - char *delim; - - strncpy(temp, env_multirail_map.e_str, MAX_MAP_LEN); - if (temp[MAX_MAP_LEN-1] != 0) - return psm3_handle_error(NULL, PSM2_EP_DEVICE_FAILURE, - "PSM3_MULTIRAIL_MAP too long: '%s'", - env_multirail_map.e_str); - s = temp; - psmi_assert(*s); - do { - int u, p = 1; - int skip_port = 0; - int skip_addr_index = 0; - int a_index = PSM3_ADDR_INDEX_ALL; - - if (! *s) // trailing ',' on 2nd or later loop - break; - if (count >= PSMI_MAX_RAILS) - return psm3_handle_error(NULL, PSM2_EP_DEVICE_FAILURE, - "PSM3_MULTIRAIL_MAP exceeds %u rails: '%s'", - PSMI_MAX_RAILS, env_multirail_map.e_str); - - // find end of unit field and put in \0 as needed - delim = strpbrk(s, ":-,"); - if (!delim || *delim == ',') { - skip_port = 1; skip_addr_index = 1; - } else if (*delim == '-') { - skip_port = 1; - } - if (delim) - *delim = '\0'; - // parse unit - u = psm3_sysfs_find_unit(s); - if (u < 0) - return psm3_handle_error(NULL, PSM2_EP_DEVICE_FAILURE, - "PSM3_MULTIRAIL_MAP invalid unit: '%s'", s); - // find next field - if (delim) - s = delim+1; - if (! skip_port) { - // find end of port field and put in \0 as needed - delim = strpbrk(s, "-,"); - if (!delim || *delim == ',') - skip_addr_index = 1; - if (delim) - *delim = '\0'; - // parse port - p = psm3_parse_str_long(s); - if (p < 0) - return psm3_handle_error(NULL, PSM2_EP_DEVICE_FAILURE, - "PSM3_MULTIRAIL_MAP invalid port: '%s'", s); - // find next field - if (delim) - s = delim+1; - } - if (! 
skip_addr_index) { - // find end of addr_index field and put in \0 as needed - delim = strchr(s, ','); - if (delim) - *delim = '\0'; - // parse addr_index - if (0 == strcmp(s, "all")) - a_index = PSM3_ADDR_INDEX_ALL; // we will loop below - else if (0 == strcmp(s, "any")) - a_index = PSM3_ADDR_INDEX_ANY; // caller will pick - else { - a_index = psm3_parse_str_long(s); - if (a_index < 0 || a_index >= psm3_addr_per_nic) - return psm3_handle_error(NULL, PSM2_EP_DEVICE_FAILURE, - "PSM3_MULTIRAIL_MAP invalid addr index: '%s'", s); - } - // find next field - if (delim) - s = delim+1; - } - - if (a_index == PSM3_ADDR_INDEX_ALL) { // all - for (a_index = 0; a_index < psm3_addr_per_nic; a_index++) { - if (count >= PSMI_MAX_RAILS) - return psm3_handle_error(NULL, PSM2_EP_DEVICE_FAILURE, - "PSM3_MULTIRAIL_MAP exceeds %u rails: '%s' due to multi-ip", - PSMI_MAX_RAILS, env_multirail_map.e_str); - unit[count] = u; - port[count] = p; - addr_index[count] = a_index; - count++; - } - } else { - unit[count] = u; - port[count] = p; - addr_index[count] = a_index; - count++; - } - } while (delim); - *num_rails = count; - -/* - * Check if any of the port is not usable. Just use addr_index 0 for check - */ - for (i = 0; i < count; i++) { - _HFI_VDBG("rail %d: %u(%s) %u\n", i, - unit[i], psm3_sysfs_unit_dev_name(unit[i]), port[i]); - ret = psmi_hal_get_port_active(unit[i], port[i]); - if (ret <= 0) - return psm3_handle_error(NULL, - PSM2_EP_DEVICE_FAILURE, - "PSM3_MULTIRAIL_MAP: Unit/port: %d(%s):%d is not active.", - unit[i], psm3_sysfs_unit_dev_name(unit[i]), - port[i]); - ret = psmi_hal_get_port_lid(unit[i], port[i], 0 /* addr_index*/); - if (ret <= 0) - return psm3_handle_error(NULL, - PSM2_EP_DEVICE_FAILURE, - "PSM3_MULTIRAIL_MAP: unit %d(%s):%d was filtered out, unable to use", - unit[i], psm3_sysfs_unit_dev_name(unit[i]), - port[i]); - ret = psmi_hal_get_port_subnet(unit[i], port[i], 0 /* addr_index*/, NULL, NULL, NULL, NULL); - if (ret == -1) - return psm3_handle_error(NULL, - PSM2_EP_DEVICE_FAILURE, - "PSM3_MULTIRAIL_MAP: Couldn't get subnet for unit %d(%s):%d", - unit[i], psm3_sysfs_unit_dev_name(unit[i]), - port[i]); - } - return PSM2_OK; - } - - if ((err = psm3_ep_num_devunits(&num_units))) { - return err; - } - if (num_units > PSMI_MAX_RAILS) { - _HFI_INFO - ("Found %d units, max %d units are supported, use %d\n", - num_units, PSMI_MAX_RAILS, PSMI_MAX_RAILS); - num_units = PSMI_MAX_RAILS; - } - - /* - * PSM3_MULTIRAIL=2 functionality- - * - Try to find at least find one HFI in the same root - * complex. If none found, continue to run and - * use remaining HFIs in the system. - * - If we do find at least one HFI in same root complex, we - * go ahead and add to list. - */ - if (multirail_within_socket_used) { - node_id = psm3_get_current_proc_location(); - for (i = 0; i < num_units; i++) { - if (psmi_hal_get_unit_active(i) <= 0) - continue; - int node_id_i; - - if (!psmi_hal_get_node_id(i, &node_id_i)) { - if (node_id_i == node_id) { - found = 1; - break; - } - } - } - } -/* - * Get all the ports and addr_index with a valid lid and gid, one port per unit. 
- * but up to PSM3_ADDR_PER_NIC addresses - */ - for (i = 0; i < num_units; i++) { - int node_id_i; - - if (!psmi_hal_get_node_id(i, &node_id_i)) - { - if (multirail_within_socket_used && - found && (node_id_i != node_id)) - continue; - } - - for (j = PSM3_NIC_MIN_PORT; j <= PSM3_NIC_MAX_PORT; j++) { - int got_port = 0; - for (k = 0; k < psm3_addr_per_nic; k++) { - ret = psmi_hal_get_port_lid(i, j, k); - if (ret <= 0) - continue; - ret = psmi_hal_get_port_subnet(i, j, k, &subnet, NULL, NULL, NULL); - if (ret == -1) - continue; - - rail_info[count].subnet = subnet; - rail_info[count].unit = i; - rail_info[count].port = j; - rail_info[count].addr_index = k; - got_port = 1; - count++; - } - if (got_port) // one port per unit - break; - } - } - -/* - * Sort all the ports within rail_info from small to big. - * This is for multiple fabrics, and we use fabric with the - * smallest subnet to make the master connection. - */ - qsort(rail_info, count, sizeof(rail_info[0]), cmpfunc); - - for (i = 0; i < count; i++) { - unit[i] = rail_info[i].unit; - port[i] = rail_info[i].port; - addr_index[i] = rail_info[i].addr_index; - } - *num_rails = count; - return PSM2_OK; -} - -// this is used to find devices with the same address as another process, -// implying intra-node comms. -// we poplate hfi_nids and nnids with the set of network ids (NID) for -// all the local NICs. -// The caller will see if any of these NIDs match the NID of the remote process. -// Note that NIDs are globally unique and include both subnet and NIC address -// information, so we can compare them regardless of their subnet. -// NIDs which are not on the same subnet will not match. -// NIDs on the same subnet only match if they are the same NIC. -// Two local NICs with the same subnet and same address is an unexpected -// invalid config, and will silently match the two NICs. 
-#define MAX_GID_IDX 31 -static psm2_error_t -psm3_ep_devnids(psm2_nid_t **nids, uint32_t *num_nids_o) -{ - uint32_t num_units = 0; - int i; - psm2_error_t err = PSM2_OK; - - PSMI_ERR_UNLESS_INITIALIZED(NULL); - - if (hfi_nids == NULL) { - if ((err = psm3_ep_num_devunits(&num_units))) - goto fail; - hfi_nids = (psm2_nid_t *) - psmi_calloc(PSMI_EP_NONE, UNDEFINED, - num_units * psmi_hal_get_num_ports()*psm3_addr_per_nic, sizeof(*hfi_nids)); - if (hfi_nids == NULL) { - err = psm3_handle_error(NULL, PSM2_NO_MEMORY, - "Couldn't allocate memory for dev_nids structure"); - goto fail; - } - - for (i = 0; i < num_units; i++) { - int j; - for (j = PSM3_NIC_MIN_PORT; j <= PSM3_NIC_MAX_PORT; j++) { - int k; - for (k = 0; k < psm3_addr_per_nic; k++) { - int lid = psmi_hal_get_port_lid(i, j, k); - int ret, idx = 0; - psmi_subnet128_t subnet = { }; - psmi_naddr128_t addr = { }; - psmi_gid128_t gid = { }; - - // skip ports which aren't ready for use - if (lid <= 0) - continue; - ret = psmi_hal_get_port_subnet(i, j, k, &subnet, &addr, &idx, &gid); - if (ret == -1) - continue; - hfi_nids[nnids] = psm3_build_nid(i, addr, lid); - _HFI_VDBG("NIC unit %d, port %d addr_index %d, found %s " - "GID[%d] %s subnet %s\n", - i, j, k, - psm3_nid_fmt(hfi_nids[nnids], 0), - idx, psm3_gid128_fmt(gid, 1), - psm3_subnet128_fmt(subnet, 2)); - nnids++; - } - } - } - if (nnids == 0) { - err = psm3_handle_error(NULL, PSM2_EP_DEVICE_FAILURE, - "Couldn't find any unfiltered units"); - goto fail; - } - } - *nids = hfi_nids; - *num_nids_o = nnids; - -fail: - return err; -} - psm2_error_t psm3_ep_query(int *num_of_epinfo, psm2_epinfo_t *array_of_epinfo) { psm2_error_t err = PSM2_OK; @@ -632,6 +253,80 @@ psm2_error_t psm3_epaddr_to_epid(psm2_epaddr_t epaddr, psm2_epid_t *epid) return err; } +// this is used to find devices with the same address as another process, +// implying intra-node comms. +// we poplate hfi_nids and nnids with the set of network ids (NID) for +// all the local NICs. +// The caller will see if any of these NIDs match the NID of the remote process. +// Note that NIDs are globally unique and include both subnet and NIC address +// information, so we can compare them regardless of their subnet. +// NIDs which are not on the same subnet will not match. +// NIDs on the same subnet only match if they are the same NIC. +// Two local NICs with the same subnet and same address is an unexpected +// invalid config, and will silently match the two NICs. 
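+// e.g. (purely illustrative addresses, not taken from this patch): a remote
+// process whose NID was built from address 10.0.0.5 is treated as intra-node
+// only if one of our local NICs also reports address 10.0.0.5 on that same
+// subnet.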
+#define MAX_GID_IDX 31 +static psm2_error_t +psm3_ep_devnids(psm2_nid_t **nids, uint32_t *num_nids_o) +{ + uint32_t num_units = 0; + int i; + psm2_error_t err = PSM2_OK; + + PSMI_ERR_UNLESS_INITIALIZED(NULL); + + if (hfi_nids == NULL) { + if ((err = psm3_ep_num_devunits(&num_units))) + goto fail; + hfi_nids = (psm2_nid_t *) + psmi_calloc(PSMI_EP_NONE, UNDEFINED, + num_units * psmi_hal_get_num_ports()*psm3_addr_per_nic, sizeof(*hfi_nids)); + if (hfi_nids == NULL) { + err = psm3_handle_error(NULL, PSM2_NO_MEMORY, + "Couldn't allocate memory for dev_nids structure"); + goto fail; + } + + for (i = 0; i < num_units; i++) { + int j; + for (j = PSM3_NIC_MIN_PORT; j <= PSM3_NIC_MAX_PORT; j++) { + int k; + for (k = 0; k < psm3_addr_per_nic; k++) { + int lid = psmi_hal_get_port_lid(i, j, k); + int ret, idx = 0; + psmi_subnet128_t subnet = { }; + psmi_naddr128_t addr = { }; + psmi_gid128_t gid = { }; + + // skip ports which aren't ready for use + if (lid <= 0) + continue; + ret = psmi_hal_get_port_subnet(i, j, k, &subnet, &addr, &idx, &gid); + if (ret == -1) + continue; + hfi_nids[nnids] = psm3_build_nid(i, addr, lid); + _HFI_VDBG("NIC unit %d, port %d addr_index %d, found %s " + "GID[%d] %s subnet %s\n", + i, j, k, + psm3_nid_fmt(hfi_nids[nnids], 0), + idx, psm3_gid128_fmt(gid, 1), + psm3_subnet128_fmt(subnet, 2)); + nnids++; + } + } + } + if (nnids == 0) { + err = psm3_handle_error(NULL, PSM2_EP_DEVICE_FAILURE, + "Couldn't find any unfiltered units"); + goto fail; + } + } + *nids = hfi_nids; + *num_nids_o = nnids; + +fail: + return err; +} + // Indicate if the given epid is a local process. // In which case we can use intra-node shared memory comms with it. psm2_error_t @@ -714,6 +409,12 @@ psm2_error_t psm3_ep_open_opts_get_defaults(struct psm3_ep_open_opts *opts) psm2_error_t psm3_poll_noop(ptl_t *ptl, int replyonly, bool force); +// open a single internal EP for a single NIC +// For 1st internal EP opts may indicate PSM3_NIC_ANY in which case +// psm3_ep_open_device will let psm3_context_open pick the NIC based on +// PSM3_NIC_SELECTION_ALG. +// For multirail and when opening additional QPs for the NIC, opts will +// select a specific NIC. psm2_error_t psm3_ep_open_internal(psm2_uuid_t const unique_job_key, int *devid_enabled, struct psm3_ep_open_opts const *opts_i, psm2_mq_t mq, @@ -821,11 +522,13 @@ psm3_ep_open_internal(psm2_uuid_t const unique_job_key, int *devid_enabled, /* Get immediate data size - transfers less than immediate data size do * not consume a send buffer and require just a send descriptor. */ - if (!psm3_getenv("PSM3_SEND_IMMEDIATE_SIZE", - "Immediate data send size not requiring a buffer [128]", - PSMI_ENVVAR_LEVEL_HIDDEN, - PSMI_ENVVAR_TYPE_UINT, - (union psmi_envvar_val)128, &envvar_val)) { + if (!psm3_getenv_range("PSM3_SEND_IMMEDIATE_SIZE", + "Immediate data send size not requiring a buffer. 
Default 128.", + "Actual permitted upper limit is NIC dependent.", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val)128, + (union psmi_envvar_val)0, (union psmi_envvar_val)1024, + NULL, NULL, &envvar_val)) { opts.imm_size = envvar_val.e_uint; } @@ -1075,12 +778,10 @@ psm3_ep_open(psm2_uuid_t const unique_job_key, psm2_mq_t mq; psm2_epid_t epid; psm2_ep_t ep, tmp; - uint32_t units[PSMI_MAX_QPS]; - uint16_t ports[PSMI_MAX_QPS]; - int addr_indexes[PSMI_MAX_QPS]; - int i, num_rails = 0; + int i; int devid_enabled[PTL_MAX_INIT]; struct psm3_ep_open_opts opts = *opts_i; + struct multirail_config multirail_config = { 0 }; PSM2_LOG_MSG("entering"); PSMI_ERR_UNLESS_INITIALIZED(NULL); @@ -1127,15 +828,15 @@ psm3_ep_open(psm2_uuid_t const unique_job_key, goto fail; if (psm3_device_is_enabled(devid_enabled, PTL_DEVID_IPS)) { - err = psm3_ep_multirail(&num_rails, units, ports, addr_indexes); + err = psm3_ep_multirail(&multirail_config); if (err != PSM2_OK) goto fail; /* If multi-rail is used, set the first ep unit/port */ - if (num_rails > 0) { - opts.unit = units[0]; - opts.port = ports[0]; - opts.addr_index = addr_indexes[0]; + if (multirail_config.num_rails > 0) { + opts.unit = multirail_config.units[0]; + opts.port = multirail_config.ports[0]; + opts.addr_index = multirail_config.addr_indexes[0]; } } #if defined(PSM_CUDA) || defined(PSM_ONEAPI) @@ -1183,13 +884,13 @@ psm3_ep_open(psm2_uuid_t const unique_job_key, psmi_hal_context_initstats(ep); union psmi_envvar_val envvar_val; - if (num_rails <= 0) { + if (multirail_config.num_rails <= 0) { // the NIC has now been selected for our process // use the same NIC for any additional QPs below - num_rails = 1; - units[0] = ep->unit_id; - ports[0] = ep->portnum; - addr_indexes[0] = ep->addr_index; + multirail_config.num_rails = 1; + multirail_config.units[0] = ep->unit_id; + multirail_config.ports[0] = ep->portnum; + multirail_config.addr_indexes[0] = ep->addr_index; } // When QP_PER_NIC >1, creates more than 1 QP on each NIC and then // uses the multi-rail algorithms to spread the traffic across QPs @@ -1204,22 +905,28 @@ psm3_ep_open(psm2_uuid_t const unique_job_key, PSMI_ENVVAR_TYPE_UINT, (union psmi_envvar_val)1, &envvar_val); - if ((num_rails * envvar_val.e_uint) > PSMI_MAX_QPS) { + if ((multirail_config.num_rails * envvar_val.e_uint) > PSMI_MAX_QPS) { err = psm3_handle_error(NULL, PSM2_TOO_MANY_ENDPOINTS, "PSM3_QP_PER_NIC (%u) * num_rails (%d) > Max Support QPs (%u)", - envvar_val.e_uint, num_rails, PSMI_MAX_QPS); + envvar_val.e_uint, multirail_config.num_rails, PSMI_MAX_QPS); goto fail; } for (j= 0; j< envvar_val.e_uint; j++) { - for (i = 0; i < num_rails; i++) { - _HFI_VDBG("rail %d unit %u port %u addr_index %d\n", i, units[i], ports[i], addr_indexes[i]); + // loop will open additional internal EPs for all + // the additional QPs on 1st rail and for all the + // additional rails and all the QPs on those rails + for (i = 0; i < multirail_config.num_rails; i++) { + _HFI_VDBG("rail %d unit %u port %u addr_index %d\n", i, + multirail_config.units[i], + multirail_config.ports[i], + multirail_config.addr_indexes[i]); // did 0, 0 already above if (i == 0 && j== 0) continue; - opts.unit = units[i]; - opts.port = ports[i]; - opts.addr_index = addr_indexes[i]; + opts.unit = multirail_config.units[i]; + opts.port = multirail_config.ports[i]; + opts.addr_index = multirail_config.addr_indexes[i]; /* Create secondary EP */ err = psm3_ep_open_internal(unique_job_key, @@ -1542,6 +1249,15 @@ psm3_parse_devices(int devices[PTL_MAX_INIT]) 
int len; int i = 0; union psmi_envvar_val devs; + static int have_value = 0; + static int saved[PTL_MAX_INIT]; + + // only parse once so doesn't appear in PSM3_VERBOSE_ENV multiple times + if (have_value) { + for (i=0; i < PTL_MAX_INIT; i++) + devices[i] = saved[i]; + return PSM2_OK; + } /* See which ptl devices we want to use for this ep to be opened */ psm3_getenv("PSM3_DEVICES", @@ -1605,6 +1321,9 @@ psm3_parse_devices(int devices[PTL_MAX_INIT]) *(b_new - 1) = '\0'; _HFI_PRDBG("PSM Device allocation order: %s\n", devstr); + for (i=0; i < PTL_MAX_INIT; i++) + saved[i] = devices[i]; + have_value = 1; fail: if (devstr != NULL) psmi_free(devstr); diff --git a/prov/psm3/psm3/psm_ep.h b/prov/psm3/psm3/psm_ep.h index 609c75ea8b6..c1ec006eff9 100644 --- a/prov/psm3/psm3/psm_ep.h +++ b/prov/psm3/psm3/psm_ep.h @@ -123,6 +123,7 @@ struct psm2_ep { uint16_t network_pkey_index; /**> Pkey index */ int did_syslog; const char *dev_name; /* just for logging */ + const char *addl_nic_info; /* just for logging */ psm2_uuid_t uuid; uint16_t jkey; uint64_t service_id; /* OPA service ID */ @@ -271,8 +272,6 @@ struct psm2_epaddr { PSMI_PROFILE_UNBLOCK(); \ } while (0) -psm2_error_t psm3_parse_devices(int devices[PTL_MAX_INIT]); -int psm3_device_is_enabled(const int devices[PTL_MAX_INIT], int devid); int psm3_ep_device_is_enabled(const psm2_ep_t ep, int devid); #ifdef PSM_HAVE_RNDV_MOD diff --git a/prov/psm3/psm3/psm_mpool.c b/prov/psm3/psm3/psm_mpool.c index d6b6445a154..6bf33b7d74a 100644 --- a/prov/psm3/psm3/psm_mpool.c +++ b/prov/psm3/psm3/psm_mpool.c @@ -470,8 +470,10 @@ void MOCKABLE(psm3_mpool_get_obj_info)(mpool_t mp, uint32_t *num_obj_per_chunk, uint32_t *num_obj_max_total) { - *num_obj_per_chunk = mp->mp_num_obj_per_chunk; - *num_obj_max_total = mp->mp_num_obj_max_total; + if (num_obj_per_chunk) + *num_obj_per_chunk = mp->mp_num_obj_per_chunk; + if (num_obj_max_total) + *num_obj_max_total = mp->mp_num_obj_max_total; return; } MOCK_DEF_EPILOGUE(psm3_mpool_get_obj_info); diff --git a/prov/psm3/psm3/psm_mq.c b/prov/psm3/psm3/psm_mq.c index ca6cd100b7c..5203715fff8 100644 --- a/prov/psm3/psm3/psm_mq.c +++ b/prov/psm3/psm3/psm_mq.c @@ -1445,6 +1445,18 @@ psm2_error_t psm3_mqopt_ctl(psm2_mq_t mq, uint32_t key, void *value, int get) _HFI_VDBG("RNDV_SHM_SZ = %d (%s)\n", mq->shm_thresh_rv, get ? "GET" : "SET"); break; +#if defined(PSM_CUDA) || defined(PSM_ONEAPI) + case PSM2_MQ_GPU_RNDV_SHM_SZ: + if (get) + *((uint32_t *) value) = mq->shm_gpu_thresh_rv; + else { + val32 = *((uint32_t *) value); + mq->shm_gpu_thresh_rv = val32; + } + _HFI_VDBG("RNDV_GPU_SHM_SZ = %d (%s)\n", + mq->shm_gpu_thresh_rv, get ? "GET" : "SET"); + break; +#endif case PSM2_MQ_MAX_SYSBUF_MBYTES: /* Deprecated: this option no longer does anything. */ break; @@ -1597,6 +1609,169 @@ psm3_mq_print_stats_finalize(psm2_mq_t mq) } } +/* parse a list of window_rv:limit values for + * PSM3_RNDV_NIC_WINDOW and PSM3_GPU_RNDV_NIC_WINDOW + * format is window:limit,window:limit,window + * limit value must be increasing, limit for last entry is optional and + * will be UINT32_MAX even if a value is specified. 
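+ * e.g. (illustrative values only): "131072:262144,4194304" selects a
+ * 131072 byte window for messages up to 262144 bytes and a 4194304 byte
+ * window for all larger messages.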
+ * 0 - successfully parsed, *list points to malloced list + * -1 - str empty, *list unchanged + * -2 - syntax error, *list unchanged + */ +static int psm3_mq_parse_window_rv(const char *str, + size_t errstr_size, char errstr[], + struct psm3_mq_window_rv_entry **list) +{ +#define MAX_WINDOW_STR_LEN 1024 + char temp[MAX_WINDOW_STR_LEN+1]; + char *s; + char *delim; + struct psm3_mq_window_rv_entry *ret = NULL; + int i; + unsigned int win, limit; + int skip_limit; + + if (!str || ! *str) + return -1; + + strncpy(temp, str, MAX_WINDOW_STR_LEN); + if (temp[MAX_WINDOW_STR_LEN-1] != 0) { + // string too long + if (errstr_size) + snprintf(errstr, errstr_size, + " Value too long, limit %u characters", + MAX_WINDOW_STR_LEN-1); + return -2; + } + + s = temp; + i = 0; + do { + if (! *s) // trailing ',' on 2nd or later loop + break; + // find end of window field and put in \0 as needed + delim = strpbrk(s, ":,"); + skip_limit = (!delim || *delim == ','); + if (delim) + *delim = '\0'; + // parse window + if (psm3_parse_str_uint(s, &win, 1, PSM_MQ_NIC_MAX_RNDV_WINDOW)) { + if (errstr_size) + snprintf(errstr, errstr_size, " Invalid window_rv: %s", s); + goto fail; + } + // find next field + if (delim) + s = delim+1; + if (skip_limit) { + limit = UINT32_MAX; + } else { + delim = strpbrk(s, ","); + if (delim) + *delim = '\0'; + //parse limit + if (!strcasecmp(s, "max") || !strcasecmp(s, "maximum")) { + limit = UINT32_MAX; + } else { + if (psm3_parse_str_uint(s, &limit, 1, UINT32_MAX)) { + if (errstr_size) + snprintf(errstr, errstr_size, " Invalid limit: %s", s); + goto fail; + } + } + // find next field + if (delim) + s = delim+1; + } + if (i && ret[i-1].limit >= limit) { + if (errstr_size) + snprintf(errstr, errstr_size, " Limit not increasing: %u", limit); + goto fail; + } + + ret = (struct psm3_mq_window_rv_entry*)psmi_realloc(PSMI_EP_NONE, + UNDEFINED, ret, sizeof(struct psm3_mq_window_rv_entry)*(i+1)); + if (! ret) // keep scans happy + return -2; + ret[i].window_rv = ROUNDUP(win, PSMI_PAGESIZE); + ret[i].limit = limit; + i++; + } while (delim); + if (! i) + return -1; + // force last entry limit to UINT32_MAX so used for all remaining lengths + ret[i-1].limit = UINT32_MAX; + if (list) + *list = ret; + else + psmi_free(ret); + return 0; + +fail: + psmi_free(ret); + return -2; +} + +static int psm3_mq_parse_check_window_rv(int type, + const union psmi_envvar_val val, + void * ptr, + size_t errstr_size, char errstr[]) +{ + psmi_assert(type == PSMI_ENVVAR_TYPE_STR); + return psm3_mq_parse_window_rv(val.e_str, errstr_size, errstr, NULL); +} + +PSMI_ALWAYS_INLINE(uint32_t search_window(struct psm3_mq_window_rv_entry *e, + uint32_t len)) +{ + for (; len > e->limit; e++) + ; + return e->window_rv; +} + +// for CPU build, gpu argument ignored, but avoids needing ifdef in callers +uint32_t psm3_mq_max_window_rv(psm2_mq_t mq, int gpu) +{ + // must do search since window_rv may not be increasing (but usually is) + uint32_t ret = 0; + struct psm3_mq_window_rv_entry *e; +#if defined(PSM_CUDA) || defined(PSM_ONEAPI) + if (gpu) + e = mq->ips_gpu_window_rv; + else +#endif + e = mq->ips_cpu_window_rv; + do { + ret = max(ret, e->window_rv); + } while ((e++)->limit < UINT32_MAX); + return ret; +} + +uint32_t psm3_mq_get_window_rv(psm2_mq_req_t req) +{ + if (! 
req->window_rv) { +#if defined(PSM_CUDA) || defined(PSM_ONEAPI) + if (req->is_buf_gpu_mem) { + req->window_rv = search_window( + req->mq->ips_gpu_window_rv, + req->req_data.send_msglen); + } else +#endif /* PSM_CUDA || PSM_ONEAPI */ + req->window_rv = search_window(req->mq->ips_cpu_window_rv, + req->req_data.send_msglen); +#if defined(PSM_CUDA) || defined(PSM_ONEAPI) + _HFI_VDBG("Selected Window of %u for %u byte %s msg\n", + req->window_rv, + req->req_data.send_msglen, + req->is_buf_gpu_mem?"GPU":"CPU"); +#else + _HFI_VDBG("Selected Window of %u for %u byte msg\n", + req->window_rv, req->req_data.send_msglen); +#endif + } + return req->window_rv; +} + /* * This is the API for the user. We actually allocate the MQ much earlier, but * the user can set options after obtaining an endpoint @@ -2402,6 +2577,9 @@ psm2_error_t psm3_mq_malloc(psm2_mq_t *mqo) // shm_thresh_rv is N/A to NIC and HAL, so we set this here and let // HAL set the rest of the defaults mq->shm_thresh_rv = MQ_SHM_THRESH_RNDV; +#if defined(PSM_CUDA) || defined(PSM_ONEAPI) + mq->shm_gpu_thresh_rv = MQ_SHM_GPU_THRESH_RNDV; +#endif psmi_hal_mq_init_defaults(mq); @@ -2426,6 +2604,9 @@ psm2_error_t psm3_mq_initialize_params(psm2_mq_t mq) { union psmi_envvar_val env_hfitiny, env_rvwin, env_hfirv, env_shmrv, env_hash, env_stats; +#if defined(PSM_CUDA) || defined(PSM_ONEAPI) + union psmi_envvar_val env_shmgpurv; +#endif // a limit of PSM_MQ_MAX_TINY btyes is hardcoded into the PSM protocol psm3_getenv("PSM3_MQ_TINY_NIC_LIMIT", @@ -2440,11 +2621,66 @@ psm2_error_t psm3_mq_initialize_params(psm2_mq_t mq) (union psmi_envvar_val)mq->hfi_thresh_rv, &env_hfirv); mq->hfi_thresh_rv = env_hfirv.e_uint; - psm3_getenv("PSM3_MQ_RNDV_NIC_WINDOW", - "NIC rendezvous window size, max 4M", - PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, - (union psmi_envvar_val)mq->hfi_base_window_rv, &env_rvwin); - mq->hfi_base_window_rv = min(PSM_MQ_NIC_MAX_RNDV_WINDOW, env_rvwin.e_uint); +#define WINDOW_SYNTAX "Specified as window_size:limit,window_size:limit, ...\nwhere limit is the largest message size the window_size is applicable to.\nThe last window_size in the list will be used for all remaining message\nsizes (eg. its limit is optional and ignored).\nwindow_size must be <= 4194304 and the limit in each entry must be larger\nthan the prior entry." + + // for loopback, no ips so no window_rv + if (mq->ips_cpu_window_rv_str) { + int got_depwin = 0; // using deprecated PSM3_MQ_RNDV_NIC_WINDOW + + // PSM3_RNDV_NIC_WINDOW overrides deprecated PSM3_MQ_RNDV_NIC_WINDOW. + // only parse PSM3_MQ_RNDV_NIC_WINDOW if used default for + // PSM3_RNDV_NIC_WINDOW because it was not specified. 
+		if (psm3_getenv_range("PSM3_RNDV_NIC_WINDOW",
+			"List of NIC rendezvous window sizes for messages to and from a CPU buffer.",
+			WINDOW_SYNTAX,
+			PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR,
+			(union psmi_envvar_val)(char*)(mq->ips_cpu_window_rv_str),
+			(union psmi_envvar_val)NULL, (union psmi_envvar_val)NULL,
+			psm3_mq_parse_check_window_rv, NULL, &env_rvwin) > 0) {
+			// new syntax is superset of old
+			got_depwin = (0 == psm3_getenv_range("PSM3_MQ_RNDV_NIC_WINDOW",
+				"[Deprecated, use PSM3_RNDV_NIC_WINDOW and PSM3_GPU_RNDV_NIC_WINDOW]",
+				"NIC rendezvous window size, max 4194304",
+				PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_STR,
+				(union psmi_envvar_val)(char*)(mq->ips_cpu_window_rv_str),
+				(union psmi_envvar_val)NULL, (union psmi_envvar_val)NULL,
+				psm3_mq_parse_check_window_rv, NULL, &env_rvwin));
+		}
+		if (psm3_mq_parse_window_rv(env_rvwin.e_str, 0, NULL,
+				&mq->ips_cpu_window_rv) < 0) {
+			// already checked, shouldn't get parse errors nor empty strings
+			psmi_assert(0);
+		}
+#if defined(PSM_CUDA) || defined(PSM_ONEAPI)
+		if (PSMI_IS_GPU_ENABLED && mq->ips_gpu_window_rv_str) {
+			union psmi_envvar_val env_gpurvwin;
+			char *env;
+
+			env = psm3_env_get("PSM3_GPU_RNDV_NIC_WINDOW");
+			if (env && *env)
+				got_depwin = 0;	// use new default as default
+			// PSM3_GPU_RNDV_NIC_WINDOW overrides deprecated
+			// PSM3_MQ_RNDV_NIC_WINDOW.
+			// If PSM3_GPU_RNDV_NIC_WINDOW not specified and user specified
+			// PSM3_MQ_RNDV_NIC_WINDOW, use it for GPU too.
+			(void)psm3_getenv_range("PSM3_GPU_RNDV_NIC_WINDOW",
+				"List of NIC rendezvous window sizes for messages to or from a GPU buffer.",
+				WINDOW_SYNTAX,
+				PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR,
+				got_depwin?env_rvwin:
+					(union psmi_envvar_val)(char*)(mq->ips_gpu_window_rv_str),
+				(union psmi_envvar_val)NULL, (union psmi_envvar_val)NULL,
+				psm3_mq_parse_check_window_rv, NULL, &env_gpurvwin);
+			if (psm3_mq_parse_window_rv(env_gpurvwin.e_str, 0, NULL,
+					&mq->ips_gpu_window_rv)< 0) {
+				// already checked, shouldn't get parse errors nor empty strings
+				psmi_assert(0);
+			}
+		}
+#else
+		(void)got_depwin;	// keep compiler happy
+#endif /* PSM_CUDA || PSM_ONEAPI */
+	}
 
 	/* Re-evaluate this since it may have changed after initializing the shm
 	 * device */
@@ -2455,6 +2691,17 @@ psm2_error_t psm3_mq_initialize_params(psm2_mq_t mq)
 		(union psmi_envvar_val)mq->shm_thresh_rv, &env_shmrv);
 	mq->shm_thresh_rv = env_shmrv.e_uint;
 
+#if defined(PSM_CUDA) || defined(PSM_ONEAPI)
+	if (PSMI_IS_GPU_ENABLED) {
+		mq->shm_gpu_thresh_rv = psm3_shm_mq_gpu_rv_thresh;
+		psm3_getenv("PSM3_MQ_RNDV_SHM_GPU_THRESH",
+			"shm eager-to-rendezvous switchover for GPU send",
+			PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT,
+			(union psmi_envvar_val)mq->shm_gpu_thresh_rv, &env_shmgpurv);
+		mq->shm_gpu_thresh_rv = env_shmgpurv.e_uint;
+	}
+#endif
+
 	psm3_getenv("PSM3_MQ_HASH_THRESH",
 		"linear list to hash tag matching switchover",
 		PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT,
@@ -2486,6 +2733,10 @@ psm2_error_t MOCKABLE(psm3_mq_free)(psm2_mq_t mq)
 	psm3_mq_req_fini(mq);
 	psm3_mq_sysbuf_fini(mq);
 	psm3_stats_deregister_type(PSMI_STATSTYPE_MQ, mq);
+#if defined(PSM_CUDA) || defined(PSM_ONEAPI)
+	psmi_free(mq->ips_gpu_window_rv);
+#endif
+	psmi_free(mq->ips_cpu_window_rv);
 	psmi_free(mq);
 	return PSM2_OK;
 }
diff --git a/prov/psm3/psm3/psm_mq_internal.h b/prov/psm3/psm3/psm_mq_internal.h
index f83e50bbffd..6c7127b0245 100644
--- a/prov/psm3/psm3/psm_mq_internal.h
+++ b/prov/psm3/psm3/psm_mq_internal.h
@@ -85,6 +85,11 @@ struct psm2_mq_perf_data
 	int perf_print_stats;
 };
 
+struct psm3_mq_window_rv_entry 
{ + uint32_t window_rv; + uint32_t limit; +}; + #ifdef LEARN_HASH_SELECTOR // When transition back to nohash mode, should the prior // learned table_sel be retained for use next time transition to hash mode. @@ -175,9 +180,15 @@ struct psm2_mq { uint32_t hfi_thresh_tiny; uint32_t hfi_thresh_rv; uint32_t shm_thresh_rv; - uint32_t hfi_base_window_rv; /**> this is a base rndv window size, - will be further trimmed down per-connection based - on the peer's MTU */ +#if defined(PSM_CUDA) || defined(PSM_ONEAPI) + uint32_t shm_gpu_thresh_rv; +#endif + const char *ips_cpu_window_rv_str; // default input to parser + struct psm3_mq_window_rv_entry *ips_cpu_window_rv; +#if defined(PSM_CUDA) || defined(PSM_ONEAPI) + const char *ips_gpu_window_rv_str; // default input to parser + struct psm3_mq_window_rv_entry *ips_gpu_window_rv; +#endif uint32_t hash_thresh; int memmode; @@ -313,6 +324,7 @@ struct psm2_mq_req { mq_rts_callback_fn_t rts_callback; psm2_epaddr_t rts_peer; uintptr_t rts_sbuf; + uint32_t window_rv; // window size chosen by receiver or GPU send prefetcher #ifdef PSM_HAVE_REG_MR psm3_verbs_mr_t mr; // local registered memory for app buffer @@ -752,6 +764,9 @@ psm2_mq_req_t psm3_mq_req_match(psm2_mq_t mq, psm2_epaddr_t src, psm2_mq_tag_t * psm2_error_t psm3_mq_malloc(psm2_mq_t *mqo); psm2_error_t psm3_mq_initialize_params(psm2_mq_t mq); psm2_error_t psm3_mq_initstats(psm2_mq_t mq, psm2_epid_t epid); +extern uint32_t psm3_mq_max_window_rv(psm2_mq_t mq, int gpu); +uint32_t psm3_mq_get_window_rv(psm2_mq_req_t req); + psm2_error_t MOCKABLE(psm3_mq_free)(psm2_mq_t mq); MOCK_DCL_EPILOGUE(psm3_mq_free); diff --git a/prov/psm3/psm3/psm_mq_recv.c b/prov/psm3/psm3/psm_mq_recv.c index bc90d07c5cf..7b481351843 100644 --- a/prov/psm3/psm3/psm_mq_recv.c +++ b/prov/psm3/psm3/psm_mq_recv.c @@ -199,11 +199,13 @@ psm3_mq_req_copy(psm2_mq_req_t req, } if (msgptr != buf) { #if defined(PSM_CUDA) || defined(PSM_ONEAPI) + // for loopback HAL, invalid to call psm3_mq_get_window_rv() + // however, for loopback HAL, gdr copy is disabled if (use_gdrcopy) psm3_mq_req_gpu_copy((uint64_t)req->req_data.buf, req->req_data.recv_msglen, (uint64_t)msgptr, msglen_this, - req->mq->hfi_base_window_rv, buf, + psm3_mq_get_window_rv(req), buf, ep); else #endif diff --git a/prov/psm3/psm3/psm_mq_utils.c b/prov/psm3/psm3/psm_mq_utils.c index af2988f64f1..7e80739373a 100644 --- a/prov/psm3/psm3/psm_mq_utils.c +++ b/prov/psm3/psm3/psm_mq_utils.c @@ -82,9 +82,9 @@ psm2_mq_req_t MOCKABLE(psm3_mq_req_alloc)(psm2_mq_t mq, uint32_t type) return req; } else { /* we're out of reqs */ int issend = (type == MQE_TYPE_SEND); - uint32_t reqmax, reqchunk; + uint32_t reqmax; psm3_mpool_get_obj_info(issend ? mq->sreq_pool : mq->rreq_pool, - &reqchunk, &reqmax); + NULL, &reqmax); psm3_handle_error(PSMI_EP_NORETURN, PSM2_PARAM_ERR, "Exhausted %d MQ %s request descriptors, which usually indicates " diff --git a/prov/psm3/psm3/psm_nic_select.c b/prov/psm3/psm3/psm_nic_select.c new file mode 100644 index 00000000000..1a451f5eb67 --- /dev/null +++ b/prov/psm3/psm3/psm_nic_select.c @@ -0,0 +1,2098 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2024 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. 
+ + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2024 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2024 Intel Corporation. All rights reserved. */ + +#include +#include +#include "psm_user.h" +#include "psm2_hal.h" +#ifdef PSM_USE_HWLOC +#include +#include +#endif + +#define MAX_MAP_LEN (PSMI_MAX_RAILS*128) + +// sanity check, psm_user.h should ensure this, unless user tried to +// manually set PSM_HAVE_GPU_CENTRIC_AFFINITY +#ifdef PSM_HAVE_GPU_CENTRIC_AFFINITY +#ifndef PSM_USE_HWLOC +#error "PSM_HAVE_GPU_CENTRIC_AFFINITY set without PSM_USE_HWLOC" +#endif +#endif + +// subnuma is risky right now, so disable and explore in future +//#ifdef PSM_USE_HWLOC +//#define PSM3_HAVE_CPU_SUBNUMA +//#endif +#undef PSM3_HAVE_CPU_SUBNUMA + +#if defined(PSM_HAVE_GPU_CENTRIC_AFFINITY) || defined(PSM3_HAVE_CPU_SUBNUMA) +struct pci_addr { + uint32_t domain; + uint32_t bus; + uint32_t dev; + uint32_t func; +}; +#endif + +// table of refcount per unit_id counting references by endpoints within +// local process +// protected by psm3_creation_lock (held in psm_ep.c during EP open and close) +static uint64_t psm3_nic_refcount[PSMI_MAX_RAILS]; + +// psm3_shared_affinity_nic_refcount_ptr is the pointer to table of refcount +// per unit_id countting references by all processes within node. 
+// protected by psm3_sem_affinity_shm_rw semaphore + +static int psmi_parse_nic_selection_algorithm(void); + +#if defined(PSM_HAVE_GPU_CENTRIC_AFFINITY) || defined(PSM3_HAVE_CPU_SUBNUMA) +static hwloc_topology_t psm3_hwloc_topology; +static int psm3_hwloc_topology_initialized; +static int psm3_hwloc_topology_init_failed; +static void psm3_deferred_hwloc_topology_init(); +#endif + +#ifdef PSM_HAVE_GPU_CENTRIC_AFFINITY +static int psm3_get_distance_between_pcis(const struct pci_addr *pci_addr_1, + const struct pci_addr *pci_addr_2); +#endif +#ifdef PSM3_HAVE_CPU_SUBNUMA +static hwloc_obj_t psm3_get_non_io_ancestor_obj( + const struct pci_addr *pci_addr); +#endif + +// As we consider and select NICs, we fill in additional information +// or set filtered to exclude the NIC from further consideration. +// The use of filtered avoids the cost of repeatedly compressing the list. +struct nic_info { + uint8_t filtered; // has NIC been filtered out from possible selection + psmi_subnet128_t subnet; + unsigned unit; + unsigned port; + unsigned addr_index; + int numa_id; // CPU NUMA location of NIC +#if defined(PSM_HAVE_GPU_CENTRIC_AFFINITY) || defined(PSM3_HAVE_CPU_SUBNUMA) + struct pci_addr pci_addr; +#endif +#ifdef PSM3_HAVE_CPU_SUBNUMA + int cpu_close; // is CPU sub-numa close to NIC +#endif +#ifdef PSM_HAVE_GPU_CENTRIC_AFFINITY + int gpu_distance; +#endif +}; + + +/* returns the 8-bit hash value of an uuid. */ +static inline +uint8_t +psm3_get_uuid_hash(psm2_uuid_t const uuid) +{ + int i; + uint8_t hashed_uuid = 0; + + for (i=0; i < sizeof(psm2_uuid_t); ++i) + hashed_uuid ^= *((uint8_t const *)uuid + i); + + return hashed_uuid; +} + +int psm3_get_current_proc_location() +{ + int core_id, node_id; + + core_id = sched_getcpu(); + if (core_id < 0) + return -EINVAL; + + node_id = numa_node_of_cpu(core_id); + if (node_id < 0) + return -EINVAL; + + return node_id; +} + +// print a bitmask in condensed form at _HFI_VBG level +// condensed form consolidates sequential numbers such as: "0-43,88-131" +static void vdbg_print_bitmask(const char* prefix, struct bitmask *bmp) +{ + if (_HFI_VDBG_ON) { + int i, len; + char buf[1024]; + int last=-1; + int first=-1; + int max = numa_num_possible_nodes(); + + snprintf(buf, sizeof(buf), "%s", prefix); + len = strlen(buf); + for (i=0; i 1) { + if (first == last) { + // first in a possible sequence + snprintf(&buf[len], sizeof(buf)-len, ",%d", i); + } else { + // complete prior sequence, first in a new sequence + snprintf(&buf[len], sizeof(buf)-len, "-%d,%d", last, i); + } + first = i; + last = first; + } else { + last = i; + } + len = strlen(buf); + } + // complete prior sequence as needed + if (first>=0 && first != last) + snprintf(&buf[len], sizeof(buf)-len, "-%d", last); + _HFI_VDBG("%s\n", buf); + } +} + +// return the largest possible numa ID of a CPU in this system +int psm3_get_max_cpu_numa() +{ + static int max_cpu_numa = -1; + struct bitmask *cpumask, *empty_cpumask; + int i; + + if (max_cpu_numa >= 0) + return max_cpu_numa; + + // we don't depend on numa_num_configured_nodes since in theory there + // could be non-CPU memory NUMA nodes. 
We only need to know the + // largest possible value for a CPU numa node ID + + // numa_max_node - largest NUMA node which is not disabled + // numa_node_to_cpus - given a NUMA node, create list of CPUs + // numa_node_of_cpu - cpu ID to NUMA (or error if invalid CPU) + // numa_node_to_cpus - cpumask of CPUs on given NUMA node + + max_cpu_numa = -1; + empty_cpumask = numa_allocate_cpumask(); + numa_bitmask_clearall(empty_cpumask); + //vdbg_print_bitmask("empty_cpumask: ", empty_cpumask); + + cpumask = numa_allocate_cpumask(); + _HFI_VDBG("numa_max_node=%d\n", numa_max_node()); + for (i=numa_max_node(); i >= 0; i--) { + numa_bitmask_clearall(cpumask); + int ret = numa_node_to_cpus(i, cpumask); + _HFI_VDBG("i=%d node_to_cpus ret=%d\n", i, ret); + vdbg_print_bitmask("cpumask: ", cpumask); + if (ret >= 0 && ! numa_bitmask_equal(cpumask, empty_cpumask)) { + max_cpu_numa = i; + break; + } + } + numa_free_cpumask(cpumask); + numa_free_cpumask(empty_cpumask); + psmi_assert_always(max_cpu_numa >= 0); + return max_cpu_numa; +} + +/* search the list of all units for those which are active + * and optionally match the given NUMA node_id (when node_id >= 0) + * returns the number of active units found. + * Note get_unit_active tests for active ports, valid addresses and + * performs filtering as done in get_port_subnets + */ +static int +hfi_find_active_hfis(int nunits, int node_id, int *saved_hfis) +{ + int found = 0, unit_id; + + for (unit_id = 0; unit_id < nunits; unit_id++) { + int node_id_i; + + if (psmi_hal_get_unit_active(unit_id) <= 0) + continue; + + if (node_id < 0) { + saved_hfis[found++] = unit_id; + _HFI_DBG("RoundRobinAll Found NIC unit= %d, local rank=%d.\n", + unit_id, psm3_get_mylocalrank()); + } else if (!psmi_hal_get_node_id(unit_id, &node_id_i) + && node_id_i == node_id) { + saved_hfis[found++] = unit_id; + _HFI_DBG("RoundRobin Found NIC unit= %d, node = %d, local rank=%d.\n", + unit_id, node_id, psm3_get_mylocalrank()); + } + } + return found; +} + +// select NIC across all NICs, use a hash of job_id and local rank to +// distribute local ranks across NICs and to attempt to distribute +// jobs across NICs. +// TBD - if know never have >1 job per node, could ignore job_id, perhaps +// have an env to exclude job_id from hash so NIC selection is deterministic +static void +psmi_spread_nic_selection(psm2_uuid_t const job_key, long *unit_start, + long *unit_end, int nunits) +{ + { + int found, saved_hfis[nunits]; + + /* else, we are going to look at: + (a hash of the job key plus the local rank id) mod nunits. 
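+		   e.g. (illustrative numbers): with 4 active NICs found, local rank 2
+		   and a job key hash of 7, the start unit is saved_hfis[(2+1+7) % 4],
+		   i.e. saved_hfis[2].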
*/ + found = hfi_find_active_hfis(nunits, -1, saved_hfis); + if (found) + *unit_start = saved_hfis[((psm3_get_mylocalrank()+1) + + psm3_get_uuid_hash(job_key)) % found]; + else + // none found, caller will fail, start is a don't care + *unit_start = 0; + /* just in case, caller will check all other units, with wrap */ + if (*unit_start > 0) + *unit_end = *unit_start - 1; + else + *unit_end = nunits-1; + } + _HFI_DBG("RoundRobinAll Will select 1st viable NIC unit= %ld to %ld.\n", + *unit_start, *unit_end); +} + +static int +psm3_create_and_open_affinity_shm(psm2_uuid_t const job_key) +{ + int shm_fd, ret; + int first_to_create = 0; + size_t shm_name_len = 256; + + psmi_assert_always(psm3_affinity_semaphore_open); + if (psm3_affinity_shared_file_opened) { + /* opened and have our reference counted in shm */ + psmi_assert_always(psm3_affinity_shm_name != NULL); + psmi_assert_always(psm3_shared_affinity_ptr != NULL); + return 0; + } + + psm3_shared_affinity_ptr = NULL; + psm3_affinity_shm_name = (char *) psmi_malloc(PSMI_EP_NONE, UNDEFINED, shm_name_len); + + psmi_assert_always(psm3_affinity_shm_name != NULL); + snprintf(psm3_affinity_shm_name, shm_name_len, + AFFINITY_SHM_BASENAME".%d", + psm3_get_uuid_hash(job_key)); + shm_fd = shm_open(psm3_affinity_shm_name, O_RDWR | O_CREAT | O_EXCL, + S_IRUSR | S_IWUSR); + if ((shm_fd < 0) && (errno == EEXIST)) { + shm_fd = shm_open(psm3_affinity_shm_name, O_RDWR, S_IRUSR | S_IWUSR); + if (shm_fd < 0) { + _HFI_VDBG("Cannot open affinity shared mem fd:%s, errno=%d\n", + psm3_affinity_shm_name, errno); + goto free_name; + } + } else if (shm_fd >= 0) { + first_to_create = 1; + } else { + _HFI_VDBG("Cannot create affinity shared mem fd:%s, errno=%d\n", + psm3_affinity_shm_name, errno); + goto free_name; + } + + ret = ftruncate(shm_fd, PSMI_PAGESIZE); + if ( ret < 0 ) { + _HFI_VDBG("Cannot truncate affinity shared mem fd:%s, errno=%d\n", + psm3_affinity_shm_name, errno); + goto close_shm; + } + + psm3_shared_affinity_ptr = (uint64_t *) mmap(NULL, PSMI_PAGESIZE, PROT_READ | PROT_WRITE, + MAP_SHARED, shm_fd, 0); + if (psm3_shared_affinity_ptr == MAP_FAILED) { + _HFI_VDBG("Cannot mmap affinity shared memory: %s, errno=%d\n", + psm3_affinity_shm_name, errno); + goto close_shm; + } + close(shm_fd); + shm_fd = -1; + + if (first_to_create) { + _HFI_VDBG("Initializing shm to store NIC affinity per socket: %s\n", psm3_affinity_shm_name); + + memset(psm3_shared_affinity_ptr, 0, PSMI_PAGESIZE); + + /* + * Once shm object is initialized, unlock others to be able to + * use it. + */ + psmi_sem_post(psm3_sem_affinity_shm_rw, psm3_sem_affinity_shm_rw_name); + } else { + _HFI_VDBG("Opened shm object to read/write NIC affinity per socket: %s\n", psm3_affinity_shm_name); + } + + /* + * Start critical section to increment reference count when creating + * or opening shm object. Decrement of ref count will be done before + * closing the shm. 
+ */ + if (psmi_sem_timedwait(psm3_sem_affinity_shm_rw, psm3_sem_affinity_shm_rw_name)) { + _HFI_VDBG("Could not enter critical section to update shm refcount\n"); + goto unmap_shm; + } + + psm3_shared_affinity_ptr[AFFINITY_SHM_REF_COUNT_LOCATION] += 1; + _HFI_VDBG("shm refcount = %"PRId64"\n", psm3_shared_affinity_ptr[AFFINITY_SHM_REF_COUNT_LOCATION]); + + /* End critical section */ + psmi_sem_post(psm3_sem_affinity_shm_rw, psm3_sem_affinity_shm_rw_name); + + psm3_affinity_shared_file_opened = 1; + + return 0; + +unmap_shm: + munmap(psm3_shared_affinity_ptr, PSMI_PAGESIZE); + psm3_shared_affinity_ptr = NULL; +close_shm: + if (shm_fd >= 0) close(shm_fd); +free_name: + psmi_free(psm3_affinity_shm_name); + psm3_affinity_shm_name = NULL; + return -1; +} + +/* + * Spread HFI selection between units if we find more than one within a socket. + */ +static void +psmi_spread_hfi_within_socket(long *unit_start, long *unit_end, int node_id, + int *saved_hfis, int found, psm2_uuid_t const job_key) +{ + int ret, shm_location; + + /* + * Take affinity lock and open shared memory region to be able to + * accurately determine which HFI to pick for this process. If any + * issues, bail by picking first known HFI. + */ + if (!psm3_affinity_semaphore_open) + goto spread_hfi_fallback; + + ret = psm3_create_and_open_affinity_shm(job_key); + if (ret < 0) + goto spread_hfi_fallback; + + // one shm entry per CPU NUMA domain + // The entry contains the next round robin NIC to use + // in the form of a index into saved_hfis + // saved_hfis has a list of all the NUMA local active NICs + shm_location = AFFINITY_SHM_HFI_INDEX_LOCATION + node_id; + if (shm_location > PSMI_PAGESIZE) + goto spread_hfi_fallback; + + /* Start critical section to read/write shm object */ + if (psmi_sem_timedwait(psm3_sem_affinity_shm_rw, psm3_sem_affinity_shm_rw_name)) { + _HFI_VDBG("Could not enter critical section to update NIC index\n"); + goto spread_hfi_fallback; + } + + *unit_start = *unit_end = saved_hfis[psm3_shared_affinity_ptr[shm_location]]; + psm3_shared_affinity_ptr[shm_location] = + (psm3_shared_affinity_ptr[shm_location] + 1) % found; + _HFI_DBG("RoundRobin Selected NIC unit= %ld, Next NIC=%ld, node = %d, local rank=%d, found=%d.\n", + *unit_start, psm3_shared_affinity_ptr[shm_location], node_id, + psm3_get_mylocalrank(), found); + + /* End Critical Section */ + psmi_sem_post(psm3_sem_affinity_shm_rw, psm3_sem_affinity_shm_rw_name); + + return; + +spread_hfi_fallback: + *unit_start = *unit_end = saved_hfis[0]; +} + +static void +psm3_create_affinity_semaphores(psm2_uuid_t const job_key) +{ + int ret; + size_t sem_len = 256; + + /* + * If already opened, no need to do anything else. + * This could be true for Multi-EP cases where a different thread has + * already created the semaphores. 
We don't need separate locks here as + * we are protected by the overall "psm3_creation_lock" which each + * thread will take in psm3_ep_open() + */ + if (psm3_affinity_semaphore_open) + return; + + psm3_sem_affinity_shm_rw_name = (char *) psmi_malloc(PSMI_EP_NONE, UNDEFINED, sem_len); + psmi_assert_always(psm3_sem_affinity_shm_rw_name != NULL); + snprintf(psm3_sem_affinity_shm_rw_name, sem_len, + SEM_AFFINITY_SHM_RW_BASENAME".%d", + psm3_get_uuid_hash(job_key)); + + ret = psmi_init_semaphore(&psm3_sem_affinity_shm_rw, psm3_sem_affinity_shm_rw_name, + S_IRUSR | S_IWUSR, 0); + if (ret) { + _HFI_VDBG("Cannot initialize semaphore: %s for read-write access to shm object.\n", + psm3_sem_affinity_shm_rw_name); + if (psm3_sem_affinity_shm_rw) + sem_close(psm3_sem_affinity_shm_rw); + psmi_free(psm3_sem_affinity_shm_rw_name); + psm3_sem_affinity_shm_rw_name = NULL; + return; + } + + _HFI_VDBG("Semaphore: %s created for read-write access to shm object.\n", + psm3_sem_affinity_shm_rw_name); + + psm3_affinity_semaphore_open = 1; + + return; +} + +/* + * Get all the ports and optionally addr_index'es with a valid lid and gid, + * one port per unit but up to PSM3_ADDR_PER_NIC addresses. + * + * Returns count of entries put in nic_info + * + * There will be exactly per_addr_index entries per active unit all for the + * same port within the unit + */ +unsigned nic_info_init(struct nic_info *nic_info, unsigned nunits, int per_addr_index) +{ + unsigned unit, port, addr_index; + unsigned num_addr_index = per_addr_index?psm3_addr_per_nic:1; + int ret; + unsigned count = 0; + + for (unit = 0; unit < nunits; unit++) { + // get_unit_active is redundant since it loops on all ports and + // confirms at least 1 port has a valid lid. We test that below. + //if (psmi_hal_get_unit_active(unit) <= 0) + // continue; + for (port = PSM3_NIC_MIN_PORT; port <= PSM3_NIC_MAX_PORT; port++) { + int got_port = 0; + for (addr_index = 0; addr_index < num_addr_index; addr_index++) { + psmi_subnet128_t subnet; + ret = psmi_hal_get_port_lid(unit, port, addr_index); + if (ret <= 0) + continue; + ret = psmi_hal_get_port_subnet(unit, port, addr_index, &subnet, NULL, NULL, NULL); + if (ret == -1) + continue; + + nic_info[count].filtered = 0; + nic_info[count].subnet = subnet; + nic_info[count].unit = unit; + nic_info[count].port = port; + nic_info[count].addr_index = addr_index; +#if defined(PSM_HAVE_GPU_CENTRIC_AFFINITY) || defined(PSM3_HAVE_CPU_SUBNUMA) + nic_info[count].pci_addr.domain = UINT32_MAX; +#endif + got_port = 1; + count++; + } + if (got_port) // one port per unit + break; + } + } + return count; +} + +/* If at least 1 NIC matches the current CPUs NUMA id, + * filter out all NICs which do not match. + * If none match, noop. 
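+ * For example (hypothetical): with NICs on NUMA nodes 0 and 1 and the process running on node 0, only the node 0 NICs remain unfiltered; if the process ran on node 2 instead, no NIC would match and nothing is filtered.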
+ * Also initializes nic_info.numa_id + */ +void nic_info_filter_numa(struct nic_info *nic_info, unsigned ninfo) +{ + unsigned i; + int found = 0; + + int cpu_numa_id = psm3_get_current_proc_location(); + if (cpu_numa_id < 0) { + _HFI_DBG("Unable to determine CPU NUMA location, skipping filter of NIC CPU NUMA location\n"); + return; + } + + for (i=0; i < ninfo; i++) + { + if (nic_info[i].filtered) + continue; + + if (psmi_hal_get_node_id(nic_info[i].unit, &nic_info[i].numa_id) != 0) { + // assume match (don't filter this NIC) + _HFI_DBG("Unable to determine NIC NUMA location for unit %d (%s), assuming local to CPU NUMA (%d)\n", + nic_info[i].unit, psm3_sysfs_unit_dev_name(nic_info[i].unit), + cpu_numa_id); + nic_info[i].numa_id = cpu_numa_id; + } else { + _HFI_DBG("NIC NUMA location for unit %d (%s) is %d\n", + nic_info[i].unit, psm3_sysfs_unit_dev_name(nic_info[i].unit), + nic_info[i].numa_id); + } + found += (nic_info[i].numa_id == cpu_numa_id); + } + if (found) { + _HFI_DBG("Found %d unfiltered NUMA local NICs for CPU NUMA id = %d\n", + found, cpu_numa_id); + // filter out NICs not in cpu_numa_id + for (i=0; i < ninfo; i++) + { + if (nic_info[i].filtered) + continue; + nic_info[i].filtered = (nic_info[i].numa_id != cpu_numa_id); + } + } else { + _HFI_DBG("No NUMA local NIC found, CPU NUMA id = %d\n", cpu_numa_id); + } +} + +/* If at least 1 NIC matches the current CPUs sub-NUMA group, + * filter out all NICs which do not match. + * If none match, noop. + * Also initializes nic_info.pci_addr and nic_info.cpu_close + */ +void nic_info_filter_sub_numa(struct nic_info *nic_info, unsigned ninfo) +{ +#ifdef PSM3_HAVE_CPU_SUBNUMA + unsigned i; + int found = 0; + hwloc_cpuset_t cpu_bind_set; + + psm3_deferred_hwloc_topology_init(); + if (psm3_hwloc_topology_init_failed) + return; // hwloc incorrect version + psmi_assert(psm3_hwloc_topology_initialized); + + // here we use entire CPU bind set, (should match pthread_getaffinity_np) + // as opposed to just the current process location. + cpu_bind_set = hwloc_bitmap_alloc(); + if (! cpu_bind_set) { + _HFI_DBG("Unable to allocate CPU set, skipping filter of CPU sub-NUMA location\n"); + return; + } +#if 0 + // use current process affinity + if (hwloc_get_cpubind(psm3_hwloc_topology, cpu_bind_set, + HWLOC_CPUBIND_PROCESS)) { + _HFI_DBG("Unable to determine process CPU binding, skipping filter of CPU sub-NUMA location\n"); + goto fail; + } +#else + // use current thread affinity + pthread_t mythread = pthread_self(); + if (hwloc_get_thread_cpubind(psm3_hwloc_topology, mythread, + cpu_bind_set, HWLOC_CPUBIND_THREAD)) { + _HFI_DBG("Unable to determine thread CPU binding, skipping filter of CPU sub-NUMA location\n"); + goto fail; + } +#endif + + for (i=0; i < ninfo; i++) + { + if (nic_info[i].filtered) + continue; + if (nic_info[i].pci_addr.domain == UINT32_MAX + && psmi_hal_get_unit_pci_bus(nic_info[i].unit, + &nic_info[i].pci_addr.domain, &nic_info[i].pci_addr.bus, + &nic_info[i].pci_addr.dev, &nic_info[i].pci_addr.func)) { + _HFI_DBG("Unable to get NIC PCIe address for unit %d (%s)\n", + nic_info[i].unit, psm3_sysfs_unit_dev_name(nic_info[i].unit)); + // can't filter out NIC because if all fail we won't have any. + // Unsure how to rank this NIC vs others, so assume not close + nic_info[i].cpu_close = 0; + continue; + } + + hwloc_obj_t ancestor = psm3_get_non_io_ancestor_obj( + &nic_info[i].pci_addr); + if (! 
ancestor) { + _HFI_DBG("Unable to determine NIC ancestor for unit %d (%s) at PCIe %04x:%02x:%02x.%x\n", + nic_info[i].unit, psm3_sysfs_unit_dev_name(nic_info[i].unit), + nic_info[i].pci_addr.domain, nic_info[i].pci_addr.bus, + nic_info[i].pci_addr.dev, nic_info[i].pci_addr.func); + // can't filter out NIC because if all fail we won't have any. + // Unsure how to rank this NIC vs others, so assume not close + nic_info[i].cpu_close = 0; + continue; + } + + // If any overlap of NIC and process CPU sets, consider it close + nic_info[i].cpu_close = + hwloc_bitmap_isincluded(cpu_bind_set, ancestor->cpuset) + || hwloc_bitmap_isincluded(ancestor->cpuset, cpu_bind_set); + + if (_HFI_DBG_ON) { + char buf[256] = {0};; + hwloc_bitmap_list_snprintf(buf, sizeof(buf), ancestor->cpuset); + buf[sizeof(buf)-1] = '\0'; // paranoid, hwloc doc not clear + _HFI_DBG_ALWAYS("NIC closeness to CPU for unit %d (%s) at %u:%u:%u:%u is %d, NIC close to CPUs: %s\n", + nic_info[i].unit, psm3_sysfs_unit_dev_name(nic_info[i].unit), + nic_info[i].pci_addr.domain, nic_info[i].pci_addr.bus, + nic_info[i].pci_addr.dev, nic_info[i].pci_addr.func, + nic_info[i].cpu_close, buf); + } + found += nic_info[i].cpu_close; + } + if (found) { + if (_HFI_DBG_ON) { + char buf[256] = {0};; + hwloc_bitmap_list_snprintf(buf, sizeof(buf), cpu_bind_set); + buf[sizeof(buf)-1] = '\0'; // paranoid, hwloc doc not clear + _HFI_DBG_ALWAYS("Found %d unfiltered NICs close to CPUs: %s\n", found, buf); + } + // filter out NICs not close + for (i=0; i < ninfo; i++) + { + if (nic_info[i].filtered) + continue; + nic_info[i].filtered = ! nic_info[i].cpu_close; + } + } else { + if (_HFI_DBG_ON) { + char buf[256] = {0};; + hwloc_bitmap_list_snprintf(buf, sizeof(buf), cpu_bind_set); + buf[sizeof(buf)-1] = '\0'; // paranoid, hwloc doc not clear + _HFI_DBG_ALWAYS("No NICs found close to CPUs: %s\n", buf); + } + } +fail: + hwloc_bitmap_free(cpu_bind_set); +#else + //_HFI_DBG("Filtering based on CPU closeness to NIC disabled\n"); +#endif +} + +#ifdef PSM_HAVE_GPU_CENTRIC_AFFINITY +/* Find the closest NIC to the current GPU and then fiter out all NICs + * which are further from the GPU than that closest NIC + * If no GPU for the process yet, or PSM3 GPU support not enabled, noop. + * Also initializes nic_info.pci_addr and nic_info.gpu_distance + */ +void nic_info_filter_gpu_distance(struct nic_info *nic_info, unsigned ninfo) +{ + unsigned i; + int min_distance = INT_MAX; // smallest distance found + unsigned found = 0; + struct pci_addr gpu_pci_addr; + + if (! PSMI_IS_GPU_ENABLED) + return; + + psm3_deferred_hwloc_topology_init(); + if (psm3_hwloc_topology_init_failed) + return; // hwloc incorrect version + psmi_assert(psm3_hwloc_topology_initialized); + + // Get current GPU PCIe address to gpu_pci_addr; +#ifdef PSM_CUDA + { + int domain, bus, dev; + int num_devices; + CUdevice device; + + PSMI_CUDA_CALL(cuDeviceGetCount, &num_devices); + _HFI_DBG("%d Cuda GPUs found\n", num_devices); + if (! num_devices) + return; + + if (num_devices == 1) { + PSMI_CUDA_CALL(cuDeviceGet, &device, 0); + } else { + // all GPUs will be visible to process, see if app chose one first + CUcontext ctxt = {0}; + if (! psmi_cuCtxGetCurrent || psmi_cuCtxGetCurrent(&ctxt) || ! 
ctxt) { + _HFI_DBG("Unable to get Cuda ctxt\n"); + //PSMI_CUDA_CALL(cuDeviceGet, &device, 0); + return; + } else { + PSMI_CUDA_CALL(cuCtxGetDevice, &device); + } + } + _HFI_DBG("Using Cuda GPU %d\n", device); + PSMI_CUDA_CALL(cuDeviceGetAttribute, + &domain, + CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID, + device); + PSMI_CUDA_CALL(cuDeviceGetAttribute, + &bus, + CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, + device); + PSMI_CUDA_CALL(cuDeviceGetAttribute, + &dev, + CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, + device); + gpu_pci_addr.domain = domain; + gpu_pci_addr.bus = bus; + gpu_pci_addr.dev = dev; + gpu_pci_addr.func = 0; + } +#elif defined(PSM_ONEAPI) + { + ze_pci_ext_properties_t PciProperties; + + _HFI_DBG("%d Level Zero GPUs found\n", num_ze_devices); + if (! num_ze_devices) + return; + + // caling middleware will have limited GPUs visible to process + PSMI_ONEAPI_ZE_CALL(zeDevicePciGetPropertiesExt, + ze_devices[0].dev, &PciProperties); + gpu_pci_addr.domain = PciProperties.address.domain; + gpu_pci_addr.bus = PciProperties.address.bus; + gpu_pci_addr.dev = PciProperties.address.device; + gpu_pci_addr.func = PciProperties.address.function; + } +#endif + _HFI_DBG("GPU PCIe address is %04x:%02x:%02x.%x\n", + gpu_pci_addr.domain, gpu_pci_addr.bus, + gpu_pci_addr.dev, gpu_pci_addr.func); + + for (i=0; i < ninfo; i++) { + if (nic_info[i].filtered) + continue; + if (nic_info[i].pci_addr.domain == UINT32_MAX + && psmi_hal_get_unit_pci_bus(nic_info[i].unit, + &nic_info[i].pci_addr.domain, &nic_info[i].pci_addr.bus, + &nic_info[i].pci_addr.dev, &nic_info[i].pci_addr.func)) { + _HFI_DBG("Unable to get NIC PCIe address for unit %d (%s)\n", + nic_info[i].unit, psm3_sysfs_unit_dev_name(nic_info[i].unit)); + // can't filter out NIC because if all fail we won't have any. + // Unsure how to rank this NIC vs others, so use max distance + nic_info[i].gpu_distance = INT_MAX; + continue; + } + nic_info[i].gpu_distance = psm3_get_distance_between_pcis( + &nic_info[i].pci_addr, &gpu_pci_addr); + _HFI_DBG("NIC PCIe address for unit %d (%s) is %04x:%02x:%02x.%x distance to GPU: %d\n", + nic_info[i].unit, psm3_sysfs_unit_dev_name(nic_info[i].unit), + nic_info[i].pci_addr.domain, nic_info[i].pci_addr.bus, + nic_info[i].pci_addr.dev, nic_info[i].pci_addr.func, + nic_info[i].gpu_distance); + if (nic_info[i].gpu_distance < min_distance) { + min_distance = nic_info[i].gpu_distance; + } + } + if (min_distance == INT_MAX) { + _HFI_DBG("No NIC found with a known distance\n"); + return; // noop + } + + // filter out all NICs with a distance > min_distance + for (i=0; i < ninfo; i++) { + if (nic_info[i].filtered) + continue; + psmi_assert(nic_info[i].gpu_distance >= min_distance); + nic_info[i].filtered = (nic_info[i].gpu_distance > min_distance); + found += ! nic_info[i].filtered; + } + _HFI_DBG("Found %d unfiltered NICs with GPU distance of %d\n", + found, min_distance); +} +#endif /* PSM_HAVE_GPU_CENTRIC_AFFINITY */ + +// filter down the list of NICs with a CPU locality focus as priority +// if present, the GPU is considered last. If the GPU is NUMA local +// to the CPU, the GPU filter can further limit NICs to those close to the +// GPU (same PCIe switch). But if the GPU is not NUMA local to the CPU, +// the gpu distance filter may still limit distance or end up being a noop. 
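+// Illustrative example (hypothetical NICs A, B, C): if A and B sit under the process's +// CPU socket and only C is near the GPU, the sub-NUMA and NUMA filters keep A and B, +// and the GPU distance filter then only chooses among A and B since C was already removed.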
+static void nic_info_filter_cpu_centric(struct nic_info *nic_info, + unsigned ninfo) +{ + _HFI_DBG("Filtering NICs with CPU Centric Strategy\n"); + nic_info_filter_sub_numa(nic_info, ninfo); + nic_info_filter_numa(nic_info, ninfo); +#ifdef PSM_HAVE_GPU_CENTRIC_AFFINITY + nic_info_filter_gpu_distance(nic_info, ninfo); +#endif +} + +#ifdef PSM_HAVE_GPU_CENTRIC_AFFINITY +// filter down the list of NICs with a GPU locality focus as priority +// When there is a GPU, once we have selected NICs closest to that +// GPU we are likely to have limited ourselves to NICs in the same +// NUMA as the GPU, so the CPU NUMA tests will become noops. +// For example, a GPU and NIC on the same PCIe switch will by definition +// be in the same CPU root complex and hence same CPU NUMA. +// But if there is no GPU or none of the NICs are close to the GPU +// the CPU numa tests may narrow the list of NICs. +static void nic_info_filter_gpu_centric(struct nic_info *nic_info, + unsigned ninfo) +{ + _HFI_DBG("Filtering NICs with GPU Centric Strategy\n"); + nic_info_filter_gpu_distance(nic_info, ninfo); + nic_info_filter_numa(nic_info, ninfo); + nic_info_filter_sub_numa(nic_info, ninfo); +} +#endif /* PSM_HAVE_GPU_CENTRIC_AFFINITY */ + +// analyze the refcount table and filter out NICs with refcounts +// higher than the lowest found. +// If all NICs have equal refcounts, noop. +static void +nic_info_filter_refcount(struct nic_info *nic_info, unsigned ninfo, + uint64_t *refcount, unsigned nunits, const char *str) +{ + unsigned i; + uint64_t min_refcount = UINT64_MAX; // smallest refcount found + unsigned found = 0; + + for (i=0; i < ninfo; i++) { + if (nic_info[i].filtered) + continue; + psmi_assert(nic_info[i].unit < nunits); + _HFI_DBG("NIC %s reference count for unit %d (%s) is %"PRIu64"\n", str, + nic_info[i].unit, psm3_sysfs_unit_dev_name(nic_info[i].unit), + refcount[nic_info[i].unit]); + if (refcount[nic_info[i].unit] < min_refcount) { + min_refcount = refcount[nic_info[i].unit]; + psmi_assert(nic_info[i].unit < nunits);; + } + } + if (min_refcount == UINT64_MAX) { + // unexpected, should have found a smaller value + _HFI_DBG("No NIC found with a low %s reference count\n", str); + return; // noop + } + + // filter out all NICs with a refcount > min_refcount + for (i=0; i < ninfo; i++) { + if (nic_info[i].filtered) + continue; + psmi_assert(refcount[nic_info[i].unit] >= min_refcount); + nic_info[i].filtered = (refcount[nic_info[i].unit] > min_refcount); + found += ! nic_info[i].filtered; + } + _HFI_DBG("Found %d unfiltered NICs with %s reference count of %"PRIu64"\n", + found, str, min_refcount); +} + +// return index in nic_info of 1st unfiltered NIC +static unsigned +nic_info_get_first_unfiltered_nic(struct nic_info *nic_info, unsigned ninfo) +{ + unsigned i; + for (i=0; i < ninfo; i++) { + if (! nic_info[i].filtered) + return i; + } + psmi_assert(0); + return 0; +} + +/* + * Select NIC among the unfiltered NICs in nic_info while + * scoreboarding use of each NIC and picking the one with lowest + * unit number and least use. + * + * Scoreboarding starts with the local process's NIC usage across all EPs + * This helps to ensure a given process balances itself across unfiltered NICs + * on the assumption that all local processes will ultimately have the same + * number of endpoints. + * + * After the local process scoreboarding, the shm scoreboard is checked + * to pick a NIC based on lowest refcount within the server. Thus balancing + * NIC usage within the server. 
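+ * For example (hypothetical): if this process already opened an endpoint on unit 0 and + * units 0 and 1 are both unfiltered, the local process scoreboard keeps only unit 1 for + * the next endpoint, before the node-wide shm counts are consulted.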
+ * + * Among NICs with the lowest reference counts, the lowest entry in nic_info + * (also lowest unit_id) is selected. + * This assumes only one entry appears in nic_info for each unit_id + * (eg. nic_info_init was given per_addr_index of 1) and the entries in + * nic_info are sorted by unit_id (in order built by nic_info_init). + * + * Due to call sequence prior to this, nic_info list will already be sorted by + * unit_id since it was built in that order by nic_info_init. + * Returns index in nic_info of selected NIC. + * On any issues, selects 1st NIC + */ +static int +psm3_open_shm_scoreboard_and_select_nic( + struct nic_info *nic_info, unsigned ninfo, + psm2_uuid_t const job_key, unsigned nunits) +{ + int ret, shm_location, index; + + psmi_assert(nunits > 0); + psmi_assert(ninfo > 0); + + // balance among endpoints within current process + nic_info_filter_refcount(nic_info, ninfo, + psm3_nic_refcount, nunits, "local process"); + + psm3_create_affinity_semaphores(job_key); + /* + * Take affinity lock and open shared memory region to be able to + * accurately determine which NIC to pick for this process. If any + * issues, bail by picking first unfiltered NIC in nic_info + */ + if (!psm3_affinity_semaphore_open) + goto fallback; + + ret = psm3_create_and_open_affinity_shm(job_key); + if (ret < 0) + goto fallback; + + // start of scoreboard area, we keep refcount for each unit_id. + // Note that some other modes may organize the shm area differently, + // so it's important that all processes and all endpoints use the same + // fundamental modes for PSM3_MULTIRAIL and PSM3_NIC_SELECTION_ALG + shm_location = AFFINITY_SHM_HFI_INDEX_LOCATION; + if (shm_location + sizeof(*psm3_shared_affinity_ptr)*nunits > PSMI_PAGESIZE) + goto fallback; + + // At psm3_shm_refcount_ptr in Linux shared memory is a table indexed + // by unit_id with a reference count per NIC showing the total + // endpoints within the job which are using the NIC. 
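+ // (naming note: psm3_shm_refcount_ptr in the comment above refers to the + // psm3_shared_affinity_nic_refcount_ptr assigned below)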
+ psm3_shared_affinity_nic_refcount_ptr = + &psm3_shared_affinity_ptr[shm_location]; + + /* Start critical section to read/write shm object */ + if (psmi_sem_timedwait(psm3_sem_affinity_shm_rw, psm3_sem_affinity_shm_rw_name)) { + _HFI_VDBG("Could not enter critical section to update NIC index\n"); + goto fallback; + } + + // balance among processes within current node + nic_info_filter_refcount(nic_info, ninfo, + psm3_shared_affinity_nic_refcount_ptr, + nunits, "local node"); + + // use lowest index among those which remain + index = nic_info_get_first_unfiltered_nic(nic_info, ninfo); + + // update reference counts for node level and process level + psm3_shared_affinity_nic_refcount_ptr[nic_info[index].unit]++; + psm3_nic_refcount[nic_info[index].unit]++; + + /* End Critical Section */ + psmi_sem_post(psm3_sem_affinity_shm_rw, psm3_sem_affinity_shm_rw_name); + + psmi_assert(index >= 0 && index < ninfo); + _HFI_DBG("Selected NIC unit %d(%s)\n", + nic_info[index].unit, psm3_sysfs_unit_dev_name(nic_info[index].unit)); + return index; + +fallback: + index = nic_info_get_first_unfiltered_nic(nic_info, ninfo); + psm3_nic_refcount[nic_info[index].unit]++; // inc process level refcount + return index; +} + +// decrement reference counts which were incremented in local process +// and in shm within node +// For modes which do not track this style of refcounts psm3_nic_refcount +// will be zero for every unit_id and psm3_shared_affinity_nic_refcount_ptr will +// be NULL (or if psm3 has been finalized) +void psm3_dec_nic_refcount(int unit_id) +{ + // in some modes we don't track refcount, in which case do nothing + if (psm3_nic_refcount[unit_id]) + psm3_nic_refcount[unit_id]--; + if (psm3_affinity_shared_file_opened && psm3_affinity_semaphore_open + && psm3_shared_affinity_nic_refcount_ptr) { + /* Start critical section to read/write shm object */ + if (psmi_sem_timedwait(psm3_sem_affinity_shm_rw, psm3_sem_affinity_shm_rw_name)) { + _HFI_VDBG("Could not enter critical section to update NIC refcount\n"); + } else { + psm3_shared_affinity_nic_refcount_ptr[unit_id]--; + /* End Critical Section */ + psmi_sem_post(psm3_sem_affinity_shm_rw, psm3_sem_affinity_shm_rw_name); + } + } +} + +psm2_error_t +psm3_compute_start_and_end_unit_cpu_centric( + psm2_uuid_t const job_key, + long *unit_start,long *unit_end, int nunits) +{ + unsigned index; + unsigned ninfo; + struct nic_info nic_info[PSMI_MAX_RAILS]; + + // caller will enumerate addr_index, just get all active ports + ninfo = nic_info_init(nic_info, nunits, 0); + if (! ninfo) { + // should not happen, caller already confirmed there is >1 active unit + // mimic what caller of psm3_compute_start_and_end_unit would do + return psm3_handle_error(NULL, PSM2_EP_DEVICE_FAILURE, + "PSM3 no nic units are active"); + } + + nic_info_filter_cpu_centric(nic_info, ninfo); + + index = psm3_open_shm_scoreboard_and_select_nic(nic_info, ninfo, + job_key, nunits); + psmi_assert(index >= 0 && index < ninfo); + + // caller will select 1st active port and an addr_index within unit + *unit_start = *unit_end = nic_info[index].unit; + return PSM2_OK; +} + +#ifdef PSM_HAVE_GPU_CENTRIC_AFFINITY +psm2_error_t +psm3_compute_start_and_end_unit_gpu_centric( + psm2_uuid_t const job_key, + long *unit_start,long *unit_end, int nunits) +{ + unsigned index; + unsigned ninfo; + struct nic_info nic_info[PSMI_MAX_RAILS]; + + // caller will enumerate addr_index, just get all active ports + ninfo = nic_info_init(nic_info, nunits, 0); + if (!
ninfo) { + // should not happen, caller already confirmed there is >1 active unit + // mimic what caller of psm3_compute_start_and_end_unit would do + return psm3_handle_error(NULL, PSM2_EP_DEVICE_FAILURE, + "PSM3 no nic units are active"); + } + + nic_info_filter_gpu_centric(nic_info, ninfo); + + index = psm3_open_shm_scoreboard_and_select_nic(nic_info, ninfo, + job_key, nunits); + psmi_assert(index >= 0 && index < ninfo); + + // caller will select 1st active port and an addr_index within unit + *unit_start = *unit_end = nic_info[index].unit; + return PSM2_OK; +} +#endif /* PSM_HAVE_GPU_CENTRIC_AFFINITY */ + +// return set of units to consider and which to start at. +// caller will use 1st active unit which can be opened. +// caller will wrap around so it's valid for start >= end +// Note: When using multiple rails per PSM process, higher level code will +// walk through desired units and unit_param will specify a specific unit +// if unit_param is PSM3_NIC_ANY, this will pick starting point for nic search +psm2_error_t +psm3_compute_start_and_end_unit(long unit_param, long addr_index, + int nunitsactive, int nunits, + psm2_uuid_t const job_key, + long *unit_start, long *unit_end) +{ + unsigned short nic_sel_alg = PSMI_UNIT_SEL_ALG_ACROSS; + int node_id, found = 0; + int saved_hfis[nunits]; + + /* if the user did not set PSM3_NIC then ... */ + if (unit_param == PSM3_NIC_ANY) + { + if (nunitsactive > 1) { + // if NICs are on different planes (non-routed subnets) + // we need to have all ranks default to the same plane + // so force 1st active NIC in that case + int have_subnet = 0, unit_id; + psmi_subnet128_t got_subnet = { }; + for (unit_id = 0; unit_id < nunits; unit_id++) { + psmi_subnet128_t subnet; + if (psmi_hal_get_unit_active(unit_id) <= 0) + continue; + if (0 != psmi_hal_get_port_subnet(unit_id, 1 /* VERBS_PORT*/, + addr_index>0?addr_index:0, + &subnet, NULL, NULL, NULL)) + continue; // can't access NIC + if (! have_subnet) { + have_subnet = 1; + got_subnet = subnet; + } else if (! psm3_subnets_match(got_subnet, + subnet)) { + // active units have different tech + // (IB/OPA vs Eth) or different subnets + // caller will pick 1st active unit + *unit_start = 0; + *unit_end = nunits - 1; + _HFI_DBG("Multi-Plane config: Will select 1st viable NIC unit= %ld to %ld.\n", + *unit_start, *unit_end); + return PSM2_OK; + } + } + } + + /* Get the actual selection algorithm from the environment: */ + nic_sel_alg = psmi_parse_nic_selection_algorithm(); + /* If round-robin is selection algorithm and ... */ + if ((nic_sel_alg == PSMI_UNIT_SEL_ALG_ACROSS) && + /* there are more than 1 active units then ... */ + (nunitsactive > 1)) + { + /* + * Pick an HFI on same root complex as current task. + * linux IPC ensures balanced NIC usage within job. + * If none found, fall back to + * RoundRobinAll load-balancing algorithm. 
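+ * (RoundRobin spreads local ranks across NUMA-local NICs via the per-NUMA shm scoreboard in + * psmi_spread_hfi_within_socket; RoundRobinAll hashes the job key and local rank across all + * active NICs in psmi_spread_nic_selection.)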
+ */ + node_id = psm3_get_current_proc_location(); + if (node_id >= 0) { + found = hfi_find_active_hfis(nunits, node_id, + saved_hfis); + if (found > 1) { + psm3_create_affinity_semaphores(job_key); + psmi_spread_hfi_within_socket(unit_start, unit_end, + node_id, saved_hfis, + found, job_key); + } else if (found == 1) { + *unit_start = *unit_end = saved_hfis[0]; + _HFI_DBG("RoundRobin Selected NIC unit= %ld, node = %d, local rank=%d, found=%d.\n", + *unit_start, node_id, + psm3_get_mylocalrank(), found); + } + } + + if (node_id < 0 || !found) { + _HFI_DBG("RoundRobin No local NIC found, using RoundRobinAll, node = %d, local rank=%d, found=%d.\n", + node_id, + psm3_get_mylocalrank(), found); + psmi_spread_nic_selection(job_key, unit_start, + unit_end, nunits); + } + } else if ((nic_sel_alg == PSMI_UNIT_SEL_ALG_ACROSS_ALL) && + (nunitsactive > 1)) { + psmi_spread_nic_selection(job_key, unit_start, + unit_end, nunits); + } else if ((nic_sel_alg == PSMI_UNIT_SEL_ALG_CPU_CENTRIC) && + (nunitsactive > 1)) { + return psm3_compute_start_and_end_unit_cpu_centric(job_key, + unit_start, unit_end, nunits); +#ifdef PSM_HAVE_GPU_CENTRIC_AFFINITY + } else if ((nic_sel_alg == PSMI_UNIT_SEL_ALG_GPU_CENTRIC) && + (nunitsactive > 1)) { + return psm3_compute_start_and_end_unit_gpu_centric(job_key, + unit_start, unit_end, nunits); +#endif + } else { // PSMI_UNIT_SEL_ALG_WITHIN or only 1 active unit + // caller will pick 1st active unit + *unit_start = 0; + *unit_end = nunits - 1; + _HFI_DBG("%s: Will select 1st viable NIC unit= %ld to %ld.\n", + (nic_sel_alg == PSMI_UNIT_SEL_ALG_WITHIN) + ?"Packed":"Only 1 viable NIC", + *unit_start, *unit_end); + } + } else if (unit_param >= 0) { + /* the user specified PSM3_NIC, we use it. */ + *unit_start = *unit_end = unit_param; + _HFI_DBG("Caller selected NIC %ld.\n", *unit_start); + } else { + psm3_handle_error(NULL, PSM2_EP_DEVICE_FAILURE, + "PSM3 can't open unit: %ld for reading and writing", + unit_param); + return PSM2_EP_DEVICE_FAILURE; + } + + return PSM2_OK; +} + +static +int psmi_parse_nic_selection_algorithm(void) +{ + union psmi_envvar_val env_nic_alg; + int nic_alg = PSMI_UNIT_SEL_ALG_ACROSS; + + const char* PSM3_NIC_SELECTION_ALG_HELP = + "NIC Device Selection Algorithm to use. Round Robin[RoundRobin or rr] (Default) " + ", Packed[p], Round Robin All[RoundRobinAll or rra]," +#ifdef PSM_HAVE_GPU_CENTRIC_AFFINITY + " CPU Centric Round Robin [CpuRoundRobin or crr]" + ", or GPU Centric Round Robin [GpuRoundRobin or grr]"; +#else + " or CPU Centric Round Robin [CpuRoundRobin or crr]"; +#endif + + + /* If a specific unit is set in the environment, use that one. 
*/ + psm3_getenv("PSM3_NIC_SELECTION_ALG", PSM3_NIC_SELECTION_ALG_HELP, + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR, + (union psmi_envvar_val)"rr", &env_nic_alg); + + if (!strcasecmp(env_nic_alg.e_str, "Round Robin") + || !strcasecmp(env_nic_alg.e_str, "RoundRobin") + || !strcasecmp(env_nic_alg.e_str, "rr")) + nic_alg = PSMI_UNIT_SEL_ALG_ACROSS; + else if (!strcasecmp(env_nic_alg.e_str, "Packed") + || !strcasecmp(env_nic_alg.e_str, "p")) + nic_alg = PSMI_UNIT_SEL_ALG_WITHIN; + else if (!strcasecmp(env_nic_alg.e_str, "Round Robin All") + || !strcasecmp(env_nic_alg.e_str, "RoundRobinAll") + || !strcasecmp(env_nic_alg.e_str, "rra")) + nic_alg = PSMI_UNIT_SEL_ALG_ACROSS_ALL; + else if (!strcasecmp(env_nic_alg.e_str, "CPU Centric Round Robin") + || !strcasecmp(env_nic_alg.e_str, "CpuRoundRobin") + || !strcasecmp(env_nic_alg.e_str, "crr")) + nic_alg = PSMI_UNIT_SEL_ALG_CPU_CENTRIC; +#ifdef PSM_HAVE_GPU_CENTRIC_AFFINITY + else if (!strcasecmp(env_nic_alg.e_str, "GPU Centric Round Robin") + || !strcasecmp(env_nic_alg.e_str, "GpuRoundRobin") + || !strcasecmp(env_nic_alg.e_str, "grr")) + nic_alg = PSMI_UNIT_SEL_ALG_GPU_CENTRIC; +#endif + else { + _HFI_INFO( + "Invalid value for PSM3_NIC_SELECTION_ALG ('%s') %-40s Using: %s\n", + env_nic_alg.e_str, PSM3_NIC_SELECTION_ALG_HELP, "RoundRobin"); + nic_alg = PSMI_UNIT_SEL_ALG_ACROSS; + } + + return nic_alg; +} + +/* parse a list of NIC rails for PSM3_MULTIRAIL_MAP + * map is in format: unit:port-addr_index,unit:port-addr_index,...;unit.... + * where :port is optional (default of 1) and unit can be name or number + * -addr_index is also optional and defaults to "all" + * addr_index can be an integer between 0 and PSM3_ADDR_PER_NIC-1 + * or "any" or "all". "any" selects a single address using the hash and + * "all" setups a rail for each address. + * ; may separate sets of rails. When more than 1 set is presented, the + * map_index selects which set is used. + * Returns: + * 0 - successfully parsed, config_out updated + * -1 - str empty, config_out unchanged + * -2 - syntax error, config_out partially updated + */ +static int psm3_parse_multirail_map(const char *str, int map_index, + size_t errstr_size, char errstr[], + struct multirail_config *config_out) +{ + char temp[MAX_MAP_LEN+1]; + char *s; + char *delim; + char delim_char = '\0'; + unsigned i; + int ret; + int set_index = 0; + + if (!str || ! *str) + return -1; + + strncpy(temp, str, MAX_MAP_LEN); + if (temp[MAX_MAP_LEN-1] != 0) { + if (errstr_size) + snprintf(errstr, errstr_size, + " Value too long, limit %u characters", + MAX_MAP_LEN-1); + return -2; + } + config_out->num_rails = 0; + s = temp; + psmi_assert(*s); + do { + int u; + unsigned int p = 1; + int skip_port = 0; + int skip_addr_index = 0; + long a_index = PSM3_ADDR_INDEX_ALL; + + if (! 
*s) { // trailing ',' or ';' on 2nd or later loop + if (delim_char == ';') + set_index--; // never started next set + break; + } + if (delim_char == ';') { + // start of a new set + config_out->num_rails = 0; + } + if (config_out->num_rails >= PSMI_MAX_RAILS) { + if (errstr_size) + snprintf(errstr, errstr_size, + " Value too long, limit %u rails", + PSMI_MAX_RAILS); + return -2; + } + + // find end of unit field and put in \0 as needed + delim = strpbrk(s, ":-,;"); + if (!delim || *delim == ',' || *delim == ';') { + skip_port = 1; skip_addr_index = 1; + } else if (*delim == '-') { + skip_port = 1; + } + if (delim) { + delim_char = *delim; + *delim = '\0'; + } else { + delim_char = '\0'; + } + // parse unit + u = psm3_sysfs_find_unit(s); + if (u < 0) { + if (errstr_size) + snprintf(errstr, errstr_size, + " Invalid unit: '%s'", s); + return -2; + } + // find next field + if (delim) + s = delim+1; + if (! skip_port) { + // find end of port field and put in \0 as needed + delim = strpbrk(s, "-,;"); + if (!delim || *delim == ',' || *delim == ';') + skip_addr_index = 1; + if (delim) { + delim_char = *delim; + *delim = '\0'; + } else { + delim_char = '\0'; + } + // parse port + if (psm3_parse_str_uint(s, &p, 0, UINT_MAX) < 0) { + if (errstr_size) + snprintf(errstr, errstr_size, + " Invalid port: '%s'", s); + return -2; + } + // find next field + if (delim) + s = delim+1; + } + if (! skip_addr_index) { + // find end of addr_index field and put in \0 as needed + delim = strpbrk(s, ",;"); + if (delim) { + delim_char = *delim; + *delim = '\0'; + } else { + delim_char = '\0'; + } + // parse addr_index + if (0 == strcmp(s, "all")) + a_index = PSM3_ADDR_INDEX_ALL; // we will loop below + else if (0 == strcmp(s, "any")) + a_index = PSM3_ADDR_INDEX_ANY; // caller will pick + else if (psm3_parse_str_long(s, &a_index, 0, psm3_addr_per_nic-1)) { + if (errstr_size) + snprintf(errstr, errstr_size, + " Invalid addr index: '%s'", s); + return -2; + } + // find next field + if (delim) + s = delim+1; + } + + if (a_index == PSM3_ADDR_INDEX_ALL) { // all + for (a_index = 0; a_index < psm3_addr_per_nic; a_index++) { + if (config_out->num_rails >= PSMI_MAX_RAILS) { + if (errstr_size) + snprintf(errstr, errstr_size, + " Limit of %u rails exceeded due to multi-addr", + PSMI_MAX_RAILS); + return -2; + } + config_out->units[config_out->num_rails] = u; + config_out->ports[config_out->num_rails] = p; + config_out->addr_indexes[config_out->num_rails] = a_index; + config_out->num_rails++; + } + } else { + config_out->units[config_out->num_rails] = u; + config_out->ports[config_out->num_rails] = p; + config_out->addr_indexes[config_out->num_rails] = a_index; + config_out->num_rails++; + } + if (delim_char == ';') { + if (set_index == map_index) + break; // found it, stop parsing + set_index++; // start of next + } + } while (delim); + + // if only 1 set input, we use it, otherwise must have enough sets for us + psmi_assert(set_index >= 0); + if (set_index > 0 && set_index != map_index) { + if (errstr_size) + snprintf(errstr, errstr_size, + " Insufficient sets specified: %d need at least %d", + set_index+1, map_index+1); + return -2; + } + psmi_assert(set_index == 0 || set_index == map_index); + + // must have at least 1 rail. Since we caught empty string above, + // if we get here without any rails input must be something like "," or ";" + // and we'll treat that as a syntax error + if (! 
config_out->num_rails) { + if (errstr_size) + snprintf(errstr, errstr_size, " No rails specified"); + return -2; + } + + // Check if any of the ports are not usable. Just use addr_index 0 for check + for (i = 0; i < config_out->num_rails; i++) { + _HFI_VDBG("rail %d: %u(%s) %u\n", i, + config_out->units[i], + psm3_sysfs_unit_dev_name(config_out->units[i]), + config_out->ports[i]); + + ret = psmi_hal_get_port_active(config_out->units[i], + config_out->ports[i]); + if (ret <= 0) { + if (errstr_size) + snprintf(errstr, errstr_size, + " Unit:port: %d(%s):%d is not active.", + config_out->units[i], + psm3_sysfs_unit_dev_name(config_out->units[i]), + config_out->ports[i]); + return -2; + } + + ret = psmi_hal_get_port_lid(config_out->units[i], + config_out->ports[i], 0 /* addr_index*/); + if (ret <= 0) { + if (errstr_size) + snprintf(errstr, errstr_size, + " Unit:port: %d(%s):%d was filtered out, unable to use", + config_out->units[i], + psm3_sysfs_unit_dev_name(config_out->units[i]), + config_out->ports[i]); + return -2; + } + + ret = psmi_hal_get_port_subnet(config_out->units[i], + config_out->ports[i], 0 /* addr_index*/, + NULL, NULL, NULL, NULL); + if (ret == -1) { + if (errstr_size) + snprintf(errstr, errstr_size, + " Couldn't get subnet for unit %d (%s):%d", + config_out->units[i], + psm3_sysfs_unit_dev_name(config_out->units[i]), + config_out->ports[i]); + return -2; + } + } + + // valid input + return 0; +} + +static int psm3_parse_check_multirail_map(int type, + const union psmi_envvar_val val, void *ptr, + size_t errstr_size, char errstr[]) +{ + struct multirail_config temp; + int map_index = *(int*)ptr; + psmi_assert(type == PSMI_ENVVAR_TYPE_STR); + return psm3_parse_multirail_map(val.e_str, map_index, errstr_size, errstr, + &temp); +} + +// comparison function for qsort +// Sort by subnet 1st, then by nic unit, then by addr_index. +// Nics are already numbered in alphabetic order so this effectively +// sorts by subnet, then nic name, then addr_index.. +// We simply ignore the filtered field, filtered NICs will also get sorted +// but omitted from final output list by caller +static int niccmpfunc(const void *p1, const void *p2) +{ + struct nic_info *a = ((struct nic_info *) p1); + struct nic_info *b = ((struct nic_info *) p2); + int ret; + + ret = psmi_subnet128_cmp(a->subnet, b->subnet); + if (ret == 0) { + if (a->unit < b->unit) + return -1; + else if (a->unit > b->unit) + return 1; + + if (a->addr_index < b->addr_index) + return -1; + else if (a->addr_index > b->addr_index) + return 1; + } + return ret; +} + +/* + * Sort all the ports within nic_info from small to big. + * So, when there are multiple fabrics, and we will use fabric with the + * smallest subnet to make the master connection. 
+ */ +static void +psm3_copy_nic_info_to_multitrail_config( + struct nic_info *nic_info, unsigned ninfo, + struct multirail_config *multirail_config) +{ + unsigned i, j; + + qsort(nic_info, ninfo, sizeof(nic_info[0]), niccmpfunc); + + multirail_config->num_rails = 0; + j = 0; + for (i = 0; i < ninfo; i++) { + if (nic_info[i].filtered) + continue; + multirail_config->units[j] = nic_info[i].unit; + multirail_config->ports[j] = nic_info[i].port; + multirail_config->addr_indexes[j] = nic_info[i].addr_index; + multirail_config->num_rails++; + j++; + } +} + +// select a list of NICs to use, optimizing for CPU locality first +static psm2_error_t +psm3_ep_multirail_autoselect_cpu_centric(uint32_t nunits, + struct multirail_config *multirail_config) +{ + unsigned ninfo; + struct nic_info nic_info[PSMI_MAX_RAILS]; + + // enumerate addr_index too + ninfo = nic_info_init(nic_info, nunits, 1); + if (! ninfo) { + // caller will try single NIC selection next + multirail_config->num_rails = 0; + return PSM2_OK; + } + + nic_info_filter_cpu_centric(nic_info, ninfo); + + // we will use all unfiltered units + + // ensure psm3_context_set_affinity doesn't unnecessarily narrow CPU + // selection, it will be called per rail and if rails are in + // different CPU NUMA could have an undesired impact + setenv("PSM3_NO_AFFINITY", "1", 1); + + psm3_copy_nic_info_to_multitrail_config(nic_info, ninfo, multirail_config); + return PSM2_OK; +} + +#ifdef PSM_HAVE_GPU_CENTRIC_AFFINITY +// select a list of NICs to use, optimizing for GPU locality first +static psm2_error_t +psm3_ep_multirail_autoselect_gpu_centric(uint32_t nunits, + struct multirail_config *multirail_config) +{ + unsigned ninfo; + struct nic_info nic_info[PSMI_MAX_RAILS]; + + // enumerate addr_index too + ninfo = nic_info_init(nic_info, nunits, 1); + if (! 
ninfo) { + // caller will try single NIC selection next + multirail_config->num_rails = 0; + return PSM2_OK; + } + + nic_info_filter_gpu_centric(nic_info, ninfo); + + // we will use all unfiltered units + + // ensure psm3_context_set_affinity doesn't unnecessarily narrow CPU + // selection, it will be called per rail and if rails are in + // different CPU NUMA domains it could have an undesired impact + setenv("PSM3_NO_AFFINITY", "1", 1); + + psm3_copy_nic_info_to_multitrail_config(nic_info, ninfo, multirail_config); + return PSM2_OK; +} +#endif /* PSM_HAVE_GPU_CENTRIC_AFFINITY */ + +// for use in psm3_ep_multirail_autoselect so rails can be sorted +// by subnet and addr_index +struct rail_info { + psmi_subnet128_t subnet; + unsigned unit; + unsigned port; + unsigned addr_index; +}; + +static int cmpfunc(const void *p1, const void *p2) +{ + struct rail_info *a = ((struct rail_info *) p1); + struct rail_info *b = ((struct rail_info *) p2); + int ret; + + ret = psmi_subnet128_cmp(a->subnet, b->subnet); + if (ret == 0) { + if (a->addr_index < b->addr_index) + return -1; + else if (a->addr_index > b->addr_index) + return 1; + } + return ret; +} + +// Multirail enabled, autoselect one or more NICs for this process +// multirail_mode is PSM3_MULTIRAIL selection (1=all NICs, 2=NUMA local NICs) +static psm2_error_t +psm3_ep_multirail_autoselect(int multirail_mode, + struct multirail_config *multirail_config) +{ + uint32_t num_units = 0; + psmi_subnet128_t subnet; + unsigned i, j, k, count = 0; + int ret; + psm2_error_t err = PSM2_OK; + struct rail_info rail_info[PSMI_MAX_RAILS]; + int multirail_within_socket_used = 0; + int node_id = -1, found = 0; + + if (multirail_mode == 2) + multirail_within_socket_used = 1; + + + if ((err = psm3_ep_num_devunits(&num_units))) { + return err; + } + + if (num_units > PSMI_MAX_RAILS) { + _HFI_INFO + ("Found %d units, max %d units are supported, using first %d\n", + num_units, PSMI_MAX_RAILS, PSMI_MAX_RAILS); + num_units = PSMI_MAX_RAILS; + } + + if (multirail_mode == 3) + return psm3_ep_multirail_autoselect_cpu_centric(num_units, multirail_config); +#ifdef PSM_HAVE_GPU_CENTRIC_AFFINITY + if (multirail_mode == 4) + return psm3_ep_multirail_autoselect_gpu_centric(num_units, multirail_config); +#endif + + /* + * PSM3_MULTIRAIL=2 functionality: + * - Try to find at least one NIC in the same root + * complex. If none found, continue to run and + * use the remaining NICs in the system. + * - If we do find at least one NIC in the same root complex, we + * go ahead and add it to the list. + */ + if (multirail_within_socket_used) { + node_id = psm3_get_current_proc_location(); + for (i = 0; i < num_units; i++) { + if (psmi_hal_get_unit_active(i) <= 0) + continue; + int node_id_i; + + if (!psmi_hal_get_node_id(i, &node_id_i)) { + if (node_id_i == node_id) { + found = 1; + break; + } + } + } + } +/* + * Get all the ports and addr_index with a valid lid and gid, one port per unit, + * but up to PSM3_ADDR_PER_NIC addresses.
If we are using the NUMA selection + * algorithm and found at list 1 NUMA local NIC above, limit the list to NUMA + * local NICs, otherwise list all NICs + */ + for (i = 0; i < num_units; i++) { + int node_id_i; + + if (!psmi_hal_get_node_id(i, &node_id_i)) + { + if (multirail_within_socket_used && + found && (node_id_i != node_id)) + continue; + } + + for (j = PSM3_NIC_MIN_PORT; j <= PSM3_NIC_MAX_PORT; j++) { + int got_port = 0; + for (k = 0; k < psm3_addr_per_nic; k++) { + ret = psmi_hal_get_port_lid(i, j, k); + if (ret <= 0) + continue; + ret = psmi_hal_get_port_subnet(i, j, k, &subnet, NULL, NULL, NULL); + if (ret == -1) + continue; + + rail_info[count].subnet = subnet; + rail_info[count].unit = i; + rail_info[count].port = j; + rail_info[count].addr_index = k; + got_port = 1; + count++; + } + if (got_port) // one port per unit + break; + } + } + +/* + * Sort all the ports within rail_info from small to big. + * This is for multiple fabrics, and we use fabric with the + * smallest subnet to make the master connection. + */ + qsort(rail_info, count, sizeof(rail_info[0]), cmpfunc); + + for (i = 0; i < count; i++) { + multirail_config->units[i] = rail_info[i].unit; + multirail_config->ports[i] = rail_info[i].port; + multirail_config->addr_indexes[i] = rail_info[i].addr_index; + } + multirail_config->num_rails = count; + return PSM2_OK; +} + +// process PSM3_MULTIRAIL and PSM3_MULTIRAIL_MAP and return the +// list of unit/port/addr_index in multirail_config. +// When multirail_config->num_rails is returned as 0, multirail is not enabled +// and other mechanisms (PSM3_NIC, PSM3_NIC_SELECTION_ALG) must be +// used by the caller to select a single NIC for the process. +// This can return num_rails==1 if exactly 1 NIC is to be used by this process +// or num_rails>1 if this process is to stripe data across multiple NICs +// in which case the 1st NIC in multirail_config should be used as the +// primary NIC for job communications setup. +psm2_error_t +psm3_ep_multirail(struct multirail_config *multirail_config) +{ + int ret; + union psmi_envvar_val env_multirail; + union psmi_envvar_val env_multirail_map; + int map_index; + + psm3_getenv_range("PSM3_MULTIRAIL", + "Control use of multiple NICs", + "-1: No PSM3 NIC autoselection (middleware selects 1 NIC per process).\n" + " 0: (default) Middleware may select NICs or use PSM3 'autoselect_one'\n" + " interface. 'autoselect_one' interface will pick 1 NIC per process\n" + " based on PSM3_NIC_SELECTION_ALG.\n" + " 1: Enable multirail, each process uses all available NICs. Only 'autoselect'\n" + " interface presented to middleware.\n" + " 2: Enable multirail, each process uses all NUMA local NICs. Only 'autoselect'\n" + " interface presented to middleware. If no NUMA local NICs found for a given\n" + " process, PSM3 will use all available NICs for that process.\n" + " 3: Enable multirail, each process selects only ideally located NICs with\n" +#ifdef PSM_HAVE_GPU_CENTRIC_AFFINITY + " consideration of NIC, CPU" +#ifdef PSM3_HAVE_CPU_SUBNUMA + " sub-NUMA" +#endif + " and GPU locations with priority given\n" + " to CPU locality. 
Only 'autoselect' interface presented to middleware.\n" + " If no NUMA local NICs are found for a given process and all NICs are equal\n" + " distance to the GPU, PSM3 will use all available NICs for that process.\n" +#else + " consideration of NIC and CPU" +#ifdef PSM3_HAVE_CPU_SUBNUMA + " sub-NUMA" +#endif + " locations.\n" + " Only 'autoselect' interface presented to middleware.\n" + " If no NUMA local NICs are found for a given process, PSM3 will use all\n" + " available NICs for that process.\n" +#endif +#ifdef PSM_HAVE_GPU_CENTRIC_AFFINITY + " 4: Enable multirail, each process selects only ideally located NICs with\n" + " consideration of NIC, GPU, and CPU" +#ifdef PSM3_HAVE_CPU_SUBNUMA + " sub-NUMA" +#endif + " locations with priority given\n" + " to GPU locality. Only 'autoselect' interface presented to middleware.\n" + " If no NUMA local NICs are found for a given process, PSM3 will use all\n" + " available NICs of equal distance to the GPU for that process." +#endif + , + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_INT, + (union psmi_envvar_val)0, +#ifdef PSM_HAVE_GPU_CENTRIC_AFFINITY + (union psmi_envvar_val)-1, (union psmi_envvar_val)4, +#else + (union psmi_envvar_val)-1, (union psmi_envvar_val)3, +#endif + NULL, NULL, &env_multirail); + if (env_multirail.e_int <= 0) { + // will pick 1 NIC per process + multirail_config->num_rails = 0; + return PSM2_OK; + } + + if (env_multirail.e_int == 1 || env_multirail.e_int == 2) { + // TBD - move this code to a separate function + // for PSM3_MULTIRAIL=1 or 2, PSM3_MULTIRAIL_MAP can explicitly select NICs. + // We treat invalid input, such as bad syntax or selection of an unusable + // port (down/missing/etc), as a fatal error instead of attempting to run + // on the default PSM3_MULTIRAIL_MAP config. This helps avoid + // inconsistent NIC selections, especially for down ports, which may + // cause confusing behaviors or errors. + // If PSM3_MULTIRAIL_MAP contains multiple lists of NICs, then + // if PSM3_MULTIRAIL=1 - use local rank index (0, ...) to select + // if PSM3_MULTIRAIL=2 - use process NUMA (0, ...) 
to select + if (env_multirail.e_int == 1) { + map_index = psm3_get_mylocalrank(); + } else if (env_multirail.e_int == 2) { + map_index = psm3_get_current_proc_location(); + if (map_index < 0) { + return psm3_handle_error(PSMI_EP_NORETURN, + PSM2_EP_DEVICE_FAILURE, + "Unable to get NUMA location of current process\n"); + } + } else { + psmi_assert(0); + } + ret = psm3_getenv_range("PSM3_MULTIRAIL_MAP", + "Explicit NIC selections for each rail", + "Specified as:\n" + " rail,rail,...;rail,rail,...\n" +#if 0 + "Where rail can be: unit:port-addr_index or unit\n" +#else + "Where rail can be: unit-addr_index or unit\n" +#endif + "unit can be device name or unit number\n" +#if 0 + "where :port is optional (default of 1)\n" +#endif + "addr_index can be 0 to PSM3_ADDR_PER_NIC-1, or 'any' or 'all'\n" + "When addr_index is omitted, it defaults to 'all'\n" + "When more than 1 set of rails is present (each set is separated by ;),\n" + "the set to use for a given process is selected based on PSM3_MULTIRAIL.\n" + " 1 - use local rank number to select\n" + " 2 - use local CPU NUMA to select\n" + "When empty, PSM3 will autoselect NICs as controlled by PSM3_MULTIRAIL.", + PSMI_ENVVAR_LEVEL_USER|PSMI_ENVVAR_FLAG_FATAL, PSMI_ENVVAR_TYPE_STR, + (union psmi_envvar_val)"", + (union psmi_envvar_val)NULL, (union psmi_envvar_val)NULL, + psm3_parse_check_multirail_map, &map_index, &env_multirail_map); + if (ret < 0) { // syntax error in input, ret error instead of using default + psmi_assert(0); // should not get here since specified FLAG_FATAL + multirail_config->num_rails = 0; + return psm3_handle_error(PSMI_EP_NORETURN, + PSM2_EP_DEVICE_FAILURE, + "Invalid value for PSM3_MULTIRAIL_MAP: '%s', can't proceed\n", + env_multirail_map.e_str); + } + if (! ret) { + // valid input + if (psm3_parse_multirail_map(env_multirail_map.e_str, map_index, 0, NULL, + multirail_config) < 0) { + // already checked, shouldn't get parse errors nor empty strings + psmi_assert(0); + } + return PSM2_OK; + } + } + + // multirail enabled, automatically select 1 or more NICs + return psm3_ep_multirail_autoselect(env_multirail.e_int, multirail_config); +} + +// potential job start hwloc initialization. To avoid overhead +// when hwloc is not needed, we defer to the 1st actual need for hwloc +void +psm3_hwloc_topology_init() +{ +} + +#if defined(PSM_HAVE_GPU_CENTRIC_AFFINITY) || defined(PSM3_HAVE_CPU_SUBNUMA) +// deferred hwloc initialization. Caller must hold psm3_creation_lock +static void psm3_deferred_hwloc_topology_init() +{ + unsigned version; + Dl_info info_hwloc; + const char *location; + + // only try once + if (psm3_hwloc_topology_initialized || psm3_hwloc_topology_init_failed) + return; + +#define SHOW_HWLOC_VERSION(ver) (ver)>>16, ((ver) >> 8) & 0xff, (ver) & 0xff + version = hwloc_get_api_version(); + location = dladdr(hwloc_topology_init, &info_hwloc) ? 
+ info_hwloc.dli_fname : "hwloc path not available"; + if ((version >> 16) != (HWLOC_API_VERSION >> 16)) { + _HFI_ERROR("PSM3 was compiled for hwloc API %u.%u.%u but found library API %u.%u.%u at %s.\n" + "You may need to point LD_LIBRARY_PATH to the right hwloc library.\n" + "Disabling some NIC selection affinity features\n", + SHOW_HWLOC_VERSION(HWLOC_API_VERSION), SHOW_HWLOC_VERSION(version), + location); + psm3_hwloc_topology_init_failed = 1; + return; + } + // HWLOC_VERSION string mentioned in docs, but not defined in headers + psm3_print_identify("%s %s hwloc runtime API %u.%u.%u at %s, built against API %u.%u.%u\n", + psm3_get_mylabel(), psm3_ident_tag, + SHOW_HWLOC_VERSION(version), location, + SHOW_HWLOC_VERSION(HWLOC_API_VERSION)); + + hwloc_topology_init(&psm3_hwloc_topology); + // detection configuration, need all PCI devices and CPU sub-numa + // HWLOC_API_VERSION is rev X.Y.Z as (X<<16)+(Y<<8)+Z + // significant API changes from 1.0 to 2.0, including ABI changes +#if HWLOC_API_VERSION < 0x20000 + hwloc_topology_set_flags(psm3_hwloc_topology, + HWLOC_TOPOLOGY_FLAG_IO_DEVICES|HWLOC_TOPOLOGY_FLAG_IO_BRIDGES); +#else + hwloc_topology_set_io_types_filter(psm3_hwloc_topology, + HWLOC_TYPE_FILTER_KEEP_ALL); +#endif + hwloc_topology_load(psm3_hwloc_topology); + psm3_hwloc_topology_initialized = 1; +} +#endif /* defined(PSM_HAVE_GPU_CENTRIC_AFFINITY) || defined(PSM3_HAVE_CPU_SUBNUMA) */ + +void +psm3_hwloc_topology_destroy() +{ +#if defined(PSM_HAVE_GPU_CENTRIC_AFFINITY) || defined(PSM3_HAVE_CPU_SUBNUMA) + if (psm3_hwloc_topology_initialized) { + psm3_hwloc_topology_initialized = 0; + hwloc_topology_destroy(psm3_hwloc_topology); + } +#endif +} + +#if defined(PSM_HAVE_GPU_CENTRIC_AFFINITY) || defined(PSM3_HAVE_CPU_SUBNUMA) +/* Get the next PCI device in the system. + * + * return the first PCI device if prev is NULL. + * looping on this allows iterating through all PCIe devices + * device=any PCIe component (root controller, bridge, switch, device, etc) + */ +static inline hwloc_obj_t +get_next_pcidev(hwloc_topology_t topology, hwloc_obj_t prev) +{ + return hwloc_get_next_obj_by_type(topology, HWLOC_OBJ_PCI_DEVICE, prev); +} + +/* Find the PCI device hwloc object matching the PCI bus id + * given domain, bus, device and func PCI bus id. + */ +static hwloc_obj_t +get_pcidev_by_busid(hwloc_topology_t topology, + const struct pci_addr *addr) +{ + hwloc_obj_t obj = NULL; + while ((obj = get_next_pcidev(topology, obj)) != NULL) { + if (obj->attr->pcidev.domain == addr->domain + && obj->attr->pcidev.bus == addr->bus + && obj->attr->pcidev.dev == addr->dev + && obj->attr->pcidev.func == addr->func) + return obj; + } + return NULL; +} +#endif /* defined(PSM_HAVE_GPU_CENTRIC_AFFINITY) || defined(PSM3_HAVE_CPU_SUBNUMA) */ + +#ifdef PSM_HAVE_GPU_CENTRIC_AFFINITY +// compare two hwloc objects for equality +// 1 on match, 0 on mismatch +static int equal_hwlocobj(const hwloc_obj_t obj1, const hwloc_obj_t obj2) +{ + return (obj1->type == obj2->type + && obj1->depth == obj2->depth + && obj1->logical_index == obj2->logical_index); +} + +// compute distance in between objects (PCIe devices). 
+// If the devices are on different PCIe controllers and/or different CPU sockets +// returns INT_MAX +static int get_distance_to_common_ancestor(const hwloc_obj_t obj1, const hwloc_obj_t obj2) +{ + int d1 = 0; + int d2 = 0; + hwloc_obj_t temp1 = obj1; + + while (temp1) { + + hwloc_obj_t temp2 = obj2; + d2 = 0; + + while (temp2) { + + /* common ancestor found */ + if (equal_hwlocobj(temp1, temp2)) { + return d1 + d2; + } + temp2 = temp2->parent; + d2++; + } + temp1 = temp1->parent; + d1++; + } + + /* No common ancestor found, return INT_MAX as the distance */ + return INT_MAX; +} + +// compute distance in PCIe hops between devices. If the +// If the devices are on different PCIe controllers and/or different CPU sockets +// returns INT_MAX +static int psm3_get_distance_between_pcis(const struct pci_addr *pci_addr_1, + const struct pci_addr *pci_addr_2) +{ + hwloc_obj_t obj1 = get_pcidev_by_busid(psm3_hwloc_topology, pci_addr_1); + hwloc_obj_t obj2 = get_pcidev_by_busid(psm3_hwloc_topology, pci_addr_2); + return get_distance_to_common_ancestor(obj1, obj2); +} +#endif /* PSM_HAVE_GPU_CENTRIC_AFFINITY */ + +#ifdef PSM3_HAVE_CPU_SUBNUMA +// find ancestor of a device, namely the PCIe controller in the CPU socket +static hwloc_obj_t psm3_get_non_io_ancestor_obj( + const struct pci_addr *pci_addr) +{ + hwloc_obj_t obj = get_pcidev_by_busid(psm3_hwloc_topology, pci_addr); + if (! obj) + return NULL; + return hwloc_get_non_io_ancestor_obj(psm3_hwloc_topology, obj); +} +#endif /* PSM3_HAVE_CPU_SUBNUMA */ diff --git a/prov/psm3/psm3/psm_nic_select.h b/prov/psm3/psm3/psm_nic_select.h new file mode 100644 index 00000000000..cfd23ea1081 --- /dev/null +++ b/prov/psm3/psm3/psm_nic_select.h @@ -0,0 +1,116 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2024 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2024 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2024 Intel Corporation. All rights reserved. */ + +#ifndef _PSMI_IN_USER_H +#error psm_nic_select.h not meant to be included directly, include psm_user.h instead +#endif + +#ifndef _PSM_NIC_SELECT_H +#define _PSM_NIC_SELECT_H + +// PSM3_NIC_SELECTION_ALG choices +/* + * round robin contexts across HFIs, then + * ports; this is the default. + * This option spreads the HFI selection within the local socket. + * If it is preferred to spread the job over the entire set of + * HFIs within the system, see ALG_ACROSS_ALL below. + */ +#define PSMI_UNIT_SEL_ALG_ACROSS PSM_HAL_ALG_ACROSS + +#define PSMI_UNIT_SEL_ALG_ACROSS_ALL PSM_HAL_ALG_ACROSS_ALL + +/* + * use all contexts on an HFI (round robin + * active ports within), then next HFI + */ +#define PSMI_UNIT_SEL_ALG_WITHIN PSM_HAL_ALG_WITHIN + +#define PSMI_UNIT_SEL_ALG_CPU_CENTRIC PSM_HAL_ALG_CPU_CENTRIC +#ifdef PSM_HAVE_GPU_CENTRIC_AFFINITY +#define PSMI_UNIT_SEL_ALG_GPU_CENTRIC PSM_HAL_ALG_GPU_CENTRIC +#endif + +struct multirail_config { + int num_rails; + uint32_t units[PSMI_MAX_RAILS]; + uint16_t ports[PSMI_MAX_RAILS]; + int addr_indexes[PSMI_MAX_RAILS]; +}; + +// return set of units to consider and which to start at. +// caller will use 1st active unit which can be opened. +// caller will wrap around so it's valid for start >= end +// Note: When using multiple rails per PSM process, higher level code will +// walk through desired units and unit_param will specify a specific unit. +// if unit_param is PSM3_NIC_ANY, this will pick starting point for nic search +psm2_error_t +psm3_compute_start_and_end_unit(long unit_param, long addr_index, + int nunitsactive,int nunits, + psm2_uuid_t const job_key, + long *unit_start,long *unit_end); + +psm2_error_t +psm3_ep_multirail(struct multirail_config *multirail_config); + +// decrement any NIC refcounts which may have been +// incremented by psm3_compute_start_and_end_unit +void psm3_dec_nic_refcount(int unit_id); + +// manage hwloc topology discovery. These will be no-ops when !
PSM_USE_HWLOC +void psm3_hwloc_topology_init(); +void psm3_hwloc_topology_destroy(); + +#endif /* PSM_NIC_SELECT_H */ diff --git a/prov/psm3/psm3/psm_oneapi_ze.c b/prov/psm3/psm3/psm_oneapi_ze.c index 568581ad84b..2090fb68326 100644 --- a/prov/psm3/psm3/psm_oneapi_ze.c +++ b/prov/psm3/psm3/psm_oneapi_ze.c @@ -70,6 +70,7 @@ int psm3_num_ze_dev_fds; #endif int psm3_oneapi_immed_sync_copy; int psm3_oneapi_immed_async_copy; +unsigned psm3_oneapi_parallel_dtod_copy_thresh; const char* psmi_oneapi_ze_result_to_string(const ze_result_t result) { #define ZE_RESULT_CASE(RES) case ZE_RESULT_##RES: return STRINGIFY(RES) @@ -203,6 +204,72 @@ void psmi_oneapi_ze_memcpy(void *dstptr, const void *srcptr, size_t size) } } +// synchronous GPU memcpy DTOD (xeLink) +void psmi_oneapi_ze_memcpy_DTOD(void *dstptr, const void *srcptr, size_t size) +{ + struct ze_dev_ctxt *ctxt; + + psmi_assert(size > 0); + ctxt = psmi_oneapi_dev_ctxt_get(dstptr); + if (!ctxt) { + _HFI_ERROR("dst %p src %p not GPU buf for copying\n", + dstptr, srcptr); + return; + } + if (size <= psm3_oneapi_parallel_dtod_copy_thresh) { + if (psm3_oneapi_immed_sync_copy) { + PSMI_ONEAPI_ZE_CALL(zeCommandListAppendMemoryCopy, ctxt->cl, + dstptr, srcptr, size, NULL, 0, NULL); + } else { + PSMI_ONEAPI_ZE_CALL(zeCommandListReset, ctxt->cl); + PSMI_ONEAPI_ZE_CALL(zeCommandListAppendMemoryCopy, ctxt->cl, + dstptr, srcptr, size, NULL, 0, NULL); + PSMI_ONEAPI_ZE_CALL(zeCommandListClose, ctxt->cl); + PSMI_ONEAPI_ZE_CALL(zeCommandQueueExecuteCommandLists, ctxt->cq, + 1, &ctxt->cl, NULL); + PSMI_ONEAPI_ZE_CALL(zeCommandQueueSynchronize, ctxt->cq, UINT32_MAX); + } + } else { + // for large DTOD copies, start 2 parallel commands + // then wait for both + size_t size0 = ROUNDUP64P2(size/2, 64*1024); + size_t size1 = size - size0; + + if (psm3_oneapi_immed_sync_copy) { + PSMI_ONEAPI_ZE_CALL(zeCommandListAppendMemoryCopy, ctxt->async_cl0, + dstptr, srcptr, size0, ctxt->copy_status0, 0, NULL); + + PSMI_ONEAPI_ZE_CALL(zeCommandListAppendMemoryCopy, ctxt->async_cl1, + (void*)((uintptr_t)dstptr+size0), + (void*)((uintptr_t)srcptr+size0), size1, ctxt->copy_status1, + 0, NULL); + } else { + PSMI_ONEAPI_ZE_CALL(zeCommandListReset, ctxt->async_cl0); + PSMI_ONEAPI_ZE_CALL(zeCommandListAppendMemoryCopy, ctxt->async_cl0, + dstptr, srcptr, size0, ctxt->copy_status0, 0, NULL); + PSMI_ONEAPI_ZE_CALL(zeCommandListClose, ctxt->async_cl0); + PSMI_ONEAPI_ZE_CALL(zeCommandQueueExecuteCommandLists, ctxt->async_cq0, + 1, &ctxt->async_cl0, NULL); + + PSMI_ONEAPI_ZE_CALL(zeCommandListReset, ctxt->async_cl1); + PSMI_ONEAPI_ZE_CALL(zeCommandListAppendMemoryCopy, ctxt->async_cl1, + (void*)((uintptr_t)dstptr+size0), + (void*)((uintptr_t)srcptr+size0), size1, ctxt->copy_status1, + 0, NULL); + PSMI_ONEAPI_ZE_CALL(zeCommandListClose, ctxt->async_cl1); + PSMI_ONEAPI_ZE_CALL(zeCommandQueueExecuteCommandLists, ctxt->async_cq1, + 1, &ctxt->async_cl1, NULL); + } + // 2nd copy may be slightly smaller so wait for it first so we + // can potentially hide its Reset latency while 1st copy completes + PSMI_ONEAPI_ZE_CALL(zeEventHostSynchronize, ctxt->copy_status1, UINT32_MAX); + PSMI_ONEAPI_ZE_CALL(zeEventHostReset, ctxt->copy_status1); + + PSMI_ONEAPI_ZE_CALL(zeEventHostSynchronize, ctxt->copy_status0, UINT32_MAX); + PSMI_ONEAPI_ZE_CALL(zeEventHostReset, ctxt->copy_status0); + } +} + // for pipelined async GPU memcpy // *p_cq is left as NULL when psm3_oneapi_immed_async_copy enabled void psmi_oneapi_async_cmd_create(struct ze_dev_ctxt *ctxt, diff --git a/prov/psm3/psm3/psm_perf.c
b/prov/psm3/psm3/psm_perf.c index 6b30ca60eeb..5e2f6c4f169 100644 --- a/prov/psm3/psm3/psm_perf.c +++ b/prov/psm3/psm3/psm_perf.c @@ -207,7 +207,7 @@ static void psmi_rdpmc_perf_framework_init() * * Read the current value of a running performance counter. */ -unsigned long long rdpmc_read(struct rdpmc_ctx *ctx) +unsigned long long psm3_rdpmc_read(struct rdpmc_ctx *ctx) { static __thread int rdpmc_perf_initialized = 0; diff --git a/prov/psm3/psm3/psm_perf.h b/prov/psm3/psm3/psm_perf.h index db51ceb2fa7..8fdea147fca 100644 --- a/prov/psm3/psm3/psm_perf.h +++ b/prov/psm3/psm3/psm_perf.h @@ -87,7 +87,7 @@ extern char global_rdpmc_slot_name[RDPMC_PERF_MAX_SLOT_NUMBER][RDPMC_PERF_MAX_SL extern unsigned int global_rdpmc_type; extern unsigned int global_rdpmc_config; -extern unsigned long long rdpmc_read(struct rdpmc_ctx *ctx); +extern unsigned long long psm3_rdpmc_read(struct rdpmc_ctx *ctx); #define RDPMC_PERF_INIT() \ { \ @@ -111,12 +111,12 @@ extern unsigned long long rdpmc_read(struct rdpmc_ctx *ctx); #define RDPMC_PERF_BEGIN(slot_number) \ { \ - global_rdpmc_begin[(slot_number)] = rdpmc_read(&global_rdpmc_ctx); \ + global_rdpmc_begin[(slot_number)] = psm3_rdpmc_read(&global_rdpmc_ctx); \ } #define RDPMC_PERF_END(slot_number) \ { \ - global_rdpmc_summ[(slot_number)] += (rdpmc_read(&global_rdpmc_ctx) - global_rdpmc_begin[(slot_number)]); \ + global_rdpmc_summ[(slot_number)] += (psm3_rdpmc_read(&global_rdpmc_ctx) - global_rdpmc_begin[(slot_number)]); \ global_rdpmc_number[(slot_number)]++; \ } diff --git a/prov/psm3/psm3/psm_stats.c b/prov/psm3/psm3/psm_stats.c index 400a8e8c55e..4ae33fe9a85 100644 --- a/prov/psm3/psm3/psm_stats.c +++ b/prov/psm3/psm3/psm_stats.c @@ -641,30 +641,54 @@ psm2_error_t psm3_stats_initialize(void) { union psmi_envvar_val env_stats_freq; + union psmi_envvar_val env_stats_prefix; union psmi_envvar_val env_stats_help; union psmi_envvar_val env_statsmask; - int got_stats_freq; - int got_stats_help; - int got_statsmask; + int noenv_stats_freq; // env var not specified, used default + int noenv_stats_prefix; // env var not specified, used default + int noenv_stats_help; // env var not specified, used default + int noenv_statsmask; // env var not specified, used default psmi_assert(! 
perf_stats_initialized); - got_stats_freq = psm3_getenv("PSM3_PRINT_STATS", - "Prints performance stats every n seconds to file " - "./psm3-perf-stat-[hostname]-pid-[pid] when set to -1 stats are " - "printed only once on 1st ep close", - PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, - (union psmi_envvar_val) 0, &env_stats_freq); - print_stats_freq = env_stats_freq.e_uint; - - got_stats_help = psm3_getenv("PSM3_PRINT_STATS_HELP", + noenv_stats_freq = (0 < psm3_getenv_range("PSM3_PRINT_STATS", + "Prints performance stats every n seconds", + " 0 - disable output\n" + " -1 - only output once at end of job on 1st ep close\n" + " >=1 - output every n seconds\n" + " val: - limit output to rank 0 (for val of -1 or >=1)\n" + " val:pattern - limit output to processes whose label matches\n " +#ifdef FNM_EXTMATCH + "extended " +#endif + "glob pattern (for val of -1 or >=1)\n" + "Output goes to file ${PSM3_PRINT_STATS_PREFIX}psm3-perf-stat-[hostname]-pid-[pid]", + PSMI_ENVVAR_LEVEL_USER|PSMI_ENVVAR_FLAG_NOABBREV, + PSMI_ENVVAR_TYPE_STR_VAL_PAT_INT, + (union psmi_envvar_val)"0", + (union psmi_envvar_val)-1, (union psmi_envvar_val)INT_MAX, + NULL, NULL, &env_stats_freq)); + (void)psm3_parse_val_pattern_int(env_stats_freq.e_str, 0, + &print_stats_freq, + PSMI_ENVVAR_FLAG_NOABBREV, -1, INT_MAX); + + noenv_stats_prefix = (0 < psm3_getenv_range("PSM3_PRINT_STATS_PREFIX", + "Prefix for filename for performance stats output", + "May be used to add a prefix, possibly including a directory, for output", + PSMI_ENVVAR_LEVEL_USER|PSMI_ENVVAR_FLAG_NOABBREV, + PSMI_ENVVAR_TYPE_STR, + (union psmi_envvar_val)"./", + (union psmi_envvar_val)NULL, (union psmi_envvar_val)NULL, + NULL, NULL, &env_stats_prefix)); + + noenv_stats_help = (0 < psm3_getenv("PSM3_PRINT_STATS_HELP", "Prints performance stats help text on rank 0 to file " - "./psm3-perf-stat-help-[hostname]-pid-[pid]", + "${PSM3_PRINT_STATS_PREFIX}psm3-perf-stat-help-[hostname]-pid-[pid]", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, - (union psmi_envvar_val) 0, &env_stats_help); + (union psmi_envvar_val) 0, &env_stats_help)); print_stats_help = env_stats_help.e_uint && (psm3_get_myrank() == 0); - got_statsmask = psm3_getenv("PSM3_PRINT_STATSMASK", + noenv_statsmask = (0 < psm3_getenv("PSM3_PRINT_STATSMASK", "Mask of statistic types to print: " "MQ=1, RCVTHREAD=0x100, IPS=0x200" #if defined(PSM_HAVE_REG_MR) @@ -681,21 +705,21 @@ psm3_stats_initialize(void) #endif ".
0x100000 causes zero values to also be shown", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS, - (union psmi_envvar_val) PSMI_STATSTYPE_ALL, &env_statsmask); + (union psmi_envvar_val) PSMI_STATSTYPE_ALL, &env_statsmask)); print_statsmask = env_statsmask.e_uint; stats_start = time(NULL); snprintf(perf_file_name, sizeof(perf_file_name), - "./psm3-perf-stat-%s-pid-%d", - psm3_gethostname(), getpid()); + "%spsm3-perf-stat-%s-pid-%d", + env_stats_prefix.e_str, psm3_gethostname(), getpid()); if (print_stats_help) { // a few options, such as CUDA, ONEAPI_ZE, RDMA affect what is // included in help, so use a unique filename per job snprintf(perf_help_file_name, sizeof(perf_help_file_name), - "./psm3-perf-stat-help-%s-pid-%d", - psm3_gethostname(), getpid()); + "%spsm3-perf-stat-help-%s-pid-%d", + env_stats_prefix.e_str, psm3_gethostname(), getpid()); perf_help_fd = fopen(perf_help_file_name, "w"); if (!perf_help_fd) _HFI_ERROR("Failed to create fd for performance logging help: %s: %s\n", @@ -706,13 +730,19 @@ psm3_stats_initialize(void) print_job_info_help(); print_basic_job_info(); - if (got_stats_freq) + // if we got a valid or an invalid value, psm3_getenv will have + // stashed it and print_basic_job_info will have put it in the stats file + // otherwise we want to always report the STATS variable settings + if (noenv_stats_freq) psm3_stats_print_env_val("PSM3_PRINT_STATS", PSMI_ENVVAR_TYPE_UINT, env_stats_freq); - if (got_stats_help) + if (noenv_stats_prefix) + psm3_stats_print_env_val("PSM3_PRINT_STATS_PREFIX", + PSMI_ENVVAR_TYPE_STR, env_stats_prefix); + if (noenv_stats_help) psm3_stats_print_env_val("PSM3_PRINT_STATS_HELP", PSMI_ENVVAR_TYPE_UINT, env_stats_help); - if (got_statsmask) + if (noenv_statsmask) psm3_stats_print_env_val("PSM3_PRINT_STATSMASK", PSMI_ENVVAR_TYPE_UINT_FLAGS, env_statsmask); diff --git a/prov/psm3/psm3/psm_sysbuf.c b/prov/psm3/psm3/psm_sysbuf.c index f9bee0be199..698507e8528 100644 --- a/prov/psm3/psm3/psm_sysbuf.c +++ b/prov/psm3/psm3/psm_sysbuf.c @@ -77,11 +77,46 @@ struct psmi_mem_block_ctrl { void psm3_mq_sysbuf_init(psm2_mq_t mq) { int i; + // sysbuf is used for unexpected eager messages in nic, shm and self + // for self, unexpected is a courtesy to bad apps, app should always post + // recv before send when sending to self. + // for nic, eager is only messages below rendezvous threshold. + // In TCP and CPU jobs threshold can be larger. TCP allows up to 256K. + // Typical verbs rendezvous threshold is 8000-64K bytes, with GPU + // tending to use a lower threshold as GPU copies are expensive. + // for shm, GPU messages use rendezvous anytime GPU supports Scale-Up + // GPU to GPU comms, such as xeLink or nvLink. + // A message which exceeds the largest block_size[] will have a temporary + // sysbuf allocated and freed. For CPU this is ok as malloc is not + // terribly expensive. However for GPU, the subsequent copy will pay + // a GPU DMA registration cost in Cuda or Level Zero, so it is best to + // avoid temporary buffers. Fortunately GPU apps tend to have fewer + // processes per node and hence more available CPU memory to hold the + // buffers. + // + // So for GPU jobs, we allow a few larger block sizes just in case + // rendezvous threshold is set high or TCP is being used with a large + // eager message size (aka PSM3_MTU). + // replenishing_rate is how many we add to pool at a time, there is + // no upper bound to the pool.
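+	// Worked example (illustrative only, numbers assumed, not part of the
+	// original comment): with the GPU pool sizes below, a 100000 byte
+	// unexpected eager message is served from the 262144-byte pool and the
+	// buffer returns to the free list for reuse, while a CPU-only build,
+	// whose largest fixed block is 8192 bytes, would satisfy the same
+	// message with a temporary sysbuf that is malloc'd and freed as
+	// described above.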
+#if defined(PSM_CUDA) || defined(PSM_ONEAPI) + uint32_t gpu_block_sizes[] = {256, 512, 1024, 2048, 4096, 8192, 65536, 262144, (uint32_t)-1}; + uint32_t gpu_replenishing_rate[] = {128, 64, 32, 16, 8, 4, 2, 2, 0}; + uint32_t block_sizes[] = {256, 512, 1024, 2048, 4096, 8192, (uint32_t)-1, (uint32_t)-1, (uint32_t)-1}; + uint32_t replenishing_rate[] = {128, 64, 32, 16, 8, 4, 0, 0, 0}; +#else uint32_t block_sizes[] = {256, 512, 1024, 2048, 4096, 8192, (uint32_t)-1}; uint32_t replenishing_rate[] = {128, 64, 32, 16, 8, 4, 0}; +#endif if (mq->mem_ctrl_is_init) return; +#if defined(PSM_CUDA) || defined(PSM_ONEAPI) + if (PSMI_IS_GPU_ENABLED) { + memcpy(block_sizes, gpu_block_sizes, sizeof(block_sizes)); + memcpy(replenishing_rate, gpu_replenishing_rate, sizeof(replenishing_rate)); + } +#endif mq->mem_ctrl_is_init = 1; for (i=0; i < MM_NUM_OF_POOLS; i++) { @@ -125,9 +160,35 @@ void psm3_mq_sysbuf_fini(psm2_mq_t mq) // free all buffers that is currently no for (i=0; i < MM_NUM_OF_POOLS; i++) { while ((block = mq->handler_index[i].free_list) != NULL) { mq->handler_index[i].free_list = block->next; +#if defined(PSM_CUDA) && !defined(PSM_NO_CUDA_REGISTER) + if (PSMI_IS_GPU_ENABLED && cu_ctxt) { + /* ignore NOT_REGISTERED in case cuda initialized late */ + /* ignore other errors as context could be destroyed before this */ + CUresult cudaerr; + //PSMI_CUDA_CALL_EXCEPT(CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED, + // cuMemHostUnregister, block); + psmi_count_cuMemHostUnregister++; + cudaerr = psmi_cuMemHostUnregister(block); + if (cudaerr) { + const char *pStr = NULL; + psmi_count_cuGetErrorString++; + psmi_cuGetErrorString(cudaerr, &pStr); + _HFI_DBG("CUDA failure: cuMemHostUnregister returned %d: %s\n", + cudaerr, pStr?pStr:"Unknown"); + } + } +#endif #if defined(PSM_ONEAPI) && !defined(PSM3_NO_ONEAPI_IMPORT) - if (PSMI_IS_GPU_ENABLED) - PSMI_ONEAPI_ZE_CALL(zexDriverReleaseImportedPointer, ze_driver, block); + if (PSMI_IS_GPU_ENABLED) { + ze_result_t result; + //PSMI_ONEAPI_ZE_CALL(zexDriverReleaseImportedPointer, ze_driver, block); + psmi_count_zexDriverReleaseImportedPointer++; + result = psmi_zexDriverReleaseImportedPointer(ze_driver, + block); + if (result != ZE_RESULT_SUCCESS) { + _HFI_DBG("OneAPI Level Zero failure: zexDriverReleaseImportedPointer returned %d: %s\n", result, psmi_oneapi_ze_result_to_string(result)); + } + } #endif psmi_free(block); } @@ -168,6 +229,13 @@ void *psm3_mq_sysbuf_alloc(psm2_mq_t mq, uint32_t alloc_size) new_block = psmi_malloc(mq->ep, UNEXPECTED_BUFFERS, newsz); if (new_block) { +#if defined(PSM_CUDA) && !defined(PSM_NO_CUDA_REGISTER) + // for transient buffers, no use Importing, adds cost for + // CPU copy, just pay GPU cost on the copy, we use once & free + //if (PSMI_IS_GPU_ENABLED && check_have_cuda_ctxt()) + // PSMI_CUDA_CALL(cuMemHostRegister, new_block, newsz, + // CU_MEMHOSTALLOC_PORTABLE); +#endif #if defined(PSM_ONEAPI) && !defined(PSM3_NO_ONEAPI_IMPORT) // for transient buffers, no use Importing, adds cost for // CPU copy, just pay GPU cost on the copy, we use once & free @@ -189,6 +257,14 @@ void *psm3_mq_sysbuf_alloc(psm2_mq_t mq, uint32_t alloc_size) new_block = psmi_malloc(mq->ep, UNEXPECTED_BUFFERS, newsz); if (new_block) { +#if defined(PSM_CUDA) && !defined(PSM_NO_CUDA_REGISTER) + // By registering memory with Cuda, we make + // cuMemcpy* run faster for copies between + // GPU and this sysbuf + if (PSMI_IS_GPU_ENABLED && check_have_cuda_ctxt()) + PSMI_CUDA_CALL(cuMemHostRegister, new_block, newsz, + CU_MEMHOSTALLOC_PORTABLE); +#endif #if defined(PSM_ONEAPI)
&& !defined(PSM3_NO_ONEAPI_IMPORT) // By registering memory with Level Zero, we make // zeCommandListAppendMemoryCopy run faster for copies between @@ -233,11 +309,21 @@ void psm3_mq_sysbuf_free(psm2_mq_t mq, void * mem_to_free) mm_handler = block_to_free->mem_handler; if (mm_handler->flags & MM_FLAG_TRANSIENT) { +#if defined(PSM_CUDA) && !defined(PSM_NO_CUDA_REGISTER) + // for transient buffers, no use Importing, adds cost for + // CPU copy, just pay GPU cost on the copy, we use once & free + //if (PSMI_IS_GPU_ENABLED && cu_ctxt) { + // /* ignore NOT_REGISTERED in case cuda initialized late */ + // CUresult cudaerr; + // PSMI_CUDA_CALL_EXCEPT(CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED, + // cuMemHostUnregister, block_to_free); + //} +#endif #if defined(PSM_ONEAPI) && !defined(PSM3_NO_ONEAPI_IMPORT) // for transient buffers, no use Importing, adds cost for // CPU copy, just pay GPU cost on the copy, we use once & free //if (PSMI_IS_GPU_ENABLED) - // PSMI_ONEAPI_ZE_CALL(zexDriverReleaseImportedPointer, ze_driver, block); + // PSMI_ONEAPI_ZE_CALL(zexDriverReleaseImportedPointer, ze_driver, block_to_free); #endif psmi_free(block_to_free); } else { diff --git a/prov/psm3/psm3/psm_sysbuf.h b/prov/psm3/psm3/psm_sysbuf.h index 90945d520ed..31ff116d088 100644 --- a/prov/psm3/psm3/psm_sysbuf.h +++ b/prov/psm3/psm3/psm_sysbuf.h @@ -58,7 +58,11 @@ #include "psm_user.h" +#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#define MM_NUM_OF_POOLS 9 +#else #define MM_NUM_OF_POOLS 7 +#endif typedef struct psmi_mem_ctrl { struct psmi_mem_block_ctrl *free_list; diff --git a/prov/psm3/psm3/psm_user.h b/prov/psm3/psm3/psm_user.h index 38e9b8d9310..18c58d9934d 100644 --- a/prov/psm3/psm3/psm_user.h +++ b/prov/psm3/psm3/psm_user.h @@ -60,6 +60,13 @@ extern "C" { #endif +#if defined(PSM_CUDA) +// if defined, do not use cuMemHostRegister for malloced pipeline +// copy bounce buffers +// otherwise, use cuMemHostRegister when malloc buffer +//#define PSM3_NO_CUDA_REGISTER +#endif + #if defined(PSM_ONEAPI) // if defined, use malloc for pipeline copy bounce buffers // otherwise, use zeMemAllocHost @@ -116,6 +123,10 @@ extern "C" { #endif /* RNDV_MOD */ +#if (defined(PSM_CUDA) || defined(PSM_ONEAPI)) && defined(PSM_USE_HWLOC) +#define PSM_HAVE_GPU_CENTRIC_AFFINITY +#endif + #include "psm_config.h" #include #include @@ -166,6 +177,7 @@ typedef void *psmi_hal_hw_context; #include "psm_help.h" #include "psm_error.h" +#include "psm_nic_select.h" #include "psm_context.h" #include "psm_utils.h" #include "psm_timer.h" @@ -208,6 +220,7 @@ extern int psm3_opened_endpoint_count; extern int psm3_affinity_shared_file_opened; extern uint64_t *psm3_shared_affinity_ptr; +extern uint64_t *psm3_shared_affinity_nic_refcount_ptr; extern char *psm3_affinity_shm_name; extern sem_t *psm3_sem_affinity_shm_rw; @@ -378,6 +391,8 @@ extern uint32_t gpudirect_rdma_send_limit; extern uint32_t gpudirect_rdma_recv_limit; extern uint32_t gpu_thresh_rndv; +#define MAX_ZE_DEVICES 8 + struct ips_gpu_hostbuf { STAILQ_ENTRY(ips_gpu_hostbuf) req_next; STAILQ_ENTRY(ips_gpu_hostbuf) next; @@ -390,8 +405,9 @@ struct ips_gpu_hostbuf { CUevent copy_status; #elif defined(PSM_ONEAPI) ze_event_pool_handle_t event_pool; - ze_command_list_handle_t command_list; + ze_command_list_handle_t command_lists[MAX_ZE_DEVICES]; ze_event_handle_t copy_status; + int cur_dev_inx; #endif psm2_mq_req_t req; void* host_buf; @@ -413,8 +429,6 @@ extern void *psmi_cuda_lib; #ifdef PSM_ONEAPI -#define MAX_ZE_DEVICES 8 - int psmi_oneapi_ze_initialize(void); psm2_error_t psm3_ze_init_fds(void); 
int *psm3_ze_get_dev_fds(int *nfds); @@ -428,11 +442,22 @@ extern int psm3_num_ze_dev_fds; struct ze_dev_ctxt { ze_device_handle_t dev; + int dev_index; /* Index in ze_devices[] */ uint32_t ordinal; /* CmdQGrp ordinal for the 1st copy_only engine */ uint32_t index; /* Cmdqueue index within the CmdQGrp */ uint32_t num_queues; /* Number of queues in the CmdQGrp */ + // for most sync copies ze_command_queue_handle_t cq; // NULL if psm3_oneapi_immed_sync_copy ze_command_list_handle_t cl; + // fields below are only used for large DTOD sync copy so can do 2 + // parallel async copies then wait for both + ze_event_handle_t copy_status0; + ze_event_handle_t copy_status1; + ze_command_list_handle_t async_cl0; + ze_command_list_handle_t async_cl1; + ze_command_queue_handle_t async_cq0;// NULL if psm3_oneapi_immed_sync_copy + ze_command_queue_handle_t async_cq1;// NULL if psm3_oneapi_immed_sync_copy + ze_event_pool_handle_t event_pool; }; extern ze_api_version_t zel_api_version; @@ -444,6 +469,7 @@ extern int num_ze_devices; extern struct ze_dev_ctxt *cur_ze_dev; extern int psm3_oneapi_immed_sync_copy; extern int psm3_oneapi_immed_async_copy; +extern unsigned psm3_oneapi_parallel_dtod_copy_thresh; const char* psmi_oneapi_ze_result_to_string(const ze_result_t result); void psmi_oneapi_async_cmd_create(struct ze_dev_ctxt *ctxt, @@ -467,6 +493,7 @@ extern int psm3_oneapi_ze_using_zemem_alloc; extern void psm3_oneapi_ze_can_use_zemem(); void psmi_oneapi_ze_memcpy(void *dstptr, const void *srcptr, size_t size); +void psmi_oneapi_ze_memcpy_DTOD(void *dstptr, const void *srcptr, size_t size); static inline int device_support_gpudirect() @@ -501,6 +528,8 @@ extern CUresult (*psmi_cuEventRecord)(CUevent hEvent, CUstream hStream); extern CUresult (*psmi_cuEventSynchronize)(CUevent hEvent); extern CUresult (*psmi_cuMemHostAlloc)(void** pp, size_t bytesize, unsigned int Flags); extern CUresult (*psmi_cuMemFreeHost)(void* p); +extern CUresult (*psmi_cuMemHostRegister)(void* p, size_t bytesize, unsigned int Flags); +extern CUresult (*psmi_cuMemHostUnregister)(void* p); extern CUresult (*psmi_cuMemcpy)(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount); extern CUresult (*psmi_cuMemcpyDtoD)(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount); extern CUresult (*psmi_cuMemcpyDtoH)(void* dstHost, CUdeviceptr srcDevice, size_t ByteCount); @@ -527,6 +556,7 @@ extern ze_result_t (*psmi_zexDriverImportExternalPointer)(ze_driver_handle_t hDr extern ze_result_t (*psmi_zexDriverReleaseImportedPointer)(ze_driver_handle_t hDriver, void *ptr); #endif extern ze_result_t (*psmi_zeDeviceGet)(ze_driver_handle_t hDriver, uint32_t *pCount, ze_device_handle_t *phDevices); +extern ze_result_t (*psmi_zeDevicePciGetPropertiesExt)(ze_device_handle_t hDevice, ze_pci_ext_properties_t *pPciProperties); #ifndef PSM3_NO_ONEAPI_IMPORT extern ze_result_t (*psmi_zeDriverGetExtensionFunctionAddress)(ze_driver_handle_t hDriver, const char *name, void **ppFunctionAddress); #endif @@ -591,6 +621,8 @@ extern uint64_t psmi_count_cuEventRecord; extern uint64_t psmi_count_cuEventSynchronize; extern uint64_t psmi_count_cuMemHostAlloc; extern uint64_t psmi_count_cuMemFreeHost; +extern uint64_t psmi_count_cuMemHostRegister; +extern uint64_t psmi_count_cuMemHostUnregister; extern uint64_t psmi_count_cuMemcpy; extern uint64_t psmi_count_cuMemcpyDtoD; extern uint64_t psmi_count_cuMemcpyDtoH; @@ -617,6 +649,7 @@ extern uint64_t psmi_count_zexDriverImportExternalPointer; extern uint64_t psmi_count_zexDriverReleaseImportedPointer; #endif extern 
uint64_t psmi_count_zeDeviceGet; +extern uint64_t psmi_count_zeDevicePciGetPropertiesExt; #ifndef PSM3_NO_ONEAPI_IMPORT extern uint64_t psmi_count_zeDriverGetExtensionFunctionAddress; #endif @@ -679,6 +712,20 @@ static int check_set_cuda_ctxt(void) return 0; } +/* Make sure have a real GPU job. Set cu_ctxt if available */ +PSMI_ALWAYS_INLINE( +int check_have_cuda_ctxt(void)) +{ + if (! cu_ctxt) { + if (unlikely(check_set_cuda_ctxt())) { \ + psm3_handle_error(PSMI_EP_NORETURN, \ + PSM2_INTERNAL_ERR, "Failed to set/synchronize" \ + " CUDA context.\n"); \ + } \ + } + return (cu_ctxt != NULL); +} + #define PSMI_CUDA_CALL(func, args...) do { \ CUresult cudaerr; \ @@ -688,19 +735,18 @@ static int check_set_cuda_ctxt(void) " CUDA context.\n"); \ } \ psmi_count_##func++; \ - cudaerr = psmi_##func(args); \ + cudaerr = (CUresult)psmi_##func(args); \ if (cudaerr != CUDA_SUCCESS) { \ const char *pStr = NULL; \ psmi_count_cuGetErrorString++; \ psmi_cuGetErrorString(cudaerr, &pStr); \ _HFI_ERROR( \ "CUDA failure: %s() (at %s:%d)" \ - "returned %d: %s\n", \ + " returned %d: %s\n", \ #func, __FILE__, __LINE__, cudaerr, \ pStr?pStr:"Unknown"); \ - psm3_handle_error( \ - PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, \ - "Error returned from CUDA function.\n");\ + psm3_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, \ + "Error returned from CUDA function %s.\n", #func);\ } \ } while (0) #endif // PSM_CUDA @@ -712,12 +758,12 @@ static int check_set_cuda_ctxt(void) psmi_count_##func++; \ result = psmi_##func(args); \ if(result != ZE_RESULT_SUCCESS) { \ - _HFI_ERROR( "OneAPI Level Zero failure: %s() (at %s:%d) " \ - "returned %d(%s)\n", \ - #func, __FILE__, __LINE__, result, psmi_oneapi_ze_result_to_string(result)); \ - psm3_handle_error( PSMI_EP_NORETURN, \ - PSM2_INTERNAL_ERR, \ - "Error returned from OneAPI Level Zero function %s.\n", STRINGIFY(func)); \ + _HFI_ERROR( "OneAPI Level Zero failure: %s() (at %s:%d)" \ + " returned 0x%x: %s\n", \ + #func, __FILE__, __LINE__, result, \ + psmi_oneapi_ze_result_to_string(result)); \ + psm3_handle_error( PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, \ + "Error returned from OneAPI Level Zero function %s.\n", #func); \ } \ } while (0) @@ -755,7 +801,7 @@ _psmi_is_oneapi_ze_mem(const void *ptr, struct ze_dev_ctxt **ctxt)) if (result == ZE_RESULT_SUCCESS && (mem_props.type != ZE_MEMORY_TYPE_UNKNOWN)) { ret = 1; - _HFI_VDBG("ptr %p type %d dev %p ze_device %p\n", + _HFI_VDBG("ptr %p type %d dev %p cur_ze_dev %p\n", ptr, mem_props.type, dev, cur_ze_dev->dev); /* * Check if the gpu device has changed. 
@@ -782,6 +828,7 @@ _psmi_is_oneapi_ze_mem(const void *ptr, struct ze_dev_ctxt **ctxt)) break; } } + _HFI_VDBG("check ze_device[%d-%d] for dev %p: no match\n", 0, num_ze_devices-1, dev); } } @@ -947,19 +994,18 @@ int gpu_p2p_supported()) "before psm3_ep_open call \n"); \ _HFI_ERROR( \ "CUDA failure: %s() (at %s:%d)" \ - "returned %d: %s\n", \ + " returned %d: %s\n", \ #func, __FILE__, __LINE__, cudaerr, \ pStr?pStr:"Unknown"); \ - psm3_handle_error( \ - PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, \ - "Error returned from CUDA function.\n");\ + psm3_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, \ + "Error returned from CUDA function %s.\n", #func);\ } else if (cudaerr == except_err) { \ const char *pStr = NULL; \ psmi_count_cuGetErrorString++; \ psmi_cuGetErrorString(cudaerr, &pStr); \ _HFI_DBG( \ "CUDA non-zero return value: %s() (at %s:%d)" \ - "returned %d: %s\n", \ + " returned %d: %s\n", \ #func, __FILE__, __LINE__, cudaerr, \ pStr?pStr:"Unknown"); \ } \ @@ -974,12 +1020,11 @@ int gpu_p2p_supported()) psmi_count_cuGetErrorString++; \ psmi_cuGetErrorString(cudaerr, &pStr); \ _HFI_ERROR( \ - "CUDA failure: %s() returned %d: %s\n", \ - "cuEventQuery", cudaerr, \ + "CUDA failure: %s() (at %s:%d) returned %d: %s\n", \ + "cuEventQuery", __FILE__, __LINE__, cudaerr, \ pStr?pStr:"Unknown"); \ - psm3_handle_error( \ - PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, \ - "Error returned from CUDA function.\n");\ + psm3_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, \ + "Error returned from CUDA function cuEventQuery.\n");\ } \ } while (0) @@ -1063,13 +1108,12 @@ int _psm3_oneapi_ze_memcpy_done(const struct ips_gpu_hostbuf *ghb) } else if (result == ZE_RESULT_NOT_READY) { return 0; } else { - _HFI_ERROR( "OneAPI LZ failure: %s() returned %d(%s)\n", - __FUNCTION__, result, + _HFI_ERROR("OneAPI Level Zero failure: %s() (at %s:%d) returned 0x%x: %s\n", + "zeEventQueryStatus", __FILE__, __LINE__, result, psmi_oneapi_ze_result_to_string(result)); - psm3_handle_error( PSMI_EP_NORETURN, - PSM2_INTERNAL_ERR, - "Error returned from OneAPI LZ function %s.\n", - __FUNCTION__); + psm3_handle_error( PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, + "Error returned from OneAPI Level Zero function %s.\n", + "zeEventQueryStatus"); } return 0; } @@ -1219,16 +1263,13 @@ _psmi_is_gdr_copy_enabled()) PSMI_CUDA_CALL(cuEventRecord, ghb->copy_status, \ protoexp->cudastream_recv); \ } while (0) -#define PSM3_GPU_MEMCPY_DTOH_START(proto, ghb, len, bufsz) \ +#define PSM3_GPU_MEMCPY_DTOH_START(proto, ghb, len) \ do { \ if (proto->cudastream_send == NULL) { \ PSMI_CUDA_CALL(cuStreamCreate, \ &proto->cudastream_send, \ CU_STREAM_NON_BLOCKING); \ } \ - if (ghb->host_buf == NULL && bufsz) { \ - PSM3_GPU_HOST_ALLOC(&ghb->host_buf, bufsz); \ - } \ if (ghb->copy_status == NULL) { \ PSMI_CUDA_CALL(cuEventCreate, \ &ghb->copy_status, CU_EVENT_DEFAULT); \ @@ -1246,13 +1287,6 @@ _psmi_is_gdr_copy_enabled()) ghb->copy_status = NULL; \ ghb->host_buf = NULL; \ } while (0) -// TBD, create of Event here could be omitted and let HTOD/DTOH_START create it -#define PSM3_GPU_HOSTBUF_FORCE_INIT(ghb, bufsz) \ - do { \ - PSM3_GPU_HOST_ALLOC(&ghb->host_buf, bufsz); \ - PSMI_CUDA_CALL(cuEventCreate, \ - &ghb->copy_status, CU_EVENT_DEFAULT); \ - } while (0) #define PSM3_GPU_HOSTBUF_RESET(ghb) \ do { \ } while (0) @@ -1278,6 +1312,10 @@ _psmi_is_gdr_copy_enabled()) PSMI_CUDA_CALL(cuMemHostAlloc, (void **)(ret_ptr), \ (size),CU_MEMHOSTALLOC_PORTABLE); \ } while (0) +#define PSM3_GPU_HOST_FREE(ptr) \ + do { \ + PSMI_CUDA_CALL(cuMemFreeHost, (void *)ptr); \ + } while (0) // 
HOST_ALLOC memory treated as CPU memory for Verbs MRs #define PSM3_GPU_ADDR_SEND_MR(mqreq) \ ( (mqreq)->is_buf_gpu_mem && ! (mqreq)->gpu_hostbuf_used ) @@ -1295,24 +1333,40 @@ _psmi_is_gdr_copy_enabled()) #elif defined(PSM_ONEAPI) #define PSM3_GPU_PREPARE_HTOD_MEMCPYS(protoexp) \ do { \ - protoexp->cq_recv = NULL; \ + int i; \ + \ + for (i = 0; i < MAX_ZE_DEVICES; i++) \ + protoexp->cq_recvs[i] = NULL; \ } while (0) #define PSM3_GPU_PREPARE_DTOH_MEMCPYS(proto) \ do { \ - proto->cq_send = NULL; \ + int i; \ + \ + for (i = 0; i < MAX_ZE_DEVICES; i++) \ + proto->cq_sends[i] = NULL; \ } while (0) #define PSM3_GPU_SHUTDOWN_HTOD_MEMCPYS(protoexp) \ do { \ - if (protoexp->cq_recv) { \ - PSMI_ONEAPI_ZE_CALL(zeCommandQueueDestroy, \ - protoexp->cq_recv); \ + int i; \ + \ + for (i = 0; i < MAX_ZE_DEVICES; i++) { \ + if (protoexp->cq_recvs[i]) { \ + PSMI_ONEAPI_ZE_CALL(zeCommandQueueDestroy, \ + protoexp->cq_recvs[i]); \ + protoexp->cq_recvs[i] = NULL; \ + } \ } \ } while (0) #define PSM3_GPU_SHUTDOWN_DTOH_MEMCPYS(proto) \ do { \ - if (proto->cq_send) { \ - PSMI_ONEAPI_ZE_CALL(zeCommandQueueDestroy, \ - proto->cq_send); \ + int i; \ + \ + for (i = 0; i < MAX_ZE_DEVICES; i++) { \ + if (proto->cq_sends[i]) { \ + PSMI_ONEAPI_ZE_CALL(zeCommandQueueDestroy, \ + proto->cq_sends[i]); \ + proto->cq_sends[i] = NULL; \ + } \ } \ } while (0) @@ -1330,13 +1384,14 @@ _psmi_is_gdr_copy_enabled()) .index = 0 \ }; \ struct ze_dev_ctxt *ctxt; \ + int inx; \ \ ctxt = psmi_oneapi_dev_ctxt_get(ghb->gpu_buf); \ if (!ctxt) \ psm3_handle_error(PSMI_EP_NORETURN, \ PSM2_INTERNAL_ERR, \ - "%s HTOD: no dev ctxt\n", \ - __FUNCTION__); \ + "%s HTOD: unknown GPU device for addr %p\n", \ + __FUNCTION__, ghb->gpu_buf);\ if (ghb->event_pool == NULL) { \ PSMI_ONEAPI_ZE_CALL(zeEventPoolCreate, \ ze_context, &pool_desc, 0, NULL, \ @@ -1347,23 +1402,26 @@ _psmi_is_gdr_copy_enabled()) ghb->event_pool, &event_desc, \ &ghb->copy_status); \ } \ - if (! ghb->command_list) { \ + inx = ctxt->dev_index; \ + if (! ghb->command_lists[inx]) { \ psmi_oneapi_async_cmd_create(ctxt, \ - &protoexp->cq_recv, &ghb->command_list);\ + &protoexp->cq_recvs[inx], \ + &ghb->command_lists[inx]); \ } \ + ghb->cur_dev_inx = inx; \ PSMI_ONEAPI_ZE_CALL(zeCommandListAppendMemoryCopy, \ - ghb->command_list, \ + ghb->command_lists[inx], \ ghb->gpu_buf, ghb->host_buf, len, \ ghb->copy_status, 0, NULL); \ if (! 
psm3_oneapi_immed_async_copy) { \ PSMI_ONEAPI_ZE_CALL(zeCommandListClose, \ - ghb->command_list); \ + ghb->command_lists[inx]); \ PSMI_ONEAPI_ZE_CALL(zeCommandQueueExecuteCommandLists,\ - protoexp->cq_recv, 1, \ - &ghb->command_list, NULL); \ + protoexp->cq_recvs[inx], 1, \ + &ghb->command_lists[inx], NULL); \ } \ } while (0) -#define PSM3_GPU_MEMCPY_DTOH_START(proto, ghb, len, bufsz) \ +#define PSM3_GPU_MEMCPY_DTOH_START(proto, ghb, len) \ do { \ ze_event_pool_desc_t pool_desc = { \ .stype = ZE_STRUCTURE_TYPE_EVENT_POOL_DESC, \ @@ -1377,13 +1435,14 @@ _psmi_is_gdr_copy_enabled()) .index = 0 \ }; \ struct ze_dev_ctxt *ctxt; \ + int inx; \ \ ctxt = psmi_oneapi_dev_ctxt_get(ghb->gpu_buf); \ if (!ctxt) \ psm3_handle_error(PSMI_EP_NORETURN, \ PSM2_INTERNAL_ERR, \ - "%s DTOH: no dev ctxt\n", \ - __FUNCTION__); \ + "%s DTOH: unknown GPU device for addr %p\n", \ + __FUNCTION__, ghb->gpu_buf);\ if (ghb->event_pool == NULL) { \ PSMI_ONEAPI_ZE_CALL(zeEventPoolCreate, \ ze_context, &pool_desc, 0, NULL, \ @@ -1394,68 +1453,50 @@ _psmi_is_gdr_copy_enabled()) ghb->event_pool, &event_desc, \ &ghb->copy_status); \ } \ - if (ghb->host_buf == NULL && bufsz) { \ - PSM3_GPU_HOST_ALLOC(&ghb->host_buf, bufsz); \ - } \ - if (! ghb->command_list) { \ + inx = ctxt->dev_index; \ + if (! ghb->command_lists[inx]) { \ psmi_oneapi_async_cmd_create(ctxt, \ - &proto->cq_send, &ghb->command_list);\ + &proto->cq_sends[inx], \ + &ghb->command_lists[inx]); \ } \ + ghb->cur_dev_inx = inx; \ PSMI_ONEAPI_ZE_CALL(zeCommandListAppendMemoryCopy, \ - ghb->command_list, \ + ghb->command_lists[inx], \ ghb->host_buf, ghb->gpu_buf, len, \ ghb->copy_status, 0, NULL); \ if (! psm3_oneapi_immed_async_copy) { \ PSMI_ONEAPI_ZE_CALL(zeCommandListClose, \ - ghb->command_list); \ + ghb->command_lists[inx]); \ PSMI_ONEAPI_ZE_CALL(zeCommandQueueExecuteCommandLists,\ - proto->cq_send, 1, \ - &ghb->command_list, NULL); \ + proto->cq_sends[inx], 1, \ + &ghb->command_lists[inx], NULL); \ } \ } while (0) #define PSM3_GPU_MEMCPY_DONE(ghb) \ _psm3_oneapi_ze_memcpy_done(ghb) #define PSM3_GPU_HOSTBUF_LAZY_INIT(ghb) \ do { \ + int i; \ + \ ghb->event_pool = NULL; \ ghb->copy_status = NULL; \ - ghb->command_list = NULL; \ + for (i = 0; i < MAX_ZE_DEVICES; i++) \ + ghb->command_lists[i] = NULL; \ ghb->host_buf = NULL; \ } while (0) -// TBD, create of Event and command list here could be omitted and let -// HTOD/DTOH_START create it -#define PSM3_GPU_HOSTBUF_FORCE_INIT(ghb, bufsz) \ - do { \ - ze_event_pool_desc_t pool_desc = { \ - .stype = ZE_STRUCTURE_TYPE_EVENT_POOL_DESC, \ - .flags = ZE_EVENT_POOL_FLAG_HOST_VISIBLE, \ - .count = 1 \ - }; \ - ze_event_desc_t event_desc = { \ - .stype = ZE_STRUCTURE_TYPE_EVENT_DESC, \ - .signal = ZE_EVENT_SCOPE_FLAG_HOST, \ - .wait = ZE_EVENT_SCOPE_FLAG_HOST, \ - .index = 0 \ - }; \ - PSMI_ONEAPI_ZE_CALL(zeEventPoolCreate, \ - ze_context, &pool_desc, 0, NULL, \ - &ghb->event_pool); \ - PSMI_ONEAPI_ZE_CALL(zeEventCreate, \ - ghb->event_pool, &event_desc, \ - &ghb->copy_status); \ - PSM3_GPU_HOST_ALLOC(&ghb->host_buf, bufsz); \ - } while (0) #define PSM3_GPU_HOSTBUF_RESET(ghb) \ do { \ if (! 
psm3_oneapi_immed_async_copy) { \ PSMI_ONEAPI_ZE_CALL(zeCommandListReset, \ - ghb->command_list); \ + ghb->command_lists[ghb->cur_dev_inx]);\ } \ PSMI_ONEAPI_ZE_CALL(zeEventHostReset, \ ghb->copy_status); \ } while (0) #define PSM3_GPU_HOSTBUF_DESTROY(ghb) \ do { \ + int i; \ + \ if (ghb->copy_status != NULL) { \ PSMI_ONEAPI_ZE_CALL(zeEventDestroy, \ ghb->copy_status); \ @@ -1467,13 +1508,17 @@ _psmi_is_gdr_copy_enabled()) PSMI_ONEAPI_ZE_CALL(zeEventPoolDestroy, \ ghb->event_pool); \ } \ - if (ghb->command_list != NULL) { \ - PSMI_ONEAPI_ZE_CALL(zeCommandListDestroy, \ - ghb->command_list); \ + for (i = 0; i < MAX_ZE_DEVICES; i++) { \ + if (ghb->command_lists[i]) { \ + PSMI_ONEAPI_ZE_CALL( \ + zeCommandListDestroy, \ + ghb->command_lists[i]); \ + ghb->command_lists[i] = NULL; \ + } \ } \ } while (0) #define PSM3_GPU_MEMCPY_DTOD(dstptr, srcptr, len) \ - do { psmi_oneapi_ze_memcpy(dstptr, srcptr, len); } while(0) + do { psmi_oneapi_ze_memcpy_DTOD(dstptr, srcptr, len); } while(0) #define PSM3_GPU_MEMCPY_HTOD(dstptr, srcptr, len) \ do { psmi_oneapi_ze_memcpy(dstptr, srcptr, len); } while(0) #define PSM3_GPU_SYNCHRONIZE_MEMCPY() \ @@ -1506,6 +1551,7 @@ _psmi_is_gdr_copy_enabled()) ( (tidrecvc)->is_ptr_gpu_backed \ || ((mqreq)->gpu_hostbuf_used && psm3_oneapi_ze_using_zemem_alloc)) #endif /* PSM3_USE_ONEAPI_MALLOC */ +#define PSM3_GPU_HOST_FREE(ptr) PSM3_ONEAPI_ZE_HOST_FREE(ptr) #define PSM3_MARK_BUF_SYNCHRONOUS(buf) do { /* not needed for OneAPI ZE */ } while (0) #define PSM3_GPU_MEMCPY_DTOH(dstptr, srcptr, len) \ do { psmi_oneapi_ze_memcpy(dstptr, srcptr, len); } while (0) diff --git a/prov/psm3/psm3/psm_utils.c b/prov/psm3/psm3/psm_utils.c index e99b950e1bf..c2525fa935c 100644 --- a/prov/psm3/psm3/psm_utils.c +++ b/prov/psm3/psm3/psm_utils.c @@ -2550,14 +2550,12 @@ unsigned psmi_parse_gpudirect_rdma_send_limit(int force) /* Default send threshold for Gpu-direct set to UINT_MAX * (always use GPUDIRECT) */ - psm3_getenv("PSM3_GPUDIRECT_RDMA_SEND_LIMIT", - "GPUDirect RDMA feature on send side will be switched off for messages larger than limit.", + psm3_getenv_range("PSM3_GPUDIRECT_RDMA_SEND_LIMIT", + "GPUDirect RDMA feature on send side will be switched off for messages larger than limit.", NULL, PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, -#ifdef PSM_ONEAPI - (union psmi_envvar_val)(1024*1024), &envval); -#else - (union psmi_envvar_val)UINT_MAX, &envval); -#endif + (union psmi_envvar_val)UINT_MAX, + (union psmi_envvar_val)0, (union psmi_envvar_val)UINT_MAX, + NULL, NULL, &envval); saved = envval.e_uint; done: @@ -2584,10 +2582,16 @@ unsigned psmi_parse_gpudirect_rdma_recv_limit(int force) /* Default receive threshold for Gpu-direct set to UINT_MAX * (always use GPUDIRECT) */ - psm3_getenv("PSM3_GPUDIRECT_RDMA_RECV_LIMIT", - "GPUDirect RDMA feature on receive side will be switched off for messages larger than limit.", + psm3_getenv_range("PSM3_GPUDIRECT_RDMA_RECV_LIMIT", + "GPUDirect RDMA feature on receive side will be switched off for messages larger than limit.", NULL, PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, - (union psmi_envvar_val)UINT_MAX, &envval); +#ifdef PSM_CUDA + (union psmi_envvar_val)UINT_MAX, +#elif defined(PSM_ONEAPI) + (union psmi_envvar_val)1, +#endif + (union psmi_envvar_val)0, (union psmi_envvar_val)UINT_MAX, + NULL, NULL, &envval); saved = envval.e_uint; done: @@ -2611,10 +2615,11 @@ unsigned psmi_parse_gpudirect_rv_gpu_cache_size(int reload) // RV defaults are sufficient for default PSM parameters // but for HALs with RDMA, if user adjusts ep->hfi_num_send_rdma or 
- // mq->hfi_base_window_rv they also need to increase the cache size. + // mq->ips_gpu_window_rv they also need to increase the cache size. // psm3_verbs_alloc_mr_cache will verify cache size is sufficient. // min size is (HFI_TF_NFLOWS + ep->hfi_num_send_rdma) * - // chunk size (mq->hfi_base_window_rv after psmi_mq_initialize_params) + // chunk size (psm3_mq_max_window_rv(mq, 1) after + // psmi_mq_initialize_params) if (PSMI_IS_GPU_ENABLED && psmi_parse_gpudirect() ) { psm3_getenv("PSM3_RV_GPU_CACHE_SIZE", "kernel space GPU cache size" @@ -2665,23 +2670,28 @@ int psm3_parse_identify(void) { union psmi_envvar_val myenv; static int have_value; - static unsigned saved_identify; + static int saved_identify; // only parse once so doesn't appear in PSM3_VERBOSE_ENV multiple times if (have_value) return saved_identify; - psm3_getenv("PSM3_IDENTIFY", "Identify PSM version being run " - "(0 - disable, 1 - enable, 1: - limit output to rank 0, " - "1:pattern - limit output " - "to processes whose label matches " + psm3_getenv_range("PSM3_IDENTIFY", "Identify PSM version being run", + " 0 - disable\n" + " 1 - enable\n" + " 1: - limit output to rank 0\n" + " 1:pattern - limit output to processes whose label matches\n " #ifdef FNM_EXTMATCH "extended " #endif "glob pattern)", - PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR_VAL_PAT, - (union psmi_envvar_val)"0", &myenv); - (void)psm3_parse_val_pattern(myenv.e_str, 0, &saved_identify); + PSMI_ENVVAR_LEVEL_USER|PSMI_ENVVAR_FLAG_NOABBREV, + PSMI_ENVVAR_TYPE_STR_VAL_PAT_INT, + (union psmi_envvar_val)"0", + (union psmi_envvar_val)0, (union psmi_envvar_val)1, + NULL, NULL, &myenv); + (void)psm3_parse_val_pattern_int(myenv.e_str, 0, &saved_identify, + PSMI_ENVVAR_FLAG_NOABBREV, 0, 1); have_value = 1; return saved_identify; @@ -2891,11 +2901,12 @@ void psm3_print_ep_identify(psm2_ep_t ep) (void)psmi_hal_get_port_speed(ep->unit_id, ep->portnum, &link_speed); psmi_hal_get_node_id(ep->unit_id, &node_id); - psm3_print_identify("%s %s NIC %u (%s) Port %u %"PRIu64" Mbps NUMA %d %s%s\n", + psm3_print_identify("%s %s NIC %u (%s) Port %u %"PRIu64" Mbps NUMA %d %s%s%s\n", psm3_get_mylabel(), psm3_ident_tag, ep->unit_id, ep->dev_name, ep->portnum, link_speed/(1000*1000), node_id, psm3_epid_fmt_addr(ep->epid, 0), + ep->addl_nic_info?ep->addl_nic_info:"", (! psm3_ep_device_is_enabled(ep, PTL_DEVID_AMSH) && (((struct ptl_ips *)(ep->ptl_ips.ptl))->proto.flags & IPS_PROTO_FLAG_LOOPBACK))?" 
loopback":""); @@ -3011,7 +3022,7 @@ void psm3_parse_multi_ep() #ifdef PSM_FI -unsigned psm3_faultinj_enabled = 0; +int psm3_faultinj_enabled = 0; int psm3_faultinj_verbose = 0; char *psm3_faultinj_outfile = NULL; int psm3_faultinj_sec_rail = 0; @@ -3025,21 +3036,25 @@ void psm3_parse_faultinj() { union psmi_envvar_val env_fi; - psm3_getenv("PSM3_FI", "PSM Fault Injection " - "(0 - disable, 1 - enable, " - "2 - enable but default each injector to 0 rate " - "#: - limit to rank 0, " - "#:pattern - limit " - "to processes whose label matches " + psm3_getenv_range("PSM3_FI", "PSM Fault Injection", + " 0 - disable\n" + " 1 - enable\n" + " 2 - enable but default each injector to 0 rate\n" + " #: - limit to rank 0\n" + " #:pattern - limit to processes whose label matches\n " #ifdef FNM_EXTMATCH "extended " #endif - "glob pattern) " - "mode 2 can be useful to generate full stats help " - "when PSM3_PRINT_STATS_HELP enabled", - PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_STR_VAL_PAT, - (union psmi_envvar_val)"0", &env_fi); - (void)psm3_parse_val_pattern(env_fi.e_str, 0, &psm3_faultinj_enabled); + "glob pattern\n" + "mode 2 can be useful to generate help for all injectors\n" + "when PSM3_PRINT_STATS_HELP=1 or PSM3_VERBOSE_ENV=3:", + PSMI_ENVVAR_LEVEL_HIDDEN|PSMI_ENVVAR_FLAG_NOABBREV, + PSMI_ENVVAR_TYPE_STR_VAL_PAT_INT, + (union psmi_envvar_val)"0", + (union psmi_envvar_val)0, (union psmi_envvar_val)2, + NULL, NULL, &env_fi); + (void)psm3_parse_val_pattern_int(env_fi.e_str, 0, + &psm3_faultinj_enabled, PSMI_ENVVAR_FLAG_NOABBREV, 0, 2); if (psm3_faultinj_enabled) { char *def = NULL; @@ -3143,6 +3158,52 @@ void psm3_faultinj_fini() return; } +/* parse fault injection controls + * format is num:denom:initial_seed + * denom must be >= num and > 0 + * Either field can be omitted in which case default (input fvals) is used + * for given field. + * 0 - successfully parsed, fvals updated + * -1 - str empty, fvals unchanged + * -2 - syntax error, fvals may have been changed + */ +static int parse_faultinj_control(const char *str, + size_t errstr_size, char errstr[], + int fvals[3]) +{ + psmi_assert(fvals); + int ret = psm3_parse_str_tuples(str, 3, fvals); + if (ret < 0) + return ret; + if (! fvals[1]) { + if (errstr_size) + snprintf(errstr, errstr_size, " denom must be non-zero"); + return -2; + } + if (fvals[0] < 0 || fvals[1] < 0) { + if (errstr_size) + snprintf(errstr, errstr_size, " Negative values for num and denom not allowed"); + return -2; + } + if (fvals[0] > fvals[1]) { + if (errstr_size) + snprintf(errstr, errstr_size, " num (%d) must be <= denom (%d)", fvals[0], fvals[1]); + return -2; + } + return 0; +} + +static int parse_check_faultinj_control(int type, + const union psmi_envvar_val val, void *ptr, + size_t errstr_size, char errstr[]) +{ + // parser will set fvals to result, use a copy to protect input of defaults + int fvals[3] = { ((int*)ptr)[0], ((int*)ptr)[1], ((int*)ptr)[2] }; + psmi_assert(type == PSMI_ENVVAR_TYPE_STR_TUPLES); + return parse_faultinj_control(val.e_str, errstr_size, errstr, fvals); +} + + /* * Intended to be used only once, not in the critical path */ @@ -3186,27 +3247,34 @@ struct psm3_faultinj_spec *psm3_faultinj_getspec(const char *spec_name, * error condition. 
*/ { - int fvals[3] = { num, denom, (int)getpid() }; + int fvals[3] = { fi->num, fi->denom, fi->initial_seed }; union psmi_envvar_val env_fi; char fvals_str[128]; char fname[128]; char fdesc[300]; + int ret; snprintf(fvals_str, sizeof(fvals_str), "%d:%d:%d", fi->num, fi->denom, fi->initial_seed); snprintf(fname, sizeof(fname), "PSM3_FI_%s", spec_name); - snprintf(fdesc, sizeof(fdesc), "Fault Injection - %s <%s>", - help, fvals_str); - - if (!psm3_getenv(fname, fdesc, PSMI_ENVVAR_LEVEL_HIDDEN, - PSMI_ENVVAR_TYPE_STR_TUPLES, - (union psmi_envvar_val)fvals_str, &env_fi)) { + snprintf(fdesc, sizeof(fdesc), "Fault Injection - %s", help); + + ret = psm3_getenv_range(fname, fdesc, + "Specified as num:denom:seed, where num/denom is approx probability\nand seed seeds the random number generator", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_STR_TUPLES, + (union psmi_envvar_val)fvals_str, + (union psmi_envvar_val)NULL, (union psmi_envvar_val)NULL, + parse_check_faultinj_control, fvals, &env_fi); + if (ret == 0) { /* not using default values */ - (void)psm3_parse_str_tuples(env_fi.e_str, 3, fvals); + if (parse_faultinj_control(env_fi.e_str, 0, NULL, fvals) < 0) { + // already checked, shouldn't get parse errors nor empty strings + psmi_assert(0); + } fi->num = fvals[0]; fi->denom = fvals[1]; fi->initial_seed = fvals[2]; - } else if (psm3_faultinj_enabled == 2) { + } else if (ret == 1 && psm3_faultinj_enabled == 2) { // default unspecified injectors to off fi->num = 0; } diff --git a/prov/psm3/psm3/psm_utils.h b/prov/psm3/psm3/psm_utils.h index ab654cb451d..d39b49e6711 100644 --- a/prov/psm3/psm3/psm_utils.h +++ b/prov/psm3/psm3/psm_utils.h @@ -528,7 +528,7 @@ void psm3_parse_multi_ep(); * pri_reg_mr - priority register MR failure (ENOMEM) * gdrmmap - GPU gdrcopy pin and mmap failure */ -extern unsigned psm3_faultinj_enabled; /* use macro to test */ +extern int psm3_faultinj_enabled; /* use macro to test */ extern int psm3_faultinj_verbose; /* use IS_FAULT macro to test */ extern int psm3_faultinj_sec_rail;/* faults only on secondary rails or EPs */ diff --git a/prov/psm3/psm3/ptl.h b/prov/psm3/psm3/ptl.h index dcdba3a7c6d..44110636411 100644 --- a/prov/psm3/psm3/ptl.h +++ b/prov/psm3/psm3/ptl.h @@ -68,14 +68,6 @@ #include #include -/* We currently have 3 PTLs, 0 is reserved. */ -#define PTL_DEVID_IPS 1 -#define PTL_DEVID_AMSH 2 -#define PTL_DEVID_SELF 3 - -/* We can currently initialize up to 3 PTLs */ -#define PTL_MAX_INIT 3 - /* struct ptl is an incomplete type, and it serves as a generic or opaque container. It should remain an incomplete type in the entire psm source base. 
concrete ptl types need to have a suffix such as ptl_self, diff --git a/prov/psm3/psm3/ptl_am/am_config.h b/prov/psm3/psm3/ptl_am/am_config.h index f436f471c25..79600601037 100644 --- a/prov/psm3/psm3/ptl_am/am_config.h +++ b/prov/psm3/psm3/ptl_am/am_config.h @@ -67,6 +67,14 @@ #define AMSH_HAVE_CMA 0x1 #define AMSH_HAVE_KASSIST 0x1 +#if defined(PSM_CUDA) +/* Threshold for GPU rendezvous (aka scale-up transfer vs copy via CPU shared mem) */ +#define PSMI_MQ_GPU_RV_THRESH 127 +#elif defined(PSM_ONEAPI) +/* Threshold for GPU rendezvous (aka scale-up transfer vs copy via CPU shared mem) */ +#define PSMI_MQ_GPU_RV_THRESH 127 +#endif + /* Each block reserves some space at the beginning to store auxiliary data */ #define AMSH_BLOCK_HEADER_SIZE 4096 diff --git a/prov/psm3/psm3/ptl_am/am_oneapi_memhandle_cache.c b/prov/psm3/psm3/ptl_am/am_oneapi_memhandle_cache.c index a8151240469..ac561c6d32f 100644 --- a/prov/psm3/psm3/ptl_am/am_oneapi_memhandle_cache.c +++ b/prov/psm3/psm3/ptl_am/am_oneapi_memhandle_cache.c @@ -96,7 +96,7 @@ typedef struct { static psm2_error_t am_ze_memhandle_mpool_alloc( am_ze_memhandle_cache_t cache, uint32_t memcache_size); -void am_ze_memhandle_delete(void *buf_ptr); +static void am_ze_memhandle_delete(void *buf_ptr); /* * Custom comparator @@ -653,9 +653,9 @@ am_ze_memhandle_acquire(am_ze_memhandle_cache_t cache, } +#if defined(HAVE_DRM) || defined(HAVE_LIBDRM) void am_ze_memhandle_delete(void *buf_ptr) { -#if defined(HAVE_DRM) || defined(HAVE_LIBDRM) /* Release the reference to the buffer */ PSMI_ONEAPI_ZE_CALL(zeMemFree, ze_context, buf_ptr); @@ -679,8 +679,8 @@ void am_ze_memhandle_delete(void *buf_ptr) * GEM_CLOSE. */ #endif -#endif /* HAVE_DRM or HAVE_LIBDRM */ } +#endif /* HAVE_DRM or HAVE_LIBDRM */ void am_ze_memhandle_release(am_ze_memhandle_cache_t cache, diff --git a/prov/psm3/psm3/ptl_am/am_reqrep_shmem.c b/prov/psm3/psm3/ptl_am/am_reqrep_shmem.c index 2cea9932454..020f3afb349 100644 --- a/prov/psm3/psm3/ptl_am/am_reqrep_shmem.c +++ b/prov/psm3/psm3/ptl_am/am_reqrep_shmem.c @@ -88,6 +88,9 @@ #endif int psm3_shm_mq_rv_thresh = PSMI_MQ_RV_THRESH_NO_KASSIST; +#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +int psm3_shm_mq_gpu_rv_thresh = PSMI_MQ_GPU_RV_THRESH; +#endif // qcounts and qelemsz tunable via amsh_fifo_getconfig(); static amsh_qinfo_t amsh_qcounts = { @@ -371,6 +374,16 @@ psm2_error_t psm3_shm_create(ptl_t *ptl_gen) } memset((void *) mapptr, 0, segsz); /* touch all of my pages */ +#if defined(PSM_CUDA) && !defined(PSM3_NO_CUDA_REGISTER) + if (PSMI_IS_GPU_ENABLED && check_have_cuda_ctxt()) + PSMI_CUDA_CALL(cuMemHostRegister, mapptr, segsz, + CU_MEMHOSTALLOC_PORTABLE); +#endif +#if defined(PSM_ONEAPI) && !defined(PSM3_NO_ONEAPI_IMPORT) + if (PSMI_IS_GPU_ENABLED) + PSMI_ONEAPI_ZE_CALL(zexDriverImportExternalPointer, ze_driver, + mapptr, segsz); +#endif /* Our own ep's info for ptl_am resides at the start of the shm object.
Other processes need some of this info to @@ -418,6 +431,37 @@ psm2_error_t psm3_epdir_extend(ptl_t *ptl_gen) psm2_error_t psm3_do_unmap(uintptr_t shmbase) { psm2_error_t err = PSM2_OK; +#if defined(PSM_CUDA) && !defined(PSM3_NO_CUDA_REGISTER) + if (PSMI_IS_GPU_ENABLED && cu_ctxt) { + /* ignore NOT_REGISTERED in case cuda initialized late */ + /* ignore other errors as context could be destroyed before this */ + CUresult cudaerr; + //PSMI_CUDA_CALL_EXCEPT(CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED, + // cuMemHostUnregister, (void*)shmbase); + psmi_count_cuMemHostUnregister++; + cudaerr = psmi_cuMemHostUnregister((void*)shmbase); + if (cudaerr) { + const char *pStr = NULL; + psmi_count_cuGetErrorString++; + psmi_cuGetErrorString(cudaerr, &pStr); + _HFI_DBG("CUDA failure: cuMemHostUnregister returned %d: %s\n", + cudaerr, pStr?pStr:"Unknown"); + } + } +#endif +#if defined(PSM_ONEAPI) && !defined(PSM3_NO_ONEAPI_IMPORT) + if (PSMI_IS_GPU_ENABLED) { + ze_result_t result; + //PSMI_ONEAPI_ZE_CALL(zexDriverReleaseImportedPointer, ze_driver, + // (void *)shmbase); + psmi_count_zexDriverReleaseImportedPointer++; + result = psmi_zexDriverReleaseImportedPointer(ze_driver, + (void *)shmbase); + if (result != ZE_RESULT_SUCCESS) { + _HFI_DBG("OneAPI Level Zero failure: zexDriverReleaseImportedPointer returned %d: %s\n", result, psmi_oneapi_ze_result_to_string(result)); + } + } +#endif if (munmap((void *)shmbase, am_ctl_sizeof_block())) { err = psm3_handle_error(NULL, PSM2_SHMEM_SEGMENT_ERR, @@ -550,6 +594,16 @@ psm2_error_t psm3_shm_map_remote(ptl_t *ptl_gen, psm2_epid_t epid, uint16_t *shm // read every page in segment so faulted into our address space psm3_touch_mmap(dest_mapptr, segsz); +#if defined(PSM_CUDA) && !defined(PSM3_NO_CUDA_REGISTER) + if (PSMI_IS_GPU_ENABLED && check_have_cuda_ctxt()) + PSMI_CUDA_CALL(cuMemHostRegister, dest_mapptr, segsz, + CU_MEMHOSTALLOC_PORTABLE); +#endif +#if defined(PSM_ONEAPI) && !defined(PSM3_NO_ONEAPI_IMPORT) + if (PSMI_IS_GPU_ENABLED) + PSMI_ONEAPI_ZE_CALL(zexDriverImportExternalPointer, ze_driver, + dest_mapptr, segsz); +#endif shmidx = -1; if ((ptl->max_ep_idx + 1) == ptl->am_ep_size) { @@ -711,6 +765,37 @@ psm2_error_t psm3_shm_detach(ptl_t *ptl_gen) shm_unlink(ptl->amsh_keyname); psmi_free(ptl->amsh_keyname); +#if defined(PSM_CUDA) && !defined(PSM3_NO_CUDA_REGISTER) + if (PSMI_IS_GPU_ENABLED && cu_ctxt) { + /* ignore NOT_REGISTERED in case cuda initialized late */ + /* ignore other errors as context could be destroyed before this */ + CUresult cudaerr; + //PSMI_CUDA_CALL_EXCEPT(CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED, + // cuMemHostUnregister, (void*)shmbase); + psmi_count_cuMemHostUnregister++; + cudaerr = psmi_cuMemHostUnregister((void*)shmbase); + if (cudaerr) { + const char *pStr = NULL; + psmi_count_cuGetErrorString++; + psmi_cuGetErrorString(cudaerr, &pStr); + _HFI_DBG("CUDA failure: cuMemHostUnregister returned %d: %s\n", + cudaerr, pStr?pStr:"Unknown"); + } + } +#endif +#if defined(PSM_ONEAPI) && !defined(PSM3_NO_ONEAPI_IMPORT) + if (PSMI_IS_GPU_ENABLED) { + ze_result_t result; + //PSMI_ONEAPI_ZE_CALL(zexDriverReleaseImportedPointer, ze_driver, + // (void *)shmbase); + psmi_count_zexDriverReleaseImportedPointer++; + result = psmi_zexDriverReleaseImportedPointer(ze_driver, + (void *)shmbase); + if (result != ZE_RESULT_SUCCESS) { + _HFI_DBG("OneAPI Level Zero failure: zexDriverReleaseImportedPointer returned %d: %s\n", result, psmi_oneapi_ze_result_to_string(result)); + } + } +#endif if (munmap((void *)shmbase, am_ctl_sizeof_block())) { err = 
psm3_handle_error(NULL, PSM2_SHMEM_SEGMENT_ERR, @@ -2382,7 +2467,8 @@ amsh_mq_send_inner_eager(psm2_mq_t mq, psm2_mq_req_t req, psm2_epaddr_t epaddr, args[2].u32w1 = tag->tag[2]; args[2].u32w0 = 0; - if (!flags_user && len <= AMLONG_MTU) { + psmi_assert(!(flags_user & PSM2_MQ_FLAG_SENDSYNC));// needs rndv + if (len <= AMLONG_MTU) { if (len <= 32) args[0].u32w0 = MQ_MSG_TINY; else @@ -2445,26 +2531,29 @@ amsh_mq_send_inner(psm2_mq_t mq, psm2_mq_req_t req, psm2_epaddr_t epaddr, if (PSM3_IS_BUFFER_GPU_MEM(ubuf, len)) { gpu_mem = 1; - /* All sends from a gpu buffer use the rendezvous protocol if p2p is supported */ - if (ep_supports_p2p) { + /* SENDSYNC gets priority, assume not used for MPI_isend w/INJECT */ + /* otherwise use eager for INJECT as caller is waiting */ + if ((flags_user & (PSM2_MQ_FLAG_SENDSYNC|PSM2_MQ_FLAG_INJECT)) + == PSM2_MQ_FLAG_INJECT) + goto do_eager; + + /* larger sends from a gpu buffer use the rendezvous protocol if p2p is supported */ + if (ep_supports_p2p && len > mq->shm_gpu_thresh_rv) { goto do_rendezvous; } - - /* - * Use eager messages if P2P is unsupported between endpoints. - * Potentially use rendezvous with blocking requests only. - */ - if (!is_blocking) - goto do_eager; - } + } else #endif + /* SENDSYNC gets priority, assume not used for MPI_isend w/INJECT */ + /* otherwise use eager for INJECT as caller is waiting */ + if ((flags_user & (PSM2_MQ_FLAG_SENDSYNC|PSM2_MQ_FLAG_INJECT)) + == PSM2_MQ_FLAG_INJECT) + goto do_eager; + if (flags_user & PSM2_MQ_FLAG_SENDSYNC) goto do_rendezvous; if (len <= mq->shm_thresh_rv) -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) do_eager: -#endif return amsh_mq_send_inner_eager(mq, req, epaddr, args, flags_user, flags_internal, tag, ubuf, len); do_rendezvous: @@ -2600,17 +2689,31 @@ int psm3_get_kassist_mode() return PSMI_KASSIST_OFF; #endif -#if !defined(PSM_CUDA) && !defined(PSM_ONEAPI) union psmi_envvar_val env_kassist; const char *PSM3_KASSIST_MODE_HELP = "PSM Shared memory kernel assist mode " "(cma-put, cma-get, none)"; +#if defined(PSM_CUDA) || defined(PSM_ONEAPI) + // GPU limits KASSIST choices to cma-get or none + const char *PSM3_KASSIST_MODE_GPU_HELP = "PSM Shared memory kernel assist mode " + "(cma-get, none)"; +#endif - if (!psm3_getenv("PSM3_KASSIST_MODE", PSM3_KASSIST_MODE_HELP, + if (!psm3_getenv("PSM3_KASSIST_MODE", +#if defined(PSM_CUDA) || defined(PSM_ONEAPI) + PSMI_IS_GPU_ENABLED? + PSM3_KASSIST_MODE_GPU_HELP:PSM3_KASSIST_MODE_HELP, +#else + PSM3_KASSIST_MODE_HELP, +#endif PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_STR, (union psmi_envvar_val) PSMI_KASSIST_MODE_DEFAULT_STRING, &env_kassist)) { char *s = env_kassist.e_str; - if (strcasecmp(s, "cma-put") == 0) + if ( +#if defined(PSM_CUDA) || defined(PSM_ONEAPI) + ! 
PSMI_IS_GPU_ENABLED && +#endif + strcasecmp(s, "cma-put") == 0) mode = PSMI_KASSIST_CMA_PUT; else if (strcasecmp(s, "cma-get") == 0) mode = PSMI_KASSIST_CMA_GET; @@ -2622,7 +2725,6 @@ int psm3_get_kassist_mode() mode = PSMI_KASSIST_CMA_GET; } } -#endif return mode; } @@ -3005,11 +3107,9 @@ amsh_init(psm2_ep_t ep, ptl_t *ptl_gen, ptl_ctl_t *ctl) PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT, (union psmi_envvar_val) CUDA_MEMHANDLE_CACHE_SIZE, &env_memcache_size); -#if defined(HAVE_DRM) || defined(HAVE_LIBDRM) if ((err = am_cuda_memhandle_cache_alloc(&ptl->memhandle_cache, env_memcache_size.e_uint, &ep->mq->stats) != PSM2_OK)) goto fail; -#endif } } #endif @@ -3160,6 +3260,10 @@ static psm2_error_t amsh_fini(ptl_t *ptl_gen, int force, uint64_t timeout_ns) am_ze_memhandle_cache_free(ptl->memhandle_cache); #endif ptl->memhandle_cache = NULL; +#endif +#if defined(PSM_CUDA) || defined(PSM_ONEAPI) + if (PSMI_IS_GPU_ENABLED && ptl->gpu_bounce_buf) + PSM3_GPU_HOST_FREE(ptl->gpu_bounce_buf); #endif return PSM2_OK; fail: diff --git a/prov/psm3/psm3/ptl_am/psm_am_internal.h b/prov/psm3/psm3/ptl_am/psm_am_internal.h index 56df72a6c13..203b9512c3a 100644 --- a/prov/psm3/psm3/ptl_am/psm_am_internal.h +++ b/prov/psm3/psm3/ptl_am/psm_am_internal.h @@ -468,6 +468,10 @@ struct ptl_am { #ifdef PSM_ONEAPI am_ze_memhandle_cache_t memhandle_cache; #endif +#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#define AMSH_GPU_BOUNCE_BUF_SZ (256*1024) + void *gpu_bounce_buf; // for H to D +#endif } __attribute__((aligned(64))); #endif diff --git a/prov/psm3/psm3/ptl_am/ptl.c b/prov/psm3/psm3/ptl_am/ptl.c index 62142f898a9..8a38d22ad4d 100644 --- a/prov/psm3/psm3/ptl_am/ptl.c +++ b/prov/psm3/psm3/ptl_am/ptl.c @@ -54,6 +54,7 @@ /* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */ #include "psm_user.h" +#include "psm2_hal.h" #include "psm_mq_internal.h" #include "psm_am_internal.h" #include "cmarw.h" @@ -162,19 +163,32 @@ ptl_handle_rtsmatch_request(psm2_mq_req_t req, int was_posted, * resides on the GPU */ if (req->is_buf_gpu_mem) { - void* gpu_ipc_bounce_buf = psmi_malloc(PSMI_EP_NONE, UNDEFINED, req->req_data.recv_msglen); - size_t nbytes = psm3_cma_get(pid, (void *)req->rts_sbuf, - gpu_ipc_bounce_buf, req->req_data.recv_msglen); - psmi_assert_always(nbytes == req->req_data.recv_msglen); - PSM3_GPU_MEMCPY_HTOD(req->req_data.buf, gpu_ipc_bounce_buf, - req->req_data.recv_msglen); + size_t cnt = 0; + if (!ptl->gpu_bounce_buf) + PSM3_GPU_HOST_ALLOC(&ptl->gpu_bounce_buf, AMSH_GPU_BOUNCE_BUF_SZ); + while (cnt < req->req_data.recv_msglen) { + size_t nbytes = min(req->req_data.recv_msglen-cnt, + AMSH_GPU_BOUNCE_BUF_SZ); + size_t res = psm3_cma_get(pid, (void *)(req->rts_sbuf+cnt), + ptl->gpu_bounce_buf, nbytes); + void *buf; + psmi_assert_always(nbytes == res); + if (PSMI_USE_GDR_COPY_RECV(nbytes) + && NULL != (buf = psmi_hal_gdr_convert_gpu_to_host_addr( + (unsigned long)req->req_data.buf+cnt, + nbytes, 1, ptl->ep))) + psm3_mq_mtucpy_host_mem(buf, ptl->gpu_bounce_buf, nbytes); + else + PSM3_GPU_MEMCPY_HTOD(req->req_data.buf+cnt, + ptl->gpu_bounce_buf, nbytes); + cnt+= nbytes; + } /* Cuda library has recent optimizations where they do * not guarantee synchronus nature for Host to Device * copies for msg sizes less than 64k. The event record * and synchronize calls are to guarentee completion. */ PSM3_GPU_SYNCHRONIZE_MEMCPY(); - psmi_free(gpu_ipc_bounce_buf); } else { /* cma can be done in handler context or not. 
*/ size_t nbytes = psm3_cma_get(pid, (void *)req->rts_sbuf, diff --git a/prov/psm3/psm3/ptl_am/ptl_fwd.h b/prov/psm3/psm3/ptl_am/ptl_fwd.h index e7dcd060d22..85593aad847 100644 --- a/prov/psm3/psm3/ptl_am/ptl_fwd.h +++ b/prov/psm3/psm3/ptl_am/ptl_fwd.h @@ -60,5 +60,6 @@ extern struct ptl_ctl_init psm3_ptl_amsh; extern int psm3_shm_mq_rv_thresh; +extern int psm3_shm_mq_gpu_rv_thresh; #endif diff --git a/prov/psm3/psm3/ptl_ips/ips_expected_proto.h b/prov/psm3/psm3/ptl_ips/ips_expected_proto.h index 6e9b94f3a97..2bdd85a309c 100644 --- a/prov/psm3/psm3/ptl_ips/ips_expected_proto.h +++ b/prov/psm3/psm3/ptl_ips/ips_expected_proto.h @@ -137,7 +137,8 @@ struct ips_protoexp { #ifdef PSM_CUDA CUstream cudastream_recv; #elif defined(PSM_ONEAPI) - ze_command_queue_handle_t cq_recv; // NULL if psm3_oneapi_immed_async_copy + /* Will not be usd if psm3_oneapi_immed_async_copy */ + ze_command_queue_handle_t cq_recvs[MAX_ZE_DEVICES]; #endif }; @@ -201,6 +202,7 @@ struct ips_tid_send_desc { * would need to attach to a tidsendc would be 2 */ struct ips_gpu_hostbuf *gpu_hostbuf[2]; + struct ips_gpu_hostbuf *gpu_split_buf; /* Number of hostbufs attached */ uint8_t gpu_num_buf; #endif @@ -362,4 +364,11 @@ psm3_ips_tid_send_handle_tidreq(struct ips_protoexp *protoexp, ptl_arg_t rdescid, uint32_t tidflow_genseq, ips_tid_session_list *tid_list, uint32_t tid_list_size); + +#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +// buffers for GPU send copy pipeline +struct ips_gpu_hostbuf* psm3_ips_allocate_send_chb(struct ips_proto *proto, + uint32_t nbytes, int allow_temp); +void psm3_ips_deallocate_send_chb(struct ips_gpu_hostbuf* chb, int reset); +#endif #endif /* #ifndef __IPS_EXPECTED_PROTO_H__ */ diff --git a/prov/psm3/psm3/ptl_ips/ips_path_rec.c b/prov/psm3/psm3/ptl_ips/ips_path_rec.c index 3db38328818..de57f5317e9 100644 --- a/prov/psm3/psm3/ptl_ips/ips_path_rec.c +++ b/prov/psm3/psm3/ptl_ips/ips_path_rec.c @@ -127,8 +127,12 @@ enum psm3_ibv_rate ips_link_speed_to_enum(uint64_t link_speed) return PSM3_IBV_RATE_300_GBPS; else if (link_speed <= 400*PSM3_GIGABIT) return PSM3_IBV_RATE_400_GBPS; - else + else if (link_speed <= 600*PSM3_GIGABIT) return PSM3_IBV_RATE_600_GBPS; + else if (link_speed <= 800*PSM3_GIGABIT) + return PSM3_IBV_RATE_800_GBPS; + else + return PSM3_IBV_RATE_1200_GBPS; } static uint64_t ips_enum_to_link_speed(enum psm3_ibv_rate rate) @@ -155,6 +159,8 @@ static uint64_t ips_enum_to_link_speed(enum psm3_ibv_rate rate) case PSM3_IBV_RATE_50_GBPS: return 50*PSM3_GIGABIT; case PSM3_IBV_RATE_400_GBPS: return 400*PSM3_GIGABIT; case PSM3_IBV_RATE_600_GBPS: return 600*PSM3_GIGABIT; + case PSM3_IBV_RATE_800_GBPS: return 800*PSM3_GIGABIT; + case PSM3_IBV_RATE_1200_GBPS: return 1200*PSM3_GIGABIT; default: return 100*PSM3_GIGABIT; } } @@ -458,6 +464,51 @@ ips_none_path_rec(struct ips_proto *proto, return err; } +/* parse error check timeouts for PSM3_ERRCHK_TIMEOUT or PSM3_ERRCHK_TIMEOUT_US + * format is min:max:factor + * all must be non-zero, min must be <= max + * Either field can be omitted in which case default (input tvals) is used + * for given field. 
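 * e.g. "160:640:2" requests timeouts from 160 to 640 (in the calling
 *      variable's units), growing by a factor of 2 (illustrative values,
 *      not the defaults)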
+ * 0 - successfully parsed, tvals updated + * -1 - str empty, tvals unchanged + * -2 - syntax error, tvals may have been changed + */ +static int parse_errchk_timeout(const char *str, + size_t errstr_size, char errstr[], + int tvals[3]) +{ + psmi_assert(tvals); + int ret = psm3_parse_str_tuples(str, 3, tvals); + if (ret < 0) + return ret; + if (tvals[0] < 0 || tvals[1] < 0 || tvals[2] < 0) { + if (errstr_size) + snprintf(errstr, errstr_size, " Negative values not allowed"); + return -2; + } + if (tvals[0] == 0 || tvals[1] == 0 || tvals[2] == 0) { + if (errstr_size) + snprintf(errstr, errstr_size, " Zero values not allowed"); + return -2; + } + if (tvals[0] > tvals[1]) { + if (errstr_size) + snprintf(errstr, errstr_size, " min (%d) must be <= max (%d)", tvals[0], tvals[1]); + return -2; + } + return 0; +} + +static int parse_check_errchk_timeout(int type, + const union psmi_envvar_val val, void *ptr, + size_t errstr_size, char errstr[]) +{ + // parser will set tvals to result, use a copy to protect input of defaults + int tvals[3] = { ((int*)ptr)[0], ((int*)ptr)[1], ((int*)ptr)[2] }; + psmi_assert(type == PSMI_ENVVAR_TYPE_STR_TUPLES); + return parse_errchk_timeout(val.e_str, errstr_size, errstr, tvals); +} + static psm2_error_t ips_none_path_rec_init(struct ips_proto *proto) { psm2_error_t err = PSM2_OK; @@ -478,17 +529,18 @@ static psm2_error_t ips_none_path_rec_init(struct ips_proto *proto) IPS_PROTO_ERRCHK_FACTOR_DEFAULT }; - if (!psm3_getenv("PSM3_ERRCHK_TIMEOUT", - "Errchk timeouts in mS ", + (void)psm3_getenv_range("PSM3_ERRCHK_TIMEOUT", + "Errchk timeouts in milliseconds ", + "Specified as min:max:factor where min and max is the range of timeouts\nand factor is the multiplier for growing timeout", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR_TUPLES, (union psmi_envvar_val)PSM_TID_TIMEOUT_DEFAULT, - &env_to)) { - /* Not using default values, parse what we can */ - (void)psm3_parse_str_tuples(env_to.e_str, 3, tvals); - /* Adjust for max smaller than min, things would break */ - if (tvals[1] < tvals[0]) - tvals[1] = tvals[0]; + (union psmi_envvar_val)NULL, + (union psmi_envvar_val)NULL, + parse_check_errchk_timeout, tvals, &env_to); + if (parse_errchk_timeout(env_to.e_str, 0, NULL, tvals) < 0) { + // already checked, shouldn't get parse errors nor empty strings + psmi_assert(0); } proto->epinfo.ep_timeout_ack = ms_2_cycles(tvals[0]); @@ -502,22 +554,26 @@ static psm2_error_t ips_none_path_rec_init(struct ips_proto *proto) * This allows values in units of microseconds and will override * any values specified in PSM3_ERRCHK_TIMEOUT */ - if (!psm3_getenv("PSM3_ERRCHK_TIMEOUT_US", - "Errchk timeouts in usec ", + int us_tvals[3] = { + IPS_PROTO_ERRCHK_MS_MIN_DEFAULT*1000, + IPS_PROTO_ERRCHK_MS_MAX_DEFAULT*1000, + IPS_PROTO_ERRCHK_FACTOR_DEFAULT + }; + if (1 > psm3_getenv_range("PSM3_ERRCHK_TIMEOUT_US", + "Errchk timeouts in microseconds ", + "Specified as min:max:factor where min and max is the range of timeouts\nand factor is the multiplier for growing timeout", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR_TUPLES, (union psmi_envvar_val)PSM_TID_TIMEOUT_DEFAULT_US, - &env_to)) { - /* Not using default values, parse what we can */ - int us_tvals[3] = { - IPS_PROTO_ERRCHK_MS_MIN_DEFAULT*1000, - IPS_PROTO_ERRCHK_MS_MAX_DEFAULT*1000, - IPS_PROTO_ERRCHK_FACTOR_DEFAULT - }; - (void)psm3_parse_str_tuples(env_to.e_str, 3, us_tvals); - /* Adjust for max smaller than min, things would break */ - if (us_tvals[1] < us_tvals[0]) - us_tvals[1] = us_tvals[0]; + (union psmi_envvar_val)NULL, + (union 
psmi_envvar_val)NULL, + parse_check_errchk_timeout, us_tvals, &env_to)) { + // value specified (perhaps bad input), use + // what was returned (will be default if bad input) + if (parse_errchk_timeout(env_to.e_str, 0, NULL, us_tvals) < 0) { + // already checked, shouldn't get parse errors nor empty strings + psmi_assert(0); + } proto->epinfo.ep_timeout_ack = us_2_cycles(us_tvals[0]); proto->epinfo.ep_timeout_ack_max = us_2_cycles(us_tvals[1]); proto->epinfo.ep_timeout_ack_factor = us_tvals[2]; diff --git a/prov/psm3/psm3/ptl_ips/ips_path_rec.h b/prov/psm3/psm3/ptl_ips/ips_path_rec.h index ebca755e95a..17fa819a396 100644 --- a/prov/psm3/psm3/ptl_ips/ips_path_rec.h +++ b/prov/psm3/psm3/ptl_ips/ips_path_rec.h @@ -124,6 +124,8 @@ enum psm3_ibv_rate { PSM3_IBV_RATE_50_GBPS = 20, PSM3_IBV_RATE_400_GBPS = 21, PSM3_IBV_RATE_600_GBPS = 22, + PSM3_IBV_RATE_800_GBPS = 23, + PSM3_IBV_RATE_1200_GBPS = 24, }; static inline int opa_mtu_enum_to_int(enum opa_mtu mtu) diff --git a/prov/psm3/psm3/ptl_ips/ips_proto.c b/prov/psm3/psm3/ptl_ips/ips_proto.c index d4c723a430a..f6c9c215bcb 100644 --- a/prov/psm3/psm3/ptl_ips/ips_proto.c +++ b/prov/psm3/psm3/ptl_ips/ips_proto.c @@ -452,6 +452,8 @@ psm3_ips_proto_init(psm2_ep_t ep, const ptl_t *ptl, (protoexp_flags & IPS_PROTOEXP_FLAG_ENABLED)) { struct psmi_rlimit_mpool rlim = GPU_HOSTBUFFER_LIMITS; uint32_t maxsz, chunksz, max_elements; + uint32_t pool_num_obj_max_total; + uint32_t small_pool_num_obj_max_total; if ((err = psm3_parse_mpool_env(proto->mq, 1, &rlim, &maxsz, &chunksz))) @@ -459,10 +461,12 @@ psm3_ips_proto_init(psm2_ep_t ep, const ptl_t *ptl, /* the maxsz is the amount in MB, not the number of entries, * since the element size depends on the window size */ - max_elements = (maxsz*1024*1024) / proto->mq->hfi_base_window_rv; + max_elements = (maxsz*1024*1024) / psm3_mq_max_window_rv(proto->mq, 1); /* mpool requires max_elements to be power of 2. round down. 
*/ max_elements = 1 << (31 - __builtin_clz(max_elements)); - proto->gpu_hostbuf_send_cfg.bufsz = proto->mq->hfi_base_window_rv; + /* need at least 3 buffers */ + max_elements = max(4, max_elements); + proto->gpu_hostbuf_send_cfg.bufsz = psm3_mq_max_window_rv(proto->mq, 1); proto->gpu_hostbuf_pool_send = psm3_mpool_create_for_gpu(sizeof(struct ips_gpu_hostbuf), chunksz, max_elements, 0, @@ -476,6 +480,8 @@ psm3_ips_proto_init(psm2_ep_t ep, const ptl_t *ptl, "Couldn't allocate GPU host send buffer pool"); goto fail; } + psm3_mpool_get_obj_info(proto->gpu_hostbuf_pool_send, + NULL, &pool_num_obj_max_total); /* use the same number of elements for the small pool */ proto->gpu_hostbuf_small_send_cfg.bufsz = GPU_SMALLHOSTBUF_SZ; @@ -492,6 +498,8 @@ psm3_ips_proto_init(psm2_ep_t ep, const ptl_t *ptl, "Couldn't allocate GPU host small send buffer pool"); goto fail; } + psm3_mpool_get_obj_info(proto->gpu_hostbuf_pool_small_send, + NULL, &small_pool_num_obj_max_total); /* Configure the amount of prefetching */ union psmi_envvar_val env_prefetch_limit; @@ -502,6 +510,12 @@ psm3_ips_proto_init(psm2_ep_t ep, const ptl_t *ptl, (union psmi_envvar_val)GPU_WINDOW_PREFETCH_DEFAULT, &env_prefetch_limit); proto->gpu_prefetch_limit = env_prefetch_limit.e_uint; + _HFI_DBG("GPU Send Copy Pipeline: %u of %u bytes (small), %u of %u bytes, prefetch %u\n", + small_pool_num_obj_max_total, + proto->gpu_hostbuf_small_send_cfg.bufsz, + pool_num_obj_max_total, + proto->gpu_hostbuf_send_cfg.bufsz, + proto->gpu_prefetch_limit); } #endif /* PSM_CUDA || PSM_ONEAPI */ @@ -530,7 +544,8 @@ psm3_ips_proto_init(psm2_ep_t ep, const ptl_t *ptl, // but can survive if it's smaller as we will delay transfer til avail if (protoexp_flags & IPS_PROTOEXP_FLAG_ENABLED) { cache_pri_entries = HFI_TF_NFLOWS + proto->ep->hfi_num_send_rdma; - cache_pri_size = (uint64_t)cache_pri_entries * proto->mq->hfi_base_window_rv; + cache_pri_size = (uint64_t)cache_pri_entries * + psm3_mq_max_window_rv(proto->mq, 0); if (MR_CACHE_USER_CACHING(proto->ep->mr_cache_mode)) { // we attempt to cache, so can benefit from more than inflight // make enough room to have a good number of entries @@ -578,7 +593,7 @@ psm3_ips_proto_init(psm2_ep_t ep, const ptl_t *ptl, default_cache_entries = max(default_cache_entries, ((uint64_t)env_mr_cache_size_mb.e_uint * (1024*1024)) - / max( proto->mq->hfi_base_window_rv/2, + / max(psm3_mq_max_window_rv(proto->mq, 0)/2, proto->mq->hfi_thresh_rv)); } else { // only send DMA, size based on smaller MRs @@ -2292,10 +2307,10 @@ ips_proto_register_stats(struct ips_proto *proto) "RDMA rendezvous message bytes received direct into a GPU buffer", &proto->strat_stats.rndv_rdma_gdr_recv_bytes), PSMI_STATS_DECLU64("rndv_rdma_hbuf_recv", - "RDMA rendezvous messages received into via pipelined GPU copy", + "RDMA rendezvous messages received into a GPU buffer via pipelined GPU copy", &proto->strat_stats.rndv_rdma_hbuf_recv), PSMI_STATS_DECLU64("rndv_rdma_hbuf_recv_bytes", - "RDMA rendezvous message bytes received into via pipelined GPU copy", + "RDMA rendezvous message bytes received into a GPU buffer via pipelined GPU copy", &proto->strat_stats.rndv_rdma_hbuf_recv_bytes), #endif PSMI_STATS_DECLU64("rndv_rdma_cpu_send", @@ -2312,10 +2327,10 @@ ips_proto_register_stats(struct ips_proto *proto) "RDMA rendezvous message bytes sent from a GPU buffer via send RDMA", &proto->strat_stats.rndv_rdma_gdr_send_bytes), PSMI_STATS_DECLU64("rndv_rdma_hbuf_send", - "RDMA rendezvous messages sent from a GPU buffer into via pipelined GPU copy", + "RDMA rendezvous 
messages sent from a GPU buffer via pipelined GPU copy", &proto->strat_stats.rndv_rdma_hbuf_send), PSMI_STATS_DECLU64("rndv_rdma_hbuf_send_bytes", - "RDMA rendezvous message bytes sent from a GPU buffer into via pipelined GPU copy", + "RDMA rendezvous message bytes sent from a GPU buffer via pipelined GPU copy", &proto->strat_stats.rndv_rdma_hbuf_send_bytes), #endif }; diff --git a/prov/psm3/psm3/ptl_ips/ips_proto.h b/prov/psm3/psm3/ptl_ips/ips_proto.h index eccd6ce3d25..9c1b920f075 100644 --- a/prov/psm3/psm3/ptl_ips/ips_proto.h +++ b/prov/psm3/psm3/ptl_ips/ips_proto.h @@ -437,7 +437,8 @@ struct ips_proto { #ifdef PSM_CUDA CUstream cudastream_send; #elif defined(PSM_ONEAPI) - ze_command_queue_handle_t cq_send; // NULL if psm3_oneapi_immed_async_copy + /* Will not be used if psm3_oneapi_immed_async_copy */ + ze_command_queue_handle_t cq_sends[MAX_ZE_DEVICES]; #endif #if defined(PSM_CUDA) || defined(PSM_ONEAPI) diff --git a/prov/psm3/psm3/ptl_ips/ips_proto_expected.c b/prov/psm3/psm3/ptl_ips/ips_proto_expected.c index 057bdb74c5c..c39231b8679 100644 --- a/prov/psm3/psm3/ptl_ips/ips_proto_expected.c +++ b/prov/psm3/psm3/ptl_ips/ips_proto_expected.c @@ -260,10 +260,11 @@ MOCKABLE(psm3_ips_protoexp_init)(const struct ips_proto *proto, #if defined(PSM_CUDA) || defined(PSM_ONEAPI) { - if (PSMI_IS_GPU_ENABLED && - !(proto->flags & IPS_PROTO_FLAG_GPUDIRECT_RDMA_RECV)) { + if (PSMI_IS_GPU_ENABLED) { struct psmi_rlimit_mpool rlim = GPU_HOSTBUFFER_LIMITS; uint32_t maxsz, chunksz, max_elements; + uint32_t pool_num_obj_max_total; + uint32_t small_pool_num_obj_max_total; if ((err = psm3_parse_mpool_env(protoexp->proto->mq, 1, &rlim, &maxsz, &chunksz))) @@ -271,11 +272,14 @@ MOCKABLE(psm3_ips_protoexp_init)(const struct ips_proto *proto, /* the maxsz is the amount in MB, not the number of entries, * since the element size depends on the window size */ - max_elements = (maxsz*1024*1024) / proto->mq->hfi_base_window_rv; + max_elements = (maxsz*1024*1024) / + psm3_mq_max_window_rv(proto->mq, 1); /* mpool requires max_elements to be power of 2. round down. 
*/ max_elements = 1 << (31 - __builtin_clz(max_elements)); + /* need at least 2 buffers */ + max_elements = max(2, max_elements); protoexp->gpu_hostbuf_recv_cfg.bufsz = - proto->mq->hfi_base_window_rv; + psm3_mq_max_window_rv(proto->mq, 1); protoexp->gpu_hostbuf_pool_recv = psm3_mpool_create_for_gpu(sizeof(struct ips_gpu_hostbuf), @@ -290,6 +294,8 @@ MOCKABLE(psm3_ips_protoexp_init)(const struct ips_proto *proto, "Couldn't allocate GPU host receive buffer pool"); goto fail; } + psm3_mpool_get_obj_info(protoexp->gpu_hostbuf_pool_recv, + NULL, &pool_num_obj_max_total); protoexp->gpu_hostbuf_small_recv_cfg.bufsz = GPU_SMALLHOSTBUF_SZ; @@ -306,6 +312,13 @@ MOCKABLE(psm3_ips_protoexp_init)(const struct ips_proto *proto, "Couldn't allocate GPU host small receive buffer pool"); goto fail; } + psm3_mpool_get_obj_info(protoexp->gpu_hostbuf_pool_small_recv, + NULL, &small_pool_num_obj_max_total); + _HFI_DBG("GPU Recv Copy Pipeline: %u of %u bytes (small), %u of %u bytes\n", + small_pool_num_obj_max_total, + protoexp->gpu_hostbuf_small_recv_cfg.bufsz, + pool_num_obj_max_total, + protoexp->gpu_hostbuf_recv_cfg.bufsz); PSM3_GPU_PREPARE_HTOD_MEMCPYS(protoexp); STAILQ_INIT(&protoexp->gpupend_getreqsq); } else { @@ -460,7 +473,7 @@ psm3_ips_protoexp_tid_get_from_token(struct ips_protoexp *protoexp, uint64_t nbytes; PSM2_LOG_MSG("entering"); - psmi_assert((req->mq->hfi_base_window_rv % PSMI_PAGESIZE) == 0); + psmi_assert((psm3_mq_get_window_rv(req) % PSMI_PAGESIZE) == 0); getreq = (struct ips_tid_get_request *) psm3_mpool_get(protoexp->tid_getreq_pool); @@ -519,8 +532,9 @@ psm3_ips_protoexp_tid_get_from_token(struct ips_protoexp *protoexp, else #endif nbytes = PSMI_ALIGNUP((length + count - 1) / count, PSMI_PAGESIZE); - getreq->tidgr_rndv_winsz = - min(nbytes, req->mq->hfi_base_window_rv); + getreq->tidgr_rndv_winsz = psm3_mq_get_window_rv(req); + if (nbytes < getreq->tidgr_rndv_winsz) + getreq->tidgr_rndv_winsz = nbytes; _HFI_MMDBG("posting TID get request: nbytes=%"PRIu64" winsz=%u len=%u\n", nbytes, getreq->tidgr_rndv_winsz, getreq->tidgr_length); // we have now computed the size of each TID sequence (tidgr_rndv_winsz) @@ -635,12 +649,19 @@ psm3_ips_protoexp_send_tid_grant(struct ips_tid_recv_desc *tidrecvc) #if defined(PSM_CUDA) || defined(PSM_ONEAPI) -static -void psmi_deallocate_chb(struct ips_gpu_hostbuf* chb) +void psm3_ips_deallocate_send_chb(struct ips_gpu_hostbuf* chb, int reset) { - PSM3_GPU_HOSTBUF_DESTROY(chb); - psmi_free(chb); - return; + if (chb->is_tempbuf) { + PSM3_GPU_HOSTBUF_DESTROY(chb); + psmi_free(chb); + } else { + chb->req = NULL; + chb->offset = 0; + chb->bytes_read = 0; + if (reset) + PSM3_GPU_HOSTBUF_RESET(chb); + psm3_mpool_put(chb); + } } #endif @@ -677,19 +698,13 @@ ips_protoexp_tidsendc_complete(struct ips_tid_send_desc *tidsendc) STAILQ_REMOVE(&req->sendreq_prefetch, tidsendc->gpu_hostbuf[0], ips_gpu_hostbuf, req_next); - if (tidsendc->gpu_hostbuf[0]->is_tempbuf) - psmi_deallocate_chb(tidsendc->gpu_hostbuf[0]); - else { - tidsendc->gpu_hostbuf[0]->req = NULL; - tidsendc->gpu_hostbuf[0]->offset = 0; - tidsendc->gpu_hostbuf[0]->bytes_read = 0; - PSM3_GPU_HOSTBUF_RESET(tidsendc->gpu_hostbuf[0]); - psm3_mpool_put(tidsendc->gpu_hostbuf[0]); - } + psm3_ips_deallocate_send_chb(tidsendc->gpu_hostbuf[0], 1); psmi_gpu_run_prefetcher(protoexp, tidsendc); } - } else - psmi_free(tidsendc->userbuf); + } else { + psm3_ips_deallocate_send_chb(tidsendc->gpu_split_buf, 0); + tidsendc->gpu_split_buf = NULL; + } } #endif /* Check if we can complete the send request. 
*/ @@ -1220,7 +1235,9 @@ int ips_protoexp_handle_immed_data(struct ips_proto *proto, uint64_t conn_ref, // For User RC conn_ref is context we set in rc_qp_create (*ipsaddr) // For Kernel RC, conn_ref is the conn handle (psm3_rv_conn_get_conn_handle) // maybe this should be an assert so don't add test in production code + // caller can't get qp_context (conn_ref) from rbuf_qp for SRQ if ((conn_type == RDMA_IMMED_USER_RC) + && ! proto->ep->verbs_ep.srq && (uint64_t)tidrecvc->ipsaddr != conn_ref) { // RDWA Write is not on expected RC QP from remote node _HFI_ERROR("RDMA Write on Wrong User QP 0x%"PRIx64", expect 0x%"PRIx64"\n", @@ -1304,19 +1321,41 @@ psmi_gpu_reclaim_hostbufs(struct ips_tid_get_request *getreq) } return PSM2_OK; } -static -struct ips_gpu_hostbuf* psmi_allocate_chb(uint32_t window_len) + +// allocate a chb control structure. The actual buffer and event needed for the +// DTOH async copy are allocated in chb's 1st use in PSM3_GPU_MEMCPY_DTOH_START +struct ips_gpu_hostbuf* psm3_ips_allocate_send_chb(struct ips_proto *proto, + uint32_t nbytes, int allow_temp) { - struct ips_gpu_hostbuf* chb = (struct ips_gpu_hostbuf*) - psmi_calloc(PSMI_EP_NONE, + struct ips_gpu_hostbuf* chb = NULL; + unsigned bufsz; + + if (nbytes <= GPU_SMALLHOSTBUF_SZ) { + chb = (struct ips_gpu_hostbuf *) psm3_mpool_get( + proto->gpu_hostbuf_pool_small_send); + bufsz = proto->gpu_hostbuf_small_send_cfg.bufsz; + } + if (chb == NULL) { + chb = (struct ips_gpu_hostbuf *) psm3_mpool_get( + proto->gpu_hostbuf_pool_send); + bufsz = proto->gpu_hostbuf_send_cfg.bufsz; + } + + /* were any buffers available? If not force allocate */ + if (chb == NULL && allow_temp) { + chb = (struct ips_gpu_hostbuf*) psmi_calloc(PSMI_EP_NONE, UNDEFINED, 1, sizeof(struct ips_gpu_hostbuf)); - if_pf (chb == NULL) { - psm3_handle_error(PSMI_EP_NORETURN, PSM2_NO_MEMORY, - "Couldn't allocate cuda host buffers "); - return NULL; + if_pf (chb == NULL) { + psm3_handle_error(PSMI_EP_NORETURN, PSM2_NO_MEMORY, + "Couldn't allocate GPU host bounce buffers "); + return NULL; + } + chb->is_tempbuf = 1; + bufsz = nbytes; } - PSM3_GPU_HOSTBUF_FORCE_INIT(chb, window_len); + if (chb && ! chb->host_buf) + PSM3_GPU_HOST_ALLOC(&chb->host_buf, bufsz); return chb; } @@ -1333,21 +1372,12 @@ void psmi_gpu_run_prefetcher(struct ips_protoexp *protoexp, if (req->prefetch_send_msgoff < req->req_data.send_msglen) { /* some data remains to be sent */ offset = req->prefetch_send_msgoff; + psmi_assert(req->is_buf_gpu_mem); window_len = ips_gpu_next_window( - proto->mq->hfi_base_window_rv, + psm3_mq_get_window_rv(req), offset, req->req_data.buf_len); - unsigned bufsz = 0; - if (window_len <= GPU_SMALLHOSTBUF_SZ) { - chb = (struct ips_gpu_hostbuf *) psm3_mpool_get( - proto->gpu_hostbuf_pool_small_send); - bufsz = proto->gpu_hostbuf_small_send_cfg.bufsz; - } - if (chb == NULL) { - chb = (struct ips_gpu_hostbuf *) psm3_mpool_get( - proto->gpu_hostbuf_pool_send); - bufsz = proto->gpu_hostbuf_send_cfg.bufsz; - } + chb = psm3_ips_allocate_send_chb(proto, window_len, 0); /* were any buffers available for the prefetcher? 
*/ if (chb == NULL) return; @@ -1358,7 +1388,7 @@ void psmi_gpu_run_prefetcher(struct ips_protoexp *protoexp, chb->gpu_buf = (uint8_t*)req->req_data.buf + offset; chb->bytes_read = 0; - PSM3_GPU_MEMCPY_DTOH_START(proto, chb, window_len, bufsz); + PSM3_GPU_MEMCPY_DTOH_START(proto, chb, window_len); STAILQ_INSERT_TAIL(&req->sendreq_prefetch, chb, req_next); return; @@ -1384,28 +1414,13 @@ void psmi_attach_chb_to_tidsendc(struct ips_protoexp *protoexp, while (req->prefetch_send_msgoff < tsess_srcoff + tsess_length) { /* some data remains to be sent */ offset = req->prefetch_send_msgoff; + psmi_assert(req->is_buf_gpu_mem); window_len = ips_gpu_next_window( - proto->mq->hfi_base_window_rv, + psm3_mq_get_window_rv(req), offset, req->req_data.buf_len); - unsigned bufsz = 0; - if (window_len <= GPU_SMALLHOSTBUF_SZ) { - chb = (struct ips_gpu_hostbuf *) psm3_mpool_get( - proto->gpu_hostbuf_pool_small_send); - bufsz = proto->gpu_hostbuf_small_send_cfg.bufsz; - } - if (chb == NULL) { - chb = (struct ips_gpu_hostbuf *) psm3_mpool_get( - proto->gpu_hostbuf_pool_send); - bufsz = proto->gpu_hostbuf_send_cfg.bufsz; - } - - /* were any buffers available? If not force allocate */ - if (chb == NULL) { - chb = psmi_allocate_chb(window_len); - psmi_assert(chb); - chb->is_tempbuf = 1; - } + /* if no buffers available, force allocate of a temp buf */ + chb = psm3_ips_allocate_send_chb(proto, window_len, 1); req->prefetch_send_msgoff += window_len; chb->offset = offset; chb->size = window_len; @@ -1413,19 +1428,24 @@ void psmi_attach_chb_to_tidsendc(struct ips_protoexp *protoexp, chb->gpu_buf = (uint8_t*)req->req_data.buf + offset; chb->bytes_read = 0; - PSM3_GPU_MEMCPY_DTOH_START(proto, chb, window_len, bufsz); + PSM3_GPU_MEMCPY_DTOH_START(proto, chb, window_len); STAILQ_INSERT_TAIL(&req->sendreq_prefetch, chb, req_next); if (type == PSMI_GPU_PARTIAL_MATCH_FOUND) { + // caller matched 1st chb, but needed more prefetched + // see if we have what we need now if ((tsess_srcoff < chb->offset) && ((tsess_srcoff + tsess_length) > chb->offset)) { + // will collect the 2 prefetched chb's for this + // RDMA Write send into a single CPU temp buffer + // do alloc now, hoping to hide it behind GPU async copy to chb tidsendc->gpu_hostbuf[0] = chb_prev; tidsendc->gpu_hostbuf[1] = chb; tidsendc->gpu_num_buf = 2; - void *buffer = psmi_malloc(PSMI_EP_NONE, UNDEFINED, - tsess_length); + tidsendc->gpu_split_buf = psm3_ips_allocate_send_chb(proto, + tsess_length, 1); tidsendc->userbuf = - (void *)((uintptr_t) buffer); + (void *)((uintptr_t) tidsendc->gpu_split_buf->host_buf); tidsendc->buffer = (void *)((uintptr_t)tidsendc->userbuf + tsess_unaligned_start); @@ -1433,29 +1453,35 @@ void psmi_attach_chb_to_tidsendc(struct ips_protoexp *protoexp, } } else { if (attached) { + // we attached one in prior loop, now have + // a second, should have what we need now + psmi_assert((tsess_srcoff + tsess_length) > chb->offset); + // will collect the 2 prefetched chb's for this + // RDMA Write send into a single CPU temp buffer + // do alloc now, hoping to hide it behind GPU async copy to chb tidsendc->gpu_hostbuf[0] = chb_prev; tidsendc->gpu_hostbuf[1] = chb; tidsendc->gpu_num_buf = 2; - void *buffer = psmi_malloc(PSMI_EP_NONE, UNDEFINED, - tsess_length); + tidsendc->gpu_split_buf = psm3_ips_allocate_send_chb(proto, + tsess_length, 1); tidsendc->userbuf = - (void *)((uintptr_t) buffer); + (void *)((uintptr_t) tidsendc->gpu_split_buf->host_buf); tidsendc->buffer = (void *)((uintptr_t)tidsendc->userbuf + tsess_unaligned_start); - attached = 0; 
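/*
 * Illustrative sketch, not part of this patch: how a TID send range can
 * relate to one prefetched GPU host bounce buffer (chb).  The real
 * FULL/SPLIT/PARTIAL classification lives in
 * psmi_find_match_in_prefeteched_chb() and the surrounding loop in
 * psmi_attach_chb_to_tidsendc(); the names and enum below are made up
 * for the example.
 */
#include <stdint.h>
#include <stdio.h>

enum chb_match { CHB_NO_MATCH, CHB_FULL_MATCH, CHB_PARTIAL_MATCH };

/* send range is [req_off, req_off+req_len); chb holds [chb_off, chb_off+chb_len) */
static enum chb_match classify_chb(uint32_t req_off, uint32_t req_len,
				   uint32_t chb_off, uint32_t chb_len)
{
	uint32_t req_end = req_off + req_len;
	uint32_t chb_end = chb_off + chb_len;

	if (req_off >= chb_off && req_end <= chb_end)
		return CHB_FULL_MATCH;    /* one chb covers the whole send */
	if (req_off < chb_end && req_end > chb_off)
		return CHB_PARTIAL_MATCH; /* overlap only; the remainder must
					   * come from a second chb (a split)
					   * or a further prefetch */
	return CHB_NO_MATCH;
}

int main(void)
{
	/* 64KB send at offset 96KB vs a 128KB chb at offset 0: only 32KB
	 * overlaps, so a second chb or more prefetching is still needed */
	printf("%d\n", classify_chb(96 * 1024, 64 * 1024, 0, 128 * 1024));
	return 0;
}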
return; } if ((tsess_srcoff > chb->offset) && (tsess_srcoff < (chb->offset + chb->size)) && ((tsess_srcoff + tsess_length) > (chb->offset + chb->size))) { + // we prefetched one, but need another chb_prev = chb; attached = 1; - chb = NULL; continue; } else if ((chb->offset <= tsess_srcoff) && ((tsess_srcoff + tsess_length) <= (chb->offset+chb->size))) { + // we prefetched one and have what we need tidsendc->gpu_hostbuf[0] = chb; tidsendc->gpu_hostbuf[1] = NULL; tidsendc->gpu_num_buf = 1; @@ -1466,8 +1492,7 @@ void psmi_attach_chb_to_tidsendc(struct ips_protoexp *protoexp, (void *)((uintptr_t)tidsendc->userbuf + tsess_unaligned_start ); return; - } else - chb = NULL; + } } } } @@ -1575,11 +1600,11 @@ psm3_ips_tid_send_handle_tidreq(struct ips_protoexp *protoexp, tidsendc->gpu_num_buf = 0; if (req->gpu_hostbuf_used) { /* To get a match: - * 1. Tid list offset + length is contained within a chb - * 2. Tid list offset + length is contained within - * the prefetched offset of this req. - * 3. Tid list offset + length is partially prefetched - * within one chb. (A partial match) + * 1. FULL - Tid list offset + length is contained within a chb + * 2. SPLIT - Tid list offset + length is contained within + * the prefetched offset of this req. (2 chb) + * 3. PARTIAL - Tid list offset + length is partially prefetched + * within one chb. */ STAILQ_FOREACH(chb, &req->sendreq_prefetch, req_next) { rc = psmi_find_match_in_prefeteched_chb(chb, @@ -1600,10 +1625,13 @@ psm3_ips_tid_send_handle_tidreq(struct ips_protoexp *protoexp, tidsendc->gpu_hostbuf[1] = NULL; tidsendc->gpu_num_buf = 1; } else if (rc == PSMI_GPU_SPLIT_MATCH_FOUND){ - void *buffer = psmi_malloc(PSMI_EP_NONE, UNDEFINED, - tid_list->tsess_length); + // will collect the 2 prefetched chb's for this + // RDMA Write send into a single CPU temp buffer + // do alloc now, hoping to hide it behind GPU async copy to chb + tidsendc->gpu_split_buf =psm3_ips_allocate_send_chb(protoexp->proto, + tid_list->tsess_length, 1); tidsendc->userbuf = - (void *)((uintptr_t) buffer); + (void *)((uintptr_t) tidsendc->gpu_split_buf->host_buf); tidsendc->buffer = (void *)((uintptr_t)tidsendc->userbuf ); @@ -1612,6 +1640,7 @@ psm3_ips_tid_send_handle_tidreq(struct ips_protoexp *protoexp, tidsendc->gpu_hostbuf[1] = chb_next; tidsendc->gpu_num_buf = 2; } else if (rc == PSMI_GPU_PARTIAL_MATCH_FOUND) { + // need to prefetch more psmi_attach_chb_to_tidsendc(protoexp, req, tidsendc, chb, @@ -1620,6 +1649,7 @@ psm3_ips_tid_send_handle_tidreq(struct ips_protoexp *protoexp, 0, rc); } else { + // no match, need to prefetch psmi_attach_chb_to_tidsendc(protoexp, req, tidsendc, NULL, @@ -1849,6 +1879,7 @@ psm2_error_t ips_tid_send_exp(struct ips_tid_send_desc *tidsendc) #if defined(PSM_CUDA) || defined(PSM_ONEAPI) struct ips_gpu_hostbuf *chb, *chb_next; uint32_t offset_in_chb, i; + // wait for async copies into needed prefetcher chb's to finish for (i = 0; i < tidsendc->gpu_num_buf; i++) { chb = tidsendc->gpu_hostbuf[i]; if (chb) { @@ -1864,8 +1895,9 @@ psm2_error_t ips_tid_send_exp(struct ips_tid_send_desc *tidsendc) chb = tidsendc->gpu_hostbuf[0]; chb_next = tidsendc->gpu_hostbuf[1]; offset_in_chb = tidsendc->tid_list.tsess_srcoff - chb->offset; - /* Copying data from multiple cuda - * host buffers into a bounce buffer. + /* Copying data from multiple prefetched + * host buffers into a single temp CPU bounce buffer. 
+ * so can issue a single RDMA Write from the temp bounce buffer */ memcpy(tidsendc->buffer, (void *)((uintptr_t)chb->host_buf + offset_in_chb), chb->size-offset_in_chb); @@ -1881,29 +1913,13 @@ psm2_error_t ips_tid_send_exp(struct ips_tid_send_desc *tidsendc) if(chb->bytes_read == chb->size) { STAILQ_REMOVE(&tidsendc->mqreq->sendreq_prefetch, chb, ips_gpu_hostbuf, req_next); - if (chb->is_tempbuf) - psmi_deallocate_chb(chb); - else { - chb->req = NULL; - chb->offset = 0; - chb->bytes_read = 0; - PSM3_GPU_HOSTBUF_RESET(chb); - psm3_mpool_put(chb); - } + psm3_ips_deallocate_send_chb(chb, 1); psmi_gpu_run_prefetcher(protoexp, tidsendc); } if(chb_next->bytes_read == chb_next->size) { STAILQ_REMOVE(&tidsendc->mqreq->sendreq_prefetch, chb_next, ips_gpu_hostbuf, req_next); - if (chb_next->is_tempbuf) - psmi_deallocate_chb(chb_next); - else{ - chb_next->req = NULL; - chb_next->offset = 0; - chb_next->bytes_read = 0; - PSM3_GPU_HOSTBUF_RESET(chb_next); - psm3_mpool_put(chb_next); - } + psm3_ips_deallocate_send_chb(chb_next, 1); psmi_gpu_run_prefetcher(protoexp, tidsendc); } /* Clean Up tidsendc ref's to split cuda hostbufs when no longer needed */ @@ -2190,8 +2206,10 @@ ips_tid_recv_alloc(struct ips_protoexp *protoexp, tidrecvc->stats.nReXmit = 0; tidrecvc->stats.nErrChkReceived = 0; - _HFI_EXP("alloc tidrecv=%d\n", - tidrecvc->rdescid._desc_idx); + _HFI_EXP("alloc tidrecv=%d srcoff=%u length=%u\n", + tidrecvc->rdescid._desc_idx, + tidrecvc->tid_list.tsess_srcoff, + tidrecvc->tid_list.tsess_length); tidrecvc->grantscb = grantscb; diff --git a/prov/psm3/psm3/ptl_ips/ips_proto_mq.c b/prov/psm3/psm3/ptl_ips/ips_proto_mq.c index b4582c6521d..cdcc480e89a 100644 --- a/prov/psm3/psm3/ptl_ips/ips_proto_mq.c +++ b/prov/psm3/psm3/ptl_ips/ips_proto_mq.c @@ -158,8 +158,7 @@ int ips_proto_mq_eager_complete(void *reqp, uint32_t nbytes) chb = STAILQ_FIRST(&req->sendreq_prefetch); STAILQ_REMOVE_HEAD(&req->sendreq_prefetch, req_next); - PSM3_GPU_HOSTBUF_RESET(chb); - psm3_mpool_put(chb); + psm3_ips_deallocate_send_chb(chb, 1); } } #endif @@ -508,24 +507,13 @@ ips_ptl_mq_rndv(struct ips_proto *proto, psm2_mq_req_t req, while ((offset < len) && (prefetch_lookahead < proto->gpu_prefetch_limit)) { chb = NULL; + psmi_assert(req->is_buf_gpu_mem); window_len = ips_gpu_next_window( - proto->mq->hfi_base_window_rv, + psm3_mq_get_window_rv(req), offset, len); - unsigned bufsz; - if (window_len <= GPU_SMALLHOSTBUF_SZ) { - chb = (struct ips_gpu_hostbuf *) - psm3_mpool_get( - proto->gpu_hostbuf_pool_small_send); - bufsz = proto->gpu_hostbuf_small_send_cfg.bufsz; - } - if (chb == NULL) { - chb = (struct ips_gpu_hostbuf *) - psm3_mpool_get( - proto->gpu_hostbuf_pool_send); - bufsz = proto->gpu_hostbuf_send_cfg.bufsz; - } + chb = psm3_ips_allocate_send_chb(proto, window_len, 0); /* any buffers available? */ if (chb == NULL) { @@ -540,7 +528,7 @@ ips_ptl_mq_rndv(struct ips_proto *proto, psm2_mq_req_t req, chb->gpu_buf = (uint8_t*)buf + offset; chb->bytes_read = 0; - PSM3_GPU_MEMCPY_DTOH_START(proto, chb, window_len, bufsz); + PSM3_GPU_MEMCPY_DTOH_START(proto, chb, window_len); STAILQ_INSERT_TAIL(&req->sendreq_prefetch, chb, req_next); @@ -590,7 +578,7 @@ ips_ptl_mq_rndv(struct ips_proto *proto, psm2_mq_req_t req, && ips_epaddr_rdma_connected(ipsaddr) && !req->mr #if defined(PSM_CUDA) || defined(PSM_ONEAPI) - && len > GPUDIRECT_THRESH_RV + && (!PSMI_IS_GPU_ENABLED || len > GPUDIRECT_THRESH_RV) && ! 
req->gpu_hostbuf_used #endif ) { @@ -625,9 +613,11 @@ ips_ptl_mq_rndv(struct ips_proto *proto, psm2_mq_req_t req, #if defined(PSM_CUDA) || defined(PSM_ONEAPI) static inline -int psm3_is_needed_rendezvous(struct ips_proto *proto, uint32_t len) +int psm3_is_needed_rendezvous(struct ips_proto *proto, uint32_t len, + uint32_t flags_user) { if ( + !(flags_user & PSM2_MQ_FLAG_INJECT) && len > gpu_thresh_rndv){ return 1; } @@ -667,6 +657,8 @@ psm3_ips_proto_mq_isend(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags_user ipsaddr = (ips_epaddr_t *)mepaddr; } psmi_assert(ipsaddr->cstate_outgoing == CSTATE_ESTABLISHED); + // psmx3 layer never uses mq_isend for FI_INJECT + psmi_assert(! (flags_user & PSM2_MQ_FLAG_INJECT)); proto = ((psm2_epaddr_t) ipsaddr)->proto; @@ -681,7 +673,7 @@ psm3_ips_proto_mq_isend(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags_user if (req->is_buf_gpu_mem) { gpu_mem = 1; PSM3_MARK_BUF_SYNCHRONOUS(ubuf); - if (psm3_is_needed_rendezvous(proto, len)) + if (psm3_is_needed_rendezvous(proto, len, 0)) goto do_rendezvous; } #endif @@ -1026,12 +1018,13 @@ psm3_ips_proto_mq_send(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags, gpu_mem = PSM3_IS_BUFFER_GPU_MEM(ubuf, len); if (gpu_mem) { PSM3_MARK_BUF_SYNCHRONOUS(ubuf); - if (psm3_is_needed_rendezvous(proto, len)) + if (psm3_is_needed_rendezvous(proto, len, flags)) goto do_rendezvous; } #endif flow = &ipsaddr->flows[EP_FLOW_GO_BACK_N_PIO]; + /* SENDSYNC gets priority, assume not used for MPI_isend w/INJECT */ if (flags & PSM2_MQ_FLAG_SENDSYNC) { goto do_rendezvous; } else if (len <= mq->hfi_thresh_tiny) { @@ -1117,7 +1110,11 @@ psm3_ips_proto_mq_send(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags, } else { user_buffer = ubuf; #ifdef PSM_HAVE_REG_MR - if (len > proto->iovec_gpu_thresh_eager_blocking) { + if (len > proto->iovec_gpu_thresh_eager_blocking +#ifdef PSM_INJECT_NOSDMA + && !(flags & PSM2_MQ_FLAG_INJECT) +#endif + ) { scb->mr = psm3_verbs_reg_mr( proto->mr_cache, 0, (void*)user_buffer, len, IBV_ACCESS_IS_GPU_ADDR); @@ -1142,7 +1139,11 @@ psm3_ips_proto_mq_send(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags, #endif // PSM_CUDA || PSM_ONEAPI { #ifdef PSM_HAVE_REG_MR - if (len > proto->iovec_thresh_eager_blocking) { + if (len > proto->iovec_thresh_eager_blocking +#ifdef PSM_INJECT_NOSDMA + && !(flags & PSM2_MQ_FLAG_INJECT) +#endif + ) { scb->mr = psm3_verbs_reg_mr(proto->mr_cache, 0, (void*)user_buffer, len, 0); } else @@ -1240,6 +1241,7 @@ psm3_ips_proto_mq_send(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags, ubuf, len, tag->tag[0], tag->tag[1], tag->tag[2]); } else if (len <= mq->hfi_thresh_rv) { + // for FI_INJECT eager comes from user buffer, needs end to end ack psm2_mq_req_t req; /* Block until we can get a req */ diff --git a/prov/psm3/psm3/ptl_ips/ptl_rcvthread.c b/prov/psm3/psm3/ptl_ips/ptl_rcvthread.c index f1cee4faffd..562721a0b37 100644 --- a/prov/psm3/psm3/ptl_ips/ptl_rcvthread.c +++ b/prov/psm3/psm3/ptl_ips/ptl_rcvthread.c @@ -264,11 +264,64 @@ void psm3_ips_ptl_rcvthread_transfer_ownership(ptl_t *from_ptl_gen, ptl_t *to_pt rcvc->ptl = to_ptl_gen; } +/* parse recv thread frequency for PSM3_RCVTHREAD_FREQ" + * format is min_freq[:max_freq[:shift_freq]]", + * Either field can be omitted in which case default (input tvals) is used + * for given field. 
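 * e.g. "100:1000:1" polls between 100 and 1000 times per second,
 *      scaling the frequency by 2^1; "0" disables the receiver thread
 *      (illustrative values, not the defaults)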
+ * 0 - successfully parsed, tvals updated + * -1 - str empty, tvals unchanged + * -2 - syntax error, tvals may have been changed + */ +static int parse_rcvthread_freq(const char *str, + size_t errstr_size, char errstr[], + int tvals[3]) +{ + psmi_assert(tvals); + int ret = psm3_parse_str_tuples(str, 3, tvals); + if (ret < 0) + return ret; + if (tvals[0] == 0 || tvals[1] == 0) { + // disables receiver thread, no other checks needed + return 0; + } + if (tvals[0] < 0 || tvals[0] > 1000) { + if (errstr_size) + snprintf(errstr, errstr_size, " min_freq must be 0 to 1000"); + return -2; + } + if (tvals[1] < 0 || tvals[1] > 1000) { + if (errstr_size) + snprintf(errstr, errstr_size, " max_freq must be 0 to 1000"); + return -2; + } + if (tvals[0] > tvals[1]) { + if (errstr_size) + snprintf(errstr, errstr_size, " min_freq (%d) must be <= max_freq (%d)", tvals[0], tvals[1]); + return -2; + } + if (tvals[2] < 0 || tvals[2] > 10) { + if (errstr_size) + snprintf(errstr, errstr_size, " shift_freq must be 0 to 10"); + return -2; + } + return 0; +} + +static int parse_check_rcvthread_freq(int type, + const union psmi_envvar_val val, void *ptr, + size_t errstr_size, char errstr[]) +{ + // parser will set tvals to result, use a copy to protect input of defaults + int tvals[3] = { ((int*)ptr)[0], ((int*)ptr)[1], ((int*)ptr)[2] }; + psmi_assert(type == PSMI_ENVVAR_TYPE_STR_TUPLES); + return parse_rcvthread_freq(val.e_str, errstr_size, errstr, tvals); +} + + psm2_error_t rcvthread_initsched(struct ptl_rcvthread *rcvc) { union psmi_envvar_val env_to; char rcv_freq[192]; - int no_timeout = 0; int tvals[3] = { RCVTHREAD_TO_MIN_FREQ, RCVTHREAD_TO_MAX_FREQ, RCVTHREAD_TO_SHIFT @@ -276,40 +329,19 @@ psm2_error_t rcvthread_initsched(struct ptl_rcvthread *rcvc) snprintf(rcv_freq, sizeof(rcv_freq) - 1, "%d:%d:%d", RCVTHREAD_TO_MIN_FREQ, RCVTHREAD_TO_MAX_FREQ, RCVTHREAD_TO_SHIFT); - rcv_freq[sizeof(rcv_freq) - 1] = '\0'; - if (!psm3_getenv("PSM3_RCVTHREAD_FREQ", + (void)psm3_getenv_range("PSM3_RCVTHREAD_FREQ", "Recv Thread frequency (per sec) ", + "Specified as min_freq[:max_freq[:shift_freq]]\nwhere min_freq and max_freq are polls per second\n(0 disables receiver thread)\nand 2^shift_freq is amount to multiply or divide frequency by", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR_TUPLES, - (union psmi_envvar_val)rcv_freq, &env_to)) { - /* not using default values */ - (void)psm3_parse_str_tuples(env_to.e_str, 3, tvals); - int invalid = 0; - - if (tvals[0] == 0 || tvals[1] == 0) { - no_timeout = 1; - } else { - if (tvals[0] > 1000) - invalid = 1; - if (tvals[1] > 1000 || tvals[1] < tvals[0]) - invalid = 1; - if (tvals[2] > 10) - invalid = 1; - } - - if (invalid) { - _HFI_INFO - ("Overriding invalid request for RcvThread frequency" - " settings of %s to be <%d:%d:%d>\n", env_to.e_str, - RCVTHREAD_TO_MIN_FREQ, RCVTHREAD_TO_MAX_FREQ, - RCVTHREAD_TO_SHIFT); - tvals[0] = RCVTHREAD_TO_MIN_FREQ; - tvals[1] = RCVTHREAD_TO_MAX_FREQ; - tvals[2] = RCVTHREAD_TO_SHIFT; - } + (union psmi_envvar_val)rcv_freq, + (union psmi_envvar_val)NULL, (union psmi_envvar_val)NULL, + parse_check_rcvthread_freq, tvals, &env_to); + if (parse_rcvthread_freq(env_to.e_str, 0, NULL, tvals) < 0) { + // already checked, shouldn't get parse errors nor empty strings + psmi_assert(0); } - - if (no_timeout) { + if (tvals[0] == 0 || tvals[1] == 0) { rcvc->last_timeout = -1; _HFI_PRDBG("PSM3_RCVTHREAD_FREQ set to only interrupt " "(no timeouts)\n"); diff --git a/prov/psm3/psm3/ptl_self/ptl.c b/prov/psm3/psm3/ptl_self/ptl.c index 35181f0f3ba..19231015d9b 100644 --- 
a/prov/psm3/psm3/ptl_self/ptl.c +++ b/prov/psm3/psm3/ptl_self/ptl.c @@ -80,6 +80,14 @@ ptl_handle_rtsmatch(psm2_mq_req_t recv_req, int was_posted) psm2_mq_req_t send_req = (psm2_mq_req_t) recv_req->ptl_req_ptr; if (recv_req->req_data.recv_msglen > 0) { +#ifdef PSM_DSA + if (psm3_use_dsa(recv_req->req_data.recv_msglen)) + psm3_dsa_memcpy(recv_req->req_data.buf, + send_req->req_data.buf, + recv_req->req_data.recv_msglen, 0, + &send_req->mq->stats.dsa_stats[0]); + else +#endif psm3_mq_mtucpy(recv_req->req_data.buf, send_req->req_data.buf, recv_req->req_data.recv_msglen); } diff --git a/prov/psm3/psm3/utils/utils_dsa.c b/prov/psm3/psm3/utils/utils_dsa.c index 219f8201fb1..2c697b1cf20 100644 --- a/prov/psm3/psm3/utils/utils_dsa.c +++ b/prov/psm3/psm3/utils/utils_dsa.c @@ -97,9 +97,14 @@ static uint32_t dsa_thresh; // copies > thresh will use DSA // per process (such as OneCCL workers or Intel MPI Multi-EP threading). // But expected counts for such are modest (2-4 for Intel MPI, 8-16 for OneCCL) #define DSA_MAX_QUEUES 32 + +// Default: 2 MB. +#define DSA_MAX_XFER_SIZE_DEFAULT (1 << 21) + // information parsed from PSM3_DSA_WQS static char *dsa_wq_filename[DSA_MAX_PROC][DSA_MAX_QUEUES]; static uint8_t dsa_wq_mode[DSA_MAX_PROC][DSA_MAX_QUEUES]; +static uint32_t dsa_wq_max_xfer_size[DSA_MAX_PROC][DSA_MAX_QUEUES]; static uint32_t dsa_num_wqs[DSA_MAX_PROC]; static uint32_t dsa_num_proc; @@ -108,6 +113,7 @@ struct dsa_wq { const char *wq_filename; // points into dsa_wq_filename void *wq_reg; // mmap memory uint32_t use_count; // how many threads assigned to this WQ + uint32_t max_xfer_size; // Maximum supported transfer size uint8_t dedicated; // is this a dedicated (1) or shared (0) WQ }; static struct dsa_wq dsa_wqs[DSA_MAX_QUEUES]; @@ -119,6 +125,7 @@ static psmi_spinlock_t dsa_wq_lock; // protects dsa_wq.use_count // Each thread is assigned a DSA WQ on 1st memcpy static __thread void *dsa_wq_reg = NULL; static __thread uint8_t dsa_wq_dedicated; +static __thread uint32_t dsa_wq_xfer_limit; // we keep completion record in thread local storage instead of stack // this way if a DSA completion times out and arrives late it still has a @@ -163,6 +170,13 @@ void psm3_dsa_memcpy(void *dest, const void *src, uint32_t n, int rx, uint32_t cpu_n; uint64_t start_cycles, end_cycles; uint64_t loops; + uint32_t dsa_chk_size; + uint32_t cpu_chk_size; + int t_chunks; + uint32_t dsa_copied_len = 0; + uint32_t cpu_copied_len = 0; + int copied_chunks = 0; + uint32_t dsa_cp_len; #if defined(PSM_CUDA) || defined(PSM_ONEAPI) if (n && PSMI_IS_GPU_ENABLED && (PSMI_IS_GPU_MEM(dest) || PSMI_IS_GPU_MEM((void *) src))) { @@ -177,22 +191,31 @@ void psm3_dsa_memcpy(void *dest, const void *src, uint32_t n, int rx, return; } + /* + * Calculate the total chunks. + */ + t_chunks = (n + dsa_wq_xfer_limit - 1) / dsa_wq_xfer_limit; + // TBD - add some statistics for DSA vs CPU copy use // to maximize performance we do part of the copy with CPU while we // wait for DSA to copy the rest if (dsa_ratio) { cpu_n = n/dsa_ratio; + cpu_chk_size = cpu_n / t_chunks; // TBD - should we compute so DSA gets a full multiple of pages and CPU // does the rest? Should we start DSA on a page boundary? 
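/*
 * Illustrative sketch, not part of this patch: the CPU/DSA split and
 * per-chunk sizing that psm3_dsa_memcpy() performs once each DSA
 * descriptor is capped by the WQ's max_transfer_size.  The input values
 * are examples only, and ROUNDDOWNP2 here is a local stand-in for the
 * PSM3 macro of the same name.
 */
#include <stdio.h>
#include <inttypes.h>

#define ROUNDDOWNP2(val, align) ((val) & ~((uint32_t)(align) - 1))

int main(void)
{
	uint32_t n = 5u << 20;           /* 5 MB total copy (example) */
	uint32_t xfer_limit = 1u << 21;  /* 2 MB WQ max_transfer_size */
	uint32_t dsa_ratio = 4;          /* CPU copies ~1/4, DSA the rest */

	uint32_t t_chunks = (n + xfer_limit - 1) / xfer_limit;          /* 3 */
	uint32_t cpu_chk = ROUNDDOWNP2((n / dsa_ratio) / t_chunks, 8);  /* 436904 */
	uint32_t cpu_n = cpu_chk * t_chunks;
	uint32_t dsa_n = n - cpu_n;
	/* ceil() split keeps every DSA descriptor within xfer_limit */
	uint32_t dsa_chk = (dsa_n + t_chunks - 1) / t_chunks;

	printf("chunks=%" PRIu32 " cpu=%" PRIu32 " (%" PRIu32 "/chunk) "
	       "dsa=%" PRIu32 " (%" PRIu32 "/chunk)\n",
	       t_chunks, cpu_n, cpu_chk, dsa_n, dsa_chk);
	return 0;
}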
// round down to page boundary //cpu_n = ROUNDDOWNP2(cpu_n, PSMI_PAGESIZE); // round to a multiple of 8 bytes at least - cpu_n = ROUNDDOWNP2(cpu_n, 8); + cpu_chk_size = ROUNDDOWNP2(cpu_chk_size, 8); + cpu_n = cpu_chk_size * t_chunks; } else { cpu_n = 0; + cpu_chk_size = 0; } dsa_n = n - cpu_n; + dsa_chk_size = (dsa_n + t_chunks - 1)/t_chunks; dsa_src = (void*)((uintptr_t)src + cpu_n); dsa_dest = (void*)((uintptr_t)dest + cpu_n); psmi_assert(dsa_n); @@ -200,6 +223,8 @@ void psm3_dsa_memcpy(void *dest, const void *src, uint32_t n, int rx, // comp ptr must be 32 byte aligned comp = (struct dsa_completion_record *)(((uintptr_t)&dsa_comp[0] + 0x1f) & ~0x1f); + +restart: comp->status = 0; desc.opcode = DSA_OPCODE_MEMMOVE; /* set CRAV (comp address valid) and RCR (request comp) so get completion */ @@ -218,9 +243,13 @@ void psm3_dsa_memcpy(void *dest, const void *src, uint32_t n, int rx, // for overall server. Best to take the pain here as page faults should // be rare during steady state of most apps // desc.flags |= IDXD_OP_FLAG_BOF; - desc.xfer_size = dsa_n; - desc.src_addr = (uintptr_t)dsa_src; - desc.dst_addr = (uintptr_t)dsa_dest; + if (copied_chunks < (t_chunks - 1)) + dsa_cp_len = dsa_chk_size; + else + dsa_cp_len = dsa_n - dsa_copied_len; + desc.xfer_size = dsa_cp_len; + desc.src_addr = (uintptr_t)dsa_src + dsa_copied_len; + desc.dst_addr = (uintptr_t)dsa_dest + dsa_copied_len; desc.completion_addr = (uintptr_t)comp; // make sure completion status zeroing fully written before post to HW @@ -239,9 +268,8 @@ void psm3_dsa_memcpy(void *dest, const void *src, uint32_t n, int rx, if (get_cycles() > end_cycles) { _HFI_INFO("Disabling DSA: DSA SWQ Enqueue Timeout\n"); dsa_available = 0; - memcpy(dest, src, n); stats->dsa_error++; - return; + goto memcpy_exit; } } stats->dsa_swq_wait_ns += cycles_to_nanosecs(get_cycles() - start_cycles); @@ -252,11 +280,13 @@ void psm3_dsa_memcpy(void *dest, const void *src, uint32_t n, int rx, if (cpu_n) { // while DSA does it's thing, we copy rest via CPU - memcpy(dest, src, cpu_n); + memcpy((void *)((uintptr_t)dest + cpu_copied_len), + (void *)((uintptr_t)src + cpu_copied_len), cpu_chk_size); + cpu_copied_len += cpu_chk_size; } stats->dsa_copy++; - stats->dsa_copy_bytes += dsa_n; + stats->dsa_copy_bytes += dsa_cp_len; // wait for DSA to finish start_cycles = get_cycles(); @@ -269,8 +299,8 @@ void psm3_dsa_memcpy(void *dest, const void *src, uint32_t n, int rx, if (get_cycles() > end_cycles && comp->status == 0) { _HFI_INFO("Disabling DSA: DSA Hardware Timeout\n"); dsa_available = 0; - memcpy(dsa_dest, dsa_src, dsa_n); stats->dsa_error++; + goto memcpy_exit; return; } loops++; @@ -294,9 +324,22 @@ void psm3_dsa_memcpy(void *dest, const void *src, uint32_t n, int rx, stats->dsa_page_fault_rd++; _HFI_VDBG("DSA desc failed: page fault status %u\n", comp->status); } - memcpy(dsa_dest, dsa_src, dsa_n); - return; + goto memcpy_exit; } + /* Check loop status */ + dsa_copied_len += dsa_cp_len; + if (++copied_chunks < t_chunks) + goto restart; + + return; + +memcpy_exit: + memcpy((void *)((uintptr_t)dsa_dest + dsa_copied_len), + (void *)((uintptr_t)dsa_src + dsa_copied_len), + dsa_n - dsa_copied_len); + memcpy((void *)((uintptr_t)dest + cpu_copied_len), + (void *)((uintptr_t)src + cpu_copied_len), + cpu_n - cpu_copied_len); return; } @@ -378,6 +421,58 @@ static int psm3_dsa_mode(const char *wq_filename) return -1; } +// determine the max transfer size for a DSA WQ by reading the max_transfer_size +// file under DSA_DEVICES/wqX.Y/ +// where wqX.Y is last part of 
supplied wq_filename +// return the max_transfer_size. +// on error returns 0 and an _HFI_ERROR message has been output +static int psm3_dsa_max_xfer_size(const char *wq_filename) +{ + char wq_size_filename[PATH_MAX]; + const char *p; + char buf[20]; + int fd; + int res; + + p = strrchr(wq_filename, '/'); + if (p) + p++; // skip '/' + else + p = wq_filename; + res = snprintf(wq_size_filename, sizeof(wq_size_filename), + "%s/%s/max_transfer_size", DSA_DEVICES, p); + if (res < 0 || res > sizeof(wq_size_filename) - 1) { + _HFI_ERROR("Unable to determine DSA WQ max xfer size for %s\n", + wq_filename); + return 0; + } + fd = open(wq_size_filename, O_RDONLY); + if (fd < 0) { + _HFI_ERROR("Failed to open DSA WQ max xfer size: %s: %s\n", + wq_size_filename, strerror(errno)); + return 0; + } + res = read(fd, buf, sizeof(buf)-1); + if (res < 0) { + _HFI_ERROR("Failed to read DSA WQ max xfer size: %s: %s\n", + wq_size_filename, strerror(errno)); + close(fd); + return 0; + } + close(fd); + if (! res) { + _HFI_ERROR("Failed to read DSA WQ max xfer size: %s: empty file\n", + wq_size_filename); + return 0; + } + if (buf[res-1] == '\n') + buf[res-1] = '\0'; + else + buf[res] = '\0'; + _HFI_DBG("DSA WQ %s max xfer size %s\n", wq_filename, buf); + return (uint32_t)strtoul(buf, NULL, 0); +} + /* initialize DSA - call once per process */ /* Some invalid inputs and DSA initialization errors are treated as fatal errors * since if DSA gets initialized on some nodes, but not on others, the @@ -410,11 +505,11 @@ int psm3_dsa_init(void) if (! psm3_getenv("PSM3_DSA_WQS", "List of DSA WQ devices to use, one list per local process or per\n" "CPU socket:\n" - " wq0,wq2:wq4,wq6:,...\n" + " wq0,wq2;wq4,wq6;,...\n" "Each wq should be a shared workqueue DSA device or a unique\n" "dedicated workqueue DSA device,\n" " such as /dev/dsa/wq0.0\n" - "Colon separates the lists for different processes\n" + "Semicolon separates the lists for different processes\n" " default is '' in which case DSA is not used\n", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR, (union psmi_envvar_val)"", &env_dsa_wq)) { @@ -430,10 +525,13 @@ int psm3_dsa_init(void) } s = temp; psmi_assert(*s); + // both : and ; are treated the same below, : is deprecated do { int mode; + uint32_t xfer_size; + new_proc = 0; - if (! *s) // trailing ',' or ':' on 2nd or later loop + if (! *s) // trailing ',' or ':' or ';' on 2nd or later loop break; if (proc >= DSA_MAX_PROC) { _HFI_ERROR("PSM3_DSA_WQS exceeds %u per node process limit: '%s'", @@ -441,9 +539,9 @@ int psm3_dsa_init(void) psmi_free(temp); goto fail; } - delim = strpbrk(s, ",:"); + delim = strpbrk(s, ",:;"); if (delim) { - new_proc = (*delim == ':'); + new_proc = (*delim == ':' || *delim == ';'); *delim = '\0'; } if (dsa_num_wqs[proc] > DSA_MAX_QUEUES) { @@ -460,6 +558,9 @@ int psm3_dsa_init(void) } if (mode) all_are_shared = 0; + xfer_size = psm3_dsa_max_xfer_size(s); + dsa_wq_max_xfer_size[proc][dsa_num_wqs[proc]] = xfer_size > 0 ? 
+ xfer_size : DSA_MAX_XFER_SIZE_DEFAULT; dsa_wq_mode[proc][dsa_num_wqs[proc]] = mode; dsa_wq_filename[proc][dsa_num_wqs[proc]] = psmi_strdup(PSMI_EP_NONE, s); dsa_num_wqs[proc]++; @@ -468,7 +569,7 @@ int psm3_dsa_init(void) s = delim+1; } while (delim); psmi_free(temp); - // new_proc means trailing :, ignore it + // new_proc means trailing : or ;, ignore it // otherwise, last we processed counts if (!new_proc && proc < DSA_MAX_PROC && dsa_num_wqs[proc]) proc++; @@ -580,6 +681,7 @@ int psm3_dsa_init(void) // key off having rw access to the DSA WQ to decide if DSA is available dsa_wqs[i].wq_filename = dsa_wq_filename[proc][i]; dsa_wqs[i].dedicated = dsa_wq_mode[proc][i]; + dsa_wqs[i].max_xfer_size = dsa_wq_max_xfer_size[proc][i]; if (! realpath(dsa_wqs[i].wq_filename, dsa_filename)) { _HFI_ERROR("Failed to resolve DSA WQ path %s\n", dsa_wqs[i].wq_filename); goto fail; @@ -658,6 +760,7 @@ static inline void psm3_dsa_pick_wq(void) found: dsa_wq_reg = dsa_wqs[sel].wq_reg; dsa_wq_dedicated = dsa_wqs[sel].dedicated; + dsa_wq_xfer_limit = dsa_wqs[sel].max_xfer_size; } diff --git a/prov/psm3/psm3/utils/utils_env.c b/prov/psm3/psm3/utils/utils_env.c index f8c2dbd8b96..55efb77bc2b 100644 --- a/prov/psm3/psm3/utils/utils_env.c +++ b/prov/psm3/psm3/utils/utils_env.c @@ -90,7 +90,8 @@ int psm3_env_initialize(void) // get verbosity level setting for env logging // if invalid syntax, will output warning when parse during psm3_getenv const char *verb_env = getenv("PSM3_VERBOSE_ENV"); - (void)psm3_parse_val_pattern(verb_env, 0, &verb_env_val); + (void)psm3_parse_val_pattern_uint(verb_env, 0, &verb_env_val, + PSMI_ENVVAR_FLAG_NOABBREV, 0, 3); if (verb_env_val) env_log_level = 0; // log at INFO level @@ -119,7 +120,7 @@ int psm3_env_initialize(void) c = fgetc(f); if (c != EOF) { // line too long, fgetc until read newline - _HFI_INFO("%s: Ignoring line too long: '%s' ...\n", + _HFI_ENV_ERROR("%s: Ignoring line too long: '%s' ...\n", PSM3_ENV_FILENAME, buf); while (c != (int)(unsigned char)'\n' && (c = fgetc(f)) != EOF) ; @@ -150,7 +151,7 @@ int psm3_env_initialize(void) j = strspn(&buf[i], "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_"); if (buf[i+j] != '=') { // malformed assignment,skip - _HFI_INFO("%s: Ignoring malformed assignment: '%s'\n", + _HFI_ENV_ERROR("%s: Ignoring malformed assignment: '%s'\n", PSM3_ENV_FILENAME, buf); continue; } @@ -180,7 +181,8 @@ int psm3_env_initialize(void) // allow /etc/psm3.conf to set PSM3_VERBOSE_ENV when defaulted // if invalid syntax, will output warning when parse during psm3_getenv if (! 
verb_env && 0 == strcmp("PSM3_VERBOSE_ENV", var.name)) { - (void)psm3_parse_val_pattern(var.value, 0, &verb_env_val); + (void)psm3_parse_val_pattern_uint(var.value, 0, &verb_env_val, + PSMI_ENVVAR_FLAG_NOABBREV, 0, 3); if (verb_env_val) env_log_level = 0; // log at INFO level } @@ -189,7 +191,7 @@ int psm3_env_initialize(void) // this must be parsed in a constructor prior to this function, // so we ignore it here if (0 == strcmp(var.name, "PSM3_DISABLE_MMAP_MALLOC")) { - _HFI_INFO("WARNING: %s Ignoring %s\n", PSM3_ENV_FILENAME,var.name); + _HFI_ENV_ERROR("WARNING: %s Ignoring %s\n", PSM3_ENV_FILENAME,var.name); psmi_free(var.name); psmi_free(var.value); continue; @@ -252,7 +254,9 @@ void psm3_env_print_val(FILE *f, const char *name, int type, switch (type) { case PSMI_ENVVAR_TYPE_STR: case PSMI_ENVVAR_TYPE_STR_TUPLES: - case PSMI_ENVVAR_TYPE_STR_VAL_PAT: + case PSMI_ENVVAR_TYPE_STR_VAL_PAT_INT: + case PSMI_ENVVAR_TYPE_STR_VAL_PAT_UINT: + case PSMI_ENVVAR_TYPE_STR_VAL_PAT_UINT_FLAGS: fprintf(f, "%s=%s\n", name, val.e_str); break; case PSMI_ENVVAR_TYPE_INT: @@ -286,7 +290,9 @@ int psm3_env_snprint_val(char *buf, size_t size, const char *name, int type, switch (type) { case PSMI_ENVVAR_TYPE_STR: case PSMI_ENVVAR_TYPE_STR_TUPLES: - case PSMI_ENVVAR_TYPE_STR_VAL_PAT: + case PSMI_ENVVAR_TYPE_STR_VAL_PAT_INT: + case PSMI_ENVVAR_TYPE_STR_VAL_PAT_UINT: + case PSMI_ENVVAR_TYPE_STR_VAL_PAT_UINT_FLAGS: return snprintf(buf, size, "%s=%s\n", name, val.e_str); break; case PSMI_ENVVAR_TYPE_INT: @@ -332,20 +338,18 @@ char *psm3_env_get(const char *name) return NULL; } -/* _CONSUMED_ALL() is a macro which indicates if strtol() consumed all - of the input passed to it. */ -#define _CONSUMED_ALL(CHAR_PTR) (((CHAR_PTR) != NULL) && (*(CHAR_PTR) == 0)) - // don't document that 3 and 3: and 3:pattern can output hidden params const char *PSM3_VERBOSE_ENV_HELP = - "Enable verbose output of environment variables. 
" - "(0 - none, 1 - changed w/o help, 2 - user help, " - "#: - limit output to rank 0, #:pattern - limit output " - "to processes whose label matches " + "Enable verbose output of environment variables.\n" + " 0 - none\n" + " 1 - only output changed w/o help\n" + " 2 - output all with help,\n" + " #: - limit output to rank 0\n" + " #:pattern - limit output to processes whose label matches\n " #ifdef FNM_EXTMATCH - "extended " + "extended " #endif - "glob pattern"; + "glob pattern"; /* If PSM3_VERBOSE_ENV is set in the environment, we determine * what its verbose level is and print the environment at "INFO" @@ -362,25 +366,24 @@ static int psm3_getenv_is_verblevel(int printlevel) unsigned verb_env_val; if (env) psm3_stats_print_env("PSM3_VERBOSE_ENV", env); - int ret = psm3_parse_val_pattern(env, 0, &verb_env_val); + int ret = psm3_parse_val_pattern_uint(env, 0, &verb_env_val, + PSMI_ENVVAR_FLAG_NOABBREV, 0, 3); psmi_getenv_verblevel = verb_env_val; - if (psmi_getenv_verblevel < 0 || psmi_getenv_verblevel > 3) - psmi_getenv_verblevel = 2; if (psmi_getenv_verblevel > 0) nlevel = 0; /* output at INFO level */ if (ret == -2) - _HFI_ENVDBG(0, "Invalid value for %s ('%s') %-40s Using: %u\n", - "PSM3_VERBOSE_ENV", env, PSM3_VERBOSE_ENV_HELP, verb_env_val); + _HFI_ENVDBG(0, "Invalid value for %s ('%s') Using: %u\nHelp: %s\n", + "PSM3_VERBOSE_ENV", env, verb_env_val, PSM3_VERBOSE_ENV_HELP); else if (psmi_getenv_verblevel == 1) _HFI_ENVDBG(0, " %-25s => '%s' (default was '%s')\n", "PSM3_VERBOSE_ENV", env?env:"", "0"); else if (env && *env) - _HFI_ENVDBG(nlevel, " %-25s %-40s => '%s' (default was '%s')\n", - "PSM3_VERBOSE_ENV", PSM3_VERBOSE_ENV_HELP, env, "0"); + _HFI_ENVDBG(nlevel, " %-25s => '%s' (default was '%s')\nHelp: %s\n", + "PSM3_VERBOSE_ENV", env, "0", PSM3_VERBOSE_ENV_HELP); else /* defaulted */ _HFI_ENVDBG(nlevel, - " %-25s %-40s => '%s'\n", - "PSM3_VERBOSE_ENV", PSM3_VERBOSE_ENV_HELP, "0"); + " %-25s => '%s'\nHelp: %s\n", + "PSM3_VERBOSE_ENV", "0", PSM3_VERBOSE_ENV_HELP); } // printlevel is visibility of env (USER=1 or HIDDEN=2) // so at verbosity 1 and 2 output USER @@ -419,314 +422,647 @@ static int psm3_count_tuples(const char *str) return ret; } -int -MOCKABLE(psm3_getenv)(const char *name, const char *descr, int level, - int type, union psmi_envvar_val defval, +/* _CONSUMED_ALL indicates if strtol() (and friends) consumed all of the input + * passed to it. CHAR_PTR is the output char pointer from strtol + */ +#define _CONSUMED_ALL(CHAR_PTR) (((CHAR_PTR) != NULL) && (*(CHAR_PTR) == 0)) + +/* convert a string to a signed number with basic bounds checking + * returns 0 - valid value and *val updated + * -1 -> empty string, *val unchanged + * -2 -> parse or range error, *val unchanged + */ +static int convert_str_signed(const char *str, long long *val, + long long min, long long max) +{ + char *ep; + long long temp; + + psmi_assert(val != NULL); + if (! str || ! *str) + return -1; + /* Avoid base 8 (octal) on purpose, so don't pass in 0 for radix */ + errno = 0; + temp = strtoll(str, &ep, 10); + if (! _CONSUMED_ALL(ep)) { + errno = 0; + temp = strtoll(str, &ep, 16); + if (! 
_CONSUMED_ALL(ep)) + return -2; + } + if (errno || temp < min || temp > max) + return -2; + + *val = temp; + return 0; +} + +/* convert a string to an unsigned number with basic bounds checking + * returns 0 - valid value and *val updated + * -1 -> empty string, *val unchanged + * -2 -> parse or range error, *val unchanged + */ +static int convert_str_unsigned(const char *str, unsigned long long *val, + unsigned long long min, unsigned long long max) +{ + char *ep; + unsigned long long temp; + + psmi_assert(val != NULL); + if (! str || ! *str) + return -1; + /* Avoid base 8 (octal) on purpose, so don't pass in 0 for radix */ + errno = 0; + temp = strtoull(str, &ep, 10); + if (! _CONSUMED_ALL(ep)) { + errno = 0; + temp = strtoull(str, &ep, 16); + if (! _CONSUMED_ALL(ep)) + return -2; + } + if (errno || temp < min || temp > max) + return -2; + + *val = temp; + return 0; +} +#undef _CONSUMED_ALL + +// returns: +// 0 - valid value input +// 1 - variable not set, used default +// -1 - invalid value for variable or invalid syntax, used default +int MOCKABLE(psm3_getenv_range)(const char *name, const char *descr, + const char *help, unsigned level_flags, + int type, union psmi_envvar_val defval, union psmi_envvar_val min, + union psmi_envvar_val max, psm3_getenv_check_t check, void *ptr, union psmi_envvar_val *newval) { - int used_default = 0; + int ret = 0; union psmi_envvar_val tval; char *env = psm3_env_get(name); + unsigned level = level_flags & PSMI_ENVVAR_LEVEL_MASK; + char rangestr[80] = ""; // for help + char errstr[512] = ""; // additional info for invalid values + char statserrstr[700] = ""; // add'l info for stats file when invalid input + +#define FORMAT_RANGESTR(FIELD, fmt) \ + do { \ + if ((level_flags & PSMI_ENVVAR_FLAG_NOMIN)) { \ + if ((level_flags & PSMI_ENVVAR_FLAG_NOMAX)) \ + rangestr[0] = '\0'; \ + else \ + snprintf(rangestr, sizeof(rangestr)," Max allowed " fmt "%s",\ + max.FIELD, \ + (level_flags & PSMI_ENVVAR_FLAG_NOABBREV)?"":" (or 'max')");\ + } else if ((level_flags & PSMI_ENVVAR_FLAG_NOMAX)) { \ + snprintf(rangestr, sizeof(rangestr)," Min allowed " fmt "%s", \ + min.FIELD, \ + (level_flags & PSMI_ENVVAR_FLAG_NOABBREV)?"":" (or 'min')");\ + } else { \ + snprintf(rangestr, sizeof(rangestr)," Valid range " fmt "%s" \ + " to " fmt "%s", \ + min.FIELD, \ + (level_flags & PSMI_ENVVAR_FLAG_NOABBREV)?"":" (or 'min')",\ + max.FIELD, \ + (level_flags & PSMI_ENVVAR_FLAG_NOABBREV)?"":" (or 'max')");\ + } \ + } while (0) + +#define _GETENV_CHECK(tval) \ + do { \ + if (check) { \ + if ((*check)(type, tval, ptr, sizeof(errstr), errstr)) { \ + tval = defval; \ + ret = -1; \ + /* errstr now has additional error information */ \ + } \ + } \ + } while (0); /* for verblevel 1 we only output non-default values with no help * for verblevel>1 we promote to info (verblevel=2 promotes USER, * verblevel=3 promotes HIDDEN) and show help. * for verblevel< 1 we don't promote anything and show help */ -#define _GETENV_PRINT(env, used_default, fmt, val, defval) \ +#define _GETENV_PRINT(env, ret, fmt, val, defval) \ do { \ (void)psm3_getenv_is_verblevel(level); \ - if (env && *env && used_default) \ - _HFI_INFO("Invalid value for %s ('%s') %-40s Using: " fmt "\n", \ - name, env, descr, val); \ - else if (used_default && psmi_getenv_verblevel != 1) \ - GETENV_PRINTF(level, "%s%-25s %-40s => " fmt \ - "\n", level > 1 ? "*" : " ", name, \ - descr, val); \ - else if (! 
used_default && psmi_getenv_verblevel == 1) \ + if (ret < 0 && (level_flags & PSMI_ENVVAR_FLAG_FATAL)) { \ + _HFI_ENV_ERROR("Invalid value for %s ('%s')%s\nHelp: %s%s\n%s%s", \ + name, env, errstr, descr, rangestr,\ + help?help:"", help?"\n":""); \ + snprintf(statserrstr, sizeof(statserrstr), \ + "Invalid value ('%s')%s", env, errstr); \ + } else if (ret < 0) { \ + _HFI_ENV_ERROR("Invalid value for %s ('%s')%s Using: " fmt "\nHelp: %s%s\n%s%s", \ + name, env, errstr, val, descr, rangestr,\ + help?help:"", help?"\n":""); \ + snprintf(statserrstr, sizeof(statserrstr), \ + "Invalid value ('%s')%s Using: " fmt, \ + env, errstr, val); \ + } else if (ret > 0 && psmi_getenv_verblevel != 1) \ + GETENV_PRINTF(level, "%s%-25s => " fmt \ + "\nHelp: %s%s\n%s%s", level > 1 ? "*" : " ", name, \ + val, descr, rangestr, \ + help?help:"", help?"\n":"");\ + else if (ret == 0 && psmi_getenv_verblevel == 1) \ GETENV_PRINTF(1, "%s%-25s => " \ fmt " (default was " fmt ")\n", \ level > 1 ? "*" : " ", name, \ val, defval); \ - else if (! used_default && psmi_getenv_verblevel != 1) \ - GETENV_PRINTF(1, "%s%-25s %-40s => " \ - fmt " (default was " fmt ")\n", \ - level > 1 ? "*" : " ", name, descr, \ - val, defval); \ + else if (ret == 0 && psmi_getenv_verblevel != 1) \ + GETENV_PRINTF(1, "%s%-25s => " \ + fmt " (default was " fmt ")\nHelp: %s%s\n%s%s", \ + level > 1 ? "*" : " ", name, \ + val, defval, descr, rangestr, \ + help?help:"", help?"\n":""); \ } while (0) -#define _CONVERT_TO_NUM(DEST,TYPE,STRTOL) \ - do { \ - char *ep; \ - /* Avoid base 8 (octal) on purpose, so don't pass in 0 for radix */ \ - DEST = (TYPE)STRTOL(env, &ep, 10); \ - if (! _CONSUMED_ALL(ep)) { \ - DEST = (TYPE)STRTOL(env, &ep, 16); \ - if (! _CONSUMED_ALL(ep)) { \ - used_default = 1; \ - tval = defval; \ - } \ - } \ +#define _CONVERT_TO_NUM(FIELD,TYPE,SIGNED,MIN,MAX) \ + do { \ + if (!(level_flags & (PSMI_ENVVAR_FLAG_NOMIN|PSMI_ENVVAR_FLAG_NOABBREV))\ + && (!strcasecmp(env, "min") || !strcasecmp(env, "minimum")))\ + tval.FIELD = min.FIELD; \ + else if (!(level_flags & (PSMI_ENVVAR_FLAG_NOMAX|PSMI_ENVVAR_FLAG_NOABBREV))\ + && (!strcasecmp(env, "max") || !strcasecmp(env, "maximum")))\ + tval.FIELD = max.FIELD; \ + else { \ + SIGNED long long temp; \ + if (convert_str_##SIGNED(env, &temp, MIN, MAX)) { \ + ret = -1; /* callered checked empty, so must be invalid */ \ + tval = defval; \ + } else if ((temp < min.FIELD \ + && !(level_flags & PSMI_ENVVAR_FLAG_NOMIN)) \ + || (temp > max.FIELD \ + && !(level_flags & PSMI_ENVVAR_FLAG_NOMAX))) { \ + ret = -1; \ + tval = defval; \ + } else { \ + tval.FIELD = (TYPE)temp; \ + } \ + } \ } while (0) switch (type) { case PSMI_ENVVAR_TYPE_YESNO: - tval.e_int = psm3_parse_str_yesno(env); - if (tval.e_int < 0) { + if (!env || *env == '\0') { tval = defval; - used_default = 1; + ret = 1; + } else { + switch (psm3_parse_str_yesno(env, &tval.e_int)) { + case -1: // empty, use default + psmi_assert(0); // shouldn't happen, checked for empty above + tval = defval; + ret = 1; + break; + case -2: // bad syntax, use default + tval = defval; + ret = -1; + break; + default: // valid input + _GETENV_CHECK(tval); + break; + } } - _GETENV_PRINT(env, used_default, "%s", tval.e_int ? "YES" : "NO", + _GETENV_PRINT(env, ret, "%s", tval.e_int ? "YES" : "NO", defval.e_int ? 
"YES" : "NO"); break; case PSMI_ENVVAR_TYPE_STR: if (!env || *env == '\0') { tval = defval; - used_default = 1; - } else + ret = 1; + } else { tval.e_str = env; - _GETENV_PRINT(env, used_default, "'%s'", tval.e_str, defval.e_str); + _GETENV_CHECK(tval); + } + _GETENV_PRINT(env, ret, "'%s'", tval.e_str, defval.e_str); break; case PSMI_ENVVAR_TYPE_INT: if (!env || *env == '\0') { tval = defval; - used_default = 1; + ret = 1; } else { - _CONVERT_TO_NUM(tval.e_int,int,strtol); + _CONVERT_TO_NUM(e_int,int,signed,INT_MIN,INT_MAX); + if (ret == 0) + _GETENV_CHECK(tval); } - _GETENV_PRINT(env, used_default, "%d", tval.e_int, defval.e_int); + FORMAT_RANGESTR(e_int, "%d"); + _GETENV_PRINT(env, ret, "%d", tval.e_int, defval.e_int); break; case PSMI_ENVVAR_TYPE_UINT: case PSMI_ENVVAR_TYPE_UINT_FLAGS: if (!env || *env == '\0') { tval = defval; - used_default = 1; + ret = 1; } else { - _CONVERT_TO_NUM(tval.e_int,unsigned int,strtoul); + _CONVERT_TO_NUM(e_uint,unsigned int,unsigned,0,UINT_MAX); + if (ret == 0) + _GETENV_CHECK(tval); } - if (type == PSMI_ENVVAR_TYPE_UINT_FLAGS) - _GETENV_PRINT(env, used_default, "0x%x", tval.e_uint, + if (type == PSMI_ENVVAR_TYPE_UINT_FLAGS) { + FORMAT_RANGESTR(e_uint, "0x%x"); + _GETENV_PRINT(env, ret, "0x%x", tval.e_uint, defval.e_uint); - else - _GETENV_PRINT(env, used_default, "%u", tval.e_uint, + } else { + FORMAT_RANGESTR(e_uint, "%u"); + _GETENV_PRINT(env, ret, "%u", tval.e_uint, defval.e_uint); + } break; case PSMI_ENVVAR_TYPE_LONG: if (!env || *env == '\0') { tval = defval; - used_default = 1; + ret = 1; } else { - _CONVERT_TO_NUM(tval.e_long,long,strtol); + _CONVERT_TO_NUM(e_long,long,signed,LONG_MIN,LONG_MAX); + if (ret == 0) + _GETENV_CHECK(tval); } - _GETENV_PRINT(env, used_default, "%ld", tval.e_long, defval.e_long); + FORMAT_RANGESTR(e_long, "%ld"); + _GETENV_PRINT(env, ret, "%ld", tval.e_long, defval.e_long); break; case PSMI_ENVVAR_TYPE_ULONG_ULONG: if (!env || *env == '\0') { tval = defval; - used_default = 1; + ret = 1; } else { - _CONVERT_TO_NUM(tval.e_ulonglong,unsigned long long,strtoull); + _CONVERT_TO_NUM(e_ulonglong,unsigned long long,unsigned,0,ULLONG_MAX); + if (ret == 0) + _GETENV_CHECK(tval); } - _GETENV_PRINT(env, used_default, "%llu", + FORMAT_RANGESTR(e_ulonglong, "%llu"); + _GETENV_PRINT(env, ret, "%llu", tval.e_ulonglong, defval.e_ulonglong); break; - case PSMI_ENVVAR_TYPE_STR_VAL_PAT: - { + case PSMI_ENVVAR_TYPE_STR_VAL_PAT_INT: + if (!env || *env == '\0') { + tval = defval; + ret = 1; + } else if (check) { + // check will parse_val_pattern_int and check value returned + // caller must parse again + tval.e_str = env; + _GETENV_CHECK(tval); + } else { + int trash; + // we parse just for syntax check, caller must parse again + switch (psm3_parse_val_pattern_int(env, 0, &trash, level_flags, + (level_flags & PSMI_ENVVAR_FLAG_NOMIN)?INT_MIN:min.e_int, + (level_flags & PSMI_ENVVAR_FLAG_NOMAX)?INT_MAX:max.e_int)) { + case -1: // empty, use default + psmi_assert(0); // shouldn't happen, checked for empty above + tval = defval; + ret = 1; + break; + case -2: // one or more fields with bad syntax, use default + tval = defval; + ret = -1; + break; + default: // valid string + tval.e_str = env; + break; + } + } + FORMAT_RANGESTR(e_int, "%d"); + _GETENV_PRINT(env, ret, "'%s'", tval.e_str, defval.e_str); + break; + case PSMI_ENVVAR_TYPE_STR_VAL_PAT_UINT: + case PSMI_ENVVAR_TYPE_STR_VAL_PAT_UINT_FLAGS: + if (!env || *env == '\0') { + tval = defval; + ret = 1; + } else if (check) { + // check will parse_val_pattern_uint and check value returned + // 
caller must parse again + tval.e_str = env; + _GETENV_CHECK(tval); + } else { unsigned trash; // we parse just for syntax check, caller must parse again - if (psm3_parse_val_pattern(env, 0, &trash) < 0) { + switch (psm3_parse_val_pattern_uint(env, 0, &trash, level_flags, + (level_flags & PSMI_ENVVAR_FLAG_NOMIN)?0:min.e_uint, + (level_flags & PSMI_ENVVAR_FLAG_NOMAX)?UINT_MAX:max.e_uint)) { + case -1: // empty, use default + psmi_assert(0); // shouldn't happen, checked for empty above tval = defval; - used_default = 1; - } else + ret = 1; + break; + case -2: // one or more fields with bad syntax, use default + tval = defval; + ret = -1; + break; + default: // valid string tval.e_str = env; - _GETENV_PRINT(env, used_default, "'%s'", tval.e_str, defval.e_str); + break; + } } + if (type == PSMI_ENVVAR_TYPE_STR_VAL_PAT_UINT_FLAGS) + FORMAT_RANGESTR(e_uint, "0x%x"); + else + FORMAT_RANGESTR(e_uint, "%u"); + _GETENV_PRINT(env, ret, "'%s'", tval.e_str, defval.e_str); break; case PSMI_ENVVAR_TYPE_STR_TUPLES: - { + if (!env || *env == '\0') { + tval = defval; + ret = 1; + } else if (check) { + // check will parse_str_tuples and check their values + // caller must parse again + tval.e_str = env; + _GETENV_CHECK(tval); + } else { // we parse just for syntax check, caller must parse again int vals[3]; int ntup = psm3_count_tuples(defval.e_str); psmi_assert_always(ntup > 0 && ntup <= 3); - // parse default into vals[] so can show what caller get - (void)psm3_parse_str_tuples(defval.e_str, ntup, vals); switch (psm3_parse_str_tuples(env, ntup, vals)) { case -1: // empty, use default + psmi_assert(0); // shouldn't happen, checked for empty above tval = defval; - used_default = 1; - _GETENV_PRINT(env, 1, "'%s'", tval.e_str, defval.e_str); + ret = 1; break; - case -2: // one or more fields with bad syntax, show what we have - tval.e_str = env; - // only 3 choices, so just bruteforce it - switch (ntup) { - case 1: - _HFI_INFO("Invalid value for %s ('%s') %-40s Using: %d\n", - name, env, descr, vals[0]); - break; - case 2: - _HFI_INFO("Invalid value for %s ('%s') %-40s Using: %d:%d\n", - name, env, descr, vals[0], vals[1]); - break; - case 3: - _HFI_INFO("Invalid value for %s ('%s') %-40s Using: %d:%d:%d\n", - name, env, descr, vals[0], vals[1], vals[2]); - break; - } + case -2: // one or more fields with bad syntax, use default + tval = defval; + ret = -1; break; default: // valid string tval.e_str = env; - _GETENV_PRINT(env, 0, "'%s'", tval.e_str, defval.e_str); break; } } + _GETENV_PRINT(env, ret, "'%s'", tval.e_str, defval.e_str); break; case PSMI_ENVVAR_TYPE_ULONG: case PSMI_ENVVAR_TYPE_ULONG_FLAGS: default: if (!env || *env == '\0') { tval = defval; - used_default = 1; + ret = 1; } else { - _CONVERT_TO_NUM(tval.e_ulong,unsigned long,strtoul); + _CONVERT_TO_NUM(e_ulong,unsigned long,unsigned,0,ULONG_MAX); + if (ret == 0) + _GETENV_CHECK(tval); } - if (type == PSMI_ENVVAR_TYPE_ULONG_FLAGS) - _GETENV_PRINT(env, used_default, "0x%lx", tval.e_ulong, + if (type == PSMI_ENVVAR_TYPE_ULONG_FLAGS) { + FORMAT_RANGESTR(e_ulong, "0x%lx"); + _GETENV_PRINT(env, ret, "0x%lx", tval.e_ulong, defval.e_ulong); - else - _GETENV_PRINT(env, used_default, "%lu", tval.e_ulong, + } else { + FORMAT_RANGESTR(e_ulong, "%lu"); + _GETENV_PRINT(env, ret, "%lu", tval.e_ulong, defval.e_ulong); + } break; } +#undef FORMAT_RANGESTR +#undef _GETENV_CHECK #undef _GETENV_PRINT +#undef _CONVERT_TO_NUM *newval = tval; - if (! 
used_default) + switch (ret) { + case 0: // good input psm3_stats_print_env(name, env); + break; + case -1: // bad input, used default + // _GETENV_PRINT has set staterrstr + psm3_stats_print_env(name, statserrstr); + if (level_flags & PSMI_ENVVAR_FLAG_FATAL) { + // treat syntax or invalid input as fatal + psm3_handle_error(PSMI_EP_NORETURN, PSM2_PARAM_ERR, + "Invalid value for %s: '%s', can't proceed\n", + name, env); + } + break; + case 1: // no input, used default + // nothing special here + // as needed psm3_stats_initialize will log the stats controls + break; + } + return ret; +} +MOCK_DEF_EPILOGUE(psm3_getenv_range); + +int +MOCKABLE(psm3_getenv)(const char *name, const char *descr, int level, + int type, union psmi_envvar_val defval, + union psmi_envvar_val *newval) +{ + switch (type) { + case PSMI_ENVVAR_TYPE_YESNO: + return psm3_getenv_range(name, descr, NULL, (unsigned)level|PSMI_ENVVAR_FLAG_NOMIN_NOMAX, type, defval, (union psmi_envvar_val)0, (union psmi_envvar_val)1, NULL, NULL, newval); + break; + + case PSMI_ENVVAR_TYPE_STR: + return psm3_getenv_range(name, descr, NULL, (unsigned)level|PSMI_ENVVAR_FLAG_NOMIN_NOMAX, type, defval, (union psmi_envvar_val)NULL, (union psmi_envvar_val)NULL, NULL, NULL, newval); + break; - return used_default; + case PSMI_ENVVAR_TYPE_INT: + return psm3_getenv_range(name, descr, NULL, (unsigned)level|PSMI_ENVVAR_FLAG_NOMIN_NOMAX, type, defval, (union psmi_envvar_val)INT_MIN, (union psmi_envvar_val)INT_MAX, NULL, NULL, newval); + break; + + case PSMI_ENVVAR_TYPE_UINT: + case PSMI_ENVVAR_TYPE_UINT_FLAGS: + return psm3_getenv_range(name, descr, NULL, (unsigned)level|PSMI_ENVVAR_FLAG_NOMIN_NOMAX, type, defval, (union psmi_envvar_val)0, (union psmi_envvar_val)UINT_MAX, NULL, NULL, newval); + break; + + case PSMI_ENVVAR_TYPE_LONG: + return psm3_getenv_range(name, descr, NULL, (unsigned)level|PSMI_ENVVAR_FLAG_NOMIN_NOMAX, type, defval, (union psmi_envvar_val)LONG_MIN, (union psmi_envvar_val)LONG_MAX, NULL, NULL, newval); + break; + case PSMI_ENVVAR_TYPE_ULONG_ULONG: + return psm3_getenv_range(name, descr, NULL, (unsigned)level|PSMI_ENVVAR_FLAG_NOMIN_NOMAX, type, defval, (union psmi_envvar_val)0, (union psmi_envvar_val)ULLONG_MAX, NULL, NULL, newval); + break; + case PSMI_ENVVAR_TYPE_STR_VAL_PAT_INT: + case PSMI_ENVVAR_TYPE_STR_VAL_PAT_UINT: + case PSMI_ENVVAR_TYPE_STR_VAL_PAT_UINT_FLAGS: + case PSMI_ENVVAR_TYPE_STR_TUPLES: + return psm3_getenv_range(name, descr, NULL, (unsigned)level|PSMI_ENVVAR_FLAG_NOMIN_NOMAX, type, defval, (union psmi_envvar_val)NULL, (union psmi_envvar_val)NULL, NULL, NULL, newval); + break; + case PSMI_ENVVAR_TYPE_ULONG: + case PSMI_ENVVAR_TYPE_ULONG_FLAGS: + default: + return psm3_getenv_range(name, descr, NULL, (unsigned)level|PSMI_ENVVAR_FLAG_NOMIN_NOMAX, type, defval, (union psmi_envvar_val)0, (union psmi_envvar_val)ULONG_MAX, NULL, NULL, newval); + break; + } } MOCK_DEF_EPILOGUE(psm3_getenv); /* * Parsing int parameters * 0 -> ok, *val updated - * -1 -> empty string - * -2 -> parse error + * -1 -> empty string, *val not updated + * -2 -> parse error, *val not updated */ -int psm3_parse_str_int(const char *string, int *val) +int psm3_parse_str_int(const char *string, int *val, int min, int max) { - char *ep; - long ret; + int ret; + long long temp; - psmi_assert(val != NULL); - if (! string || ! *string) - return -1; - /* Avoid base 8 (octal) on purpose, so don't pass in 0 for radix */ - ret = strtol(string, &ep, 10); - if (! _CONSUMED_ALL(ep)) { - ret = strtol(string, &ep, 16); - if (! 
_CONSUMED_ALL(ep)) - return -2; - } - *val = ret; + if ((ret = convert_str_signed(string, &temp, min, max))) + return ret; + *val = (int)temp; return 0; } /* * Parsing uint parameters * 0 -> ok, *val updated - * -1 -> empty string - * -2 -> parse error + * -1 -> empty string, *val not updated + * -2 -> parse error, *val not updated */ -int psm3_parse_str_uint(const char *string, unsigned int *val) +int psm3_parse_str_uint(const char *string, unsigned int *val, + unsigned int min, unsigned int max) { - char *ep; - unsigned long ret; + int ret; + unsigned long long temp; - psmi_assert(val != NULL); - if (! string || ! *string) - return -1; - /* Avoid base 8 (octal) on purpose, so don't pass in 0 for radix */ - ret = strtoul(string, &ep, 10); - if (! _CONSUMED_ALL(ep)) { - ret = strtoul(string, &ep, 16); - if (! _CONSUMED_ALL(ep)) - return -2; - } - *val = ret; + if ((ret = convert_str_unsigned(string, &temp, min, max))) + return ret; + *val = (unsigned int)temp; return 0; } /* * Parsing long parameters - * -1 -> empty string - * -2 -> parse error + * Returns: + * 0 -> ok, *val updated + * -1 -> empty string, *val not updated + * -2 -> parse error, *val not updated */ -long psm3_parse_str_long(const char *string) +int psm3_parse_str_long(const char *string, long *val, long min, long max) { - char *ep; - long ret; + int ret; + long long temp; - if (! string || ! *string) - return -1; - /* Avoid base 8 (octal) on purpose, so don't pass in 0 for radix */ - ret = strtol(string, &ep, 10); - if (! _CONSUMED_ALL(ep)) { - ret = strtol(string, &ep, 16); - if (! _CONSUMED_ALL(ep)) - return -2; - } - return ret; + if ((ret = convert_str_signed(string, &temp, min, max))) + return ret; + *val = (long)temp; + return 0; } /* * Parsing yesno parameters * allows: yes/no, true/false, on/off, 1/0 - * -1 -> empty string - * -2 -> parse error + * Returns: + * 0 -> ok, *val updated + * *val = 0 - no selected + * *val = 1 - yes selected + * -1 -> empty string, *val not updated + * -2 -> parse error, *val not updated */ -int psm3_parse_str_yesno(const char *string) +int psm3_parse_str_yesno(const char *string, int *val) { + psmi_assert(val != NULL); if (! string || ! 
*string) return -1; else if (string[0] == 'Y' || string[0] == 'y' || string[0] == 'T' || string[0] == 't' || ((string[0] == 'O' || string[0] == 'o') - && (string[1] == 'n' || string[1] == 'N'))) - return 1; - else if (string[0] == 'N' || string[0] == 'n' + && (string[1] == 'n' || string[1] == 'N'))) { + *val = 1; + } else if (string[0] == 'N' || string[0] == 'n' || string[0] == 'F' || string[0] == 'f' || ((string[0] == 'O' || string[0] == 'o') - && (string[1] == 'f' || string[1] == 'F'))) - return 0; - else { - char *ep; - unsigned long temp; - temp = strtoul(string, &ep, 0); - if (!_CONSUMED_ALL(ep)) { - return -2; - } else if (temp != 0) { - return 1; + && (string[1] == 'f' || string[1] == 'F'))) { + *val = 0; + } else { + unsigned long long temp; + if (convert_str_unsigned(string, &temp, 0, UINT_MAX)) + return -2; // already checked for empty, so must be invalid value + *val = (temp != 0); + } + return 0; +} + +/* parse int env of the form 'val' or 'val:' or 'val:pattern' + * for PSM3_PRINT_STATS + * Returns: + * 0 - parsed and matches current process, *val set to parsed val + * 0 - parsed and doesn't match current process, *val set to def + * -1 - nothing provided, *val set to def + * -2 - syntax error, *val set to def + * flags PSMI_ENVVAR_FLAG_NOMIN, PSMI_ENVVAR_FLAG_NOMAX and + * PSMI_ENVVAR_FLAG_NOABBREV control if 'min', 'minimum', 'max' or 'maximum' + * allowed as input and indicate if min and/or max supplied. + */ +int psm3_parse_val_pattern_int(const char *env, int def, int *val, + unsigned flags, int min, int max) +{ + int ret = 0; + long long temp; + + psmi_assert(val != NULL); + if (!env || ! *env) { + *val = def; + ret = -1; + } else { + char *e = psmi_strdup(NULL, env); + char *p; + + if (flags & PSMI_ENVVAR_FLAG_NOMIN) + min = INT_MIN; + if (flags & PSMI_ENVVAR_FLAG_NOMAX) + max = INT_MAX; + + psmi_assert_always(e != NULL); + if (e == NULL) { // for klocwork + *val = def; + goto done; + } + p = strchr(e, ':'); + if (p) + *p = '\0'; + if (!(flags & (PSMI_ENVVAR_FLAG_NOMIN|PSMI_ENVVAR_FLAG_NOABBREV)) + && (!strcasecmp(e, "min") || !strcasecmp(e, "minimum"))) + *val = min; + else if (!(flags & (PSMI_ENVVAR_FLAG_NOMAX|PSMI_ENVVAR_FLAG_NOABBREV)) + && (!strcasecmp(e, "max") || !strcasecmp(e, "maximum"))) + *val = max; + else if (convert_str_signed(e, &temp, min, max)) { + *val = def; + ret = -2; } else { - return 0; + *val = (int)temp; } + if (ret == 0 && p) { + if (! *(p+1)) { // val: -> val:*:rank0 + if (psm3_get_myrank() != 0) + *val = def; +#ifdef FNM_EXTMATCH + } else if (0 != fnmatch(p+1, psm3_get_mylabel(), FNM_EXTMATCH )) { +#else + } else if (0 != fnmatch(p+1, psm3_get_mylabel(), 0 )) { +#endif + *val = def; + } + } + psmi_free(e); } +done: + return ret; } -/* parse env of the form 'val' or 'val:' or 'val:pattern' +/* parse unsigned env of the form 'val' or 'val:' or 'val:pattern' * for PSM3_VERBOSE_ENV, PSM3_TRACEMASK, PSM3_FI and PSM3_IDENITFY + * Returns: * 0 - parsed and matches current process, *val set to parsed val * 0 - parsed and doesn't match current process, *val set to def * -1 - nothing provided, *val set to def * -2 - syntax error, *val set to def + * flags PSMI_ENVVAR_FLAG_NOMIN, PSMI_ENVVAR_FLAG_NOMAX and + * PSMI_ENVVAR_FLAG_NOABBREV control if 'min', 'minimum', 'max' or 'maximum' + * allowed as input and indicate if min and/or max supplied. 
*/ -int psm3_parse_val_pattern(const char *env, unsigned def, unsigned *val) +int psm3_parse_val_pattern_uint(const char *env, unsigned def, unsigned *val, + unsigned flags, unsigned min, unsigned max) { int ret = 0; + unsigned long long temp; psmi_assert(val != NULL); if (!env || ! *env) { @@ -734,9 +1070,13 @@ int psm3_parse_val_pattern(const char *env, unsigned def, unsigned *val) ret = -1; } else { char *e = psmi_strdup(NULL, env); - char *ep; char *p; + if (flags & PSMI_ENVVAR_FLAG_NOMIN) + min = 0; + if (flags & PSMI_ENVVAR_FLAG_NOMAX) + max = UINT_MAX; + psmi_assert_always(e != NULL); if (e == NULL) { // for klocwork *val = def; @@ -745,11 +1085,19 @@ int psm3_parse_val_pattern(const char *env, unsigned def, unsigned *val) p = strchr(e, ':'); if (p) *p = '\0'; - *val = (int)strtoul(e, &ep, 0); - if (! _CONSUMED_ALL(ep)) { + if (!(flags & (PSMI_ENVVAR_FLAG_NOMIN|PSMI_ENVVAR_FLAG_NOABBREV)) + && (!strcasecmp(e, "min") || !strcasecmp(e, "minimum"))) + *val = min; + else if (!(flags & (PSMI_ENVVAR_FLAG_NOMAX|PSMI_ENVVAR_FLAG_NOABBREV)) + && (!strcasecmp(e, "max") || !strcasecmp(e, "maximum"))) + *val = max; + else if (convert_str_unsigned(e, &temp, min, max)) { *val = def; ret = -2; - } else if (p) { + } else { + *val = (unsigned)temp; + } + if (ret == 0 && p) { if (! *(p+1)) { // val: -> val:*:rank0 if (psm3_get_myrank() != 0) *val = def; @@ -777,11 +1125,11 @@ int psm3_parse_val_pattern(const char *env, unsigned def, unsigned *val) * It's valid for less than ntup values to be supplied, any unsupplied * fields are not updated in vals[] * Returns: - * 0 - parsed with no errors, vals[] updated - * -1 - empty or NULL string, vals[] unchanged - * -2 - syntax error in one of more of the parameters - * parameters with syntax errors are unchanged, others without - * syntax errors are updated in vals[] + * 0 - parsed with no errors, vals[] updated + * -1 - empty or NULL string, vals[] unchanged + * -2 - syntax error in one of more of the parameters + * parameters with syntax errors are unchanged, others without + * syntax errors are updated in vals[] */ int psm3_parse_str_tuples(const char *string, int ntup, int *vals) { @@ -804,17 +1152,14 @@ int psm3_parse_str_tuples(const char *string, int ntup, int *vals) while (*e && *e != ':') e++; if (e > b) { /* something to parse */ - char *ep; int len = e - b; - long int l; + long long temp; strncpy(buf, b, len); buf[len] = '\0'; - l = strtol(buf, &ep, 0); - if (ep != buf) { /* successful conversion */ - vals[tup_i] = (int)l; - } else { + if (convert_str_signed(buf, &temp, INT_MIN, INT_MAX)) ret = -2; - } + else + vals[tup_i] = (int)temp; } if (*e == ':') e++; /* skip delimiter */ diff --git a/prov/psm3/psm3/utils/utils_mallopt.c b/prov/psm3/psm3/utils/utils_mallopt.c index a821281cb00..830c1bbd22b 100644 --- a/prov/psm3/psm3/utils/utils_mallopt.c +++ b/prov/psm3/psm3/utils/utils_mallopt.c @@ -82,7 +82,8 @@ static void init_mallopt_disable_mmap(void) { // since this occurs before psm3_init, we can't use psm3_env_get // default to NO (0) - if (psm3_parse_str_yesno(getenv("PSM3_DISABLE_MMAP_MALLOC")) > 0) { + int disable = 0; + if (!psm3_parse_str_yesno(getenv("PSM3_DISABLE_MMAP_MALLOC"), &disable) && disable) { if (mallopt(M_MMAP_MAX, 0) && mallopt(M_TRIM_THRESHOLD, -1)) { psm3_malloc_no_mmap = 1; } diff --git a/prov/psm3/src/psmx3_attr.c b/prov/psm3/src/psmx3_attr.c index 7c5a61a8031..fc3663f6133 100644 --- a/prov/psm3/src/psmx3_attr.c +++ b/prov/psm3/src/psmx3_attr.c @@ -272,17 +272,87 @@ static uint64_t psmx3_check_fi_hmem_cap(void) { int gpu = 0; 
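	/*
	 * A minimal sketch of the bounded-parse contract used just below
	 * (the variable and environment name in this sketch are hypothetical):
	 * the reworked helpers return 0 and update the output only when the
	 * string parses cleanly and falls inside [min, max]; on -1 (empty) or
	 * -2 (syntax/range error) the output keeps its pre-loaded default.
	 *
	 *   unsigned depth = 8;                                // default
	 *   (void)psm3_parse_str_uint(psm3_env_get("PSM3_EXAMPLE_DEPTH"),
	 *                             &depth, 1, 64);
	 *   // depth stays 8 unless PSM3_EXAMPLE_DEPTH was a valid 1..64 value
	 */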
unsigned int gpudirect = 0; #ifdef PSM_CUDA - (void)psm3_parse_str_int(psm3_env_get("PSM3_CUDA"), &gpu); + (void)psm3_parse_str_int(psm3_env_get("PSM3_CUDA"), &gpu, INT_MIN, INT_MAX); #else /* PSM_ONEAPI */ - (void)psm3_parse_str_int(psm3_env_get("PSM3_ONEAPI_ZE"), &gpu); + (void)psm3_parse_str_int(psm3_env_get("PSM3_ONEAPI_ZE"), &gpu, + INT_MIN, INT_MAX); #endif - (void)psm3_parse_str_uint(psm3_env_get("PSM3_GPUDIRECT"), &gpudirect); + (void)psm3_parse_str_uint(psm3_env_get("PSM3_GPUDIRECT"), &gpudirect, + 0, UINT_MAX); if ((gpu || gpudirect) && !ofi_hmem_p2p_disabled()) return FI_HMEM; #endif /* PSM_CUDA || PSM_ONEAPI */ return 0; } +static uint64_t get_max_inject_size(void) { + unsigned int thresh_rv; + unsigned int temp; + int have_shm = 1; + int have_nic = 1; + int devid_enabled[PTL_MAX_INIT]; + + // check PSM3_DEVICES to determine if PSM3 shm enabled + if ((PSM2_OK == psm3_parse_devices(devid_enabled))) { + have_shm = psm3_device_is_enabled(devid_enabled, PTL_DEVID_AMSH); + have_nic = psm3_device_is_enabled(devid_enabled, PTL_DEVID_IPS); + } + + // figure out the smallest rendezvous threshold (GPU vs CPU ips vs shm) + // If middleware above is not using PSM3 for shm but leaves it in + // PSM3_DEVICES, this could be more restrictive than necessary, + // but it's safe. Note that PSM3_DEVICES can't be set per EP open. + // Also not yet sure which HAL will be selected so must pick most + // conservative ips (NIC) config + thresh_rv = 65536; // default in odd case of PSM3_DEVICES=self + + if (have_nic) { + temp = PSM_MQ_NIC_RNDV_THRESH; + psm3_parse_str_uint(psm3_env_get("PSM3_MQ_RNDV_NIC_THRESH"), &temp, + 0, UINT_MAX); + if (thresh_rv > temp) + thresh_rv = temp; + } + + if (have_shm) { + temp = MQ_SHM_THRESH_RNDV; + psm3_parse_str_uint(psm3_env_get("PSM3_MQ_RNDV_SHM_THRESH"), &temp, + 0, UINT_MAX); + if (thresh_rv > temp) + thresh_rv = temp; + } + +#if defined(PSM_CUDA) || defined(PSM_ONEAPI) + if (psmx3_prov_info.caps & FI_HMEM) { + if (have_nic) { + // GPU ips rendezvous threshold + // sockets HAL avoids rendezvous, so this may be overly restrictive + temp = GPU_THRESH_RNDV; + // PSM3_CUDA_THRESH_RNDV depricated, use PSM3_GPU_THRESH_RNDV if set + psm3_parse_str_uint(psm3_env_get("PSM3_CUDA_THRESH_RNDV"), &temp, + 0, UINT_MAX); + psm3_parse_str_uint(psm3_env_get("PSM3_GPU_THRESH_RNDV"), &temp, + 0, UINT_MAX); + if (thresh_rv > temp) + thresh_rv = temp; + } + + if (have_shm) { + // GPU shm rendezvous threshold + temp = MQ_SHM_GPU_THRESH_RNDV; + psm3_parse_str_uint(psm3_env_get("PSM3_MQ_RNDV_SHM_GPU_THRESH"), &temp, + 0, UINT_MAX); + if (thresh_rv > temp) + thresh_rv = temp; + } + } +#endif + + // messages <= thresh_rv guaranteed to use eager, so thresh_rv + // is the max allowed inject_size. 
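	/*
	 * Worked example (the numbers are hypothetical): with both shm and NIC
	 * enabled, PSM3_MQ_RNDV_NIC_THRESH=8000 in the environment and the shm
	 * threshold left at its default, thresh_rv ends up as
	 * min(8000, MQ_SHM_THRESH_RNDV), and psmx3_update_prov_info() will
	 * clamp any larger FI_PSM3_INJECT_SIZE down to that value.
	 */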
+ return thresh_rv; +} + /* * Possible provider variations: * @@ -496,6 +566,8 @@ void psmx3_update_prov_info(struct fi_info *info, struct psmx3_ep_name *dest_addr) { struct fi_info *p; + unsigned int max_inject_size; + unsigned int inject_size; for (p = info; p; p = p->next) { psmx3_dup_addr(p->addr_format, src_addr, @@ -506,6 +578,15 @@ void psmx3_update_prov_info(struct fi_info *info, psmx3_expand_default_unit(info); + max_inject_size = get_max_inject_size(); + if (psmx3_env.inject_size > max_inject_size) + inject_size = max_inject_size; + else + inject_size = psmx3_env.inject_size; + PSMX3_INFO(&psmx3_prov, FI_LOG_CORE, + "Using inject_size=%u based on FI_PSM3_INJECT_SIZE=%u with max %u\n", + inject_size, psmx3_env.inject_size, max_inject_size); + for (p = info; p; p = p->next) { int unit = ((struct psmx3_ep_name *)p->src_addr)->unit; int port = ((struct psmx3_ep_name *)p->src_addr)->port; @@ -539,7 +620,7 @@ void psmx3_update_prov_info(struct fi_info *info, int addr_index = psmx3_domain_info.addr_index[unit]; args[0].unit = unit_id; - args[1].port = port; + args[1].port = port == PSMX3_DEFAULT_PORT ? 1 : port; args[2].addr_index = addr_index; args[3].length = sizeof(unit_name); @@ -571,7 +652,7 @@ void psmx3_update_prov_info(struct fi_info *info, int addr_index = psmx3_domain_info.addr_index[unit]; args[0].unit = unit_id; - args[1].port = port; + args[1].port = port == PSMX3_DEFAULT_PORT ? 1 : port; args[2].addr_index = addr_index; args[3].length = sizeof(fabric_name); @@ -591,7 +672,7 @@ void psmx3_update_prov_info(struct fi_info *info, } } - p->tx_attr->inject_size = psmx3_env.inject_size; + p->tx_attr->inject_size = inject_size; } } diff --git a/prov/psm3/src/psmx3_cq.c b/prov/psm3/src/psmx3_cq.c index f1a10349dce..b072eb230df 100644 --- a/prov/psm3/src/psmx3_cq.c +++ b/prov/psm3/src/psmx3_cq.c @@ -622,8 +622,10 @@ psmx3_mq_status_copy(struct psm2_mq_req_user *req, void *status_array, int entry data = PSMX3_GET_CQDATA(PSMX3_STATUS_TAG(req)); if (PSMX3_HAS_IMM(PSMX3_GET_FLAGS(PSMX3_STATUS_TAG(req)))) flags |= FI_REMOTE_CQ_DATA; - if (multi_recv_req->offset + PSMX3_STATUS_RCVLEN(req) + - multi_recv_req->min_buf_size > multi_recv_req->len) + len_remaining = multi_recv_req->len - multi_recv_req->offset - + PSMX3_STATUS_RCVLEN(req); + if (len_remaining < multi_recv_req->min_buf_size || + len_remaining == 0) flags |= FI_MULTI_RECV; /* buffer used up */ err = psmx3_cq_rx_complete( status_data->poll_cq, ep->recv_cq, ep->av, @@ -638,7 +640,8 @@ psmx3_mq_status_copy(struct psm2_mq_req_user *req, void *status_array, int entry /* repost multi-recv buffer */ multi_recv_req->offset += PSMX3_STATUS_RCVLEN(req); len_remaining = multi_recv_req->len - multi_recv_req->offset; - if (len_remaining >= multi_recv_req->min_buf_size) { + if (len_remaining >= multi_recv_req->min_buf_size && + len_remaining > 0) { if (len_remaining > PSMX3_MAX_MSG_SIZE) len_remaining = PSMX3_MAX_MSG_SIZE; err = psm3_mq_irecv2(ep->rx->psm2_mq, @@ -786,7 +789,8 @@ psmx3_mq_status_copy(struct psm2_mq_req_user *req, void *status_array, int entry multi_recv_req = PSMX3_CTXT_USER(fi_context); multi_recv_req->offset += PSMX3_STATUS_RCVLEN(req); len_remaining = multi_recv_req->len - multi_recv_req->offset; - if (len_remaining >= multi_recv_req->min_buf_size) { + if (len_remaining >= multi_recv_req->min_buf_size && + len_remaining > 0) { if (len_remaining > PSMX3_MAX_MSG_SIZE) len_remaining = PSMX3_MAX_MSG_SIZE; err = psm3_mq_irecv2(ep->rx->psm2_mq, diff --git a/prov/psm3/src/psmx3_init.c b/prov/psm3/src/psmx3_init.c index 
c263446fd64..c20035a84de 100644 --- a/prov/psm3/src/psmx3_init.c +++ b/prov/psm3/src/psmx3_init.c @@ -320,11 +320,11 @@ static int psmx3_check_multi_ep_cap(void) { uint64_t caps = PSM2_MULTI_EP_CAP; char *s = NULL; + int val = 1; /* if parses as empty (-1) or invalid (-2), use default of 1 */ s = psm3_env_get("PSM3_MULTI_EP"); - /* if parses as empty or invalid (-1), use default of 1 */ - /* psm3 below us will provide warning as needed when it parses it */ - if (psm3_get_capability_mask(caps) == caps && 0 != psm3_parse_str_yesno(s)) + /* psm3 below us will provide warning as needed when it parses it again */ + if (psm3_get_capability_mask(caps) == caps && (psm3_parse_str_yesno(s, &val) || val)) psmx3_env.multi_ep = 1; else psmx3_env.multi_ep = 0; @@ -438,7 +438,7 @@ static int psmx3_update_hfi_info(void) // if parses as empty or invalid (-1), use default of 0 */ // PSM3 below us will provide warning as needed when it parses it s = psm3_env_get("PSM3_MULTIRAIL"); - (void)psm3_parse_str_int(s, &multirail); + (void)psm3_parse_str_int(s, &multirail, INT_MIN, INT_MAX); psmx3_domain_info.num_reported_units = 0; psmx3_domain_info.num_active_units = 0; @@ -699,6 +699,7 @@ static void psmx3_update_nic_info(struct fi_info *info) } } +static int init_calls; static int psmx3_getinfo(uint32_t api_version, const char *node, const char *service, uint64_t flags, const struct fi_info *hints, struct fi_info **info) @@ -740,6 +741,8 @@ static int psmx3_getinfo(uint32_t api_version, const char *node, goto err_out; } + init_calls += 1; + /* when available, default domain and fabric names are a superset * of all individual names, so we can do a substr search as a 1st level * filter @@ -872,6 +875,9 @@ static int psmx3_getinfo(uint32_t api_version, const char *node, *info = prov_info; free(src_addr); free(dest_addr); + if (hints || init_calls >= 2) { + psm3_turn_off_init_cache(); + } return 0; err_out: diff --git a/prov/psm3/src/psmx3_msg.c b/prov/psm3/src/psmx3_msg.c index 3fe17a6bf73..519593def74 100644 --- a/prov/psm3/src/psmx3_msg.c +++ b/prov/psm3/src/psmx3_msg.c @@ -225,7 +225,7 @@ ssize_t psmx3_send_generic(struct fid_ep *ep, const void *buf, size_t len, return -FI_EMSGSIZE; err = psm3_mq_send2(ep_priv->tx->psm2_mq, psm2_epaddr, - send_flag, &psm2_tag, buf, len); + send_flag|PSM2_MQ_FLAG_INJECT, &psm2_tag, buf, len); if (err != PSM2_OK) return psmx3_errno(err); @@ -374,7 +374,7 @@ ssize_t psmx3_sendv_generic(struct fid_ep *ep, const struct iovec *iov, } err = psm3_mq_send2(ep_priv->tx->psm2_mq, psm2_epaddr, - send_flag, &psm2_tag, req->buf, len); + send_flag|PSM2_MQ_FLAG_INJECT, &psm2_tag, req->buf, len); free(req); diff --git a/prov/psm3/src/psmx3_tagged.c b/prov/psm3/src/psmx3_tagged.c index 17caec29533..41475dc211c 100644 --- a/prov/psm3/src/psmx3_tagged.c +++ b/prov/psm3/src/psmx3_tagged.c @@ -551,7 +551,7 @@ ssize_t psmx3_tagged_send_generic(struct fid_ep *ep, return -FI_EMSGSIZE; err = psm3_mq_send2(ep_priv->tx->psm2_mq, psm2_epaddr, - 0, &psm2_tag, buf, len); + PSM2_MQ_FLAG_INJECT, &psm2_tag, buf, len); if (err != PSM2_OK) return psmx3_errno(err); @@ -764,8 +764,8 @@ psmx3_tagged_inject_specialized(struct fid_ep *ep, const void *buf, else PSMX3_SET_TAG(psm2_tag, tag, ep_priv->sep_id, PSMX3_TYPE_TAGGED); - err = psm3_mq_send2(ep_priv->tx->psm2_mq, psm2_epaddr, 0, - &psm2_tag, buf, len); + err = psm3_mq_send2(ep_priv->tx->psm2_mq, psm2_epaddr, + PSM2_MQ_FLAG_INJECT, &psm2_tag, buf, len); if (err != PSM2_OK) return psmx3_errno(err); @@ -915,7 +915,7 @@ ssize_t psmx3_tagged_sendv_generic(struct fid_ep 
*ep, } err = psm3_mq_send2(ep_priv->tx->psm2_mq, psm2_epaddr, - send_flag, &psm2_tag, req->buf, len); + send_flag|PSM2_MQ_FLAG_INJECT, &psm2_tag, req->buf, len); free(req); diff --git a/prov/rxm/src/rxm.h b/prov/rxm/src/rxm.h index 1164e72cbf0..bf53299ce1f 100644 --- a/prov/rxm/src/rxm.h +++ b/prov/rxm/src/rxm.h @@ -205,6 +205,7 @@ extern size_t rxm_cq_eq_fairness; extern int rxm_passthru; extern int force_auto_progress; extern int rxm_use_write_rndv; +extern int rxm_detect_hmem_iface; extern enum fi_wait_obj def_wait_obj, def_tcp_wait_obj; struct rxm_ep; @@ -309,11 +310,22 @@ struct rxm_mr { }; static inline enum fi_hmem_iface -rxm_mr_desc_to_hmem_iface_dev(void **desc, size_t count, uint64_t *device) +rxm_iov_desc_to_hmem_iface_dev(const struct iovec *iov, void **desc, + size_t count, uint64_t *device) { - if (!count || !desc || !desc[0]) { + enum fi_hmem_iface iface = FI_HMEM_SYSTEM; + + if (!count) { *device = 0; - return FI_HMEM_SYSTEM; + return iface; + } + + if (!desc || !desc[0]) { + if (rxm_detect_hmem_iface) + iface = ofi_get_hmem_iface(iov[0].iov_base, device, NULL); + else + *device = 0; + return iface; } *device = ((struct rxm_mr *) desc[0])->device; diff --git a/prov/rxm/src/rxm_atomic.c b/prov/rxm/src/rxm_atomic.c index 9fe0e1f8b0d..2c9711910d8 100644 --- a/prov/rxm/src/rxm_atomic.c +++ b/prov/rxm/src/rxm_atomic.c @@ -124,9 +124,10 @@ rxm_ep_atomic_common(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn, datatype_sz); buf_len = ofi_total_iov_len(buf_iov, msg->iov_count); - buf_iface = rxm_mr_desc_to_hmem_iface_dev(msg->desc, - msg->iov_count, - &buf_device); + buf_iface = rxm_iov_desc_to_hmem_iface_dev(buf_iov, + msg->desc, + msg->iov_count, + &buf_device); } if (op == ofi_op_atomic_compare) { @@ -136,9 +137,10 @@ rxm_ep_atomic_common(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn, cmp_len = ofi_total_iov_len(cmp_iov, compare_iov_count); assert(buf_len == cmp_len); - cmp_iface = rxm_mr_desc_to_hmem_iface_dev(compare_desc, - compare_iov_count, - &cmp_device); + cmp_iface = rxm_iov_desc_to_hmem_iface_dev(cmp_iov, + compare_desc, + compare_iov_count, + &cmp_device); } data_len = buf_len + cmp_len + sizeof(struct rxm_atomic_hdr); diff --git a/prov/rxm/src/rxm_cq.c b/prov/rxm/src/rxm_cq.c index 2b4e169ba6c..8ba24f9a544 100644 --- a/prov/rxm/src/rxm_cq.c +++ b/prov/rxm/src/rxm_cq.c @@ -378,9 +378,10 @@ static void rxm_process_seg_data(struct rxm_rx_buf *rx_buf, int *done) uint64_t device; ssize_t done_len; - iface = rxm_mr_desc_to_hmem_iface_dev(rx_buf->recv_entry->rxm_iov.desc, - rx_buf->recv_entry->rxm_iov.count, - &device); + iface = rxm_iov_desc_to_hmem_iface_dev(rx_buf->recv_entry->rxm_iov.iov, + rx_buf->recv_entry->rxm_iov.desc, + rx_buf->recv_entry->rxm_iov.count, + &device); done_len = ofi_copy_to_hmem_iov(iface, device, rx_buf->recv_entry->rxm_iov.iov, @@ -629,6 +630,7 @@ void rxm_handle_eager(struct rxm_rx_buf *rx_buf) rx_buf->recv_entry->rxm_iov.desc, rx_buf->data, rx_buf->pkt.hdr.size, rx_buf->recv_entry->rxm_iov.iov, rx_buf->recv_entry->rxm_iov.count, 0); + assert((size_t) done_len == rx_buf->pkt.hdr.size); rxm_finish_recv(rx_buf, done_len); @@ -640,9 +642,10 @@ void rxm_handle_coll_eager(struct rxm_rx_buf *rx_buf) uint64_t device; ssize_t done_len; - iface = rxm_mr_desc_to_hmem_iface_dev(rx_buf->recv_entry->rxm_iov.desc, - rx_buf->recv_entry->rxm_iov.count, - &device); + iface = rxm_iov_desc_to_hmem_iface_dev(rx_buf->recv_entry->rxm_iov.iov, + rx_buf->recv_entry->rxm_iov.desc, + rx_buf->recv_entry->rxm_iov.count, + &device); done_len = ofi_copy_to_hmem_iov(iface, 
device, rx_buf->recv_entry->rxm_iov.iov, @@ -1247,9 +1250,10 @@ static ssize_t rxm_handle_atomic_resp(struct rxm_ep *rxm_ep, " msg_id: 0x%" PRIx64 "\n", rx_buf->pkt.hdr.op, rx_buf->pkt.ctrl_hdr.msg_id); - iface = rxm_mr_desc_to_hmem_iface_dev(tx_buf->atomic_result.desc, - tx_buf->atomic_result.count, - &device); + iface = rxm_iov_desc_to_hmem_iface_dev(tx_buf->atomic_result.iov, + tx_buf->atomic_result.desc, + tx_buf->atomic_result.count, + &device); assert(!(rx_buf->comp_flags & ~(FI_RECV | FI_REMOTE_CQ_DATA))); diff --git a/prov/rxm/src/rxm_domain.c b/prov/rxm/src/rxm_domain.c index 54d0b6f4b1f..055fca16bea 100644 --- a/prov/rxm/src/rxm_domain.c +++ b/prov/rxm/src/rxm_domain.c @@ -474,12 +474,24 @@ int rxm_msg_mr_reg_internal(struct rxm_domain *rxm_domain, const void *buf, size_t len, uint64_t acs, uint64_t flags, struct fid_mr **mr) { int ret, tries = 0; + struct iovec iov = { + .iov_base = (void *)buf, + .iov_len = len, + }; + struct fi_mr_attr attr = { + .mr_iov = &iov, + .iov_count = 1, + .access = acs, + .iface = FI_HMEM_SYSTEM, + }; + + if (rxm_detect_hmem_iface) + attr.iface = ofi_get_hmem_iface(buf, &attr.device.reserved, NULL); /* If we can't get a key within 1024 tries, give up */ do { - ret = fi_mr_reg(rxm_domain->msg_domain, buf, len, acs, 0, - rxm_domain->mr_key++ | FI_PROV_SPECIFIC, - flags, mr, NULL); + attr.requested_key = rxm_domain->mr_key++ | (1UL << 31); + ret = fi_mr_regattr(rxm_domain->msg_domain, &attr, flags, mr); } while (ret == -FI_ENOKEY && tries++ < 1024); return ret; @@ -771,7 +783,7 @@ static ssize_t rxm_send_credits(struct fid_ep *ep, uint64_t credits) msg.context = tx_buf; msg.desc = &tx_buf->hdr.desc; - ret = fi_sendmsg(ep, &msg, FI_PRIORITY); + ret = fi_sendmsg(ep, &msg, OFI_PRIORITY); if (!ret) return FI_SUCCESS; diff --git a/prov/rxm/src/rxm_ep.c b/prov/rxm/src/rxm_ep.c index f3eb7bf549d..c458af354b4 100644 --- a/prov/rxm/src/rxm_ep.c +++ b/prov/rxm/src/rxm_ep.c @@ -960,7 +960,7 @@ void rxm_ep_progress_deferred_queue(struct rxm_ep *rxm_ep, msg.msg_iov = &iov; ret = fi_sendmsg(def_tx_entry->rxm_conn->msg_ep, &msg, - FI_PRIORITY); + OFI_PRIORITY); if (ret) { if (ret != -FI_EAGAIN) { rxm_cq_write_error( diff --git a/prov/rxm/src/rxm_init.c b/prov/rxm/src/rxm_init.c index 3ca3c22593f..a29c530b0d3 100644 --- a/prov/rxm/src/rxm_init.c +++ b/prov/rxm/src/rxm_init.c @@ -58,6 +58,7 @@ size_t rxm_packet_size; int rxm_passthru = 0; /* disable by default, need to analyze performance */ int force_auto_progress; int rxm_use_write_rndv; +int rxm_detect_hmem_iface; enum fi_wait_obj def_wait_obj = FI_WAIT_FD, def_tcp_wait_obj = FI_WAIT_UNSPEC; char *rxm_proto_state_str[] = { @@ -700,6 +701,11 @@ RXM_INI "to the tcp provider, depending on the capabilities " "requested by the application."); + fi_param_define(&rxm_prov, "detect_hmem_iface", FI_PARAM_BOOL, + "Detect iface for user buffers with NULL desc passed " + "in. This allows such buffers be copied or registered " + "internally by RxM. (default: false)."); + /* passthru supported disabled - to re-enable would need to fix call to * fi_cq_read to pass in the correct data structure. However, passthru * will not be needed at all with in-work tcp changes. 
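/*
 * Sketch of what the new knob changes for a receive posted with a NULL
 * desc (the buffer and length names here are hypothetical):
 *
 *   struct iovec iov = { .iov_base = user_buf, .iov_len = user_len };
 *   uint64_t device;
 *   enum fi_hmem_iface iface =
 *           rxm_iov_desc_to_hmem_iface_dev(&iov, NULL, 1, &device);
 *
 * With detect_hmem_iface left at its default (false) this keeps the old
 * behavior and returns FI_HMEM_SYSTEM; when enabled, the iface comes from
 * ofi_get_hmem_iface() on the buffer address, so device memory passed
 * without a desc can still be copied or registered correctly.
 */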
@@ -725,6 +731,8 @@ RXM_INI "(FI_OFI_RXM_DATA_AUTO_PROGRESS = 1), domain threading " "level would be set to FI_THREAD_SAFE\n"); + fi_param_get_bool(&rxm_prov, "detect_hmem_iface", &rxm_detect_hmem_iface); + #if HAVE_RXM_DL ofi_mem_init(); ofi_hmem_init(); diff --git a/prov/rxm/src/rxm_msg.c b/prov/rxm/src/rxm_msg.c index 39af82972bc..fdd036e7d32 100644 --- a/prov/rxm/src/rxm_msg.c +++ b/prov/rxm/src/rxm_msg.c @@ -473,7 +473,7 @@ rxm_send_sar(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn, ssize_t ret; assert(segs_cnt >= 2); - iface = rxm_mr_desc_to_hmem_iface_dev(desc, count, &device); + iface = rxm_iov_desc_to_hmem_iface_dev(iov, desc, count, &device); first_tx_buf = rxm_init_segment(rxm_ep, rxm_conn, context, data_len, rxm_buffer_size, @@ -709,7 +709,7 @@ rxm_send_common(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn, (data_len > rxm_ep->rxm_info->tx_attr->inject_size)) || (data_len <= rxm_ep->rxm_info->tx_attr->inject_size)); - iface = rxm_mr_desc_to_hmem_iface_dev(desc, count, &device); + iface = rxm_iov_desc_to_hmem_iface_dev(iov, desc, count, &device); if (iface == FI_HMEM_ZE || iface == FI_HMEM_SYNAPSEAI) goto rndv_send; diff --git a/prov/shm/src/smr_ep.c b/prov/shm/src/smr_ep.c index d786a06b0a7..8b2884e3fa4 100644 --- a/prov/shm/src/smr_ep.c +++ b/prov/shm/src/smr_ep.c @@ -811,6 +811,21 @@ static void smr_cleanup_epoll(struct smr_sock_info *sock_info) ofi_epoll_close(sock_info->epollfd); } +static void smr_free_sock_info(struct smr_ep *ep) +{ + int i, j; + + for (i = 0; i < SMR_MAX_PEERS; i++) { + if (!ep->sock_info->peers[i].device_fds) + continue; + for (j = 0; j < ep->sock_info->nfds; j++) + close(ep->sock_info->peers[i].device_fds[j]); + free(ep->sock_info->peers[i].device_fds); + } + free(ep->sock_info); + ep->sock_info = NULL; +} + static int smr_ep_close(struct fid *fid) { struct smr_ep *ep; @@ -826,11 +841,16 @@ static int smr_ep_close(struct fid *fid) close(ep->sock_info->listen_sock); unlink(ep->sock_info->name); smr_cleanup_epoll(ep->sock_info); - free(ep->sock_info); + smr_free_sock_info(ep); } - if (ep->srx && ep->util_ep.ep_fid.msg != &smr_no_recv_msg_ops) - (void) util_srx_close(&ep->srx->fid); + if (ep->srx) { + /* shm is an owner provider */ + if (ep->util_ep.ep_fid.msg != &smr_no_recv_msg_ops) + (void) util_srx_close(&ep->srx->fid); + else /* shm is a peer provider */ + free(ep->srx); + } ofi_endpoint_close(&ep->util_ep); @@ -1169,19 +1189,6 @@ void smr_ep_exchange_fds(struct smr_ep *ep, int64_t id) SMR_CMAP_FAILED : SMR_CMAP_SUCCESS; } -static void smr_free_sock_info(struct smr_ep *ep) -{ - int i, j; - - for (i = 0; i < SMR_MAX_PEERS; i++) { - for (j = 0; j < ep->sock_info->nfds; j++) - close(ep->sock_info->peers[i].device_fds[j]); - free(ep->sock_info->peers[i].device_fds); - } - free(ep->sock_info); - ep->sock_info = NULL; -} - static void smr_init_ipc_socket(struct smr_ep *ep) { struct smr_sock_name *sock_name; @@ -1259,6 +1266,15 @@ static void smr_init_ipc_socket(struct smr_ep *ep) static int smr_discard(struct fi_peer_rx_entry *rx_entry) { struct smr_cmd_ctx *cmd_ctx = rx_entry->peer_context; + struct smr_region *peer_smr; + struct smr_resp *resp; + + if (cmd_ctx->cmd.msg.hdr.src_data >= smr_src_iov) { + peer_smr = smr_peer_region(cmd_ctx->ep->region, + cmd_ctx->cmd.msg.hdr.id); + resp = smr_get_ptr(peer_smr, cmd_ctx->cmd.msg.hdr.src_data); + resp->status = SMR_STATUS_SUCCESS; + } ofi_buf_free(cmd_ctx); @@ -1348,7 +1364,6 @@ static int smr_ep_ctrl(struct fid *fid, int command, void *arg) struct smr_domain *domain; struct smr_ep *ep; struct smr_av *av; 
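/*
 * One reading of the smr_discard() change above: for commands whose payload
 * is delivered via the iov/sar paths (src_data >= smr_src_iov), the sender
 * is still waiting on a response slot in its own region, so marking
 * resp->status as SMR_STATUS_SUCCESS lets the sender complete even though
 * the receiver drops the data without copying it.
 */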
- struct fid_peer_srx *srx; int ret; ep = container_of(fid, struct smr_ep, util_ep.ep_fid.fid); @@ -1405,12 +1420,6 @@ static int smr_ep_ctrl(struct fid *fid, int command, void *arg) if (ret) return ret; } else { - srx = calloc(1, sizeof(*srx)); - srx->peer_ops = &smr_srx_peer_ops; - srx->owner_ops = smr_get_peer_srx(ep)->owner_ops; - srx->ep_fid.fid.context = - smr_get_peer_srx(ep)->ep_fid.fid.context; - ep->srx = &srx->ep_fid; ep->util_ep.ep_fid.msg = &smr_no_recv_msg_ops; ep->util_ep.ep_fid.tagged = &smr_no_recv_tag_ops; } diff --git a/prov/shm/src/smr_progress.c b/prov/shm/src/smr_progress.c index fd271453d5b..b30918e6986 100644 --- a/prov/shm/src/smr_progress.c +++ b/prov/shm/src/smr_progress.c @@ -99,8 +99,6 @@ static int smr_progress_resp_entry(struct smr_ep *ep, struct smr_resp *resp, break; case smr_src_ipc: assert(pending->mr[0]); - if (pending->mr[0]->iface == FI_HMEM_ZE) - close(pending->fd); break; case smr_src_sar: sar_buf = smr_freestack_get_entry_from_index( @@ -654,7 +652,7 @@ static void smr_do_atomic(void *src, struct ofi_mr *dst_mr, void *dst, } if (flags & SMR_RMA_REQ) - memcpy(src, op == FI_ATOMIC_READ ? tmp_dst : tmp_result, + memcpy(src, op == FI_ATOMIC_READ ? cpy_dst : tmp_result, cnt * ofi_datatype_size(datatype)); if (cpy_dst != dst) { diff --git a/prov/shm/src/smr_signal.h b/prov/shm/src/smr_signal.h index d30af3acce5..164a96714d8 100644 --- a/prov/shm/src/smr_signal.h +++ b/prov/shm/src/smr_signal.h @@ -46,14 +46,19 @@ static void smr_handle_signal(int signum, siginfo_t *info, void *ucontext) struct smr_sock_name *sock_name; int ret; + pthread_mutex_lock(&ep_list_lock); dlist_foreach_container(&ep_name_list, struct smr_ep_name, ep_name, entry) { shm_unlink(ep_name->name); } + pthread_mutex_unlock(&ep_list_lock); + + pthread_mutex_lock(&sock_list_lock); dlist_foreach_container(&sock_name_list, struct smr_sock_name, sock_name, entry) { unlink(sock_name->name); } + pthread_mutex_unlock(&sock_list_lock); /* Register the original signum handler, SIG_DFL or otherwise */ ret = sigaction(signum, &old_action[signum], NULL); diff --git a/prov/shm/src/smr_util.c b/prov/shm/src/smr_util.c index 16f157b5e80..85e76f6bcab 100644 --- a/prov/shm/src/smr_util.c +++ b/prov/shm/src/smr_util.c @@ -393,9 +393,6 @@ int smr_map_to_region(const struct fi_provider *prov, struct smr_map *map, const char *name = smr_no_prefix(peer_buf->peer.name); char tmp[SMR_PATH_MAX]; - if (peer_buf->region) - return FI_SUCCESS; - pthread_mutex_lock(&ep_list_lock); entry = dlist_find_first_match(&ep_name_list, smr_match_name, name); if (entry) { @@ -406,10 +403,16 @@ int smr_map_to_region(const struct fi_provider *prov, struct smr_map *map, } pthread_mutex_unlock(&ep_list_lock); + ofi_spin_lock(&map->lock); + if (peer_buf->region) + goto unlock; + fd = shm_open(name, O_RDWR, S_IRUSR | S_IWUSR); if (fd < 0) { - FI_WARN_ONCE(prov, FI_LOG_AV, "shm_open error\n"); - return -errno; + ret = -errno; + FI_WARN_ONCE(prov, FI_LOG_AV, + "shm_open error: name %s errno %d\n", name, errno); + goto unlock; } memset(tmp, 0, sizeof(tmp)); @@ -454,6 +457,8 @@ int smr_map_to_region(const struct fi_provider *prov, struct smr_map *map, out: close(fd); +unlock: + ofi_spin_unlock(&map->lock); return ret; } @@ -555,13 +560,12 @@ int smr_map_add(const struct fi_provider *prov, struct smr_map *map, strncpy(map->peers[*id].peer.name, name, SMR_NAME_MAX); map->peers[*id].peer.name[SMR_NAME_MAX - 1] = '\0'; map->peers[*id].region = NULL; + map->num_peers++; + ofi_spin_unlock(&map->lock); ret = smr_map_to_region(prov, map, *id); if 
(!ret) map->peers[*id].peer.id = *id; - - map->num_peers++; - ofi_spin_unlock(&map->lock); return ret == -ENOENT ? 0 : ret; } @@ -569,8 +573,7 @@ void smr_map_del(struct smr_map *map, int64_t id) { struct dlist_entry *entry; - if (id >= SMR_MAX_PEERS || id < 0 || map->peers[id].peer.id < 0) - return; + assert(id >= 0 && id < SMR_MAX_PEERS); pthread_mutex_lock(&ep_list_lock); entry = dlist_find_first_match(&ep_name_list, smr_match_name, @@ -578,6 +581,9 @@ void smr_map_del(struct smr_map *map, int64_t id) pthread_mutex_unlock(&ep_list_lock); ofi_spin_lock(&map->lock); + if (map->peers[id].peer.id < 0) + goto unlock; + if (!entry) { if (map->flags & SMR_FLAG_HMEM_ENABLED) (void) ofi_hmem_host_unregister(map->peers[id].region); @@ -591,6 +597,7 @@ void smr_map_del(struct smr_map *map, int64_t id) map->peers[id].peer.id = -1; map->num_peers--; +unlock: ofi_spin_unlock(&map->lock); } diff --git a/prov/shm/src/smr_util.h b/prov/shm/src/smr_util.h index 369cbcab90a..9ad2fe9337e 100644 --- a/prov/shm/src/smr_util.h +++ b/prov/shm/src/smr_util.h @@ -54,7 +54,7 @@ extern "C" { #endif -#define SMR_VERSION 6 +#define SMR_VERSION 7 #define SMR_FLAG_ATOMIC (1 << 0) #define SMR_FLAG_DEBUG (1 << 1) @@ -225,7 +225,12 @@ struct smr_region { int pid; uint8_t cma_cap_peer; uint8_t cma_cap_self; + uint8_t xpmem_cap_self; + uint8_t resv2; + uint32_t max_sar_buf_per_peer; + struct xpmem_pinfo xpmem_self; + struct xpmem_pinfo xpmem_peer; void *base_addr; pthread_spinlock_t lock; /* lock for shm access if both ep->tx_lock and this lock need to @@ -244,10 +249,6 @@ struct smr_region { size_t peer_data_offset; size_t name_offset; size_t sock_name_offset; - - uint8_t xpmem_cap_self; - struct xpmem_pinfo xpmem_self; - struct xpmem_pinfo xpmem_peer; }; struct smr_resp { diff --git a/prov/sm2/src/sm2_atomic.c b/prov/sm2/src/sm2_atomic.c index a9f7ff02874..3f57241a528 100644 --- a/prov/sm2/src/sm2_atomic.c +++ b/prov/sm2/src/sm2_atomic.c @@ -58,9 +58,12 @@ sm2_atomic_format(struct sm2_xfer_entry *xfer_entry, uint8_t datatype, memcpy(atomic_entry->atomic_hdr.rma_ioc, rma_ioc, sizeof(*rma_ioc) * rma_ioc_count); - atomic_entry->atomic_hdr.result_iov_count = result_count; - memcpy(atomic_entry->atomic_hdr.result_iov, resultv, - sizeof(*resultv) * result_count); + if (xfer_entry->hdr.op == ofi_op_atomic_fetch || + xfer_entry->hdr.op == ofi_op_atomic_compare) { + atomic_entry->atomic_hdr.result_iov_count = result_count; + memcpy(atomic_entry->atomic_hdr.result_iov, resultv, + sizeof(*resultv) * result_count); + } switch (xfer_entry->hdr.op) { case ofi_op_atomic: @@ -78,8 +81,7 @@ sm2_atomic_format(struct sm2_xfer_entry *xfer_entry, uint8_t datatype, xfer_entry->hdr.size = ofi_copy_from_iov( atomic_entry->atomic_data.buf, SM2_ATOMIC_COMP_INJECT_SIZE, iov, iov_count, 0); - comp_size = ofi_copy_from_iov(atomic_entry->atomic_data.comp + - xfer_entry->hdr.size, + comp_size = ofi_copy_from_iov(atomic_entry->atomic_data.comp, SM2_ATOMIC_COMP_INJECT_SIZE, compare_iov, compare_count, 0); if (comp_size != xfer_entry->hdr.size) @@ -171,8 +173,8 @@ static inline ssize_t sm2_generic_atomic( } break; default: - assert(0); - break; + FI_WARN(&sm2_prov, FI_LOG_EP_CTRL, "Unrecognized atomic op\n"); + return -FI_ENOSYS; } ofi_genlock_lock(&ep->util_ep.lock); diff --git a/prov/sm2/src/sm2_av.c b/prov/sm2/src/sm2_av.c index 8214d292f04..e324c3775d6 100644 --- a/prov/sm2/src/sm2_av.c +++ b/prov/sm2/src/sm2_av.c @@ -128,8 +128,10 @@ static int sm2_av_insert(struct fid_av *av_fid, const void *addr, size_t count, 
srx->owner_ops->foreach_unspec_addr(srx, &sm2_get_addr); } - if (flags & FI_EVENT) + if (flags & FI_EVENT) { + assert(util_av->eq); ofi_av_write_event(util_av, succ_count, 0, context); + } return succ_count; } @@ -285,7 +287,7 @@ int sm2_av_open(struct fid_domain *domain, struct fi_av_attr *attr, return 0; out: - ofi_av_close(&sm2_av->util_av); + (void) ofi_av_close(&sm2_av->util_av); free(sm2_av); return ret; } diff --git a/prov/sm2/src/sm2_ep.c b/prov/sm2/src/sm2_ep.c index 72446135a57..7cac9ad1783 100644 --- a/prov/sm2/src/sm2_ep.c +++ b/prov/sm2/src/sm2_ep.c @@ -641,7 +641,8 @@ int sm2_endpoint(struct fid_domain *domain, struct fi_info *info, if (ret || ofi_bufpool_grow(ep->xfer_ctx_pool)) { FI_WARN(&sm2_prov, FI_LOG_EP_CTRL, "Unable to create xfer_entry ctx pool\n"); - return -FI_ENOMEM; + ret = -FI_ENOMEM; + goto close; } ep->util_ep.ep_fid.fid.ops = &sm2_ep_fi_ops; @@ -653,6 +654,8 @@ int sm2_endpoint(struct fid_domain *domain, struct fi_info *info, *ep_fid = &ep->util_ep.ep_fid; return 0; +close: + (void) ofi_endpoint_close(&ep->util_ep); name: free((void *) ep->name); ep: diff --git a/prov/sm2/src/sm2_init.c b/prov/sm2/src/sm2_init.c index 12bb33e9ce2..b43739eb497 100644 --- a/prov/sm2/src/sm2_init.c +++ b/prov/sm2/src/sm2_init.c @@ -59,7 +59,6 @@ size_t sm2_calculate_size_offsets(ptrdiff_t *rq_offset, ptrdiff_t *fs_offset) int sm2_create(const struct fi_provider *prov, const struct sm2_attr *attr, struct sm2_mmap *sm2_mmap, sm2_gid_t *gid) { - struct sm2_ep_name *ep_name; ptrdiff_t recv_queue_offset, freestack_offset; int ret; void *mapped_addr; @@ -80,20 +79,6 @@ int sm2_create(const struct fi_provider *prov, const struct sm2_attr *attr, return ret; } - ep_name = calloc(1, sizeof(*ep_name)); - if (!ep_name) { - FI_WARN(prov, FI_LOG_EP_CTRL, "calloc error\n"); - return -FI_ENOMEM; - } - strncpy(ep_name->name, (char *) attr->name, FI_NAME_MAX - 1); - ep_name->name[FI_NAME_MAX - 1] = '\0'; - - if (ret < 0) { - FI_WARN(prov, FI_LOG_EP_CTRL, "ftruncate error\n"); - ret = -errno; - goto remove; - } - mapped_addr = sm2_mmap_ep_region(sm2_mmap, *gid); if (mapped_addr == MAP_FAILED) { @@ -131,7 +116,6 @@ int sm2_create(const struct fi_provider *prov, const struct sm2_attr *attr, remove: sm2_file_unlock(sm2_mmap); - free(ep_name); return ret; } diff --git a/prov/sockets/src/sock_cntr.c b/prov/sockets/src/sock_cntr.c index a9a8956dbe6..03ff2c9a2b1 100644 --- a/prov/sockets/src/sock_cntr.c +++ b/prov/sockets/src/sock_cntr.c @@ -452,8 +452,10 @@ static uint64_t sock_cntr_readerr(struct fid_cntr *cntr) _cntr = container_of(cntr, struct sock_cntr, cntr_fid); if (_cntr->domain->progress_mode == FI_PROGRESS_MANUAL) sock_cntr_progress(_cntr); + pthread_mutex_lock(&_cntr->mut); if (_cntr->err_flag) _cntr->err_flag = 0; + pthread_mutex_unlock(&_cntr->mut); return ofi_atomic_get32(&_cntr->err_cnt); } diff --git a/prov/sockets/src/sock_cq.c b/prov/sockets/src/sock_cq.c index 779460f092a..c525b29f558 100644 --- a/prov/sockets/src/sock_cq.c +++ b/prov/sockets/src/sock_cq.c @@ -340,8 +340,12 @@ static ssize_t sock_cq_sreadfrom(struct fid_cq *cq, void *buf, size_t count, ssize_t cq_entry_len, avail; sock_cq = container_of(cq, struct sock_cq, cq_fid); + pthread_mutex_lock(&sock_cq->lock); if (ofi_rbused(&sock_cq->cqerr_rb)) - return -FI_EAVAIL; + ret = -FI_EAVAIL; + pthread_mutex_unlock(&sock_cq->lock); + if (ret) + return ret; cq_entry_len = sock_cq->cq_entry_size; if (sock_cq->attr.wait_cond == FI_CQ_COND_THRESHOLD) diff --git a/prov/sockets/src/sock_eq.c b/prov/sockets/src/sock_eq.c index 
86c560f0169..3c48f1c2e7b 100644 --- a/prov/sockets/src/sock_eq.c +++ b/prov/sockets/src/sock_eq.c @@ -313,7 +313,9 @@ static int sock_eq_control(struct fid *fid, int command, void *arg) case FI_WAIT_NONE: case FI_WAIT_UNSPEC: case FI_WAIT_FD: + ofi_mutex_lock(&eq->lock); memcpy(arg, &eq->list.signal.fd[FI_READ_FD], sizeof(int)); + ofi_mutex_unlock(&eq->lock); break; case FI_WAIT_SET: case FI_WAIT_MUTEX_COND: diff --git a/prov/sockets/src/sock_wait.c b/prov/sockets/src/sock_wait.c index d55660b218e..9e3869fe40b 100644 --- a/prov/sockets/src/sock_wait.c +++ b/prov/sockets/src/sock_wait.c @@ -137,8 +137,13 @@ static int sock_wait_wait(struct fid_wait *wait_fid, int timeout) cq = container_of(list_item->fid, struct sock_cq, cq_fid); sock_cq_progress(cq); + pthread_mutex_lock(&cq->lock); if (ofi_rbused(&cq->cqerr_rb)) - return 1; + err = 1; + + pthread_mutex_unlock(&cq->lock); + if (err) + return err; break; case FI_CLASS_CNTR: diff --git a/prov/usnic/Makefile.include b/prov/usnic/Makefile.include new file mode 100644 index 00000000000..74ff3d6c02b --- /dev/null +++ b/prov/usnic/Makefile.include @@ -0,0 +1,164 @@ +# +# Copyright (c) 2014-2019, Cisco Systems, Inc. All rights reserved. +# +# This software is available to you under a choice of one of two +# licenses. You may choose to be licensed under the terms of the GNU +# General Public License (GPL) Version 2, available from the file +# COPYING in the main directory of this source tree, or the +# BSD license below: +# +# Redistribution and use in source and binary forms, with or +# without modification, are permitted provided that the following +# conditions are met: +# +# - Redistributions of source code must retain the above +# copyright notice, this list of conditions and the following +# disclaimer. +# +# - Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials +# provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +# COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. 
+# + +if HAVE_USNIC +libusnic_direct_sources = \ + prov/usnic/src/usnic_direct/cq_desc.h \ + prov/usnic/src/usnic_direct/cq_enet_desc.h \ + prov/usnic/src/usnic_direct/kcompat.h \ + prov/usnic/src/usnic_direct/kcompat_priv.h \ + prov/usnic/src/usnic_direct/libnl1_utils.h \ + prov/usnic/src/usnic_direct/libnl3_utils.h \ + prov/usnic/src/usnic_direct/libnl_utils_common.c \ + prov/usnic/src/usnic_direct/libnl_utils.h \ + prov/usnic/src/usnic_direct/linux/delay.h \ + prov/usnic/src/usnic_direct/linux/slab.h \ + prov/usnic/src/usnic_direct/linux_types.h \ + prov/usnic/src/usnic_direct/rq_enet_desc.h \ + prov/usnic/src/usnic_direct/usd_caps.c \ + prov/usnic/src/usnic_direct/usd_caps.h \ + prov/usnic/src/usnic_direct/usd_dest.c \ + prov/usnic/src/usnic_direct/usd_dest.h \ + prov/usnic/src/usnic_direct/usd_device.c \ + prov/usnic/src/usnic_direct/usd_device.h \ + prov/usnic/src/usnic_direct/usd_event.c \ + prov/usnic/src/usnic_direct/usd_enum.c \ + prov/usnic/src/usnic_direct/usd.h \ + prov/usnic/src/usnic_direct/usd_ib_cmd.c \ + prov/usnic/src/usnic_direct/usd_ib_cmd.h \ + prov/usnic/src/usnic_direct/usd_ib_sysfs.c \ + prov/usnic/src/usnic_direct/usd_ib_sysfs.h \ + prov/usnic/src/usnic_direct/usd_mem.c \ + prov/usnic/src/usnic_direct/usd_poll.c \ + prov/usnic/src/usnic_direct/usd_post.c \ + prov/usnic/src/usnic_direct/usd_post.h \ + prov/usnic/src/usnic_direct/usd_post_ud_raw.c \ + prov/usnic/src/usnic_direct/usd_post_ud_udp.c \ + prov/usnic/src/usnic_direct/usd_post_ud_pio_udp.c \ + prov/usnic/src/usnic_direct/usd_queue.h \ + prov/usnic/src/usnic_direct/usd_queues.c \ + prov/usnic/src/usnic_direct/usd_socket.c \ + prov/usnic/src/usnic_direct/usd_socket.h \ + prov/usnic/src/usnic_direct/usd_time.h \ + prov/usnic/src/usnic_direct/usd_util.h \ + prov/usnic/src/usnic_direct/usd_vnic.c \ + prov/usnic/src/usnic_direct/usd_vnic.h \ + prov/usnic/src/usnic_direct/usnic_abi.h \ + prov/usnic/src/usnic_direct/usnic_direct.h \ + prov/usnic/src/usnic_direct/usnic_ib_abi.h \ + prov/usnic/src/usnic_direct/usnic_ip_utils.c \ + prov/usnic/src/usnic_direct/usnic_ip_utils.h \ + prov/usnic/src/usnic_direct/usnic_user_utils.h \ + prov/usnic/src/usnic_direct/vnic_cq.c \ + prov/usnic/src/usnic_direct/vnic_cq.h \ + prov/usnic/src/usnic_direct/vnic_dev.c \ + prov/usnic/src/usnic_direct/vnic_devcmd.h \ + prov/usnic/src/usnic_direct/vnic_dev.h \ + prov/usnic/src/usnic_direct/vnic_enet.h \ + prov/usnic/src/usnic_direct/vnic_resource.h \ + prov/usnic/src/usnic_direct/vnic_rq.c \ + prov/usnic/src/usnic_direct/vnic_rq.h \ + prov/usnic/src/usnic_direct/vnic_stats.h \ + prov/usnic/src/usnic_direct/vnic_wq.c \ + prov/usnic/src/usnic_direct/vnic_wq.h \ + prov/usnic/src/usnic_direct/vnic_intr.c \ + prov/usnic/src/usnic_direct/vnic_intr.h \ + prov/usnic/src/usnic_direct/wq_enet_desc.h + +_usnic_files = \ + $(libusnic_direct_sources) \ + prov/usnic/src/fi_ext_usnic.h \ + prov/usnic/src/usdf.h \ + prov/usnic/src/usdf_av.c \ + prov/usnic/src/usdf_av.h \ + prov/usnic/src/usdf_cm.c \ + prov/usnic/src/usdf_cm.h \ + prov/usnic/src/usdf_cq.c \ + prov/usnic/src/usdf_cq.h \ + prov/usnic/src/usdf_dgram.c \ + prov/usnic/src/usdf_dgram.h \ + prov/usnic/src/usdf_domain.c \ + prov/usnic/src/usdf_endpoint.c \ + prov/usnic/src/usdf_endpoint.h \ + prov/usnic/src/usdf_ep_dgram.c \ + prov/usnic/src/usdf_eq.c \ + prov/usnic/src/usdf_fabric.c \ + prov/usnic/src/usdf_mem.c \ + prov/usnic/src/usdf_pep.c \ + prov/usnic/src/usdf_progress.c \ + prov/usnic/src/usdf_progress.h \ + prov/usnic/src/usdf_rudp.h \ + prov/usnic/src/usdf_timer.c \ + 
prov/usnic/src/usdf_timer.h \ + prov/usnic/src/usdf_poll.c \ + prov/usnic/src/usdf_poll.h \ + prov/usnic/src/usdf_ext.c \ + prov/usnic/src/usdf_wait.h \ + prov/usnic/src/usdf_wait.c + +if USNIC_BUILD_FAKE_VERBS_DRIVER +_usnic_files += prov/usnic/src/usdf_fake_ibv.c +endif + +_usnic_cppflags = \ + -D__LIBUSNIC__ -DWANT_DEBUG_MSGS=0 \ + -DHAVE_LIBNL3=$(HAVE_LIBNL3) $(usnic_CPPFLAGS) \ + -I$(top_srcdir)/prov/usnic/src/usnic_direct + +rdmainclude_HEADERS += \ + prov/usnic/src/fi_ext_usnic.h + +if HAVE_USNIC_DL +pkglib_LTLIBRARIES += libusnic-fi.la +libusnic_fi_la_CPPFLAGS = $(AM_CPPFLAGS) $(_usnic_cppflags) +libusnic_fi_la_SOURCES = $(_usnic_files) $(common_srcs) +libusnic_fi_la_LDFLAGS = \ + $(usnic_ln_LDFLAGS) \ + -module -avoid-version -shared -export-dynamic +libusnic_fi_la_LIBADD = $(linkback) $(usnic_LIBS) +libusnic_fi_la_DEPENDENCIES = $(linkback) +else !HAVE_USNIC_DL +src_libfabric_la_SOURCES += $(_usnic_files) +src_libfabric_la_CPPFLAGS += $(_usnic_cppflags) +src_libfabric_la_LDFLAGS += $(usnic_LDFLAGS) +src_libfabric_la_LIBADD += $(usnic_LIBS) +endif !HAVE_USNIC_DL + +prov_install_man_pages += man/man7/fi_usnic.7 + +endif HAVE_USNIC + +prov_dist_man_pages += man/man7/fi_usnic.7 diff --git a/prov/usnic/configure.m4 b/prov/usnic/configure.m4 new file mode 100644 index 00000000000..f31b40309af --- /dev/null +++ b/prov/usnic/configure.m4 @@ -0,0 +1,366 @@ +dnl +dnl Copyright (c) 2015-2017, Cisco Systems, Inc. All rights reserved. +dnl +dnl This software is available to you under a choice of one of two +dnl licenses. You may choose to be licensed under the terms of the GNU +dnl General Public License (GPL) Version 2, available from the file +dnl COPYING in the main directory of this source tree, or the +dnl BSD license below: +dnl +dnl Redistribution and use in source and binary forms, with or +dnl without modification, are permitted provided that the following +dnl conditions are met: +dnl +dnl - Redistributions of source code must retain the above +dnl copyright notice, this list of conditions and the following +dnl disclaimer. +dnl +dnl - Redistributions in binary form must reproduce the above +dnl copyright notice, this list of conditions and the following +dnl disclaimer in the documentation and/or other materials +dnl provided with the distribution. +dnl +dnl THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +dnl "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +dnl LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +dnl FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +dnl COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +dnl INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +dnl BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +dnl LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +dnl CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +dnl LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +dnl ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +dnl POSSIBILITY OF SUCH DAMAGE. +dnl + +dnl Configury specific to the libfabric usNIC provider + +dnl libnl is sadness, but we have to use it. The majority of this +dnl configure.m4 is just to deal with libnl. :-( + +dnl libnl has two versions: libnl (i.e., version 1) and libnl3. + +dnl These two versions have many of the same symbols, but they are +dnl incompatible with each other. 
We can handle this in the C code, but +dnl we must know which version to compile for (i.e., configure must +dnl figure this out). Additionally, if both versions get linked into +dnl the same process, they will disrupt each other's global state, and +dnl Random Bad Things happen. We can't always prevent this -- e.g., if we +dnl link against libnl vX and some other middleware links against libnl vY +dnl (and X != Y), prepare for unpleasantness. You have been warned. + +dnl As of this writing (March 2015), most Linux distros seem to be +dnl encouraging packages to prefer libnl v3 over libnl v1. + +dnl libnl wants us to use pkg-config to find CPPFLAGS and LDFLAGS and +dnl LIBS, but pkg-config isn't always available. So we have to test here. +dnl It gets more complicated because libnl changed several things between v1 +dnl and v3: + +dnl v1: +dnl - Header files (e.g., <netlink/netlink.h>) are in $prefix/include +dnl - Library is in $prefix/lib[64] +dnl - Library is named libnl. + +dnl v3: +dnl - Header files (e.g., <netlink/netlink.h>) are in $prefix/include/libnl3 +dnl *** NOTE: This means that a -I switch is REQUIRED to find +dnl the libnl3 headers (!) +dnl - Library is in $prefix/lib[64] +dnl - Library is named libnl-3. +dnl - We *also* need the libnl-route-3 library + +dnl These differing requirements make the configure/m4 tests a bit of +dnl a nightmare. :-( + +dnl --------------------------------------------------------------------------- + +dnl This configure.m4 script supports the following CLI options: + +dnl --with-libnl[=dir] +dnl If specified, look for libnl support. If it is not found, +dnl error/abort configure. If dir is specified, look in that +dnl directory (configure will first look for libnl v3 in that tree, and if +dnl it is not found, look for libnl v1 in that tree). If no dir is +dnl specified, this option is redundant with --with-usnic. + +dnl --without-libnl +dnl Do not look for libnl support. This means that the usnic provider +dnl will not be built (since the usnic provider *requires* libnl support). + +dnl --------------------------------------------------------------------------- + +dnl Called to configure this provider +dnl +dnl Arguments: +dnl +dnl $1: action if configured successfully +dnl $2: action if not configured successfully +dnl +AC_DEFUN([FI_USNIC_CONFIGURE],[ + # Determine if we can support the usnic provider + usnic_happy=0 + usnic_build_fake_driver=0 + AS_IF([test "x$enable_usnic" != "xno"], + [AC_CHECK_HEADER([infiniband/verbs.h], [usnic_happy=1]) + AS_IF([test $usnic_happy -eq 1], + [USNIC_CHECK_IF_NEED_FAKE_USNIC + USNIC_CHECK_LIBNL_SADNESS]) + ]) + + # AM_CONDITIONALs must always be defined + AM_CONDITIONAL([USNIC_BUILD_FAKE_VERBS_DRIVER], + [test $usnic_build_fake_driver -eq 1]) +]) + +dnl +dnl Helper function to parse --with-libnl* options +dnl +dnl $1: variable name +dnl $2: --with- value +dnl +AC_DEFUN([USNIC_PARSE_WITH],[ + case "$2" in + no) + # Nope, don't want it + usnic_want_$1=no + ;; + yes) + # Yes, definitely want it + usnic_want_$1=yes + ;; + default) + # Default case -- try and see if we can find it + usnic_want_$1=default + usnic_$1_location=/usr + ;; + *) + # Yes, definitely want it -- at a specific location + usnic_want_$1=yes + usnic_$1_location="$2" + ;; + esac +]) + +dnl +dnl Check for ibv_register_driver +dnl +dnl If libibverbs is available and is old enough, we need to install a +dnl "fake" usnic verbs driver to keep it from complaining to stderr +dnl that there is no usnic verbs provider. Newer versions of +dnl libibverbs won't complain. 
If we can detect a new-enough +dnl libibverbs, don't bother to compile the fake usnic verbs driver. +dnl +dnl Per +dnl https://github.com/ofiwg/libfabric/pull/2684#issuecomment-276462368, +dnl the logic boils down to: +dnl +dnl Compile the fake usnic verbs provider if infiniband/driver.h +dnl exists and does not contain a prototype for verbs_register_driver(). +dnl +AC_DEFUN([USNIC_CHECK_IF_NEED_FAKE_USNIC],[ + AC_CHECK_HEADER([infiniband/driver.h], + [AC_CHECK_DECL([verbs_register_driver], + [], + [usnic_build_fake_driver=1], + [#include <infiniband/driver.h> + ])]) + + AC_MSG_CHECKING([if building usnic fake verbs driver]) + AS_IF([test $usnic_build_fake_driver -eq 1], + [AC_MSG_RESULT([yes])], + [AC_MSG_RESULT([no])]) + AC_DEFINE_UNQUOTED([USNIC_BUILD_FAKE_VERBS_DRIVER], + [$usnic_build_fake_driver], + [Whether to build the fake usNIC verbs provider or not]) +]) + +dnl +dnl Shared macro +dnl +AC_DEFUN([USNIC_CHECK_LIBNL_SADNESS],[ + AC_ARG_WITH([libnl], + [AS_HELP_STRING([--with-libnl(=DIR)], + [Directory prefix for libnl (typically only necessary if libnl is installed in a location that the compiler/linker will not search by default)])], + [], [with_libnl=default]) + + # The --with options carry two pieces of information: 1) do + # you want a specific version of libnl, and 2) where that + # version of libnl lives. For simplicity, let's separate + # those two pieces of information. + USNIC_PARSE_WITH([libnl], [$with_libnl]) + + # Default to a numeric value (this value gets AC_DEFINEd) + HAVE_LIBNL3=0 + + ################################################### + # NOTE: We *must* check for libnl3 before libnl. + ################################################### + + AS_IF([test "$usnic_want_libnl" != "no"], + [USNIC_CHECK_LIBNL3([$usnic_libnl_location], [usnic_nl])]) + AS_IF([test "$usnic_want_libnl" != "no" && + test "$usnic_nl_LIBS" = ""], + [USNIC_CHECK_LIBNL([$usnic_libnl_location], [usnic_nl])]) + + AS_IF([test "$usnic_want_libnl" = "yes" && + test "$usnic_nl_LIBS" = ""], + [AC_MSG_WARN([--with-libnl specified, but not found]) + AC_MSG_ERROR([Cannot continue])]) + + # Final result + AC_SUBST([HAVE_LIBNL3]) + AC_DEFINE_UNQUOTED([HAVE_LIBNL3], [$HAVE_LIBNL3], + [Whether we have libnl or libnl3]) + + usnic_CPPFLAGS=$usnic_nl_CPPFLAGS + usnic_LDFLAGS=$usnic_nl_LDFLAGS + usnic_LIBS=$usnic_nl_LIBS + + # If we're building the usNIC fake verbs provider, we need to + # link against -libverbs, so put it in usnic_LIBS (so that it will also get + # properly substituted into the pkg-config data files). + usnic_verbs_lib= + AS_IF([test $usnic_build_fake_driver -eq 1], + [usnic_verbs_lib="-libverbs"]) + usnic_LIBS="$usnic_LIBS $usnic_verbs_lib" + + AC_SUBST([usnic_CPPFLAGS]) + AC_SUBST([usnic_LDFLAGS]) + AC_SUBST([usnic_LIBS]) + + AS_IF([test "$usnic_nl_LIBS" = ""], + [usnic_happy=0]) +]) + +dnl +dnl Check for libnl-3. 
+dnl +dnl Inputs: +dnl +dnl $1: prefix where to look for libnl-3 +dnl $2: var name prefix of _CPPFLAGS and _LDFLAGS and _LIBS +dnl +dnl Outputs: +dnl +dnl - Set $2_CPPFLAGS necessary to compile with libnl-3 +dnl - Set $2_LDFLAGS necessary to link with libnl-3 +dnl - Set $2_LIBS necessary to link with libnl-3 +dnl - Set HAVE_LIBNL3 1 if libnl-3 will be used +dnl +AC_DEFUN([USNIC_CHECK_LIBNL3],[ + AC_MSG_NOTICE([checking for libnl3]) + + AC_MSG_CHECKING([for libnl3 prefix]) + AC_MSG_RESULT([$1]) + AC_MSG_CHECKING([for $1/include/libnl3]) + AS_IF([test -d "$1/include/libnl3"], + [usnic_libnl3_happy=1 + AC_MSG_RESULT([found])], + [usnic_libnl3_happy=0 + AC_MSG_RESULT([not found])]) + + # Random note: netlink/version.h is only in libnl3 - it is not in libnl. + # Also, nl_socket_set_peer_groups is only in libnl3. + CPPFLAGS_save=$CPPFLAGS + usnic_tmp_CPPFLAGS="-I$1/include/libnl3" + CPPFLAGS="$usnic_tmp_CPPFLAGS $CPPFLAGS" + AS_IF([test $usnic_libnl3_happy -eq 1], + [FI_CHECK_PACKAGE([$2], + [netlink/version.h], + [nl-3], + [nl_socket_set_peer_groups], + [], + [$1], + [], + [usnic_libnl3_happy=1], + [usnic_libnl3_happy=0]) + + # Note that FI_CHECK_PACKAGE is going to add + # -I$dir/include into $2_CPPFLAGS. But because libnl3 + # puts the headers in $dir/libnl3, we need to + # overwrite $2_CPPFLAGS with -I$dir/libnl3. We can do + # this unconditionally; we don't have to check for + # success (checking for success occurs below). + $2_CPPFLAGS=$usnic_tmp_CPPFLAGS]) + + # If we found libnl-3, we *also* need libnl-route-3 + LIBS_save=$LIBS + LDFLAGS_save=$LDFLAGS + AS_IF([test "$$2_LDFLAGS" != ""], + [LDFLAGS="$$2_LDFLAGS $LDFLAGS"]) + AS_IF([test $usnic_libnl3_happy -eq 1], + [AC_SEARCH_LIBS([nl_rtgen_request], + [nl-route-3], + [usnic_libnl3_happy=1], + [usnic_libnl3_happy=0])]) + LIBS=$LIBS_save + LDFLAGS=$LDFLAGS_save + + # Just because libnl* is evil, double check that the + # netlink/version.h we found was for libnl3. As far as we + # know, netlink/version.h only first appeared in version + # 3... but let's really be sure. + AS_IF([test $usnic_libnl3_happy -eq 1], + [AC_MSG_CHECKING([to ensure these really are libnl3 headers]) + CPPFLAGS="$$2_CPPFLAGS $CPPFLAGS" + AC_COMPILE_IFELSE( + [AC_LANG_PROGRAM([[ +#include <netlink/netlink.h> +#include <netlink/version.h> +#ifndef LIBNL_VER_MAJ +#error "LIBNL_VER_MAJ not defined!" +#endif +/* to the best of our knowledge, version.h only exists in libnl3 */ +#if LIBNL_VER_MAJ != 3 +#error "LIBNL_VER_MAJ != 3, I am sad" +#endif + ]])], + [AC_MSG_RESULT([yes])], + [AC_MSG_RESULT([no]) + usnic_libnl3_happy=0] + )]) + CPPFLAGS=$CPPFLAGS_save + + # If we found everything + AS_IF([test $usnic_libnl3_happy -eq 1], + [$2_LIBS="-lnl-3 -lnl-route-3" + HAVE_LIBNL3=1], + [$2_CPPFLAGS= + $2_LDFLAGS= + $2_LIBS=]) +]) + +dnl +dnl Check for libnl. 
+dnl +dnl Inputs: +dnl +dnl $1: prefix where to look for libnl +dnl $2: var name prefix of _CPPFLAGS and _LDFLAGS and _LIBS +dnl +dnl Outputs: +dnl +dnl - Set $2_CPPFLAGS necessary to compile with libnl +dnl - Set $2_LDFLAGS necessary to link with libnl +dnl - Set $2_LIBS necessary to link with libnl +dnl - Set HAVE_LIBNL3 0 if libnl will be used +dnl +AC_DEFUN([USNIC_CHECK_LIBNL],[ + AC_MSG_NOTICE([checking for libnl]) + + FI_CHECK_PACKAGE([$2], + [netlink/netlink.h], + [nl], + [nl_connect], + [-lm], + [$1], + [], + [usnic_libnl_happy=1], + [usnic_libnl_happy=0]) + + AS_IF([test $usnic_libnl_happy -eq 1], + [$2_LIBS="-lnl -lm" + HAVE_LIBNL3=0]) +]) diff --git a/prov/usnic/libfabric-usnic.spec.in b/prov/usnic/libfabric-usnic.spec.in new file mode 100644 index 00000000000..0deada89b1a --- /dev/null +++ b/prov/usnic/libfabric-usnic.spec.in @@ -0,0 +1,52 @@ +%{!?configopts: %global configopts LDFLAGS=-Wl,--build-id} +%{!?provider: %define provider usnic} +%{!?provider_formal: %define provider_formal usNIC} + +Name: libfabric-%{provider} +Version: @VERSION@ +Release: 1%{?dist} +Summary: Dynamic %{provider_formal} provider for user-space RDMA Fabric Interfaces +Group: System Environment/Libraries +License: GPLv2 or BSD +Url: http://www.github.com/ofiwg/libfabric +Source: http://www.github.org/ofiwg/%{name}/releases/download/v{%version}/libfabric-%{version}.tar.bz2 +BuildRoot: %{_tmppath}/%{name}-%{version}-%{release}-root-%(%{__id_u} -n) +Requires: libfabric +BuildRequires: libfabric + +%description +libfabric provides a user-space API to access high-performance fabric +services, such as RDMA. + +This RPM provides the %{provider_formal} provider as a "plugin" to an existing +Libfabric installation. This plugin will override any existing %{provider_formal} +provider functionality in the existing Libfabric installation. + +%prep +%setup -q -n libfabric-%{version} + +%build +%configure %{configopts} --enable-%{provider}=dl +make %{?_smp_mflags} + +%install +rm -rf %{buildroot} +%makeinstall installdirs + +%clean +rm -rf %{buildroot} + +%files +%defattr(-,root,root,-) +%{_libdir}/libfabric/*.so + +%exclude %{_libdir}/libfabric.* +%exclude %{_libdir}/libfabric/*.la +%exclude %{_libdir}/pkgconfig +%exclude %{_bindir} +%exclude %{_mandir} +%exclude %{_includedir} + +%changelog +* Wed May 24 2017 Open Fabrics Interfaces Working Group +- First release of specfile for packaging a single dl provider. diff --git a/prov/usnic/src/fi_ext_usnic.h b/prov/usnic/src/fi_ext_usnic.h new file mode 100644 index 00000000000..79e8082c9d8 --- /dev/null +++ b/prov/usnic/src/fi_ext_usnic.h @@ -0,0 +1,157 @@ +/* + * Copyright (c) 2013-2014 Intel Corporation. All rights reserved. + * Copyright (c) 2015-2016 Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _FI_EXT_USNIC_H_ +#define _FI_EXT_USNIC_H_ + +/* + * See the fi_usnic.7 man page for information about the usnic provider + * extensions provided in this header. + */ + +#include +#include + +#define FI_PROTO_RUDP (100U | (1UL << 31)) + +#define FI_EXT_USNIC_INFO_VERSION 2 + +#define FI_EXT_USNIC_MAX_DEVNAME 16 + +/* + * usNIC specific info + */ +/* Packed in 1.4, maintains the same alignment as <= 1.3.0 */ +struct fi_usnic_cap { + const char *uc_capability; + int uc_present; +} __attribute__((packed)); + +/* Packed in 1.4, maintains the same alignment as <= 1.3.0 */ +struct fi_usnic_info_v1 { + uint32_t ui_link_speed; + uint32_t ui_netmask_be; + char ui_ifname[IFNAMSIZ]; + + uint32_t ui_num_vf; + uint32_t ui_qp_per_vf; + uint32_t ui_cq_per_vf; +} __attribute__((packed)); + +struct fi_usnic_info_v2 { + /* Put all of the v1 fields at the start to provide some backward + * compatibility. + */ + uint32_t ui_link_speed; + uint32_t ui_netmask_be; + char ui_ifname[IFNAMSIZ]; + unsigned ui_num_vf; + unsigned ui_qp_per_vf; + unsigned ui_cq_per_vf; + + char ui_devname[FI_EXT_USNIC_MAX_DEVNAME]; + uint8_t ui_mac_addr[6]; + + /* Explicit padding to match 1.3 alignment */ + uint8_t ui_pad0[2]; + + uint32_t ui_ipaddr_be; + uint32_t ui_prefixlen; + uint32_t ui_mtu; + uint8_t ui_link_up; + + /* Explicit padding to match 1.3 alignment */ + uint8_t ui_pad1[3]; + + uint32_t ui_vendor_id; + uint32_t ui_vendor_part_id; + uint32_t ui_device_id; + char ui_firmware[64]; + + unsigned ui_intr_per_vf; + unsigned ui_max_cq; + unsigned ui_max_qp; + + unsigned ui_max_cqe; + unsigned ui_max_send_credits; + unsigned ui_max_recv_credits; + + const char *ui_nicname; + const char *ui_pid; + + struct fi_usnic_cap **ui_caps; +} __attribute__((packed)); + +/* In API version 1.2 and below, the v1 structure did not contain any 64-bit + * data types and therefore had a 4-byte alignment. Once v2 of the extension API + * was introduced in version 1.3, the extra pointers mandated an 8-byte + * alignment thus changing the offset of the v1 structure. This means that the + * alignment difference manifests when an application using v1 of the extension + * is compiled with Libfabric v1.1.x or v1.2.x, but then runs with libfabric.so + * that is v1.3.x or higher (and vice versa). Make the alignment explicit and + * consistent by adding an extra 32-bit padding (4 uint8_t). 
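[Editor's note] The layout contract described in the comment above can be pinned down at compile time. The fragment below is a minimal illustrative sketch, not part of this patch; it assumes a C11 compiler with _Static_assert and the packed definition of struct fi_usnic_info that follows, and simply asserts the offsets the comment promises (4-byte version field, 4 bytes of explicit padding, v1/v2 union starting at offset 8):

    #include <stddef.h>

    /* Hypothetical layout checks; the struct is declared just below. */
    _Static_assert(offsetof(struct fi_usnic_info, ui_pad0) == 4,
                   "explicit padding must immediately follow ui_version");
    _Static_assert(offsetof(struct fi_usnic_info, ui) == 8,
                   "v1/v2 union must start at offset 8 so old and new "
                   "binaries agree on the v1 layout");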
+ */ +struct fi_usnic_info { + uint32_t ui_version; + uint8_t ui_pad0[4]; + union { + struct fi_usnic_info_v1 v1; + struct fi_usnic_info_v2 v2; + } ui; +} __attribute__((packed)); + +/* + * usNIC-specific fabric ops + */ +#define FI_USNIC_FABRIC_OPS_1 "fabric_ops 1" +struct fi_usnic_ops_fabric { + size_t size; + int (*getinfo)(uint32_t version, struct fid_fabric *fabric, + struct fi_usnic_info *info); +}; + +/* + * usNIC-specific AV ops + */ +#define FI_USNIC_AV_OPS_1 "av_ops 1" +struct fi_usnic_ops_av { + size_t size; + int (*get_distance)(struct fid_av *av, void *addr, int *metric); +}; + +int usdf_fabric_ops_open(struct fid *fid, const char *ops_name, uint64_t flags, + void **ops, void *context); +int usdf_av_ops_open(struct fid *fid, const char *ops_name, uint64_t flags, + void **ops, void *context); + +#endif /* _FI_EXT_USNIC_H_ */ diff --git a/prov/usnic/src/usdf.h b/prov/usnic/src/usdf.h new file mode 100644 index 00000000000..ba6a2cfb48e --- /dev/null +++ b/prov/usnic/src/usdf.h @@ -0,0 +1,525 @@ +/* + * Copyright (c) 2014-2019, Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef _USDF_H_ +#define _USDF_H_ + +#include + +#include +#include + +#include +#include + +#include "usdf_progress.h" +#include "usd.h" + + +#define USDF_PROV_NAME "usnic" +#define USDF_MAJOR_VERS 1 +#define USDF_MINOR_VERS 0 +#define USDF_PROV_VERSION FI_VERSION(USDF_MAJOR_VERS, USDF_MINOR_VERS) + +extern struct fi_provider usdf_ops; + +#define USDF_WARN_SYS(subsys, ...) \ + FI_WARN(&usdf_ops, FI_LOG_ ## subsys, __VA_ARGS__) +#define USDF_TRACE_SYS(subsys, ...) \ + FI_TRACE(&usdf_ops, FI_LOG_ ## subsys, __VA_ARGS__) +#define USDF_INFO_SYS(subsys, ...) \ + FI_INFO(&usdf_ops, FI_LOG_ ## subsys, __VA_ARGS__) +#define USDF_DBG_SYS(subsys, ...) \ + FI_DBG(&usdf_ops, FI_LOG_ ## subsys, __VA_ARGS__) + +/* default to "FI_LOG_FABRIC" */ +#define USDF_WARN(...) 
USDF_WARN_SYS(FABRIC, __VA_ARGS__) +#define USDF_TRACE(...) USDF_TRACE_SYS(FABRIC, __VA_ARGS__) +#define USDF_INFO(...) USDF_INFO_SYS(FABRIC, __VA_ARGS__) +#define USDF_DBG(...) USDF_DBG_SYS(FABRIC, __VA_ARGS__) + +#define USDF_HDR_BUF_ENTRY 64 +#define USDF_EP_CAP_PIO (1ULL << 63) + +#define USDF_MAX_PEERS (16 * 1024) + +/* usdf event flags */ +#define USDF_EVENT_FLAG_ERROR (1ULL << 62) +#define USDF_EVENT_FLAG_FREE_BUF (1ULL << 63) + +/* usdf domain capability: no loopback */ +#define USDF_DOM_CAPS (FI_REMOTE_COMM) + +#define USDF_MR_IOV_LIMIT 1 +#define USDF_MR_CNT (65535) +#define USDF_ADDR_STR_LEN (INET6_ADDRSTRLEN+8) + +/* + * TAILQ stuff that should exist + */ +#define TAILQ_REMOVE_MARK(head, elm, link) \ + do { \ + TAILQ_REMOVE(head, elm, link); \ + (elm)->link.tqe_prev = NULL; \ + } while (0) + +#define TAILQ_ON_LIST(elm, link) ((elm)->link.tqe_prev != NULL) + +struct usdf_domain; + +struct usdf_dev_entry { + struct usd_device *ue_dev; + struct usd_device_attrs ue_dattr; + int ue_dev_ok; +}; +struct usdf_usnic_info { + int uu_num_devs; + struct usd_device_entry uu_devs[USD_MAX_DEVICES]; + struct usdf_dev_entry uu_info[USD_MAX_DEVICES]; +}; +extern struct usdf_usnic_info *__usdf_devinfo; + +struct usdf_fabric { + struct fid_fabric fab_fid; + struct fi_fabric_attr fab_attr; + struct usd_device_attrs *fab_dev_attrs; + int fab_arp_sockfd; + ofi_atomic32_t fab_refcnt; + ofi_atomic32_t num_blocked_waiting; + LIST_HEAD(,usdf_domain) fab_domain_list; + + /* progression */ + pthread_t fab_thread; + int fab_exit; + ofi_epoll_t fab_epollfd; + int fab_eventfd; + struct usdf_poll_item fab_poll_item; + + /* timer vars */ + uint32_t fab_active_timer_count; + LIST_HEAD(usdf_timer_bucket, usdf_timer_entry) *fab_timer_buckets; + uint64_t fab_cur_bucket_ms; + uint32_t fab_cur_bucket; + pthread_spinlock_t fab_timer_lock; +}; +#define fab_ftou(FAB) container_of(FAB, struct usdf_fabric, fab_fid) +#define fab_utof(FP) (&(FP)->fab_fid) +#define fab_fidtou(FID) container_of(FID, struct usdf_fabric, fab_fid.fid) + +struct usdf_domain { + struct fid_domain dom_fid; + struct usdf_fabric *dom_fabric; + struct fi_info *dom_info; + ofi_atomic32_t dom_refcnt; + struct usdf_eq *dom_eq; + struct usd_device *dom_dev; + + pthread_spinlock_t dom_progress_lock; + TAILQ_HEAD(,usdf_tx) dom_tx_ready; + TAILQ_HEAD(,usdf_cq_hard) dom_hcq_list; + + /* used only by connected endpoints */ + struct usdf_ep **dom_peer_tab; + uint32_t dom_next_peer; + + LIST_ENTRY(usdf_domain) dom_link; +}; +#define dom_ftou(FDOM) container_of(FDOM, struct usdf_domain, dom_fid) +#define dom_utof(DOM) (&(DOM)->dom_fid) +#define dom_fidtou(FID) container_of(FID, struct usdf_domain, dom_fid.fid) + +enum usdf_pep_state { + USDF_PEP_UNBOUND, + USDF_PEP_BOUND, + USDF_PEP_LISTENING, + + /* A "ROBBED" PEP has had its socket stolen. The only valid operation + * to call on a ROBBED PEP is fi_close(). 
*/ + USDF_PEP_ROBBED +}; + +struct usdf_pep { + struct fid_pep pep_fid; + ofi_atomic32_t pep_refcnt; + struct usdf_fabric *pep_fabric; + struct usdf_eq *pep_eq; + int pep_sock; + union { + struct sockaddr_in sin; + char addr_str[USDF_ADDR_STR_LEN]; + } pep_src_addr; + enum usdf_pep_state pep_state; + struct usdf_poll_item pep_pollitem; + struct fi_info *pep_info; + + pthread_spinlock_t pep_cr_lock; + size_t pep_cr_max_data; + uint32_t pep_backlog; + uint32_t pep_cr_alloced; + TAILQ_HEAD(,usdf_connreq) pep_cr_free; + TAILQ_HEAD(,usdf_connreq) pep_cr_pending; +}; +#define pep_ftou(FPEP) container_of(FPEP, struct usdf_pep, pep_fid) +#define pep_fidtou(FID) container_of(FID, struct usdf_pep, pep_fid.fid) +#define pep_utof(PEP) (&(PEP)->pep_fid) +#define pep_utofid(PEP) (&(PEP)->pep_fid.fid) + +struct usdf_tx { + struct fid_stx tx_fid; + ofi_atomic32_t tx_refcnt; + struct usdf_domain *tx_domain; + TAILQ_ENTRY(usdf_tx) tx_link; + + struct fi_tx_attr tx_attr; + struct usd_qp *tx_qp; + void (*tx_progress)(struct usdf_tx *tx); + + union { + struct { + struct usdf_cq_hard *tx_hcq; + + uint8_t *tx_inject_bufs; + struct usdf_msg_qe *tx_wqe_buf; + TAILQ_HEAD(,usdf_msg_qe) tx_free_wqe; + TAILQ_HEAD(,usdf_ep) tx_ep_ready; + TAILQ_HEAD(,usdf_ep) tx_ep_have_acks; + size_t tx_num_free_wqe; + } msg; + struct { + struct usdf_cq_hard *tx_hcq; + + ofi_atomic32_t tx_next_msg_id; + struct usdf_rdm_qe *tx_wqe_buf; + uint8_t *tx_inject_bufs; + TAILQ_HEAD(,usdf_rdm_qe) tx_free_wqe; + TAILQ_HEAD(,usdf_rdm_connection) tx_rdc_ready; + TAILQ_HEAD(,usdf_rdm_connection) tx_rdc_have_acks; + size_t tx_num_free_wqe; + } rdm; + } t; +}; +#define tx_ftou(FEP) container_of(FEP, struct usdf_tx, tx_fid) +#define tx_fidtou(FID) container_of(FID, struct usdf_tx, tx_fid) +#define tx_utof(RX) (&(RX)->tx_fid) +#define tx_utofid(RX) (&(RX)->tx_fid.fid) + +struct usdf_rx { + struct fid_ep rx_fid; + ofi_atomic32_t rx_refcnt; + struct usdf_domain *rx_domain; + + struct fi_rx_attr rx_attr; + struct usd_qp *rx_qp; + + union { + struct { + struct usdf_cq_hard *rx_hcq; + + uint8_t *rx_bufs; + struct usdf_msg_qe *rx_rqe_buf; + TAILQ_HEAD(,usdf_msg_qe) rx_free_rqe; + TAILQ_HEAD(,usdf_msg_qe) rx_posted_rqe; + size_t rx_num_free_rqe; + } msg; + struct { + int rx_sock; + struct usdf_cq_hard *rx_hcq; + struct usdf_tx *rx_tx; + + uint8_t *rx_bufs; + struct usdf_rdm_qe *rx_rqe_buf; + TAILQ_HEAD(,usdf_rdm_qe) rx_free_rqe; + TAILQ_HEAD(,usdf_rdm_qe) rx_posted_rqe; + size_t rx_num_free_rqe; + } rdm; + } r; +}; +#define rx_ftou(FEP) container_of(FEP, struct usdf_rx, rx_fid) +#define rx_fidtou(FID) container_of(FID, struct usdf_rx, rx_fid) +#define rx_utof(RX) (&(RX)->rx_fid) +#define rx_utofid(RX) (&(RX)->rx_fid.fid) + +enum { + USDF_EP_ENABLED = (1 << 0) +}; + +struct usdf_ep { + struct fid_ep ep_fid; + struct usdf_domain *ep_domain; + ofi_atomic32_t ep_refcnt; + uint64_t ep_caps; + uint64_t ep_mode; + + uint8_t ep_tx_dflt_signal_comp; + uint8_t ep_rx_dflt_signal_comp; + + uint8_t ep_tx_completion; + uint8_t ep_rx_completion; + + uint32_t flags; + + uint32_t ep_wqe; /* requested queue sizes */ + uint32_t ep_rqe; + + struct usd_qp_attrs ep_qp_attrs; + + struct usdf_eq *ep_eq; + + struct usdf_tx *ep_tx; + struct usdf_rx *ep_rx; + + size_t max_msg_size; + + union { + struct { + struct usd_qp *ep_qp; + struct usdf_cq *ep_wcq; + struct usdf_cq *ep_rcq; + + int ep_sock; + struct usdf_av *ep_av; + + /* TODO: Remove in favor of accessing op flags through + * ep_tx and ep_rx. 
Update once tx/rx context support + * is added to dgram */ + uint64_t tx_op_flags; + uint64_t rx_op_flags; + + size_t tx_iov_limit; + size_t rx_iov_limit; + + void *ep_hdr_buf; + struct usd_udp_hdr **ep_hdr_ptr; + } dg; + struct { + struct usdf_connreq *ep_connreq; + int ep_cm_sock; + struct sockaddr_in ep_lcl_addr; + struct usd_dest *ep_dest; + uint32_t ep_rem_peer_id; + uint32_t ep_lcl_peer_id; + + TAILQ_HEAD(,usdf_msg_qe) ep_posted_wqe; + TAILQ_HEAD(usdf_msg_qe_head ,usdf_msg_qe) ep_sent_wqe; + uint32_t ep_fairness_credits; + uint32_t ep_seq_credits; + uint16_t ep_next_tx_seq; + uint16_t ep_last_rx_ack; + int ep_send_nak; + + struct usdf_msg_qe *ep_cur_recv; + uint16_t ep_next_rx_seq; + TAILQ_ENTRY(usdf_ep) ep_ack_link; + + struct usdf_timer_entry *ep_ack_timer; + + TAILQ_ENTRY(usdf_ep) ep_link; + } msg; + struct { + int ep_sock; + struct usdf_av *ep_av; + + } rdm; + } e; +}; +#define ep_ftou(FEP) container_of(FEP, struct usdf_ep, ep_fid) +#define ep_fidtou(FID) container_of(FID, struct usdf_ep, ep_fid.fid) +#define ep_utof(EP) (&(EP)->ep_fid) +#define ep_utofid(EP) (&(EP)->ep_fid.fid) + +struct usdf_mr { + struct fid_mr mr_fid; + struct usd_mr *mr_mr; +}; + +struct usdf_cq_hard { + struct usdf_cq *cqh_cq; + struct usd_cq *cqh_ucq; + ofi_atomic32_t cqh_refcnt; + void (*cqh_progress)(struct usdf_cq_hard *hcq); + void (*cqh_post)(struct usdf_cq_hard *hcq, void *context, size_t len, + int prov_errno, uint64_t flags); + TAILQ_ENTRY(usdf_cq_hard) cqh_link; + TAILQ_ENTRY(usdf_cq_hard) cqh_dom_link; +}; + +struct usdf_cq_soft_entry { + void *cse_context; + uint64_t cse_flags; + size_t cse_len; + void *cse_buf; + uint64_t cse_data; + int cse_prov_errno; +}; + +struct usdf_cq { + struct fid_cq cq_fid; + ofi_atomic32_t cq_refcnt; + struct usdf_domain *cq_domain; + struct fi_cq_attr cq_attr; + uint8_t cq_is_soft; + uint8_t cq_waiting; + + union { + int fd; + struct fi_mutex_cond mutex_cond; + } object; + + union { + struct { + struct usd_cq *cq_cq; + } hard; + struct { + struct usdf_cq_soft_entry *cq_comps; + struct usdf_cq_soft_entry *cq_end; + struct usdf_cq_soft_entry *cq_head; + struct usdf_cq_soft_entry *cq_tail; + /* Last operation used to distinguish full vs empty. 
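[Editor's note] The cq_last_op byte declared just below is the usual trick for telling a full ring from an empty one when the head and tail pointers coincide. A generic sketch of the technique follows; the names are hypothetical and this is not the provider's actual helper code, only an illustration of the idea the comment describes:

    /* Illustrative only: when head == tail, the last operation decides. */
    enum { OP_READ, OP_WRITE };

    struct soft_ring {
            void *head;
            void *tail;
            int last_op;    /* OP_READ or OP_WRITE */
    };

    static int ring_is_empty(const struct soft_ring *r)
    {
            return r->head == r->tail && r->last_op == OP_READ;
    }

    static int ring_is_full(const struct soft_ring *r)
    {
            return r->head == r->tail && r->last_op == OP_WRITE;
    }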
*/ + uint8_t cq_last_op; + TAILQ_HEAD(,usdf_cq_hard) cq_list; + } soft; + } c; + struct usd_completion cq_comp; + struct fi_ops_cq cq_ops; +}; + +enum { + USDF_SOFT_CQ_READ, + USDF_SOFT_CQ_WRITE +}; + +#define cq_ftou(FCQ) container_of(FCQ, struct usdf_cq, cq_fid) +#define cq_fidtou(FID) container_of(FID, struct usdf_cq, cq_fid.fid) +#define cq_utof(CQ) (&(CQ)->cq_fid) + +struct usdf_err_data_entry { + struct slist_entry entry; + uint8_t seen; + uint8_t err_data[]; +}; + +struct usdf_event { + uint32_t ue_event; + void *ue_buf; + size_t ue_len; + uint64_t ue_flags; +}; + +struct usdf_eq { + struct fid_eq eq_fid; + struct usdf_fabric *eq_fabric; + ofi_atomic32_t eq_refcnt; + + pthread_spinlock_t eq_lock; + + struct fi_eq_err_entry *eq_ev_buf; + struct usdf_event *eq_ev_ring; + struct usdf_event *eq_ev_head; + struct usdf_event *eq_ev_tail; + struct usdf_event *eq_ev_end; + int eq_ev_ring_size; + ofi_atomic32_t eq_num_events; + + /* various ways to wait */ + struct fi_eq_attr eq_attr; + union { + int eq_fd; + }; + + struct slist eq_err_data; + struct fi_ops_eq eq_ops_data; +}; +#define eq_ftou(FEQ) container_of(FEQ, struct usdf_eq, eq_fid) +#define eq_fidtou(FID) container_of(FID, struct usdf_eq, eq_fid.fid) +#define eq_utof(EQ) (&(EQ)->eq_fid) + +/* + * Prototypes + */ + +ssize_t usdf_eq_write_internal(struct usdf_eq *eq, uint32_t event, + const void *buf, size_t len, uint64_t flags); + +/* fi_ops_fabric */ +int usdf_domain_open(struct fid_fabric *fabric, struct fi_info *info, + struct fid_domain **domain, void *context); +int usdf_eq_open(struct fid_fabric *fabric, struct fi_eq_attr *attr, + struct fid_eq **eq, void *context); +int usdf_pep_open(struct fid_fabric *fabric, struct fi_info *info, + struct fid_pep **pep_p, void *context); + +/* fi_ops_domain */ +int usdf_cq_open(struct fid_domain *domain, struct fi_cq_attr *attr, + struct fid_cq **cq_o, void *context); +int usdf_endpoint_open(struct fid_domain *domain, struct fi_info *info, + struct fid_ep **ep, void *context); +int usdf_av_open(struct fid_domain *domain, struct fi_av_attr *attr, + struct fid_av **av_o, void *context); +int usdf_query_atomic(struct fid_domain *domain, enum fi_datatype datatype, + enum fi_op op, struct fi_atomic_attr *attr, uint64_t flags); + +/* Domain name functionality */ +int usdf_domain_getname(uint32_t version, struct usd_device_attrs *dap, + char **name); +bool usdf_domain_checkname(uint32_t version, struct usd_device_attrs *dap, + const char *hint); + +/* fi_ops_mr */ +int usdf_reg_mr(struct fid *fid, const void *buf, size_t len, + uint64_t access, uint64_t offset, uint64_t requested_key, + uint64_t flags, struct fid_mr **mr_o, void *context); +int usdf_regv_mr(struct fid *fid, const struct iovec *iov, + size_t count, uint64_t access, + uint64_t offset, uint64_t requested_key, + uint64_t flags, struct fid_mr **mr, void *context); +int usdf_regattr(struct fid *fid, const struct fi_mr_attr *attr, + uint64_t flags, struct fid_mr **mr); + +/* Fake IBV provider */ +void usdf_setup_fake_ibv_provider(void); + +/* passive endpoint functions */ +int usdf_pep_steal_socket(struct usdf_pep *pep, int *is_bound, int *sock_o); + +/* Utility functions */ +int usdf_catch_dom_attr(uint32_t version, const struct fi_info *hints, + struct fi_domain_attr *dom_attr); +int usdf_catch_tx_attr(uint32_t version, const struct fi_tx_attr *tx_attr); +int usdf_catch_rx_attr(uint32_t version, const struct fi_rx_attr *rx_attr); +struct sockaddr_in *usdf_format_to_sin(const struct fi_info *info, const void *addr); +void 
*usdf_sin_to_format(const struct fi_info *info, void *addr, size_t *len); +void usdf_free_sin_if_needed(const struct fi_info *info, struct sockaddr_in *sin); + +#endif /* _USDF_H_ */ diff --git a/prov/usnic/src/usdf_av.c b/prov/usnic/src/usdf_av.c new file mode 100644 index 00000000000..39b9e1294ad --- /dev/null +++ b/prov/usnic/src/usdf_av.c @@ -0,0 +1,887 @@ +/* + * Copyright (c) 2014-2019, Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "config.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "rdma/fi_errno.h" +#include "ofi_enosys.h" +#include "ofi.h" + +#include "usnic_direct.h" +#include "usnic_ip_utils.h" +#include "libnl_utils.h" +#include "usd.h" +#include "usd_queue.h" + +#include "usdf.h" +#include "usdf_av.h" +#include "usdf_cm.h" +#include "usdf_timer.h" + +#include "fi_ext_usnic.h" + +static int usdf_av_alloc_dest(struct usdf_dest **dest_o) +{ + struct usdf_dest *dest; + + dest = calloc(1, sizeof(**dest_o)); + if (dest == NULL) + return -errno; + + *dest_o = dest; + return 0; +} + +static void usdf_av_free_dest(struct usdf_dest *dest) +{ + LIST_REMOVE(dest, ds_addresses_entry); + + free(dest); +} + +static int usdf_av_close_(struct usdf_av *av) +{ + struct usdf_dest *entry; + + USDF_TRACE_SYS(AV, "\n"); + + pthread_spin_lock(&av->av_lock); + + if (av->av_eq) + ofi_atomic_dec32(&av->av_eq->eq_refcnt); + + ofi_atomic_dec32(&av->av_domain->dom_refcnt); + + while (!LIST_EMPTY(&av->av_addresses)) { + entry = LIST_FIRST(&av->av_addresses); + usdf_av_free_dest(entry); + } + + pthread_spin_destroy(&av->av_lock); + free(av); + + USDF_DBG_SYS(AV, "AV successfully destroyed\n"); + + return 0; +} + +static int usdf_av_close(struct fid *fid) +{ + struct usdf_av *av; + int pending; + + USDF_TRACE_SYS(AV, "\n"); + + av = container_of(fid, struct usdf_av, av_fid.fid); + if (ofi_atomic_get32(&av->av_refcnt) > 0) + return -FI_EBUSY; + + pending = ofi_atomic_get32(&av->av_active_inserts); + assert(pending >= 0); + + if (pending) { + USDF_DBG_SYS(AV, "%d pending inserts, defer closing\n", + pending); + ofi_atomic_set32(&av->av_closing, 1); + } else { + usdf_av_close_(av); + } + + return 0; +} + +static void +usdf_av_insert_async_complete(struct usdf_av_insert *insert) +{ + struct fi_eq_entry entry; + struct usdf_av *av; + int pending; + int closing; + + av = insert->avi_av; + + entry.fid = &av->av_fid.fid; + entry.context = insert->avi_context; + entry.data = insert->avi_successes; + usdf_eq_write_internal(av->av_eq, + FI_AV_COMPLETE, &entry, sizeof(entry), 0); + + usdf_timer_free(av->av_domain->dom_fabric, insert->avi_timer); + + pending = ofi_atomic_dec32(&av->av_active_inserts); + USDF_DBG_SYS(AV, "new active insert value: %d\n", pending); + assert(pending >= 0); + + closing = ofi_atomic_get32(&av->av_closing); + + if (!pending && closing) + usdf_av_close_(av); + + free(insert); +} + +/* + * A request failed, post an error event to the EQ + */ +static void +usdf_post_insert_request_error(struct usdf_av_insert *insert, + struct usdf_av_req *req) +{ + struct fi_eq_err_entry err_entry = {0}; + struct usdf_av *av; + + av = insert->avi_av; + + *req->avr_fi_addr = FI_ADDR_NOTAVAIL; + free(req->avr_dest); + + err_entry.fid = &av->av_fid.fid; + err_entry.context = insert->avi_context; + err_entry.data = req - (struct usdf_av_req *)(insert + 1); + err_entry.err = -req->avr_status; + err_entry.err_data = NULL; + err_entry.err_data_size = 0; + + usdf_eq_write_internal(av->av_eq, 0, + &err_entry, sizeof(err_entry), + USDF_EVENT_FLAG_ERROR); +} + +/* + * Called by progression thread to look for AV completions on this domain + */ +static void +usdf_av_insert_progress(void *v) +{ + int ret; + struct usdf_av_insert *insert; + struct usdf_fabric *fp; + struct usdf_dest *dest; + struct usdf_av_req *req; + struct usdf_av_req *tmpreq; + struct usd_device_attrs *dap; + uint64_t now; + uint8_t *eth; + + insert = v; + fp = 
insert->avi_av->av_domain->dom_fabric; + dap = fp->fab_dev_attrs; + + TAILQ_FOREACH_SAFE(req, tmpreq, &insert->avi_req_list, avr_link) { + + dest = req->avr_dest; + eth = &dest->ds_dest.ds_dest.ds_udp.u_hdr.uh_eth.ether_dhost[0]; + ret = usnic_arp_lookup(dap->uda_ifname, + req->avr_daddr_be, fp->fab_arp_sockfd, eth); + + /* anything besides EAGAIN means request is completed */ + if (ret != EAGAIN) { + TAILQ_REMOVE(&insert->avi_req_list, req, avr_link); + req->avr_status = -ret; + + if (ret == 0) { + ++insert->avi_successes; + *(struct usdf_dest **)req->avr_fi_addr = dest; + + LIST_INSERT_HEAD(&insert->avi_av->av_addresses, + dest, ds_addresses_entry); + } else { + usdf_post_insert_request_error(insert, req); + } + } + } + + /* Time for a new ARP? */ + now = usdf_get_ms(); + if (now - insert->avi_last_arp_time > USDF_AV_ARP_INTERVAL) { + + /* If no more ARP requests left, fail all remaining requests */ + if (insert->avi_arps_left == 0) { + TAILQ_FOREACH(req, &insert->avi_req_list, avr_link) { + req->avr_status = -FI_EHOSTUNREACH; + usdf_post_insert_request_error(insert, req); + } + TAILQ_INIT(&insert->avi_req_list); + + /* Trigger an ARP request for all pending requests */ + } else { + TAILQ_FOREACH_SAFE(req, tmpreq, + &insert->avi_req_list, avr_link) { + ret = usnic_arp_request(req->avr_daddr_be, + fp->fab_arp_sockfd); + if (ret != 0) { + req->avr_status = -ret; + TAILQ_REMOVE(&insert->avi_req_list, + req, avr_link); + usdf_post_insert_request_error(insert, + req); + } + } + + insert->avi_last_arp_time = now; + --insert->avi_arps_left; + } + } + + /* If no more pending requests, all done! */ + if (TAILQ_EMPTY(&insert->avi_req_list)) { + usdf_av_insert_async_complete(insert); + } else { + /* retry in 1 ms */ + usdf_timer_set(fp, insert->avi_timer, 1); + } + +} + +static int +usdf_am_insert_async(struct fid_av *fav, const void *addr, size_t count, + fi_addr_t *fi_addr, uint64_t flags, void *context) +{ + const struct sockaddr_in *sin; + const char **addr_str; + struct sockaddr_in *cur_sin; + struct usd_device_attrs *dap; + struct usdf_av_insert *insert; + struct usdf_av_req *req; + struct usdf_av *av; + struct usdf_fabric *fp; + struct usd_dest *u_dest; + struct fi_info *info; + int ret; + size_t i; + bool addr_format_str; + + USDF_TRACE_SYS(AV, "\n"); + + if ((flags & ~(FI_MORE)) != 0) + return -FI_EBADFLAGS; + + av = av_ftou(fav); + fp = av->av_domain->dom_fabric; + dap = fp->fab_dev_attrs; + info = av->av_domain->dom_info; + addr_format_str = (info->addr_format == FI_ADDR_STR); + + if (av->av_eq == NULL) { + return -FI_ENOEQ; + } + + sin = addr; + addr_str = (const char **)addr; + + /* allocate an insert record and N requests */ + insert = calloc(1, sizeof(*insert) + count * sizeof(*req)); + if (insert == NULL) { + return -errno; + } + insert->avi_av = av; + insert->avi_context = context; + ret = usdf_timer_alloc(usdf_av_insert_progress, insert, + &insert->avi_timer); + if (ret != 0) { + goto fail; + } + TAILQ_INIT(&insert->avi_req_list); + insert->avi_arps_left = USDF_AV_MAX_ARPS; + + ret = ofi_atomic_inc32(&av->av_active_inserts); + USDF_DBG_SYS(AV, "new active insert value: %d\n", ret); + + /* If no addresses, complete now */ + if (count == 0) { + usdf_av_insert_async_complete(insert); + return 0; + } + + req = (struct usdf_av_req *)(insert + 1); + + for (i = 0; i < count; i++) { + req->avr_fi_addr = &fi_addr[i]; + + if (addr_format_str) { + usdf_str_toaddr(addr_str[i], &cur_sin); + if (NULL == cur_sin) { + ret = -FI_ENOMEM; + goto fail; + } + sin = cur_sin; + } + + /* find the address 
we actually need to look up */ + ret = usnic_nl_rt_lookup(dap->uda_ipaddr_be, + sin->sin_addr.s_addr, dap->uda_ifindex, + &req->avr_daddr_be); + if (ret != 0) { + if (ret == EHOSTUNREACH) { + req->avr_status = -FI_EHOSTUNREACH; + usdf_post_insert_request_error(insert, req); + } else { + ret = -ret; + goto fail; + } + + } else { + if (req->avr_daddr_be == 0) { + req->avr_daddr_be = sin->sin_addr.s_addr; + } + req->avr_dest = calloc(1, sizeof(*req->avr_dest)); + if (req->avr_dest == NULL) { + ret = -FI_ENOMEM; + goto fail; + } + u_dest = &req->avr_dest->ds_dest; + usd_fill_udp_dest(u_dest, dap, + sin->sin_addr.s_addr, sin->sin_port); + u_dest->ds_dest.ds_udp.u_hdr.uh_ip.frag_off |= + htons(IP_DF); + + TAILQ_INSERT_TAIL(&insert->avi_req_list, req, avr_link); + } + + if (addr_format_str) { + free(cur_sin); + cur_sin = NULL; + } else { + ++sin; + } + + ++req; + } + + /* resolve all addresses we can */ + usdf_av_insert_progress(insert); + + return 0; + +fail: + if (insert != NULL) { + if (insert->avi_timer != NULL) { + usdf_timer_free(fp, insert->avi_timer); + } + free(insert); + } + return ret; +} + +static int +usdf_am_insert_sync(struct fid_av *fav, const void *addr, size_t count, + fi_addr_t *fi_addr, uint64_t flags, void *context) +{ + const struct sockaddr_in *sin; + const char **addr_str; + struct sockaddr_in *cur_sin; + struct usdf_av *av; + struct usd_dest *u_dest; + struct usdf_dest *dest; + struct fi_info *info; + int ret_count; + int ret; + int *errors; + uint32_t api_version; + size_t i; + bool addr_format_str; + + USDF_TRACE_SYS(AV, "\n"); + + ret_count = 0; + av = av_ftou(fav); + api_version = av->av_domain->dom_fabric->fab_attr.fabric->api_version; + info = av->av_domain->dom_info; + addr_format_str = (info->addr_format == FI_ADDR_STR); + errors = context; + + /* Screen out unsupported flags. */ + if ((flags & ~(FI_MORE|FI_SYNC_ERR)) != 0) + return -FI_EBADFLAGS; + + /* If user set FI_SYNC_ERR, we have to report back to user's buffer. 
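[Editor's note] For context, FI_SYNC_ERR (accepted only from API version 1.5, as the check below enforces) reuses the context argument of fi_av_insert() as a per-address array of int error codes, which this function zeroes and then fills in. A hypothetical caller-side sketch, assuming av is an already opened synchronous AV and that <rdma/fabric.h> and <rdma/fi_domain.h> are included:

    /* Illustrative caller usage, not part of this patch. */
    int sync_err[2] = { 0 };
    fi_addr_t addrs[2];
    struct sockaddr_in peers[2];   /* assume both entries were filled in earlier */

    int n = fi_av_insert(av, peers, 2, addrs, FI_SYNC_ERR, sync_err);
    /* n counts the successful inserts; for any entry that failed,
     * addrs[i] is FI_ADDR_NOTAVAIL and sync_err[i] holds a nonzero error code. */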
*/ + if (flags & FI_SYNC_ERR) { + if (FI_VERSION_LT(api_version, FI_VERSION(1, 5))) + return -FI_EBADFLAGS; + + memset(errors, 0, sizeof(int) * count); + } + + sin = addr; + addr_str = (const char **)addr; + + /* XXX parallelize, this will also eliminate u_dest silliness */ + for (i = 0; i < count; i++) { + + if (addr_format_str) { + usdf_str_toaddr(addr_str[i], &cur_sin); + if (NULL == cur_sin) { + if (flags & FI_SYNC_ERR) + errors[i] = -ENOMEM; + + return ret_count; + } + sin = cur_sin; + } + + dest = NULL; + u_dest = NULL; + ret = usdf_av_alloc_dest(&dest); + if (ret == 0) { + USDF_DBG_SYS(AV, "usd_create_dest(addr=0x%x, port=0x%x)\n", + ntohl(sin->sin_addr.s_addr), ntohs(sin->sin_port)); + ret = usd_create_dest(av->av_domain->dom_dev, + sin->sin_addr.s_addr, sin->sin_port, + &u_dest); + } + if (ret == 0) { + u_dest->ds_dest.ds_udp.u_hdr.uh_ip.frag_off |= + htons(IP_DF); + dest->ds_dest = *u_dest; + fi_addr[i] = (fi_addr_t)dest; + LIST_INSERT_HEAD(&av->av_addresses, dest, + ds_addresses_entry); + ++ret_count; + } else { + if (flags & FI_SYNC_ERR) + errors[i] = -ret; + + fi_addr[i] = FI_ADDR_NOTAVAIL; + free(dest); + } + free(u_dest); + + if (addr_format_str) { + free(cur_sin); + cur_sin = NULL; + } else { + ++sin; + } + } + + return ret_count; +} + +static int usdf_resolve_addr(const char *node, const char *service, + struct sockaddr_in *in) +{ + struct addrinfo *ai; + int ret; + + struct addrinfo hints = { + .ai_family = AF_INET, + }; + + if (!node || !service || !in) + return -FI_EINVAL; + + ret = getaddrinfo(node, service, &hints, &ai); + if (ret) { + USDF_DBG("getaddrinfo: %s\n", gai_strerror(ret)); + return -FI_EINVAL; + } + + *in = *(struct sockaddr_in *) ai->ai_addr; + + assert(ai->ai_family == AF_INET); + assert(in->sin_family == AF_INET); + + freeaddrinfo(ai); + return ret; +} + +static int usdf_av_insertsvc(struct fid_av *fav, const char *node, + const char *service, fi_addr_t *fi_addr, uint64_t flags, + void *context) +{ + struct sockaddr_in addr; + struct usdf_av *av; + struct fi_info *info; + int ret; + bool addr_format_str; + + USDF_TRACE_SYS(AV, "\n"); + + av = av_ftou(fav); + info = av->av_domain->dom_info; + addr_format_str = (info->addr_format == FI_ADDR_STR); + + if (!fav) + return -FI_EINVAL; + + if (addr_format_str) { + /* string format should not come with service param. 
*/ + if (service) + return -FI_EINVAL; + + ret = fav->ops->insert(fav, &node, 1, fi_addr, flags, context); + } else { + ret = usdf_resolve_addr(node, service, &addr); + if (ret) + goto fail; + + ret = fav->ops->insert(fav, &addr, 1, fi_addr, flags, context); + } + +fail: + return ret; +} + +static int +usdf_am_remove(struct fid_av *fav, fi_addr_t *fi_addr, size_t count, + uint64_t flags) +{ + struct usdf_dest *dest; + size_t i; + + USDF_TRACE_SYS(AV, "\n"); + + for (i = 0; i < count; ++i) { + if (fi_addr[i] != FI_ADDR_NOTAVAIL) { + dest = (struct usdf_dest *)(uintptr_t)fi_addr[i]; + usdf_av_free_dest(dest); + + /* Mark invalid by setting to FI_ADDR_NOTAVAIL*/ + fi_addr[i] = FI_ADDR_NOTAVAIL; + } + } + + return 0; +} + +static int +usdf_am_lookup(struct fid_av *fav, fi_addr_t fi_addr, void *addr, + size_t *addrlen) +{ + struct usdf_dest *dest; + struct usdf_av *av; + struct fi_info *info; + struct sockaddr_in sin = { 0 }; + size_t copylen; + bool addr_format_str; + + USDF_TRACE_SYS(AV, "\n"); + + av = av_ftou(fav); + info = av->av_domain->dom_info; + addr_format_str = (info->addr_format == FI_ADDR_STR); + + if (fi_addr == FI_ADDR_NOTAVAIL) { + USDF_WARN_SYS(AV, "invalid address, can't lookup\n"); + return -FI_EINVAL; + } + + dest = (struct usdf_dest *)(uintptr_t)fi_addr; + if (*addrlen < sizeof(sin)) { + copylen = *addrlen; + } else { + copylen = sizeof(sin); + } + + sin.sin_family = AF_INET; + usd_expand_dest(&dest->ds_dest, &sin.sin_addr.s_addr, &sin.sin_port); + + if (addr_format_str) + usdf_addr_tostr(&sin, addr, addrlen); + else { + memcpy(addr, &sin, copylen); + *addrlen = sizeof(sin); + } + return 0; +} + +static const char * +usdf_av_straddr(struct fid_av *fav, const void *addr, + char *buf, size_t *len) +{ + struct fi_info *info; + struct usdf_av *av; + + if (!len || !addr || !buf) + return NULL; + + av = av_fidtou(fav); + info = av->av_domain->dom_info; + + return ofi_straddr(buf, len, info->addr_format, addr); +} + +static int +usdf_av_bind(struct fid *fid, struct fid *bfid, uint64_t flags) +{ + struct usdf_av *av; + + USDF_TRACE_SYS(AV, "\n"); + + av = av_fidtou(fid); + + switch (bfid->fclass) { + case FI_CLASS_EQ: + if (av->av_eq != NULL) { + return -FI_EINVAL; + } + av->av_eq = eq_fidtou(bfid); + ofi_atomic_inc32(&av->av_eq->eq_refcnt); + break; + default: + return -FI_EINVAL; + } + + return 0; +} + +static struct fi_ops usdf_av_fi_ops = { + .size = sizeof(struct fi_ops), + .close = usdf_av_close, + .bind = usdf_av_bind, + .control = fi_no_control, + .ops_open = usdf_av_ops_open, +}; + +static struct fi_ops_av usdf_am_ops_async = { + .size = sizeof(struct fi_ops_av), + .insert = usdf_am_insert_async, + .insertsvc = usdf_av_insertsvc, + .insertsym = fi_no_av_insertsym, + .remove = usdf_am_remove, + .lookup = usdf_am_lookup, + .straddr = usdf_av_straddr +}; + +static struct fi_ops_av usdf_am_ops_sync = { + .size = sizeof(struct fi_ops_av), + .insert = usdf_am_insert_sync, + .insertsvc = usdf_av_insertsvc, + .insertsym = fi_no_av_insertsym, + .remove = usdf_am_remove, + .lookup = usdf_am_lookup, + .straddr = usdf_av_straddr +}; + +static int usdf_av_process_attr(struct fi_av_attr *attr) +{ + USDF_TRACE_SYS(AV, "\n"); + + if (attr == NULL) { + USDF_WARN_SYS(AV, "NULL AV attribute structure is invalid\n"); + return -FI_EINVAL; + } + + if (attr->name || attr->map_addr || (attr->flags & FI_READ)) { + USDF_WARN_SYS(AV, "named AVs are not supported\n"); + return -FI_ENOSYS; + } + + if (attr->flags & ~FI_EVENT) { + USDF_WARN_SYS(AV, "invalid flag, only FI_EVENT is supported\n"); + 
return -FI_EINVAL; + } + + if (attr->rx_ctx_bits) { + USDF_WARN_SYS(AV, "scalable endpoints not supported\n"); + return -FI_EINVAL; + } + + if (attr->ep_per_node > 1) + USDF_WARN_SYS(AV, "ep_per_node not supported, ignoring\n"); + + switch (attr->type) { + case FI_AV_UNSPEC: + USDF_DBG_SYS(AV, "no AV type specified, using FI_AV_MAP\n"); + case FI_AV_MAP: + break; + case FI_AV_TABLE: + USDF_DBG_SYS(AV, "FI_AV_TABLE is unsupported\n"); + return -FI_ENOSYS; + default: + USDF_WARN_SYS(AV, "unknown AV type %d, not supported", + attr->type); + return -FI_EINVAL; + } + + return FI_SUCCESS; +} + +int +usdf_av_open(struct fid_domain *domain, struct fi_av_attr *attr, + struct fid_av **av_o, void *context) +{ + struct usdf_domain *udp; + struct usdf_av *av; + int ret; + + USDF_TRACE_SYS(AV, "\n"); + + if (!av_o) { + USDF_WARN_SYS(AV, "provided AV pointer can not be NULL\n"); + return -FI_EINVAL; + } + + ret = usdf_av_process_attr(attr); + if (ret) + return ret; + + udp = dom_ftou(domain); + + av = calloc(1, sizeof(*av)); + if (av == NULL) { + return -FI_ENOMEM; + } + + if (attr->flags & FI_EVENT) { + av->av_fid.ops = &usdf_am_ops_async; + } else { + av->av_fid.ops = &usdf_am_ops_sync; + } + + LIST_INIT(&av->av_addresses); + + av->av_fid.fid.fclass = FI_CLASS_AV; + av->av_fid.fid.context = context; + av->av_fid.fid.ops = &usdf_av_fi_ops; + av->av_flags = attr->flags; + + pthread_spin_init(&av->av_lock, PTHREAD_PROCESS_PRIVATE); + ofi_atomic_initialize32(&av->av_active_inserts, 0); + ofi_atomic_initialize32(&av->av_closing, 0); + + ofi_atomic_initialize32(&av->av_refcnt, 0); + ofi_atomic_inc32(&udp->dom_refcnt); + av->av_domain = udp; + + *av_o = av_utof(av); + return 0; +} + +/* Look up if the sin address has been already inserted. + * if match, return the address of the dest pointer. otherwise, + * returns FI_ADDR_NOTAVAIL. + */ +fi_addr_t usdf_av_lookup_addr(struct usdf_av *av, + const struct sockaddr_in *sin) +{ + struct usdf_dest *cur; + struct usd_udp_hdr u_hdr; + + for (cur = av->av_addresses.lh_first; cur; + cur = cur->ds_addresses_entry.le_next) { + u_hdr = cur->ds_dest.ds_dest.ds_udp.u_hdr; + if (sin->sin_addr.s_addr == u_hdr.uh_ip.daddr && + sin->sin_port == u_hdr.uh_udp.dest) + return (fi_addr_t)(uintptr_t)cur; + } + return FI_ADDR_NOTAVAIL; +} + +/* Return sockaddr_in pointer. Must be used with usdf_free_sin_if_needed() + * to cleanup properly. + */ +struct sockaddr_in *usdf_format_to_sin(const struct fi_info *info, const void *addr) +{ + struct sockaddr_in *sin; + + if (!info) + return (struct sockaddr_in *)addr; + + switch (info->addr_format) { + case FI_FORMAT_UNSPEC: + case FI_SOCKADDR: + case FI_SOCKADDR_IN: + return (struct sockaddr_in *)addr; + case FI_ADDR_STR: + usdf_str_toaddr(addr, &sin); + return sin; + default: + return NULL; + } +} + +/* Utility function to free the sockaddr_in allocated from usdf_format_to_sin() + */ +void usdf_free_sin_if_needed(const struct fi_info *info, struct sockaddr_in *sin) +{ + if (info && info->addr_format == FI_ADDR_STR) + free(sin); +} + +/* Convert sockaddr_in pointer to appropriate format. + * If conversion happens, destroy the origin. 
(to minimize cleaning up code) + */ +void *usdf_sin_to_format(const struct fi_info *info, void *addr, size_t *len) +{ + size_t addr_strlen; + char *addrstr; + + if (!info) + return addr; + + switch (info->addr_format) { + case FI_FORMAT_UNSPEC: + case FI_SOCKADDR: + case FI_SOCKADDR_IN: + if (len) + *len = sizeof(struct sockaddr_in); + return addr; + case FI_ADDR_STR: + addrstr = calloc(1, USDF_ADDR_STR_LEN); + if (addrstr == NULL) { + USDF_DBG_SYS(AV, "memory allocation failed\n"); + return NULL; + } + + addr_strlen = USDF_ADDR_STR_LEN; + usdf_addr_tostr(addr, addrstr, &addr_strlen); + + if (len) + *len = addr_strlen; + + free(addr); + return addrstr; + default: + return NULL; + } + +} diff --git a/prov/usnic/src/usdf_av.h b/prov/usnic/src/usdf_av.h new file mode 100644 index 00000000000..24e3cd511e6 --- /dev/null +++ b/prov/usnic/src/usdf_av.h @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2014-2019, Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ +#ifndef _USDF_AV_H_ +#define _USDF_AV_H_ + +#include "usd_dest.h" + +#define USDF_AV_MAX_ARPS 3 +#define USDF_AV_ARP_INTERVAL 1000 + +struct usdf_rdm_connection; + +/* + * libfabric version of dest + */ +struct usdf_dest { + struct usd_dest ds_dest; + + LIST_ENTRY(usdf_dest) ds_addresses_entry; +}; + +/* struct used to track async insert requests */ +struct usdf_av_req { + fi_addr_t *avr_fi_addr; + struct usdf_dest *avr_dest; + int avr_status; + + uint32_t avr_daddr_be; + + TAILQ_ENTRY(usdf_av_req) avr_link; +}; + +struct usdf_av_insert { + struct usdf_av *avi_av; + void *avi_context; + + struct usdf_timer_entry *avi_timer; + + uint32_t avi_successes; + TAILQ_HEAD(,usdf_av_req) avi_req_list; + uint32_t avi_arps_left; + uint64_t avi_last_arp_time; +}; + +struct usdf_av { + struct fid_av av_fid; + struct usdf_domain *av_domain; + uint64_t av_flags; + struct usdf_eq *av_eq; + ofi_atomic32_t av_refcnt; + ofi_atomic32_t av_closing; + ofi_atomic32_t av_active_inserts; + pthread_spinlock_t av_lock; + LIST_HEAD(, usdf_dest) av_addresses; +}; + +#define av_ftou(FAV) container_of(FAV, struct usdf_av, av_fid) +#define av_fidtou(FID) container_of(FID, struct usdf_av, av_fid.fid) +#define av_utof(AV) (&(AV)->av_fid) + +fi_addr_t usdf_av_lookup_addr(struct usdf_av *av, + const struct sockaddr_in *sin); + +#endif /* _USDF_AV_H_ */ diff --git a/prov/usnic/src/usdf_cm.c b/prov/usnic/src/usdf_cm.c new file mode 100644 index 00000000000..cc2198e9b7a --- /dev/null +++ b/prov/usnic/src/usdf_cm.c @@ -0,0 +1,321 @@ +/* + * Copyright (c) 2014-2019, Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "config.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include "ofi.h" +#include "ofi_file.h" + +#include "usnic_direct.h" +#include "usdf.h" +#include "usdf_endpoint.h" +#include "usdf_dgram.h" +#include "usdf_av.h" +#include "usdf_cm.h" + +/* Given a connection request structure containing data, make a copy of the data + * that can be accessed in error entries on the EQ. The return value is the size + * of the data stored in the error entry. If the return value is a non-negative + * value, then the function has suceeded and the size and output data can be + * assumed to be valid. If the function fails, then the data will be NULL and + * the size will be a negative error value. + */ +static int usdf_cm_generate_err_data(struct usdf_eq *eq, + struct usdf_connreq *crp, void **data) +{ + struct usdf_err_data_entry *err_data_entry; + struct usdf_connreq_msg *reqp; + size_t entry_size; + size_t data_size; + + if (!eq || !crp || !data) { + USDF_DBG_SYS(EP_CTRL, + "eq, crp, or data is NULL.\n"); + return -FI_EINVAL; + } + + /* Initialize to NULL so data can't be used in the error case. */ + *data = NULL; + + reqp = (struct usdf_connreq_msg *) crp->cr_data; + + /* This is a normal case, maybe there was no data. */ + if (!reqp || !reqp->creq_datalen) + return 0; + + data_size = reqp->creq_datalen; + + entry_size = sizeof(*err_data_entry) + data_size; + + err_data_entry = calloc(1, entry_size); + if (!err_data_entry) { + USDF_WARN_SYS(EP_CTRL, + "failed to allocate err data entry\n"); + return -FI_ENOMEM; + } + + /* This data should be copied and owned by the provider. Keep + * track of it in the EQ, this will be freed in the next EQ read + * call after it has been read. + */ + memcpy(err_data_entry->err_data, reqp->creq_data, data_size); + slist_insert_tail(&err_data_entry->entry, &eq->eq_err_data); + + *data = err_data_entry->err_data; + + return data_size; +} + +/* Report a connection management related failure. Sometimes there is connection + * event data that should be copied into the generated event. If the copy_data + * parameter evaluates to true, then the data will be copied. + * + * If data is to be generated for the error entry, then the connection request + * is assumed to have the data size in host order. If something fails during + * processing of the error data, then the EQ entry will still be generated + * without the error data. + */ +void usdf_cm_report_failure(struct usdf_connreq *crp, int error, bool copy_data) +{ + struct fi_eq_err_entry err = {0}; + struct usdf_pep *pep; + struct usdf_ep *ep; + struct usdf_eq *eq; + fid_t fid; + int ret; + + USDF_DBG_SYS(EP_CTRL, "error=%d (%s)\n", error, fi_strerror(error)); + + pep = crp->cr_pep; + ep = crp->cr_ep; + + if (ep != NULL) { + fid = ep_utofid(ep); + eq = ep->ep_eq; + ep->ep_domain->dom_peer_tab[ep->e.msg.ep_rem_peer_id] = NULL; + } else { + fid = pep_utofid(pep); + eq = pep->pep_eq; + } + + /* Try to generate the space necessary for the error data. If the + * function returns a number greater than or equal to 0, then it was a + * success. The return value is the size of the data. 
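+	 * A negative return simply means no error data could be attached;
+	 * the error event itself is still written to the EQ below.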
+ */ + if (copy_data) { + ret = usdf_cm_generate_err_data(eq, crp, &err.err_data); + if (ret >= 0) + err.err_data_size = ret; + } + + err.fid = fid; + err.err = -error; + + usdf_eq_write_internal(eq, 0, &err, sizeof(err), USDF_EVENT_FLAG_ERROR); +} + +/* A wrapper to core function to translate string address to + * sockaddr_in type. We are expecting a NULL sockaddr_in**. + * The core function will allocated it for us. The caller HAS TO FREE it. + */ +int usdf_str_toaddr(const char *str, struct sockaddr_in **outaddr) +{ + uint32_t type; + size_t size; + int ret; + + type = FI_SOCKADDR_IN; + + /* call the core function. The core always allocate the addr for us. */ + ret = ofi_str_toaddr(str, &type, (void **)outaddr, &size); + +#if ENABLE_DEBUG + char outstr[USDF_ADDR_STR_LEN]; + size_t out_size = USDF_ADDR_STR_LEN; + + inet_ntop(AF_INET, &((*outaddr)->sin_addr), outstr, out_size); + USDF_DBG_SYS(EP_CTRL, + "%s(string) converted to addr :%s:%u(inet)\n", + str, outstr, ntohs((*outaddr)->sin_port)); +#endif + + return ret; +} + +/* A wrapper to core function to translate sockaddr_in address to + * string. This function is not allocating any memory. We are expected + * an allocated buffer. + */ +const char *usdf_addr_tostr(const struct sockaddr_in *sin, + char *addr_str, size_t *size) +{ + const char *ret; + + ret = ofi_straddr(addr_str, size, FI_SOCKADDR_IN, sin); + +#if ENABLE_DEBUG + char outstr[USDF_ADDR_STR_LEN]; + size_t out_size = USDF_ADDR_STR_LEN; + + inet_ntop(AF_INET, &sin->sin_addr, outstr, out_size); + USDF_DBG_SYS(EP_CTRL, + "%s:%d(inet) converted to %s(string)\n", + outstr, ntohs(sin->sin_port), addr_str); +#endif + + return ret; +} + +/* + * Return local address of an EP + */ +static int usdf_cm_copy_name(struct fi_info *info, struct sockaddr_in *sin, + void *addr, size_t *addrlen) +{ + int ret; + char addr_str[USDF_ADDR_STR_LEN]; + size_t len; + + USDF_TRACE_SYS(EP_CTRL, "\n"); + + ret = FI_SUCCESS; + switch (info->addr_format) { + case FI_ADDR_STR: + len = USDF_ADDR_STR_LEN; + usdf_addr_tostr(sin, addr_str, &len); + snprintf(addr, MIN(len, *addrlen), "%s", addr_str); + break; + case FI_SOCKADDR: + case FI_SOCKADDR_IN: + len = sizeof(*sin); + memcpy(addr, sin, MIN(len, *addrlen)); + break; + default: + return -FI_EINVAL; + } + + /* If the buffer is too small, tell the user. */ + if (*addrlen < len) + ret = -FI_ETOOSMALL; + + /* Always return the actual size. */ + *addrlen = len; + return ret; +} + +int usdf_cm_dgram_getname(fid_t fid, void *addr, size_t *addrlen) +{ + int ret; + struct usdf_ep *ep; + struct sockaddr_in sin; + struct fi_info *info; + socklen_t slen; + + USDF_TRACE_SYS(EP_CTRL, "\n"); + + ep = ep_fidtou(fid); + info = ep->ep_domain->dom_info; + + memset(&sin, 0, sizeof(sin)); + if (ep->e.dg.ep_qp == NULL) { + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = + ep->ep_domain->dom_fabric->fab_dev_attrs->uda_ipaddr_be; + sin.sin_port = 0; + } else { + slen = sizeof(sin); + ret = getsockname(ep->e.dg.ep_sock, (struct sockaddr *)&sin, &slen); + if (ret == -1) { + return -errno; + } + assert(((struct sockaddr *)&sin)->sa_family == AF_INET); + assert(slen == sizeof(sin)); + assert(sin.sin_addr.s_addr == + ep->ep_domain->dom_fabric->fab_dev_attrs->uda_ipaddr_be); + } + + return usdf_cm_copy_name(info, &sin, addr, addrlen); +} + +/* Checks that the given address is actually a sockaddr_in of appropriate + * length. "addr_format" is an FI_ constant like FI_SOCKADDR_IN indicating the + * claimed type of the given address. 
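+ * Only FI_SOCKADDR and FI_SOCKADDR_IN are accepted; any other format is
+ * reported as unknown/unsupported.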
+ * + * Returns true if address is actually a sockaddr_in, false otherwise. + * + * Upon successful return, "addr" can be safely cast to either + * "struct sockaddr_in *" or "struct sockaddr *". + * + * "addr" should not be NULL. + */ +bool usdf_cm_addr_is_valid_sin(void *addr, size_t addrlen, uint32_t addr_format) +{ + assert(addr != NULL); + + switch (addr_format) { + case FI_SOCKADDR_IN: + case FI_SOCKADDR: + if (addrlen != sizeof(struct sockaddr_in)) { + USDF_WARN("addrlen is incorrect\n"); + return false; + } + if (((struct sockaddr *)addr)->sa_family != AF_INET) { + USDF_WARN("unknown/unsupported addr_format\n"); + return false; + } + return true; + default: + USDF_WARN("unknown/unsupported addr_format\n"); + return false; + } +} diff --git a/prov/usnic/src/usdf_cm.h b/prov/usnic/src/usdf_cm.h new file mode 100644 index 00000000000..d361818055b --- /dev/null +++ b/prov/usnic/src/usdf_cm.h @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2014-2019, Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ +#ifndef _USDF_CM_H_ +#define _USDF_CM_H_ + +#include +#include +#include +#include +#include + +#define USDF_MAX_CONN_DATA 256 + +struct usdf_connreq_msg { + uint32_t creq_peer_id; + uint32_t creq_ipaddr; + uint32_t creq_port; + uint32_t creq_result; + uint32_t creq_reason; + uint32_t creq_datalen; + uint8_t creq_data[]; +} __attribute__((packed)); + +struct usdf_connreq { + struct fid handle; + int cr_sockfd; + struct usdf_pep *cr_pep; + struct usdf_ep *cr_ep; + TAILQ_ENTRY(usdf_connreq) cr_link; + + struct usdf_poll_item cr_pollitem; + + uint8_t *cr_ptr; + size_t cr_resid; + + size_t cr_datalen; + uint8_t cr_data[]; +}; + +void usdf_cm_report_failure(struct usdf_connreq *crp, int error, + bool skip_data); + +int usdf_cm_dgram_getname(fid_t fid, void *addr, size_t *addrlen); + +bool usdf_cm_addr_is_valid_sin(void *addr, size_t addrlen, + uint32_t addr_format); + +int usdf_str_toaddr(const char *str, struct sockaddr_in **outaddr); +const char *usdf_addr_tostr(const struct sockaddr_in *sin, + char *addr_str, size_t *size); + +#endif /* _USDF_CM_H_ */ diff --git a/prov/usnic/src/usdf_cq.c b/prov/usnic/src/usdf_cq.c new file mode 100644 index 00000000000..604ca15ad6f --- /dev/null +++ b/prov/usnic/src/usdf_cq.c @@ -0,0 +1,1333 @@ +/* + * Copyright (c) 2014-2017, Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "config.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include "ofi.h" +#include "ofi_enosys.h" + +#include "usnic_direct.h" +#include "usd.h" +#include "usdf.h" +#include "usdf_av.h" +#include "usdf_progress.h" +#include "usdf_cq.h" +#include "usd_ib_cmd.h" +#include "usdf_wait.h" + +static inline int usdf_cqe_to_flags(struct usd_completion *comp) +{ + switch (comp->uc_type) { + case USD_COMPTYPE_SEND: + return (FI_MSG | FI_SEND); + case USD_COMPTYPE_RECV: + return (FI_MSG | FI_RECV); + default: + USDF_DBG_SYS(CQ, "WARNING: unknown completion type! (%d)\n", + comp->uc_type); + return 0; + } + +} + +static ssize_t +usdf_cq_readerr(struct fid_cq *fcq, struct fi_cq_err_entry *entry, + uint64_t flags) +{ + struct usdf_cq *cq; + uint32_t api_version; + + USDF_TRACE_SYS(CQ, "\n"); + + cq = container_of(fcq, struct usdf_cq, cq_fid); + api_version = cq->cq_domain->dom_fabric->fab_attr.fabric->api_version; + + // The return values are analogous to sockets cq_readerr + if (cq->cq_comp.uc_status == 0) { + return -FI_EAGAIN; + } + + entry->op_context = cq->cq_comp.uc_context; + entry->flags = 0; + switch (cq->cq_comp.uc_status) { + case USD_COMPSTAT_SUCCESS: + entry->prov_errno = FI_SUCCESS; + break; + case USD_COMPSTAT_ERROR_CRC: + entry->prov_errno = FI_ECRC; + break; + case USD_COMPSTAT_ERROR_TRUNC: + entry->prov_errno = FI_ETRUNC; + break; + case USD_COMPSTAT_ERROR_TIMEOUT: + entry->prov_errno = FI_ETIMEDOUT; + break; + case USD_COMPSTAT_ERROR_INTERNAL: + default: + entry->prov_errno = FI_EOTHER; + break; + } + entry->err = entry->prov_errno; + + cq->cq_comp.uc_status = 0; + + /* We don't have err_data to give back to the user. */ + if (FI_VERSION_GE(api_version, FI_VERSION(1, 5))) + entry->err_data_size = 0; + + return 1; +} + +static ssize_t +usdf_cq_readerr_soft(struct fid_cq *fcq, struct fi_cq_err_entry *entry, + uint64_t flags) +{ + struct usdf_cq *cq; + struct usdf_cq_soft_entry *tail; + + USDF_TRACE_SYS(CQ, "\n"); + + cq = container_of(fcq, struct usdf_cq, cq_fid); + + tail = cq->c.soft.cq_tail; + + entry->op_context = tail->cse_context; + entry->flags = 0; + entry->prov_errno = tail->cse_prov_errno; + entry->err = entry->prov_errno; + + tail++; + if (tail == cq->c.soft.cq_end) { + tail = cq->c.soft.cq_comps; + } + cq->c.soft.cq_tail = tail; + + return 1; +} + +/* Completion lengths should reflect the length given by the application to the + * send/recv call. This means we need to update the lengths for both prefix and + * non-prefix send paths. + * + * RECEIVE COMPLETIONS + * + * Non-prefix: the application isn't aware of the usd_udp_hdr struct. Default + * completion semantics include this in the completion length since it is part + * of the send. + * + * Prefix: the application has allocated a buffer that includes the advertised + * prefix size. For performance reasons our advertised prefix size is not the + * same size as hour headers. To reflect the correct size we need to add the + * size of the padding. + * + * SEND COMPLETIONS + * The send completions are dependent upon the wp_len value that is set by the + * library when using the underscore prefixed variants of the usd functions or + * by the usd library when using the non-underscore prefixed variants. + * Currently all send functions have been unified to report wp_len as the + * length of the payload. 
This means that adjustments need to be made when in + * libfabric prefix mode. + */ +static inline void usdf_cq_adjust_len(struct usd_completion *src, + size_t *len) +{ + struct usdf_ep *ep = src->uc_qp->uq_context; + + if (src->uc_type == USD_COMPTYPE_RECV) { + if (ep->ep_mode & FI_MSG_PREFIX) + *len += (USDF_HDR_BUF_ENTRY - + sizeof(struct usd_udp_hdr)); + else + *len -= sizeof(struct usd_udp_hdr); + } else { + if (ep->ep_mode & FI_MSG_PREFIX) + *len += USDF_HDR_BUF_ENTRY; + } +} + +static inline ssize_t +usdf_cq_copy_cq_entry(void *dst, struct usd_completion *src, + enum fi_cq_format format) +{ + struct fi_cq_entry *ctx_entry; + struct fi_cq_msg_entry *msg_entry; + struct fi_cq_data_entry *data_entry; + + switch (format) { + case FI_CQ_FORMAT_CONTEXT: + ctx_entry = (struct fi_cq_entry *)dst; + ctx_entry->op_context = src->uc_context; + break; + case FI_CQ_FORMAT_MSG: + msg_entry = (struct fi_cq_msg_entry *)dst; + msg_entry->op_context = src->uc_context; + msg_entry->flags = usdf_cqe_to_flags(src); + msg_entry->len = src->uc_bytes; + + usdf_cq_adjust_len(src, &msg_entry->len); + + break; + case FI_CQ_FORMAT_DATA: + data_entry = (struct fi_cq_data_entry *)dst; + data_entry->op_context = src->uc_context; + data_entry->flags = usdf_cqe_to_flags(src); + data_entry->len = src->uc_bytes; + data_entry->buf = 0; /* XXX */ + data_entry->data = 0; + + usdf_cq_adjust_len(src, &data_entry->len); + + break; + default: + USDF_WARN("unexpected CQ format, internal error\n"); + return -FI_EOPNOTSUPP; + } + + return FI_SUCCESS; +} + +/* + * poll a hard CQ + * Since this routine is an inline and is always called with format as + * a constant, I am counting on the compiler optimizing away all the switches + * on format. + */ +static inline ssize_t +usdf_cq_read_common(struct fid_cq *fcq, void *buf, size_t count, + enum fi_cq_format format) +{ + struct usdf_cq *cq; + struct usdf_fabric *fab; + size_t copylen; + size_t copied; + uint8_t *dest; + ssize_t ret; + + cq = cq_ftou(fcq); + fab = cq->cq_domain->dom_fabric; + + if (cq->cq_comp.uc_status != USD_COMPSTAT_SUCCESS) + return -FI_EAVAIL; + + switch (format) { + case FI_CQ_FORMAT_CONTEXT: + copylen = sizeof(struct fi_cq_entry); + break; + case FI_CQ_FORMAT_MSG: + copylen = sizeof(struct fi_cq_msg_entry); + break; + case FI_CQ_FORMAT_DATA: + copylen = sizeof(struct fi_cq_data_entry); + break; + default: + USDF_WARN_SYS(CQ, "unexpected CQ format, internal error\n"); + return -FI_EOPNOTSUPP; + } + + dest = buf; + + for (copied = 0; copied < count; copied++) { + ret = usd_poll_cq(cq->c.hard.cq_cq, &cq->cq_comp); + if (ret == -EAGAIN) + break; + + if (cq->cq_comp.uc_status != USD_COMPSTAT_SUCCESS) { + if (copied == 0) + return -FI_EAVAIL; + + break; + } + + ret = usdf_cq_copy_cq_entry(dest, &cq->cq_comp, format); + if (ret < 0) + return ret; + + dest += copylen; + } + + if (cq->cq_waiting) { + cq->cq_waiting = false; + ofi_atomic_dec32(&fab->num_blocked_waiting); + } + + return copied > 0 ? 
copied : -FI_EAGAIN; +} + +static ssize_t +usdf_cq_read_context(struct fid_cq *fcq, void *buf, size_t count) +{ + return usdf_cq_read_common(fcq, buf, count, FI_CQ_FORMAT_CONTEXT); +} + +static ssize_t +usdf_cq_read_msg(struct fid_cq *fcq, void *buf, size_t count) +{ + return usdf_cq_read_common(fcq, buf, count, FI_CQ_FORMAT_MSG); +} + +static ssize_t +usdf_cq_read_data(struct fid_cq *fcq, void *buf, size_t count) +{ + return usdf_cq_read_common(fcq, buf, count, FI_CQ_FORMAT_DATA); +} + +static ssize_t +usdf_cq_readfrom_context(struct fid_cq *fcq, void *buf, size_t count, + fi_addr_t *src_addr) +{ + struct usdf_cq *cq; + struct usd_cq_impl *ucq; + struct fi_cq_entry *entry; + struct fi_cq_entry *last; + ssize_t ret; + struct cq_desc *cq_desc; + struct usdf_ep *ep; + struct sockaddr_in sin; + struct usd_udp_hdr *hdr; + uint16_t index; + + cq = cq_ftou(fcq); + if (cq->cq_comp.uc_status != 0) { + return -FI_EAVAIL; + } + ucq = to_cqi(cq->c.hard.cq_cq); + + ret = 0; + entry = buf; + last = entry + count; + while (entry < last) { + cq_desc = (struct cq_desc *)((uint8_t *)ucq->ucq_desc_ring + + (ucq->ucq_next_desc << 4)); + + ret = usd_poll_cq(cq->c.hard.cq_cq, &cq->cq_comp); + if (ret == -EAGAIN) { + ret = 0; + break; + } + if (cq->cq_comp.uc_status != 0) { + ret = -FI_EAVAIL; + break; + } + + if (cq->cq_comp.uc_type == USD_COMPTYPE_RECV) { + index = le16_to_cpu(cq_desc->completed_index) & + CQ_DESC_COMP_NDX_MASK; + ep = cq->cq_comp.uc_qp->uq_context; + hdr = ep->e.dg.ep_hdr_ptr[index]; + memset(&sin, 0, sizeof(sin)); + + sin.sin_addr.s_addr = hdr->uh_ip.saddr; + sin.sin_port = hdr->uh_udp.source; + + *src_addr = usdf_av_lookup_addr(ep->e.dg.ep_av, &sin); + ++src_addr; + } + + + entry->op_context = cq->cq_comp.uc_context; + + entry++; + } + + if (entry > (struct fi_cq_entry *)buf) { + return entry - (struct fi_cq_entry *)buf; + } else { + return ret; + } +} + +/***************************************************************** + * "soft" CQ support + *****************************************************************/ + +void +usdf_progress_hard_cq(struct usdf_cq_hard *hcq) +{ + int ret; + struct usd_completion comp; + struct usdf_cq_soft_entry *entry; + struct usdf_cq *cq; + + cq = hcq->cqh_cq; + + do { + ret = usd_poll_cq(hcq->cqh_ucq, &comp); + if (ret == 0) { + entry = cq->c.soft.cq_head; + + /* If the current entry is equal to the tail and the + * last operation was a write, then we have filled the + * queue and we just drop whatever there isn't space + * for. + */ + if ((entry == cq->c.soft.cq_tail) && + (cq->c.soft.cq_last_op == + USDF_SOFT_CQ_WRITE)) + return; + + entry->cse_context = cq->cq_comp.uc_context; + entry->cse_flags = 0; + entry->cse_len = cq->cq_comp.uc_bytes; + entry->cse_buf = 0; /* XXX TODO */ + entry->cse_data = 0; + + /* update with wrap */ + entry++; + if (entry != cq->c.soft.cq_end) { + cq->c.soft.cq_head = entry; + } else { + cq->c.soft.cq_head = cq->c.soft.cq_comps; + } + + cq->c.soft.cq_last_op = USDF_SOFT_CQ_WRITE; + } + } while (ret != -EAGAIN); +} + +void +usdf_cq_post_soft(struct usdf_cq_hard *hcq, void *context, size_t len, + int prov_errno, uint64_t flags) +{ + int ret; + struct usdf_cq_soft_entry *entry; + struct usdf_cq *cq; + uint64_t val = 1; + + cq = hcq->cqh_cq; + + entry = cq->c.soft.cq_head; + + /* If the current entry is equal to the tail and the + * last operation was a write, then we have filled the + * queue and we just drop whatever there isn't space + * for. 
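+	 *
+	 * head == tail alone is ambiguous for this ring (it holds both when
+	 * the queue is empty and when it is full), so cq_last_op is used to
+	 * disambiguate: a WRITE as the last operation means full (drop), a
+	 * READ means empty (see usdf_check_empty_soft_cq()).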
+ */ + if ((entry == cq->c.soft.cq_tail) && + (cq->c.soft.cq_last_op == USDF_SOFT_CQ_WRITE)) + return; + + entry->cse_context = context; + entry->cse_len = len; + entry->cse_prov_errno = prov_errno; + entry->cse_flags = flags; + + /* update with wrap */ + entry++; + if (entry != cq->c.soft.cq_end) { + cq->c.soft.cq_head = entry; + } else { + cq->c.soft.cq_head = cq->c.soft.cq_comps; + } + + cq->c.soft.cq_last_op = USDF_SOFT_CQ_WRITE; + + if (cq->cq_attr.wait_obj == FI_WAIT_SET || + cq->cq_attr.wait_obj == FI_WAIT_FD) + while (1) { + ret = write(cq->object.fd, &val, sizeof(val)); + assert(ret == sizeof(val) || + (ret == -1 && errno == EINTR)); + if (ret == sizeof(val)) + return; + else if (ret == -1 && errno == EINTR) + continue; + + /* If the write() fails, there will be no user + * notification. Best we can do is emit a + * debug notice... + */ + USDF_WARN_SYS(CQ, "error while writing to wake CQ\n"); + return; + } +} + +static inline ssize_t +usdf_cq_copy_soft_entry(void *dst, const struct usdf_cq_soft_entry *src, + enum fi_cq_format dst_format) +{ + struct fi_cq_entry *ctx_entry; + struct fi_cq_msg_entry *msg_entry; + struct fi_cq_data_entry *data_entry; + + switch (dst_format) { + case FI_CQ_FORMAT_CONTEXT: + ctx_entry = (struct fi_cq_entry *)dst; + ctx_entry->op_context = src->cse_context; + break; + case FI_CQ_FORMAT_MSG: + msg_entry = (struct fi_cq_msg_entry *)dst; + msg_entry->op_context = src->cse_context; + msg_entry->flags = src->cse_flags; + msg_entry->len = src->cse_len; + break; + case FI_CQ_FORMAT_DATA: + data_entry = (struct fi_cq_data_entry *)dst; + data_entry->op_context = src->cse_context; + data_entry->flags = src->cse_flags; + data_entry->len = src->cse_len; + data_entry->buf = src->cse_buf; + data_entry->data = src->cse_data; + break; + default: + USDF_WARN("unexpected CQ format, internal error\n"); + return -FI_EOPNOTSUPP; + } + + return FI_SUCCESS; +} + +static ssize_t usdf_cq_sread(struct fid_cq *fcq, void *buf, size_t count, + const void *cond, int timeout_ms) +{ + struct usdf_cq *cq; + size_t sleep_time_us; + size_t time_spent_us = 0; + ssize_t ret; + + cq = cq_ftou(fcq); + + if (cq->cq_attr.wait_obj == FI_WAIT_NONE) + return -FI_EOPNOTSUPP; + + sleep_time_us = SREAD_INIT_SLEEP_TIME_US; + + while (1) { + ret = fi_cq_read(fcq, buf, count); + if (ret != -FI_EAGAIN) + return ret; + + if (timeout_ms >= 0) { + if (time_spent_us >= (1000 * timeout_ms)) + break; + } + + usleep(sleep_time_us); + time_spent_us += sleep_time_us; + + /* exponentially back off up to a limit */ + if (sleep_time_us < SREAD_MAX_SLEEP_TIME_US) + sleep_time_us *= SREAD_EXP_BASE; + sleep_time_us = MIN(sleep_time_us, SREAD_MAX_SLEEP_TIME_US); + } + + return -FI_EAGAIN; +} + +static ssize_t usdf_cq_sread_fd(struct fid_cq *fcq, void *buf, size_t count, + const void *cond, int timeout_ms) +{ + struct usdf_cq *cq; + struct usdf_fabric *fabric; + int ret; + + cq = cq_ftou(fcq); + fabric = cq->cq_domain->dom_fabric; + + ret = usdf_cq_trywait(&fcq->fid); + if (ret == FI_SUCCESS) { + ofi_atomic_inc32(&fabric->num_blocked_waiting); + + ret = usdf_fabric_wake_thread(fabric); + if (ret) { + USDF_DBG_SYS(CQ, + "error while waking progress thread\n"); + goto err; + } + + ret = fi_poll_fd(cq->object.fd, timeout_ms); + if (ret == 0) { + ret = -FI_EAGAIN; + goto err; + } else if (ret < 0) { + USDF_DBG_SYS(CQ, "poll failed: %s\n", strerror(-ret)); + goto err; + } + + ofi_atomic_dec32(&fabric->num_blocked_waiting); + } else if ((ret < 0) && (ret != -FI_EAGAIN)) { + return ret; + } + + return fi_cq_read(fcq, buf, 
count); + +err: + ofi_atomic_dec32(&fabric->num_blocked_waiting); + return ret; +} + +/* + * poll a soft CQ + * This will loop over all the hard CQs within, collecting results. + * Since this routine is an inline and is always called with format as + * a constant, I am counting on the compiler optimizing away all the switches + * on format. + */ +static inline ssize_t +usdf_cq_read_common_soft(struct fid_cq *fcq, void *buf, size_t count, + enum fi_cq_format format) +{ + struct usdf_cq *cq; + uint8_t *dest; + struct usdf_cq_soft_entry *tail; + size_t copylen; + size_t copied; + ssize_t ret; + + cq = cq_ftou(fcq); + + if (cq->cq_comp.uc_status != USD_COMPSTAT_SUCCESS) + return -FI_EAVAIL; + + /* progress... */ + usdf_domain_progress(cq->cq_domain); + + switch (format) { + case FI_CQ_FORMAT_CONTEXT: + copylen = sizeof(struct fi_cq_entry); + break; + case FI_CQ_FORMAT_MSG: + copylen = sizeof(struct fi_cq_msg_entry); + break; + case FI_CQ_FORMAT_DATA: + copylen = sizeof(struct fi_cq_data_entry); + break; + default: + USDF_WARN_SYS(CQ, "unexpected CQ format, internal error\n"); + return -FI_EOPNOTSUPP; + } + + dest = buf; + tail = cq->c.soft.cq_tail; + + for (copied = 0; copied < count; copied++) { + if (tail == cq->c.soft.cq_head) { + /* If the tail and head match and the last operation was + * a read then we have an empty queue. + */ + if (cq->c.soft.cq_last_op == USDF_SOFT_CQ_READ) + break; + } + + if (tail->cse_prov_errno != FI_SUCCESS) { + /* If this is the first read, then just return EAVAIL. + * Although we already checked above, this last read may + * have contained an error. If this isn't the first read + * then break and return the count read. The next read + * will yield an error. + */ + if (copied == 0) + return -FI_EAVAIL; + + break; + } + + ret = usdf_cq_copy_soft_entry(dest, tail, format); + if (ret < 0) + return ret; + + dest += copylen; + + tail++; + if (tail == cq->c.soft.cq_end) + tail = cq->c.soft.cq_comps; + + cq->c.soft.cq_last_op = USDF_SOFT_CQ_READ; + } + + cq->c.soft.cq_tail = tail; + + return copied > 0 ? copied : -FI_EAGAIN; +} + +static ssize_t +usdf_cq_read_context_soft(struct fid_cq *fcq, void *buf, size_t count) +{ + return usdf_cq_read_common_soft(fcq, buf, count, FI_CQ_FORMAT_CONTEXT); +} + +static ssize_t +usdf_cq_read_msg_soft(struct fid_cq *fcq, void *buf, size_t count) +{ + return usdf_cq_read_common_soft(fcq, buf, count, FI_CQ_FORMAT_MSG); +} + +static ssize_t +usdf_cq_read_data_soft(struct fid_cq *fcq, void *buf, size_t count) +{ + return usdf_cq_read_common_soft(fcq, buf, count, FI_CQ_FORMAT_DATA); +} + +/***************************************************************** + * common CQ support + *****************************************************************/ + +static const char * +usdf_cq_strerror(struct fid_cq *eq, int prov_errno, const void *err_data, + char *buf, size_t len) +{ + if (buf && len) { + strncpy(buf, fi_strerror(prov_errno), len); + buf[len-1] = '\0'; + return buf; + } + return fi_strerror(prov_errno); +} + +/* Handle the associated wait object when closing a CQ. 
+ * - Remove the FD from the wait set epoll context + * - Decrement the ref count on the wait set + * - Remove the CQ from the CQ list attached to the wait set + */ +static int usdf_cq_unbind_wait(struct usdf_cq *cq) +{ + int ret; + struct usdf_wait *wait_priv; + + if (!cq->cq_attr.wait_set) { + USDF_DBG_SYS(CQ, "can't unbind from non-existent wait set\n"); + return -FI_EINVAL; + } + + wait_priv = wait_ftou(cq->cq_attr.wait_set); + + ret = ofi_epoll_del(wait_priv->object.epfd, cq->object.fd); + if (ret) { + USDF_WARN_SYS(CQ, "failed to remove FD from wait set\n"); + return ret; + } + + fid_list_remove(&wait_priv->list, &wait_priv->lock, &cq->cq_fid.fid); + + ofi_atomic_dec32(&wait_priv->wait_refcnt); + + USDF_DBG_SYS(CQ, + "dissasociated CQ FD %d from epoll FD %d using FID: %p\n", + cq->object.fd, wait_priv->object.epfd, &cq->cq_fid.fid); + + return FI_SUCCESS; +} + +static int +usdf_cq_close(fid_t fid) +{ + int ret; + struct usdf_cq *cq; + struct usdf_fabric *fab; + struct usdf_cq_hard *hcq; + + USDF_TRACE_SYS(CQ, "\n"); + + cq = container_of(fid, struct usdf_cq, cq_fid.fid); + fab = cq->cq_domain->dom_fabric; + + if (ofi_atomic_get32(&cq->cq_refcnt) > 0) { + return -FI_EBUSY; + } + + if (cq->cq_attr.wait_obj == FI_WAIT_SET) { + ret = usdf_cq_unbind_wait(cq); + if (ret) + return ret; + } + + if (cq->cq_is_soft) { + while (!TAILQ_EMPTY(&cq->c.soft.cq_list)) { + hcq = TAILQ_FIRST(&cq->c.soft.cq_list); + if (ofi_atomic_get32(&hcq->cqh_refcnt) > 0) { + return -FI_EBUSY; + } + TAILQ_REMOVE(&cq->c.soft.cq_list, hcq, cqh_link); + TAILQ_REMOVE(&cq->cq_domain->dom_hcq_list, hcq, + cqh_dom_link); + if (hcq->cqh_ucq != NULL) { + ret = usd_destroy_cq(hcq->cqh_ucq); + if (ret != 0) { + return ret; + } + } + free(hcq); + } + } else { + if (cq->c.hard.cq_cq) { + ret = usd_destroy_cq(cq->c.hard.cq_cq); + if (ret != 0) { + return ret; + } + } + } + + if (cq->cq_waiting) + ofi_atomic_dec32(&fab->num_blocked_waiting); + + free(cq); + return 0; +} + +static int usdf_cq_get_wait(struct usdf_cq *cq, void *arg) +{ + USDF_TRACE_SYS(CQ, "\n"); + + switch (cq->cq_attr.wait_obj) { + case FI_WAIT_FD: + if (cq->object.fd == -1) { + USDF_WARN_SYS(CQ, + "CQ must be bound before FD can be retrieved\n"); + return -FI_EOPBADSTATE; + } + + *(int *) arg = cq->object.fd; + break; + default: + USDF_WARN_SYS(CQ, "unsupported wait type\n"); + return -FI_EINVAL; + } + + return FI_SUCCESS; +} + +static int usdf_wait_control(struct fid *fcq, int command, void *arg) +{ + struct usdf_cq *cq; + + USDF_TRACE_SYS(CQ, "\n"); + + if (!fcq || !arg) { + USDF_WARN_SYS(CQ, "CQ fid and arg can't be NULL\n"); + return -FI_EINVAL; + } + + cq = cq_fidtou(fcq); + + switch (command) { + case FI_GETWAIT: + break; + default: + USDF_WARN_SYS(CQ, "unsupported control command\n"); + return -FI_EINVAL; + } + + return usdf_cq_get_wait(cq, arg); +} + +static struct fi_ops_cq usdf_cq_context_ops = { + .size = sizeof(struct fi_ops_cq), + .read = usdf_cq_read_context, + .readfrom = usdf_cq_readfrom_context, + .readerr = usdf_cq_readerr, + .sread = usdf_cq_sread, + .sreadfrom = fi_no_cq_sreadfrom, + .signal = fi_no_cq_signal, + .strerror = usdf_cq_strerror, +}; + +static struct fi_ops_cq usdf_cq_context_soft_ops = { + .size = sizeof(struct fi_ops_cq), + .read = usdf_cq_read_context_soft, + .readfrom = fi_no_cq_readfrom, + .readerr = usdf_cq_readerr_soft, + .sread = usdf_cq_sread, + .sreadfrom = fi_no_cq_sreadfrom, + .signal = fi_no_cq_signal, + .strerror = usdf_cq_strerror, +}; + +static struct fi_ops_cq usdf_cq_msg_ops = { + .size = sizeof(struct fi_ops_cq), 
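+	/* hard-CQ ops for FI_CQ_FORMAT_MSG; usdf_cq_make_soft() swaps in the
+	 * matching *_soft_ops table when the CQ is converted to a soft CQ */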
+ .read = usdf_cq_read_msg, + .readfrom = fi_no_cq_readfrom, /* XXX */ + .readerr = usdf_cq_readerr, + .sread = usdf_cq_sread, + .sreadfrom = fi_no_cq_sreadfrom, + .signal = fi_no_cq_signal, + .strerror = usdf_cq_strerror, +}; + +static struct fi_ops_cq usdf_cq_msg_soft_ops = { + .size = sizeof(struct fi_ops_cq), + .read = usdf_cq_read_msg_soft, + .readfrom = fi_no_cq_readfrom, /* XXX */ + .readerr = usdf_cq_readerr_soft, + .sread = usdf_cq_sread, + .sreadfrom = fi_no_cq_sreadfrom, + .signal = fi_no_cq_signal, + .strerror = usdf_cq_strerror, +}; + +static struct fi_ops_cq usdf_cq_data_ops = { + .size = sizeof(struct fi_ops_cq), + .read = usdf_cq_read_data, + .readfrom = fi_no_cq_readfrom, /* XXX */ + .readerr = usdf_cq_readerr, + .sread = usdf_cq_sread, + .sreadfrom = fi_no_cq_sreadfrom, + .signal = fi_no_cq_signal, + .strerror = usdf_cq_strerror, +}; + +static struct fi_ops_cq usdf_cq_data_soft_ops = { + .size = sizeof(struct fi_ops_cq), + .read = usdf_cq_read_data_soft, + .readfrom = fi_no_cq_readfrom, /* XXX */ + .readerr = usdf_cq_readerr_soft, + .sread = usdf_cq_sread, + .sreadfrom = fi_no_cq_sreadfrom, + .signal = fi_no_cq_signal, + .strerror = usdf_cq_strerror, +}; + +static struct fi_ops usdf_cq_fi_ops = { + .size = sizeof(struct fi_ops), + .close = usdf_cq_close, + .bind = fi_no_bind, + .control = usdf_wait_control, + .ops_open = fi_no_ops_open, +}; + +int +usdf_cq_make_soft(struct usdf_cq *cq) +{ + struct fi_ops_cq *soft_ops; + struct usdf_cq_hard *hcq; + struct usd_cq *ucq; + + switch (cq->cq_attr.format) { + case FI_CQ_FORMAT_CONTEXT: + soft_ops = &usdf_cq_context_soft_ops; + break; + case FI_CQ_FORMAT_MSG: + soft_ops = &usdf_cq_msg_soft_ops; + break; + case FI_CQ_FORMAT_DATA: + soft_ops = &usdf_cq_data_soft_ops; + break; + default: + return 0; + } + + if (!cq->cq_is_soft) { + + /* save the CQ before we trash the union */ + ucq = cq->c.hard.cq_cq; + + /* fill in the soft part of union */ + TAILQ_INIT(&cq->c.soft.cq_list); + cq->c.soft.cq_comps = calloc(cq->cq_attr.size, + sizeof(struct usdf_cq_soft_entry)); + if (cq->c.soft.cq_comps == NULL) { + return -FI_ENOMEM; + } + cq->c.soft.cq_end = cq->c.soft.cq_comps + cq->cq_attr.size; + cq->c.soft.cq_head = cq->c.soft.cq_comps; + cq->c.soft.cq_tail = cq->c.soft.cq_comps; + + /* need to add hard queue to list? 
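+		 * If a usd CQ was already created for this CQ, wrap it in a
+		 * usdf_cq_hard entry so usdf_progress_hard_cq() keeps draining
+		 * it into the soft completion ring initialized above.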
*/ + if (ucq != NULL) { + hcq = malloc(sizeof(*hcq)); + if (hcq == NULL) { + free(cq->c.soft.cq_comps); + cq->c.hard.cq_cq = ucq; /* restore */ + return -FI_ENOMEM; + } + + hcq->cqh_cq = cq; + hcq->cqh_ucq = ucq; + hcq->cqh_progress = usdf_progress_hard_cq; + + ofi_atomic_initialize32(&hcq->cqh_refcnt, + ofi_atomic_get32(&cq->cq_refcnt)); + TAILQ_INSERT_HEAD(&cq->c.soft.cq_list, hcq, cqh_link); + } + + cq->cq_is_soft = 1; + cq->cq_ops = *soft_ops; + } + return 0; +} + +int usdf_check_empty_soft_cq(struct usdf_cq *cq) +{ + if (cq->c.soft.cq_tail == cq->c.soft.cq_head) + return cq->c.soft.cq_last_op == USDF_SOFT_CQ_READ; + + return 0; +} + +int usdf_check_empty_hard_cq(struct usdf_cq *cq) +{ + struct usd_cq_impl *cqi; + struct cq_desc *cq_desc; + struct cq_desc *base; + uint8_t last_color; + uint8_t current_color; + + cqi = to_cqi(cq->c.hard.cq_cq); + + base = cqi->ucq_desc_ring; + cq_desc = &base[cqi->ucq_next_desc]; + + last_color = cqi->ucq_last_color; + current_color = cq_desc->type_color >> CQ_DESC_COLOR_SHIFT; + + return current_color == last_color; +} + +static int +usdf_cq_process_attr(struct fi_cq_attr *attr, struct usdf_domain *udp) +{ + if (!attr || !udp) + return -FI_EINVAL; + + switch (attr->wait_obj) { + case FI_WAIT_NONE: + case FI_WAIT_UNSPEC: + break; + case FI_WAIT_FD: + case FI_WAIT_SET: + if (!usd_get_cap(udp->dom_dev, USD_CAP_GRP_INTR)) { + USDF_WARN_SYS(CQ, "FD request invalid.\n"); + USDF_WARN_SYS(CQ, "group interrupts not supported.\n"); + return -FI_EINVAL; + } + break; + default: + return -FI_ENOSYS; + } + + /* bound and default size */ + if (attr->size > udp->dom_fabric->fab_dev_attrs->uda_max_cqe) { + return -FI_EINVAL; + } + if (attr->size == 0) { + attr->size = udp->dom_fabric->fab_dev_attrs->uda_max_cqe; + } + + /* default format is FI_CQ_FORMAT_CONTEXT */ + if (attr->format == FI_CQ_FORMAT_UNSPEC) { + + attr->format = FI_CQ_FORMAT_CONTEXT; + } + return 0; +} + +static int usdf_cq_fd_set_nonblock(int fd) +{ + int flags; + + flags = fcntl(fd, F_GETFL, 0); + if (flags == -1) { + USDF_WARN_SYS(CQ, "fcntl getfl failed[%d]\n", errno); + return -errno; + } + + if (fcntl(fd, F_SETFL, flags | O_NONBLOCK) < 0) { + USDF_WARN_SYS(CQ, "fcntl setfl failed[%d]\n", errno); + return -errno; + } + + return FI_SUCCESS; +} + +static int usdf_cq_create_fd(struct usdf_cq *cq) +{ + int ret; + + ret = usd_ib_cmd_create_comp_channel(cq->cq_domain->dom_dev, + &cq->object.fd); + if (ret) { + USDF_WARN_SYS(CQ, "failed to create comp channel\n"); + return -FI_EINVAL; + } + + USDF_DBG_SYS(CQ, "successfully created comp channel with fd %d\n", + cq->object.fd); + + /* Going to need this assuming edge-triggered semantics. 
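+	 * usdf_cq_trywait() drains this FD by read()ing until EAGAIN before
+	 * re-arming, so the descriptor must not block.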
+ */ + return usdf_cq_fd_set_nonblock(cq->object.fd); +} + +int usdf_cq_trywait(struct fid *fcq) +{ + struct usdf_cq *cq; + struct usdf_fabric *fab; + uint64_t ev; + int empty; + int ret; + + cq = cq_fidtou(fcq); + fab = cq->cq_domain->dom_fabric; + + switch (cq->cq_attr.wait_obj) { + case FI_WAIT_UNSPEC: + return FI_SUCCESS; + case FI_WAIT_FD: + case FI_WAIT_SET: + break; + default: + USDF_WARN_SYS(CQ, "unsupported wait object type\n"); + return -FI_EINVAL; + } + + while (1) { + ret = read(cq->object.fd, &ev, sizeof(ev)); + if (ret == 0) { + USDF_WARN_SYS(CQ, + "FD read returned 0, is it closed?\n"); + return -FI_EINVAL; + } + + if (ret < 0) { + if (errno == EAGAIN) + break; + else + return -errno; + } + } + + cq->cq_waiting = true; + ofi_atomic_inc32(&fab->num_blocked_waiting); + ret = usdf_fabric_wake_thread(fab); + if (ret) { + USDF_DBG_SYS(FABRIC, "error while waking progress thread\n"); + ofi_atomic_dec32(&fab->num_blocked_waiting); + } + + if (cq->cq_is_soft) { + empty = usdf_check_empty_soft_cq(cq); + } else { + usd_poll_req_notify(cq->c.hard.cq_cq); + empty = usdf_check_empty_hard_cq(cq); + } + + if (empty) + return FI_SUCCESS; + + return -FI_EAGAIN; +} + + +static int usdf_cq_bind_wait(struct usdf_cq *cq) +{ + int ret; + struct usdf_wait *wait_priv; + + if (!cq->cq_attr.wait_set) { + USDF_DBG_SYS(CQ, "can't bind to non-existent wait set\n"); + return -FI_EINVAL; + } + + /* Wait set ref count doesn't need to be incremented here since it was + * already incremented during CQ open. It's incremented in CQ open + * because the CQ isn't actually created until bind time, and we want + * to make sure that the wait object is not closed in between open and + * bind. + */ + wait_priv = wait_ftou(cq->cq_attr.wait_set); + + ret = fid_list_insert(&wait_priv->list, &wait_priv->lock, + &cq->cq_fid.fid); + if (ret) { + USDF_WARN_SYS(CQ, + "failed to associate cq with wait fid list\n"); + return ret; + } + + ret = ofi_epoll_add(wait_priv->object.epfd, cq->object.fd, + OFI_EPOLL_IN, cq); + if (ret) { + USDF_WARN_SYS(CQ, "failed to associate FD with wait set\n"); + goto err; + } + + USDF_DBG_SYS(CQ, "associated CQ FD %d with epoll FD %d using fid %p\n", + cq->object.fd, wait_priv->object.epfd, &cq->cq_fid.fid); + + return ret; + +err: + fid_list_remove(&wait_priv->list, &wait_priv->lock, &cq->cq_fid.fid); + return ret; +} + +/* If cq->cq_attr.wait_obj == (FI_WAIT_FD | FI_WAIT_SET), then use an FD with + * the CQ. If create_fd evaluates to true, then it will create a hardware + * completion channel. + * + * If create_fd does not evaluate to true, then it is assumed that a valid file + * descriptor is available in cq->object.fd. + */ +int usdf_cq_create_cq(struct usdf_cq *cq, struct usd_cq **ucq, int create_fd) +{ + int ret; + struct usd_cq_init_attr attr = {0}; + + if (!cq || !cq->cq_domain || !cq->cq_domain->dom_dev) { + USDF_DBG_SYS(CQ, "Invalid input.\n"); + return -FI_EINVAL; + } + + attr.num_entries = cq->cq_attr.size; + attr.comp_fd = -1; + + /* For hard queues we will need to create an FD for CQs configured to + * use both wait sets and FDs. For a wait set this FD will get added to + * the epoll structure used by the waitset. + * + * For soft queues (emulated endpoints) we will not be creating an FD, + * but will need to set the appropriate functions and bind to the wait + * object, if any. 
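+	 *
+	 * The create_fd argument selects between these cases: when it is
+	 * nonzero a completion channel FD is created here, otherwise a
+	 * usable FD is expected to already be in cq->object.fd (see the
+	 * function comment above).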
+ */ + if ((cq->cq_attr.wait_obj == FI_WAIT_FD) || + (cq->cq_attr.wait_obj == FI_WAIT_SET)) { + cq->cq_ops.sread = usdf_cq_sread_fd; + + if (create_fd) { + ret = usdf_cq_create_fd(cq); + if (ret) + return ret; + + attr.comp_fd = cq->object.fd; + + /* usd_create_cq will only set + * USNIC_CQ_COMP_SIGNAL_VERBS if an ibv_cq is present, + * but we don't have one. Just shove the cq in. + */ + attr.ibv_cq = &ucq; + } + + if (cq->cq_attr.wait_obj == FI_WAIT_SET) { + cq->cq_ops.sread = fi_no_cq_sread; + ret = usdf_cq_bind_wait(cq); + if (ret) + return ret; + } + } + + ret = usd_create_cq(cq->cq_domain->dom_dev, &attr, ucq); + if (ret && cq->cq_attr.wait_obj == FI_WAIT_SET) + usdf_cq_unbind_wait(cq); + return ret; +} + +int +usdf_cq_open(struct fid_domain *domain, struct fi_cq_attr *attr, + struct fid_cq **cq_o, void *context) +{ + struct usdf_cq *cq; + struct usdf_domain *udp; + struct usdf_wait *wait_priv; + int ret; + + USDF_TRACE_SYS(CQ, "\n"); + + udp = dom_ftou(domain); + ret = usdf_cq_process_attr(attr, udp); + if (ret != 0) { + return ret; + } + + cq = calloc(1, sizeof(*cq)); + if (cq == NULL) { + return -FI_ENOMEM; + } + + /* Do this here because we don't actually create the CQ until bind + * time. At open time the CQ should be associated with the wait set + * using the ref count so the app can't delete the wait set out from + * under the CQ. + */ + if (attr->wait_obj == FI_WAIT_SET) { + wait_priv = wait_ftou(attr->wait_set); + ofi_atomic_inc32(&wait_priv->wait_refcnt); + } + + cq->object.fd = -1; + cq->cq_domain = udp; + cq->cq_fid.fid.fclass = FI_CLASS_CQ; + cq->cq_fid.fid.context = context; + cq->cq_fid.fid.ops = &usdf_cq_fi_ops; + ofi_atomic_initialize32(&cq->cq_refcnt, 0); + + switch (attr->format) { + case FI_CQ_FORMAT_CONTEXT: + cq->cq_ops = usdf_cq_context_ops; + break; + case FI_CQ_FORMAT_MSG: + cq->cq_ops = usdf_cq_msg_ops; + break; + case FI_CQ_FORMAT_DATA: + cq->cq_ops = usdf_cq_data_ops; + break; + default: + ret = -FI_ENOSYS; + goto fail; + } + + cq->cq_fid.ops = &cq->cq_ops; + + cq->cq_attr = *attr; + *cq_o = &cq->cq_fid; + return 0; + +fail: + if (cq != NULL) { + if (cq->c.hard.cq_cq != NULL) { + usd_destroy_cq(cq->c.hard.cq_cq); + } + free(cq); + } + return ret; +} diff --git a/prov/usnic/src/usdf_cq.h b/prov/usnic/src/usdf_cq.h new file mode 100644 index 00000000000..bad6d742668 --- /dev/null +++ b/prov/usnic/src/usdf_cq.h @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2014-2016, Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef _USDF_CQ_H_ +#define _USDF_CQ_H_ + +/* exponential backoff settings for fi_cq_sread */ +#define SREAD_EXP_BASE 2 +#define SREAD_INIT_SLEEP_TIME_US 1 +#define SREAD_MAX_SLEEP_TIME_US 5000 + +int usdf_cq_make_soft(struct usdf_cq *cq); +int usdf_cq_create_cq(struct usdf_cq *cq, struct usd_cq **ucq, int create_fd); +int usdf_check_empty_hard_cq(struct usdf_cq *cq); +int usdf_check_empty_soft_cq(struct usdf_cq *cq); +int usdf_cq_trywait(struct fid *fcq); + +void usdf_progress_hard_cq(struct usdf_cq_hard *hcq); + +void usdf_cq_post_soft(struct usdf_cq_hard *hcq, void *context, + size_t len, int prov_errno, uint64_t flags); + +#endif /* _USDF_CQ_H_ */ diff --git a/prov/usnic/src/usdf_dgram.c b/prov/usnic/src/usdf_dgram.c new file mode 100644 index 00000000000..905d94be6af --- /dev/null +++ b/prov/usnic/src/usdf_dgram.c @@ -0,0 +1,805 @@ +/* + * Copyright (c) 2014-2016, Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "config.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include "ofi.h" + +#include "usd.h" +#include "usd_post.h" + +#include "usdf.h" +#include "usdf_dgram.h" +#include "usdf_av.h" + +static inline size_t _usdf_iov_len(const struct iovec *iov, size_t count) +{ + size_t len; + size_t i; + + for (i = 0, len = 0; i < count; i++) + len += iov[i].iov_len; + + return len; +} + +static inline struct usd_udp_hdr *_usdf_find_hdr(struct usd_wq *wq) +{ + uint8_t *copybuf; + + copybuf = wq->uwq_copybuf + (wq->uwq_post_index * USD_SEND_MAX_COPY); + + return (struct usd_udp_hdr *) copybuf; +} + +static inline void _usdf_adjust_hdr(struct usd_udp_hdr *hdr, + struct usd_qp_impl *qp, size_t len) +{ + hdr->uh_ip.tot_len = htons(len + sizeof(struct usd_udp_hdr) - + sizeof(struct ether_header)); + hdr->uh_udp.len = htons(len + sizeof(struct usd_udp_hdr) - + sizeof(struct ether_header) - + sizeof(struct iphdr)); + hdr->uh_udp.source = + qp->uq_attrs.uqa_local_addr.ul_addr.ul_udp.u_addr.sin_port; +} + +static inline void _usdf_adjust_prefix_hdr(struct usd_udp_hdr *hdr, + struct usd_qp_impl *qp, size_t len, size_t padding) +{ + + hdr->uh_ip.tot_len = htons(len - padding - sizeof(struct ether_header)); + hdr->uh_udp.len = htons(len - padding - sizeof(struct ether_header) - + sizeof(struct iphdr)); + hdr->uh_udp.source = + qp->uq_attrs.uqa_local_addr.ul_addr.ul_udp.u_addr.sin_port; +} + +static inline void _usdf_adjust_post_info(struct usd_wq *wq, uint32_t last_post, + void *context, size_t len) +{ + struct usd_wq_post_info *info; + + info = &wq->uwq_post_info[last_post]; + info->wp_context = context; + info->wp_len = len; +} + +ssize_t +usdf_dgram_recv(struct fid_ep *fep, void *buf, size_t len, + void *desc, fi_addr_t src_addr, void *context) +{ + struct usdf_ep *ep; + struct usd_qp_impl *qp; + struct usd_recv_desc rxd; + uint32_t index; + + ep = ep_ftou(fep); + qp = to_qpi(ep->e.dg.ep_qp); + + index = qp->uq_rq.urq_post_index; + rxd.urd_context = context; + rxd.urd_iov[0].iov_base = (uint8_t *)ep->e.dg.ep_hdr_buf + + (index * USDF_HDR_BUF_ENTRY) + + (USDF_HDR_BUF_ENTRY - sizeof(struct usd_udp_hdr)); + rxd.urd_iov[0].iov_len = sizeof(struct usd_udp_hdr); + rxd.urd_iov[1].iov_base = buf; + rxd.urd_iov[1].iov_len = len; + rxd.urd_iov_cnt = 2; + rxd.urd_next = NULL; + + ep->e.dg.ep_hdr_ptr[index] = rxd.urd_iov[0].iov_base; + index = (index + 1) & qp->uq_rq.urq_post_index_mask; + ep->e.dg.ep_hdr_ptr[index] = rxd.urd_iov[0].iov_base; + + return usd_post_recv(ep->e.dg.ep_qp, &rxd); +} + +ssize_t +usdf_dgram_recvv(struct fid_ep *fep, const struct iovec *iov, void **desc, + size_t count, fi_addr_t src_addr, void *context) +{ + struct usdf_ep *ep; + struct usd_recv_desc rxd; + struct usd_qp_impl *qp; + uint32_t index; + size_t i; + + ep = ep_ftou(fep); + qp = to_qpi(ep->e.dg.ep_qp); + + rxd.urd_context = context; + rxd.urd_iov[0].iov_base = ((uint8_t *)ep->e.dg.ep_hdr_buf) + + qp->uq_rq.urq_post_index * USDF_HDR_BUF_ENTRY; + rxd.urd_iov[0].iov_len = sizeof(struct usd_udp_hdr); + memcpy(&rxd.urd_iov[1], iov, sizeof(*iov) * count); + rxd.urd_iov_cnt = count + 1; + rxd.urd_next = NULL; + + index = qp->uq_rq.urq_post_index; + for (i = 0; i < count; ++i) { + ep->e.dg.ep_hdr_ptr[index] = rxd.urd_iov[0].iov_base; + index = (index + 1) & qp->uq_rq.urq_post_index_mask; + } + + return usd_post_recv(ep->e.dg.ep_qp, &rxd); +} + +ssize_t +usdf_dgram_recvmsg(struct fid_ep *fep, const struct fi_msg *msg, 
uint64_t flags) +{ + struct usdf_ep *ep; + struct usd_qp_impl *qp; + struct usd_rq *rq; + struct vnic_rq *vrq; + struct rq_enet_desc *desc; + const struct iovec *iovp; + uint8_t *hdr_ptr; + uint32_t index; + unsigned i; + + ep = ep_ftou(fep); + qp = to_qpi(ep->e.dg.ep_qp); + rq = &qp->uq_rq; + vrq = &rq->urq_vnic_rq; + desc = rq->urq_next_desc; + index = rq->urq_post_index; + + iovp = msg->msg_iov; + rq->urq_context[index] = msg->context; + hdr_ptr = ((uint8_t *)ep->e.dg.ep_hdr_buf) + + (index * USDF_HDR_BUF_ENTRY); + rq_enet_desc_enc(desc, (dma_addr_t) hdr_ptr, + RQ_ENET_TYPE_ONLY_SOP, sizeof(struct usd_udp_hdr)); + ep->e.dg.ep_hdr_ptr[index] = (struct usd_udp_hdr *) hdr_ptr; + + index = (index + 1) & rq->urq_post_index_mask; + desc = (struct rq_enet_desc *) + ((uintptr_t)rq->urq_desc_ring + (index << 4)); + + for (i = 0; i < msg->iov_count; ++i) { + rq->urq_context[index] = msg->context; + rq_enet_desc_enc(desc, (dma_addr_t) iovp[i].iov_base, + RQ_ENET_TYPE_NOT_SOP, iovp[i].iov_len); + ep->e.dg.ep_hdr_ptr[index] = (struct usd_udp_hdr *) hdr_ptr; + + index = (index + 1) & rq->urq_post_index_mask; + desc = (struct rq_enet_desc *) + ((uintptr_t)rq->urq_desc_ring + (index << 4)); + } + + if ((flags & FI_MORE) == 0) { + wmb(); + iowrite32(index, &vrq->ctrl->posted_index); + } + + rq->urq_next_desc = desc; + rq->urq_post_index = index; + rq->urq_recv_credits -= msg->iov_count + 1; + + return 0; +} + +ssize_t +usdf_dgram_send(struct fid_ep *fep, const void *buf, size_t len, void *desc, + fi_addr_t dest_addr, void *context) +{ + struct usdf_dest *dest; + struct usdf_ep *ep; + uint32_t flags; + + ep = ep_ftou(fep); + dest = (struct usdf_dest *)(uintptr_t) dest_addr; + flags = (ep->ep_tx_completion) ? USD_SF_SIGNAL : 0; + + assert(len <= ep->max_msg_size); + + if (len + sizeof(struct usd_udp_hdr) <= USD_SEND_MAX_COPY) { + return usd_post_send_one_copy(ep->e.dg.ep_qp, &dest->ds_dest, + buf, len, flags, + context); + } else if (ep->e.dg.tx_op_flags & FI_INJECT) { + USDF_DBG_SYS(EP_DATA, + "given inject length (%zu) exceeds max inject length (%d)\n", + len + sizeof(struct usd_udp_hdr), + USD_SEND_MAX_COPY); + return -FI_ENOSPC; + } + + return usd_post_send_one(ep->e.dg.ep_qp, &dest->ds_dest, buf, len, + flags, context); +} + +static ssize_t +_usdf_dgram_send_iov_copy(struct usdf_ep *ep, struct usd_dest *dest, + const struct iovec *iov, size_t count, void *context, + uint8_t cq_entry) +{ + struct usd_wq *wq; + struct usd_qp_impl *qp; + struct usd_udp_hdr *hdr; + uint32_t last_post; + size_t len; + unsigned i; + + qp = to_qpi(ep->e.dg.ep_qp); + wq = &qp->uq_wq; + + hdr = _usdf_find_hdr(wq); + memcpy(hdr, &dest->ds_dest.ds_udp.u_hdr, sizeof(*hdr)); + + len = 0; + for (i = 0; i < count; i++) { + memcpy((char *) hdr + sizeof(*hdr) + len, iov[i].iov_base, + iov[i].iov_len); + len += iov[i].iov_len; + } + + assert(len <= ep->max_msg_size); + + _usdf_adjust_hdr(hdr, qp, len); + + last_post = _usd_post_send_one(wq, hdr, len + sizeof(*hdr), cq_entry); + + _usdf_adjust_post_info(wq, last_post, context, len); + + return 0; +} + +static ssize_t _usdf_dgram_send_iov(struct usdf_ep *ep, struct usd_dest *dest, + const struct iovec *iov, size_t count, void *context, uint8_t + cq_entry) +{ + struct iovec send_iov[USDF_DGRAM_MAX_SGE]; + struct usd_udp_hdr *hdr; + struct usd_qp_impl *qp; + struct usd_wq *wq; + uint32_t last_post; + size_t len; + + qp = to_qpi(ep->e.dg.ep_qp); + wq = &qp->uq_wq; + + len = _usdf_iov_len(iov, count); + hdr = _usdf_find_hdr(wq); + memcpy(hdr, &dest->ds_dest.ds_udp.u_hdr, sizeof(*hdr)); + 
_usdf_adjust_hdr(hdr, qp, len); + + assert(len <= ep->max_msg_size); + + send_iov[0].iov_base = hdr; + send_iov[0].iov_len = sizeof(*hdr); + memcpy(&send_iov[1], iov, sizeof(struct iovec) * count); + + last_post = _usd_post_send_iov(wq, send_iov, count + 1, + cq_entry); + _usdf_adjust_post_info(wq, last_post, context, len); + + return FI_SUCCESS; +} + +ssize_t +usdf_dgram_sendv(struct fid_ep *fep, const struct iovec *iov, void **desc, + size_t count, fi_addr_t dest_addr, void *context) +{ + struct usd_dest *dest; + struct usdf_ep *ep; + size_t len; + + ep = ep_ftou(fep); + len = sizeof(struct usd_udp_hdr); + dest = (struct usd_dest *)(uintptr_t) dest_addr; + + len += _usdf_iov_len(iov, count); + assert(len <= ep->max_msg_size); + + if (len <= USD_SEND_MAX_COPY) { + return _usdf_dgram_send_iov_copy(ep, dest, iov, count, context, + ep->ep_tx_completion); + } else if (ep->e.dg.tx_op_flags & FI_INJECT) { + USDF_DBG_SYS(EP_DATA, + "given inject length (%zu) exceeds max inject length (%d)\n", + len, USD_SEND_MAX_COPY); + return -FI_ENOSPC; + } + + if (count > ep->e.dg.tx_iov_limit) { + USDF_DBG_SYS(EP_DATA, "max iov count exceeded: %zu\n", count); + return -FI_ENOSPC; + } + + return _usdf_dgram_send_iov(ep, dest, iov, count, context, + ep->ep_tx_completion); +} + +ssize_t +usdf_dgram_sendmsg(struct fid_ep *fep, const struct fi_msg *msg, uint64_t flags) +{ + struct usd_dest *dest; + struct usdf_ep *ep; + uint8_t completion; + size_t len; + + ep = ep_ftou(fep); + len = sizeof(struct usd_udp_hdr); + dest = (struct usd_dest *)(uintptr_t) msg->addr; + completion = ep->ep_tx_dflt_signal_comp || (flags & FI_COMPLETION); + + len += _usdf_iov_len(msg->msg_iov, msg->iov_count); + assert(len <= ep->max_msg_size); + + if (len <= USD_SEND_MAX_COPY) { + return _usdf_dgram_send_iov_copy(ep, dest, msg->msg_iov, + msg->iov_count, + msg->context, + completion); + } else if (flags & FI_INJECT) { + USDF_DBG_SYS(EP_DATA, + "given inject length (%zu) exceeds max inject length (%d)\n", + len, USD_SEND_MAX_COPY); + return -FI_ENOSPC; + } + + if (msg->iov_count > ep->e.dg.tx_iov_limit) { + USDF_DBG_SYS(EP_DATA, "max iov count exceeded: %zu\n", + msg->iov_count); + return -FI_ENOSPC; + } + + return _usdf_dgram_send_iov(ep, dest, msg->msg_iov, msg->iov_count, + msg->context, completion); +} + +ssize_t +usdf_dgram_inject(struct fid_ep *fep, const void *buf, size_t len, + fi_addr_t dest_addr) +{ + struct usdf_dest *dest; + struct usdf_ep *ep; + + ep = ep_ftou(fep); + dest = (struct usdf_dest *)(uintptr_t) dest_addr; + + if (len + sizeof(struct usd_udp_hdr) > USD_SEND_MAX_COPY) { + USDF_DBG_SYS(EP_DATA, + "given inject length (%zu) exceeds max inject length (%d)\n", + len + sizeof(struct usd_udp_hdr), + USD_SEND_MAX_COPY); + return -FI_ENOSPC; + } + + /* + * fi_inject never generates a completion + */ + return usd_post_send_one_copy(ep->e.dg.ep_qp, &dest->ds_dest, buf, len, + 0, NULL); +} + +ssize_t usdf_dgram_prefix_inject(struct fid_ep *fep, const void *buf, + size_t len, fi_addr_t dest_addr) +{ + return usdf_dgram_inject(fep, ((uint8_t *)buf) + USDF_HDR_BUF_ENTRY, + len - USDF_HDR_BUF_ENTRY, dest_addr); +} + +ssize_t usdf_dgram_rx_size_left(struct fid_ep *fep) +{ + struct usdf_ep *ep; + + USDF_DBG_SYS(EP_DATA, "\n"); + + if (fep == NULL) + return -FI_EINVAL; + + ep = ep_ftou(fep); + + if (!(ep->flags & USDF_EP_ENABLED)) + return -FI_EOPBADSTATE; + + return usd_get_recv_credits(ep->e.dg.ep_qp) / + (ep->e.dg.rx_iov_limit + 1); +} + +ssize_t usdf_dgram_tx_size_left(struct fid_ep *fep) +{ + struct usdf_ep *ep; + + 
USDF_DBG_SYS(EP_DATA, "\n"); + + if (fep == NULL) + return -FI_EINVAL; + + ep = ep_ftou(fep); + + if (!(ep->flags & USDF_EP_ENABLED)) + return -FI_EOPBADSTATE; + + return usd_get_send_credits(ep->e.dg.ep_qp) / + (ep->e.dg.tx_iov_limit + 1); +} + +/* + * Versions that rely on user to reserve space for header at start of buffer + */ +ssize_t +usdf_dgram_prefix_recv(struct fid_ep *fep, void *buf, size_t len, + void *desc, fi_addr_t src_addr, void *context) +{ + struct usdf_ep *ep; + struct usd_qp_impl *qp; + struct usd_recv_desc rxd; + uint32_t index; + + ep = ep_ftou(fep); + qp = to_qpi(ep->e.dg.ep_qp); + + index = qp->uq_rq.urq_post_index; + rxd.urd_context = context; + rxd.urd_iov[0].iov_base = (uint8_t *)buf + + USDF_HDR_BUF_ENTRY - sizeof(struct usd_udp_hdr); + rxd.urd_iov[0].iov_len = len - + (USDF_HDR_BUF_ENTRY - sizeof(struct usd_udp_hdr)); + rxd.urd_iov_cnt = 1; + rxd.urd_next = NULL; + + ep->e.dg.ep_hdr_ptr[index] = rxd.urd_iov[0].iov_base; + + return usd_post_recv(ep->e.dg.ep_qp, &rxd); +} + +ssize_t +usdf_dgram_prefix_recvv(struct fid_ep *fep, const struct iovec *iov, + void **desc, size_t count, fi_addr_t src_addr, void *context) +{ + struct usdf_ep *ep; + struct usd_recv_desc rxd; + struct usd_qp_impl *qp; + uint32_t index; + size_t i; + + ep = ep_ftou(fep); + qp = to_qpi(ep->e.dg.ep_qp); + + rxd.urd_context = context; + memcpy(&rxd.urd_iov[0], iov, sizeof(*iov) * count); + rxd.urd_iov[0].iov_base = (uint8_t *)rxd.urd_iov[0].iov_base + + USDF_HDR_BUF_ENTRY - sizeof(struct usd_udp_hdr); + rxd.urd_iov[0].iov_len -= (USDF_HDR_BUF_ENTRY - + sizeof(struct usd_udp_hdr)); + + rxd.urd_iov_cnt = count; + rxd.urd_next = NULL; + + index = qp->uq_rq.urq_post_index; + for (i = 0; i < count; ++i) { + ep->e.dg.ep_hdr_ptr[index] = rxd.urd_iov[0].iov_base; + index = (index + 1) & qp->uq_rq.urq_post_index_mask; + } + + return usd_post_recv(ep->e.dg.ep_qp, &rxd); +} + +ssize_t +usdf_dgram_prefix_recvmsg(struct fid_ep *fep, const struct fi_msg *msg, uint64_t flags) +{ + struct usdf_ep *ep; + struct usd_qp_impl *qp; + struct usd_rq *rq; + struct vnic_rq *vrq; + struct rq_enet_desc *desc; + uint8_t *hdr_ptr; + const struct iovec *iovp; + uint32_t index; + unsigned i; + + ep = ep_ftou(fep); + qp = to_qpi(ep->e.dg.ep_qp); + rq = &qp->uq_rq; + vrq = &rq->urq_vnic_rq; + desc = rq->urq_next_desc; + index = rq->urq_post_index; + + iovp = msg->msg_iov; + rq->urq_context[index] = msg->context; + hdr_ptr = ((uint8_t *)iovp[0].iov_base) + + (USDF_HDR_BUF_ENTRY - sizeof(struct usd_udp_hdr)); + rq_enet_desc_enc(desc, (dma_addr_t) hdr_ptr, + RQ_ENET_TYPE_ONLY_SOP, + iovp[0].iov_len - + (USDF_HDR_BUF_ENTRY - sizeof(struct usd_udp_hdr))); + ep->e.dg.ep_hdr_ptr[index] = (struct usd_udp_hdr *) hdr_ptr; + + index = (index+1) & rq->urq_post_index_mask; + desc = (struct rq_enet_desc *) ((uintptr_t)rq->urq_desc_ring + + (index<<4)); + + for (i = 1; i < msg->iov_count; ++i) { + rq->urq_context[index] = msg->context; + rq_enet_desc_enc(desc, (dma_addr_t) iovp[i].iov_base, + RQ_ENET_TYPE_NOT_SOP, iovp[i].iov_len); + ep->e.dg.ep_hdr_ptr[index] = (struct usd_udp_hdr *) hdr_ptr; + + index = (index+1) & rq->urq_post_index_mask; + desc = (struct rq_enet_desc *) ((uintptr_t)rq->urq_desc_ring + + (index<<4)); + } + + if ((flags & FI_MORE) == 0) { + wmb(); + iowrite32(index, &vrq->ctrl->posted_index); + } + + rq->urq_next_desc = desc; + rq->urq_post_index = index; + rq->urq_recv_credits -= msg->iov_count; + + return 0; +} + +ssize_t +usdf_dgram_prefix_send(struct fid_ep *fep, const void *buf, size_t len, + void *desc, fi_addr_t 
dest_addr, void *context) +{ + struct usd_udp_hdr *hdr; + struct usd_qp_impl *qp; + struct usdf_dest *dest; + struct usdf_ep *ep; + struct usd_wq *wq; + uint32_t last_post; + uint32_t flags; + size_t padding; + + ep = ep_ftou(fep); + dest = (struct usdf_dest *)(uintptr_t) dest_addr; + padding = USDF_HDR_BUF_ENTRY - sizeof(struct usd_udp_hdr); + flags = (ep->ep_tx_completion) ? USD_SF_SIGNAL : 0; + + assert(len <= ep->max_msg_size); + + if (ep->e.dg.tx_op_flags & FI_INJECT) { + if ((len - padding) > USD_SEND_MAX_COPY) { + USDF_DBG_SYS(EP_DATA, + "given inject length (%zu) exceeds max inject length (%d)\n", + len, USD_SEND_MAX_COPY); + return -FI_ENOSPC; + } + + return usd_post_send_one_copy(ep->e.dg.ep_qp, &dest->ds_dest, + ((uint8_t *)buf) + USDF_HDR_BUF_ENTRY, len - + USDF_HDR_BUF_ENTRY, flags, + context); + } + + qp = to_qpi(ep->e.dg.ep_qp); + wq = &qp->uq_wq; + + hdr = (struct usd_udp_hdr *) ((char *) buf + padding); + memcpy(hdr, &dest->ds_dest.ds_dest.ds_udp.u_hdr, sizeof(*hdr)); + + _usdf_adjust_prefix_hdr(hdr, qp, len, padding); + + last_post = _usd_post_send_one(wq, hdr, len - padding, + ep->ep_tx_completion); + + _usdf_adjust_post_info(wq, last_post, context, len - USDF_HDR_BUF_ENTRY); + + return FI_SUCCESS; +} + +static ssize_t +_usdf_dgram_send_iov_prefix(struct usdf_ep *ep, + struct usd_dest *dest, const struct iovec *iov, + size_t count, void *context, uint8_t cq_entry) +{ + struct iovec send_iov[USDF_DGRAM_MAX_SGE]; + struct usd_udp_hdr *hdr; + struct usd_qp_impl *qp; + uint32_t last_post; + struct usd_wq *wq; + size_t padding; + size_t len; + + qp = to_qpi(ep->e.dg.ep_qp); + wq = &qp->uq_wq; + + len = _usdf_iov_len(iov, count); + padding = USDF_HDR_BUF_ENTRY - sizeof(struct usd_udp_hdr); + + assert(len <= ep->max_msg_size); + + hdr = (struct usd_udp_hdr *) ((char *) iov[0].iov_base + + padding); + memcpy(hdr, &dest->ds_dest.ds_udp.u_hdr, sizeof(*hdr)); + + _usdf_adjust_prefix_hdr(hdr, qp, len, padding); + + memcpy(send_iov, iov, sizeof(struct iovec) * count); + send_iov[0].iov_base = hdr; + send_iov[0].iov_len -= padding; + + last_post = _usd_post_send_iov(wq, send_iov, count, cq_entry); + _usdf_adjust_post_info(wq, last_post, context, len - USDF_HDR_BUF_ENTRY); + + return FI_SUCCESS; +} + +ssize_t +usdf_dgram_prefix_sendv(struct fid_ep *fep, const struct iovec *iov, + void **desc, size_t count, fi_addr_t dest_addr, void *context) +{ + struct iovec send_iov[USDF_DGRAM_MAX_SGE]; + struct usd_dest *dest; + struct usdf_ep *ep; + size_t len; + size_t padding; + + ep = ep_ftou(fep); + dest = (struct usd_dest *)(uintptr_t) dest_addr; + len = _usdf_iov_len(iov, count); + padding = USDF_HDR_BUF_ENTRY - sizeof(struct usd_udp_hdr); + + assert(len <= ep->max_msg_size); + + if (count > ep->e.dg.tx_iov_limit) { + USDF_DBG_SYS(EP_DATA, "max iov count exceeded: %zu\n", count); + return -FI_ENOSPC; + } + + if ((len - padding) <= USD_SEND_MAX_COPY) { + /* _usdf_dgram_send_iov_copy isn't prefix aware and allocates + * its own prefix. reorganize iov[0] base to point to data and + * len to reflect data length. 
+ */ + memcpy(send_iov, iov, sizeof(struct iovec) * count); + send_iov[0].iov_base = ((char *) send_iov[0].iov_base + + USDF_HDR_BUF_ENTRY); + send_iov[0].iov_len -= USDF_HDR_BUF_ENTRY; + + return _usdf_dgram_send_iov_copy(ep, dest, send_iov, count, + context, ep->ep_tx_completion); + } else if (ep->e.dg.tx_op_flags & FI_INJECT) { + USDF_DBG_SYS(EP_DATA, + "given inject length (%zu) exceeds max inject length (%d)\n", + len, USD_SEND_MAX_COPY); + return -FI_ENOSPC; + } + + return _usdf_dgram_send_iov_prefix(ep, dest, iov, count, context, + ep->ep_tx_completion); +} + +ssize_t +usdf_dgram_prefix_sendmsg(struct fid_ep *fep, const struct fi_msg *msg, + uint64_t flags) +{ + struct iovec send_iov[USDF_DGRAM_MAX_SGE]; + struct usd_dest *dest; + struct usdf_ep *ep; + uint8_t completion; + size_t len; + size_t padding; + + ep = ep_ftou(fep); + dest = (struct usd_dest *)(uintptr_t) msg->addr; + len = _usdf_iov_len(msg->msg_iov, msg->iov_count); + completion = ep->ep_tx_dflt_signal_comp || (flags & FI_COMPLETION); + padding = USDF_HDR_BUF_ENTRY - sizeof(struct usd_udp_hdr); + + assert(len <= ep->max_msg_size); + + if (msg->iov_count > ep->e.dg.tx_iov_limit) { + USDF_DBG_SYS(EP_DATA, "max iov count exceeded: %zu\n", + msg->iov_count); + return -FI_ENOSPC; + } + + if ((len - padding) <= USD_SEND_MAX_COPY) { + /* _usdf_dgram_send_iov_copy isn't prefix aware and allocates + * its own prefix. reorganize iov[0] base to point to data and + * len to reflect data length. + */ + memcpy(send_iov, msg->msg_iov, + sizeof(struct iovec) * msg->iov_count); + send_iov[0].iov_base = ((char *) send_iov[0].iov_base + + USDF_HDR_BUF_ENTRY); + send_iov[0].iov_len -= USDF_HDR_BUF_ENTRY; + + return _usdf_dgram_send_iov_copy(ep, dest, send_iov, + msg->iov_count, msg->context, completion); + } else if (flags & FI_INJECT) { + USDF_DBG_SYS(EP_DATA, + "given inject length (%zu) exceeds max inject length (%d)\n", + len, USD_SEND_MAX_COPY); + return -FI_ENOSPC; + } + + return _usdf_dgram_send_iov_prefix(ep, dest, msg->msg_iov, + msg->iov_count, msg->context, completion); +} + +ssize_t usdf_dgram_prefix_rx_size_left(struct fid_ep *fep) +{ + struct usdf_ep *ep; + + USDF_DBG_SYS(EP_DATA, "\n"); + + if (fep == NULL) + return -FI_EINVAL; + + ep = ep_ftou(fep); + + if (!(ep->flags & USDF_EP_ENABLED)) + return -FI_EOPBADSTATE; + + /* prefix_recvv can post up to iov_limit descriptors + */ + return (usd_get_recv_credits(ep->e.dg.ep_qp) / ep->e.dg.rx_iov_limit); +} + +ssize_t usdf_dgram_prefix_tx_size_left(struct fid_ep *fep) +{ + struct usdf_ep *ep; + + USDF_DBG_SYS(EP_DATA, "\n"); + + if (fep == NULL) + return -FI_EINVAL; + + ep = ep_ftou(fep); + + if (!(ep->flags & USDF_EP_ENABLED)) + return -FI_EOPBADSTATE; + + /* prefix_sendvcan post up to iov_limit descriptors + */ + return (usd_get_send_credits(ep->e.dg.ep_qp) / ep->e.dg.tx_iov_limit); +} diff --git a/prov/usnic/src/usdf_dgram.h b/prov/usnic/src/usdf_dgram.h new file mode 100644 index 00000000000..8c3b6a54182 --- /dev/null +++ b/prov/usnic/src/usdf_dgram.h @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2014-2017, Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef _USDF_DGRAM_H_ +#define _USDF_DGRAM_H_ + +#define USDF_DGRAM_MAX_SGE 8 +#define USDF_DGRAM_DFLT_SGE 4 + +#define USDF_DGRAM_CAPS (FI_MSG | FI_SOURCE | FI_SEND | FI_RECV) + +#define USDF_DGRAM_SUPP_MODE (FI_LOCAL_MR | FI_MSG_PREFIX) + +#define USDF_DGRAM_MSG_ORDER (FI_ORDER_NONE) +#define USDF_DGRAM_COMP_ORDER (FI_ORDER_NONE) +#define USDF_DGRAM_INJECT_SIZE \ + (USD_SEND_MAX_COPY - sizeof(struct usd_udp_hdr)) +#define USDF_DGRAM_SUPP_SENDMSG_FLAGS \ + (FI_INJECT | FI_COMPLETION | FI_INJECT_COMPLETE | FI_TRANSMIT_COMPLETE) +#define USDF_DGRAM_SUPP_RECVMSG_FLAGS (FI_COMPLETION) +#define USDF_DGRAM_IOV_LIMIT (USDF_DGRAM_DFLT_SGE) +#define USDF_DGRAM_RMA_IOV_LIMIT 0 +#define USDF_DGRAM_CNTR_CNT 0 +#define USDF_DGRAM_MR_IOV_LIMIT (USDF_MR_IOV_LIMIT) +#define USDF_DGRAM_MR_CNT (USDF_MR_CNT) + + +int usdf_dgram_fill_rx_attr(uint32_t version, const struct fi_info *hints, + struct fi_info *fi, struct usd_device_attrs *dap); +int usdf_dgram_fill_tx_attr(uint32_t version, const struct fi_info *hints, + struct fi_info *fi, struct usd_device_attrs *dap); +int usdf_dgram_fill_dom_attr(uint32_t version, const struct fi_info *hints, + struct fi_info *fi, struct usd_device_attrs *dap); +int usdf_dgram_fill_ep_attr(uint32_t version, const struct fi_info *hints, + struct fi_info *fi, struct usd_device_attrs *dap); + +/* fi_ops_msg for DGRAM */ +ssize_t usdf_dgram_recv(struct fid_ep *ep, void *buf, size_t len, void *desc, + fi_addr_t src_addr, void *context); +ssize_t usdf_dgram_recvv(struct fid_ep *ep, const struct iovec *iov, + void **desc, size_t count, fi_addr_t src_addr, void *context); +ssize_t usdf_dgram_recvmsg(struct fid_ep *ep, const struct fi_msg *msg, + uint64_t flags); +ssize_t usdf_dgram_send(struct fid_ep *ep, const void *buf, size_t len, + void *desc, fi_addr_t dest_addr, void *context); +ssize_t usdf_dgram_sendv(struct fid_ep *ep, const struct iovec *iov, + void **desc, size_t count, fi_addr_t dest_addr, void *context); +ssize_t usdf_dgram_sendmsg(struct fid_ep 
*ep, const struct fi_msg *msg, + uint64_t flags); +ssize_t usdf_dgram_inject(struct fid_ep *ep, const void *buf, size_t len, + fi_addr_t dest_addr); +ssize_t usdf_dgram_rx_size_left(struct fid_ep *ep); +ssize_t usdf_dgram_tx_size_left(struct fid_ep *ep); + +ssize_t usdf_dgram_prefix_recv(struct fid_ep *ep, void *buf, size_t len, + void *desc, fi_addr_t src_addr, void *context); +ssize_t usdf_dgram_prefix_recvv(struct fid_ep *ep, const struct iovec *iov, + void **desc, size_t count, fi_addr_t src_addr, void *context); +ssize_t usdf_dgram_prefix_recvmsg(struct fid_ep *fep, const struct fi_msg *msg, + uint64_t flags); +ssize_t usdf_dgram_prefix_send(struct fid_ep *ep, const void *buf, size_t len, + void *desc, fi_addr_t dest_addr, void *context); +ssize_t usdf_dgram_prefix_sendv(struct fid_ep *fep, const struct iovec *iov, + void **desc, size_t count, fi_addr_t dest_addr, void *context); +ssize_t usdf_dgram_prefix_sendmsg(struct fid_ep *ep, const struct fi_msg *msg, + uint64_t flags); +ssize_t usdf_dgram_prefix_inject(struct fid_ep *ep, const void *buf, size_t len, + fi_addr_t dest_addr); +ssize_t usdf_dgram_prefix_rx_size_left(struct fid_ep *ep); +ssize_t usdf_dgram_prefix_tx_size_left(struct fid_ep *ep); + +#endif /* _USDF_DGRAM_H_ */ diff --git a/prov/usnic/src/usdf_domain.c b/prov/usnic/src/usdf_domain.c new file mode 100644 index 00000000000..fb4aa4caf54 --- /dev/null +++ b/prov/usnic/src/usdf_domain.c @@ -0,0 +1,427 @@ +/* + * Copyright (c) 2014-2019, Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "config.h" + +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include "ofi.h" +#include "ofi_enosys.h" +#include "ofi_util.h" + +#include "usnic_direct.h" +#include "usdf.h" +#include "usdf_timer.h" +#include "usdf_poll.h" +#include "usdf_cm.h" + +static int +usdf_domain_bind(struct fid *fid, struct fid *bfid, uint64_t flags) +{ + struct usdf_domain *udp; + + USDF_TRACE_SYS(DOMAIN, "\n"); + + if (flags & FI_REG_MR) { + USDF_WARN_SYS(DOMAIN, + "FI_REG_MR for EQs is not supported by the usnic provider"); + return -FI_EOPNOTSUPP; + } + + udp = dom_fidtou(fid); + + switch (bfid->fclass) { + case FI_CLASS_EQ: + if (udp->dom_eq != NULL) { + return -FI_EINVAL; + } + udp->dom_eq = eq_fidtou(bfid); + ofi_atomic_inc32(&udp->dom_eq->eq_refcnt); + break; + default: + return -FI_EINVAL; + } + + return 0; +} + +static int +usdf_domain_close(fid_t fid) +{ + struct usdf_domain *udp; + int ret; + + USDF_TRACE_SYS(DOMAIN, "\n"); + + udp = container_of(fid, struct usdf_domain, dom_fid.fid); + if (ofi_atomic_get32(&udp->dom_refcnt) > 0) { + return -FI_EBUSY; + } + + if (udp->dom_dev != NULL) { + ret = usd_close(udp->dom_dev); + if (ret != 0) { + return ret; + } + } + + if (udp->dom_eq != NULL) { + ofi_atomic_dec32(&udp->dom_eq->eq_refcnt); + } + ofi_atomic_dec32(&udp->dom_fabric->fab_refcnt); + LIST_REMOVE(udp, dom_link); + fi_freeinfo(udp->dom_info); + free(udp); + + return 0; +} + +static struct fi_ops usdf_fid_ops = { + .size = sizeof(struct fi_ops), + .close = usdf_domain_close, + .bind = usdf_domain_bind, + .control = fi_no_control, + .ops_open = fi_no_ops_open, +}; + +static struct fi_ops_mr usdf_domain_mr_ops = { + .size = sizeof(struct fi_ops_mr), + .reg = usdf_reg_mr, + .regv = usdf_regv_mr, + .regattr = usdf_regattr, +}; + +static struct fi_ops_domain usdf_domain_ops = { + .size = sizeof(struct fi_ops_domain), + .av_open = usdf_av_open, + .cq_open = usdf_cq_open, + .endpoint = usdf_endpoint_open, + .scalable_ep = fi_no_scalable_ep, + .cntr_open = fi_no_cntr_open, + .poll_open = usdf_poll_open, + .stx_ctx = fi_no_stx_context, + .srx_ctx = fi_no_srx_context, + .query_atomic = usdf_query_atomic, + .query_collective = fi_no_query_collective, +}; + +int +usdf_domain_open(struct fid_fabric *fabric, struct fi_info *info, + struct fid_domain **domain, void *context) +{ + struct usdf_fabric *fp; + struct usdf_domain *udp; + struct sockaddr_in *sin; + size_t addrlen; + int ret; +#if ENABLE_DEBUG + char requested[INET_ADDRSTRLEN], actual[INET_ADDRSTRLEN]; +#endif + + USDF_TRACE_SYS(DOMAIN, "\n"); + sin = NULL; + + fp = fab_fidtou(fabric); + + if (info->domain_attr != NULL) { + /* No versioning information available here. 
*/ + if (!usdf_domain_checkname(0, fp->fab_dev_attrs, + info->domain_attr->name)) { + USDF_WARN_SYS(DOMAIN, "domain name mismatch\n"); + return -FI_ENODATA; + } + + if (ofi_check_mr_mode( + &usdf_ops, fabric->api_version, + FI_MR_BASIC | FI_MR_ALLOCATED | FI_MR_LOCAL, info)) { + /* the caller ignored our fi_getinfo results */ + USDF_WARN_SYS(DOMAIN, "MR mode (%d) not supported\n", + info->domain_attr->mr_mode); + return -FI_ENODATA; + } + } + + udp = calloc(1, sizeof *udp); + if (udp == NULL) { + USDF_DBG("unable to alloc mem for domain\n"); + ret = -FI_ENOMEM; + goto fail; + } + + USDF_DBG("uda_devname=%s\n", fp->fab_dev_attrs->uda_devname); + + /* + * Make sure address format is good and matches this fabric + */ + switch (info->addr_format) { + case FI_SOCKADDR: + addrlen = sizeof(struct sockaddr); + sin = info->src_addr; + break; + case FI_SOCKADDR_IN: + addrlen = sizeof(struct sockaddr_in); + sin = info->src_addr; + break; + case FI_ADDR_STR: + sin = usdf_format_to_sin(info, info->src_addr); + if (NULL == sin) { + ret = -FI_ENOMEM; + goto fail; + } + goto skip_size_check; + default: + ret = -FI_EINVAL; + goto fail; + } + + if (info->src_addrlen != addrlen) { + ret = -FI_EINVAL; + goto fail; + } + +skip_size_check: + if (sin->sin_family != AF_INET || + sin->sin_addr.s_addr != fp->fab_dev_attrs->uda_ipaddr_be) { + USDF_DBG_SYS(DOMAIN, "requested src_addr (%s) != fabric addr (%s)\n", + inet_ntop(AF_INET, &sin->sin_addr.s_addr, + requested, sizeof(requested)), + inet_ntop(AF_INET, &fp->fab_dev_attrs->uda_ipaddr_be, + actual, sizeof(actual))); + + ret = -FI_EINVAL; + usdf_free_sin_if_needed(info, sin); + goto fail; + } + usdf_free_sin_if_needed(info, sin); + + ret = usd_open(fp->fab_dev_attrs->uda_devname, &udp->dom_dev); + if (ret != 0) { + goto fail; + } + + udp->dom_fid.fid.fclass = FI_CLASS_DOMAIN; + udp->dom_fid.fid.context = context; + udp->dom_fid.fid.ops = &usdf_fid_ops; + udp->dom_fid.ops = &usdf_domain_ops; + udp->dom_fid.mr = &usdf_domain_mr_ops; + + ret = pthread_spin_init(&udp->dom_progress_lock, + PTHREAD_PROCESS_PRIVATE); + if (ret != 0) { + ret = -ret; + goto fail; + } + TAILQ_INIT(&udp->dom_tx_ready); + TAILQ_INIT(&udp->dom_hcq_list); + + udp->dom_info = fi_dupinfo(info); + if (udp->dom_info == NULL) { + ret = -FI_ENOMEM; + goto fail; + } + if (udp->dom_info->dest_addr != NULL) { + free(udp->dom_info->dest_addr); + udp->dom_info->dest_addr = NULL; + } + + udp->dom_fabric = fp; + LIST_INSERT_HEAD(&fp->fab_domain_list, udp, dom_link); + ofi_atomic_initialize32(&udp->dom_refcnt, 0); + ofi_atomic_inc32(&fp->fab_refcnt); + + *domain = &udp->dom_fid; + return 0; + +fail: + if (udp != NULL) { + if (udp->dom_info != NULL) { + fi_freeinfo(udp->dom_info); + } + if (udp->dom_dev != NULL) { + usd_close(udp->dom_dev); + } + free(udp); + } + return ret; +} + +/* In pre-1.4, the domain name was NULL. + * + * There used to be elaborate schemes to try to preserve this pre-1.4 + * behavior. In Nov 2019 discussions, however, it was determined that + * we could rationalize classifying this as buggy behavior. + * Specifically: we should just now always return a domain name -- + * even if the requested version is <1.4. + * + * This greatly simplifies the logic here, and also greatly simplifies + * layering with the rxd provider. 
+ */ +int usdf_domain_getname(uint32_t version, struct usd_device_attrs *dap, + char **name) +{ + int ret = FI_SUCCESS; + char *buf = NULL; + + buf = strdup(dap->uda_devname); + if (NULL == buf) { + ret = -errno; + USDF_DBG("strdup failed while creating domain name\n"); + } else { + *name = buf; + } + + return ret; +} + +/* Check to see if the name supplied in a hint matches the name of our + * current domain. + * + * In pre-1.4, the domain name was NULL. + * + * There used to be elaborate schemes to try to preserve this pre-1.4 + * behavior. In Nov 2019 discussions, however, it was determined that + * we could rationalize classifying this as buggy behavior. + * Specifically: we should just now always return a domain name -- + * even if the requested version is <1.4. + * + * This greatly simplifies the logic here, and also greatly simplifies + * layering with the rxd provider. + * + * Hence, if a hint was provided, check the domain name (that we now + * always have) against the hint. + */ +bool usdf_domain_checkname(uint32_t version, struct usd_device_attrs *dap, + const char *hint) +{ + char *reference = NULL; + bool valid; + int ret; + + /* If no hint was provided, then by definition, we agree with + * the hint. */ + if (NULL == hint) { + return true; + } + + USDF_DBG("checking domain name: domain name='%s'\n", hint); + + ret = usdf_domain_getname(version, dap, &reference); + if (ret < 0) { + return false; + } + + valid = (strcmp(reference, hint) == 0); + if (!valid) { + USDF_DBG("given hint %s does not match %s -- invalid\n", + hint, reference); + } + + free(reference); + return valid; +} + +/* Query domain's atomic capability. + * We dont support atomic operations, just return EOPNOTSUPP. + */ +int usdf_query_atomic(struct fid_domain *domain, enum fi_datatype datatype, + enum fi_op op, struct fi_atomic_attr *attr, uint64_t flags) +{ + return -FI_EOPNOTSUPP; +} + +/* Catch the version changes for domain_attr. */ +int usdf_catch_dom_attr(uint32_t version, const struct fi_info *hints, + struct fi_domain_attr *dom_attr) +{ + /* version 1.5 introduced new bits. If the user asked for older + * version, we can't return these new bits. + */ + if (FI_VERSION_LT(version, FI_VERSION(1, 5))) { + /* We checked mr_mode compatibility before calling + * this function. This means it is safe to return + * 1.4 default mr_mode. + */ + dom_attr->mr_mode = FI_MR_BASIC; + + /* FI_REMOTE_COMM is introduced in 1.5. So don't return it. */ + dom_attr->caps &= ~FI_REMOTE_COMM; + + /* If FI_REMOTE_COMM is given for version < 1.5, fail. */ + if (hints && hints->domain_attr) { + if (hints->domain_attr->caps == FI_REMOTE_COMM) + return -FI_EBADFLAGS; + } + } else { + dom_attr->mr_mode &= ~(FI_MR_BASIC | FI_MR_SCALABLE); + } + + return FI_SUCCESS; +} + +/* Catch the version changes for tx_attr. */ +int usdf_catch_tx_attr(uint32_t version, const struct fi_tx_attr *tx_attr) +{ + /* In version < 1.5, FI_LOCAL_MR is required. */ + if (FI_VERSION_LT(version, FI_VERSION(1, 5))) { + if ((tx_attr->mode & FI_LOCAL_MR) == 0) + return -FI_ENODATA; + } + + return FI_SUCCESS; +} + +/* Catch the version changes for rx_attr. */ +int usdf_catch_rx_attr(uint32_t version, const struct fi_rx_attr *rx_attr) +{ + /* In version < 1.5, FI_LOCAL_MR is required. 
*/ + if (FI_VERSION_LT(version, FI_VERSION(1, 5))) { + if ((rx_attr->mode & FI_LOCAL_MR) == 0) + return -FI_ENODATA; + } + + return FI_SUCCESS; +} diff --git a/prov/usnic/src/usdf_endpoint.c b/prov/usnic/src/usdf_endpoint.c new file mode 100644 index 00000000000..1fa33ce1728 --- /dev/null +++ b/prov/usnic/src/usdf_endpoint.c @@ -0,0 +1,123 @@ +/* + * Copyright (c) 2014-2019, Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "config.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include "ofi.h" +#include "ofi_enosys.h" + +#include "usdf.h" +#include "usdf_endpoint.h" +#include "usdf_cm.h" + +int +usdf_endpoint_open(struct fid_domain *domain, struct fi_info *info, + struct fid_ep **ep_o, void *context) +{ + USDF_TRACE_SYS(EP_CTRL, "\n"); + + switch (info->ep_attr->type) { + case FI_EP_DGRAM: + return usdf_ep_dgram_open(domain, info, ep_o, context); + default: + return -FI_ENODEV; + } +} + +int usdf_ep_getopt_connected(fid_t fid, int level, int optname, void *optval, + size_t *optlen) +{ + size_t *cm_size; + size_t dest_size; + + USDF_TRACE_SYS(EP_CTRL, "\n"); + + if (!optval || !optlen) + return -FI_EINVAL; + + if (level != FI_OPT_ENDPOINT) + return -FI_ENOPROTOOPT; + + switch (optname) { + case FI_OPT_CM_DATA_SIZE: + dest_size = *optlen; + *optlen = sizeof(*cm_size); + + if (dest_size < sizeof(*cm_size)) + return -FI_ETOOSMALL; + + cm_size = optval; + *cm_size = USDF_MAX_CONN_DATA; + break; + default: + return -FI_ENOPROTOOPT; + } + + return FI_SUCCESS; +} + +int usdf_ep_getopt_unconnected(fid_t fid, int level, int optname, void *optval, + size_t *optlen) +{ + USDF_TRACE_SYS(EP_CTRL, "\n"); + + return -FI_ENOPROTOOPT; +} + + +int usdf_ep_setopt(fid_t fid, int level, int optname, const void *optval, + size_t optlen) +{ + USDF_TRACE_SYS(EP_CTRL, "\n"); + + return -FI_ENOPROTOOPT; +} diff --git a/prov/usnic/src/usdf_endpoint.h b/prov/usnic/src/usdf_endpoint.h new file mode 100644 index 00000000000..1bbad52869e --- /dev/null +++ b/prov/usnic/src/usdf_endpoint.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2014-2019, Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ +#ifndef _USDF_ENDPOINT_H_ +#define _USDF_ENDPOINT_H_ + +int usdf_ep_port_bind(struct usdf_ep *ep, struct fi_info *info); +int usdf_ep_dgram_open(struct fid_domain *domain, struct fi_info *info, + struct fid_ep **ep, void *context); +int usdf_msg_upd_lcl_addr(struct usdf_ep *ep); + +int usdf_ep_getopt_connected(fid_t fid, int level, int optname, void *optval, + size_t *optlen); +int usdf_ep_getopt_unconnected(fid_t fid, int level, int optname, void *optval, + size_t *optlen); +int usdf_ep_setopt(fid_t fid, int level, int optname, const void *optval, + size_t optlen); + +extern struct fi_ops usdf_ep_ops; + +#endif /* _USDF_ENDPOINT_H_ */ diff --git a/prov/usnic/src/usdf_ep_dgram.c b/prov/usnic/src/usdf_ep_dgram.c new file mode 100644 index 00000000000..d260308dbb2 --- /dev/null +++ b/prov/usnic/src/usdf_ep_dgram.c @@ -0,0 +1,943 @@ +/* + * Copyright (c) 2014-2019, Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "config.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include "ofi.h" +#include "ofi_enosys.h" +#include "ofi_util.h" + +#include "usnic_direct.h" +#include "usd.h" +#include "usdf.h" +#include "usdf_endpoint.h" +#include "usdf_dgram.h" +#include "usdf_av.h" +#include "usdf_cq.h" +#include "usdf_cm.h" + +static int +usdf_ep_dgram_enable(struct fid_ep *fep) +{ + struct usdf_ep *ep; + struct usd_filter filt; + struct usd_qp_impl *uqp; + int ret; + + USDF_TRACE_SYS(EP_CTRL, "\n"); + + ep = ep_ftou(fep); + + if (ep->e.dg.ep_wcq == NULL) { + ret = -FI_EOPBADSTATE; + goto fail; + } + if (ep->e.dg.ep_rcq == NULL) { + ret = -FI_EOPBADSTATE; + goto fail; + } + + filt.uf_type = USD_FTY_UDP_SOCK; + filt.uf_filter.uf_udp_sock.u_sock = ep->e.dg.ep_sock; + + if (ep->ep_caps & USDF_EP_CAP_PIO) { + ret = usd_create_qp(ep->ep_domain->dom_dev, + USD_QTR_UDP, + USD_QTY_UD_PIO, + ep->e.dg.ep_wcq->c.hard.cq_cq, + ep->e.dg.ep_rcq->c.hard.cq_cq, + 127, // XXX + 127, // XXX + &filt, + &ep->e.dg.ep_qp); + } else { + ret = -FI_EAGAIN; + } + + if (ret != 0) { + ret = usd_create_qp(ep->ep_domain->dom_dev, + USD_QTR_UDP, + USD_QTY_UD, + ep->e.dg.ep_wcq->c.hard.cq_cq, + ep->e.dg.ep_rcq->c.hard.cq_cq, + ep->ep_wqe, + ep->ep_rqe, + &filt, + &ep->e.dg.ep_qp); + } + if (ret != 0) { + goto fail; + } + ep->e.dg.ep_qp->uq_context = ep; + + /* + * Allocate a memory region big enough to hold a header for each + * RQ entry + */ + uqp = to_qpi(ep->e.dg.ep_qp); + ep->e.dg.ep_hdr_ptr = calloc(uqp->uq_rq.urq_num_entries, + sizeof(ep->e.dg.ep_hdr_ptr[0])); + if (ep->e.dg.ep_hdr_ptr == NULL) { + ret = -FI_ENOMEM; + goto fail; + } + + ret = usd_alloc_mr(ep->ep_domain->dom_dev, + usd_get_recv_credits(ep->e.dg.ep_qp) * USDF_HDR_BUF_ENTRY, + &ep->e.dg.ep_hdr_buf); + if (ret != 0) { + goto fail; + } + + ep->flags |= USDF_EP_ENABLED; + + return 0; + +fail: + free(ep->e.dg.ep_hdr_ptr); + ep->e.dg.ep_hdr_ptr = NULL; + + if (ep->e.dg.ep_qp != NULL) { + usd_destroy_qp(ep->e.dg.ep_qp); + ep->e.dg.ep_qp = NULL; + } + return ret; +} + +static int +usdf_ep_dgram_bind(struct fid *fid, struct fid *bfid, uint64_t flags) +{ + int ret; + struct usdf_ep *ep; + struct usdf_cq *cq; + struct usdf_av *av; + + USDF_TRACE_SYS(EP_CTRL, "\n"); + + /* Backward compatibility case for Open MPI. We haven't been validating the flags until now. + * Older version of Open MPI gives FI_RECV as AV bind flag (bug). */ + if (bfid->fclass == FI_CLASS_AV) { + av = av_fidtou(bfid); + if (av->av_domain->dom_info->fabric_attr->api_version <= FI_VERSION(1, 4) && (flags & FI_RECV)) + flags = flags & ~FI_RECV; + } + + /* Check if the binding flags are valid. */ + ret = ofi_ep_bind_valid(&usdf_ops, bfid, flags); + if (ret) + return ret; + + ep = ep_fidtou(fid); + + switch (bfid->fclass) { + + case FI_CLASS_AV: + if (ep->e.dg.ep_av != NULL) { + return -FI_EINVAL; + } + + av = av_fidtou(bfid); + ep->e.dg.ep_av = av; + ofi_atomic_inc32(&av->av_refcnt); + break; + + case FI_CLASS_CQ: + cq = cq_fidtou(bfid); + + /* actually, could look through CQ list for a hard + * CQ with function usd_poll_cq() and use that... XXX + */ + if (cq->cq_is_soft) { + return -FI_EINVAL; + } + if (cq->c.hard.cq_cq == NULL) { + ret = usdf_cq_create_cq(cq, &cq->c.hard.cq_cq, true); + if (ret != 0) { + return ret; + } + } + + if (flags & FI_SEND) { + if (ep->e.dg.ep_wcq != NULL) { + return -FI_EINVAL; + } + + ep->ep_tx_dflt_signal_comp = + (flags & FI_SELECTIVE_COMPLETION) ? 
0 : 1; + + ep->ep_tx_completion = (ep->ep_tx_dflt_signal_comp || + (ep->e.dg.tx_op_flags & FI_COMPLETION)); + + ep->e.dg.ep_wcq = cq; + ofi_atomic_inc32(&cq->cq_refcnt); + } + + if (flags & FI_RECV) { + if (ep->e.dg.ep_rcq != NULL) { + return -FI_EINVAL; + } + + if (flags & FI_SELECTIVE_COMPLETION) + return -FI_EOPNOTSUPP; + + ep->ep_rx_dflt_signal_comp = + (flags & FI_SELECTIVE_COMPLETION) ? 0 : 1; + + ep->ep_rx_completion = (ep->ep_rx_dflt_signal_comp || + (ep->e.dg.rx_op_flags & FI_COMPLETION)); + + ep->e.dg.ep_rcq = cq; + ofi_atomic_inc32(&cq->cq_refcnt); + } + break; + + case FI_CLASS_EQ: + if (ep->ep_eq != NULL) { + return -FI_EINVAL; + } + ep->ep_eq = eq_fidtou(bfid); + ofi_atomic_inc32(&ep->ep_eq->eq_refcnt); + break; + default: + return -FI_EINVAL; + } + + return 0; +} + +static void +usdf_ep_dgram_deref_cq(struct usdf_cq *cq) +{ + struct usdf_cq_hard *hcq; + void (*rtn)(struct usdf_cq_hard *hcq); + + if (cq == NULL) { + return; + } + ofi_atomic_dec32(&cq->cq_refcnt); + + rtn = usdf_progress_hard_cq; + + if (cq->cq_is_soft) { + TAILQ_FOREACH(hcq, &cq->c.soft.cq_list, cqh_link) { + if (hcq->cqh_progress == rtn) { + ofi_atomic_dec32(&hcq->cqh_refcnt); + return; + } + } + } +} + +static int +usdf_ep_dgram_close(fid_t fid) +{ + struct usdf_ep *ep; + + USDF_TRACE_SYS(EP_CTRL, "\n"); + + ep = ep_fidtou(fid); + + if (ofi_atomic_get32(&ep->ep_refcnt) > 0) { + return -FI_EBUSY; + } + + free(ep->e.dg.ep_hdr_ptr); + + if (ep->e.dg.ep_qp != NULL) { + usd_destroy_qp(ep->e.dg.ep_qp); + } + ofi_atomic_dec32(&ep->ep_domain->dom_refcnt); + if (ep->ep_eq != NULL) { + ofi_atomic_dec32(&ep->ep_eq->eq_refcnt); + } + + if (ep->e.dg.ep_av) + ofi_atomic_dec32(&ep->e.dg.ep_av->av_refcnt); + + usdf_ep_dgram_deref_cq(ep->e.dg.ep_wcq); + usdf_ep_dgram_deref_cq(ep->e.dg.ep_rcq); + + if (ep->e.dg.ep_sock != -1) { + close(ep->e.dg.ep_sock); + } + + free(ep); + return 0; +} + +static struct fi_ops_ep usdf_base_dgram_ops = { + .size = sizeof(struct fi_ops_ep), + .cancel = fi_no_cancel, + .getopt = fi_no_getopt, + .setopt = fi_no_setopt, + .tx_ctx = fi_no_tx_ctx, + .rx_ctx = fi_no_rx_ctx, + .rx_size_left = usdf_dgram_rx_size_left, + .tx_size_left = usdf_dgram_tx_size_left, +}; + +static struct fi_ops_ep usdf_base_dgram_prefix_ops = { + .size = sizeof(struct fi_ops_ep), + .cancel = fi_no_cancel, + .getopt = fi_no_getopt, + .setopt = fi_no_setopt, + .tx_ctx = fi_no_tx_ctx, + .rx_ctx = fi_no_rx_ctx, + .rx_size_left = usdf_dgram_prefix_rx_size_left, + .tx_size_left = usdf_dgram_prefix_tx_size_left, +}; + +static struct fi_ops_msg usdf_dgram_ops = { + .size = sizeof(struct fi_ops_msg), + .recv = usdf_dgram_recv, + .recvv = usdf_dgram_recvv, + .recvmsg = usdf_dgram_recvmsg, + .send = usdf_dgram_send, + .sendv = usdf_dgram_sendv, + .sendmsg = usdf_dgram_sendmsg, + .inject = usdf_dgram_inject, + .senddata = fi_no_msg_senddata, + .injectdata = fi_no_msg_injectdata, +}; + +static struct fi_ops_msg usdf_dgram_prefix_ops = { + .size = sizeof(struct fi_ops_msg), + .recv = usdf_dgram_prefix_recv, + .recvv = usdf_dgram_prefix_recvv, + .recvmsg = usdf_dgram_prefix_recvmsg, + .send = usdf_dgram_prefix_send, + .sendv = usdf_dgram_prefix_sendv, + .sendmsg = usdf_dgram_prefix_sendmsg, + .inject = usdf_dgram_prefix_inject, + .senddata = fi_no_msg_senddata, + .injectdata = fi_no_msg_injectdata, +}; + +static struct fi_ops_cm usdf_cm_dgram_ops = { + .size = sizeof(struct fi_ops_cm), + .setname = fi_no_setname, + .getname = usdf_cm_dgram_getname, + .getpeer = fi_no_getpeer, + .connect = fi_no_connect, + .listen = fi_no_listen, + .accept 
= fi_no_accept, + .reject = fi_no_reject, + .shutdown = fi_no_shutdown, + .join = fi_no_join, +}; + +static struct fi_ops_atomic usdf_dgram_atomic_ops = { + .size = sizeof(struct fi_ops_atomic), + .write = fi_no_atomic_write, + .writev = fi_no_atomic_writev, + .writemsg = fi_no_atomic_writemsg, + .inject = fi_no_atomic_inject, + .readwrite = fi_no_atomic_readwrite, + .readwritev = fi_no_atomic_readwritev, + .readwritemsg = fi_no_atomic_readwritemsg, + .compwrite = fi_no_atomic_compwrite, + .compwritev = fi_no_atomic_compwritev, + .compwritemsg = fi_no_atomic_compwritemsg, + .writevalid = fi_no_atomic_writevalid, + .readwritevalid = fi_no_atomic_readwritevalid, + .compwritevalid = fi_no_atomic_compwritevalid, +}; + +/******************************************************************************* + * Default values for dgram attributes + ******************************************************************************/ +static const struct fi_tx_attr dgram_dflt_tx_attr = { + .caps = USDF_DGRAM_CAPS, + .mode = USDF_DGRAM_SUPP_MODE, + .op_flags = 0, + .msg_order = USDF_DGRAM_MSG_ORDER, + .comp_order = USDF_DGRAM_COMP_ORDER, + .inject_size = USDF_DGRAM_INJECT_SIZE, + .iov_limit = USDF_DGRAM_IOV_LIMIT, + .rma_iov_limit = USDF_DGRAM_RMA_IOV_LIMIT +}; + +static const struct fi_rx_attr dgram_dflt_rx_attr = { + .caps = USDF_DGRAM_CAPS, + .mode = USDF_DGRAM_SUPP_MODE, + .op_flags = 0, + .msg_order = USDF_DGRAM_MSG_ORDER, + .comp_order = USDF_DGRAM_COMP_ORDER, + .total_buffered_recv = 0, + .iov_limit = USDF_DGRAM_IOV_LIMIT +}; + +static const struct fi_ep_attr dgram_dflt_ep_attr = { + .type = FI_EP_DGRAM, + .protocol = FI_PROTO_UDP, + .msg_prefix_size = 0, + .max_order_raw_size = 0, + .max_order_war_size = 0, + .max_order_waw_size = 0, + .mem_tag_format = 0, + .tx_ctx_cnt = 1, + .rx_ctx_cnt = 1 +}; + +static const struct fi_domain_attr dgram_dflt_domain_attr = { + .caps = USDF_DOM_CAPS, + .threading = FI_THREAD_ENDPOINT, + .control_progress = FI_PROGRESS_AUTO, + .data_progress = FI_PROGRESS_MANUAL, + .resource_mgmt = FI_RM_DISABLED, + .mr_mode = FI_MR_ALLOCATED | FI_MR_LOCAL | FI_MR_BASIC, + .cntr_cnt = USDF_DGRAM_CNTR_CNT, + .mr_iov_limit = USDF_DGRAM_MR_IOV_LIMIT, + .mr_cnt = USDF_DGRAM_MR_CNT, +}; + +/******************************************************************************* + * Fill functions for attributes + ******************************************************************************/ +int usdf_dgram_fill_ep_attr(uint32_t version, const struct fi_info *hints, struct + fi_info *fi, struct usd_device_attrs *dap) +{ + struct fi_ep_attr defaults; + + defaults = dgram_dflt_ep_attr; + + /* The ethernet header does not count against the MTU. */ + defaults.max_msg_size = dap->uda_mtu - sizeof(struct usd_udp_hdr); + + if (FI_VERSION_GE(version, FI_VERSION(1, 3))) + defaults.max_msg_size += sizeof(struct ether_header); + + if (!hints || !hints->ep_attr) + goto out; + + /* In prefix mode the max message size is the same as in non-prefix mode + * with the advertised header size added on top. 
+ */ + + if (hints->mode & FI_MSG_PREFIX) { + defaults.msg_prefix_size = USDF_HDR_BUF_ENTRY; + + if (FI_VERSION_GE(version, FI_VERSION(1, 3))) + defaults.max_msg_size += defaults.msg_prefix_size; + } + + if (hints->ep_attr->max_msg_size > defaults.max_msg_size) + return -FI_ENODATA; + + switch (hints->ep_attr->protocol) { + case FI_PROTO_UNSPEC: + case FI_PROTO_UDP: + break; + default: + return -FI_ENODATA; + } + + if (hints->ep_attr->tx_ctx_cnt > defaults.tx_ctx_cnt) + return -FI_ENODATA; + if (hints->ep_attr->rx_ctx_cnt > defaults.rx_ctx_cnt) + return -FI_ENODATA; + + if (hints->ep_attr->max_order_raw_size > defaults.max_order_raw_size) + return -FI_ENODATA; + if (hints->ep_attr->max_order_war_size > defaults.max_order_war_size) + return -FI_ENODATA; + if (hints->ep_attr->max_order_waw_size > defaults.max_order_waw_size) + return -FI_ENODATA; + +out: + *fi->ep_attr = defaults; + + return FI_SUCCESS; +} + +int usdf_dgram_fill_dom_attr(uint32_t version, const struct fi_info *hints, + struct fi_info *fi, struct usd_device_attrs *dap) +{ + int ret; + struct fi_domain_attr defaults; + + defaults = dgram_dflt_domain_attr; + ret = usdf_domain_getname(version, dap, &defaults.name); + if (ret < 0) + return -FI_ENODATA; + + if (!hints || !hints->domain_attr) + goto catch; + + switch (hints->domain_attr->threading) { + case FI_THREAD_UNSPEC: + case FI_THREAD_ENDPOINT: + break; + case FI_THREAD_FID: + case FI_THREAD_COMPLETION: + case FI_THREAD_DOMAIN: + defaults.threading = hints->domain_attr->threading; + break; + default: + return -FI_ENODATA; + } + + switch (hints->domain_attr->control_progress) { + case FI_PROGRESS_UNSPEC: + case FI_PROGRESS_AUTO: + break; + case FI_PROGRESS_MANUAL: + defaults.control_progress = + hints->domain_attr->control_progress; + break; + default: + return -FI_ENODATA; + } + + switch (hints->domain_attr->data_progress) { + case FI_PROGRESS_UNSPEC: + case FI_PROGRESS_MANUAL: + break; + default: + return -FI_ENODATA; + } + + switch (hints->domain_attr->resource_mgmt) { + case FI_RM_UNSPEC: + case FI_RM_DISABLED: + break; + default: + return -FI_ENODATA; + } + + switch (hints->domain_attr->caps) { + case 0: + case FI_REMOTE_COMM: + break; + default: + USDF_WARN_SYS(DOMAIN, + "invalid domain capabilities\n"); + return -FI_ENODATA; + } + + switch (hints->domain_attr->av_type) { + case FI_AV_UNSPEC: + case FI_AV_MAP: + break; + default: + return -FI_ENODATA; + } + + if (ofi_check_mr_mode(&usdf_ops, version, defaults.mr_mode, hints)) + return -FI_ENODATA; + + if (hints->domain_attr->mr_cnt) { + if (hints->domain_attr->mr_cnt <= USDF_DGRAM_MR_CNT) { + defaults.mr_cnt = hints->domain_attr->mr_cnt; + } else { + USDF_DBG_SYS(DOMAIN, + "mr_count exceeded provider limit\n"); + return -FI_ENODATA; + } + } + +catch: + /* catch the version change here. 
*/ + ret = usdf_catch_dom_attr(version, hints, &defaults); + if (ret) + return ret; + + *fi->domain_attr = defaults; + return FI_SUCCESS; +} + +int usdf_dgram_fill_tx_attr(uint32_t version, const struct fi_info *hints, + struct fi_info *fi, + struct usd_device_attrs *dap) +{ + int ret; + struct fi_tx_attr defaults; + size_t entries; + + defaults = dgram_dflt_tx_attr; + + defaults.size = dap->uda_max_send_credits / defaults.iov_limit; + + if (!hints || !hints->tx_attr) + goto out; + + /* make sure we can support the capabilities that are requested */ + if (hints->tx_attr->caps & ~USDF_DGRAM_CAPS) + return -FI_ENODATA; + + /* clear the mode bits the app doesn't support */ + if (hints->mode || hints->tx_attr->mode) + defaults.mode &= (hints->mode | hints->tx_attr->mode); + + defaults.op_flags |= hints->tx_attr->op_flags; + + if ((hints->tx_attr->msg_order | USDF_DGRAM_MSG_ORDER) != + USDF_DGRAM_MSG_ORDER) + return -FI_ENODATA; + if ((hints->tx_attr->comp_order | USDF_DGRAM_COMP_ORDER) != + USDF_DGRAM_COMP_ORDER) + return -FI_ENODATA; + + if (hints->tx_attr->inject_size > defaults.inject_size) + return -FI_ENODATA; + + if (hints->tx_attr->iov_limit > USDF_DGRAM_MAX_SGE) + return -FI_ENODATA; + + /* make sure the values for iov_limit and size are within appropriate + * bounds. if only one of the two was given, then set the other based + * on: + * max_credits = size * iov_limit; + */ + if (hints->tx_attr->iov_limit && hints->tx_attr->size) { + defaults.size = hints->tx_attr->size; + defaults.iov_limit = hints->tx_attr->iov_limit; + } else if (hints->tx_attr->iov_limit) { + defaults.iov_limit = hints->tx_attr->iov_limit; + defaults.size = + dap->uda_max_send_credits / defaults.iov_limit; + } else if (hints->tx_attr->size) { + defaults.size = hints->tx_attr->size; + defaults.iov_limit = + dap->uda_max_send_credits / defaults.size; + } + + entries = defaults.size * defaults.iov_limit; + if (entries > dap->uda_max_send_credits) + return -FI_ENODATA; + + if (hints->tx_attr->rma_iov_limit > defaults.rma_iov_limit) + return -FI_ENODATA; + +out: + /* Non-prefix mode requires extra descriptor for header. + */ + if (!hints || (hints && !(hints->mode & FI_MSG_PREFIX))) + defaults.iov_limit -= 1; + + /* catch version changes here. */ + ret = usdf_catch_tx_attr(version, &defaults); + if (ret) + return ret; + + *fi->tx_attr = defaults; + + return FI_SUCCESS; +} + +int usdf_dgram_fill_rx_attr(uint32_t version, const struct fi_info *hints, + struct fi_info *fi, struct usd_device_attrs *dap) +{ + int ret; + struct fi_rx_attr defaults; + size_t entries; + + defaults = dgram_dflt_rx_attr; + + defaults.size = dap->uda_max_recv_credits / defaults.iov_limit; + + if (!hints || !hints->rx_attr) + goto out; + + /* make sure we can support the capabilities that are requested */ + if (hints->rx_attr->caps & ~USDF_DGRAM_CAPS) + return -FI_ENODATA; + + /* clear the mode bits the app doesn't support */ + if (hints->mode || hints->tx_attr->mode) + defaults.mode &= (hints->mode | hints->rx_attr->mode); + + defaults.op_flags |= hints->rx_attr->op_flags; + + if ((hints->rx_attr->msg_order | USDF_DGRAM_MSG_ORDER) != + USDF_DGRAM_MSG_ORDER) + return -FI_ENODATA; + if ((hints->rx_attr->comp_order | USDF_DGRAM_COMP_ORDER) != + USDF_DGRAM_COMP_ORDER) + return -FI_ENODATA; + + if (hints->rx_attr->total_buffered_recv > + defaults.total_buffered_recv) + return -FI_ENODATA; + + if (hints->rx_attr->iov_limit > USDF_DGRAM_MAX_SGE) + return -FI_ENODATA; + + /* make sure the values for iov_limit and size are within appropriate + * bounds. 
if only one of the two was given, then set the other based + * on: + * max_credits = size * iov_limit; + */ + if (hints->rx_attr->iov_limit && hints->rx_attr->size) { + defaults.size = hints->rx_attr->size; + defaults.iov_limit = hints->rx_attr->iov_limit; + } else if (hints->rx_attr->iov_limit) { + defaults.iov_limit = hints->rx_attr->iov_limit; + defaults.size = + dap->uda_max_recv_credits / defaults.iov_limit; + } else if (hints->rx_attr->size) { + defaults.size = hints->rx_attr->size; + defaults.iov_limit = + dap->uda_max_recv_credits / defaults.size; + } + + entries = defaults.size * defaults.iov_limit; + if (entries > dap->uda_max_recv_credits) + return -FI_ENODATA; + +out: + /* Non-prefix mode requires extra descriptor for header. + */ + if (!hints || (hints && !(hints->mode & FI_MSG_PREFIX))) + defaults.iov_limit -= 1; + + /* catch version changes here. */ + ret = usdf_catch_rx_attr(version, &defaults); + if (ret) + return ret; + + *fi->rx_attr = defaults; + + return FI_SUCCESS; +} + +static int usdf_ep_dgram_control(struct fid *fid, int command, void *arg) +{ + struct fid_ep *ep; + int ret; + + USDF_TRACE_SYS(EP_CTRL, "\n"); + + switch (fid->fclass) { + case FI_CLASS_EP: + ep = container_of(fid, struct fid_ep, fid); + switch (command) { + case FI_ENABLE: + ret = usdf_ep_dgram_enable(ep); + break; + default: + ret = -FI_ENOSYS; + } + break; + default: + ret = -FI_ENOSYS; + } + + return ret; +} + +static struct fi_ops usdf_ep_dgram_ops = { + .size = sizeof(struct fi_ops), + .close = usdf_ep_dgram_close, + .bind = usdf_ep_dgram_bind, + .control = usdf_ep_dgram_control, + .ops_open = fi_no_ops_open +}; + +int +usdf_ep_dgram_open(struct fid_domain *domain, struct fi_info *info, + struct fid_ep **ep_o, void *context) +{ + struct usdf_domain *udp; + struct usdf_ep *ep; + int ret; + struct usdf_pep *parent_pep; + void *src_addr; + int is_bound; + size_t tx_size; + size_t rx_size; + + USDF_TRACE_SYS(EP_CTRL, "\n"); + + parent_pep = NULL; + src_addr = NULL; + + if ((info->caps & ~USDF_DGRAM_CAPS) != 0) { + return -FI_EBADF; + } + + if (info->handle != NULL) { + if (info->handle->fclass != FI_CLASS_PEP) { + USDF_WARN_SYS(EP_CTRL, + "\"handle\" should be a PEP (or NULL)\n"); + return -FI_EINVAL; + } + parent_pep = pep_fidtou(info->handle); + } + + udp = dom_ftou(domain); + + ep = calloc(1, sizeof(*ep)); + if (ep == NULL) { + return -FI_ENOMEM; + } + + is_bound = 0; + if (parent_pep != NULL) { + ret = usdf_pep_steal_socket(parent_pep, &is_bound, &ep->e.dg.ep_sock); + if (ret) { + goto fail; + } + } else { + ep->e.dg.ep_sock = socket(AF_INET, SOCK_DGRAM, 0); + if (ep->e.dg.ep_sock == -1) { + ret = -errno; + goto fail; + } + } + + if (!is_bound) { + if (info->src_addr != NULL) + src_addr = usdf_format_to_sin(info, info->src_addr); + + if (src_addr != NULL) { + ret = bind(ep->e.dg.ep_sock, src_addr, + sizeof(struct sockaddr_in)); + if (ret == -1) { + ret = -errno; + goto fail; + } + } + + usdf_free_sin_if_needed(info, src_addr); + } + + ep->ep_fid.fid.fclass = FI_CLASS_EP; + ep->ep_fid.fid.context = context; + ep->ep_fid.fid.ops = &usdf_ep_dgram_ops; + ep->ep_fid.cm = &usdf_cm_dgram_ops; + ep->ep_fid.atomic = &usdf_dgram_atomic_ops; + ep->ep_domain = udp; + ep->ep_caps = info->caps; + ep->ep_mode = info->mode; + + ep->e.dg.tx_iov_limit = USDF_DGRAM_IOV_LIMIT; + tx_size = udp->dom_fabric->fab_dev_attrs->uda_max_send_credits / + ep->e.dg.tx_iov_limit; + + ep->e.dg.rx_iov_limit = USDF_DGRAM_IOV_LIMIT; + rx_size = udp->dom_fabric->fab_dev_attrs->uda_max_recv_credits / + ep->e.dg.rx_iov_limit; + + /* 
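+	 * The defaults computed just above size each queue as the device
+	 * credit limit (uda_max_send_credits / uda_max_recv_credits) divided
+	 * by USDF_DGRAM_IOV_LIMIT; any iov_limit or size supplied in
+	 * info->tx_attr / info->rx_attr below overrides them.
+	 *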
+ * TODO: Add better management of tx_attr/rx_attr to getinfo and dgram + * open. + */ + if (info->tx_attr) { + ep->e.dg.tx_op_flags = info->tx_attr->op_flags; + if (info->tx_attr->iov_limit) + ep->e.dg.tx_iov_limit = info->tx_attr->iov_limit; + if (info->tx_attr->size) + tx_size = info->tx_attr->size; + } + + if (info->rx_attr) { + ep->e.dg.rx_op_flags = info->rx_attr->op_flags; + if (info->rx_attr->iov_limit) + ep->e.dg.rx_iov_limit = info->rx_attr->iov_limit; + if (info->rx_attr->size) + rx_size = info->rx_attr->size; + } + + if (info->ep_attr) + ep->max_msg_size = info->ep_attr->max_msg_size; + + if (ep->ep_mode & FI_MSG_PREFIX) { + ep->ep_wqe = tx_size * ep->e.dg.tx_iov_limit; + ep->ep_rqe = rx_size * ep->e.dg.rx_iov_limit; + } else { + ep->ep_wqe = tx_size * (ep->e.dg.tx_iov_limit + 1); + ep->ep_rqe = rx_size * (ep->e.dg.rx_iov_limit + 1); + } + + /* Check that the requested credit size is less than the max credit + * counts. If the fi_info struct was acquired from fi_getinfo then this + * will always be the case. + */ + if (ep->ep_wqe > udp->dom_fabric->fab_dev_attrs->uda_max_send_credits) { + ret = -FI_EINVAL; + goto fail; + } + if (ep->ep_rqe > udp->dom_fabric->fab_dev_attrs->uda_max_recv_credits) { + ret = -FI_EINVAL; + goto fail; + } + + if (ep->ep_mode & FI_MSG_PREFIX) { + if (info->ep_attr == NULL) { + ret = -FI_EBADF; + goto fail; + } + + ep->ep_fid.ops = &usdf_base_dgram_prefix_ops; + info->ep_attr->msg_prefix_size = USDF_HDR_BUF_ENTRY; + ep->ep_fid.msg = &usdf_dgram_prefix_ops; + } else { + ep->ep_fid.ops = &usdf_base_dgram_ops; + ep->ep_fid.msg = &usdf_dgram_ops; + } + ofi_atomic_initialize32(&ep->ep_refcnt, 0); + ofi_atomic_inc32(&udp->dom_refcnt); + + *ep_o = ep_utof(ep); + return 0; + +fail: + if (ep != NULL) { + if (ep->e.dg.ep_sock != -1) { + close(ep->e.dg.ep_sock); + } + free(ep); + } + return ret; +} diff --git a/prov/usnic/src/usdf_eq.c b/prov/usnic/src/usdf_eq.c new file mode 100644 index 00000000000..5030b73d552 --- /dev/null +++ b/prov/usnic/src/usdf_eq.c @@ -0,0 +1,660 @@ +/* + * Copyright (c) 2014-2017, Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include "ofi.h" +#include "ofi_enosys.h" + +#include "usnic_direct.h" +#include "usd.h" +#include "usdf.h" +#include "usdf_wait.h" +#include "ofi_util.h" + +static inline int +usdf_eq_empty(struct usdf_eq *eq) +{ + return (ofi_atomic_get32(&eq->eq_num_events) == 0); +} + +static inline int +usdf_eq_error(struct usdf_eq *eq) +{ + return ((eq->eq_ev_tail->ue_flags & USDF_EVENT_FLAG_ERROR) != 0); +} + +/* + * read an event from the ring. Caller must hold eq lock, and caller + * needs to have checked for empty and error + */ +static inline ssize_t usdf_eq_read_event(struct usdf_eq *eq, uint32_t *event, + void *buf, size_t len, uint64_t flags) +{ + struct usdf_event *ev; + size_t copylen; + ssize_t nbytes; + uint64_t val; + + ev = eq->eq_ev_tail; + + copylen = MIN(ev->ue_len, len); + + if (copylen < ev->ue_len) { + USDF_WARN_SYS(EQ, + "buffer too small, got: %zu needed %zu\n", + copylen, ev->ue_len); + return -FI_ETOOSMALL; + } + + /* copy out the event */ + if (event) + *event = ev->ue_event; + + memcpy(buf, ev->ue_buf, copylen); + + if (!(flags & FI_PEEK)) { + /* update count */ + ofi_atomic_dec32(&eq->eq_num_events); + + /* Free the event buf if needed */ + if (ev->ue_flags & USDF_EVENT_FLAG_FREE_BUF) + free(ev->ue_buf); + + /* new tail */ + eq->eq_ev_tail++; + if (eq->eq_ev_tail >= eq->eq_ev_end) + eq->eq_ev_tail = eq->eq_ev_ring; + + /* consume the event in eventfd */ + if (eq->eq_attr.wait_obj == FI_WAIT_FD) { + nbytes = read(eq->eq_fd, &val, sizeof(val)); + if (nbytes != sizeof(val)) + return -errno; + } + } + + return copylen; +} + +/* + * unconditionally write an event to the EQ. Caller is responsible for + * ensuring there is room. EQ must be locked. 
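+ * The event is stored in the slot at eq_ev_head; payloads larger than
+ * sizeof(struct fi_eq_entry) are copied into a malloc()ed buffer and the
+ * entry is flagged USDF_EVENT_FLAG_FREE_BUF so the read side can free it.
+ * The head pointer wraps back to eq_ev_ring when it reaches eq_ev_end.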
+ */ +static inline ssize_t +usdf_eq_write_event(struct usdf_eq *eq, uint32_t event, + const void *buf, size_t len, uint64_t flags) +{ + struct usdf_event *ev; + void *ev_buf; + + ev = eq->eq_ev_head; + ev->ue_event = event; + ev->ue_len = len; + ev->ue_flags = flags; + + /* save the event data if we can, else malloc() */ + if (len <= sizeof(struct fi_eq_entry)) { + ev_buf = eq->eq_ev_buf + (ev - eq->eq_ev_ring); + } else { + ev_buf = malloc(len); + if (ev_buf == NULL) { + return -errno; + } + ev->ue_flags |= USDF_EVENT_FLAG_FREE_BUF; + } + memcpy(ev_buf, buf, len); + ev->ue_buf = ev_buf; + + /* new head */ + eq->eq_ev_head++; + if (eq->eq_ev_head >= eq->eq_ev_end) { + eq->eq_ev_head = eq->eq_ev_ring; + } + + /* increment queued event count */ + ofi_atomic_inc32(&eq->eq_num_events); + + return len; +} + +static void usdf_eq_clean_err(struct usdf_eq *eq, uint8_t destroy) +{ + struct usdf_err_data_entry *err_data_entry; + struct slist_entry *entry; + + while (!slist_empty(&eq->eq_err_data)) { + entry = slist_remove_head(&eq->eq_err_data); + err_data_entry = container_of(entry, struct usdf_err_data_entry, + entry); + if (err_data_entry->seen || destroy) { + free(err_data_entry); + } else { + /* Oops, the rest hasn't been seen yet. Put this back + * and exit. + */ + slist_insert_head(entry, &eq->eq_err_data); + break; + } + } +} + +static ssize_t usdf_eq_readerr(struct fid_eq *feq, + struct fi_eq_err_entry *given_buffer, uint64_t flags) +{ + struct usdf_err_data_entry *err_data_entry; + struct fi_eq_err_entry entry; + struct usdf_eq *eq; + ssize_t ret, err_data_size; + uint32_t api_version; + void *err_data = NULL; + + USDF_TRACE_SYS(EQ, "\n"); + + if (!feq) { + USDF_DBG_SYS(EQ, "invalid input\n"); + return -FI_EINVAL; + } + + eq = eq_ftou(feq); + + pthread_spin_lock(&eq->eq_lock); + + /* make sure there is an error on top */ + if (usdf_eq_empty(eq) || !usdf_eq_error(eq)) { + pthread_spin_unlock(&eq->eq_lock); + ret = -FI_EAGAIN; + goto done; + } + + ret = usdf_eq_read_event(eq, NULL, &entry, sizeof(entry), flags); + + pthread_spin_unlock(&eq->eq_lock); + + /* read the user's setting for err_data. */ + err_data = given_buffer->err_data; + err_data_size = given_buffer->err_data_size; + + /* Copy the entry. */ + *given_buffer = entry; + + /* Mark as seen so it can be cleaned on the next iteration of read. */ + if (entry.err_data_size) { + err_data_entry = container_of(entry.err_data, + struct usdf_err_data_entry, err_data); + err_data_entry->seen = 1; + } + + + /* For release > 1.5, we will copy the err_data directly + * to the user's buffer. 
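+	 * (i.e. when FI_VERSION_GE(api_version, FI_VERSION(1, 5))), truncating
+	 * and logging when the caller's err_data buffer is smaller than the
+	 * stored error data. For older API versions the caller keeps the
+	 * pointer copied out of the internal entry above.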
+ */ + api_version = eq->eq_fabric->fab_attr.fabric->api_version; + if (FI_VERSION_GE(api_version, FI_VERSION(1, 5))) { + given_buffer->err_data = err_data; + given_buffer->err_data_size = + MIN(err_data_size, entry.err_data_size); + memcpy(given_buffer->err_data, entry.err_data, + given_buffer->err_data_size); + + if (err_data_size < entry.err_data_size) { + USDF_DBG_SYS(EQ, "err_data truncated by %zd bytes.\n", + entry.err_data_size - err_data_size); + } + + usdf_eq_clean_err(eq, 0); + } + +done: + return ret; +} + +static ssize_t _usdf_eq_read(struct usdf_eq *eq, uint32_t *event, void *buf, + size_t len, uint64_t flags) +{ + ssize_t ret; + + pthread_spin_lock(&eq->eq_lock); + + if (usdf_eq_empty(eq)) { + ret = -FI_EAGAIN; + goto done; + } + + if (usdf_eq_error(eq)) { + ret = -FI_EAVAIL; + goto done; + } + + if (!slist_empty(&eq->eq_err_data)) + usdf_eq_clean_err(eq, 0); + + ret = usdf_eq_read_event(eq, event, buf, len, flags); + +done: + pthread_spin_unlock(&eq->eq_lock); + return ret; +} + +static ssize_t usdf_eq_read(struct fid_eq *feq, uint32_t *event, void *buf, + size_t len, uint64_t flags) +{ + struct usdf_eq *eq; + + USDF_DBG_SYS(EQ, "\n"); + + eq = eq_ftou(feq); + + /* Don't bother acquiring the lock if there is nothing to read. */ + if (usdf_eq_empty(eq)) + return -FI_EAGAIN; + + return _usdf_eq_read(eq, event, buf, len, flags); +} + +/* TODO: The timeout handling seems off on this one. */ +static ssize_t usdf_eq_sread_fd(struct fid_eq *feq, uint32_t *event, void *buf, + size_t len, int timeout, uint64_t flags) +{ + struct usdf_eq *eq; + struct pollfd pfd; + int ret; + + USDF_DBG_SYS(EQ, "\n"); + + eq = eq_ftou(feq); + + /* Setup poll context to block until the FD becomes readable. */ + pfd.fd = eq->eq_fd; + pfd.events = POLLIN; + +retry: + ret = poll(&pfd, 1, timeout); + if (ret < 0) + return -errno; + else if (ret == 0) + return -FI_EAGAIN; + + ret = _usdf_eq_read(eq, event, buf, len, flags); + if (ret == -FI_EAGAIN) + goto retry; + + return ret; +} + +ssize_t usdf_eq_write_internal(struct usdf_eq *eq, uint32_t event, + const void *buf, size_t len, uint64_t flags) +{ + uint64_t val = 1; + int ret; + int n; + + USDF_DBG_SYS(EQ, "event=%#" PRIx32 " flags=%#" PRIx64 "\n", event, + flags); + + pthread_spin_lock(&eq->eq_lock); + + /* Return -FI_EAGAIN if the EQ is full. + * TODO: Disable the EQ. + */ + if (ofi_atomic_get32(&eq->eq_num_events) == eq->eq_ev_ring_size) { + ret = -FI_EAGAIN; + goto done; + } + + ret = usdf_eq_write_event(eq, event, buf, len, flags); + + /* If successful, post to eventfd */ + if (ret >= 0 && eq->eq_attr.wait_obj == FI_WAIT_FD) { + n = write(eq->eq_fd, &val, sizeof(val)); + + /* TODO: If the write call fails, then roll back the EQ entry. 
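+		 * As written, a short write() leaves the entry queued in the
+		 * ring but reports -FI_EIO to the caller, so the eventfd
+		 * counter and eq_num_events can fall out of step.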
+ */ + if (n != sizeof(val)) + ret = -FI_EIO; + } + +done: + pthread_spin_unlock(&eq->eq_lock); + return ret; +} + +static ssize_t usdf_eq_write(struct fid_eq *feq, uint32_t event, + const void *buf, size_t len, uint64_t flags) +{ + struct usdf_eq *eq; + + USDF_DBG_SYS(EQ, "\n"); + + if (!feq) { + USDF_DBG_SYS(EQ, "invalid input\n"); + return -FI_EINVAL; + } + + eq = eq_ftou(feq); + + return usdf_eq_write_internal(eq, event, buf, len, flags); +} + +static const char * +usdf_eq_strerror(struct fid_eq *feq, int prov_errno, const void *err_data, + char *buf, size_t len) +{ + return NULL; +} + +static int usdf_eq_get_wait(struct usdf_eq *eq, void *arg) +{ + USDF_TRACE_SYS(EQ, "\n"); + + switch (eq->eq_attr.wait_obj) { + case FI_WAIT_FD: + *(int *) arg = eq->eq_fd; + break; + default: + USDF_WARN_SYS(EQ, "unsupported wait type\n"); + return -FI_EINVAL; + } + + return FI_SUCCESS; +} + +static int +usdf_eq_control(fid_t fid, int command, void *arg) +{ + struct usdf_eq *eq; + + USDF_TRACE_SYS(EQ, "\n"); + + eq = eq_fidtou(fid); + + switch (command) { + case FI_GETWAIT: + break; + default: + return -FI_EINVAL; + } + + return usdf_eq_get_wait(eq, arg); +} + +static int usdf_eq_bind_wait(struct usdf_eq *eq) +{ + int ret; + struct usdf_wait *wait_priv; + + if (!eq->eq_attr.wait_set) { + USDF_DBG_SYS(EQ, "can't bind to non-existent wait set\n"); + return -FI_EINVAL; + } + + wait_priv = wait_ftou(eq->eq_attr.wait_set); + + ret = fid_list_insert(&wait_priv->list, &wait_priv->lock, + &eq->eq_fid.fid); + if (ret) { + USDF_WARN_SYS(EQ, + "failed to associate eq with wait fid list\n"); + return ret; + } + + ret = ofi_epoll_add(wait_priv->object.epfd, eq->eq_fd, OFI_EPOLL_IN, eq); + if (ret) { + USDF_WARN_SYS(EQ, "failed to associate FD with wait set\n"); + goto err; + } + + USDF_DBG_SYS(EQ, "associated EQ FD %d with epoll FD %d using fid %p\n", + eq->eq_fd, wait_priv->object.epfd, &eq->eq_fid.fid); + + return ret; + +err: + fid_list_remove(&wait_priv->list, &wait_priv->lock, &eq->eq_fid.fid); + return ret; +} + +static int usdf_eq_unbind_wait(struct usdf_eq *eq) +{ + int ret; + struct usdf_wait *wait_priv; + + if (!eq->eq_attr.wait_set) { + USDF_DBG_SYS(EQ, "can't unbind from non-existent wait set\n"); + return -FI_EINVAL; + } + + wait_priv = wait_ftou(eq->eq_attr.wait_set); + + ret = ofi_epoll_del(wait_priv->object.epfd, eq->eq_fd); + if (ret) { + USDF_WARN_SYS(EQ, + "failed to remove FD from wait set\n"); + return ret; + } + + fid_list_remove(&wait_priv->list, &wait_priv->lock, &eq->eq_fid.fid); + + ofi_atomic_dec32(&wait_priv->wait_refcnt); + + USDF_DBG_SYS(EQ, + "dissasociated EQ FD %d from epoll FD %d using FID: %p\n", + eq->eq_fd, wait_priv->object.epfd, &eq->eq_fid.fid); + + return FI_SUCCESS; +} + +static int +usdf_eq_close(fid_t fid) +{ + struct usdf_eq *eq; + int ret = FI_SUCCESS; + + USDF_TRACE_SYS(EQ, "\n"); + + eq = eq_fidtou(fid); + + if (ofi_atomic_get32(&eq->eq_refcnt) > 0) { + return -FI_EBUSY; + } + ofi_atomic_dec32(&eq->eq_fabric->fab_refcnt); + + /* release wait obj */ + switch (eq->eq_attr.wait_obj) { + case FI_WAIT_SET: + ret = usdf_eq_unbind_wait(eq); + /* FALLTHROUGH */ + /* Need to close the FD used for wait set. 
*/ + case FI_WAIT_FD: + close(eq->eq_fd); + break; + default: + break; + } + + /* Set destroy flag to clear everything out */ + usdf_eq_clean_err(eq, 1); + + free(eq->eq_ev_ring); + free(eq->eq_ev_buf); + free(eq); + + return ret; +} + +static struct fi_ops_eq usdf_eq_ops = { + .size = sizeof(struct fi_ops_eq), + .read = usdf_eq_read, + .readerr = usdf_eq_readerr, + .write = usdf_eq_write, + .sread = fi_no_eq_sread, + .strerror = usdf_eq_strerror, +}; + +static struct fi_ops usdf_eq_fi_ops = { + .size = sizeof(struct fi_ops), + .close = usdf_eq_close, + .bind = fi_no_bind, + .control = usdf_eq_control, + .ops_open = fi_no_ops_open, +}; + +int +usdf_eq_open(struct fid_fabric *fabric, struct fi_eq_attr *attr, + struct fid_eq **feq, void *context) +{ + struct usdf_eq *eq; + struct usdf_fabric *fab; + int ret; + + USDF_TRACE_SYS(EQ, "\n"); + + fab = fab_ftou(fabric); + + eq = calloc(1, sizeof(*eq)); + if (eq == NULL) { + ret = -errno; + goto fail; + } + + /* fill in the EQ struct */ + eq->eq_fid.fid.fclass = FI_CLASS_EQ; + eq->eq_fid.fid.context = context; + eq->eq_fid.fid.ops = &usdf_eq_fi_ops; + eq->eq_fid.ops = &eq->eq_ops_data; + + eq->eq_fabric = fab; + ofi_atomic_initialize32(&eq->eq_refcnt, 0); + ret = pthread_spin_init(&eq->eq_lock, PTHREAD_PROCESS_PRIVATE); + if (ret != 0) { + ret = -ret; + goto fail; + } + + slist_init(&eq->eq_err_data); + + /* get baseline routines */ + eq->eq_ops_data = usdf_eq_ops; + + /* fill in sread based on wait type */ + switch (attr->wait_obj) { + case FI_WAIT_NONE: + break; + case FI_WAIT_UNSPEC: + /* default to FD */ + attr->wait_obj = FI_WAIT_FD; + /* FALLTHROUGH */ + case FI_WAIT_FD: + eq->eq_ops_data.sread = usdf_eq_sread_fd; + /* FALLTHROUGH */ + /* Don't set sread for wait set. */ + case FI_WAIT_SET: + eq->eq_fd = eventfd(0, EFD_NONBLOCK | EFD_SEMAPHORE); + if (eq->eq_fd == -1) { + ret = -errno; + goto fail; + } + + if (attr->wait_obj == FI_WAIT_SET) { + ret = usdf_eq_bind_wait(eq); + if (ret) + goto fail; + } + break; + default: + ret = -FI_ENOSYS; + goto fail; + } + + /* + * Dis-allow write if requested + */ + if ((attr->flags & FI_WRITE) == 0) { + eq->eq_ops_data.write = fi_no_eq_write; + } + + /* + * Allocate and initialize event ring + */ + if (attr->size == 0) { + attr->size = 1024; // XXX + } + eq->eq_ev_ring = calloc(attr->size, sizeof(*eq->eq_ev_ring)); + eq->eq_ev_buf = calloc(attr->size, sizeof(*eq->eq_ev_buf)); + if (eq->eq_ev_ring == NULL || eq->eq_ev_buf == NULL) { + ret = -errno; + goto fail; + } + eq->eq_ev_head = eq->eq_ev_ring; + eq->eq_ev_tail = eq->eq_ev_ring; + eq->eq_ev_ring_size = attr->size; + eq->eq_ev_end = eq->eq_ev_ring + eq->eq_ev_ring_size; + ofi_atomic_initialize32(&eq->eq_num_events, 0); + + ofi_atomic_inc32(&eq->eq_fabric->fab_refcnt); + + eq->eq_attr = *attr; + *feq = eq_utof(eq); + + return 0; + +fail: + if (eq != NULL) { + free(eq->eq_ev_ring); + free(eq->eq_ev_buf); + free(eq); + } + return ret; +} diff --git a/prov/usnic/src/usdf_ext.c b/prov/usnic/src/usdf_ext.c new file mode 100644 index 00000000000..eefdec67908 --- /dev/null +++ b/prov/usnic/src/usdf_ext.c @@ -0,0 +1,248 @@ +/* + * Copyright (c) 2016 Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "ofi.h" + +#include "usdf.h" +#include "usnic_direct.h" +#include "fi_ext_usnic.h" +#include "usdf_av.h" + +/******************************************************************************* + * Fabric extensions + ******************************************************************************/ +static int +usdf_usnic_getinfo_v1(uint32_t version, struct fid_fabric *fabric, + struct fi_usnic_info *uip) +{ + struct usdf_fabric *fp; + struct usd_device_attrs *dap; + + USDF_TRACE("\n"); + + fp = fab_ftou(fabric); + dap = fp->fab_dev_attrs; + + /* this assignment was missing in libfabric v1.1.1 and earlier */ + uip->ui_version = 1; + + uip->ui.v1.ui_link_speed = dap->uda_bandwidth; + uip->ui.v1.ui_netmask_be = dap->uda_netmask_be; + snprintf(uip->ui.v1.ui_ifname, sizeof(uip->ui.v1.ui_ifname), "%s", + dap->uda_ifname); + uip->ui.v1.ui_num_vf = dap->uda_num_vf; + uip->ui.v1.ui_qp_per_vf = dap->uda_qp_per_vf; + uip->ui.v1.ui_cq_per_vf = dap->uda_cq_per_vf; + + return 0; +} + +static int usdf_usnic_getinfo_v2(uint32_t version, struct fid_fabric *ffabric, + struct fi_usnic_info *uip) +{ + struct usd_open_params params; + struct usd_device_attrs *dap; + struct usdf_fabric *fabric; + struct usd_device *dev; + struct fi_usnic_cap **cap; + size_t len; + int ret; + int i; + + USDF_TRACE("\n"); + + fabric = fab_ftou(ffabric); + dap = fabric->fab_dev_attrs; + + memset(¶ms, 0, sizeof(params)); + params.flags = UOPF_SKIP_LINK_CHECK | UOPF_SKIP_PD_ALLOC; + params.cmd_fd = -1; + params.context = NULL; + + ret = usd_open_with_params(dap->uda_devname, ¶ms, &dev); + if (ret) + return -ret; + + uip->ui_version = FI_EXT_USNIC_INFO_VERSION; + + len = ARRAY_SIZE(uip->ui.v2.ui_devname); + strncpy(uip->ui.v2.ui_devname, dap->uda_devname, len - 1); + uip->ui.v2.ui_devname[len - 1] = '\0'; + + len = ARRAY_SIZE(uip->ui.v2.ui_ifname); + strncpy(uip->ui.v2.ui_ifname, dap->uda_ifname, len - 1); + uip->ui.v2.ui_ifname[len - 1] = '\0'; + + memcpy(uip->ui.v2.ui_mac_addr, dap->uda_mac_addr, + MIN(sizeof(dap->uda_mac_addr), + sizeof(uip->ui.v2.ui_mac_addr))); + + uip->ui.v2.ui_ipaddr_be = dap->uda_ipaddr_be; + uip->ui.v2.ui_netmask_be = dap->uda_netmask_be; + uip->ui.v2.ui_prefixlen = dap->uda_prefixlen; + uip->ui.v2.ui_mtu = dap->uda_mtu; + uip->ui.v2.ui_link_up = dap->uda_link_state; + + 
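	/* Device / PCI identification. Applications reach this structure
+	 * through the usNIC fabric extension; an illustrative call sequence
+	 * (not part of this file) is:
+	 *
+	 *	struct fi_usnic_ops_fabric *ops;
+	 *	struct fi_usnic_info info;
+	 *	fi_open_ops(&fabric->fid, FI_USNIC_FABRIC_OPS_1, 0,
+	 *		(void **)&ops, NULL);
+	 *	ops->getinfo(2, fabric, &info);
+	 */
+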
uip->ui.v2.ui_vendor_id = dap->uda_vendor_id; + uip->ui.v2.ui_vendor_part_id = dap->uda_vendor_part_id; + uip->ui.v2.ui_device_id = dap->uda_device_id; + + len = ARRAY_SIZE(uip->ui.v2.ui_firmware); + strncpy(uip->ui.v2.ui_firmware, dap->uda_firmware, len - 1); + uip->ui.v2.ui_firmware[len - 1] = '\0'; + + uip->ui.v2.ui_num_vf = dap->uda_num_vf; + uip->ui.v2.ui_cq_per_vf = dap->uda_cq_per_vf; + uip->ui.v2.ui_qp_per_vf = dap->uda_qp_per_vf; + uip->ui.v2.ui_intr_per_vf = dap->uda_intr_per_vf; + uip->ui.v2.ui_max_cq = dap->uda_max_cq; + uip->ui.v2.ui_max_qp = dap->uda_max_qp; + + uip->ui.v2.ui_link_speed = dap->uda_bandwidth; + uip->ui.v2.ui_max_cqe = dap->uda_max_cqe; + uip->ui.v2.ui_max_send_credits = dap->uda_max_send_credits; + uip->ui.v2.ui_max_recv_credits = dap->uda_max_recv_credits; + + uip->ui.v2.ui_caps = calloc(USD_CAP_MAX + 1, + sizeof(*uip->ui.v2.ui_caps)); + if (!uip->ui.v2.ui_caps) + return -FI_ENOMEM; + + uip->ui.v2.ui_nicname = usd_devid_to_nicname(uip->ui.v2.ui_vendor_id, + uip->ui.v2.ui_device_id); + uip->ui.v2.ui_pid = usd_devid_to_pid(uip->ui.v2.ui_vendor_id, + uip->ui.v2.ui_device_id); + + for (i = 0; i < USD_CAP_MAX; i++) { + uip->ui.v2.ui_caps[i] = calloc(1, + sizeof(*(uip->ui.v2.ui_caps[i]))); + + if (!uip->ui.v2.ui_caps[i]) { + ret = -FI_ENOMEM; + goto fail; + } + + uip->ui.v2.ui_caps[i]->uc_capability = usd_capability(i); + uip->ui.v2.ui_caps[i]->uc_present = usd_get_cap(dev, i); + } + + usd_close(dev); + + return FI_SUCCESS; + +fail: + for (cap = uip->ui.v2.ui_caps; *cap; cap++) + free(*cap); + + free(uip->ui.v2.ui_caps); + + usd_close(dev); + + return ret; +} + +static int usdf_usnic_getinfo(uint32_t version, struct fid_fabric *fabric, + struct fi_usnic_info *uip) +{ + assert(FI_EXT_USNIC_INFO_VERSION == 2); + + switch (version) { + case 1: + return usdf_usnic_getinfo_v1(version, fabric, uip); + case 2: + return usdf_usnic_getinfo_v2(version, fabric, uip); + default: + USDF_DBG_SYS(FABRIC, "invalid version\n"); + return -FI_EINVAL; + } +} + +static struct fi_usnic_ops_fabric usdf_usnic_ops_fabric = { + .size = sizeof(struct fi_usnic_ops_fabric), + .getinfo = usdf_usnic_getinfo +}; + +int +usdf_fabric_ops_open(struct fid *fid, const char *ops_name, uint64_t flags, + void **ops, void *context) +{ + USDF_TRACE("\n"); + + if (strcmp(ops_name, FI_USNIC_FABRIC_OPS_1) == 0) { + *ops = &usdf_usnic_ops_fabric; + } else { + return -FI_EINVAL; + } + + return 0; +} + +/******************************************************************************* + * Address vector extensions + ******************************************************************************/ +static int +usdf_am_get_distance(struct fid_av *fav, void *addr, int *metric_o) +{ + struct usdf_av *av; + struct usdf_domain *udp; + struct sockaddr_in *sin; + int ret; + + USDF_TRACE_SYS(DOMAIN, "\n"); + + av = av_ftou(fav); + udp = av->av_domain; + sin = addr; + + ret = usd_get_dest_distance(udp->dom_dev, + sin->sin_addr.s_addr, metric_o); + return ret; +} + +static struct fi_usnic_ops_av usdf_usnic_ops_av = { + .size = sizeof(struct fi_usnic_ops_av), + .get_distance = usdf_am_get_distance, +}; + +int usdf_av_ops_open(struct fid *fid, const char *ops_name, uint64_t flags, + void **ops, void *context) +{ + USDF_TRACE_SYS(AV, "\n"); + + if (strcmp(ops_name, FI_USNIC_AV_OPS_1) == 0) { + *ops = &usdf_usnic_ops_av; + } else { + return -FI_EINVAL; + } + + return 0; +} diff --git a/prov/usnic/src/usdf_fabric.c b/prov/usnic/src/usdf_fabric.c new file mode 100644 index 00000000000..65b2e5daaa2 --- /dev/null +++ 
b/prov/usnic/src/usdf_fabric.c @@ -0,0 +1,1057 @@ +/* + * Copyright (c) 2014-2019, Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include "ofi.h" +#include "ofi_enosys.h" +#include "ofi_prov.h" + +#include "usnic_direct.h" +#include "libnl_utils.h" + +#include "usdf.h" +#include "usdf_wait.h" +#include "fi_ext_usnic.h" +#include "usdf_progress.h" +#include "usdf_timer.h" +#include "usdf_dgram.h" +#include "usdf_cm.h" + +struct usdf_usnic_info *__usdf_devinfo; + +static int usdf_fabric_getname(uint32_t version, struct usd_device_attrs *dap, + char **name) +{ + int ret = FI_SUCCESS; + char *bufp = NULL; + struct in_addr in; + char *addrnetw; + + if (FI_VERSION_GE(version, FI_VERSION(1, 4))) { + in.s_addr = dap->uda_ipaddr_be & dap->uda_netmask_be; + addrnetw = inet_ntoa(in); + ret = asprintf(&bufp, "%s/%d", addrnetw, dap->uda_prefixlen); + if (ret < 0) { + USDF_DBG( + "asprintf failed while creating fabric name\n"); + ret = -ENOMEM; + } + } else { + bufp = strdup(dap->uda_devname); + if (!bufp) { + USDF_DBG("strdup failed while creating fabric name\n"); + ret = -errno; + } + } + + *name = bufp; + + return ret; +} + +static bool usdf_fabric_checkname(uint32_t version, + struct usd_device_attrs *dap, const char *hint) +{ + int ret; + bool valid = false; + char *reference; + + USDF_DBG("checking devname: version=%d, devname='%s'\n", version, hint); + + if (version) { + ret = usdf_fabric_getname(version, dap, &reference); + if (ret < 0) + return false; + + if (strcmp(reference, hint) == 0) { + valid = true; + } else { + USDF_DBG("hint %s failed to match %s\n", hint, + reference); + } + + free(reference); + return valid; + } + + /* The hint string itself is kind of 
a version check, in pre-1.4 the + * name was just the device name. In 1.4 and beyond, then name is + * actually CIDR + * notation. + */ + if (strstr(hint, "/")) + return usdf_fabric_checkname(FI_VERSION(1, 4), dap, hint); + + return usdf_fabric_checkname(FI_VERSION(1, 3), dap, hint); +} + +static int usdf_validate_hints(uint32_t version, const struct fi_info *hints) +{ + struct fi_fabric_attr *fattrp; + size_t size; + + switch (hints->addr_format) { + case FI_FORMAT_UNSPEC: + case FI_SOCKADDR_IN: + size = sizeof(struct sockaddr_in); + break; + case FI_SOCKADDR: + size = sizeof(struct sockaddr); + break; + case FI_ADDR_STR: + if (hints->src_addr != NULL && + strlen((char *)hints->src_addr) > USDF_ADDR_STR_LEN) + return -FI_ENODATA; + + if (hints->dest_addr != NULL && + strlen((char *)hints->dest_addr) > USDF_ADDR_STR_LEN) + return -FI_ENODATA; + + goto skip_sockaddr_size_check; + default: + return -FI_ENODATA; + } + + if (hints->src_addr != NULL && hints->src_addrlen < size) { + return -FI_ENODATA; + } + if (hints->dest_addr != NULL && hints->dest_addrlen < size) { + return -FI_ENODATA; + } + +skip_sockaddr_size_check: + if (hints->ep_attr != NULL) { + switch (hints->ep_attr->protocol) { + case FI_PROTO_UNSPEC: + case FI_PROTO_UDP: + case FI_PROTO_RUDP: + break; + default: + return -FI_ENODATA; + } + + if (hints->ep_attr->auth_key || hints->ep_attr->auth_key_size) { + USDF_WARN_SYS(EP_CTRL, + "\"authorization key\" is not supported in this provider.\n"); + return -FI_ENODATA; + } + } + + fattrp = hints->fabric_attr; + if (fattrp != NULL) { + if (fattrp->prov_version != 0 && + fattrp->prov_version != USDF_PROV_VERSION) { + return -FI_ENODATA; + } + } + return FI_SUCCESS; +} + +static int +usdf_fill_sockaddr_info(struct fi_info *fi, + struct sockaddr_in *src, struct sockaddr_in *dest, + struct usd_device_attrs *dap) +{ + int ret; + struct sockaddr_in *sin; + + sin = calloc(1, sizeof(*sin)); + fi->src_addr = sin; + if (sin == NULL) { + ret = -FI_ENOMEM; + return ret; + } + fi->src_addrlen = sizeof(struct sockaddr_in); + sin->sin_family = AF_INET; + sin->sin_addr.s_addr = dap->uda_ipaddr_be; + if (src != NULL) + sin->sin_port = src->sin_port; + + /* copy in dest if specified */ + if (dest != NULL) { + sin = calloc(1, sizeof(*sin)); + if (NULL == sin) { + free(fi->src_addr); + return -FI_ENOMEM; + } + *sin = *dest; + fi->dest_addr = sin; + fi->dest_addrlen = sizeof(*sin); + } + return FI_SUCCESS; +} + +static int +usdf_fill_straddr_info(struct fi_info *fi, + char *src, char *dest, struct usd_device_attrs *dap) +{ + char *address_string; + struct sockaddr_in *sin; + + /* If NULL, we have to create the sockaddr_in + * and convert it to string format. + */ + if (src == NULL) { + sin = calloc(1, sizeof(*sin)); + if (NULL == sin) + return -FI_ENOMEM; + sin->sin_family = AF_INET; + sin->sin_addr.s_addr = dap->uda_ipaddr_be; + + address_string = calloc(1, USDF_ADDR_STR_LEN); + fi->src_addr = address_string; + fi->src_addrlen = USDF_ADDR_STR_LEN; + + usdf_addr_tostr(sin, fi->src_addr, &fi->src_addrlen); + free(sin); + } else { + /* Otherwise, it is already in string format. + * Just copy it. + */ + address_string = strdup(src); + if (NULL == address_string) + return -FI_ENOMEM; + fi->src_addr = address_string; + fi->src_addrlen = strlen(address_string); + } + + /* Same goes for dest. 
*/ + if (dest != NULL) { + address_string = strdup(dest); + fi->dest_addr = address_string; + fi->dest_addrlen = strlen(address_string); + } + + return FI_SUCCESS; +} +static int +usdf_fill_addr_info(struct fi_info *fi, uint32_t addr_format, + void *src, void *dest, struct usd_device_attrs *dap) +{ + int ret; + + if (addr_format != FI_FORMAT_UNSPEC) { + fi->addr_format = addr_format; + } else { + fi->addr_format = FI_SOCKADDR_IN; + } + + switch (fi->addr_format) { + case FI_SOCKADDR: + case FI_SOCKADDR_IN: + ret = usdf_fill_sockaddr_info(fi, src, dest, dap); + if (ret != FI_SUCCESS) + goto fail; + break; + case FI_ADDR_STR: + ret = usdf_fill_straddr_info(fi, src, dest, dap); + if (ret != FI_SUCCESS) + goto fail; + break; + default: + ret = -FI_ENODATA; + goto fail; + } + + return 0; + +fail: + return ret; // fi_freeinfo() in caller frees all +} + +static int validate_modebits(uint32_t version, const struct fi_info *hints, + uint64_t supported, uint64_t *mode_out) +{ + uint64_t mode; + + /* If there is no hints, return everything we supported. */ + if (!hints) { + *mode_out = supported; + return FI_SUCCESS; + } + + mode = hints->mode & supported; + + /* Before version 1.5, FI_LOCAL_MR is a requirement. */ + if (FI_VERSION_LT(version, FI_VERSION(1, 5))) { + if ((mode & FI_LOCAL_MR) == 0) + return -FI_ENODATA; + } + + *mode_out = mode; + + return FI_SUCCESS; +} + +static int usdf_alloc_fid_nic(struct fi_info *fi, + struct usd_device_attrs *dap) +{ + int ret; + struct fid_nic *nic = NULL; + struct fi_device_attr *da = NULL; + struct fi_link_attr *la = NULL; + + nic = ofi_nic_dup(NULL); + if (!nic) + goto nomem; + + da = nic->device_attr; + da->name = strdup(dap->uda_devname); + if (!da->name) + goto nomem; + ret = asprintf(&da->device_id, "%s (%s)", + usd_devid_to_pid(dap->uda_vendor_id, + dap->uda_device_id), + usd_devid_to_nicname(dap->uda_vendor_id, + dap->uda_device_id)); + if (ret < 0) + goto nomem; + ret = asprintf(&da->device_version, "0x%x", dap->uda_vendor_part_id); + if (ret < 0) + goto nomem; + ret = asprintf(&da->vendor_id, "0x%x", dap->uda_vendor_id); + if (ret < 0) + goto nomem; + da->driver = strdup("usnic_verbs"); + if (!da->driver) + goto nomem; + da->firmware = strdup(dap->uda_firmware); + if (!da->firmware) + goto nomem; + + // usnic does not currently expose PCI bus information, so we + // set the bus type to unknown. 
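+	// The fid_nic built here is attached to the returned fi_info (fi->nic
+	// in usdf_fill_info_dgram) so applications can inspect device and
+	// link attributes.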
+ nic->bus_attr->bus_type = FI_BUS_UNKNOWN; + + la = nic->link_attr; + + socklen_t size = INET_ADDRSTRLEN; + la->address = calloc(1, size); + if (!la->address) + goto nomem; + inet_ntop(AF_INET, &dap->uda_ipaddr_be, la->address, size); + la->mtu = dap->uda_mtu; + la->speed = dap->uda_bandwidth; + switch (dap->uda_link_state) { + case USD_LINK_UP: + la->state = FI_LINK_UP; + break; + case USD_LINK_DOWN: + la->state = FI_LINK_DOWN; + break; + default: + la->state = FI_LINK_UNKNOWN; + break; + } + la->network_type = strdup("Ethernet"); + if (!la->network_type) + goto nomem; + + fi->nic = nic; + + return FI_SUCCESS; + +nomem: + if (nic) + fi_close(&nic->fid); + return -FI_ENOMEM; +} + +static int usdf_fill_info_dgram( + uint32_t version, + const struct fi_info *hints, + void *src, + void *dest, + struct usd_device_attrs *dap, + struct fi_info **fi_first, + struct fi_info **fi_last) +{ + struct fi_info *fi; + struct fi_fabric_attr *fattrp; + uint32_t addr_format; + int ret; + + fi = fi_allocinfo(); + if (fi == NULL) { + ret = -FI_ENOMEM; + goto fail; + } + + fi->caps = USDF_DGRAM_CAPS; + + ret = validate_modebits(version, hints, + USDF_DGRAM_SUPP_MODE, &fi->mode); + if (ret) + goto fail; + + if (hints != NULL) { + addr_format = hints->addr_format; + + /* check that we are capable of what's requested */ + if ((hints->caps & ~USDF_DGRAM_CAPS) != 0) { + ret = -FI_ENODATA; + goto fail; + } + + fi->handle = hints->handle; + } else { + addr_format = FI_FORMAT_UNSPEC; + } + fi->ep_attr->type = FI_EP_DGRAM; + + ret = usdf_fill_addr_info(fi, addr_format, src, dest, dap); + if (ret != 0) { + goto fail; + } + + /* fabric attrs */ + fattrp = fi->fabric_attr; + ret = usdf_fabric_getname(version, dap, &fattrp->name); + if (ret < 0 || fattrp->name == NULL) { + ret = -FI_ENOMEM; + goto fail; + } + + if (fi->mode & FI_MSG_PREFIX) { + if (FI_VERSION_GE(version, FI_VERSION(1, 1))) + fi->ep_attr->msg_prefix_size = USDF_HDR_BUF_ENTRY; + else + fi->mode &= ~FI_MSG_PREFIX; + } + + ret = usdf_dgram_fill_ep_attr(version, hints, fi, dap); + if (ret) + goto fail; + + ret = usdf_dgram_fill_dom_attr(version, hints, fi, dap); + if (ret) + goto fail; + + ret = usdf_dgram_fill_tx_attr(version, hints, fi, dap); + if (ret) + goto fail; + + ret = usdf_dgram_fill_rx_attr(version, hints, fi, dap); + if (ret) + goto fail; + + ret = usdf_alloc_fid_nic(fi, dap); + if (ret) + goto fail; + + /* add to tail of list */ + if (*fi_first == NULL) { + *fi_first = fi; + } else { + (*fi_last)->next = fi; + } + *fi_last = fi; + + return 0; + +fail: + if (fi != NULL) { + fi_freeinfo(fi); + } + return ret; +} + +static int +usdf_get_devinfo(void) +{ + struct usdf_usnic_info *dp; + struct usdf_dev_entry *dep; + struct usd_open_params params; + int ret; + int d; + + assert(__usdf_devinfo == NULL); + + dp = calloc(1, sizeof(*dp)); + if (dp == NULL) { + ret = -FI_ENOMEM; + goto fail; + } + __usdf_devinfo = dp; + + dp->uu_num_devs = USD_MAX_DEVICES; + ret = usd_get_device_list(dp->uu_devs, &dp->uu_num_devs); + if (ret != 0) { + dp->uu_num_devs = 0; + goto fail; + } + + for (d = 0; d < dp->uu_num_devs; ++d) { + dep = &dp->uu_info[d]; + + memset(¶ms, 0, sizeof(params)); + params.flags = UOPF_SKIP_PD_ALLOC; + params.cmd_fd = -1; + params.context = NULL; + ret = usd_open_with_params(dp->uu_devs[d].ude_devname, + ¶ms, &dep->ue_dev); + if (ret != 0) { + continue; + } + + ret = usd_get_device_attrs(dep->ue_dev, &dep->ue_dattr); + if (ret != 0) { + continue; + } + + dep->ue_dev_ok = 1; /* this device is OK */ + + usd_close(dep->ue_dev); + dep->ue_dev = NULL; 
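+
+		/* The attributes were snapshotted into ue_dattr above; the
+		 * probe handle itself is not kept open. */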
+ } + return 0; + +fail: + return ret; +} + +static int +usdf_get_distance( + struct usd_device_attrs *dap, + uint32_t daddr_be, + int *metric_o) +{ + uint32_t nh_ip_addr; + int ret; + + USDF_TRACE("\n"); + + ret = usnic_nl_rt_lookup(dap->uda_ipaddr_be, daddr_be, + dap->uda_ifindex, &nh_ip_addr); + if (ret != 0) { + *metric_o = -1; + ret = 0; + } else if (nh_ip_addr == 0) { + *metric_o = 0; + } else { + *metric_o = 1; + } + + return ret; +} + +/* Check all things related to a device. Make sure it's okay, the source address + * matches the requested address, the destination is reachable from the device, + * the device fabric name matches the requested fabric name, and the device + * domain name matches the requested domain name. + * + * @param version Libfabric API version used to verify the domain / fabric name. + * @param hints Hints passed to fi_getinfo. + * @param src Source address being requested. + * @param dest Destination address to communicate with. + * @param dep usNIC device entry being checked. + * + * @return true on success, false on failure. For debug logging can be enabled + * to see why a device was disqualified. + */ +static bool usdf_check_device(uint32_t version, const struct fi_info *hints, + void *src, void *dest, + struct usdf_dev_entry *dep) +{ + char dest_str[INET_ADDRSTRLEN]; + char src_str[INET_ADDRSTRLEN]; + char dev_str[INET_ADDRSTRLEN]; + struct usd_device_attrs *dap; + struct sockaddr_in *sin; + int reachable; + int ret; + + reachable = -1; + dap = &dep->ue_dattr; + + /* Skip the device if it has problems. */ + if (!dep->ue_dev_ok) { + USDF_WARN_SYS(FABRIC, "skipping %s/%s device not ok\n", + dap->uda_devname, dap->uda_ifname); + return false; + } + + /* If the given source address is not INADDR_ANY, compare against the + * device. + */ + if (src) { + sin = usdf_format_to_sin(hints, src); + if (sin->sin_addr.s_addr != INADDR_ANY) { + if (sin->sin_addr.s_addr != dap->uda_ipaddr_be) { + inet_ntop(AF_INET, &sin->sin_addr.s_addr, + src_str, sizeof(src_str)); + inet_ntop(AF_INET, &dap->uda_ipaddr_be, + dev_str, sizeof(dev_str)); + USDF_WARN_SYS(FABRIC, + "src addr<%s> != dev addr<%s>\n", + src_str, dev_str); + goto fail; + } + } + + usdf_free_sin_if_needed(hints, sin); + } + + /* Check that the given destination address is reachable from the + * interface. + */ + if (dest) { + sin = usdf_format_to_sin(hints, dest); + if (sin->sin_addr.s_addr != INADDR_ANY) { + ret = usdf_get_distance(dap, sin->sin_addr.s_addr, + &reachable); + if (ret) { + inet_ntop(AF_INET, + &sin->sin_addr.s_addr, dest_str, + sizeof(dest_str)); + USDF_WARN_SYS(FABRIC, + "get_distance failed @ %s\n", + dest_str); + goto fail; + } + } + + if (reachable == -1) { + inet_ntop(AF_INET, &sin->sin_addr.s_addr, dest_str, + sizeof(dest_str)); + USDF_WARN_SYS(FABRIC, + "dest %s unreachable from %s/%s, skipping\n", + dest_str, dap->uda_devname, + dap->uda_ifname); + goto fail; + } + + usdf_free_sin_if_needed(hints, sin); + } + + /* Checks that the fabric name is correct for the given interface. The + * fabric name contains the CIDR notation for the interface. + */ + if (hints && hints->fabric_attr && hints->fabric_attr->name) { + if (!usdf_fabric_checkname(version, dap, + hints->fabric_attr->name)) + return false; + } + + /* Check that the domain name is correct for the given interface. The + * domain name is the device name. 
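+	 * Both the fabric-name and domain-name checks are skipped when the
+	 * corresponding hint string is absent.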
+ */ + if (hints && hints->domain_attr && hints->domain_attr->name) { + if (!usdf_domain_checkname(version, dap, + hints->domain_attr->name)) + return false; + } + + return true; + +fail: + usdf_free_sin_if_needed(hints, sin); + + return false; +} + +static int +usdf_handle_node_and_service(const char *node, const char *service, + uint64_t flags, void **src, void **dest, + const struct fi_info *hints, struct addrinfo **ai) +{ + int ret; + struct sockaddr_in *sin; + + if (node != NULL || service != NULL) { + if (hints && hints->addr_format == FI_ADDR_STR) { + /* FI_ADDR_STR can't have service param. */ + if (service) + return -FI_EINVAL; + + sin = usdf_format_to_sin(hints, node); + + if (!sin) + /* This could be invalid or no memory. */ + return -FI_EINVAL; + } else { + ret = getaddrinfo(node, service, NULL, ai); + if (ret != 0) { + USDF_DBG("getaddrinfo failed: %d: <%s>\n", ret, + gai_strerror(ret)); + return ret; + } + sin = (struct sockaddr_in *)(*ai)->ai_addr; + } + + if (flags & FI_SOURCE) + *src = usdf_sin_to_format(hints, sin, NULL); + else + *dest = usdf_sin_to_format(hints, sin, NULL); + } + + return FI_SUCCESS; +} + +static int +usdf_getinfo(uint32_t version, const char *node, const char *service, + uint64_t flags, const struct fi_info *hints, struct fi_info **info) +{ + struct usdf_usnic_info *dp; + struct usdf_dev_entry *dep; + struct usd_device_attrs *dap; + struct fi_info *fi_first; + struct fi_info *fi_last; + struct addrinfo *ai; + void *src; + void *dest; + enum fi_ep_type ep_type; + int d; + int ret; + + USDF_TRACE("\n"); + + fi_first = NULL; + fi_last = NULL; + ai = NULL; + src = NULL; + dest = NULL; + + /* + * Get and cache usNIC device info + */ + if (__usdf_devinfo == NULL) { + ret = usdf_get_devinfo(); + if (ret != 0) { + USDF_WARN("failed to usdf_get_devinfo, ret=%d (%s)\n", + ret, fi_strerror(-ret)); + if (ret == -FI_ENODEV) + ret = -FI_ENODATA; + goto fail; + } + } + dp = __usdf_devinfo; + + /* Check the hints up front and fail if they're invalid. */ + if (hints) { + ret = usdf_validate_hints(version, hints); + if (ret) { + USDF_WARN_SYS(FABRIC, "hints failed to validate\n"); + goto fail; + } + } + + /* Get the src and dest if user specified. */ + ret = usdf_handle_node_and_service(node, service, flags, + &src, &dest, hints, &ai); + if (ret) { + USDF_WARN_SYS(FABRIC, "failed to handle node and service.\n"); + goto fail; + } + + if (hints != NULL) { + if (dest == NULL && hints->dest_addr != NULL) + dest = hints->dest_addr; + if (src == NULL && hints->src_addr != NULL) + src = hints->src_addr; + } + + for (d = 0; d < dp->uu_num_devs; ++d) { + dep = &dp->uu_info[d]; + dap = &dep->ue_dattr; + + /* If the device has an issue or the hints don't match the + * device information, then skip. 
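+		 * Matching devices each contribute an FI_EP_DGRAM fi_info
+		 * entry (when the requested endpoint type allows it), so a
+		 * host with several usNIC interfaces can return several
+		 * entries.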
+ */ + if (!usdf_check_device(version, hints, src, dest, dep)) + continue; + + if (hints && hints->ep_attr) + ep_type = hints->ep_attr->type; + else + ep_type = FI_EP_UNSPEC; + + if (ep_type == FI_EP_DGRAM || ep_type == FI_EP_UNSPEC) { + ret = usdf_fill_info_dgram(version, hints, src, dest, + dap, &fi_first, &fi_last); + if (ret != 0 && ret != -FI_ENODATA) { + goto fail; + } + } + } + + if (fi_first != NULL) { + *info = fi_first; + ret = 0; + } else { + ret = -FI_ENODATA; + } + + +fail: + if (ai) + freeaddrinfo(ai); + + if (ret != 0) { + fi_freeinfo(fi_first); + USDF_INFO("returning %d (%s)\n", ret, fi_strerror(-ret)); + } + + return ret; +} + +static int +usdf_fabric_close(fid_t fid) +{ + struct usdf_fabric *fp; + int ret; + void *rv; + + USDF_TRACE("\n"); + + fp = fab_fidtou(fid); + if (ofi_atomic_get32(&fp->fab_refcnt) > 0) { + return -FI_EBUSY; + } + /* Tell progression thread to exit */ + fp->fab_exit = 1; + + free(fp->fab_attr.name); + free(fp->fab_attr.prov_name); + + if (fp->fab_thread) { + ret = usdf_fabric_wake_thread(fp); + if (ret != 0) { + return ret; + } + pthread_join(fp->fab_thread, &rv); + } + usdf_timer_deinit(fp); + if (fp->fab_epollfd != OFI_EPOLL_INVALID) { + ofi_epoll_close(fp->fab_epollfd); + } + if (fp->fab_eventfd != -1) { + close(fp->fab_eventfd); + } + if (fp->fab_arp_sockfd != -1) { + close(fp->fab_arp_sockfd); + } + + free(fp); + return 0; +} + +static struct fi_ops usdf_fi_ops = { + .size = sizeof(struct fi_ops), + .close = usdf_fabric_close, + .bind = fi_no_bind, + .control = fi_no_control, + .ops_open = usdf_fabric_ops_open, +}; + +static struct fi_ops_fabric usdf_ops_fabric = { + .size = sizeof(struct fi_ops_fabric), + .domain = usdf_domain_open, + .passive_ep = usdf_pep_open, + .eq_open = usdf_eq_open, + .wait_open = usdf_wait_open, + .trywait = usdf_trywait +}; + +static int +usdf_fabric_open(struct fi_fabric_attr *fattrp, struct fid_fabric **fabric, + void *context) +{ + struct fid_fabric *ff; + struct usdf_fabric *fp; + struct usdf_usnic_info *dp; + struct usdf_dev_entry *dep; + struct sockaddr_in sin; + int ret; + int d; + + USDF_TRACE("\n"); + + /* Make sure this fabric exists */ + dp = __usdf_devinfo; + for (d = 0; d < dp->uu_num_devs; ++d) { + dep = &dp->uu_info[d]; + if (dep->ue_dev_ok && + usdf_fabric_checkname(0, &(dep->ue_dattr), fattrp->name)) { + break; + } + } + if (d >= dp->uu_num_devs) { + USDF_INFO("device \"%s\" does not exit, returning -FI_ENODEV\n", + fattrp->name); + return -FI_ENODEV; + } + + fp = calloc(1, sizeof(*fp)); + if (fp == NULL) { + USDF_INFO("unable to allocate memory for fabric\n"); + return -FI_ENOMEM; + } + fp->fab_epollfd = OFI_EPOLL_INVALID; + fp->fab_arp_sockfd = -1; + LIST_INIT(&fp->fab_domain_list); + + fp->fab_attr.fabric = fab_utof(fp); + fp->fab_attr.name = strdup(fattrp->name); + fp->fab_attr.prov_name = strdup(USDF_PROV_NAME); + fp->fab_attr.prov_version = USDF_PROV_VERSION; + if (fp->fab_attr.name == NULL || + fp->fab_attr.prov_name == NULL) { + ret = -FI_ENOMEM; + goto fail; + } + + fp->fab_fid.fid.fclass = FI_CLASS_FABRIC; + fp->fab_fid.fid.context = context; + fp->fab_fid.fid.ops = &usdf_fi_ops; + fp->fab_fid.ops = &usdf_ops_fabric; + + fp->fab_dev_attrs = &dep->ue_dattr; + + ret = ofi_epoll_create(&fp->fab_epollfd); + if (ret) { + USDF_INFO("unable to allocate epoll fd\n"); + goto fail; + } + + fp->fab_eventfd = eventfd(0, EFD_NONBLOCK | EFD_SEMAPHORE); + if (fp->fab_eventfd == -1) { + ret = -errno; + USDF_INFO("unable to allocate event fd\n"); + goto fail; + } + fp->fab_poll_item.pi_rtn = 
usdf_fabric_progression_cb; + fp->fab_poll_item.pi_context = fp; + ret = ofi_epoll_add(fp->fab_epollfd, fp->fab_eventfd, OFI_EPOLL_IN, + &fp->fab_poll_item); + if (ret) { + USDF_INFO("unable to EPOLL_CTL_ADD\n"); + goto fail; + } + + /* initialize timer subsystem */ + ret = usdf_timer_init(fp); + if (ret != 0) { + USDF_INFO("unable to initialize timer\n"); + goto fail; + } + + /* create and bind socket for ARP resolution */ + memset(&sin, 0, sizeof(sin)); + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = fp->fab_dev_attrs->uda_ipaddr_be; + fp->fab_arp_sockfd = socket(AF_INET, SOCK_DGRAM, 0); + if (fp->fab_arp_sockfd == -1) { + USDF_INFO("unable to create socket\n"); + goto fail; + } + ret = bind(fp->fab_arp_sockfd, (struct sockaddr *) &sin, sizeof(sin)); + if (ret == -1) { + ret = -errno; + goto fail; + } + + ofi_atomic_initialize32(&fp->fab_refcnt, 0); + ofi_atomic_initialize32(&fp->num_blocked_waiting, 0); + + ret = pthread_create(&fp->fab_thread, NULL, + usdf_fabric_progression_thread, fp); + if (ret != 0) { + ret = -ret; + USDF_INFO("unable to create progress thread\n"); + goto fail; + } + + fattrp->fabric = fab_utof(fp); + fattrp->prov_version = USDF_PROV_VERSION; + *fabric = fab_utof(fp); + USDF_INFO("successfully opened %s/%s\n", fattrp->name, + fp->fab_dev_attrs->uda_ifname); + return 0; + +fail: + free(fp->fab_attr.name); + free(fp->fab_attr.prov_name); + ff = fab_utof(fp); + usdf_fabric_close(&ff->fid); + USDF_DBG("returning %d (%s)\n", ret, fi_strerror(-ret)); + return ret; +} + +static void usdf_fini(void) +{ + USDF_TRACE("\n"); +} + +struct fi_provider usdf_ops = { + .name = USDF_PROV_NAME, + .version = USDF_PROV_VERSION, + .fi_version = OFI_VERSION_LATEST, + .getinfo = usdf_getinfo, + .fabric = usdf_fabric_open, + .cleanup = usdf_fini +}; + +USNIC_INI +{ +#if USNIC_BUILD_FAKE_VERBS_DRIVER + usdf_setup_fake_ibv_provider(); +#endif + return (&usdf_ops); +} diff --git a/prov/usnic/src/usdf_fake_ibv.c b/prov/usnic/src/usdf_fake_ibv.c new file mode 100644 index 00000000000..5b51d0b363d --- /dev/null +++ b/prov/usnic/src/usdf_fake_ibv.c @@ -0,0 +1,126 @@ +/* + * Copyright (c) 2015, Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * The code in this file prevents spurious libibverbs warnings on + * stderr about devices that it doesn't recognize. + * + * Specifically, Cisco usNIC devices are exposed through the Linux + * InfiniBand kernel interface (i.e., they show up in + * /sys/class/infiniband). However, the userspace side of these + * drivers is not exposed through libibverbs (i.e., there is no + * libibverbs provider/plugin for usNIC). Therefore, when + * ibv_get_device_list() is invoked, libibverbs cannot find a plugin + * for usnic devices. This causes libibverbs to emit a spurious + * warning message on stderr. + * + * Since libfabric can have a verbs provider, libibverbs is invoked, + * triggering the sequence described above, resulting in warning + * messages about usnic devices. To avoid these extra stderr + * warnings, we insert a fake usnic verbs libibverbs provider that + * safely squelches these warnings. + * + * More specifically: the userspace side of usNIC is exposed through + * libfabric; we don't need libibverbs warnings about not being able + * to find a usnic driver. + */ + +#include "config.h" + +#include + +#include +#include + +/***********************************************************************/ + +#ifndef PCI_VENDOR_ID_CISCO +#define PCI_VENDOR_ID_CISCO 0x1137 +#endif + +static struct ibv_context *fake_alloc_context(struct ibv_device *ibdev, + int cmd_fd) +{ + /* Nothing to do here */ + return NULL; +} + +static void fake_free_context(struct ibv_context *ibctx) +{ + /* Nothing to do here */ +} + +/* Put just enough in here to convince libibverbs that this is a valid + device, and a little extra just in case someone looks at this + struct in a debugger. */ +static struct ibv_device fake_dev = { + .ops = { + .alloc_context = fake_alloc_context, + .free_context = fake_free_context + }, + .name = "fake ibv_device inserted by libfabric:usNIC" +}; + +static struct ibv_device *fake_driver_init(const char *uverbs_sys_path, + int abi_version) +{ + char value[8]; + int vendor; + + /* This function should only be invoked for + /sys/class/infiniband/usnic_X devices, but double check just to + be absolutely sure: read the vendor ID and ensure that it is + Cisco. */ + if (ibv_read_sysfs_file(uverbs_sys_path, "device/vendor", + value, sizeof(value)) < 0) { + return NULL; + } + sscanf(value, "%i", &vendor); + + if (vendor == PCI_VENDOR_ID_CISCO) { + return &fake_dev; + } + + /* We didn't find a device that we want to support */ + return NULL; +} + + +void usdf_setup_fake_ibv_provider(void) +{ + /* Register a fake driver for "usnic_verbs" devices */ + ibv_register_driver("usnic_verbs", fake_driver_init); +} diff --git a/prov/usnic/src/usdf_mem.c b/prov/usnic/src/usdf_mem.c new file mode 100644 index 00000000000..10fd43744ed --- /dev/null +++ b/prov/usnic/src/usdf_mem.c @@ -0,0 +1,159 @@ +/* + * Copyright (c) 2014-2017, Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include "ofi.h" +#include "ofi_enosys.h" + +#include "usnic_direct.h" +#include "usdf.h" + +static +int usdf_dereg_mr(fid_t fid) +{ + struct usdf_mr *mr; + int ret; + + mr = container_of(fid, struct usdf_mr, mr_fid.fid); + ret = usd_dereg_mr(mr->mr_mr); + if (ret == 0) { + free(mr); + } + return ret; +} + +static struct fi_ops usdf_mr_ops = { + .size = sizeof(struct fi_ops), + .close = usdf_dereg_mr, + .bind = fi_no_bind, + .control = fi_no_control, + .ops_open = fi_no_ops_open, +}; + +int +usdf_reg_mr(struct fid *fid, const void *buf, size_t len, + uint64_t access, uint64_t offset, uint64_t requested_key, + uint64_t flags, struct fid_mr **mr_o, void *context) +{ + struct usdf_mr *mr; + struct usdf_domain *udp; + int ret; + struct fid_domain *domain; + + if (flags != 0) { + return -FI_EBADFLAGS; + } + + if (fid->fclass != FI_CLASS_DOMAIN) { + USDF_DBG("memory registration only supported " + "for struct fid_domain\n"); + return -FI_EINVAL; + } + domain = container_of(fid, struct fid_domain, fid); + + mr = calloc(1, sizeof *mr); + if (mr == NULL) { + return -FI_ENOMEM; + } + + mr->mr_fid.fid.fclass = FI_CLASS_MR; + mr->mr_fid.fid.context = context; + mr->mr_fid.fid.ops = &usdf_mr_ops; + + udp = container_of(domain, struct usdf_domain, dom_fid.fid); + ret = usd_reg_mr(udp->dom_dev, (void *) buf, len, &mr->mr_mr); + if (ret != 0) { + goto fail; + } + + *mr_o = &mr->mr_fid; + return 0; + +fail: + free(mr); + return ret; +} + +/* We dont have proper support for regv and regattr. This is just + * a simple mapping to usdf_reg_mr. We can do this because we forced + * mr_iov_limit = 1 (made this mapping possible) by default. 
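+ *
+ * Illustration only (not part of the provider code): assuming an open
+ * domain "domain" and a local buffer "buf" of "len" bytes, a caller
+ * exercising the regv path with the single iov that usNIC permits would
+ * look roughly like:
+ *
+ *   struct iovec iov = { .iov_base = buf, .iov_len = len };
+ *   struct fid_mr *mr;
+ *   ret = fi_mr_regv(domain, &iov, 1, FI_SEND | FI_RECV,
+ *                    0, 0, 0, &mr, NULL);
+ *
+ * count == 1 satisfies USDF_MR_IOV_LIMIT, so usdf_regv_mr() below simply
+ * forwards iov[0] to usdf_reg_mr().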
+ */ +int usdf_regv_mr(struct fid *fid, const struct iovec *iov, + size_t count, uint64_t access, + uint64_t offset, uint64_t requested_key, + uint64_t flags, struct fid_mr **mr, void *context) +{ + if (count > USDF_MR_IOV_LIMIT) { + USDF_DBG_SYS(DOMAIN, "usnic provider only support 1 iov.\n"); + return -FI_EINVAL; + } + + return usdf_reg_mr(fid, iov[0].iov_base, iov[0].iov_len, access, + offset, requested_key, flags, mr, context); +} + +int usdf_regattr(struct fid *fid, const struct fi_mr_attr *attr, + uint64_t flags, struct fid_mr **mr) +{ + if (attr->iov_count > USDF_MR_IOV_LIMIT) { + USDF_DBG_SYS(DOMAIN, "usnic provider only support 1 iov.\n"); + return -FI_EINVAL; + } + + return usdf_reg_mr(fid, attr->mr_iov[0].iov_base, + attr->mr_iov[0].iov_len, + attr->access, + attr->offset, + attr->requested_key, + flags, mr, attr->context); +} diff --git a/prov/usnic/src/usdf_pep.c b/prov/usnic/src/usdf_pep.c new file mode 100644 index 00000000000..a9d57889c99 --- /dev/null +++ b/prov/usnic/src/usdf_pep.c @@ -0,0 +1,839 @@ +/* + * Copyright (c) 2014-2019, Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "config.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include "ofi.h" +#include "ofi_enosys.h" +#include "ofi_file.h" + +#include "fi_ext_usnic.h" +#include "usnic_direct.h" +#include "usd.h" +#include "usdf.h" +#include "usdf_endpoint.h" +#include "usdf_cm.h" + +static int +usdf_pep_bind(fid_t fid, fid_t bfid, uint64_t flags) +{ + struct usdf_pep *pep; + + USDF_TRACE_SYS(EP_CTRL, "\n"); + + pep = pep_fidtou(fid); + + switch (bfid->fclass) { + + case FI_CLASS_EQ: + if (pep->pep_eq != NULL) { + return -FI_EINVAL; + } + pep->pep_eq = eq_fidtou(bfid); + ofi_atomic_inc32(&pep->pep_eq->eq_refcnt); + break; + + default: + return -FI_EINVAL; + } + + return 0; +} + +static struct fi_info * +usdf_pep_conn_info(struct usdf_connreq *crp) +{ + struct fi_info *ip; + struct usdf_pep *pep; + struct sockaddr_in *sin; + struct usdf_connreq_msg *reqp; + + pep = crp->cr_pep; + reqp = (struct usdf_connreq_msg *)crp->cr_data; + + ip = fi_dupinfo(pep->pep_info); + if (!ip) { + USDF_WARN_SYS(EP_CTRL, "failed to duplicate pep info\n"); + return NULL; + } + + /* fill in dest addr */ + ip->dest_addrlen = ip->src_addrlen; + sin = calloc(1, ip->dest_addrlen); + if (sin == NULL) { + goto fail; + } + sin->sin_family = AF_INET; + sin->sin_addr.s_addr = reqp->creq_ipaddr; + sin->sin_port = reqp->creq_port; + + ip->dest_addr = usdf_sin_to_format(pep->pep_info, sin, + &ip->dest_addrlen); + + ip->handle = (fid_t) crp; + return ip; +fail: + fi_freeinfo(ip); + return NULL; +} + +/* + * Remove connection request from epoll list if not done already. + * crp->cr_pollitem.pi_rtn is non-NULL when epoll() is active + */ +static int +usdf_pep_creq_epoll_del(struct usdf_connreq *crp) +{ + int ret; + struct usdf_pep *pep; + + pep = crp->cr_pep; + + if (crp->cr_pollitem.pi_rtn != NULL) { + ret = ofi_epoll_del(pep->pep_fabric->fab_epollfd, + crp->cr_sockfd); + crp->cr_pollitem.pi_rtn = NULL; + if (ret != 0) { + ret = -errno; + } + } else { + ret = 0; + } + return ret; +} + +static int +usdf_pep_read_connreq(void *v) +{ + struct usdf_connreq *crp; + struct usdf_pep *pep; + struct usdf_connreq_msg *reqp; + struct fi_eq_cm_entry *entry; + size_t entry_len; + int ret; + int n; + + crp = v; + pep = crp->cr_pep; + + n = read(crp->cr_sockfd, crp->cr_ptr, crp->cr_resid); + if (n == -1) { + ret = -errno; + goto report_failure_skip_data; + } + + crp->cr_ptr += n; + crp->cr_resid -= n; + + reqp = (struct usdf_connreq_msg *)crp->cr_data; + + if (crp->cr_resid == 0 && crp->cr_ptr == crp->cr_data + sizeof(*reqp)) { + reqp->creq_datalen = ntohl(reqp->creq_datalen); + crp->cr_resid = reqp->creq_datalen; + } + + /* if resid is 0 now, completely done */ + if (crp->cr_resid == 0) { + ret = usdf_pep_creq_epoll_del(crp); + if (ret != 0) + goto report_failure_skip_data; + + /* create CONNREQ EQ entry */ + entry_len = sizeof(*entry) + reqp->creq_datalen; + entry = malloc(entry_len); + if (entry == NULL) { + ret = -errno; + goto report_failure_skip_data; + } + + entry->fid = &pep->pep_fid.fid; + entry->info = usdf_pep_conn_info(crp); + if (entry->info == NULL) { + ret = -FI_ENOMEM; + goto free_entry_and_report_failure; + } + + memcpy(entry->data, reqp->creq_data, reqp->creq_datalen); + ret = usdf_eq_write_internal(pep->pep_eq, FI_CONNREQ, entry, + entry_len, 0); + + if (ret != (int)entry_len) + goto free_entry_and_report_failure; + + free(entry); + } + + return 0; + +free_entry_and_report_failure: + 
free(entry); +report_failure_skip_data: + usdf_cm_report_failure(crp, ret, false); + return 0; +} + +static int +usdf_pep_listen_cb(void *v) +{ + struct usdf_pep *pep; + struct sockaddr_in sin; + struct usdf_connreq *crp; + socklen_t socklen; + int ret; + int s; + + pep = v; + + socklen = sizeof(sin); + s = accept(pep->pep_sock, &sin, &socklen); + if (s == -1) { + /* ignore early failure */ + return 0; + } + crp = NULL; + pthread_spin_lock(&pep->pep_cr_lock); + if (!TAILQ_EMPTY(&pep->pep_cr_free)) { + crp = TAILQ_FIRST(&pep->pep_cr_free); + TAILQ_REMOVE_MARK(&pep->pep_cr_free, crp, cr_link); + TAILQ_NEXT(crp, cr_link) = NULL; + } + pthread_spin_unlock(&pep->pep_cr_lock); + + /* no room for request, just drop it */ + if (crp == NULL) { + /* XXX send response? */ + close(s); + return 0; + } + + crp->cr_sockfd = s; + crp->cr_pep = pep; + crp->cr_ptr = crp->cr_data; + crp->cr_resid = sizeof(struct usdf_connreq_msg); + + crp->cr_pollitem.pi_rtn = usdf_pep_read_connreq; + crp->cr_pollitem.pi_context = crp; + + ret = ofi_epoll_add(pep->pep_fabric->fab_epollfd, crp->cr_sockfd, + OFI_EPOLL_IN, &crp->cr_pollitem); + if (ret) { + usdf_cm_report_failure(crp, ret, false); + return 0; + } + + TAILQ_INSERT_TAIL(&pep->pep_cr_pending, crp, cr_link); + + return 0; +} + +static int +usdf_pep_listen(struct fid_pep *fpep) +{ + struct usdf_pep *pep; + struct usdf_fabric *fp; + struct sockaddr_in *sin; + socklen_t socklen; + int ret; + bool addr_format_str; + + USDF_TRACE_SYS(EP_CTRL, "\n"); + + pep = pep_ftou(fpep); + fp = pep->pep_fabric; + addr_format_str = (pep->pep_info->addr_format == FI_ADDR_STR); + sin = NULL; + + switch (pep->pep_state) { + case USDF_PEP_UNBOUND: + case USDF_PEP_BOUND: + break; + case USDF_PEP_LISTENING: + USDF_WARN_SYS(EP_CTRL, "PEP already LISTENING!\n"); + return -FI_EOPBADSTATE; + case USDF_PEP_ROBBED: + USDF_WARN_SYS(EP_CTRL, + "PEP already consumed, you may only fi_close() now\n"); + return -FI_EOPBADSTATE; + default: + USDF_WARN_SYS(EP_CTRL, "unhandled case! (%d)\n", + pep->pep_state); + abort(); + } + + /* we could already be bound if the user called fi_setname() or if we + * already did the bind in a previous call to usdf_pep_listen() and the + * listen(2) call failed */ + if (pep->pep_state == USDF_PEP_UNBOUND) { + sin = usdf_format_to_sin(pep->pep_info, &pep->pep_src_addr); + if (sin == NULL) + goto fail; + + ret = bind(pep->pep_sock, sin, sizeof(struct sockaddr_in)); + if (ret == -1) { + goto fail; + } + + /* Get the actual port (since we may have requested + * port 0) + */ + socklen = sizeof(*sin); + ret = getsockname(pep->pep_sock, sin, + &socklen); + if (ret == -1) + goto fail; + + /* If it's FI_ADDR_STR, we have to update the string + * with this method. (FI_SOCKADDR_IN got taken care of, above) + */ + if (addr_format_str) { + pep->pep_info->src_addrlen = USDF_ADDR_STR_LEN; + usdf_addr_tostr(sin, pep->pep_src_addr.addr_str, + &pep->pep_info->src_addrlen); + } + + /* Update the state to bound. */ + pep->pep_state = USDF_PEP_BOUND; + } + + ret = listen(pep->pep_sock, pep->pep_backlog); + if (ret != 0) { + goto fail; + } + pep->pep_state = USDF_PEP_LISTENING; + + pep->pep_pollitem.pi_rtn = usdf_pep_listen_cb; + pep->pep_pollitem.pi_context = pep; + ret = ofi_epoll_add(fp->fab_epollfd, pep->pep_sock, OFI_EPOLL_IN, + &pep->pep_pollitem); + if (ret) { + errno = -ret; + goto fail; + } + + return 0; + +fail: + usdf_free_sin_if_needed(pep->pep_info, sin); + + return -errno; +} + +/* Register as a callback triggered by the socket becoming writeable. 
Write as + * much data as can be written in a single write, and keep track of how much + * data is left. If the data is not fully written, it will finish getting + * written in another iteration of the progression. + */ +static int usdf_pep_reject_async(void *vreq) +{ + struct usdf_connreq *crp; + int ret; + + crp = vreq; + + do { + ret = write(crp->cr_sockfd, crp->cr_ptr, crp->cr_resid); + } while ((ret < 0) && (errno == EINTR)); + + if ((ret <= 0) && (errno != EAGAIN)) { + USDF_DBG_SYS(EP_CTRL, "write failed: %s\n", + strerror(errno)); + usdf_cm_report_failure(crp, -errno, false); + return -errno; + } + + crp->cr_resid -= ret; + crp->cr_ptr += ret; + + return FI_SUCCESS; +} + +static int usdf_pep_reject(struct fid_pep *fpep, fid_t handle, const void *param, + size_t paramlen) +{ + struct usdf_pep *pep; + struct usdf_connreq *crp; + struct usdf_connreq_msg *reqp; + int ret; + + if (paramlen > USDF_MAX_CONN_DATA) { + USDF_WARN_SYS(EP_CTRL, + "reject payload size %zu exceeds max %u\n", + paramlen, USDF_MAX_CONN_DATA); + return -FI_EINVAL; + } + + if (!fpep || !handle) { + USDF_WARN_SYS(EP_CTRL, + "handle and passive ep needed for reject\n"); + return -FI_EINVAL; + } + + if (!param && paramlen > 0) { + USDF_WARN_SYS(EP_CTRL, + "NULL data pointer with non-zero data length\n"); + return -FI_EINVAL; + } + + /* usdf_pep_conn_info stashed the pep pointer into the handle field of + * the info struct previously returned + */ + crp = (struct usdf_connreq *) handle; + pep = pep_ftou(fpep); + + crp->cr_ptr = crp->cr_data; + crp->cr_resid = sizeof(*reqp) + paramlen; + + reqp = (struct usdf_connreq_msg *) crp->cr_data; + + /* The result field is used on the remote end to detect whether the + * connection succeeded or failed. + */ + reqp->creq_result = htonl(-FI_ECONNREFUSED); + reqp->creq_datalen = htonl(paramlen); + memcpy(reqp->creq_data, param, paramlen); + + crp->cr_pollitem.pi_rtn = usdf_pep_reject_async; + crp->cr_pollitem.pi_context = crp; + + ret = ofi_epoll_add(pep->pep_fabric->fab_epollfd, crp->cr_sockfd, + OFI_EPOLL_OUT, &crp->cr_pollitem); + return ret; +} + +static void +usdf_pep_free_cr_lists(struct usdf_pep *pep) +{ + struct usdf_connreq *crp; + + while (!TAILQ_EMPTY(&pep->pep_cr_free)) { + crp = TAILQ_FIRST(&pep->pep_cr_free); + TAILQ_REMOVE(&pep->pep_cr_free, crp, cr_link); + free(crp); + } + + while (!TAILQ_EMPTY(&pep->pep_cr_pending)) { + crp = TAILQ_FIRST(&pep->pep_cr_pending); + TAILQ_REMOVE(&pep->pep_cr_pending, crp, cr_link); + free(crp); + } +} + +static int +usdf_pep_grow_backlog(struct usdf_pep *pep) +{ + struct usdf_connreq *crp; + size_t extra; + + extra = sizeof(struct usdf_connreq_msg) + pep->pep_cr_max_data; + + pthread_spin_lock(&pep->pep_cr_lock); + while (pep->pep_cr_alloced < pep->pep_backlog) { + crp = calloc(1, sizeof(*crp) + extra); + if (crp == NULL) { + pthread_spin_unlock(&pep->pep_cr_lock); + return -FI_ENOMEM; + } + crp->handle.fclass = FI_CLASS_CONNREQ; + TAILQ_INSERT_TAIL(&pep->pep_cr_free, crp, cr_link); + ++pep->pep_cr_alloced; + } + pthread_spin_unlock(&pep->pep_cr_lock); + return 0; +} + +static int +usdf_pep_close(fid_t fid) +{ + struct usdf_pep *pep; + + USDF_TRACE_SYS(EP_CTRL, "\n"); + + pep = pep_fidtou(fid); + if (ofi_atomic_get32(&pep->pep_refcnt) > 0) { + return -FI_EBUSY; + } + + usdf_pep_free_cr_lists(pep); + close(pep->pep_sock); + pep->pep_sock = -1; + if (pep->pep_eq != NULL) { + ofi_atomic_dec32(&pep->pep_eq->eq_refcnt); + } + ofi_atomic_dec32(&pep->pep_fabric->fab_refcnt); + fi_freeinfo(pep->pep_info); + free(pep); + + return 0; +} + +static 
int usdf_pep_getname(fid_t fid, void *addr, size_t *addrlen) +{ + int ret; + struct usdf_pep *pep; + struct fi_info *info; + size_t copylen; + + USDF_TRACE_SYS(EP_CTRL, "\n"); + + ret = FI_SUCCESS; + pep = pep_fidtou(fid); + info = pep->pep_info; + + copylen = info->src_addrlen; + memcpy(addr, &pep->pep_src_addr, MIN(copylen, *addrlen)); + + if (*addrlen < copylen) { + USDF_WARN_SYS(EP_CTRL, "*addrlen is too short\n"); + ret = -FI_ETOOSMALL; + } + + *addrlen = copylen; + return ret; +} + +static int usdf_pep_setname(fid_t fid, void *addr, size_t addrlen) +{ + int ret; + struct usdf_pep *pep; + struct fi_info *info; + struct sockaddr_in *sin; + uint32_t req_addr_be; + socklen_t socklen; + char namebuf[INET_ADDRSTRLEN]; + char servbuf[INET_ADDRSTRLEN]; + bool addr_format_str; + + USDF_TRACE_SYS(EP_CTRL, "\n"); + + pep = pep_fidtou(fid); + info = pep->pep_info; + addr_format_str = (info->addr_format == FI_ADDR_STR); + sin = NULL; + + if (pep->pep_state != USDF_PEP_UNBOUND) { + USDF_WARN_SYS(EP_CTRL, "PEP cannot be bound\n"); + return -FI_EOPBADSTATE; + } + + switch (info->addr_format) { + case FI_SOCKADDR: + case FI_SOCKADDR_IN: + /* It is possible for passive endpoint to not have src_addr. */ + if (info->src_addr) { + ret = usdf_cm_addr_is_valid_sin(info->src_addr, + info->src_addrlen, + info->addr_format); + if (!ret) + return -FI_EINVAL; + } + break; + case FI_ADDR_STR: + break; + default: + return -FI_EINVAL; + } + + sin = usdf_format_to_sin(info, addr); + req_addr_be = sin->sin_addr.s_addr; + + namebuf[0] = '\0'; + servbuf[0] = '\0'; + ret = getnameinfo((struct sockaddr *)sin, sizeof(struct sockaddr_in), + namebuf, sizeof(namebuf), + servbuf, sizeof(servbuf), + NI_NUMERICHOST|NI_NUMERICSERV); + if (ret != 0) + USDF_WARN_SYS(EP_CTRL, "unable to getnameinfo(0x%x)\n", + req_addr_be); + + if (req_addr_be != pep->pep_fabric->fab_dev_attrs->uda_ipaddr_be) { + USDF_WARN_SYS(EP_CTRL, "requested addr (%s:%s) does not match fabric addr\n", + namebuf, servbuf); + return -FI_EADDRNOTAVAIL; + } + + ret = bind(pep->pep_sock, sin, sizeof(*sin)); + if (ret == -1) { + return -errno; + } + pep->pep_state = USDF_PEP_BOUND; + + /* store the resulting port so that can implement getname() properly */ + socklen = sizeof(*sin); + ret = getsockname(pep->pep_sock, sin, &socklen); + if (ret == -1) { + ret = -errno; + USDF_WARN_SYS(EP_CTRL, "getsockname failed %d (%s), PEP may be in bad state\n", + ret, strerror(-ret)); + return ret; + } + + if (addr_format_str) { + /* We have to reset src_addrlen here and + * the conversion will update it to the correct len. 
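+ *
+ * For reference (an assumption for illustration, not asserted by this
+ * code): an FI_ADDR_STR address takes the generic libfabric string form,
+ * e.g.
+ *
+ *   "fi_sockaddr_in://192.0.2.1:7471"
+ *
+ * so USDF_ADDR_STR_LEN is sized to hold the scheme prefix plus a
+ * dotted-quad IPv4 address and port.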
+ */ + info->src_addrlen = USDF_ADDR_STR_LEN; + usdf_addr_tostr(sin, pep->pep_src_addr.addr_str, + &info->src_addrlen); + free(sin); + } else { + memcpy(&pep->pep_src_addr, sin, sizeof(*sin)); + } + + return 0; +} + +struct fi_ops usdf_pep_ops = { + .size = sizeof(struct fi_ops), + .close = usdf_pep_close, + .bind = usdf_pep_bind, + .control = fi_no_control, + .ops_open = fi_no_ops_open +}; + +static struct fi_ops_ep usdf_pep_base_ops = { + .size = sizeof(struct fi_ops_ep), + .cancel = fi_no_cancel, + .getopt = usdf_ep_getopt_connected, + .setopt = usdf_ep_setopt, + .tx_ctx = fi_no_tx_ctx, + .rx_ctx = fi_no_rx_ctx, + .rx_size_left = fi_no_rx_size_left, + .tx_size_left = fi_no_tx_size_left, +}; + +static struct fi_ops_cm usdf_pep_cm_ops = { + .size = sizeof(struct fi_ops_cm), + .setname = usdf_pep_setname, + .getname = usdf_pep_getname, + .getpeer = fi_no_getpeer, + .connect = fi_no_connect, + .listen = usdf_pep_listen, + .accept = fi_no_accept, + .reject = usdf_pep_reject, + .shutdown = fi_no_shutdown, + .join = fi_no_join, +}; + +int +usdf_pep_open(struct fid_fabric *fabric, struct fi_info *info, + struct fid_pep **pep_o, void *context) +{ + struct usdf_pep *pep; + struct usdf_fabric *fp; + struct sockaddr_in *sin; + int ret; + int optval; + + USDF_TRACE_SYS(EP_CTRL, "\n"); + + if (!info) { + USDF_DBG_SYS(EP_CTRL, "null fi_info struct is invalid\n"); + return -FI_EINVAL; + } + + if (info->ep_attr->type != FI_EP_MSG) { + return -FI_ENODEV; + } + + switch (info->addr_format) { + case FI_SOCKADDR: + case FI_SOCKADDR_IN: + /* It is possible for passive endpoint to not have src_addr. */ + if (info->src_addr) { + ret = usdf_cm_addr_is_valid_sin(info->src_addr, + info->src_addrlen, + info->addr_format); + if (!ret) + return -FI_EINVAL; + } + break; + case FI_ADDR_STR: + break; + default: + USDF_WARN_SYS(EP_CTRL, "unknown/unsupported addr_format\n"); + return -FI_EINVAL; + } + + fp = fab_ftou(fabric); + + pep = calloc(1, sizeof(*pep)); + if (pep == NULL) { + return -FI_ENOMEM; + } + + pep->pep_fid.fid.fclass = FI_CLASS_PEP; + pep->pep_fid.fid.context = context; + pep->pep_fid.fid.ops = &usdf_pep_ops; + pep->pep_fid.ops = &usdf_pep_base_ops; + pep->pep_fid.cm = &usdf_pep_cm_ops; + pep->pep_fabric = fp; + + pep->pep_state = USDF_PEP_UNBOUND; + pep->pep_sock = socket(AF_INET, SOCK_STREAM, 0); + if (pep->pep_sock == -1) { + ret = -errno; + goto fail; + } + ret = fi_fd_nonblock(pep->pep_sock); + if (ret) { + ret = -errno; + goto fail; + } + + /* set SO_REUSEADDR to prevent annoying "Address already in use" errors + * on successive runs of programs listening on a well known port */ + optval = 1; + ret = setsockopt(pep->pep_sock, SOL_SOCKET, SO_REUSEADDR, &optval, + sizeof(optval)); + if (ret == -1) { + ret = -errno; + goto fail; + } + + pep->pep_info = fi_dupinfo(info); + if (!pep->pep_info) { + ret = -FI_ENOMEM; + goto fail; + } + + if (info->src_addrlen == 0) { + /* Copy the source address information from the device + * attributes. 
+ */ + pep->pep_info->src_addrlen = sizeof(struct sockaddr_in); + sin = calloc(1, pep->pep_info->src_addrlen); + if (!sin) { + USDF_WARN_SYS(EP_CTRL, + "calloc for src address failed\n"); + goto fail; + } + + sin->sin_family = AF_INET; + sin->sin_addr.s_addr = fp->fab_dev_attrs->uda_ipaddr_be; + + pep->pep_info->src_addr = + usdf_sin_to_format(pep->pep_info, + sin, &pep->pep_info->src_addrlen); + } + + memcpy(&pep->pep_src_addr, pep->pep_info->src_addr, + pep->pep_info->src_addrlen); + + /* initialize connreq freelist */ + ret = pthread_spin_init(&pep->pep_cr_lock, PTHREAD_PROCESS_PRIVATE); + if (ret != 0) { + ret = -ret; + goto fail; + } + TAILQ_INIT(&pep->pep_cr_free); + TAILQ_INIT(&pep->pep_cr_pending); + pep->pep_backlog = 10; + pep->pep_cr_max_data = USDF_MAX_CONN_DATA; + + ret = usdf_pep_grow_backlog(pep); + if (ret != 0) { + goto fail; + } + + ofi_atomic_initialize32(&pep->pep_refcnt, 0); + ofi_atomic_inc32(&fp->fab_refcnt); + + *pep_o = pep_utof(pep); + return 0; + +fail: + if (pep != NULL) { + usdf_pep_free_cr_lists(pep); + if (pep->pep_sock != -1) { + close(pep->pep_sock); + } + fi_freeinfo(pep->pep_info); + free(pep); + } + return ret; +} + +/* Steals the socket underpinning the PEP for use by an active endpoint. After + * this call, the only valid action a user may take on this PEP is to close it. + * Sets "*is_bound=1" if the socket was already bound to an address, + * "*is_bound=0" if not bound, or "*is_bound" will be undefined if this function + * returns a non-zero error code. */ +int usdf_pep_steal_socket(struct usdf_pep *pep, int *is_bound, int *sock_o) +{ + switch (pep->pep_state) { + case USDF_PEP_UNBOUND: + if (is_bound != NULL) + *is_bound = 0; + break; + case USDF_PEP_BOUND: + if (is_bound != NULL) + *is_bound = 1; + break; + case USDF_PEP_LISTENING: + USDF_WARN_SYS(EP_CTRL, + "PEP already listening, cannot use as \"handle\" in fi_endpoint()\n"); + return -FI_EOPBADSTATE; + case USDF_PEP_ROBBED: + USDF_WARN_SYS(EP_CTRL, + "PEP already consumed, you may only fi_close() now\n"); + return -FI_EOPBADSTATE; + } + + *sock_o = pep->pep_sock; + pep->pep_sock = -1; + pep->pep_state = USDF_PEP_ROBBED; + return 0; +} diff --git a/prov/usnic/src/usdf_poll.c b/prov/usnic/src/usdf_poll.c new file mode 100644 index 00000000000..9c8081c34f9 --- /dev/null +++ b/prov/usnic/src/usdf_poll.c @@ -0,0 +1,293 @@ +/* + * Copyright (c) 2016, Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include + +#include "usdf.h" +#include "usdf_cq.h" +#include "usdf_poll.h" + +static int usdf_poll_poll(struct fid_poll *fps, void **context, int count) +{ + struct usdf_cq *cq; + struct usdf_poll *ps; + struct dlist_entry *item; + struct fid_list_entry *entry; + int progressed = 0; + int copied = 0; + int pending; + + if (!fps || !context) { + USDF_WARN_SYS(DOMAIN, "pollset and context can't be NULL.\n"); + return -FI_EINVAL; + } + + ps = poll_ftou(fps); + + ofi_mutex_lock(&ps->lock); + + dlist_foreach(&ps->list, item) { + entry = container_of(item, struct fid_list_entry, entry); + assert(entry->fid->fclass == FI_CLASS_CQ); + + cq = cq_fidtou(entry->fid); + + if (cq->cq_is_soft) { + if (!progressed) { + usdf_domain_progress(ps->poll_domain); + progressed = 1; + } + + pending = !usdf_check_empty_soft_cq(cq); + } else { + pending = !usdf_check_empty_hard_cq(cq); + } + + if (pending) { + context[copied++] = entry->fid->context; + + if (copied >= count) + break; + } + } + + ofi_mutex_unlock(&ps->lock); + + return copied; +} + +static int usdf_poll_add(struct fid_poll *fps, struct fid *event_fid, + uint64_t flags) +{ + struct usdf_poll *ps; + struct usdf_cq *cq; + int ret; + + USDF_TRACE_SYS(DOMAIN, "\n"); + + if (!fps || !event_fid) { + USDF_WARN_SYS(DOMAIN, "pollset and event_fid can't be NULL.\n"); + return -FI_EINVAL; + } + + ps = poll_ftou(fps); + + switch (event_fid->fclass) { + case FI_CLASS_CQ: + break; + default: + USDF_WARN_SYS(DOMAIN, "invalid fid class.\n"); + return -FI_EINVAL; + } + + ret = fid_list_insert(&ps->list, &ps->lock, event_fid); + if (ret) + return ret; + + cq = cq_fidtou(event_fid); + ret = ofi_atomic_inc32(&cq->cq_refcnt); + assert(ret > 0); + USDF_DBG_SYS(DOMAIN, "associated with CQ: [%p] with new refcnt: [%d]\n", + cq, ret); + + return FI_SUCCESS; +} + +static int usdf_poll_del(struct fid_poll *fps, struct fid *event_fid, + uint64_t flags) +{ + struct usdf_poll *ps; + struct usdf_cq *cq; + int ret; + + if (!fps || !event_fid) { + USDF_WARN_SYS(DOMAIN, "pollset and event_fid can't be NULL.\n"); + return -FI_EINVAL; + } + + USDF_TRACE_SYS(DOMAIN, "\n"); + + ps = poll_ftou(fps); + + switch (event_fid->fclass) { + case FI_CLASS_CQ: + break; + default: + USDF_WARN_SYS(DOMAIN, "invalid fid class.\n"); + return -FI_EINVAL; + } + + fid_list_remove(&ps->list, &ps->lock, event_fid); + + cq = cq_fidtou(event_fid); + ret = ofi_atomic_dec32(&cq->cq_refcnt); + + USDF_DBG_SYS(DOMAIN, + "disassociating from CQ: [%p] with new refcnt: [%d]\n", + cq, ret); + assert(ret >= 0); + + if (ret >= 0) + ret = FI_SUCCESS; + else + ret = -FI_EINVAL; + return ret; +} + +static int usdf_poll_close(struct fid *fps) +{ + struct usdf_poll *ps; + struct dlist_entry *item; + struct dlist_entry *head; + struct fid_list_entry *entry; + struct usdf_cq *cq; + int val, ret = FI_SUCCESS; + + USDF_TRACE_SYS(DOMAIN, "\n"); + + if (!fps) { + USDF_WARN_SYS(DOMAIN, "pollset can't be NULL.\n"); + return -FI_EINVAL; + } + + ps = poll_ftou(fps); + + 
if (ofi_atomic_get32(&ps->poll_refcnt) > 0) { + USDF_WARN_SYS(DOMAIN, + "failed to close pollset with non-zero refcnt"); + return -FI_EBUSY; + } + + head = &ps->list; + while (!dlist_empty(head)) { + item = head->next; + entry = container_of(item, struct fid_list_entry, entry); + + switch (entry->fid->fclass) { + case FI_CLASS_CQ: + cq = cq_fidtou(entry->fid); + val = ofi_atomic_dec32(&cq->cq_refcnt); + + USDF_DBG_SYS(DOMAIN, + "disassociating from CQ: [%p] with new refcnt: [%d]\n", + cq, val); + assert(val >= 0); + if (val < 0) + ret = -FI_EINVAL; + break; + default: + USDF_WARN_SYS(DOMAIN, "invalid object\n"); + break; + } + + dlist_remove(item); + free(entry); + } + + ofi_atomic_dec32(&ps->poll_domain->dom_refcnt); + ofi_mutex_destroy(&ps->lock); + free(ps); + + return ret; +} + +struct fi_ops_poll usdf_poll_ops = { + .size = sizeof(struct fi_ops_poll), + .poll = usdf_poll_poll, + .poll_add = usdf_poll_add, + .poll_del = usdf_poll_del +}; + +struct fi_ops usdf_poll_fi_ops = { + .size = sizeof(struct fi_ops), + .close = usdf_poll_close, + .bind = fi_no_bind, + .ops_open = fi_no_ops_open +}; + +int usdf_poll_open(struct fid_domain *fdom, struct fi_poll_attr *attr, + struct fid_poll **fps) +{ + struct usdf_poll *ps; + struct usdf_domain *dom; + int ret; + + USDF_TRACE_SYS(DOMAIN, "\n"); + + if (attr && attr->flags != 0) { + USDF_WARN_SYS(DOMAIN, "flags field of poll attr must be 0.\n"); + ret = -FI_EINVAL; + goto error; + } + + dom = dom_ftou(fdom); + + ps = calloc(1, sizeof(*ps)); + if (!ps) { + USDF_WARN_SYS(DOMAIN, + "unable to allocate memory for poll obj"); + ret = -FI_ENOMEM; + goto error; + } + + dlist_init(&ps->list); + ofi_atomic_initialize32(&ps->poll_refcnt, 0); + ofi_mutex_init(&ps->lock); + + ps->poll_fid.fid.ops = &usdf_poll_fi_ops; + ps->poll_fid.fid.fclass = FI_CLASS_POLL; + ps->poll_fid.fid.context = 0; + + ps->poll_fid.ops = &usdf_poll_ops; + + ps->poll_domain = dom; + + ret = ofi_atomic_inc32(&ps->poll_domain->dom_refcnt); + + USDF_DBG_SYS(DOMAIN, + "created pollset from domain: [%p] with new refcnt: [%d]\n", + ps->poll_domain, ret); + + *fps = &ps->poll_fid; + + return FI_SUCCESS; + +error: + *fps = NULL; + return ret; +} diff --git a/prov/usnic/src/usdf_poll.h b/prov/usnic/src/usdf_poll.h new file mode 100644 index 00000000000..1bb27c9e5b8 --- /dev/null +++ b/prov/usnic/src/usdf_poll.h @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2016, Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _USDF_POLL_H_ +#define _USDF_POLL_H_ + +#include "ofi_list.h" + +struct usdf_poll { + struct fid_poll poll_fid; + struct usdf_domain *poll_domain; + + ofi_atomic32_t poll_refcnt; + ofi_mutex_t lock; + struct dlist_entry list; +}; + +#define poll_ftou(fpl) container_of((fpl), struct usdf_poll, poll_fid) + +int usdf_poll_open(struct fid_domain *domain, struct fi_poll_attr *attr, + struct fid_poll **pollset); + +#endif diff --git a/prov/usnic/src/usdf_progress.c b/prov/usnic/src/usdf_progress.c new file mode 100644 index 00000000000..b5db07e14f7 --- /dev/null +++ b/prov/usnic/src/usdf_progress.c @@ -0,0 +1,161 @@ +/* + * Copyright (c) 2014-2016, Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "config.h" + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include "ofi.h" + +#include "usnic_direct.h" +#include "usdf.h" +#include "usdf_progress.h" +#include "usdf_timer.h" + +int +usdf_fabric_wake_thread(struct usdf_fabric *fp) +{ + uint64_t val; + int n; + + val = 1; + n = write(fp->fab_eventfd, &val, sizeof(val)); + if (n != sizeof(val)) { + return -FI_EIO; + } + return 0; +} + +int +usdf_fabric_progression_cb(void *v) +{ + struct usdf_fabric *fp; + uint64_t val; + int n; + + fp = v; + n = read(fp->fab_eventfd, &val, sizeof(val)); + if (n != sizeof(val)) { + return -FI_EIO; + } + return 0; +} + +void * +usdf_fabric_progression_thread(void *v) +{ + struct usdf_fabric *fp; + struct usdf_poll_item *pip; + struct usdf_domain *dom; + int num_blocked_waiting; + int sleep_time; + ofi_epoll_t epfd; + struct ofi_epollfds_event event; + int ret; + int n; + + fp = v; + epfd = fp->fab_epollfd; + + while (1) { + num_blocked_waiting = ofi_atomic_get32(&fp->num_blocked_waiting); + + /* sleep inifinitely if nothing to do */ + if ((fp->fab_active_timer_count > 0) || + (num_blocked_waiting > 0)) { + sleep_time = 1; + } else { + sleep_time = -1; + } + + n = ofi_epoll_wait(epfd, &event, 1, sleep_time); + if (fp->fab_exit || (n < 0 && n != EINTR)) { + pthread_exit(NULL); + } + + /* consume event if there was one */ + if (n == 1) { + pip = event.data.ptr; + ret = pip->pi_rtn(pip->pi_context); + if (ret != 0) { + pthread_exit(NULL); + } + } + + /* call timer progress each wakeup */ + usdf_timer_progress(fp); + + LIST_FOREACH(dom, &fp->fab_domain_list, dom_link) { + usdf_domain_progress(dom); + } + } +} + +/* + * Progress operations in this domain + */ +void +usdf_domain_progress(struct usdf_domain *udp) +{ + struct usdf_tx *tx; + struct usdf_cq_hard *hcq; + + /* one big hammer lock... */ + pthread_spin_lock(&udp->dom_progress_lock); + + TAILQ_FOREACH(hcq, &udp->dom_hcq_list, cqh_dom_link) { + hcq->cqh_progress(hcq); + } + + while (!TAILQ_EMPTY(&udp->dom_tx_ready)) { + tx = TAILQ_FIRST(&udp->dom_tx_ready); + TAILQ_REMOVE_MARK(&udp->dom_tx_ready, tx, tx_link); + + tx->tx_progress(tx); + } + + pthread_spin_unlock(&udp->dom_progress_lock); +} diff --git a/prov/usnic/src/usdf_progress.h b/prov/usnic/src/usdf_progress.h new file mode 100644 index 00000000000..5ac184fa00b --- /dev/null +++ b/prov/usnic/src/usdf_progress.h @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2014, Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef _USDF_PROGRESS_H_ +#define _USDF_PROGRESS_H_ + +struct usdf_poll_item { + int (*pi_rtn)(void *context); + void *pi_context; +}; + +struct usdf_fabric; +struct usdf_domain; + +void *usdf_fabric_progression_thread(void *v); +int usdf_fabric_wake_thread(struct usdf_fabric *fp); +int usdf_fabric_progression_cb(void *v); +void usdf_domain_progress(struct usdf_domain *udp); + +#endif /* _USDF_PROGRESS_H_ */ diff --git a/prov/usnic/src/usdf_rudp.h b/prov/usnic/src/usdf_rudp.h new file mode 100644 index 00000000000..e284408dfd9 --- /dev/null +++ b/prov/usnic/src/usdf_rudp.h @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2014, Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ +#ifndef _USDF_RUDP_H_ +#define _USDF_RUDP_H_ + +#include "usnic_direct.h" + +#define USDF_RUDP_SEQ_CREDITS 256 +#define USDF_RUDP_ACK_TIMEOUT 5 /* ms */ + +#define RUDP_SEQ_DIFF(A, B) ((int16_t)((u_int16_t)(A) - (u_int16_t)(B))) +#define RUDP_SEQ_LT(A, B) (RUDP_SEQ_DIFF((A), (B)) < 0) +#define RUDP_SEQ_LE(A, B) (RUDP_SEQ_DIFF((A), (B)) <= 0) +#define RUDP_SEQ_GT(A, B) (RUDP_SEQ_DIFF((A), (B)) > 0) +#define RUDP_SEQ_GE(A, B) (RUDP_SEQ_DIFF((A), (B)) >= 0) + +#define RUDP_MSGID_DIFF(A, B) ((int32_t)((u_int32_t)(A) - (u_int32_t)(B))) +#define RUDP_MSGID_LT(A, B) (RUDP_MSGID_DIFF((A), (B)) < 0) +#define RUDP_MSGID_LE(A, B) (RUDP_MSGID_DIFF((A), (B)) <= 0) +#define RUDP_MSGID_GT(A, B) (RUDP_MSGID_DIFF((A), (B)) > 0) +#define RUDP_MSGID_GE(A, B) (RUDP_MSGID_DIFF((A), (B)) >= 0) + +enum { + /* data messages (a bitmask of FIRST and LAST) */ + RUDP_OP_MID = 0x00, + RUDP_OP_FIRST = 0x01, + RUDP_OP_LAST = 0x02, + RUDP_OP_ONLY = 0x03, + + /* control messages */ + RUDP_OP_CONNECT_REQ = 0x81, + RUDP_OP_CONNECT_RESP = 0x82, + RUDP_OP_NAK = 0x83, + RUDP_OP_ACK = 0x84, +}; + +#define RUDP_OP_DATA_MASK (RUDP_OP_FIRST | RUDP_OP_LAST) + +struct rudp_rc_data_msg { + u_int32_t offset; /* 4 */ + u_int16_t rkey; /* 8 */ + u_int16_t length; /* 10 */ + u_int16_t seqno; /* 12 */ + u_int16_t rdma_id; /* 14 */ +} __attribute__ ((__packed__)); + +struct rudp_msg { + u_int16_t opcode; + u_int16_t src_peer_id; + u_int32_t msg_id; + union { + struct rudp_rc_data_msg rc_data; + struct { + u_int16_t dst_peer_id; + } connect_req; + struct { + u_int16_t dst_peer_id; + } connect_resp; + struct { + u_int16_t ack_seq; + } ack; + struct { + u_int16_t nak_seq; + u_int32_t seq_mask; + } nak; + } __attribute__ ((__packed__)) m; +} __attribute__ ((__packed__)); + + +struct rudp_pkt { + struct usd_udp_hdr hdr; + struct rudp_msg msg; +} __attribute__ ((__packed__)); + + +#endif /* _USDF_RUDP_H_ */ diff --git a/prov/usnic/src/usdf_socket.c b/prov/usnic/src/usdf_socket.c new file mode 100644 index 00000000000..2d1f7cfe438 --- /dev/null +++ b/prov/usnic/src/usdf_socket.c @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2014, Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" + +#include +#include +#include +#include + +#include + +#include "usdf_socket.h" + +int +usdf_check_sock_error(int sock, uint32_t events) +{ + socklen_t len; + int status; + int ret; + + if (events & EPOLLBAD) { + return -FI_ECONNRESET; + } + + len = sizeof(status); + ret = getsockopt(sock, SOL_SOCKET, SO_ERROR, &status, &len); + if (ret == -1) { + return -errno; + } + if (status != 0) { + return -status; + } + return 0; +} diff --git a/prov/usnic/src/usdf_socket.h b/prov/usnic/src/usdf_socket.h new file mode 100644 index 00000000000..790a0fe59e0 --- /dev/null +++ b/prov/usnic/src/usdf_socket.h @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2014, Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef _USDF_SOCKET_H_ +#define _USDF_SOCKET_H_ + +#define EPOLLBAD (EPOLLHUP | EPOLLRDHUP | EPOLLERR) + +int usdf_check_sock_error(int sock, uint32_t events); + +#endif /* _USDF_SOCKET_H_ */ diff --git a/prov/usnic/src/usdf_timer.c b/prov/usnic/src/usdf_timer.c new file mode 100644 index 00000000000..2ee31697d8b --- /dev/null +++ b/prov/usnic/src/usdf_timer.c @@ -0,0 +1,266 @@ +/* + * Copyright (c) 2014, Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" + +#include +#include +#include + +#include "rdma/fabric.h" +#include "rdma/fi_errno.h" +#include "ofi.h" + +#include "usnic_direct.h" + +#include "usdf.h" +#include "usdf_timer.h" + +enum { + USDF_TF_QUEUED = (1 << 0), + USDF_TF_ACTIVE = (1 << 1), + USDF_TF_FREED = (1 << 2) +}; + +#define USDF_NUM_TIMER_BUCKETS (16 * 1024) /* roughly 16 seconds max delay */ + +struct usdf_timer_entry { + struct usdf_fabric *te_fabric; + + usdf_timer_callback_t te_callback; + void *te_context; + + uint32_t te_flags; + LIST_ENTRY(usdf_timer_entry) te_link; +}; + +/* + * Create a timer entry, registering a callback and argument. + */ +int +usdf_timer_alloc(usdf_timer_callback_t cb, void *context, + struct usdf_timer_entry **entry_o) +{ + struct usdf_timer_entry *entry; + + entry = calloc(1, sizeof(*entry)); + if (entry == NULL) { + return -FI_ENOMEM; + } + + entry->te_callback = cb; + entry->te_context = context; + entry->te_flags = 0; + + *entry_o = entry; + return 0; +} + +void +usdf_timer_free(struct usdf_fabric *fp, struct usdf_timer_entry *entry) +{ + pthread_spin_lock(&fp->fab_timer_lock); + + if (entry->te_flags & USDF_TF_ACTIVE) { + entry->te_flags |= USDF_TF_FREED; + } else { + if (entry->te_flags & USDF_TF_QUEUED) { + LIST_REMOVE(entry, te_link); + } + free(entry); + } + + pthread_spin_unlock(&fp->fab_timer_lock); +} + +void +usdf_timer_cancel(struct usdf_fabric *fp, struct usdf_timer_entry *entry) +{ + pthread_spin_lock(&fp->fab_timer_lock); + + if (entry->te_flags & USDF_TF_QUEUED) { + LIST_REMOVE(entry, te_link); + entry->te_flags &= ~USDF_TF_QUEUED; + --fp->fab_active_timer_count; + } + + pthread_spin_unlock(&fp->fab_timer_lock); +} + +/* + * Set this timer to fire "ms" milliseconds from now. If the timer is already + * queued, previous timeout will be discarded. + * + * When timer expires, the registered timer callback will be called and + * the timer entry removed from the queued list. 
The timer routine will not + * be called again until usdf_timer_set() is called again to re-set it. + * usdf_timer_set() is safe to call from timer service routine. + */ +static inline int +_usdf_timer_do_set(struct usdf_fabric *fp, struct usdf_timer_entry *entry, + uint32_t ms) +{ + int ret; + unsigned bucket; + + /* If no timers active, cur_bucket_ms may need catchup */ + ++fp->fab_active_timer_count; + if (fp->fab_active_timer_count == 1) { + fp->fab_cur_bucket_ms = usdf_get_ms(); + ret = usdf_fabric_wake_thread(fp); + if (ret != 0) { + --fp->fab_active_timer_count; + return ret; + } + } + + if (entry->te_flags & USDF_TF_QUEUED) { + LIST_REMOVE(entry, te_link); + --fp->fab_active_timer_count; + } + + // we could make "overflow" bucket... + if (ms >= USDF_NUM_TIMER_BUCKETS) { + --fp->fab_active_timer_count; + return -FI_EINVAL; + } + bucket = (fp->fab_cur_bucket + ms) & (USDF_NUM_TIMER_BUCKETS - 1); + + LIST_INSERT_HEAD(&fp->fab_timer_buckets[bucket], entry, te_link); + entry->te_flags |= USDF_TF_QUEUED; + return 0; +} + +int +usdf_timer_set(struct usdf_fabric *fp, struct usdf_timer_entry *entry, + uint32_t ms) +{ + int ret; + + pthread_spin_lock(&fp->fab_timer_lock); + if (entry->te_flags & USDF_TF_QUEUED) { + ret = 0; + } else { + ret = _usdf_timer_do_set(fp, entry, ms); + } + pthread_spin_unlock(&fp->fab_timer_lock); + + return ret; +} + +int +usdf_timer_reset(struct usdf_fabric *fp, struct usdf_timer_entry *entry, + uint32_t ms) +{ + int ret; + + pthread_spin_lock(&fp->fab_timer_lock); + ret = _usdf_timer_do_set(fp, entry, ms); + pthread_spin_unlock(&fp->fab_timer_lock); + + return ret; +} + + +static inline void +usdf_run_bucket(struct usdf_fabric *fp, struct usdf_timer_bucket *bp) +{ + struct usdf_timer_entry *entry; + + while (!LIST_EMPTY(bp)) { + entry = LIST_FIRST(bp); + LIST_REMOVE(entry, te_link); + entry->te_flags |= USDF_TF_ACTIVE; + entry->te_flags &= ~USDF_TF_QUEUED; + --fp->fab_active_timer_count; + + /* call timer service routine without lock */ + pthread_spin_unlock(&fp->fab_timer_lock); + entry->te_callback(entry->te_context); + pthread_spin_lock(&fp->fab_timer_lock); + } +} + +/* + * Called only from fabric progression thread + */ +void +usdf_timer_progress(struct usdf_fabric *fp) +{ + pthread_spin_lock(&fp->fab_timer_lock); + + while (fp->fab_cur_bucket_ms < usdf_get_ms()) { + usdf_run_bucket(fp, + &fp->fab_timer_buckets[fp->fab_cur_bucket]); + + ++fp->fab_cur_bucket_ms; + fp->fab_cur_bucket = (fp->fab_cur_bucket + 1) & + (USDF_NUM_TIMER_BUCKETS - 1); + } + + pthread_spin_unlock(&fp->fab_timer_lock); +} + +/* + * Initialize timer data + */ +int +usdf_timer_init(struct usdf_fabric *fp) +{ + int i; + + pthread_spin_init(&fp->fab_timer_lock, PTHREAD_PROCESS_PRIVATE); + + fp->fab_timer_buckets = calloc(USDF_NUM_TIMER_BUCKETS, + sizeof(struct usdf_timer_bucket)); + if (fp->fab_timer_buckets == NULL) { + return -FI_ENOMEM; + } + + for (i = 0; i < USDF_NUM_TIMER_BUCKETS; ++i) { + LIST_INIT(&fp->fab_timer_buckets[i]); + } + + fp->fab_cur_bucket = 0; + fp->fab_cur_bucket_ms = usdf_get_ms(); + return 0; +} + +void +usdf_timer_deinit(struct usdf_fabric *fp) +{ + free(fp->fab_timer_buckets); +} diff --git a/prov/usnic/src/usdf_timer.h b/prov/usnic/src/usdf_timer.h new file mode 100644 index 00000000000..aecebc8257d --- /dev/null +++ b/prov/usnic/src/usdf_timer.h @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2014, Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef _USDF_TIMER_H_ +#define _USDF_TIMER_H_ + +#include + +struct usdf_timer_entry; + +static inline uint64_t +usdf_get_ms(void) +{ + struct timespec now; + uint64_t ms; + + clock_gettime(CLOCK_MONOTONIC, &now); + ms = now.tv_sec * 1000 + now.tv_nsec / 1000000; + + return ms; +} + +typedef void (*usdf_timer_callback_t)(void *); + +int usdf_timer_alloc(usdf_timer_callback_t cb, void *arg, + struct usdf_timer_entry **entry); + +void usdf_timer_free(struct usdf_fabric *fp, struct usdf_timer_entry *entry); + +int usdf_timer_set(struct usdf_fabric *fp, struct usdf_timer_entry *entry, + uint32_t timeout); +int usdf_timer_reset(struct usdf_fabric *fp, struct usdf_timer_entry *entry, + uint32_t timeout); + +void usdf_timer_cancel(struct usdf_fabric *fp, struct usdf_timer_entry *entry); + +void usdf_timer_progress(struct usdf_fabric *fp); + +int usdf_timer_init(struct usdf_fabric *fp); +void usdf_timer_deinit(struct usdf_fabric *fp); + +#endif /* _USDF_TIMER_H_ */ diff --git a/prov/usnic/src/usdf_wait.c b/prov/usnic/src/usdf_wait.c new file mode 100644 index 00000000000..d2125e9c1ee --- /dev/null +++ b/prov/usnic/src/usdf_wait.c @@ -0,0 +1,346 @@ +/* + * Copyright (c) 2016, Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* This needs to be included for usdf.h */ +#include "ofi.h" +#include "ofi_enosys.h" +#include "ofi_util.h" + +#include "usdf.h" +#include "usdf_cq.h" +#include "usdf_wait.h" + + +/* Necessary to support top-of-file struct declarations. */ +static int usdf_wait_wait(struct fid_wait *wait_fid, int timeout); +static int usdf_wait_close(struct fid *waitset); +static int usdf_wait_control(struct fid *fid, int command, void *arg); + +static struct fi_ops_wait usdf_wait_ops = { + .size = sizeof(struct fi_ops_wait), + .wait = usdf_wait_wait, +}; + +static struct fi_ops usdf_wait_fi_ops = { + .size = sizeof(struct fi_ops), + .close = usdf_wait_close, + .bind = fi_no_bind, + .control = usdf_wait_control, + .ops_open = fi_no_ops_open +}; + +static int usdf_wait_trywait(struct fid *fwait) +{ + struct usdf_wait *wait; + struct dlist_entry *item; + struct fid_list_entry *entry; + int ret = FI_SUCCESS; + + wait = wait_fidtou(fwait); + + dlist_foreach(&wait->list, item) { + entry = container_of(item, struct fid_list_entry, entry); + + switch (entry->fid->fclass) { + case FI_CLASS_EQ: + continue; + case FI_CLASS_CQ: + ret = usdf_cq_trywait(entry->fid); + if (ret) + return ret; + break; + default: + USDF_DBG_SYS(FABRIC, "invalid fid %p\n", entry->fid); + return -FI_EINVAL; + } + } + + return ret; +} + +int usdf_trywait(struct fid_fabric *fabric, struct fid **fids, int count) +{ + size_t i; + int ret; + + for (i = 0; i < count; i++) { + assert(fids[i]); + + switch (fids[i]->fclass) { + case FI_CLASS_EQ: + continue; + case FI_CLASS_CQ: + ret = usdf_cq_trywait(fids[i]); + break; + case FI_CLASS_WAIT: + ret = usdf_wait_trywait(fids[i]); + break; + default: + USDF_DBG_SYS(FABRIC, "invalid fid\n"); + return -FI_EINVAL; + } + + if (ret) + return ret; + } + + return FI_SUCCESS; +} + +/* Since a domain hasn't been opened at the time of wait object creation, open a + * device temporarily to check for the group interrupt capability. 
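+ *
+ * For illustration of what the probe below does: the device is opened
+ * with UOPF_SKIP_PD_ALLOC (so no protection domain is allocated for
+ * this temporary open), USD_CAP_GRP_INTR is queried, and the device is
+ * closed again before returning; -FI_EOPNOTSUPP is returned when group
+ * interrupts are not available.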
+ */ +static int usdf_wait_check_support(struct usdf_fabric *fabric_priv) +{ + struct usd_open_params params = { + .flags = UOPF_SKIP_PD_ALLOC, + .cmd_fd = -1, + .context = NULL + }; + struct usd_device *dev; + int ret; + + ret = usd_open_with_params(fabric_priv->fab_dev_attrs->uda_devname, + ¶ms, &dev); + if (ret) { + USDF_DBG_SYS(FABRIC, + "opening device to check fd support failed.\n"); + return ret; + } + + if (!usd_get_cap(dev, USD_CAP_GRP_INTR)) { + USDF_WARN_SYS(FABRIC, "FD request invalid.\n"); + USDF_WARN_SYS(FABRIC, "group interrupts not supported.\n"); + ret = usd_close(dev); + if (ret) + USDF_WARN_SYS(FABRIC, "closing usd device failed: %s\n", + strerror(ret)); + + return -FI_EOPNOTSUPP; + } + + return usd_close(dev); +} + +/* Non-static because this is exported due to being returned as a callback for + * fabric ops. + * + * Supporting wait objects in the usNIC provider is done using an epoll + * context. When fi_wait_open is called an epoll context is created using + * epoll_create1. This simplifies multi-CQ support and also allows us to get + * around a limitation of the usNIC provider. IB completion channels are opened + * on the domain because we have a context associated with the domain. At + * fi_wait_open time, we only have access to the fabric. It isn't guaranteed + * that a domain has been opened yet. The epoll context approach allows us to + * defer creating the completion channel for the CQ until CQ open time. + */ +int usdf_wait_open(struct fid_fabric *fabric, struct fi_wait_attr *attr, + struct fid_wait **waitset) +{ + struct usdf_wait *wait_priv; + struct usdf_fabric *fabric_priv; + ofi_epoll_t epfd; + int ret; + + USDF_TRACE_SYS(FABRIC, "\n"); + + switch (attr->wait_obj) { + case FI_WAIT_UNSPEC: + case FI_WAIT_FD: + break; + default: + USDF_WARN_SYS(FABRIC, "unsupported wait object type\n"); + ret = -FI_EINVAL; + goto error; + } + + fabric_priv = fab_fidtou(fabric); + ret = usdf_wait_check_support(fabric_priv); + if (ret) + goto error; + + ret = ofi_epoll_create(&epfd); + if (ret) { + USDF_WARN_SYS(FABRIC, "failed to create epoll fd[%d]\n", errno); + goto error; + } + + USDF_DBG_SYS(FABRIC, "successfully created epoll fd: %d\n", epfd); + + wait_priv = calloc(1, sizeof(*wait_priv)); + if (!wait_priv) { + USDF_WARN_SYS(FABRIC, + "unable to allocate memory for usdf_wait obj"); + ret = -FI_ENOMEM; + goto calloc_fail; + } + + wait_priv->wait_fid.fid.fclass = FI_CLASS_WAIT; + wait_priv->wait_fid.fid.ops = &usdf_wait_fi_ops; + wait_priv->wait_fid.ops = &usdf_wait_ops; + wait_priv->wait_fid.fid.context = 0; + wait_priv->wait_fabric = fabric_priv; + wait_priv->wait_obj = attr->wait_obj; + wait_priv->object.epfd = epfd; + + ofi_atomic_initialize32(&wait_priv->wait_refcnt, 0); + ofi_mutex_init(&wait_priv->lock); + dlist_init(&wait_priv->list); + + ofi_atomic_inc32(&wait_priv->wait_fabric->fab_refcnt); + + *waitset = &wait_priv->wait_fid; + + return FI_SUCCESS; + +calloc_fail: + ofi_epoll_close(epfd); +error: + *waitset = NULL; + return ret; +} + +/* Close a wait object. Make sure all resources associated with the wait object + * have been closed. 
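+ *
+ * Close fails with -FI_EBUSY while wait_refcnt is still non-zero;
+ * otherwise the epoll context is closed, the fabric reference count is
+ * dropped, and the wait object is freed.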
+ */ +static int usdf_wait_close(struct fid *waitset) +{ + struct usdf_wait *wait_priv; + + USDF_TRACE_SYS(FABRIC, "\n"); + if (!waitset) { + USDF_WARN_SYS(FABRIC, "invalid input.\n"); + return -FI_EINVAL; + } + + wait_priv = wait_ftou(waitset); + + if (ofi_atomic_get32(&wait_priv->wait_refcnt) > 0) { + USDF_DBG_SYS(FABRIC, + "failed to close waitset with non-zero refcnt"); + return -FI_EBUSY; + } + + switch (wait_priv->wait_obj) { + case FI_WAIT_UNSPEC: + case FI_WAIT_FD: + ofi_epoll_close(wait_priv->object.epfd); + break; + default: + USDF_WARN_SYS(FABRIC, + "unsupported wait object type\n"); + return -FI_EINVAL; + } + + ofi_atomic_dec32(&wait_priv->wait_fabric->fab_refcnt); + free(wait_priv); + + return FI_SUCCESS; +} + +static int usdf_wait_wait(struct fid_wait *fwait, int timeout) +{ + struct usdf_wait *wait; + struct ofi_epollfds_event event; + int ret = FI_SUCCESS; + int nevents; + + USDF_TRACE_SYS(FABRIC, "\n"); + wait = wait_ftou(fwait); + + ret = usdf_wait_trywait(&fwait->fid); + if (ret) { + if (ret == -FI_EAGAIN) + return FI_SUCCESS; + + return ret; + } + + nevents = ofi_epoll_wait(wait->object.epfd, &event, 1, timeout); + if (nevents == 0) { + ret = -FI_ETIMEDOUT; + } else if (nevents < 0) { + USDF_DBG_SYS(FABRIC, "epoll wait failed\n"); + ret = nevents; + } + + return ret; +} + +static int usdf_wait_get_wait(struct usdf_wait *wait_priv, void *arg) +{ + USDF_TRACE_SYS(FABRIC, "\n"); + + if (!arg || !wait_priv) { + USDF_WARN_SYS(FABRIC, "invalid input\n"); + return -FI_EINVAL; + } + + switch (wait_priv->wait_obj) { + case FI_WAIT_UNSPEC: + case FI_WAIT_FD: +#ifdef HAVE_EPOLL + *(int *) arg = wait_priv->object.epfd; +#else + return -FI_ENOSYS; +#endif + break; + default: + USDF_DBG_SYS(FABRIC, "unsupported wait type\n"); + return -FI_EINVAL; + } + + return FI_SUCCESS; +} + +static int usdf_wait_control(struct fid *fid, int command, void *arg) +{ + struct usdf_wait *wait_priv; + + USDF_TRACE_SYS(FABRIC, "\n"); + + wait_priv = container_of(fid, struct usdf_wait, wait_fid.fid); + + switch (command) { + case FI_GETWAIT: + break; + default: + USDF_DBG_SYS(FABRIC, "unsupported control command\n"); + return -FI_EINVAL; + } + + return usdf_wait_get_wait(wait_priv, arg); +} diff --git a/prov/usnic/src/usdf_wait.h b/prov/usnic/src/usdf_wait.h new file mode 100644 index 00000000000..64219f15afd --- /dev/null +++ b/prov/usnic/src/usdf_wait.h @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2016, Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _USDF_WAIT_H_ +#define _USDF_WAIT_H_ + +#include "ofi_list.h" + +struct usdf_wait { + struct fid_wait wait_fid; + struct usdf_fabric *wait_fabric; + + enum fi_wait_obj wait_obj; + union { + ofi_epoll_t epfd; + struct fi_mutex_cond mutex_cond; + } object; + + ofi_atomic32_t wait_refcnt; + + ofi_mutex_t lock; + struct dlist_entry list; +}; + +#define wait_ftou(FWT) container_of(FWT, struct usdf_wait, wait_fid) +#define wait_fidtou(FWT) container_of(FWT, struct usdf_wait, wait_fid.fid) + +int usdf_wait_open(struct fid_fabric *fabric, struct fi_wait_attr *attr, + struct fid_wait **waitset); +int usdf_trywait(struct fid_fabric *fabric, struct fid **fids, int count); + +#endif diff --git a/prov/usnic/src/usnic_direct/cq_desc.h b/prov/usnic/src/usnic_direct/cq_desc.h new file mode 100644 index 00000000000..f110c02c79c --- /dev/null +++ b/prov/usnic/src/usnic_direct/cq_desc.h @@ -0,0 +1,136 @@ +/* + * Copyright 2008-2010 Cisco Systems, Inc. All rights reserved. + * Copyright 2007 Nuova Systems, Inc. All rights reserved. + * + * LICENSE_BEGIN + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * LICENSE_END + * + * + */ + +#ifndef _CQ_DESC_H_ +#define _CQ_DESC_H_ + +/* + * Completion queue descriptor types + */ +enum cq_desc_types { + CQ_DESC_TYPE_WQ_ENET = 0, + CQ_DESC_TYPE_DESC_COPY = 1, + CQ_DESC_TYPE_WQ_EXCH = 2, + CQ_DESC_TYPE_RQ_ENET = 3, + CQ_DESC_TYPE_RQ_FCP = 4, + CQ_DESC_TYPE_IOMMU_MISS = 5, + CQ_DESC_TYPE_SGL = 6, + CQ_DESC_TYPE_CLASSIFIER = 7, + CQ_DESC_TYPE_TEST = 127, +}; + +/* Completion queue descriptor: 16B + * + * All completion queues have this basic layout. The + * type_specfic area is unique for each completion + * queue type. + */ +struct cq_desc { + __le16 completed_index; + __le16 q_number; + u8 type_specfic[11]; + u8 type_color; +}; + +#define CQ_DESC_TYPE_BITS 4 +#define CQ_DESC_TYPE_MASK ((1 << CQ_DESC_TYPE_BITS) - 1) +#define CQ_DESC_COLOR_MASK 1 +#define CQ_DESC_COLOR_SHIFT 7 +#define CQ_DESC_Q_NUM_BITS 10 +#define CQ_DESC_Q_NUM_MASK ((1 << CQ_DESC_Q_NUM_BITS) - 1) +#define CQ_DESC_COMP_NDX_BITS 12 +#define CQ_DESC_COMP_NDX_MASK ((1 << CQ_DESC_COMP_NDX_BITS) - 1) + +static inline void cq_color_enc(struct cq_desc *desc, const u8 color) +{ + if (color) + desc->type_color |= (1 << CQ_DESC_COLOR_SHIFT); + else + desc->type_color &= ~(1 << CQ_DESC_COLOR_SHIFT); +} + +static inline void cq_desc_enc(struct cq_desc *desc, + const u8 type, const u8 color, const u16 q_number, + const u16 completed_index) +{ + desc->type_color = (type & CQ_DESC_TYPE_MASK) | + ((color & CQ_DESC_COLOR_MASK) << CQ_DESC_COLOR_SHIFT); + desc->q_number = cpu_to_le16(q_number & CQ_DESC_Q_NUM_MASK); + desc->completed_index = cpu_to_le16(completed_index & + CQ_DESC_COMP_NDX_MASK); +} + +static inline void cq_desc_dec(const struct cq_desc *desc_arg, + u8 *type, u8 *color, u16 *q_number, u16 *completed_index) +{ + const struct cq_desc *desc = desc_arg; + const u8 type_color = desc->type_color; + + *color = (type_color >> CQ_DESC_COLOR_SHIFT) & CQ_DESC_COLOR_MASK; + +#if !defined(__LIBUSNIC__) + /* + * Make sure color bit is read from desc *before* other fields + * are read from desc. Hardware guarantees color bit is last + * bit (byte) written. Adding the rmb() prevents the compiler + * and/or CPU from reordering the reads which would potentially + * result in reading stale values. + */ + + rmb(); +#endif + + *type = type_color & CQ_DESC_TYPE_MASK; + *q_number = le16_to_cpu(desc->q_number) & CQ_DESC_Q_NUM_MASK; + *completed_index = le16_to_cpu(desc->completed_index) & + CQ_DESC_COMP_NDX_MASK; +} + +static inline void cq_color_dec(const struct cq_desc *desc_arg, u8 *color) +{ + volatile const struct cq_desc *desc = desc_arg; + + *color = (desc->type_color >> CQ_DESC_COLOR_SHIFT) & CQ_DESC_COLOR_MASK; +} + +#endif /* _CQ_DESC_H_ */ diff --git a/prov/usnic/src/usnic_direct/cq_enet_desc.h b/prov/usnic/src/usnic_direct/cq_enet_desc.h new file mode 100644 index 00000000000..c616095d4d8 --- /dev/null +++ b/prov/usnic/src/usnic_direct/cq_enet_desc.h @@ -0,0 +1,269 @@ +/* + * Copyright 2008-2010 Cisco Systems, Inc. All rights reserved. + * Copyright 2007 Nuova Systems, Inc. All rights reserved. + * + * LICENSE_BEGIN + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * LICENSE_END + * + * + */ + +#ifndef _CQ_ENET_DESC_H_ +#define _CQ_ENET_DESC_H_ + +#include "cq_desc.h" + +/* Ethernet completion queue descriptor: 16B */ +struct cq_enet_wq_desc { + __le16 completed_index; + __le16 q_number; + u8 reserved[11]; + u8 type_color; +}; + +static inline void cq_enet_wq_desc_enc(struct cq_enet_wq_desc *desc, + u8 type, u8 color, u16 q_number, u16 completed_index) +{ + cq_desc_enc((struct cq_desc *)desc, type, + color, q_number, completed_index); +} + +static inline void cq_enet_wq_desc_dec(struct cq_enet_wq_desc *desc, + u8 *type, u8 *color, u16 *q_number, u16 *completed_index) +{ + cq_desc_dec((struct cq_desc *)desc, type, + color, q_number, completed_index); +} + +/* Completion queue descriptor: Ethernet receive queue, 16B */ +struct cq_enet_rq_desc { + __le16 completed_index_flags; + __le16 q_number_rss_type_flags; + __le32 rss_hash; + __le16 bytes_written_flags; + __le16 vlan; + __le16 checksum_fcoe; + u8 flags; + u8 type_color; +}; + +#define CQ_ENET_RQ_DESC_FLAGS_INGRESS_PORT (0x1 << 12) +#define CQ_ENET_RQ_DESC_FLAGS_FCOE (0x1 << 13) +#define CQ_ENET_RQ_DESC_FLAGS_EOP (0x1 << 14) +#define CQ_ENET_RQ_DESC_FLAGS_SOP (0x1 << 15) + +#define CQ_ENET_RQ_DESC_RSS_TYPE_BITS 4 +#define CQ_ENET_RQ_DESC_RSS_TYPE_MASK \ + ((1 << CQ_ENET_RQ_DESC_RSS_TYPE_BITS) - 1) +#define CQ_ENET_RQ_DESC_RSS_TYPE_NONE 0 +#define CQ_ENET_RQ_DESC_RSS_TYPE_IPv4 1 +#define CQ_ENET_RQ_DESC_RSS_TYPE_TCP_IPv4 2 +#define CQ_ENET_RQ_DESC_RSS_TYPE_IPv6 3 +#define CQ_ENET_RQ_DESC_RSS_TYPE_TCP_IPv6 4 +#define CQ_ENET_RQ_DESC_RSS_TYPE_IPv6_EX 5 +#define CQ_ENET_RQ_DESC_RSS_TYPE_TCP_IPv6_EX 6 + +#define CQ_ENET_RQ_DESC_FLAGS_CSUM_NOT_CALC (0x1 << 14) + +#define CQ_ENET_RQ_DESC_BYTES_WRITTEN_BITS 14 +#define CQ_ENET_RQ_DESC_BYTES_WRITTEN_MASK \ + ((1 << CQ_ENET_RQ_DESC_BYTES_WRITTEN_BITS) - 1) +#define CQ_ENET_RQ_DESC_FLAGS_TRUNCATED (0x1 << 14) +#define CQ_ENET_RQ_DESC_FLAGS_VLAN_STRIPPED (0x1 << 15) + +#define CQ_ENET_RQ_DESC_VLAN_TCI_VLAN_BITS 12 +#define 
CQ_ENET_RQ_DESC_VLAN_TCI_VLAN_MASK \ + ((1 << CQ_ENET_RQ_DESC_VLAN_TCI_VLAN_BITS) - 1) +#define CQ_ENET_RQ_DESC_VLAN_TCI_CFI_MASK (0x1 << 12) +#define CQ_ENET_RQ_DESC_VLAN_TCI_USER_PRIO_BITS 3 +#define CQ_ENET_RQ_DESC_VLAN_TCI_USER_PRIO_MASK \ + ((1 << CQ_ENET_RQ_DESC_VLAN_TCI_USER_PRIO_BITS) - 1) +#define CQ_ENET_RQ_DESC_VLAN_TCI_USER_PRIO_SHIFT 13 + +#define CQ_ENET_RQ_DESC_FCOE_SOF_BITS 8 +#define CQ_ENET_RQ_DESC_FCOE_SOF_MASK \ + ((1 << CQ_ENET_RQ_DESC_FCOE_SOF_BITS) - 1) +#define CQ_ENET_RQ_DESC_FCOE_EOF_BITS 8 +#define CQ_ENET_RQ_DESC_FCOE_EOF_MASK \ + ((1 << CQ_ENET_RQ_DESC_FCOE_EOF_BITS) - 1) +#define CQ_ENET_RQ_DESC_FCOE_EOF_SHIFT 8 + +#define CQ_ENET_RQ_DESC_FLAGS_TCP_UDP_CSUM_OK (0x1 << 0) +#define CQ_ENET_RQ_DESC_FCOE_FC_CRC_OK (0x1 << 0) +#define CQ_ENET_RQ_DESC_FLAGS_UDP (0x1 << 1) +#define CQ_ENET_RQ_DESC_FCOE_ENC_ERROR (0x1 << 1) +#define CQ_ENET_RQ_DESC_FLAGS_TCP (0x1 << 2) +#define CQ_ENET_RQ_DESC_FLAGS_IPV4_CSUM_OK (0x1 << 3) +#define CQ_ENET_RQ_DESC_FLAGS_IPV6 (0x1 << 4) +#define CQ_ENET_RQ_DESC_FLAGS_IPV4 (0x1 << 5) +#define CQ_ENET_RQ_DESC_FLAGS_IPV4_FRAGMENT (0x1 << 6) +#define CQ_ENET_RQ_DESC_FLAGS_FCS_OK (0x1 << 7) + +static inline void cq_enet_rq_desc_enc(struct cq_enet_rq_desc *desc, + u8 type, u8 color, u16 q_number, u16 completed_index, + u8 ingress_port, u8 fcoe, u8 eop, u8 sop, u8 rss_type, u8 csum_not_calc, + u32 rss_hash, u16 bytes_written, u8 packet_error, u8 vlan_stripped, + u16 vlan, u16 checksum, u8 fcoe_sof, u8 fcoe_fc_crc_ok, + u8 fcoe_enc_error, u8 fcoe_eof, u8 tcp_udp_csum_ok, u8 udp, u8 tcp, + u8 ipv4_csum_ok, u8 ipv6, u8 ipv4, u8 ipv4_fragment, u8 fcs_ok) +{ + cq_desc_enc((struct cq_desc *)desc, type, + color, q_number, completed_index); + + desc->completed_index_flags |= cpu_to_le16( + (ingress_port ? CQ_ENET_RQ_DESC_FLAGS_INGRESS_PORT : 0) | + (fcoe ? CQ_ENET_RQ_DESC_FLAGS_FCOE : 0) | + (eop ? CQ_ENET_RQ_DESC_FLAGS_EOP : 0) | + (sop ? CQ_ENET_RQ_DESC_FLAGS_SOP : 0)); + + desc->q_number_rss_type_flags |= cpu_to_le16( + ((rss_type & CQ_ENET_RQ_DESC_RSS_TYPE_MASK) << + CQ_DESC_Q_NUM_BITS) | + (csum_not_calc ? CQ_ENET_RQ_DESC_FLAGS_CSUM_NOT_CALC : 0)); + + desc->rss_hash = cpu_to_le32(rss_hash); + + desc->bytes_written_flags = cpu_to_le16( + (bytes_written & CQ_ENET_RQ_DESC_BYTES_WRITTEN_MASK) | + (packet_error ? CQ_ENET_RQ_DESC_FLAGS_TRUNCATED : 0) | + (vlan_stripped ? CQ_ENET_RQ_DESC_FLAGS_VLAN_STRIPPED : 0)); + + desc->vlan = cpu_to_le16(vlan); + + if (fcoe) { + desc->checksum_fcoe = cpu_to_le16( + (fcoe_sof & CQ_ENET_RQ_DESC_FCOE_SOF_MASK) | + ((fcoe_eof & CQ_ENET_RQ_DESC_FCOE_EOF_MASK) << + CQ_ENET_RQ_DESC_FCOE_EOF_SHIFT)); + } else { + desc->checksum_fcoe = cpu_to_le16(checksum); + } + + desc->flags = + (tcp_udp_csum_ok ? CQ_ENET_RQ_DESC_FLAGS_TCP_UDP_CSUM_OK : 0) | + (udp ? CQ_ENET_RQ_DESC_FLAGS_UDP : 0) | + (tcp ? CQ_ENET_RQ_DESC_FLAGS_TCP : 0) | + (ipv4_csum_ok ? CQ_ENET_RQ_DESC_FLAGS_IPV4_CSUM_OK : 0) | + (ipv6 ? CQ_ENET_RQ_DESC_FLAGS_IPV6 : 0) | + (ipv4 ? CQ_ENET_RQ_DESC_FLAGS_IPV4 : 0) | + (ipv4_fragment ? CQ_ENET_RQ_DESC_FLAGS_IPV4_FRAGMENT : 0) | + (fcs_ok ? CQ_ENET_RQ_DESC_FLAGS_FCS_OK : 0) | + (fcoe_fc_crc_ok ? CQ_ENET_RQ_DESC_FCOE_FC_CRC_OK : 0) | + (fcoe_enc_error ? 
CQ_ENET_RQ_DESC_FCOE_ENC_ERROR : 0); +} + +static inline void cq_enet_rq_desc_dec(struct cq_enet_rq_desc *desc, + u8 *type, u8 *color, u16 *q_number, u16 *completed_index, + u8 *ingress_port, u8 *fcoe, u8 *eop, u8 *sop, u8 *rss_type, + u8 *csum_not_calc, u32 *rss_hash, u16 *bytes_written, u8 *packet_error, + u8 *vlan_stripped, u16 *vlan_tci, u16 *checksum, u8 *fcoe_sof, + u8 *fcoe_fc_crc_ok, u8 *fcoe_enc_error, u8 *fcoe_eof, + u8 *tcp_udp_csum_ok, u8 *udp, u8 *tcp, u8 *ipv4_csum_ok, + u8 *ipv6, u8 *ipv4, u8 *ipv4_fragment, u8 *fcs_ok) +{ + u16 completed_index_flags; + u16 q_number_rss_type_flags; + u16 bytes_written_flags; + + cq_desc_dec((struct cq_desc *)desc, type, + color, q_number, completed_index); + + completed_index_flags = le16_to_cpu(desc->completed_index_flags); + q_number_rss_type_flags = + le16_to_cpu(desc->q_number_rss_type_flags); + bytes_written_flags = le16_to_cpu(desc->bytes_written_flags); + + *ingress_port = (completed_index_flags & + CQ_ENET_RQ_DESC_FLAGS_INGRESS_PORT) ? 1 : 0; + *fcoe = (completed_index_flags & CQ_ENET_RQ_DESC_FLAGS_FCOE) ? + 1 : 0; + *eop = (completed_index_flags & CQ_ENET_RQ_DESC_FLAGS_EOP) ? + 1 : 0; + *sop = (completed_index_flags & CQ_ENET_RQ_DESC_FLAGS_SOP) ? + 1 : 0; + + *rss_type = (u8)((q_number_rss_type_flags >> CQ_DESC_Q_NUM_BITS) & + CQ_ENET_RQ_DESC_RSS_TYPE_MASK); + *csum_not_calc = (q_number_rss_type_flags & + CQ_ENET_RQ_DESC_FLAGS_CSUM_NOT_CALC) ? 1 : 0; + + *rss_hash = le32_to_cpu(desc->rss_hash); + + *bytes_written = bytes_written_flags & + CQ_ENET_RQ_DESC_BYTES_WRITTEN_MASK; + *packet_error = (bytes_written_flags & + CQ_ENET_RQ_DESC_FLAGS_TRUNCATED) ? 1 : 0; + *vlan_stripped = (bytes_written_flags & + CQ_ENET_RQ_DESC_FLAGS_VLAN_STRIPPED) ? 1 : 0; + + /* + * Tag Control Information(16) = user_priority(3) + cfi(1) + vlan(12) + */ + *vlan_tci = le16_to_cpu(desc->vlan); + + if (*fcoe) { + *fcoe_sof = (u8)(le16_to_cpu(desc->checksum_fcoe) & + CQ_ENET_RQ_DESC_FCOE_SOF_MASK); + *fcoe_fc_crc_ok = (desc->flags & + CQ_ENET_RQ_DESC_FCOE_FC_CRC_OK) ? 1 : 0; + *fcoe_enc_error = (desc->flags & + CQ_ENET_RQ_DESC_FCOE_ENC_ERROR) ? 1 : 0; + *fcoe_eof = (u8)((le16_to_cpu(desc->checksum_fcoe) >> + CQ_ENET_RQ_DESC_FCOE_EOF_SHIFT) & + CQ_ENET_RQ_DESC_FCOE_EOF_MASK); + *checksum = 0; + } else { + *fcoe_sof = 0; + *fcoe_fc_crc_ok = 0; + *fcoe_enc_error = 0; + *fcoe_eof = 0; + *checksum = le16_to_cpu(desc->checksum_fcoe); + } + + *tcp_udp_csum_ok = + (desc->flags & CQ_ENET_RQ_DESC_FLAGS_TCP_UDP_CSUM_OK) ? 1 : 0; + *udp = (desc->flags & CQ_ENET_RQ_DESC_FLAGS_UDP) ? 1 : 0; + *tcp = (desc->flags & CQ_ENET_RQ_DESC_FLAGS_TCP) ? 1 : 0; + *ipv4_csum_ok = + (desc->flags & CQ_ENET_RQ_DESC_FLAGS_IPV4_CSUM_OK) ? 1 : 0; + *ipv6 = (desc->flags & CQ_ENET_RQ_DESC_FLAGS_IPV6) ? 1 : 0; + *ipv4 = (desc->flags & CQ_ENET_RQ_DESC_FLAGS_IPV4) ? 1 : 0; + *ipv4_fragment = + (desc->flags & CQ_ENET_RQ_DESC_FLAGS_IPV4_FRAGMENT) ? 1 : 0; + *fcs_ok = (desc->flags & CQ_ENET_RQ_DESC_FLAGS_FCS_OK) ? 1 : 0; +} + +#endif /* _CQ_ENET_DESC_H_ */ diff --git a/prov/usnic/src/usnic_direct/kcompat.h b/prov/usnic/src/usnic_direct/kcompat.h new file mode 100644 index 00000000000..e650e01c021 --- /dev/null +++ b/prov/usnic/src/usnic_direct/kcompat.h @@ -0,0 +1,108 @@ +/* + * Copyright 2008-2018 Cisco Systems, Inc. All rights reserved. + * Copyright 2007 Nuova Systems, Inc. All rights reserved. + * + * LICENSE_BEGIN + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * LICENSE_END + * + * + */ + +#ifndef _KCOMPAT_H_ +#define _KCOMPAT_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define EXPORT_SYMBOL(x) +#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d)) +#define ALIGN(x, a) __ALIGN_MASK(x, (typeof(x))(a)-1) +#define __ALIGN_MASK(x, mask) (((x)+(mask))&~(mask)) +#define ETH_ALEN 6 +#define BUG() assert(0) +#define BUG_ON(x) assert(!x) +#define kzalloc(x, flags) calloc(1, x) +#define kfree(x) free(x) + +#define __iomem +#define udelay usleep +#define readl ioread32 +#define writel iowrite32 + +typedef int gfp_t; + +#ifndef ARRAY_SIZE +#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) +#endif + +#ifndef offsetof +#define offsetof(t, m) ((size_t) &((t *)0)->m) +#endif + +static inline uint32_t ioread32(const volatile void *addr) +{ + return *(volatile uint32_t *)addr; +} + +static inline uint16_t ioread16(const volatile void *addr) +{ + return *(volatile uint16_t *)addr; +} + +static inline uint8_t ioread8(const volatile void *addr) +{ + return *(volatile uint8_t *)addr; +} + +static inline void iowrite64(uint64_t val, const volatile void *addr) +{ + *(volatile uint64_t *)addr = val; +} + +static inline void iowrite32(uint32_t val, const volatile void *addr) +{ + *(volatile uint32_t *)addr = val; +} + +#endif /* _KCOMPAT_H_ */ diff --git a/prov/usnic/src/usnic_direct/kcompat_priv.h b/prov/usnic/src/usnic_direct/kcompat_priv.h new file mode 100644 index 00000000000..01a7b2eca3c --- /dev/null +++ b/prov/usnic/src/usnic_direct/kcompat_priv.h @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2014, Cisco Systems, Inc. All rights reserved. + * + * LICENSE_BEGIN + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * LICENSE_END + * + * + */ + +#ifndef _KCOMPAT_PRIV_H_ +#define _KCOMPAT_PRIV_H_ + +#include +#include + +struct pci_dev; +typedef uint64_t dma_addr_t; +struct usd_device; + +int usd_alloc_mr(struct usd_device *dev, size_t size, void **vaddr_o); +int usd_free_mr(void *); +char *pci_name(struct pci_dev *pdev); + +static inline void *pci_alloc_consistent(struct pci_dev *hwdev, + size_t size, + dma_addr_t * dma_handle) +{ + int ret; + void *va; + + ret = usd_alloc_mr((struct usd_device *) hwdev, size, &va); + if (ret == 0) { + *dma_handle = (dma_addr_t) va; + return va; + } else { + return NULL; + } +} + +static inline void pci_free_consistent( __attribute__ ((unused)) + struct pci_dev *pdev, + __attribute__ ((unused)) size_t + size, void *vaddr, + __attribute__ ((unused)) dma_addr_t + dma) +{ + (void) usd_free_mr(vaddr); +} + +#define usd_err(args...) fprintf(stderr, args) +#define pr_err usd_err +#define pr_warning(args...) + +#ifndef wmb +#define wmb() asm volatile("" ::: "memory") +#endif + +#ifndef rmb +#define rmb() asm volatile("" ::: "memory") +#endif + +#endif /* _KCOMPAT_PRIV_H_ */ diff --git a/prov/usnic/src/usnic_direct/libnl1_utils.h b/prov/usnic/src/usnic_direct/libnl1_utils.h new file mode 100644 index 00000000000..a559373bf3a --- /dev/null +++ b/prov/usnic/src/usnic_direct/libnl1_utils.h @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2014, Cisco Systems, Inc. All rights reserved. + * + * LICENSE_BEGIN + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * LICENSE_END + * + * + */ + +#ifndef LIBNL1_UTILS_H +#define LIBNL1_UTILS_H + +#include +#include +#include +#include +#include + +typedef struct nl_handle NL_HANDLE; + +#define NLMSG_SIZE(size) nlmsg_msg_size(size) +#define NL_GETERROR(err) nl_geterror() +#define NL_HANDLE_ALLOC nl_handle_alloc +#define NL_HANDLE_FREE nl_handle_destroy +#define NL_DISABLE_SEQ_CHECK nl_disable_sequence_check +#define INC_CB_MSGCNT(arg) \ + do { \ + arg->msg_cnt++; \ + } while (0) + +/* + * the return value of nl_recvmsgs_default does not tell + * whether it returns because of successful read or socket + * timeout. This is a limitation in libnl1. So we compare + * message count before and after the call to decide if there + * is no new message arriving. In this case, this function + * needs to terminate to prevent the caller from + * blocking forever. + * NL_CB_MSG_IN traps every received message, so + * there should be no premature exit + */ +#define NL_RECVMSGS(nlh, cb_arg, rc, err, out) \ + do { \ + int msg_cnt = cb_arg.msg_cnt; \ + err = nl_recvmsgs_default(nlh); \ + if (err < 0) { \ + usnic_err("Failed to receive netlink reply message, error %s\n", \ + NL_GETERROR(err)); \ + goto out; \ + } \ + if (msg_cnt == cb_arg.msg_cnt) {\ + err = rc; \ + goto out; \ + } \ + } while (0) + +struct usnic_rt_cb_arg { + uint32_t nh_addr; + int oif; + int found; + int msg_cnt; + int retry; + struct usnic_nl_sk *unlsk; +}; + +/* libnl1 and libnl3 return kernel resource exhaustion in different + * ways. Use this macro to abstract the differences away. + * + * In libnl1, nl_send() will return -ECONNREFUSED. 
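+ *
+ * For illustration, a typical call site retries on resource
+ * exhaustion, roughly:
+ *
+ *     do {
+ *         USD_NL_SEND(nlh, msg, ret, retry);
+ *         if (retry)
+ *             usleep(5);
+ *     } while (retry);
+ *
+ * which matches the pattern used by usnic_nl_send_query() in
+ * libnl_utils_common.c.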
*/ +#define USD_NL_SEND(nlh, msg, ret, retry) \ + do { \ + retry = 0; \ + ret = nl_send((nlh), (msg)); \ + if (ret == -ECONNREFUSED) { \ + retry = 1; \ + } \ + } while(0); + +#endif /* LIBNL1_UTILS_H */ diff --git a/prov/usnic/src/usnic_direct/libnl3_utils.h b/prov/usnic/src/usnic_direct/libnl3_utils.h new file mode 100644 index 00000000000..efc57f8da62 --- /dev/null +++ b/prov/usnic/src/usnic_direct/libnl3_utils.h @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2014, Cisco Systems, Inc. All rights reserved. + * + * LICENSE_BEGIN + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * LICENSE_END + * + * + */ + +#ifndef LIBNL3_UTILS_H +#define LIBNL3_UTILS_H + +#include +#include +#include +#include +#include + +typedef struct nl_sock NL_HANDLE; + +#define NLMSG_SIZE(size) nlmsg_size(size) +#define NL_GETERROR(err) nl_geterror(err) +#define NL_HANDLE_ALLOC nl_socket_alloc +#define NL_HANDLE_FREE nl_socket_free +#define NL_DISABLE_SEQ_CHECK nl_socket_disable_seq_check +#define INC_CB_MSGCNT(arg) + +/* err will be returned as -NLE_AGAIN */ +/* if the socket times out */ +#define NL_RECVMSGS(nlh, cb_arg, rc, err, out) \ + do { \ + err = nl_recvmsgs_default(nlh); \ + if (err < 0) { \ + usnic_err("Failed to receive netlink reply message, error %s\n", \ + NL_GETERROR(err)); \ + if (err == -NLE_AGAIN) \ + err = rc; \ + goto out; \ + } \ + } while (0) + +struct usnic_rt_cb_arg { + uint32_t nh_addr; + int oif; + int found; + int retry; + struct usnic_nl_sk *unlsk; +}; + +/* libnl1 and libnl3 return kernel resource exhaustion in different + * ways. Use this macro to abstract the differences away. + * + * In libnl3, nl_send() will return -NLE_FAILURE and + * errno==ECONNREFUSED. 
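+ *
+ * The caller-side handling is the same as for the libnl1 variant: when
+ * retry comes back non-zero, back off briefly (e.g. usleep(5)) and
+ * resend, as done in usnic_nl_send_query() in libnl_utils_common.c.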
*/ +#define USD_NL_SEND(nlh, msg, ret, retry) \ + do { \ + retry = 0; \ + ret = nl_send((nlh), (msg)); \ + if (ret == -NLE_FAILURE && errno == ECONNREFUSED) { \ + retry = 1; \ + } \ + } while(0); + +#endif /* LIBNL3_UTILS_H */ diff --git a/prov/usnic/src/usnic_direct/libnl_utils.h b/prov/usnic/src/usnic_direct/libnl_utils.h new file mode 100644 index 00000000000..77a965d88c8 --- /dev/null +++ b/prov/usnic/src/usnic_direct/libnl_utils.h @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2014-2015, Cisco Systems, Inc. All rights reserved. + * + * LICENSE_BEGIN + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * LICENSE_END + * + * + */ + +#ifndef LIBNL_UTILS_H +#define LIBNL_UTILS_H + +#if !defined (HAVE_LIBNL3) +#error You must define HAVE_LIBNL3 to 0 or 1 before including libnl_utils.h +#elif HAVE_LIBNL3 +#include "libnl3_utils.h" +#else +#include "libnl1_utils.h" +#endif + +struct usnic_nl_sk { + NL_HANDLE *nlh; + uint32_t seq; +}; + +int usnic_nl_rt_lookup(uint32_t src_addr, uint32_t dst_addr, int oif, + uint32_t *nh_addr); +int usnic_nl_neigh_lookup(uint32_t dst_ip, int ifindex, uint8_t *n_lladdr, + uint16_t *n_state); + +#endif /* LIBNL_UTILS_H */ diff --git a/prov/usnic/src/usnic_direct/libnl_utils_common.c b/prov/usnic/src/usnic_direct/libnl_utils_common.c new file mode 100644 index 00000000000..87365b25b57 --- /dev/null +++ b/prov/usnic/src/usnic_direct/libnl_utils_common.c @@ -0,0 +1,465 @@ +/* + * Copyright (c) 2014-2016, Cisco Systems, Inc. All rights reserved. + * + * LICENSE_BEGIN + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * LICENSE_END + * + * + */ +#include +#include +#include +#include + +#include "libnl_utils.h" +#include "usnic_user_utils.h" + +#if USNIC_LOG_LVL >= USNIC_LOG_LVL_ERR +#define usnic_nlmsg_dump(msg) nl_msg_dump(msg, stderr) +#else +#define usnic_nlmsg_dump(msg) +#endif + +/* + * Querying the routing tables via netlink is expensive, especially + * when many processes are doing so at the same time on a single + * server (e.g., in an MPI job). As such, we cache netlink responses + * to alleviate pressure on the netlink kernel interface. + */ + struct usd_nl_cache_entry { + time_t timestamp; + + uint32_t src_ipaddr_be; + uint32_t dest_ipaddr_be; + int ifindex; + uint32_t nh_addr; + int reachable; + + /* For now, this cache is a simple linked list. Eventually, + * this cache should be a better data structure, such as a + * hash table. 
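+ *
+ * For illustration of how the cache is used: usnic_nl_rt_lookup()
+ * consults it before issuing a netlink query, and usd_nl_cache_lookup()
+ * prunes entries older than usd_nl_cache_timeout (120 seconds) as it
+ * walks the list, so stale results age out lazily rather than via a
+ * separate timer.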
*/ + struct usd_nl_cache_entry *prev; + struct usd_nl_cache_entry *next; +}; + +/* Semi-arbitrarily set cache TTL to 2 minutes */ +static time_t usd_nl_cache_timeout = 120; + +static struct usd_nl_cache_entry *cache = NULL; + + +static struct nla_policy route_policy[RTA_MAX+1] = { + [RTA_IIF] = { .type = NLA_STRING, + .maxlen = IFNAMSIZ, }, + [RTA_OIF] = { .type = NLA_U32 }, + [RTA_PRIORITY] = { .type = NLA_U32 }, + [RTA_FLOW] = { .type = NLA_U32 }, + [RTA_MP_ALGO] = { .type = NLA_U32 }, + [RTA_CACHEINFO] = { .minlen = sizeof(struct rta_cacheinfo) }, + [RTA_METRICS] = { .type = NLA_NESTED }, + [RTA_MULTIPATH] = { .type = NLA_NESTED }, +}; + +static int usnic_is_nlreply_expected(struct usnic_nl_sk *unlsk, + struct nlmsghdr *nlm_hdr) +{ + if (nlm_hdr->nlmsg_pid != nl_socket_get_local_port(unlsk->nlh) + || nlm_hdr->nlmsg_seq != unlsk->seq) { + usnic_err("Not an expected reply msg pid: %u local pid: %u msg seq: %u expected seq: %u\n", + nlm_hdr->nlmsg_pid, + nl_socket_get_local_port(unlsk->nlh), + nlm_hdr->nlmsg_seq, unlsk->seq); + return 0; + } + + return 1; +} + +static int usnic_is_nlreply_err(struct nlmsghdr *nlm_hdr, + struct usnic_rt_cb_arg *arg) +{ + if (nlm_hdr->nlmsg_type == NLMSG_ERROR) { + struct nlmsgerr *e = (struct nlmsgerr *)nlmsg_data(nlm_hdr); + if (nlm_hdr->nlmsg_len >= (__u32)NLMSG_SIZE(sizeof(*e))) { + usnic_strerror(e->error, + "Received a netlink error message"); + /* Sometimes nl_send() succeeds, but the + * request fails because the kernel is + * temporarily out of resources. In these + * cases, we should tell the caller that they + * should try again. */ + if (e->error == -ECONNREFUSED) { + arg->retry = 1; + } + } else + usnic_err( + "Received a truncated netlink error message\n"); + return 1; + } + + return 0; +} + +static int usnic_nl_send_query(struct usnic_nl_sk *unlsk, struct nl_msg *msg, + int protocol, int flag) +{ + int ret, retry; + struct nlmsghdr *nlhdr; + + nlhdr = nlmsg_hdr(msg); + while (1) { + nlhdr->nlmsg_pid = nl_socket_get_local_port(unlsk->nlh); + nlhdr->nlmsg_seq = ++unlsk->seq; + nlmsg_set_proto(msg, protocol); + nlhdr->nlmsg_flags = flag; + + /* Sometimes nl_send() can fail simply because the + * kernel is temporarily out of resources, and we + * should just try again. libnl1 and libnl3 handle + * this case a little differently, so use the + * USD_NL_SEND() macro to hide the differences. If + * retry comes back as true, then sleep a little and + * try again. 
*/ + USD_NL_SEND(unlsk->nlh, msg, ret, retry); + if (retry) { + usleep(5); + continue; + } + break; + } + + return ret; +} + +static int usnic_nl_set_rcvsk_timer(NL_HANDLE *nlh) +{ + int err = 0; + struct timeval timeout; + + timeout.tv_sec = 1; + timeout.tv_usec = 0; + + err = setsockopt(nl_socket_get_fd(nlh), SOL_SOCKET, SO_RCVTIMEO, + (char *)&timeout, sizeof(timeout)); + if (err < 0) + usnic_perr("Failed to set SO_RCVTIMEO for nl socket"); + + return err; +} + +static int usnic_nl_sk_alloc(struct usnic_nl_sk **p_sk, int protocol) +{ + struct usnic_nl_sk *unlsk; + NL_HANDLE *nlh; + int err; + + unlsk = calloc(1, sizeof(*unlsk)); + if (!unlsk) { + usnic_err("Failed to allocate usnic_nl_sk struct\n"); + return ENOMEM; + } + + nlh = NL_HANDLE_ALLOC(); + if (!nlh) { + usnic_err("Failed to allocate nl handle\n"); + err = ENOMEM; + goto err_free_unlsk; + } + + err = nl_connect(nlh, protocol); + if (err < 0) { + usnic_err("Failed to connnect netlink route socket error: %s\n", + NL_GETERROR(err)); + err = EINVAL; + goto err_free_nlh; + } + + NL_DISABLE_SEQ_CHECK(nlh); + err = usnic_nl_set_rcvsk_timer(nlh); + if (err < 0) + goto err_close_nlh; + + unlsk->nlh = nlh; + unlsk->seq = (uint32_t) time(NULL); + *p_sk = unlsk; + return 0; + +err_close_nlh: + nl_close(nlh); +err_free_nlh: + NL_HANDLE_FREE(nlh); +err_free_unlsk: + free(unlsk); + return err; +} + +static void usnic_nl_sk_free(struct usnic_nl_sk *unlsk) +{ + nl_close(unlsk->nlh); + NL_HANDLE_FREE(unlsk->nlh); + free(unlsk); +} + +static int usnic_rt_raw_parse_cb(struct nl_msg *msg, void *arg) +{ + struct usnic_rt_cb_arg *lookup_arg = (struct usnic_rt_cb_arg *)arg; + struct usnic_nl_sk *unlsk = lookup_arg->unlsk; + struct nlmsghdr *nlm_hdr = nlmsg_hdr(msg); + struct rtmsg *rtm; + struct nlattr *tb[RTA_MAX + 1]; + int found = 0; + int err; + + INC_CB_MSGCNT(lookup_arg); + + if (!usnic_is_nlreply_expected(unlsk, nlm_hdr)) { + usnic_nlmsg_dump(msg); + return NL_SKIP; + } + + if (usnic_is_nlreply_err(nlm_hdr, lookup_arg)) { + usnic_nlmsg_dump(msg); + return NL_SKIP; + } + + if (nlm_hdr->nlmsg_type != RTM_NEWROUTE) { + char buf[128]; + nl_nlmsgtype2str(nlm_hdr->nlmsg_type, buf, sizeof(buf)); + usnic_err("Received an invalid route request reply message type: %s\n", + buf); + usnic_nlmsg_dump(msg); + return NL_SKIP; + } + + rtm = nlmsg_data(nlm_hdr); + if (rtm->rtm_family != AF_INET) { + usnic_err("RTM message contains invalid AF family: %u\n", + rtm->rtm_family); + usnic_nlmsg_dump(msg); + return NL_SKIP; + } + + err = nlmsg_parse(nlm_hdr, sizeof(struct rtmsg), tb, RTA_MAX, + route_policy); + if (err < 0) { + usnic_err("nlmsg parse error %s\n", NL_GETERROR(err)); + usnic_nlmsg_dump(msg); + return NL_SKIP; + } + + if (tb[RTA_OIF]) { + if (nla_get_u32(tb[RTA_OIF]) == (uint32_t)lookup_arg->oif) + found = 1; + else + usnic_err("Retrieved route has a different outgoing interface %d (expected %d)\n", + nla_get_u32(tb[RTA_OIF]), + lookup_arg->oif); + } + + if (found && tb[RTA_GATEWAY]) + lookup_arg->nh_addr = nla_get_u32(tb[RTA_GATEWAY]); + + lookup_arg->found = found; + return NL_STOP; +} + + +static struct usd_nl_cache_entry * +usd_nl_cache_lookup(uint32_t src_ipaddr_be, uint32_t dest_ipaddr_be, int ifindex) +{ + time_t now; + struct usd_nl_cache_entry *nlce; + struct usd_nl_cache_entry *stale; + + now = time(NULL); + for (nlce = cache; NULL != nlce; ) { + /* While we're traversing the cache, we might as well + * remove stale entries */ + if (now > nlce->timestamp + usd_nl_cache_timeout) { + stale = nlce; + nlce = nlce->next; + + if (stale->prev) { + 
stale->prev->next = stale->next; + } + if (stale->next) { + stale->next->prev = stale->prev; + } + if (cache == stale) { + cache = nlce; + } + free(stale); + + continue; + } + + if (nlce->src_ipaddr_be == src_ipaddr_be && + nlce->dest_ipaddr_be == dest_ipaddr_be && + nlce->ifindex == ifindex) { + return nlce; + } + + nlce = nlce->next; + } + + return NULL; +} + +static void +usd_nl_cache_save(int32_t src_ipaddr_be, uint32_t dest_ipaddr_be, int ifindex, + uint32_t nh_addr, int reachable) +{ + struct usd_nl_cache_entry *nlce; + + nlce = calloc(1, sizeof(*nlce)); + if (NULL == nlce) { + return; + } + + nlce->timestamp = time(NULL); + nlce->src_ipaddr_be = src_ipaddr_be; + nlce->dest_ipaddr_be = dest_ipaddr_be; + nlce->ifindex = ifindex; + nlce->nh_addr = nh_addr; + nlce->reachable = reachable; + + nlce->next = cache; + if (cache) { + cache->prev = nlce; + } + cache = nlce; +} + + +int usnic_nl_rt_lookup(uint32_t src_addr, uint32_t dst_addr, int oif, + uint32_t *nh_addr) +{ + struct usnic_nl_sk *unlsk; + struct nl_msg *nlm; + struct rtmsg rmsg; + struct usnic_rt_cb_arg arg; + int err; + + /* See if we have this NL result cached */ + struct usd_nl_cache_entry *nlce; + nlce = usd_nl_cache_lookup(src_addr, dst_addr, oif); + if (nlce) { + if (nlce->reachable) { + *nh_addr = nlce->nh_addr; + return 0; + } else { + return EHOSTUNREACH; + } + } + +retry: + unlsk = NULL; + err = usnic_nl_sk_alloc(&unlsk, NETLINK_ROUTE); + if (err) + return err; + + memset(&rmsg, 0, sizeof(rmsg)); + rmsg.rtm_family = AF_INET; + rmsg.rtm_dst_len = sizeof(dst_addr) * CHAR_BIT; + rmsg.rtm_src_len = sizeof(src_addr) * CHAR_BIT; + + nlm = nlmsg_alloc_simple(RTM_GETROUTE, 0); + if (!nlm) { + usnic_err("Failed to alloc nl message, %s\n", + NL_GETERROR(err)); + err = ENOMEM; + goto out; + } + nlmsg_append(nlm, &rmsg, sizeof(rmsg), NLMSG_ALIGNTO); + nla_put_u32(nlm, RTA_DST, dst_addr); + nla_put_u32(nlm, RTA_SRC, src_addr); + + err = usnic_nl_send_query(unlsk, nlm, NETLINK_ROUTE, NLM_F_REQUEST); + nlmsg_free(nlm); + if (err < 0) { + usnic_err("Failed to send RTM_GETROUTE query message, error %s\n", + NL_GETERROR(err)); + err = EINVAL; + goto out; + } + + memset(&arg, 0, sizeof(arg)); + arg.oif = oif; + arg.unlsk = unlsk; + err = nl_socket_modify_cb(unlsk->nlh, NL_CB_MSG_IN, NL_CB_CUSTOM, + usnic_rt_raw_parse_cb, &arg); + if (err != 0) { + usnic_err("Failed to setup callback function, error %s\n", + NL_GETERROR(err)); + err = EINVAL; + goto out; + } + + /* Sometimes the recvmsg can fail because something is + * temporarily out of resources. In this case, delay a little + * and try again. */ + do { + err = 0; + NL_RECVMSGS(unlsk->nlh, arg, EAGAIN, err, out); + if (err == EAGAIN) { + usleep(5); + } + } while (err == EAGAIN); + + /* If we got a reply back that indicated that the kernel was + * too busy to handle this request, delay a little and try + * again. */ + if (arg.retry) { + usleep(5); + goto retry; + } + + if (arg.found) { + *nh_addr = arg.nh_addr; + err = 0; + } else { + err = EHOSTUNREACH; + } + + /* Save this result in the cache */ + usd_nl_cache_save(src_addr, dst_addr, oif, + arg.nh_addr, arg.found); + +out: + usnic_nl_sk_free(unlsk); + return err; +} + diff --git a/prov/usnic/src/usnic_direct/linux/delay.h b/prov/usnic/src/usnic_direct/linux/delay.h new file mode 100644 index 00000000000..37760d6176c --- /dev/null +++ b/prov/usnic/src/usnic_direct/linux/delay.h @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. 
+ * + * LICENSE_BEGIN + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * LICENSE_END + * + * + */ + +#ifndef _DELAY_H_ +#define _DELAY_H_ + +/* Fake header file so that we can compile kernel code in userspace. */ + +#endif /* _DELAY_H_ */ diff --git a/prov/usnic/src/usnic_direct/linux/slab.h b/prov/usnic/src/usnic_direct/linux/slab.h new file mode 100644 index 00000000000..c90c608281f --- /dev/null +++ b/prov/usnic/src/usnic_direct/linux/slab.h @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * LICENSE_BEGIN + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * LICENSE_END + * + * + */ + +#ifndef _SLAB_H_ +#define _SLAB_H_ + +/* Fake header file so that we can compile kernel code in userspace. */ + +#endif /* _SLAB_H_ */ diff --git a/prov/usnic/src/usnic_direct/linux_types.h b/prov/usnic/src/usnic_direct/linux_types.h new file mode 100644 index 00000000000..fc0cfb5fdd0 --- /dev/null +++ b/prov/usnic/src/usnic_direct/linux_types.h @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * LICENSE_BEGIN + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * LICENSE_END + * + * + */ + +#ifndef __LINUX_TYPES_H__ +#define __LINUX_TYPES_H__ + +typedef u_int8_t u8; +typedef u_int16_t u16; +typedef u_int32_t u32; +typedef u_int64_t u64; + +typedef u_int16_t __le16; +typedef u_int32_t __le32; +#define __le64 ___le64 +typedef u_int64_t __le64; + +#define le16_to_cpu +#define le32_to_cpu +#define le64_to_cpu +#define cpu_to_le16 +#define cpu_to_le32 +#define cpu_to_le64 + +#if !defined(__LIBUSNIC__) +#define rmb() asm volatile("" ::: "memory") +#endif + +#endif // __LINUX_TYPES_H__ diff --git a/prov/usnic/src/usnic_direct/rq_enet_desc.h b/prov/usnic/src/usnic_direct/rq_enet_desc.h new file mode 100644 index 00000000000..3eed402b359 --- /dev/null +++ b/prov/usnic/src/usnic_direct/rq_enet_desc.h @@ -0,0 +1,84 @@ +/* + * Copyright 2008-2010 Cisco Systems, Inc. All rights reserved. + * Copyright 2007 Nuova Systems, Inc. 
All rights reserved. + * + * LICENSE_BEGIN + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * LICENSE_END + * + * + */ + +#ifndef _RQ_ENET_DESC_H_ +#define _RQ_ENET_DESC_H_ + +/* Ethernet receive queue descriptor: 16B */ +struct rq_enet_desc { + __le64 address; + __le16 length_type; + u8 reserved[6]; +}; + +enum rq_enet_type_types { + RQ_ENET_TYPE_ONLY_SOP = 0, + RQ_ENET_TYPE_NOT_SOP = 1, + RQ_ENET_TYPE_RESV2 = 2, + RQ_ENET_TYPE_RESV3 = 3, +}; + +#define RQ_ENET_ADDR_BITS 64 +#define RQ_ENET_LEN_BITS 14 +#define RQ_ENET_LEN_MASK ((1 << RQ_ENET_LEN_BITS) - 1) +#define RQ_ENET_TYPE_BITS 2 +#define RQ_ENET_TYPE_MASK ((1 << RQ_ENET_TYPE_BITS) - 1) + +static inline void rq_enet_desc_enc(struct rq_enet_desc *desc, + u64 address, u8 type, u16 length) +{ + desc->address = cpu_to_le64(address); + desc->length_type = cpu_to_le16((length & RQ_ENET_LEN_MASK) | + ((type & RQ_ENET_TYPE_MASK) << RQ_ENET_LEN_BITS)); +} + +static inline void rq_enet_desc_dec(struct rq_enet_desc *desc, + u64 *address, u8 *type, u16 *length) +{ + *address = le64_to_cpu(desc->address); + *length = le16_to_cpu(desc->length_type) & RQ_ENET_LEN_MASK; + *type = (u8)((le16_to_cpu(desc->length_type) >> RQ_ENET_LEN_BITS) & + RQ_ENET_TYPE_MASK); +} + +#endif /* _RQ_ENET_DESC_H_ */ diff --git a/prov/usnic/src/usnic_direct/usd.h b/prov/usnic/src/usnic_direct/usd.h new file mode 100644 index 00000000000..b7f5c2adead --- /dev/null +++ b/prov/usnic/src/usnic_direct/usd.h @@ -0,0 +1,324 @@ +/* + * Copyright (c) 2014, Cisco Systems, Inc. All rights reserved. + * + * LICENSE_BEGIN + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * LICENSE_END + * + * + */ + +#ifndef _USD_H_ +#define _USD_H_ + +#include + +#include "kcompat.h" +#include "vnic_rq.h" +#include "vnic_wq.h" +#include "vnic_cq.h" +#include "wq_enet_desc.h" +#include "rq_enet_desc.h" +#include "vnic_intr.h" + +#include "usnic_abi.h" +#include "usnic_direct.h" +#include "usd_ib_sysfs.h" + +#define USD_INVALID_HANDLE (~0) +#define USD_SF_ISSET(flags, flagname) \ + ((flags >> USD_SFS_##flagname) & 1) + +#define USD_SEND_MAX_COPY 992 +#define USD_MAX_PRESEND 4 + +#define USD_CTXF_CLOSE_CMD_FD (1u << 0) +#define USD_DEVF_CLOSE_CTX (1u << 0) + +#ifndef USD_DEBUG +#define USD_DEBUG 0 +#endif + +/* + * Group interrupt vector userspace map info + */ +struct usd_grp_vect_map { + void *va; + size_t len; + uint32_t vfid; +}; + +/* + * structure storing interrupt resource and its mapping to FD + */ +struct usd_cq_comp_intr { + struct vnic_intr uci_vintr; + int uci_offset; + int uci_refcnt; + LIST_ENTRY(usd_cq_comp_intr) uci_ctx_link; +}; + +/* + * Instance of a usd context, corresponding to an + * opened libibverbs context + */ +struct usd_context { + struct usd_ib_dev *ucx_ib_dev; /* parent IB dev */ + int ucx_ib_dev_fd; /* file desc for IB dev */ + int ucmd_ib_dev_fd; /* Another open file descriptor for IB dev + * used for encapsulating user commands + * through GET_CONTEXT IB command */ + + uint32_t ucx_flags; + int ucx_caps[USD_CAP_MAX]; /* device capabilities */ + + pthread_mutex_t ucx_mutex; /* protect intr_list */ + LIST_HEAD(intr_head, usd_cq_comp_intr) ucx_intr_list; + + /* Remove these after moving ud_attrs here */ + int event_fd; + unsigned num_comp_vectors; +}; + +/* + * Instance of a device opened by user + */ +struct usd_device { + struct usd_context *ud_ctx; + + uint32_t ud_flags; + struct usd_device_attrs ud_attrs; /* TODO move this to usd_ctx */ + + /* VFs we have associated with this device */ + struct usd_vf *ud_vf_list; + + /* PD for this device */ + uint32_t ud_pd_handle; + + /* destination related */ + int ud_arp_sockfd; /* 
for ARP */ + TAILQ_HEAD(, usd_dest_req) ud_pending_reqs; + TAILQ_HEAD(, usd_dest_req) ud_completed_reqs; + + TAILQ_ENTRY(usd_device) ud_link; + + struct usd_grp_vect_map grp_vect_map; +}; + +/* + * Registered memory region + */ +struct usd_mr { + struct usd_device *umr_dev; + void *umr_vaddr; + uint32_t umr_handle; + uint32_t umr_lkey; + uint32_t umr_rkey; + size_t umr_length; +}; + +/* + * Information about the PCI virtual function + */ +struct usd_vf { + uint32_t vf_id; + int vf_refcnt; + struct vnic_dev_bar vf_bar0; + size_t vf_bar_map_len; + struct vnic_dev *vf_vdev; + struct vnic_dev_iomap_info iomaps[RES_TYPE_MAX]; + + /* Will also protect the devcmd region */ + pthread_mutex_t vf_lock; + struct usd_vf *vf_next; + struct usd_vf *vf_prev; +}; + +/* + * Holding place for information about a VF + */ +struct usd_vf_info { + uint32_t vi_vfid; + dma_addr_t vi_bar_bus_addr; + uint32_t vi_bar_len; + size_t vi_barhead_len; + struct usnic_vnic_barres_info barres[RES_TYPE_MAX]; +}; + +/* + * Internal representation of a filter + */ +struct usd_qp_filter { + enum usd_filter_type qf_type; + union { + struct { + int u_sockfd; + } qf_udp; + } qf_filter; +}; + +/* + * Definitions and structures about queues + */ + +/* + * this is used to keep track of what has been allocated and/or + * initialized to assist with teardown of partially completed queues + */ +enum usd_qstate { + USD_QS_FILTER_ALLOC = (1 << 0), + USD_QS_VERBS_CREATED = (1 << 1), + USD_QS_VF_MAPPED = (1 << 2), + USD_QS_VNIC_ALLOCATED = (1 << 3), + USD_QS_VNIC_INITIALIZED = (1 << 4), + USD_QS_READY = (1 << 5) +}; + +struct usd_cq_impl { + struct usd_cq ucq_cq; + struct usd_device *ucq_dev; + struct usd_vf *ucq_vf; + + uint32_t ucq_state; + + struct vnic_cq ucq_vnic_cq; + + void *ucq_desc_ring; + uint32_t ucq_next_desc; + uint32_t ucq_last_color; + + uint32_t ucq_index; + uint32_t ucq_num_entries; + uint32_t ucq_cqe_mask; + uint32_t ucq_color_shift; + uint32_t ucq_handle; + + int comp_fd; + int comp_vec; + int comp_req_notify; + int intr_offset; + struct usd_cq_comp_intr *ucq_intr; + + struct usd_rq **ucq_rq_map; + struct usd_wq **ucq_wq_map; +}; +#define to_cqi(CQ) ((struct usd_cq_impl *)(CQ)) +#define to_usdcq(CQ) (&(CQ)->ucq_cq) + +struct usd_rq { + struct usd_cq_impl *urq_cq; + uint32_t urq_state; + + uint32_t urq_index; + uint32_t urq_num_entries; + struct vnic_rq urq_vnic_rq; + + void **urq_context; + + char *urq_rxbuf; + char **urq_post_addr; + uint32_t urq_recv_credits; /* number of available descriptors */ + struct rq_enet_desc *urq_desc_ring; + struct rq_enet_desc *urq_next_desc; + uint32_t urq_post_index; /* next rxbuf to post */ + uint32_t urq_post_index_mask; + uint32_t urq_last_comp; + uint32_t urq_accum_bytes; + + uint32_t urq_num_rxbuf; + uint32_t urq_rxbuf_size; +}; + +struct usd_wq_post_info { + void *wp_context; + uint32_t wp_len; +}; + +struct usd_wq { + struct usd_cq_impl *uwq_cq; + uint32_t uwq_state; + struct vnic_wq uwq_vnic_wq; + + uint32_t uwq_index; + uint32_t uwq_num_entries; + uint32_t uwq_send_credits; + struct wq_enet_desc *uwq_desc_ring; + struct wq_enet_desc *uwq_next_desc; + uint32_t uwq_post_index; + uint32_t uwq_post_index_mask; + uint32_t uwq_last_comp; + + uint8_t *uwq_copybuf; + struct usd_wq_post_info *uwq_post_info; + + /* used only for PIO QPs */ + void *pio_v_wq_addr; + uint64_t pio_p_wq_addr; + char *pio_v_pkt_buf; + uint64_t pio_p_pkt_buf; +}; + +struct usd_qp_impl { + struct usd_qp uq_qp; /* user's view of QP */ + + struct usd_device *uq_dev; + struct usd_vf *uq_vf; + + struct usd_qp_attrs 
uq_attrs; + + uint32_t uq_qp_handle; /* IB QP handle */ + uint32_t uq_qp_num; + + /* primary filter for this QP */ + struct usd_qp_filter uq_filter; + + struct usd_wq uq_wq; + struct usd_rq uq_rq; +}; +#define to_qpi(Q) ((struct usd_qp_impl *)(Q)) +#define to_usdqp(Q) (&(Q)->uq_qp) + +struct usd_dest { + union { + struct { + struct usd_udp_hdr u_hdr; + } ds_udp; + } ds_dest; +}; + +extern struct usd_qp_ops usd_qp_ops_ud_udp; +extern struct usd_qp_ops usd_qp_ops_ud_pio_udp; +extern struct usd_qp_ops usd_qp_ops_ud_raw; +#endif /* _USD_H_ */ diff --git a/prov/usnic/src/usnic_direct/usd_caps.c b/prov/usnic/src/usnic_direct/usd_caps.c new file mode 100644 index 00000000000..c1c7d4068a6 --- /dev/null +++ b/prov/usnic/src/usnic_direct/usd_caps.c @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2014, Cisco Systems, Inc. All rights reserved. + * + * LICENSE_BEGIN + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * LICENSE_END + * + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "usnic_direct.h" +#include "usd.h" + +int +usd_get_cap( + struct usd_device *dev, + enum usd_capability cap) +{ + if (cap >= USD_CAP_MAX) { + return 0; + } + + return dev->ud_ctx->ucx_caps[cap]; +} + diff --git a/prov/usnic/src/usnic_direct/usd_caps.h b/prov/usnic/src/usnic_direct/usd_caps.h new file mode 100644 index 00000000000..a24128934bd --- /dev/null +++ b/prov/usnic/src/usnic_direct/usd_caps.h @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2014, Cisco Systems, Inc. All rights reserved. + * + * LICENSE_BEGIN + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * LICENSE_END + * + * + */ + +#ifndef _USD_CAPS_H_ +#define _USD_CAPS_H_ + +int usd_read_caps(struct usd_device *dev); + +#endif /* _USD_CAPS_H_ */ diff --git a/prov/usnic/src/usnic_direct/usd_dest.c b/prov/usnic/src/usnic_direct/usd_dest.c new file mode 100644 index 00000000000..a1c42a9e355 --- /dev/null +++ b/prov/usnic/src/usnic_direct/usd_dest.c @@ -0,0 +1,595 @@ +/* + * Copyright (c) 2014, Cisco Systems, Inc. All rights reserved. + * + * LICENSE_BEGIN + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * LICENSE_END + * + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "usnic_ip_utils.h" +#include "libnl_utils.h" + +#include "usnic_direct.h" +#include "usd.h" +#include "usd_queue.h" +#include "usd_time.h" +#include "usd_dest.h" +#include "usd_socket.h" + +extern TAILQ_HEAD(, usd_device) usd_device_list; + +static struct usd_dest_params usd_dest_params = { + .dp_arp_timeout = 1000, + .dp_max_arps = 10 +}; + +int +usd_get_dest_distance( + struct usd_device *dev, + uint32_t daddr_be, + int *metric_o) +{ + uint32_t nh_ip_addr; + int ret; + + ret = usnic_nl_rt_lookup(dev->ud_attrs.uda_ipaddr_be, daddr_be, + dev->ud_attrs.uda_ifindex, &nh_ip_addr); + if (ret != 0) { + *metric_o = -1; + ret = 0; + } else if (nh_ip_addr == 0) { + *metric_o = 0; + } else { + *metric_o = 1; + } + + return ret; +} + +static void +usd_dest_set_complete( + struct usd_device *dev, + struct usd_dest_req *req) +{ + req->udr_complete = 1; + if (req->udr_status != 0 && req->udr_dest != NULL) { + free(req->udr_dest); + req->udr_dest = NULL; + } + TAILQ_REMOVE(&dev->ud_pending_reqs, req, udr_link); + TAILQ_INSERT_TAIL(&dev->ud_completed_reqs, req, udr_link); +} + +static int +usd_dest_trigger_arp( + struct usd_device *dev, + struct usd_dest_req *req) +{ + int ret; + + usd_get_time(&req->udr_last_arp); + req->udr_arps_sent++; + + ret = + usnic_arp_request(req->udr_daddr_be, dev->ud_arp_sockfd); + return ret; +} + +static int +usd_check_dest_resolved( + struct usd_device *dev, + struct usd_dest_req *req) +{ + struct ether_header *eth; + int ret; + + eth = &req->udr_dest->ds_dest.ds_udp.u_hdr.uh_eth; + ret = usnic_arp_lookup(dev->ud_attrs.uda_ifname, + req->udr_daddr_be, dev->ud_arp_sockfd, + ð->ether_dhost[0]); + + if (ret == EAGAIN) + return -EAGAIN; + + /* for better or worse, resolution is complete */ + req->udr_status = -ret; + return 0; +} + +/* + * Loop through the ordered pending create_dest request queue. + * If an entry is complete, move it to the completed queue. + * If the retry timeout for an entry has arrived, re-trigger the ARP + */ +static void +usd_dest_progress_dev( + struct usd_device *dev) +{ + struct usd_dest_req *req; + struct usd_dest_req *tmpreq; + usd_time_t now; + int delta; + int ret; + + usd_get_time(&now); + + TAILQ_FOREACH_SAFE(req, tmpreq, &dev->ud_pending_reqs, udr_link) { + + /* resolution complete? */ + ret = usd_check_dest_resolved(dev, req); + if (ret == 0) { + usd_dest_set_complete(dev, req); + continue; + } + + + /* time for next ARP trigger? 
*/ + delta = usd_time_diff(req->udr_last_arp, now); + if (delta > (int) usd_dest_params.dp_arp_timeout) { + if (req->udr_arps_sent >= usd_dest_params.dp_max_arps) { + req->udr_status = -EHOSTUNREACH; + usd_dest_set_complete(dev, req); + continue; + } + + ret = usd_dest_trigger_arp(dev, req); + if (ret != 0) { + req->udr_status = ret; + usd_dest_set_complete(dev, req); + } + } + } +} + +static void +usd_dest_progress(void) +{ + struct usd_device *dev; + + TAILQ_FOREACH(dev, &usd_device_list, ud_link) { + usd_dest_progress_dev(dev); + } +} + +/* + * Fill in all of a header except the dest MAC and the UDP ports + * specified remote host + */ +void +usd_fill_udp_dest( + struct usd_dest *dest, + struct usd_device_attrs *dap, + uint32_t daddr_be, + uint16_t dport_be) +{ + struct ether_header eth = { + .ether_type = htons(0x0800) + }; + + struct udphdr udp = { + .dest = dport_be + }; + + struct iphdr ip = { + .saddr = dap->uda_ipaddr_be, + .daddr = daddr_be, + .protocol = IPPROTO_UDP, + .version = 4, + .frag_off = 0, + .ihl = 5, /* no options */ + .tos = 0, + .ttl = 8 + }; + + /* Workaround taking a pointer to an element of a packed structure due to + * warnings in Clang 4.0.1 and beyond. + */ + memcpy(eth.ether_shost, dap->uda_mac_addr, ETH_ALEN); + dest->ds_dest.ds_udp.u_hdr.uh_eth = eth; + dest->ds_dest.ds_udp.u_hdr.uh_udp = udp; + dest->ds_dest.ds_udp.u_hdr.uh_ip = ip; +} + +static int +usd_create_udp_dest_start( + struct usd_device *dev, + uint32_t daddr_be, + uint16_t dport_be, + struct usd_dest_req **req_o) +{ + struct usd_dest_req *req; + struct usd_dest *dest; + uint32_t first_hop_daddr_be; + int ret; + + /* catch a mistake that will almost always lead to hung programs */ + if (daddr_be == 0 || dport_be == 0) { + return -EINVAL; + } + + req = calloc(sizeof(*req), 1); + dest = calloc(sizeof(*dest), 1); + if (req == NULL || dest == NULL) { + ret = -errno; + goto fail; + } + + ret = usnic_nl_rt_lookup(dev->ud_attrs.uda_ipaddr_be, + daddr_be, dev->ud_attrs.uda_ifindex, + &first_hop_daddr_be); + if (ret != 0) { + /* EHOSTUNREACH is non-fatal, but we are done with resolution */ + if (ret == EHOSTUNREACH) { + req->udr_status = -EHOSTUNREACH; + free(dest); + goto complete; + } else { + ret = -ret; + } + goto fail; + } + if (first_hop_daddr_be == 0) + first_hop_daddr_be = daddr_be; + + /* Fill in dest as much as we can */ + usd_fill_udp_dest(dest, &dev->ud_attrs, daddr_be, dport_be); + + /* initiate request and add to tail of pending list */ + req->udr_daddr_be = first_hop_daddr_be; + req->udr_dest = dest; + + ret = usd_dest_trigger_arp(dev, req); + if (ret != 0) + goto fail; + +complete: + TAILQ_INSERT_TAIL(&dev->ud_pending_reqs, req, udr_link); + if (req->udr_status != 0) { + usd_dest_set_complete(dev, req); + } + *req_o = req; + + return 0; + + fail: + if (req != NULL) + free(req); + if (dest != NULL) + free(dest); + return ret; +} + + +/* + * synchronously create a UDP destination by initiating the + * resolution, then waiting for it to complete + */ +static int +usd_create_udp_dest( + struct usd_device *dev, + uint32_t daddr_be, + uint16_t dport_be, + struct usd_dest **dest_o) +{ + struct usd_dest_req *req; + int ret; + + ret = usd_create_udp_dest_start(dev, daddr_be, dport_be, &req); + if (ret != 0) + return ret; + + /* loop until request completes or times out */ + while (req->udr_complete == 0) { + usd_dest_progress(); + } + + ret = req->udr_status; + if (ret == 0) + *dest_o = req->udr_dest; + + TAILQ_REMOVE(&dev->ud_completed_reqs, req, udr_link); + free(req); + return ret; +} + +/* + * 
Build and save an IP header appropriate for sending to the + * specified remote host + */ +int +usd_create_ip_dest( + struct usd_device *dev, + uint32_t daddr_be, + struct usd_dest **dest_o) +{ + int ret; + + ret = usd_create_udp_dest(dev, daddr_be, 0, dest_o); + return ret; +} + +void +usd_dest_set_udp_ports( + struct usd_dest *dest, + struct usd_qp *src_uqp, + uint16_t dest_port_be) +{ + struct usd_qp_impl *qp = to_qpi(src_uqp); + struct udphdr udp = { + .source = qp->uq_attrs.uqa_local_addr.ul_addr.ul_udp.u_addr.sin_port, + .dest = dest_port_be + }; + + /* Workaround taking a pointer to an element of a packed structure due to + * warnings in Clang 4.0.1 and beyond. + */ + dest->ds_dest.ds_udp.u_hdr.uh_udp = udp; +} + +/* + * Synchronously creates a destination + */ +int +usd_create_dest( + struct usd_device *dev, + uint32_t daddr_be, + uint16_t dport_be, + struct usd_dest **dest_o) +{ + int ret; + + ret = usd_create_udp_dest(dev, daddr_be, dport_be, dest_o); + + return ret; +} + +int +usd_destroy_dest( + struct usd_dest *dest) +{ + if (dest != NULL) { + free(dest); + } + return 0; +} + +/* + * Get address resolution settings + */ +int +usd_get_dest_params( + struct usd_dest_params *params) +{ + if (params == NULL) + return -EINVAL; + + *params = usd_dest_params; + return 0; +} + +/* + * Set address resolution settings + * Settings may not be changed while any resolution requests are in progress. + */ +int +usd_set_dest_params( + struct usd_dest_params *params) +{ + if (params == NULL) + return -EINVAL; + + /* blindly set parameters, allowing user to shoot self if desired */ + usd_dest_params.dp_arp_timeout = params->dp_arp_timeout; + usd_dest_params.dp_max_arps = params->dp_max_arps; + + return 0; +} + +/* + * Start destination creation + * Resolution progress is performed in usd_create_dest_query() and + * usd_create_dest_poll() + */ +int +usd_create_dest_start( + struct usd_device *dev, + uint32_t daddr_be, + uint16_t dport_be, + void *context) +{ + struct usd_dest_req *req; + int ret; + + req = NULL; + ret = usd_create_udp_dest_start(dev, daddr_be, dport_be, &req); + + if (ret == 0) { + req->udr_context = context; + } + + return ret; +} + +/* + * Return first completed destination request + */ +int +usd_create_dest_poll( + struct usd_device *dev, + void **context_o, + int *status, + struct usd_dest **dest_o) +{ + struct usd_dest_req *req; + + usd_dest_progress(); + + if (!TAILQ_EMPTY(&dev->ud_completed_reqs)) { + req = TAILQ_FIRST(&dev->ud_completed_reqs); + TAILQ_REMOVE(&dev->ud_completed_reqs, req, udr_link); + *context_o = req->udr_context; + *status = req->udr_status; + if (*status == 0) + *dest_o = req->udr_dest; + free(req); + return 0; + + } else { + return -EAGAIN; + } +} + +/* + * Check completion of a particular request + */ +int +usd_create_dest_query( + struct usd_device *dev, + void *context, + int *status, + struct usd_dest **dest_o) +{ + struct usd_dest_req *req; + + usd_dest_progress(); + + TAILQ_FOREACH(req, &dev->ud_completed_reqs, udr_link) { + if (req->udr_context == context) { + TAILQ_REMOVE(&dev->ud_completed_reqs, req, udr_link); + *status = req->udr_status; + if (*status == 0) + *dest_o = req->udr_dest; + free(req); + return 0; + } + } + + return -EAGAIN; +} + +/* + * Cancel a destination creation in progress + * Look through both the pending and completed queues, simply + * squash the record if we find it. 
+ */ +int +usd_create_dest_cancel( + struct usd_device *dev, + void *context) +{ + struct usd_dest_req *req; + + TAILQ_FOREACH(req, &dev->ud_pending_reqs, udr_link) { + if (req->udr_context == context) { + TAILQ_REMOVE(&dev->ud_pending_reqs, req, udr_link); + goto found; + } + } + + TAILQ_FOREACH(req, &dev->ud_completed_reqs, udr_link) { + if (req->udr_context == context) { + TAILQ_REMOVE(&dev->ud_completed_reqs, req, udr_link); + goto found; + } + } + + return -EINVAL; + + found: + free(req->udr_dest); + free(req); + return 0; +} + +/* + * Create a destination given a MAC address + */ +int +usd_create_dest_with_mac( + struct usd_device *dev, + uint32_t daddr_be, + uint16_t dport_be, + uint8_t * dmac, + struct usd_dest **dest_o) +{ + struct ether_header *eth; + struct usd_dest *dest; + + dest = calloc(sizeof(*dest), 1); + if (dest == NULL) + return -errno; + + /* Fill in dest as much as we can */ + usd_fill_udp_dest(dest, &dev->ud_attrs, daddr_be, dport_be); + + /* copy in MAC from caller */ + eth = &dest->ds_dest.ds_udp.u_hdr.uh_eth; + memcpy(ð->ether_dhost[0], dmac, ETH_ALEN); + + *dest_o = dest; + return 0; +} + +/* + * Expand a destination + */ +int +usd_expand_dest( + struct usd_dest *dest, + uint32_t *ip_be_o, + uint16_t *port_be_o) +{ + if (ip_be_o != NULL) { + *ip_be_o = dest->ds_dest.ds_udp.u_hdr.uh_ip.daddr; + } + if (port_be_o != NULL) { + *port_be_o = dest->ds_dest.ds_udp.u_hdr.uh_udp.dest; + } + + return 0; +} diff --git a/prov/usnic/src/usnic_direct/usd_dest.h b/prov/usnic/src/usnic_direct/usd_dest.h new file mode 100644 index 00000000000..8322e66b08a --- /dev/null +++ b/prov/usnic/src/usnic_direct/usd_dest.h @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2014, Cisco Systems, Inc. All rights reserved. + * + * LICENSE_BEGIN + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * LICENSE_END + * + * + * definitions for address resolution + */ + +#ifndef _USD_DEST_H_ +#define _USD_DEST_H_ + +#include "usd_queue.h" +#include "usd_time.h" + +/* + * Record describing an address resolution in progress + */ +typedef struct usd_dest_req udr_t; +struct usd_dest_req { + struct usd_dest *udr_dest; + + uint32_t udr_daddr_be; + + unsigned udr_arps_sent; + usd_time_t udr_last_arp; /* time of last */ + + int udr_complete; + int udr_status; + + void *udr_context; + + TAILQ_ENTRY(usd_dest_req) udr_link; +}; + +void usd_fill_udp_dest(struct usd_dest *dest, struct usd_device_attrs *dap, + uint32_t daddr_be, uint16_t dport_be); + +#endif /* _USD_DEST_H_ */ diff --git a/prov/usnic/src/usnic_direct/usd_device.c b/prov/usnic/src/usnic_direct/usd_device.c new file mode 100644 index 00000000000..280467fad30 --- /dev/null +++ b/prov/usnic/src/usnic_direct/usd_device.c @@ -0,0 +1,694 @@ +/* + * Copyright (c) 2014-2016, Cisco Systems, Inc. All rights reserved. + * + * LICENSE_BEGIN + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * LICENSE_END + * + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "usnic_direct.h" +#include "usd.h" +#include "usd_ib_sysfs.h" +#include "usd_ib_cmd.h" +#include "usd_socket.h" +#include "usd_device.h" + +static pthread_once_t usd_init_once = PTHREAD_ONCE_INIT; + +static struct usd_ib_dev *usd_ib_dev_list; +static int usd_init_error; + +TAILQ_HEAD(,usd_device) usd_device_list = + TAILQ_HEAD_INITIALIZER(usd_device_list); + +/* + * Perform one-time initialization + */ +static void +do_usd_init(void) +{ + usd_init_error = usd_ib_get_devlist(&usd_ib_dev_list); +} + +/* + * Unmap group vector when releasing usd_dev + */ +static void +usd_unmap_grp_vect(struct usd_device *dev) +{ + if (dev->grp_vect_map.va != NULL) { + munmap(dev->grp_vect_map.va, dev->grp_vect_map.len); + dev->grp_vect_map.va = NULL; + } +} + +/* + * Init routine + */ +static int +usd_init(void) +{ + /* Do initialization one time */ + pthread_once(&usd_init_once, do_usd_init); + return usd_init_error; +} + +/* + * Return list of currently available devices + */ +int +usd_get_device_list( + struct usd_device_entry *entries, + int *num_entries) +{ + int n; + struct usd_ib_dev *idp; + int ret; + + n = 0; + + ret = usd_init(); + if (ret != 0) { + goto out; + } + +#pragma GCC diagnostic push +#if defined(__GNUC__) && (__GNUC__ >= 8) +#pragma GCC diagnostic ignored "-Wstringop-truncation" +#endif + idp = usd_ib_dev_list; + while (idp != NULL && n < *num_entries) { + strncpy(entries[n].ude_devname, idp->id_usnic_name, + sizeof(entries[n].ude_devname) - 1); + ++n; + idp = idp->id_next; + } +#pragma GCC diagnostic pop + +out: + *num_entries = n; + return ret; +} + +/* + * Allocate a context from the driver + */ +static int +usd_open_ibctx(struct usd_context *uctx) +{ + int ret; + + ret = usd_ib_cmd_get_context(uctx); + return ret; +} + +const char * +usd_devid_to_pid(uint32_t vendor_id, uint32_t device_id) +{ + const char *pid; + + if (vendor_id != 0x1137) + return "Unknown"; + + switch (device_id) { + case 0x4f: + // Vasona + pid = "UCSC-VIC-M82-8P"; + break; + case 0x84: + // Cotati + pid = "UCSB-MLOM-40G-01"; + break; + case 0x85: + // Lexington + pid = "UCSC-PCIE-CSC-02"; + break; + case 0xcd: + // Icehouse + pid = "UCSC-PCIE-C40Q-02"; + break; + case 0xce: + // Kirkwood Lake + pid = "UCSC-PCIE-C10T-02"; + break; + case 0x12e: + // Susanville MLOM + pid = "UCSC-MLOM-CSC-02"; + break; + case 0x139: + // Torrance MLOM + pid = "UCSC-MLOM-C10T-02"; + break; + + case 0x12c: + // Calistoga MLOM + pid = "UCSB-MLOM-40G-03"; + break; + case 0x137: + // Mountain View (Cruz mezz) + pid = "UCSB-VIC-M83-8P"; + break; + case 0x138: + // Walnut Creek + pid = "UCSB-B3116S-LOM"; + break; + case 0x14b: + // Mount Tian + pid = "UCSC-C3260-SIOC"; + break; + case 0x14d: + // Clearlake + pid = "UCSC-PCIE-C40Q-03"; + break; + case 0x157: + // Mount Tian2 + pid = "UCSC-C3260-SIOC"; + break; + case 0x15d: + // Claremont MLOM + pid = "UCSC-MLOM-C40Q-03"; + break; + + case 0x0218: + // Bradbury + pid = "UCSC-MLOM-C25Q-04"; + break; + case 0x0217: + // Brentwood + pid = "UCSC-PCIE-C25Q-04"; + break; + case 0x021a: + // Burlingame + pid = "UCSC-MLOM-C40Q-04"; + break; + case 0x0219: + // Bayside + pid = "UCSC-PCIE-C40Q-04"; + break; + case 0x0215: + // Bakersfield + pid = "UCSB-MLOM-40G-04"; + break; + case 0x0216: + // Boonville + pid = "UCSB-VIC-M84-4P"; + break; + case 0x024a: + // Benicia + pid = "UCSC-PCIE-C100-04"; + break; + case 0x024b: + // Beaumont + pid = 
"UCSC-MLOM-C100-04"; + break; + + default: + pid = "Unknown Cisco Device"; + break; + } + + return pid; +} + +const char * +usd_devid_to_nicname(uint32_t vendor_id, uint32_t device_id) +{ + const char *nicname; + + if (vendor_id != 0x1137) + return "Unknown"; + + switch (device_id) { + case 0x4f: + // Vasona + nicname = "VIC 1280"; + break; + case 0x84: + // Cotati + nicname = "VIC 1240"; + break; + case 0x85: + // Lexington + nicname = "VIC 1225"; + break; + case 0xcd: + // Icehouse + nicname = "VIC 1285"; + break; + case 0xce: + // Kirkwood Lake + nicname = "VIC 1225T"; + break; + case 0x12e: + // Susanville MLOM + nicname = "VIC 1227"; + break; + case 0x139: + // Torrance MLOM + nicname = "VIC 1227T"; + break; + + case 0x12c: + // Calistoga MLOM + nicname = "VIC 1340"; + break; + case 0x137: + // Mountain View (Cruz mezz) + nicname = "VIC 1380"; + break; + case 0x138: + // Walnut Creek + nicname = "UCSB-B3116S"; + break; + case 0x14b: + // Mount Tian + nicname = ""; + break; + case 0x14d: + // Clearlake + nicname = "VIC 1385"; + break; + case 0x157: + // Mount Tian2 + nicname = ""; + break; + case 0x15d: + // Claremont MLOM + nicname = "VIC 1387"; + break; + + case 0x0218: + // Bradbury + nicname = "VIC 1457"; + break; + case 0x0217: + // Brentwood + nicname = "VIC 1455"; + break; + case 0x021a: + // Burlingame + nicname = "VIC 1487"; + break; + case 0x0219: + // Bayside + nicname = "VIC 1485"; + break; + case 0x0215: + // Bakersfield + nicname = "VIC 1440"; + break; + case 0x0216: + // Boonville + nicname = "VIC 1480"; + break; + case 0x024a: + // Benicia + nicname = "VIC 1495"; + break; + case 0x024b: + // Beaumont + nicname = "VIC 1497"; + break; + + default: + nicname = "Unknown Cisco Device"; + break; + } + + return nicname; +} + +/* + * Rummage around and collect all the info about this device we can find + */ +static int +usd_discover_device_attrs( + struct usd_device *dev, + const char *dev_name) +{ + struct usd_device_attrs *dap; + int ret; + + /* find interface name */ + ret = usd_get_iface(dev); + if (ret != 0) + return ret; + + ret = usd_get_mac(dev, dev->ud_attrs.uda_mac_addr); + if (ret != 0) + return ret; + + ret = usd_get_usnic_config(dev); + if (ret != 0) + return ret; + + ret = usd_get_firmware(dev); + if (ret != 0) + return ret; + + /* ipaddr, netmask, mtu */ + ret = usd_get_dev_if_info(dev); + if (ret != 0) + return ret; + + /* get what attributes we can from querying IB */ + ret = usd_ib_query_dev(dev); + if (ret != 0) + return ret; + + /* constants that should come from driver */ + dap = &dev->ud_attrs; + dap->uda_max_cqe = (1 << 16) - 1;; + dap->uda_max_send_credits = (1 << 12) - 1; + dap->uda_max_recv_credits = (1 << 12) - 1; + strncpy(dap->uda_devname, dev_name, sizeof(dap->uda_devname) - 1); + + return 0; +} + +static void +usd_dev_free(struct usd_device *dev) +{ + if (dev->ud_arp_sockfd != -1) + close(dev->ud_arp_sockfd); + + if (dev->ud_ctx != NULL && + (dev->ud_flags & USD_DEVF_CLOSE_CTX)) { + usd_close_context(dev->ud_ctx); + } + free(dev); +} + +/* + * Allocate a usd_device without allocating a PD + */ +static int +usd_dev_alloc_init(const char *dev_name, struct usd_open_params *uop_param, + struct usd_device **dev_o) +{ + struct usd_device *dev = NULL; + int ret; + + dev = calloc(sizeof(*dev), 1); + if (dev == NULL) { + ret = -errno; + goto out; + } + + dev->ud_flags = 0; + if (uop_param->context == NULL) { + ret = usd_open_context(dev_name, uop_param->cmd_fd, + &dev->ud_ctx); + if (ret != 0) { + goto out; + } + dev->ud_flags |= USD_DEVF_CLOSE_CTX; + } 
else { + dev->ud_ctx = uop_param->context; + } + + dev->ud_arp_sockfd = -1; + + TAILQ_INIT(&dev->ud_pending_reqs); + TAILQ_INIT(&dev->ud_completed_reqs); + + if (uop_param->context == NULL) + ret = usd_discover_device_attrs(dev, dev_name); + else + ret = usd_discover_device_attrs(dev, + uop_param->context->ucx_ib_dev->id_usnic_name); + if (ret != 0) + goto out; + + dev->ud_attrs.uda_event_fd = dev->ud_ctx->event_fd; + dev->ud_attrs.uda_num_comp_vectors = dev->ud_ctx->num_comp_vectors; + + if (!(uop_param->flags & UOPF_SKIP_LINK_CHECK)) { + ret = usd_device_ready(dev); + if (ret != 0) { + goto out; + } + } + + *dev_o = dev; + return 0; + +out: + if (dev != NULL) + usd_dev_free(dev); + return ret; +} + +int +usd_close_context(struct usd_context *ctx) +{ + pthread_mutex_destroy(&ctx->ucx_mutex); + + /* XXX - verify all other resources closed out */ + if (ctx->ucx_flags & USD_CTXF_CLOSE_CMD_FD) + close(ctx->ucx_ib_dev_fd); + if (ctx->ucmd_ib_dev_fd != -1) + close(ctx->ucmd_ib_dev_fd); + + free(ctx); + + return 0; +} + +int +usd_open_context(const char *dev_name, int cmd_fd, + struct usd_context **ctx_o) +{ + struct usd_context *ctx = NULL; + struct usd_ib_dev *idp; + int ret; + + if (dev_name == NULL) + return -EINVAL; + + ret = usd_init(); + if (ret != 0) { + return ret; + } + + /* Look for matching device */ + idp = usd_ib_dev_list; + while (idp != NULL) { + if (dev_name == NULL || strcmp(idp->id_usnic_name, dev_name) == 0) { + break; + } + idp = idp->id_next; + } + + /* not found, leave now */ + if (idp == NULL) { + ret = -ENXIO; + goto out; + } + + /* + * Found matching device, open an instance + */ + ctx = calloc(sizeof(*ctx), 1); + if (ctx == NULL) { + ret = -errno; + goto out; + } + ctx->ucx_ib_dev_fd = -1; + ctx->ucmd_ib_dev_fd = -1; + ctx->ucx_flags = 0; + + /* Save pointer to IB device */ + ctx->ucx_ib_dev = idp; + + /* Open the fd we will be using for IB commands */ + if (cmd_fd == -1) { + ctx->ucx_ib_dev_fd = open(idp->id_dev_path, O_RDWR); + if (ctx->ucx_ib_dev_fd == -1) { + ret = -ENODEV; + goto out; + } + ctx->ucx_flags |= USD_CTXF_CLOSE_CMD_FD; + } else { + ctx->ucx_ib_dev_fd = cmd_fd; + } + + /* + * Open another fd to send encapsulated user commands through + * CMD_GET_CONTEXT call. The reason to open an additional fd is + * that ib core does not allow multiple get_context call on one + * file descriptor. 
+ */ + ctx->ucmd_ib_dev_fd = open(idp->id_dev_path, O_RDWR | O_CLOEXEC); + if (ctx->ucmd_ib_dev_fd == -1) { + ret = -ENODEV; + goto out; + } + + /* allocate a context from driver */ + ret = usd_open_ibctx(ctx); + if (ret != 0) { + goto out; + } + + LIST_INIT(&ctx->ucx_intr_list); + if (pthread_mutex_init(&ctx->ucx_mutex, NULL) != 0) + goto out; + + *ctx_o = ctx; + return 0; + +out: + if (ctx != NULL) + usd_close_context(ctx); + return ret; +} + +/* + * Close a raw USNIC device + */ +int +usd_close( + struct usd_device *dev) +{ + usd_unmap_grp_vect(dev); + + TAILQ_REMOVE(&usd_device_list, dev, ud_link); + usd_dev_free(dev); + + return 0; +} + +/* + * Open a raw USNIC device + */ +int +usd_open( + const char *dev_name, + struct usd_device **dev_o) +{ + struct usd_open_params params; + + memset(¶ms, 0, sizeof(params)); + params.cmd_fd = -1; + params.context = NULL; + return usd_open_with_params(dev_name, ¶ms, dev_o); +} + +/* + * Most generic usd device open function + */ +int usd_open_with_params(const char *dev_name, + struct usd_open_params* uop_param, + struct usd_device **dev_o) +{ + struct usd_device *dev = NULL; + int ret; + + ret = usd_dev_alloc_init(dev_name, uop_param, &dev); + if (ret != 0) { + goto out; + } + + if (!(uop_param->flags & UOPF_SKIP_PD_ALLOC)) { + ret = usd_ib_cmd_alloc_pd(dev, &dev->ud_pd_handle); + if (ret != 0) { + goto out; + } + } + + TAILQ_INSERT_TAIL(&usd_device_list, dev, ud_link); + *dev_o = dev; + return 0; + +out: + if (dev != NULL) + usd_dev_free(dev); + return ret; +} + +/* + * Return attributes of a device + */ +int +usd_get_device_attrs( + struct usd_device *dev, + struct usd_device_attrs *dattrs) +{ + int ret; + + /* ipaddr, netmask, mtu */ + ret = usd_get_dev_if_info(dev); + if (ret != 0) + return ret; + + /* get what attributes we can from querying IB */ + ret = usd_ib_query_dev(dev); + if (ret != 0) + return ret; + + *dattrs = dev->ud_attrs; + return 0; +} + +/* + * Check that device is ready to have queues created + */ +int +usd_device_ready( + struct usd_device *dev) +{ + if (dev->ud_attrs.uda_ipaddr_be == 0) { + return -EADDRNOTAVAIL; + } + if (dev->ud_attrs.uda_link_state != USD_LINK_UP) { + return -ENETDOWN; + } + + return 0; +} diff --git a/prov/usnic/src/usnic_direct/usd_device.h b/prov/usnic/src/usnic_direct/usd_device.h new file mode 100644 index 00000000000..93bbde17030 --- /dev/null +++ b/prov/usnic/src/usnic_direct/usd_device.h @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2014, Cisco Systems, Inc. All rights reserved. + * + * LICENSE_BEGIN + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * LICENSE_END + * + * + * definitions for device management + */ + +#ifndef _USD_DEVICE_H_ +#define _USD_DEVICE_H_ + +int usd_device_ready(struct usd_device *dev); + +#endif /* _USD_DEVICE_H_ */ diff --git a/prov/usnic/src/usnic_direct/usd_enum.c b/prov/usnic/src/usnic_direct/usd_enum.c new file mode 100644 index 00000000000..29cc91a7f99 --- /dev/null +++ b/prov/usnic/src/usnic_direct/usd_enum.c @@ -0,0 +1,133 @@ +/* + * Copyright (c) 2014, Cisco Systems, Inc. All rights reserved. + * + * LICENSE_BEGIN + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * LICENSE_END + * + * + */ + +#include "usnic_direct.h" + +const char *usd_link_state_str(enum usd_link_state state) +{ + switch (state) { + case USD_LINK_DOWN: return "USD_LINK_DOWN"; + case USD_LINK_UP: return "USD_LINK_UP"; + default: return "UNKNOWN"; + } +} + +const char *usd_completion_status_str(enum usd_completion_status cstatus) +{ + switch (cstatus) { + case USD_COMPSTAT_SUCCESS: return "USD_COMPSTAT_SUCCESS"; + case USD_COMPSTAT_ERROR_CRC: return "USD_COMPSTAT_ERROR_CRC"; + case USD_COMPSTAT_ERROR_TRUNC: return "USD_COMPSTAT_ERROR_TRUNC"; + case USD_COMPSTAT_ERROR_TIMEOUT: return "USD_COMPSTAT_ERROR_TIMEOUT"; + case USD_COMPSTAT_ERROR_INTERNAL: return "USD_COMPSTAT_ERROR_INTERNAL"; + default: return "UNKNOWN"; + } +} + +const char *usd_completion_type_str(enum usd_completion_type ctype) +{ + switch (ctype) { + case USD_COMPTYPE_SEND: return "USD_COMPTYPE_SEND"; + case USD_COMPTYPE_RECV: return "USD_COMPTYPE_RECV"; + default: return "UNKNOWN"; + } +} + +const char *usd_filter_type_str(enum usd_filter_type ftype) +{ + switch (ftype) { + case USD_FTY_UDP: return "USD_FTY_UDP"; + case USD_FTY_UDP_SOCK: return "USD_FTY_UDP_SOCK"; + case USD_FTY_TCP: return "USD_FTY_TCP"; + case USD_FTY_MCAST: return "USD_FTY_MCAST"; + case USD_FTY_8915: return "USD_FTY_8915"; + default: return "UNKNOWN"; + } +} + +const char *usd_qp_transport_str(enum usd_qp_transport qpt) +{ + switch (qpt) { + case USD_QTR_RAW: return "USD_QTR_RAW"; + case USD_QTR_UDP: return "USD_QTR_UDP"; + default: return "UNKNOWN"; + } +} + +const char *usd_qp_type_str(enum usd_qp_type qpt) +{ + switch (qpt) { + case USD_QTY_UD: return "USD_QTY_UD"; + case USD_QTY_UD_PIO: return "USD_QTY_UD_PIO"; + default: return "UNKNOWN"; + } +} + +const char *usd_qp_event_event_type_str(enum usd_device_event_type det) +{ + switch (det) { + case USD_EVENT_LINK_UP: return "USD_EVENT_LINK_UP"; + case USD_EVENT_LINK_DOWN: return "USD_EVENT_LINK_DOWN"; + default: return "UNKNOWN"; + } +} + +const char *usd_send_flag_sift_str(enum usd_send_flag_shift sfs) +{ + switch (sfs) { + case USD_SFS_SIGNAL: return "USD_SFS_SIGNAL"; + default: return "UNKNOWN"; + } +} + +const char *usd_capability(enum usd_capability cap) +{ + switch (cap) { + case USD_CAP_CQ_SHARING: return "USD_CAP_CQ_SHARING"; + case USD_CAP_MAP_PER_RES: return "USD_CAP_MAP_PER_RES"; + case USD_CAP_PIO: return "USD_CAP_PIO"; + case USD_CAP_CQ_INTR: return "USD_CAP_CQ_INTR"; + case USD_CAP_GRP_INTR: return "USD_CAP_GRP_INTR"; + case USD_CAP_MAX: return "USD_CAP_MAX"; + default: return "UNKNOWN"; + } +} diff --git a/prov/usnic/src/usnic_direct/usd_event.c b/prov/usnic/src/usnic_direct/usd_event.c new file mode 100644 index 00000000000..9eb24326544 --- /dev/null +++ b/prov/usnic/src/usnic_direct/usd_event.c @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2014-2017, Cisco Systems, Inc. All rights reserved. + * + * LICENSE_BEGIN + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * LICENSE_END + * + * + */ + +#include +#include +#include + +#include + +#include + +#include "usnic_direct.h" +#include "usd.h" + +/* + * Read an event from IB event fd + */ +int +usd_get_device_event(struct usd_device *dev, + struct usd_device_event *devent) +{ + struct ib_uverbs_async_event_desc ib_event; + int n; + + n = read(dev->ud_attrs.uda_event_fd, &ib_event, sizeof(ib_event)); + if (n == 0) + return -EAGAIN; + else if (n < 0) + return -errno; + + switch (ib_event.event_type) { + case IBV_EVENT_PORT_ACTIVE: + devent->ude_type = USD_EVENT_LINK_UP; + break; + case IBV_EVENT_PORT_ERR: + devent->ude_type = USD_EVENT_LINK_DOWN; + break; + default: + printf("Unexpected event type: %d\n", ib_event.event_type); + return -EAGAIN; + break; + } + + return 0; +} diff --git a/prov/usnic/src/usnic_direct/usd_ib_cmd.c b/prov/usnic/src/usnic_direct/usd_ib_cmd.c new file mode 100644 index 00000000000..a0b7903c5a0 --- /dev/null +++ b/prov/usnic/src/usnic_direct/usd_ib_cmd.c @@ -0,0 +1,1032 @@ +/* + * Copyright (c) 2014-2017, Cisco Systems, Inc. All rights reserved. + * + * LICENSE_BEGIN + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * LICENSE_END + * + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "kcompat.h" +#include "usnic_ib_abi.h" + +#include "usnic_direct.h" +#include "usd.h" +#include "usd_ib_cmd.h" + +int +usd_ib_cmd_get_context(struct usd_context *uctx) +{ + struct usnic_get_context cmd; + struct usnic_get_context_resp resp; + struct ib_uverbs_cmd_hdr *ich; + struct ib_uverbs_get_context *icp; + struct ib_uverbs_get_context_resp *irp; + struct usnic_ib_get_context_cmd *ucp; + struct usnic_ib_get_context_resp *urp; + int n; + + /* clear cmd and response */ + memset(&cmd, 0, sizeof(cmd)); + memset(&resp, 0, sizeof(resp)); + + /* fill in the command struct */ + ich = &cmd.ibv_cmd_hdr; + ich->command = IB_USER_VERBS_CMD_GET_CONTEXT; + ich->in_words = sizeof(cmd) / 4; + ich->out_words = sizeof(resp) / 4; + + icp = &cmd.ibv_cmd; + icp->response = (uintptr_t) & resp; + + ucp = &cmd.usnic_cmd; + +/* + * Because usnic_verbs kernel module with USNIC_CTX_RESP_VERSION as 1 + * silently returns success even it receives resp_version larger than 1, + * without filling in capbility information, here we still fill in + * command with resp_version as 1 in order to retrive cababiltiy information. + * Later when we decide to drop support for this version of kernel + * module, we should replace the next two lines of code with commented-out + * code below. 
+ ucp->resp_version = USNIC_CTX_RESP_VERSION; + ucp->v2.encap_subcmd = 0; + ucp->v2.num_caps = USNIC_CAP_CNT; +*/ + ucp->resp_version = 1; + ucp->v1.num_caps = USNIC_CAP_CNT; + + n = write(uctx->ucx_ib_dev_fd, &cmd, sizeof(cmd)); + if (n != sizeof(cmd)) { + return -errno; + } + + irp = &resp.ibv_resp; + uctx->event_fd = irp->async_fd; + uctx->num_comp_vectors = irp->num_comp_vectors; + + urp = &resp.usnic_resp; + +/* + * Replace the code below with the commented-out line if dropping + * support for kernel module with resp_version support as 1 + if (urp->resp_version == USNIC_CTX_RESP_VERSION) { + */ + if (urp->resp_version == 1) { + if (urp->num_caps > USNIC_CAP_CQ_SHARING && + urp->cap_info[USNIC_CAP_CQ_SHARING] > 0) { + uctx->ucx_caps[USD_CAP_CQ_SHARING] = 1; + } + if (urp->num_caps > USNIC_CAP_MAP_PER_RES && + urp->cap_info[USNIC_CAP_MAP_PER_RES] > 0) { + uctx->ucx_caps[USD_CAP_MAP_PER_RES] = 1; + } + if (urp->num_caps > USNIC_CAP_PIO && + urp->cap_info[USNIC_CAP_PIO] > 0) { + uctx->ucx_caps[USD_CAP_PIO] = 1; + } + if (urp->num_caps > USNIC_CAP_CQ_INTR && + urp->cap_info[USNIC_CAP_CQ_INTR] > 0) { + uctx->ucx_caps[USD_CAP_CQ_INTR] = 1; + } + if (urp->num_caps > USNIC_CAP_GRP_INTR && + urp->cap_info[USNIC_CAP_GRP_INTR] > 0) { + uctx->ucx_caps[USD_CAP_GRP_INTR] = 1; + } + } + + return 0; +} + +int +usd_ib_cmd_devcmd( + struct usd_device *dev, + enum vnic_devcmd_cmd devcmd, + u64 *a0, u64 *a1, int wait) +{ + struct usnic_get_context cmd; + struct usnic_get_context_resp resp; + struct ib_uverbs_cmd_hdr *ich; + struct ib_uverbs_get_context *icp; + struct usnic_ib_get_context_cmd *ucp; + struct usnic_ib_get_context_resp *urp; + struct usnic_udevcmd_cmd udevcmd; + struct usnic_udevcmd_resp udevcmd_resp; + int n; + + if (dev->ud_ctx->ucmd_ib_dev_fd < 0) + return -ENOENT; + + /* clear cmd and response */ + memset(&cmd, 0, sizeof(cmd)); + memset(&resp, 0, sizeof(resp)); + memset(&udevcmd, 0, sizeof(udevcmd)); + memset(&udevcmd_resp, 0, sizeof(udevcmd_resp)); + + /* fill in the command struct */ + ich = &cmd.ibv_cmd_hdr; + ich->command = IB_USER_VERBS_CMD_GET_CONTEXT; + ich->in_words = sizeof(cmd) / 4; + ich->out_words = sizeof(resp) / 4; + + icp = &cmd.ibv_cmd; + icp->response = (uintptr_t) & resp; + + /* fill in usnic devcmd struct */ + udevcmd.vnic_idx = dev->ud_vf_list->vf_id; + udevcmd.devcmd = devcmd; + udevcmd.wait = wait; + udevcmd.num_args = 2; + udevcmd.args[0] = *a0; + udevcmd.args[1] = *a1; + + ucp = &cmd.usnic_cmd; + ucp->resp_version = USNIC_CTX_RESP_VERSION; + ucp->v2.encap_subcmd = 1; + ucp->v2.usnic_ucmd.ucmd = USNIC_USER_CMD_DEVCMD; + ucp->v2.usnic_ucmd.inbuf = (uintptr_t) &udevcmd; + ucp->v2.usnic_ucmd.inlen = (u32)sizeof(udevcmd); + ucp->v2.usnic_ucmd.outbuf = (uintptr_t) &udevcmd_resp; + ucp->v2.usnic_ucmd.outlen = (u32)sizeof(udevcmd_resp); + + n = write(dev->ud_ctx->ucmd_ib_dev_fd, &cmd, sizeof(cmd)); + urp = &resp.usnic_resp; + /* + * If returns success, it's an old kernel who does not understand + * version 2 command, then we need to close the command FD to + * release the created ucontext object + */ + if (n == sizeof(cmd)) { + usd_err( + "The running usnic_verbs kernel module does not support " + "encapsulating devcmd through IB GET_CONTEXT command\n"); + close(dev->ud_ctx->ucmd_ib_dev_fd); + dev->ud_ctx->ucmd_ib_dev_fd = -1; + return -ENOTSUP; + } else if (errno != ECHILD) { + return -errno; + } else if (urp->resp_version != USNIC_CTX_RESP_VERSION) { + /* Kernel needs to make sure it returns response with a format + * understandable by the library. 
*/ + usd_err( + "The returned resp version does not match with requested\n"); + return -ENOTSUP; + } + + *a0 = udevcmd_resp.args[0]; + *a1 = udevcmd_resp.args[1]; + + return 0; +} + +/* + * Issue IB DEALLOC_PD command to alloc a PD in kernel + */ +static int +_usd_ib_cmd_dealloc_pd( + struct usd_device *dev, + uint32_t pd_handle) +{ + struct usnic_dealloc_pd cmd; + struct ib_uverbs_cmd_hdr *ich; + struct ib_uverbs_dealloc_pd *icp; + int n; + + memset(&cmd, 0, sizeof(cmd)); + + ich = &cmd.ibv_cmd_hdr; + ich->command = IB_USER_VERBS_CMD_DEALLOC_PD; + ich->in_words = sizeof(cmd) / 4; + ich->out_words = 0; + + icp = &cmd.ibv_cmd; + icp->pd_handle = pd_handle; + + n = write(dev->ud_ctx->ucx_ib_dev_fd, &cmd, sizeof(cmd)); + if (n != sizeof(cmd)) { + return -errno; + } + + return 0; +} + +/* + * Issue IB ALLOC_PD command to alloc a PD in kernel + */ +static int +_usd_ib_cmd_alloc_pd( + struct usd_device *dev, + uint32_t *handle_o, + uint32_t *vfid, + uint32_t *grp_vect_buf_len) +{ + struct usnic_alloc_pd cmd; + struct usnic_alloc_pd_resp resp; + struct ib_uverbs_cmd_hdr *ich; + struct ib_uverbs_alloc_pd *icp; + struct usnic_ib_alloc_pd_cmd *ucp; + struct ib_uverbs_alloc_pd_resp *irp; + struct usnic_ib_alloc_pd_resp *urp; + int n; + + memset(&cmd, 0, sizeof(cmd)); + memset(&resp, 0, sizeof(resp)); + + /* fill in command */ + ich = &cmd.ibv_cmd_hdr; + ich->command = IB_USER_VERBS_CMD_ALLOC_PD; + ich->in_words = sizeof(cmd) / 4; + ich->out_words = sizeof(resp) / 4; + + icp = &cmd.ibv_cmd; + icp->response = (uintptr_t) & resp; + + /* + * Only need to get group vector size and vf information + * if group interrupt is enabled + */ + if (dev->ud_ctx->ucx_caps[USD_CAP_GRP_INTR] > 0) { + ucp = &cmd.usnic_cmd; + ucp->resp_version = USNIC_IB_ALLOC_PD_VERSION; + } + + n = write(dev->ud_ctx->ucx_ib_dev_fd, &cmd, sizeof(cmd)); + if (n != sizeof(cmd)) { + return -errno; + } + + /* process response */ + irp = &resp.ibv_resp; + *handle_o = irp->pd_handle; + urp = &resp.usnic_resp; + if (urp->resp_version >= 1) { + *vfid = urp->cur.vfid; + *grp_vect_buf_len = urp->cur.grp_vect_buf_len; + } + + return 0; +} + +/* + * Create a protection domain + */ +int +usd_ib_cmd_alloc_pd( + struct usd_device *dev, + uint32_t *handle_o) +{ + uint32_t vfid = 0; + uint32_t grp_vect_buf_len = 0; + int err; + + /* Issue IB alloc_pd command, get assigned VF id and group vector size */ + err = _usd_ib_cmd_alloc_pd(dev, handle_o, &vfid, &grp_vect_buf_len); + if (err) { + return err; + } + + /* MAP group vector address to userspace + * Kernel module then maps group vector user address to IOMMU and + * program VIC HW register + */ + if (dev->ud_ctx->ucx_caps[USD_CAP_GRP_INTR] > 0) { + void *va; + off64_t offset; + + offset = USNIC_ENCODE_PGOFF(vfid, USNIC_MMAP_GRPVECT, 0); + va = mmap64(NULL, grp_vect_buf_len, PROT_READ + PROT_WRITE, + MAP_SHARED, dev->ud_ctx->ucx_ib_dev_fd, offset); + + if (va == MAP_FAILED) { + usd_err("Failed to map group vector for vf %u, grp_vect_size %u, " + "error %d\n", + vfid, grp_vect_buf_len, errno); + _usd_ib_cmd_dealloc_pd(dev, *handle_o); + return -errno; + } + + dev->grp_vect_map.va = va; + dev->grp_vect_map.len = grp_vect_buf_len; + dev->grp_vect_map.vfid = vfid; + } + + return 0; +} + +int +usd_ib_cmd_reg_mr( + struct usd_device *dev, + void *vaddr, + size_t length, + struct usd_mr *mr) +{ + struct usnic_reg_mr cmd; + struct usnic_reg_mr_resp resp; + struct ib_uverbs_cmd_hdr *ich; + struct ib_uverbs_reg_mr *icp; + struct ib_uverbs_reg_mr_resp *irp; + int n; + + memset(&cmd, 0, sizeof(cmd)); + 
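/*
 * Note on the pattern used for every command in this file (reg_mr here;
 * get_context, alloc_pd, create_cq, create_qp, etc. follow the same shape):
 * commands go through the legacy ib_uverbs write() ABI.  The
 * ib_uverbs_cmd_hdr carries the command id plus the command/response sizes
 * counted in 32-bit words, the "response" field points at a user-space
 * buffer for the kernel to fill in, and the whole structure is pushed with
 * a single write() on the uverbs device fd.  A short write means the
 * command failed and the reason is left in errno.
 */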
memset(&resp, 0, sizeof(resp)); + + ich = &cmd.ibv_cmd_hdr; + ich->command = IB_USER_VERBS_CMD_REG_MR; + ich->in_words = sizeof(cmd) / 4; + ich->out_words = sizeof(resp) / 4; + + icp = &cmd.ibv_cmd; + icp->response = (uintptr_t) & resp; + icp->start = (uintptr_t) vaddr; + icp->length = length; + icp->hca_va = (uintptr_t) vaddr; + icp->pd_handle = dev->ud_pd_handle; + icp->access_flags = IBV_ACCESS_LOCAL_WRITE; + + /* Issue command to IB driver */ + n = write(dev->ud_ctx->ucx_ib_dev_fd, &cmd, sizeof(cmd)); + if (n != sizeof(cmd)) { + return errno; + } + + /* process response */ + irp = &resp.ibv_resp; + mr->umr_handle = irp->mr_handle; + mr->umr_lkey = irp->lkey; + mr->umr_rkey = irp->rkey; + + return 0; +} + +int +usd_ib_cmd_dereg_mr( + struct usd_device *dev, + struct usd_mr *mr) +{ + struct usnic_dereg_mr cmd; + struct ib_uverbs_cmd_hdr *ich; + struct ib_uverbs_dereg_mr *icp; + int n; + + memset(&cmd, 0, sizeof(cmd)); + + ich = &cmd.ibv_cmd_hdr; + ich->command = IB_USER_VERBS_CMD_DEREG_MR; + ich->in_words = sizeof(cmd) / 4; + ich->out_words = 0; + + icp = &cmd.ibv_cmd; + icp->mr_handle = mr->umr_handle; + + /* Issue command to IB driver */ + n = write(dev->ud_ctx->ucx_ib_dev_fd, &cmd, sizeof(cmd)); + if (n != sizeof(cmd)) { + return -errno; + } + + return 0; +} + +/* + * Make the verbs call to create a CQ + */ +int +usd_ib_cmd_create_cq( + struct usd_device *dev, + struct usd_cq_impl *cq, + void *ibv_cq, + int comp_channel, + int comp_vector) +{ + struct usnic_create_cq cmd; + struct usnic_create_cq_resp resp; + struct ib_uverbs_cmd_hdr *ich; + struct ib_uverbs_create_cq *icp; + struct ib_uverbs_create_cq_resp *irp; + cpu_set_t *affinity_mask = NULL; + int flags = 0; + int n; + + memset(&cmd, 0, sizeof(cmd)); + memset(&resp, 0, sizeof(resp)); + + ich = &cmd.ibv_cmd_hdr; + ich->command = IB_USER_VERBS_CMD_CREATE_CQ; + ich->in_words = sizeof(cmd) / 4; + ich->out_words = sizeof(resp) / 4; + + icp = &cmd.ibv_cmd; + icp->response = (uintptr_t) & resp; + + if (ibv_cq == NULL) { + icp->user_handle = (uintptr_t) cq; + } else { + icp->user_handle = (uintptr_t) ibv_cq; /* Pass real verbs cq pointer to kernel + * to make ibv_get_cq_event happy */ + flags |= USNIC_CQ_COMP_SIGNAL_VERBS; + } + icp->cqe = cq->ucq_num_entries; + icp->comp_channel = comp_channel; + icp->comp_vector = comp_vector; + + if (comp_channel != -1) { + if (dev->ud_ctx->ucx_caps[USD_CAP_GRP_INTR] != 1) { + usd_err("usd_create_cq failed. 
No interrupt support\n"); + return -ENOTSUP; + } + cmd.usnic_cmd.resp_version = USNIC_IB_CREATE_CQ_VERSION; + cmd.usnic_cmd.cur.flags = flags; + cmd.usnic_cmd.cur.comp_event_fd = comp_channel; + if ((affinity_mask = CPU_ALLOC(sysconf(_SC_NPROCESSORS_ONLN))) + != NULL && + sched_getaffinity(getpid(), + CPU_ALLOC_SIZE(sysconf(_SC_NPROCESSORS_ONLN)), + affinity_mask) == 0) { + cmd.usnic_cmd.cur.affinity_mask_ptr = (u64)affinity_mask; + cmd.usnic_cmd.cur.affinity_mask_len = + CPU_ALLOC_SIZE(sysconf(_SC_NPROCESSORS_ONLN)); + } else { + cmd.usnic_cmd.cur.affinity_mask_ptr = (u64)NULL; + cmd.usnic_cmd.cur.affinity_mask_len = 0; + } + } else { + /* + * If appliation does not request cq completion event support, + * send command with version 0 to allow compatibility with + * old kernel library + */ + cmd.usnic_cmd.resp_version = 0; + } + + /* Issue command to IB driver */ + n = write(dev->ud_ctx->ucx_ib_dev_fd, &cmd, sizeof(cmd)); + if (n != sizeof(cmd)) { + return -errno; + } + + /* process response */ + irp = &resp.ibv_resp; + cq->ucq_handle = irp->cq_handle; + + if (affinity_mask != NULL) + CPU_FREE(affinity_mask); + + return 0; +} + +/* + * Make the verbs call to destroy a CQ + */ +int +usd_ib_cmd_destroy_cq( + struct usd_device *dev, + struct usd_cq_impl *cq) +{ + struct usnic_destroy_cq cmd; + struct ib_uverbs_cmd_hdr *ich; + struct ib_uverbs_destroy_cq *icp; + int n; + + memset(&cmd, 0, sizeof(cmd)); + + ich = &cmd.ibv_cmd_hdr; + ich->command = IB_USER_VERBS_CMD_DESTROY_CQ; + ich->in_words = sizeof(cmd) / 4; + ich->out_words = 0; + + icp = &cmd.ibv_cmd; + icp->cq_handle = cq->ucq_handle; + + /* Issue command to IB driver */ + n = write(dev->ud_ctx->ucx_ib_dev_fd, &cmd, sizeof(cmd)); + if (n != sizeof(cmd)) { + return -errno; + } + + return 0; +} + +/* + * Create a verbs QP without attaching any real resources to it yet + */ +int +usd_ib_cmd_create_qp( + struct usd_device *dev, + struct usd_qp_impl *qp, + struct usd_vf_info *vfip) +{ + struct usnic_create_qp cmd; + struct usnic_create_qp_resp *resp; + struct ib_uverbs_cmd_hdr *ich; + struct ib_uverbs_create_qp *icp; + struct ib_uverbs_create_qp_resp *irp = NULL; + struct usnic_ib_create_qp_cmd *ucp; + struct usnic_ib_create_qp_resp *urp; + struct usd_qp_filter *qfilt; + int ret; + int n; + uint32_t i; + struct usnic_vnic_barres_info *resources; + + ucp = NULL; + resources = NULL; + irp = NULL; + memset(&cmd, 0, sizeof(cmd)); + + resp = calloc(1, sizeof(*resp)); + if (resp == NULL) { + usd_err("Failed to allocate memory for create_qp_resp\n"); + return -ENOMEM; + } + + ich = &cmd.ibv_cmd_hdr; + ich->command = IB_USER_VERBS_CMD_CREATE_QP; + ich->in_words = sizeof(cmd) / 4; + ich->out_words = sizeof(*resp) / 4; + + icp = &cmd.ibv_cmd; + icp->response = (uintptr_t) resp; + icp->user_handle = (uintptr_t) qp; + icp->pd_handle = dev->ud_pd_handle; + icp->send_cq_handle = qp->uq_wq.uwq_cq->ucq_handle; + icp->recv_cq_handle = qp->uq_rq.urq_cq->ucq_handle; + icp->srq_handle = 0; + icp->max_send_wr = qp->uq_wq.uwq_num_entries; + icp->max_recv_wr = qp->uq_rq.urq_num_entries; + icp->max_send_sge = 1; + icp->max_recv_sge = 1; + icp->max_inline_data = 1024; + icp->sq_sig_all = 0; + icp->qp_type = IBV_QPT_UD; + icp->is_srq = 0; + icp->reserved = 0; + + ucp = &cmd.usnic_cmd; + + if (dev->ud_ctx->ucx_caps[USD_CAP_GRP_INTR]) { + ucp->cmd_version = 2; + } else { + /* + * Allow compatibility with old kernel module when + * application does not require cq completion notification + */ + ucp->cmd_version = 1; + } + + qfilt = &qp->uq_filter; + if (qfilt->qf_type == 
USD_FTY_UDP || + qfilt->qf_type == USD_FTY_UDP_SOCK) { + /* + * Command versions 0,1,2 need to fill in the spec_v2 struct. + * Newer versions need to fill in the spec struct. + */ + if (ucp->cmd_version <= 2) { + ucp->spec_v2.trans_type = USNIC_TRANSPORT_IPV4_UDP; + ucp->spec_v2.ip.sock_fd = qfilt->qf_filter.qf_udp.u_sockfd; + } else { + ucp->spec.trans_type = USNIC_TRANSPORT_IPV4_UDP; + ucp->spec.ip.sock_fd = qfilt->qf_filter.qf_udp.u_sockfd; + } + } else { + ret = -EINVAL; + goto out; + } + + ucp->u.cur.resources_len = RES_TYPE_MAX * sizeof(*resources); + resources = calloc(RES_TYPE_MAX, sizeof(*resources)); + if (resources == NULL) { + usd_err("unable to allocate resources array\n"); + ret = -ENOMEM; + goto out; + } + ucp->u.cur.resources = (u64)(uintptr_t)resources; + + /* Issue command to IB driver */ + n = write(dev->ud_ctx->ucx_ib_dev_fd, &cmd, sizeof(cmd)); + if (n != sizeof(cmd)) { + ret = -errno; + goto out; + } + + /* process IB part of response */ + irp = &resp->ibv_resp; + qp->uq_qp_handle = irp->qp_handle; + qp->uq_qp_num = irp->qpn; + + /* process usnic part response */ + urp = &resp->usnic_resp; + + qp->uq_rq.urq_index = urp->rq_idx[0]; + qp->uq_wq.uwq_index = urp->wq_idx[0]; + + qp->uq_rq.urq_cq->ucq_index = urp->cq_idx[0]; + if (qp->uq_rq.urq_cq != qp->uq_wq.uwq_cq) { + qp->uq_wq.uwq_cq->ucq_index = urp->cq_idx[1]; + } + + /* Pull VF info */ + vfip->vi_vfid = urp->vfid; + vfip->vi_bar_bus_addr = urp->bar_bus_addr; + vfip->vi_bar_len = urp->bar_len; + + if (urp->cmd_version == ucp->cmd_version) { + /* got expected version */ + if (dev->ud_ctx->ucx_caps[USD_CAP_MAP_PER_RES] > 0) { + for (i = 0; i < MIN(RES_TYPE_MAX, urp->u.cur.num_barres); i++) { + enum vnic_res_type type = resources[i].type; + if (type < RES_TYPE_MAX) { + vfip->barres[type].type = type; + vfip->barres[type].bus_addr = resources[i].bus_addr; + vfip->barres[type].len = resources[i].len; + } + } + if (vfip->barres[RES_TYPE_WQ].bus_addr == 0) { + usd_err("Failed to retrieve WQ res info\n"); + ret = -ENXIO; + goto out; + } + if (vfip->barres[RES_TYPE_RQ].bus_addr == 0) { + usd_err("Failed to retrieve RQ res info\n"); + ret = -ENXIO; + goto out; + } + if (vfip->barres[RES_TYPE_CQ].bus_addr == 0) { + usd_err("Failed to retrieve CQ res info\n"); + ret = -ENXIO; + goto out; + } + if (vfip->barres[RES_TYPE_INTR_CTRL].bus_addr == 0) { + usd_err("Failed to retrieve INTR res info\n"); + ret = -ENXIO; + goto out; + } + if (vfip->barres[RES_TYPE_DEVCMD].bus_addr == 0) { + usd_err("Failed to retrieve DEVCMD res info\n"); + ret = -ENXIO; + goto out; + } + } + } else if (urp->cmd_version == 0) { + /* special case, old kernel that won't tell us about individual barres + * info but should otherwise work fine */ + + if (dev->ud_ctx->ucx_caps[USD_CAP_MAP_PER_RES] != 0) { + /* should not happen, only the presence of never-released kernel + * code should cause this case */ + usd_err("USD_CAP_MAP_PER_RES claimed but qp_create cmd_version == 0\n"); + ret = -ENXIO; + goto out; + } + } else { + usd_err("unexpected cmd_version (%u)\n", urp->cmd_version); + ret = -ENXIO; + goto out; + } + + /* version 2 and beyond has interrupt support */ + if (urp->cmd_version > 1) { + qp->uq_rq.urq_cq->intr_offset = urp->u.cur.rcq_intr_offset; + if (qp->uq_rq.urq_cq != qp->uq_wq.uwq_cq) { + qp->uq_wq.uwq_cq->intr_offset = urp->u.cur.wcq_intr_offset; + } + vfip->vi_barhead_len = urp->u.cur.barhead_len; + } + + free(resources); + free(resp); + return 0; + + out: + if (irp != NULL) /* indicates successful IB create QP */ + usd_ib_cmd_destroy_qp(dev, 
qp); + free(resources); + free(resp); + return ret; +} + +int +usd_ib_cmd_modify_qp( + struct usd_device *dev, + struct usd_qp_impl *qp, + int state) +{ + struct usnic_modify_qp cmd; + struct ib_uverbs_cmd_hdr *ich; + struct ib_uverbs_modify_qp *icp; + int n; + + memset(&cmd, 0, sizeof(cmd)); + + ich = &cmd.ibv_cmd_hdr; + ich->command = IB_USER_VERBS_CMD_MODIFY_QP; + ich->in_words = sizeof(cmd) / 4; + ich->out_words = 0; + + icp = &cmd.ibv_cmd; + icp->qp_handle = qp->uq_qp_handle; + icp->attr_mask = IBV_QP_STATE; + icp->qp_state = state; + + /* Issue command to IB driver */ + n = write(dev->ud_ctx->ucx_ib_dev_fd, &cmd, sizeof(cmd)); + if (n != sizeof(cmd)) { + return -errno; + } + + return 0; +} + +int +usd_ib_cmd_destroy_qp( + struct usd_device *dev, + struct usd_qp_impl *qp) +{ + struct usnic_destroy_qp cmd; + struct ib_uverbs_destroy_qp_resp resp; + struct ib_uverbs_cmd_hdr *ich; + struct ib_uverbs_destroy_qp *icp; + int n; + + memset(&cmd, 0, sizeof(cmd)); + + ich = &cmd.ibv_cmd_hdr; + ich->command = IB_USER_VERBS_CMD_DESTROY_QP; + ich->in_words = sizeof(cmd) / 4; + ich->out_words = sizeof(resp) / 4; + + icp = &cmd.ibv_cmd; + icp->response = (uintptr_t) & resp; + icp->qp_handle = qp->uq_qp_handle; + + /* Issue command to IB driver */ + n = write(dev->ud_ctx->ucx_ib_dev_fd, &cmd, sizeof(cmd)); + if (n != sizeof(cmd)) { + return -errno; + } + + return 0; +} + +static int +usd_ib_cmd_query_device( + struct usd_device *dev, + struct ib_uverbs_query_device_resp *irp) +{ + struct usnic_query_device cmd; + struct ib_uverbs_cmd_hdr *ich; + struct ib_uverbs_query_device *icp; + int n; + + memset(&cmd, 0, sizeof(cmd)); + + ich = &cmd.ibv_cmd_hdr; + ich->command = IB_USER_VERBS_CMD_QUERY_DEVICE; + ich->in_words = sizeof(cmd) / 4; + ich->out_words = sizeof(*irp) / 4; + + icp = &cmd.ibv_cmd; + icp->response = (uintptr_t) irp; + + /* keep Valgrind happy */ + memset(irp, 0x00, sizeof(*irp)); + + /* Issue command to IB driver */ + n = write(dev->ud_ctx->ucx_ib_dev_fd, &cmd, sizeof(cmd)); + if (n != sizeof(cmd)) { + return -errno; + } + + return 0; +} + +static int +usd_ib_cmd_query_port( + struct usd_device *dev, + struct ib_uverbs_query_port_resp *irp) +{ + struct usnic_query_port cmd; + struct ib_uverbs_cmd_hdr *ich; + struct ib_uverbs_query_port *icp; + int n; + + memset(&cmd, 0, sizeof(cmd)); + + ich = &cmd.ibv_cmd_hdr; + ich->command = IB_USER_VERBS_CMD_QUERY_PORT; + ich->in_words = sizeof(cmd) / 4; + ich->out_words = sizeof(*irp) / 4; + + icp = &cmd.ibv_cmd; + icp->response = (uintptr_t) irp; + icp->port_num = 1; + + /* keep Valgrind happy */ + memset(irp, 0x00, sizeof(*irp)); + + /* Issue command to IB driver */ + n = write(dev->ud_ctx->ucx_ib_dev_fd, &cmd, sizeof(cmd)); + if (n != sizeof(cmd)) { + return -errno; + } + + return 0; +} + +/* + * For code readability, copy these two enums from kernel + * /usr/include/rdma/ib_verbs.h (otherwise, we'd would have to + * hard-code the integer values below). 
+ */ +enum ib_port_width { + IB_WIDTH_1X = 1, + IB_WIDTH_4X = 2, + IB_WIDTH_8X = 4, + IB_WIDTH_12X = 8 +}; + +enum ib_port_speed { + IB_SPEED_SDR = 1, // 2.5 Gbps + IB_SPEED_DDR = 2, // 5 Gbps + IB_SPEED_QDR = 4, // 10 Gbps + IB_SPEED_FDR10 = 8, // 10.3125 Gbps + IB_SPEED_FDR = 16, // 14.0625 Gbps + IB_SPEED_EDR = 32, // 25.78125 Gbps + IB_SPEED_HDR = 64 // 50 Gbps +}; + + +/* + * Issue query commands for device and port and interpret the resaults + */ +int +usd_ib_query_dev( + struct usd_device *dev) +{ + struct ib_uverbs_query_device_resp dresp; + struct ib_uverbs_query_port_resp presp; + struct usd_device_attrs *dap; + unsigned speed; + int ret; + + ret = usd_ib_cmd_query_device(dev, &dresp); + if (ret != 0) + return ret; + + ret = usd_ib_cmd_query_port(dev, &presp); + if (ret != 0) + return ret; + + /* copy out the attributes we care about */ + dap = &dev->ud_attrs; + + dap->uda_link_state = + (presp.state == 4) ? USD_LINK_UP : USD_LINK_DOWN; + + /* + * If link is up, derive bandwidth from speed and width. + * If link is down, driver reports bad speed, try to deduce from the + * NIC device ID. + */ + if (dap->uda_link_state == USD_LINK_UP) { +#define MKSW(S,W) (((S)<<8)|(W)) + speed = MKSW(presp.active_speed, presp.active_width); + switch (speed) { + case MKSW(IB_SPEED_FDR10, IB_WIDTH_1X): + case MKSW(IB_SPEED_DDR, IB_WIDTH_4X): + dap->uda_bandwidth = 10000; + break; + case MKSW(IB_SPEED_QDR, IB_WIDTH_4X): + dap->uda_bandwidth = 25000; + break; + case MKSW(IB_SPEED_FDR10, IB_WIDTH_4X): + dap->uda_bandwidth = 40000; + break; + case MKSW(IB_SPEED_HDR, IB_WIDTH_1X): + dap->uda_bandwidth = 50000; + break; + case MKSW(IB_SPEED_EDR, IB_WIDTH_4X): + dap->uda_bandwidth = 100000; + break; + case MKSW(IB_SPEED_HDR, IB_WIDTH_4X): + dap->uda_bandwidth = 200000; + break; + case MKSW(IB_SPEED_HDR, IB_WIDTH_8X): + dap->uda_bandwidth = 400000; + break; + default: + printf("Warning: unrecognized speed/width %d/%d, defaulting to 10G\n", + presp.active_speed, presp.active_width); + dap->uda_bandwidth = 10000; + break; + } + } else { + /* from pci_ids.h */ + switch (dap->uda_device_id) { + case 0x4f: /* Vasona */ + case 0x84: /* Cotati */ + case 0x85: /* Lexington */ + case 0x12c: /* Calistoga */ + case 0x137: /* Mountain View */ + case 0x138: /* Walnut Creek */ + dap->uda_bandwidth = 10000; + break; + case 0xcd: /* icehouse */ + case 0x14d: /* clearlake */ + dap->uda_bandwidth = 40000; + break; + default: + dap->uda_bandwidth = 0; + } + } + + dap->uda_vendor_id = dresp.vendor_id; + dap->uda_vendor_part_id = dresp.vendor_part_id; + dap->uda_device_id = dresp.hw_ver; + + dap->uda_max_qp = dresp.max_qp; + dap->uda_max_cq = dresp.max_cq; + + return 0; +} + + +int +usd_ib_cmd_create_comp_channel( + struct usd_device *dev, + int *comp_fd_o) +{ + int n; + struct usnic_create_comp_channel cmd; + struct ib_uverbs_create_comp_channel_resp resp; + struct ib_uverbs_cmd_hdr *ich; + struct ib_uverbs_create_comp_channel *icp; + struct ib_uverbs_create_comp_channel_resp *irp; + + memset(&cmd, 0, sizeof(cmd)); + + ich = &cmd.ibv_cmd_hdr; + ich->command = IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL; + ich->in_words = sizeof(cmd) / 4; + ich->out_words = sizeof(resp) / 4; + + icp = &cmd.ibv_cmd; + icp->response = (uintptr_t) & resp; + + /* Issue command to IB driver */ + n = write(dev->ud_ctx->ucx_ib_dev_fd, &cmd, sizeof(cmd)); + if (n != sizeof(cmd)) { + return -errno; + } + + irp = &resp; + *comp_fd_o = irp->fd; + + return 0; +} diff --git a/prov/usnic/src/usnic_direct/usd_ib_cmd.h b/prov/usnic/src/usnic_direct/usd_ib_cmd.h new 
file mode 100644 index 00000000000..b7d7dc33269 --- /dev/null +++ b/prov/usnic/src/usnic_direct/usd_ib_cmd.h @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2014, Cisco Systems, Inc. All rights reserved. + * + * LICENSE_BEGIN + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * LICENSE_END + * + * + */ + +#ifndef _USD_IB_CMD_ +#define _USD_IB_CMD_ + +#include "usd.h" + +int usd_ib_cmd_get_context(struct usd_context *uctx); +int usd_ib_cmd_alloc_pd(struct usd_device *dev, uint32_t * pd_handle_o); +int usd_ib_cmd_reg_mr(struct usd_device *dev, void *vaddr, size_t length, + struct usd_mr *mr); +int usd_ib_cmd_dereg_mr(struct usd_device *dev, struct usd_mr *mr); +int usd_ib_cmd_create_cq(struct usd_device *dev, struct usd_cq_impl *cq, + void *ibv_cq, int comp_channel, int comp_vector); +int usd_ib_cmd_destroy_cq(struct usd_device *dev, struct usd_cq_impl *cq); +int usd_ib_cmd_create_qp(struct usd_device *dev, struct usd_qp_impl *qp, + struct usd_vf_info *vfip); +int usd_ib_cmd_modify_qp(struct usd_device *dev, struct usd_qp_impl *qp, + int state); +int usd_ib_cmd_destroy_qp(struct usd_device *dev, struct usd_qp_impl *qp); + +int usd_ib_query_dev(struct usd_device *dev); +int usd_ib_cmd_devcmd(struct usd_device *dev, enum vnic_devcmd_cmd devcmd, + u64 *a0, u64 *a1, int wait); + +int usd_ib_cmd_create_comp_channel(struct usd_device *dev, int *comp_fd_o); +int usd_ib_cmd_destroy_comp_channel(struct usd_device *dev, int comp_fd); + +#endif /* _USD_IB_CMD_ */ diff --git a/prov/usnic/src/usnic_direct/usd_ib_sysfs.c b/prov/usnic/src/usnic_direct/usd_ib_sysfs.c new file mode 100644 index 00000000000..0056bef61c4 --- /dev/null +++ b/prov/usnic/src/usnic_direct/usd_ib_sysfs.c @@ -0,0 +1,389 @@ +/* + * Copyright (c) 2014, Cisco Systems, Inc. All rights reserved. + * + * LICENSE_BEGIN + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * LICENSE_END + * + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "usd.h" +#include "usd_ib_sysfs.h" +#include "usd_util.h" + +/* + * usnic_direct routines that depend on Infiniband /sysfs directory structure + */ + +/* + * Perform one-time initialization + */ +int +usd_ib_get_devlist( + struct usd_ib_dev **dev_list) +{ + char *class_path = "/sys/class/infiniband_verbs"; + DIR *class_dir; + struct dirent *dent; + struct stat sbuf; + char *dev_path = NULL; + char *ibdev_path = NULL; + char ibdev_buf[32]; + struct usd_ib_dev *idp; + struct usd_ib_dev *last_idp; + int fd; + int rc; + int n; + + /* + * For now, we are glomming onto Infiniband driver for setup + */ + class_dir = opendir(class_path); + if (class_dir == NULL) { + return -ENODEV; + } + + /* Check dir entries for USNIC devices */ + last_idp = NULL; + fd = -1; + while ((dent = readdir(class_dir)) != NULL) { + /* skip "." and ".." 
*/ + if (dent->d_name[0] == '.') + continue; + + /* build path to entry */ + if (asprintf(&dev_path, "%s/%s", class_path, + dent->d_name) <= 0) { + rc = -errno; + usd_perror("failed to asprintf"); + goto out; + } + + /* see if it's a dir */ + rc = stat(dev_path, &sbuf); + if (rc != 0) { + usd_perror(dev_path); + rc = -errno; + goto out; + } + + /* Must be a directory */ + if (!S_ISDIR(sbuf.st_mode)) + continue; + + /* read the ibdev */ + if (asprintf(&ibdev_path, "%s/ibdev", dev_path) <= 0) { + rc = -errno; + usd_perror(ibdev_path); + goto out; + } + fd = open(ibdev_path, O_RDONLY); + if (fd == -1) { + usd_perror(ibdev_path); + rc = -errno; + goto out; + } + memset(ibdev_buf, 0, sizeof(ibdev_buf)); + n = read(fd, ibdev_buf, sizeof(ibdev_buf) - 1); + if (n == -1) { + usd_perror("reading ibdev"); + rc = -errno; + goto out; + } + close(fd); + fd = -1; + if (n > 0 && ibdev_buf[n - 1] == '\n') { + ibdev_buf[n - 1] = '\0'; /* newline -> EOF */ + } + +#pragma GCC diagnostic push +#if defined(__GNUC__) && (__GNUC__ >= 8) +#pragma GCC diagnostic ignored "-Wstringop-truncation" +#endif + /* If USNIC device, remember this one */ + if (strncmp(ibdev_buf, "usnic", 5) == 0) { + idp = calloc(sizeof(*idp), 1); + if (idp == NULL) { + usd_perror("calloc IB device"); + rc = -errno; + goto out; + } + strncpy(idp->id_name, dent->d_name, sizeof(idp->id_name) - 1); + strncpy(idp->id_usnic_name, ibdev_buf, + sizeof(idp->id_usnic_name) - 1); + snprintf(idp->id_dev_path, sizeof(idp->id_dev_path) - 1, + "/dev/infiniband/%s", idp->id_name); + snprintf(idp->id_class_path, sizeof(idp->id_class_path) - 1, + "%s/device/infiniband/%s", dev_path, ibdev_buf); + + if (last_idp == NULL) { + *dev_list = idp; + } else { + last_idp->id_next = idp; + } + idp->id_next = NULL; + last_idp = idp; + } +#pragma GCC diagnostic pop + + free(dev_path); + dev_path = NULL; + free(ibdev_path); + ibdev_path = NULL; + } + rc = 0; + +out: + /* clean up */ + free(dev_path); + free(ibdev_path); + if (class_dir != NULL) { + closedir(class_dir); + } + if (fd != -1) { + close(fd); + } + + return rc; +} + +/* + * Find MAC for a device + * (we assume port 0) + */ +int +usd_get_mac( + struct usd_device *dev, + uint8_t * mac) +{ + char name[PATH_MAX + 128]; + char gid[80]; + char *p; + uint16_t v; + struct usd_ib_dev *idp; + int fd; + int n; + + idp = dev->ud_ctx->ucx_ib_dev; + snprintf(name, sizeof(name), "%s/ports/1/gids/0", idp->id_class_path); + + fd = open(name, O_RDONLY); + if (fd == -1) { + usd_perror(name); + return -errno; + } + + n = read(fd, gid, sizeof(gid) - 1); + close(fd); + if (n < 0) { + usd_perror("reading GID"); + return -errno; + } + gid[n] = '\0'; + + p = gid + 20; + sscanf(p, "%hx", &v); + *mac++ = (v >> 8) ^ 2; + *mac++ = v & 0xFF; + p += 5; + sscanf(p, "%hx", &v); + *mac++ = v >> 8; + p += 5; + sscanf(p, "%hx", &v); + *mac++ = v & 0xFF; + p += 5; + sscanf(p, "%hx", &v); + *mac++ = v >> 8; + *mac++ = v & 0xFF; + + return 0; +} + +/* + * Find interface for a device + */ +int +usd_get_iface( + struct usd_device *dev) +{ + char name[PATH_MAX + 128]; + struct usd_ib_dev *idp; + int fd; + int n; + + idp = dev->ud_ctx->ucx_ib_dev; + snprintf(name, sizeof(name), "%s/iface", idp->id_class_path); + + fd = open(name, O_RDONLY); + if (fd == -1) { + usd_perror(name); + dev->ud_attrs.uda_ifname[0] = '\0'; + return -errno; + } + + n = read(fd, dev->ud_attrs.uda_ifname, + sizeof(dev->ud_attrs.uda_ifname)); + close(fd); + if (n < 0) { + usd_perror("reading iface"); + return -errno; + } + + dev->ud_attrs.uda_ifname[n - 1] = '\0'; + + return 0; +} 
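/*
 * Illustration only: a self-contained sketch of the GID-to-MAC conversion
 * that usd_get_mac() performs above, using a made-up example GID string in
 * place of a real sysfs read.  The MAC is recovered by undoing the usual
 * EUI-64 expansion: flip the universal/local bit of the first octet back
 * and drop the 0xFF/0xFE filler bytes in the middle.
 */
#include <stdio.h>
#include <stdint.h>

static void gid_to_mac(const char *gid, uint8_t mac[6])
{
    const char *p = gid + 20;      /* fifth colon-separated 16-bit group */
    unsigned v;

    sscanf(p, "%x", &v);
    mac[0] = (v >> 8) ^ 2;         /* flip the universal/local bit back */
    mac[1] = v & 0xFF;
    p += 5;
    sscanf(p, "%x", &v);
    mac[2] = v >> 8;               /* low byte of this group is the 0xFF filler */
    p += 5;
    sscanf(p, "%x", &v);
    mac[3] = v & 0xFF;             /* high byte of this group is the 0xFE filler */
    p += 5;
    sscanf(p, "%x", &v);
    mac[4] = v >> 8;
    mac[5] = v & 0xFF;
}

int main(void)
{
    uint8_t mac[6];

    /* hypothetical contents of <class_path>/ports/1/gids/0 */
    gid_to_mac("fe80:0000:0000:0000:020d:ecff:fe5e:1234", mac);
    printf("%02x:%02x:%02x:%02x:%02x:%02x\n",
           mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]);
    /* expected output: 00:0d:ec:5e:12:34 */
    return 0;
}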
+ +/* + * Read an integer from a sysfs entry + */ +static int +usd_ib_sysfs_get_int( + struct usd_device *dev, + char *entry, + int *result) +{ + char name[PATH_MAX + 128]; + char buf[32]; + struct usd_ib_dev *idp; + int fd; + int n; + + idp = dev->ud_ctx->ucx_ib_dev; + snprintf(name, sizeof(name), "%s/%s", idp->id_class_path, entry); + + fd = open(name, O_RDONLY); + if (fd == -1) { + usd_perror(name); + return -errno; + } + + n = read(fd, buf, sizeof(buf)); + close(fd); + if (n < 0) { + fprintf(stderr, "Error %d reading %s\n", errno, entry); + return -errno; + } + + *result = atoi(buf); + return 0; +} + +/* + * Get usNIC configuration + */ +int +usd_get_usnic_config( + struct usd_device *dev) +{ + int v; + int ret; + + ret = usd_ib_sysfs_get_int(dev, "max_vf", &v); + if (ret != 0) + return ret; + dev->ud_attrs.uda_num_vf = v; + + ret = usd_ib_sysfs_get_int(dev, "qp_per_vf", &v); + if (ret != 0) + return ret; + dev->ud_attrs.uda_qp_per_vf = v; + + ret = usd_ib_sysfs_get_int(dev, "cq_per_vf", &v); + if (ret != 0) + return ret; + dev->ud_attrs.uda_cq_per_vf = v; + + ret = usd_ib_sysfs_get_int(dev, "intr_per_vf", &v); + if (ret != 0) { + /* older kernels did not export this sysfs node */ + if (ret == -ENOENT) { + dev->ud_attrs.uda_intr_per_vf = 0; + ret = 0; + } + else { + return ret; + } + } else { + dev->ud_attrs.uda_intr_per_vf = v; + } + + return ret; +} + +/* + * Find firmware version + */ +int +usd_get_firmware( + struct usd_device *dev) +{ + char name[PATH_MAX + 128]; + struct usd_ib_dev *idp; + char *fw; + int fd; + int n; + + idp = dev->ud_ctx->ucx_ib_dev; + snprintf(name, sizeof(name), "%s/fw_ver", idp->id_class_path); + + fd = open(name, O_RDONLY); + if (fd == -1) { + usd_perror(name); + return -errno; + } + + fw = &dev->ud_attrs.uda_firmware[0]; + n = read(fd, fw, sizeof(dev->ud_attrs.uda_firmware)); + close(fd); + if (n < 0) { + usd_perror("reading fw_ver"); + return -errno; + } + fw[n - 1] = '\0'; + + return 0; +} diff --git a/prov/usnic/src/usnic_direct/usd_ib_sysfs.h b/prov/usnic/src/usnic_direct/usd_ib_sysfs.h new file mode 100644 index 00000000000..3014d0a2d2f --- /dev/null +++ b/prov/usnic/src/usnic_direct/usd_ib_sysfs.h @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2014, Cisco Systems, Inc. All rights reserved. + * + * LICENSE_BEGIN + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * LICENSE_END + * + * + */ + +#ifndef _USD_IB_SYSFS_ +#define _USD_IB_SYSFS_ + +#include + +/* + * Forward structure defs + */ +struct usd_device; + +/* + * Definition of a usnic IB entry + */ +struct usd_ib_dev { + char id_name[80]; + char id_usnic_name[USD_MAX_DEVNAME]; + char id_dev_path[PATH_MAX]; /* path to IB dev */ + char id_class_path[PATH_MAX]; /* path to IB class info */ + + struct usd_ib_dev *id_next; +}; + +int usd_ib_get_devlist(struct usd_ib_dev **dev_list); +int usd_get_mac(struct usd_device *dev, uint8_t * mac); +int usd_get_iface(struct usd_device *dev); +int usd_get_usnic_config(struct usd_device *dev); +int usd_get_firmware(struct usd_device *dev); +int usd_read_cap_ver(struct usd_device *dev, char *cap_name, int *vers_o); +#endif /* _USD_IB_SYSFS_ */ diff --git a/prov/usnic/src/usnic_direct/usd_mem.c b/prov/usnic/src/usnic_direct/usd_mem.c new file mode 100644 index 00000000000..486e8f85622 --- /dev/null +++ b/prov/usnic/src/usnic_direct/usd_mem.c @@ -0,0 +1,220 @@ +/* + * Copyright (c) 2014, Cisco Systems, Inc. All rights reserved. + * + * LICENSE_BEGIN + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * LICENSE_END + * + * + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "usnic_direct.h" +#include "usd.h" +#include "usd_ib_cmd.h" + +/* + * Issue driver command to register memory region + */ +int +usd_reg_mr( + struct usd_device *dev, + void *vaddr, + size_t length, + struct usd_mr **mr_o) +{ + struct usd_mr *mr; + int ret; + + mr = calloc(sizeof(*mr), 1); + if (mr == NULL) { + return -errno; + } + + ret = usd_ib_cmd_reg_mr(dev, vaddr, length, mr); + + if (ret == 0) { + mr->umr_dev = dev; + mr->umr_vaddr = vaddr; + mr->umr_length = length; + *mr_o = mr; + } else { + free(mr); + } + + return ret; +} + +/* + * Issue driver command to de-register memory region + */ +int +usd_dereg_mr( + struct usd_mr *mr) +{ + int ret; + + ret = usd_ib_cmd_dereg_mr(mr->umr_dev, mr); + if (ret == 0) + free(mr); + + return ret; +} + +/* + * Used to allocate memory and an mr to go with it all in one go. Used + * to provide memory to the vnic_* functions that call pci_alloc_consistant + * We want to return a nicely aligned chunk of memory preceded by struct usd_mr. + * We don't know the alignment of the memory we get back, so allocate a big + * enough chunk to hold the following: + * struct usd_mr + * N pad bytes + * true length and pointer to usd_mr + * page aligned buffer for user + */ +int +usd_alloc_mr( + struct usd_device *dev, + size_t size, + void **vaddr_o) +{ + void *vaddr; + void *base_addr; + struct usd_mr *mr; + size_t true_size; + size_t metadata_size; + size_t madv_size; + int ret; + + metadata_size = sizeof(struct usd_mr) + 3 * sizeof(uintptr_t); + madv_size = ALIGN(size, sysconf(_SC_PAGESIZE)); + true_size = madv_size + metadata_size + sysconf(_SC_PAGESIZE) - 1; + base_addr = mmap(NULL, true_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (base_addr == NULL || base_addr == MAP_FAILED) { + usd_err("Failed to mmap region of size %lu\n", true_size); + return -errno; + } + mr = base_addr; + vaddr = + (void *) ALIGN((uintptr_t) base_addr + metadata_size, + sysconf(_SC_PAGESIZE)); + ((uintptr_t *) vaddr)[-1] = (uintptr_t) mr; + ((uintptr_t *) vaddr)[-2] = true_size; + ((uintptr_t *) vaddr)[-3] = madv_size; + + /* + * Disable copy-on-write for memories internally used by USD. + * For application buffers, disabling copy-on-write should be provided by + * usd wrapper such as libfabric or verbs plugin if fork is supported. + * The memory to be registered starts from page-aligned address, and ends + * at page boundary, so it's impossible for a page to be updated + * with multiple madvise calls when each call reference different VAs on + * the same page. This allows to avoid the need to reference count + * the pages that get updated with mutiple madvise calls. For details, + * see libibverbs ibv_dont_forkrange implementations. 
+ */ + ret = madvise(vaddr, madv_size, MADV_DONTFORK); + if (ret != 0) { + usd_err("Failed to disable child's access to memory %p size %lu\n", + vaddr, size); + ret = errno; + goto err_unmap; + } + + ret = usd_ib_cmd_reg_mr(dev, vaddr, size, mr); + if (ret != 0) { + usd_err("Failed to register memory region %p, size %lu\n", + vaddr, size); + goto err_madvise; + } + mr->umr_dev = dev; + + *vaddr_o = vaddr; + return 0; + +err_madvise: + madvise(vaddr, ALIGN(size, sysconf(_SC_PAGESIZE)), MADV_DOFORK); +err_unmap: + munmap(base_addr, true_size); + return ret; +} + +/* + * See usd_alloc_mr() for explanation of: + * mr = (struct usd_mr *)((uintptr_t *)vaddr)[-1]; + */ +int +usd_free_mr( + void *vaddr) +{ + struct usd_mr *mr; + size_t true_size; + size_t madv_size; + int ret; + + mr = (struct usd_mr *) ((uintptr_t *) vaddr)[-1]; + true_size = ((uintptr_t *) vaddr)[-2]; + madv_size = ((uintptr_t *) vaddr)[-3]; + + ret = usd_ib_cmd_dereg_mr(mr->umr_dev, mr); + if (ret == 0) { + madvise(vaddr, madv_size, MADV_DOFORK); + munmap(mr, true_size); + } + + return ret; +} + +/* + * Utility function for vnic_* routines + */ +char * +pci_name( + struct pci_dev *pdev) +{ + struct usd_device *dev; + + dev = (struct usd_device *) pdev; + + return dev->ud_ctx->ucx_ib_dev->id_usnic_name; +} diff --git a/prov/usnic/src/usnic_direct/usd_poll.c b/prov/usnic/src/usnic_direct/usd_poll.c new file mode 100644 index 00000000000..0ce9008083d --- /dev/null +++ b/prov/usnic/src/usnic_direct/usd_poll.c @@ -0,0 +1,293 @@ +/* + * Copyright (c) 2014, Cisco Systems, Inc. All rights reserved. + * + * LICENSE_BEGIN + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * LICENSE_END + * + * + */ + +#include + + +#include "usd.h" +#include "usd_util.h" +#include "cq_enet_desc.h" + +static inline void +find_rx_lengths( + struct usd_rq *rq, + uint16_t q_index, + size_t *posted_len_o, + size_t *len_in_pkt_o) +{ + dma_addr_t bus_addr; + u16 len; + u8 type; + size_t rcvbuf_len; + uint16_t i; + + i = q_index; + rcvbuf_len = 0; + do { + rq_enet_desc_dec( (struct rq_enet_desc *) + ((uintptr_t)rq->urq_desc_ring + (i<<4)), + &bus_addr, &type, &len); + rcvbuf_len += len; + i = (i - 1) & rq->urq_post_index_mask; + } while (type == RQ_ENET_TYPE_NOT_SOP); + + *posted_len_o = rcvbuf_len; + *len_in_pkt_o = ntohs(((struct usd_udp_hdr *)bus_addr)->uh_ip.tot_len) + + sizeof(struct ether_header); +} + +static inline int +usd_desc_to_rq_comp( + struct usd_cq_impl *cq, + struct cq_desc *desc, + uint16_t qid, + uint16_t q_index, + struct usd_completion *comp) +{ + struct usd_rq *rq; + struct usd_qp_impl *qp; + struct cq_enet_rq_desc *edesc; + uint16_t bytes_written_flags; + uint32_t bytes_written; + uint32_t ci_flags; + uint32_t ipudpok; + unsigned credits; + size_t len_in_pkt; + size_t rcvbuf_len; + + edesc = (struct cq_enet_rq_desc *)desc; + rq = cq->ucq_rq_map[qid]; + qp = usd_container_of(rq, struct usd_qp_impl, uq_rq); + + bytes_written_flags = le16_to_cpu(edesc->bytes_written_flags); + bytes_written = bytes_written_flags & CQ_ENET_RQ_DESC_BYTES_WRITTEN_MASK; + ci_flags = le16_to_cpu(edesc->completed_index_flags); + + if (ci_flags & CQ_ENET_RQ_DESC_FLAGS_EOP) { + comp->uc_bytes = bytes_written + rq->urq_accum_bytes; + rq->urq_accum_bytes = 0; + } else { + rq->urq_accum_bytes += bytes_written; + return -1; + } + + comp->uc_context = rq->urq_context[q_index]; + comp->uc_qp = &qp->uq_qp; + + ipudpok = CQ_ENET_RQ_DESC_FLAGS_IPV4_CSUM_OK | + CQ_ENET_RQ_DESC_FLAGS_TCP_UDP_CSUM_OK; + + if (bytes_written_flags & CQ_ENET_RQ_DESC_FLAGS_TRUNCATED || + (edesc->flags & ipudpok) != ipudpok) { + if (((edesc->flags & CQ_ENET_RQ_DESC_FLAGS_FCS_OK) == 0) && + bytes_written == 0) { + find_rx_lengths(rq, q_index, &rcvbuf_len, &len_in_pkt); + + /* + * If only the paddings to meet 64-byte minimum eth frame + * requirement are truncated, do not mark packet as + * error due to truncation. + * The usnic hdr should not be split into multiple receive buffer + * + * If we could afford the extra cycles, we would also compute the + * UDP checksum here and compare it to the UDP header. + */ + if (rcvbuf_len >= 60 || len_in_pkt > rcvbuf_len) { + comp->uc_status = USD_COMPSTAT_ERROR_TRUNC; + } + else { + comp->uc_status = USD_COMPSTAT_SUCCESS; + /* TRUNC means bytes_written==0, so fix this too */ + comp->uc_bytes = len_in_pkt; + } + } else { + comp->uc_status = USD_COMPSTAT_ERROR_CRC; + } + } else { + if (comp->uc_bytes <= 60) { + /* + * The sender may have attempted to send a small frame (<64-bytes) + * that was padded out to 64-bytes by the sending VIC. + * If we posted a recv buffer >= 60 bytes then we wouldn't see + * truncation, but the bytes_written by the VIC will be larger than + * the bytes the sender actually requested to send. Fix that up + * here. 
+ */ + find_rx_lengths(rq, q_index, &rcvbuf_len, &len_in_pkt); + comp->uc_bytes = len_in_pkt; + } + comp->uc_status = USD_COMPSTAT_SUCCESS; + } + + /* needs a little work in multi-SGE case, all credits currently not + * reported as released until next RX + */ + credits = (q_index - rq->urq_last_comp) & rq->urq_post_index_mask; + rq->urq_recv_credits += credits; + rq->urq_last_comp = q_index; + + return 0; +} + +static inline void +usd_desc_to_wq_comp( + struct usd_cq_impl *cq, + uint16_t qid, + uint16_t q_index, + struct usd_completion *comp) +{ + struct usd_wq *wq; + struct usd_qp_impl *qp; + struct usd_wq_post_info *info; + unsigned credits; + + wq = cq->ucq_wq_map[qid]; + qp = usd_container_of(wq, struct usd_qp_impl, uq_wq); + comp->uc_qp = &qp->uq_qp; + + info = &wq->uwq_post_info[(q_index+1)&wq->uwq_post_index_mask]; + comp->uc_context = info->wp_context; + comp->uc_bytes = info->wp_len; + comp->uc_status = USD_COMPSTAT_SUCCESS; + + credits = (q_index - wq->uwq_last_comp) & wq->uwq_post_index_mask; + wq->uwq_send_credits += credits; + wq->uwq_last_comp = q_index; +} + +int +usd_poll_cq_multi( + struct usd_cq *ucq, + int max_comps, + struct usd_completion *comps) +{ + int ret; + int n; + + for (n = 0; n < max_comps; ++n) { + ret = usd_poll_cq(ucq, comps + n); + if (ret == -EAGAIN) { + return n; + } + } + return max_comps; +} + +int +usd_poll_cq( + struct usd_cq *ucq, + struct usd_completion *comp) +{ + struct usd_cq_impl *cq; + struct cq_desc *cq_desc; + uint8_t color; + uint8_t last_color; + uint8_t type_color; + uint8_t type; + uint16_t qid; + uint16_t q_index; + + cq = to_cqi(ucq); + +retry: + /* check for a completion */ + cq_desc = (struct cq_desc *)((uint8_t *)cq->ucq_desc_ring + + (cq->ucq_next_desc << 4)); + last_color = cq->ucq_last_color; + + type_color = cq_desc->type_color; + type = type_color & 0x7f; + color = type_color >> CQ_DESC_COLOR_SHIFT; + qid = le16_to_cpu(cq_desc->q_number) & CQ_DESC_Q_NUM_MASK; + q_index = le16_to_cpu(cq_desc->completed_index) & CQ_DESC_COMP_NDX_MASK; + + if (color == last_color) { + return -EAGAIN; + } else { + + /* bookkeeping */ + cq->ucq_next_desc++; + cq->ucq_last_color ^= (cq->ucq_next_desc >> cq->ucq_color_shift); + cq->ucq_next_desc &= cq->ucq_cqe_mask; + + rmb(); + + comp->uc_type = (enum usd_completion_type) type; + + if (type == USD_COMPTYPE_RECV) { + if (usd_desc_to_rq_comp(cq, cq_desc, qid, q_index, comp) == -1) { + goto retry; + } + } else if (type == USD_COMPTYPE_SEND) { + usd_desc_to_wq_comp(cq, qid, q_index, comp); + } else { + comp->uc_status = USD_COMPSTAT_ERROR_INTERNAL; + } + return 0; + } +} + +/* + * Allow application to unmask interrupt explicitly + */ +int usd_poll_req_notify(struct usd_cq *ucq) +{ + struct usd_cq_impl *cq; + + cq = to_cqi(ucq); + + /* + * application uses a signal thread waiting for one completion FD, + * then calling this function to unmask the interrupt source. If multiple + * cqs are associated with the FD/interrupt, this may be unneccesarilly + * called for subsequent cqs at each poll/wait, but it's OK. A lock isn't + * used here to prevent simultaneous unmasking among multiple threads as + * it's not a valid use case. + * Also this call happens at data path, it's assumed that removing a + * interrupt source from cq happens at control path tear down stage, when + * data path is already finished. 
+ */ + if (cq->comp_fd != -1 && cq->ucq_intr != NULL) + vnic_intr_unmask(&cq->ucq_intr->uci_vintr); + + return 0; +} diff --git a/prov/usnic/src/usnic_direct/usd_post.c b/prov/usnic/src/usnic_direct/usd_post.c new file mode 100644 index 00000000000..e1e1b30c0c0 --- /dev/null +++ b/prov/usnic/src/usnic_direct/usd_post.c @@ -0,0 +1,122 @@ +/* + * Copyright (c) 2014, Cisco Systems, Inc. All rights reserved. + * + * LICENSE_BEGIN + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * LICENSE_END + * + * + */ + +#include "usd.h" +#include "usd_post.h" + +unsigned +usd_get_send_credits( + struct usd_qp *uqp) +{ + struct usd_qp_impl *qp; + + qp = to_qpi(uqp); + + return qp->uq_wq.uwq_send_credits; +} + +unsigned +usd_get_recv_credits( + struct usd_qp *uqp) +{ + struct usd_qp_impl *qp; + + qp = to_qpi(uqp); + + return qp->uq_rq.urq_recv_credits; +} + +int +usd_post_recv( + struct usd_qp *uqp, + struct usd_recv_desc *recv_list) +{ + struct usd_qp_impl *qp; + struct usd_rq *rq; + struct vnic_rq *vrq; + struct rq_enet_desc *desc; + struct iovec *iovp; + uint32_t index; + uint32_t count; + unsigned i; + + qp = to_qpi(uqp); + rq = &qp->uq_rq; + vrq = &rq->urq_vnic_rq; + desc = rq->urq_next_desc; + index = rq->urq_post_index; + + count = 0; + + while (recv_list != NULL) { + iovp = recv_list->urd_iov; + rq->urq_context[index] = recv_list->urd_context; + rq_enet_desc_enc(desc, (dma_addr_t) iovp[0].iov_base, + RQ_ENET_TYPE_ONLY_SOP, iovp[0].iov_len); + count++; + + index = (index+1) & rq->urq_post_index_mask; + desc = (struct rq_enet_desc *) ((uintptr_t)rq->urq_desc_ring + + (index<<4)); + + for (i = 1; i < recv_list->urd_iov_cnt; ++i) { + rq->urq_context[index] = recv_list->urd_context; + rq_enet_desc_enc(desc, (dma_addr_t) iovp[i].iov_base, + RQ_ENET_TYPE_NOT_SOP, iovp[i].iov_len); + count++; + + index = (index+1) & rq->urq_post_index_mask; + desc = (struct rq_enet_desc *) ((uintptr_t)rq->urq_desc_ring + + (index<<4)); + } + recv_list = recv_list->urd_next; + } + + wmb(); + iowrite32(index, &vrq->ctrl->posted_index); + + rq->urq_next_desc = desc; + rq->urq_post_index = index; + rq->urq_recv_credits -= count; + + return 0; +} diff --git a/prov/usnic/src/usnic_direct/usd_post.h b/prov/usnic/src/usnic_direct/usd_post.h new file mode 100644 index 00000000000..a7bc5b5e38c --- /dev/null +++ b/prov/usnic/src/usnic_direct/usd_post.h @@ -0,0 +1,227 @@ +/* + * Copyright (c) 2014, Cisco Systems, Inc. All rights reserved. + * + * LICENSE_BEGIN + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * LICENSE_END + * + * + */ + +#ifndef _USD_POST_H_ +#define _USD_POST_H_ + +#include + +#include "usd.h" +#include "usd_util.h" + +static inline uint32_t +_usd_post_send_one( + struct usd_wq *wq, + const void *packet, + size_t length, + u_int8_t cq_entry) +{ + struct vnic_wq *vwq; + uint32_t index; + struct wq_enet_desc *desc; + uint64_t wr; + u_int8_t offload_mode = 0, eop = 1; + u_int16_t mss = 7, header_length = 0, vlan_tag = 0; + u_int8_t vlan_tag_insert = 0, loopback = 0, fcoe_encap = 0; + + vwq = &wq->uwq_vnic_wq; + desc = wq->uwq_next_desc; + index = wq->uwq_post_index; + + wq_enet_desc_enc(desc, (uintptr_t)packet, length, + mss, header_length, offload_mode, + eop, cq_entry, fcoe_encap, + vlan_tag_insert, vlan_tag, loopback); + wmb(); + + wr = vnic_cached_posted_index((dma_addr_t)packet, length, index); + iowrite64(wr, &vwq->ctrl->posted_index); + + wq->uwq_next_desc = (struct wq_enet_desc *) + ((uintptr_t)wq->uwq_desc_ring + (index<<4)); + wq->uwq_post_index = (index+1) & wq->uwq_post_index_mask; + wq->uwq_send_credits--; + + return index; +} + +static inline uint32_t +_usd_post_send_two( + struct usd_wq *wq, + const void *hdr, + size_t hdrlen, + const void *pkt, + size_t pktlen, + u_int8_t cq_entry) +{ + struct vnic_wq *vwq; + uint32_t index; + struct wq_enet_desc *desc; + u_int8_t offload_mode = 0, eop; + u_int16_t mss = 7, header_length = 0, vlan_tag = 0; + u_int8_t vlan_tag_insert = 0, loopback = 0, fcoe_encap = 0; + + vwq = &wq->uwq_vnic_wq; + desc = wq->uwq_next_desc; + index = wq->uwq_post_index; + + eop = 0; + wq_enet_desc_enc(desc, (uintptr_t)hdr, hdrlen, + mss, header_length, offload_mode, + eop, 0, fcoe_encap, + vlan_tag_insert, vlan_tag, loopback); + + desc = (struct wq_enet_desc *) ((uintptr_t)wq->uwq_desc_ring + (index<<4)); + index = (index+1) & wq->uwq_post_index_mask; + + eop = 1; + wq_enet_desc_enc(desc, (uintptr_t)pkt, pktlen, + mss, header_length, offload_mode, + eop, cq_entry, fcoe_encap, + vlan_tag_insert, vlan_tag, loopback); + wmb(); + + iowrite32(index, &vwq->ctrl->posted_index); + + wq->uwq_next_desc = (struct wq_enet_desc *) + ((uintptr_t)wq->uwq_desc_ring + (index<<4)); + wq->uwq_post_index = (index+1) & wq->uwq_post_index_mask; + wq->uwq_send_credits -= 2; + + return index; +} + +static inline uint32_t +_usd_post_send_two_vlan( + struct usd_wq *wq, + const void *hdr, + size_t hdrlen, + const void *pkt, + size_t pktlen, + u_int8_t cq_entry, + u_int16_t vlan_tag) +{ + struct vnic_wq *vwq; + uint32_t index; + struct wq_enet_desc *desc; + u_int8_t offload_mode = 0, eop; + u_int16_t mss = 7, header_length = 0; + u_int8_t vlan_tag_insert = 1, loopback = 0, fcoe_encap = 0; + + vwq = &wq->uwq_vnic_wq; + desc = wq->uwq_next_desc; + index = wq->uwq_post_index; + + eop = 0; + wq_enet_desc_enc(desc, (uintptr_t)hdr, hdrlen, + mss, header_length, offload_mode, + eop, 0, fcoe_encap, + vlan_tag_insert, vlan_tag, loopback); + + desc = (struct wq_enet_desc *) ((uintptr_t)wq->uwq_desc_ring + (index<<4)); + index = (index+1) & 
wq->uwq_post_index_mask; + + eop = 1; + wq_enet_desc_enc(desc, (uintptr_t)pkt, pktlen, + mss, header_length, offload_mode, + eop, cq_entry, fcoe_encap, + vlan_tag_insert, vlan_tag, loopback); + wmb(); + + iowrite32(index, &vwq->ctrl->posted_index); + + wq->uwq_next_desc = (struct wq_enet_desc *) + ((uintptr_t)wq->uwq_desc_ring + (index<<4)); + wq->uwq_post_index = (index+1) & wq->uwq_post_index_mask; + wq->uwq_send_credits -= 2; + + return index; +} + +/* + * Consume iov count credits, assumes that iov[0] includes usnic header + */ +static inline uint32_t +_usd_post_send_iov( + struct usd_wq *wq, + const struct iovec *iov, + size_t count, + u_int8_t cq_entry) +{ + struct vnic_wq *vwq; + uint32_t index; + struct wq_enet_desc *desc; + u_int8_t offload_mode = 0; + u_int16_t mss = 7, header_length = 0, vlan_tag = 0; + u_int8_t vlan_tag_insert = 0, loopback = 0, fcoe_encap = 0; + unsigned i; + + vwq = &wq->uwq_vnic_wq; + desc = wq->uwq_next_desc; + index = wq->uwq_post_index; + + for (i = 0; i < count - 1; i++) { + wq_enet_desc_enc(desc, (uintptr_t)(iov[i].iov_base), + iov[i].iov_len, mss, header_length, offload_mode, + 0, 0, fcoe_encap, vlan_tag_insert, vlan_tag, loopback); + desc = (struct wq_enet_desc *) ((uintptr_t)wq->uwq_desc_ring + + (index<<4)); + index = (index+1) & wq->uwq_post_index_mask; + } + + wq_enet_desc_enc(desc, (uintptr_t)(iov[i].iov_base), + iov[i].iov_len, mss, header_length, offload_mode, + 1, cq_entry, fcoe_encap, vlan_tag_insert, vlan_tag, loopback); + + wmb(); + + iowrite32(index, &vwq->ctrl->posted_index); + + wq->uwq_next_desc = (struct wq_enet_desc *) + ((uintptr_t)wq->uwq_desc_ring + (index<<4)); + wq->uwq_post_index = (index+1) & wq->uwq_post_index_mask; + wq->uwq_send_credits -= count; + + return index; +} + +#endif /* _USD_POST_H_ */ diff --git a/prov/usnic/src/usnic_direct/usd_post_ud_pio_udp.c b/prov/usnic/src/usnic_direct/usd_post_ud_pio_udp.c new file mode 100644 index 00000000000..64f3cf9180a --- /dev/null +++ b/prov/usnic/src/usnic_direct/usd_post_ud_pio_udp.c @@ -0,0 +1,251 @@ +/* + * Copyright (c) 2014, Cisco Systems, Inc. All rights reserved. + * + * LICENSE_BEGIN + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * LICENSE_END + * + * + */ + +#include + +#include "usd.h" +#include "usd_post.h" + +static int +usd_post_send_one_ud_pio_udp( + struct usd_qp *uqp, + struct usd_dest *dest, + const void *buf, + size_t len, + uint32_t flags, + void *context) +{ + struct usd_qp_impl *qp; + struct usd_udp_hdr *hdr; + struct usd_wq *wq; + struct usd_wq_post_info *info; + struct vnic_wq *vwq; + uint32_t index; + struct wq_enet_desc *desc; + char *v_pkt; + uint64_t p_pkt; + uint64_t *s, *d; + uint32_t copylen; + uint8_t *copybuf; + + u_int8_t offload_mode = 0, eop = 1; + u_int16_t mss = 7, header_length = 0, vlan_tag = 0; + u_int8_t vlan_tag_insert = 0, loopback = 0, fcoe_encap = 0; + + qp = to_qpi(uqp); + wq = &qp->uq_wq; + + hdr = &dest->ds_dest.ds_udp.u_hdr; + + /* adjust lengths and insert source port */ + hdr->uh_ip.tot_len = htons(len + sizeof(struct usd_udp_hdr) - + sizeof(struct ether_header)); + hdr->uh_udp.len = htons((sizeof(struct usd_udp_hdr) - + sizeof(struct ether_header) - + sizeof(struct iphdr)) + len); + hdr->uh_udp.source = + qp->uq_attrs.uqa_local_addr.ul_addr.ul_udp.u_addr.sin_port; + + vwq = &wq->uwq_vnic_wq; + desc = wq->uwq_next_desc; + index = wq->uwq_post_index; + + v_pkt = wq->pio_v_pkt_buf + index * 256; + p_pkt = wq->pio_p_pkt_buf + index * 256; + copylen = (len + sizeof(*hdr) + 7) & ~7; +//printf("len = %lu, p_pkt = 0x%lx, index = %d\n", len, p_pkt, index); + d = (uint64_t *)v_pkt; + d[0] = ((uint64_t *)hdr)[0]; + d[1] = ((uint64_t *)hdr)[1]; + d[2] = ((uint64_t *)hdr)[2]; + d[3] = ((uint64_t *)hdr)[3]; + d[4] = ((uint64_t *)hdr)[4]; + + d += 5; + copybuf = wq->uwq_copybuf; + memcpy(copybuf + 2, buf, len); + s = (uint64_t *)copybuf; + + /* 40 bytes already copied */ + while (copylen > 40) { + *d++ = *s++; + copylen -= 8; + } + + /* encode in shadow ring and write 64 bytes */ + wq_enet_desc_enc(desc, (uintptr_t)p_pkt, len + sizeof(*hdr), + mss, header_length, offload_mode, + eop, USD_SF_ISSET(flags, SIGNAL), fcoe_encap, + vlan_tag_insert, vlan_tag, loopback); + + d = (uint64_t *)((uintptr_t)wq->pio_v_wq_addr + (uintptr_t)desc - + (uintptr_t)wq->uwq_desc_ring); + s = (uint64_t *)desc; + d[0] = s[0]; + d[1] = s[1]; + + wmb(); + +//printf("post %lu[%d] p=0x%lx\n", len + sizeof(*hdr), index, p_pkt); + iowrite32(index, &vwq->ctrl->posted_index); + + wq->uwq_next_desc = (struct wq_enet_desc *) + ((uintptr_t)wq->uwq_desc_ring + (index<<4)); + wq->uwq_post_index = (index+1) & wq->uwq_post_index_mask; + wq->uwq_send_credits--; + + info = &wq->uwq_post_info[index]; + info->wp_context = context; + info->wp_len = len; + + return 0; +} + +/* + * 2 WQEs - our header plus user header in 1st one, user packet in 2nd + */ +static int +usd_post_send_two_ud_pio_udp( + struct usd_qp *uqp, + struct usd_dest *dest, + const void *uhdr, + size_t uhdrlen, + const void *pkt, + size_t pktlen, + uint32_t flags, + void *context) +{ + struct usd_qp_impl *qp; + struct usd_udp_hdr *hdr; + struct usd_wq *wq; + struct usd_wq_post_info *info; + struct vnic_wq *vwq; + uint32_t 
index; + struct wq_enet_desc *desc; + char *v_pkt; + uint64_t p_pkt; + uint64_t *s, *d; + uint32_t copylen; + uint8_t *copybuf; + size_t len; + + u_int8_t offload_mode = 0, eop = 1; + u_int16_t mss = 7, header_length = 0, vlan_tag = 0; + u_int8_t vlan_tag_insert = 0, loopback = 0, fcoe_encap = 0; + + qp = to_qpi(uqp); + wq = &qp->uq_wq; + + hdr = &dest->ds_dest.ds_udp.u_hdr; + len = uhdrlen + pktlen; + + /* adjust lengths and insert source port */ + hdr->uh_ip.tot_len = htons(len + sizeof(struct usd_udp_hdr) - + sizeof(struct ether_header)); + hdr->uh_udp.len = htons((sizeof(struct usd_udp_hdr) - + sizeof(struct ether_header) - + sizeof(struct iphdr)) + len); + hdr->uh_udp.source = + qp->uq_attrs.uqa_local_addr.ul_addr.ul_udp.u_addr.sin_port; + + vwq = &wq->uwq_vnic_wq; + desc = wq->uwq_next_desc; + index = wq->uwq_post_index; + + v_pkt = wq->pio_v_pkt_buf + index * 256; + p_pkt = wq->pio_p_pkt_buf + index * 256; + copylen = (len + sizeof(*hdr) + 7) & ~7; +//printf("len = %lu, p_pkt = 0x%lx, index = %d\n", len, p_pkt, index); + d = (uint64_t *)v_pkt; + d[0] = ((uint64_t *)hdr)[0]; + d[1] = ((uint64_t *)hdr)[1]; + d[2] = ((uint64_t *)hdr)[2]; + d[3] = ((uint64_t *)hdr)[3]; + d[4] = ((uint64_t *)hdr)[4]; + + d += 5; + copybuf = wq->uwq_copybuf; + memcpy(copybuf + 2, uhdr, uhdrlen); + memcpy(copybuf + 2 + uhdrlen, pkt, pktlen); + s = (uint64_t *)copybuf; + + /* 40 bytes already copied */ + while (copylen > 40) { + *d++ = *s++; + copylen -= 8; + } + + /* encode in shadow ring and write 64 bytes */ + wq_enet_desc_enc(desc, (uintptr_t)p_pkt, len + sizeof(*hdr), + mss, header_length, offload_mode, + eop, USD_SF_ISSET(flags, SIGNAL), fcoe_encap, + vlan_tag_insert, vlan_tag, loopback); + + d = (uint64_t *)((uintptr_t)wq->pio_v_wq_addr + (uintptr_t)desc - + (uintptr_t)wq->uwq_desc_ring); + s = (uint64_t *)desc; + d[0] = s[0]; + d[1] = s[1]; + + wmb(); + +//printf("post %lu[%d] p=0x%lx\n", len + sizeof(*hdr), index, p_pkt); + iowrite32(index, &vwq->ctrl->posted_index); + + wq->uwq_next_desc = (struct wq_enet_desc *) + ((uintptr_t)wq->uwq_desc_ring + (index<<4)); + wq->uwq_post_index = (index+1) & wq->uwq_post_index_mask; + wq->uwq_send_credits--; + + info = &wq->uwq_post_info[index]; + info->wp_context = context; + info->wp_len = len; + + return 0; +} + +struct usd_qp_ops usd_qp_ops_ud_pio_udp = { + .qo_post_send_one = usd_post_send_one_ud_pio_udp, + .qo_post_send_one_prefixed = usd_post_send_one_ud_pio_udp, + .qo_post_send_one_copy = usd_post_send_one_ud_pio_udp, + .qo_post_send_two_copy = usd_post_send_two_ud_pio_udp, +}; diff --git a/prov/usnic/src/usnic_direct/usd_post_ud_raw.c b/prov/usnic/src/usnic_direct/usd_post_ud_raw.c new file mode 100644 index 00000000000..40a6e33ea7a --- /dev/null +++ b/prov/usnic/src/usnic_direct/usd_post_ud_raw.c @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2014, Cisco Systems, Inc. All rights reserved. + * + * LICENSE_BEGIN + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * LICENSE_END + * + * + */ + +#include "usd.h" +#include "usd_post.h" + +static int +usd_post_send_one_prefixed_ud_raw( + struct usd_qp *uqp, + struct usd_dest __attribute__ ((unused)) * dest, + const void *buf, + size_t len, + uint32_t flags, + void *context) +{ + struct usd_qp_impl *qp; + struct usd_wq *wq; + uint32_t last_post; + struct usd_wq_post_info *info; + + qp = to_qpi(uqp); + wq = &qp->uq_wq; + + last_post = + _usd_post_send_one(wq, buf, len, USD_SF_ISSET(flags, SIGNAL)); + + info = &wq->uwq_post_info[last_post]; + info->wp_context = context; + info->wp_len = len; + + return 0; +} + +struct usd_qp_ops usd_qp_ops_ud_raw = { + .qo_post_send_one_prefixed = usd_post_send_one_prefixed_ud_raw, +}; diff --git a/prov/usnic/src/usnic_direct/usd_post_ud_udp.c b/prov/usnic/src/usnic_direct/usd_post_ud_udp.c new file mode 100644 index 00000000000..c2511230fec --- /dev/null +++ b/prov/usnic/src/usnic_direct/usd_post_ud_udp.c @@ -0,0 +1,325 @@ +/* + * Copyright (c) 2014, Cisco Systems, Inc. All rights reserved. + * + * LICENSE_BEGIN + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * LICENSE_END + * + * + */ + +#include + +#include "usd.h" +#include "usd_post.h" + +static int +usd_post_send_one_ud_udp( + struct usd_qp *uqp, + struct usd_dest *dest, + const void *buf, + size_t len, + uint32_t flags, + void *context) +{ + struct usd_qp_impl *qp; + struct usd_udp_hdr *hdr; + struct usd_wq *wq; + uint32_t last_post; + uint8_t *copybuf; + struct usd_wq_post_info *info; + + qp = to_qpi(uqp); + wq = &qp->uq_wq; + copybuf = wq->uwq_copybuf + wq->uwq_post_index * USD_SEND_MAX_COPY; + + hdr = (struct usd_udp_hdr *)copybuf; + memcpy(hdr, &dest->ds_dest.ds_udp.u_hdr, sizeof(*hdr)); + + /* adjust lengths and insert source port */ + hdr->uh_ip.tot_len = htons(len + sizeof(struct usd_udp_hdr) - + sizeof(struct ether_header)); + hdr->uh_udp.len = htons((sizeof(struct usd_udp_hdr) - + sizeof(struct ether_header) - + sizeof(struct iphdr)) + len); + hdr->uh_udp.source = + qp->uq_attrs.uqa_local_addr.ul_addr.ul_udp.u_addr.sin_port; + + last_post = _usd_post_send_two(wq, hdr, sizeof(*hdr), buf, len, + USD_SF_ISSET(flags, SIGNAL)); + + info = &wq->uwq_post_info[last_post]; + info->wp_context = context; + info->wp_len = len; + + return 0; +} + +static int +usd_post_send_one_vlan_ud_udp( + struct usd_qp *uqp, + struct usd_dest *dest, + const void *buf, + size_t len, + uint16_t vlan, + uint32_t flags, + void *context) +{ + struct usd_qp_impl *qp; + struct usd_udp_hdr *hdr; + struct usd_wq *wq; + uint32_t last_post; + uint8_t *copybuf; + struct usd_wq_post_info *info; + + qp = to_qpi(uqp); + wq = &qp->uq_wq; + copybuf = wq->uwq_copybuf + wq->uwq_post_index * USD_SEND_MAX_COPY; + + hdr = (struct usd_udp_hdr *)copybuf; + memcpy(hdr, &dest->ds_dest.ds_udp.u_hdr, sizeof(*hdr)); + + /* adjust lengths and insert source port */ + hdr->uh_ip.tot_len = htons(len + sizeof(struct usd_udp_hdr) - + sizeof(struct ether_header)); + hdr->uh_udp.len = htons((sizeof(struct usd_udp_hdr) - + sizeof(struct ether_header) - + sizeof(struct iphdr)) + len); + hdr->uh_udp.source = + qp->uq_attrs.uqa_local_addr.ul_addr.ul_udp.u_addr.sin_port; + + last_post = _usd_post_send_two_vlan(wq, hdr, sizeof(*hdr), buf, len, + USD_SF_ISSET(flags, SIGNAL), vlan); + + info = &wq->uwq_post_info[last_post]; + info->wp_context = context; + info->wp_len = len; + + return 0; +} + +static int +usd_post_send_one_copy_ud_udp( + struct usd_qp *uqp, + struct usd_dest *dest, + const void *buf, + size_t len, + uint32_t flags, + void *context) +{ + struct usd_qp_impl *qp; + struct usd_udp_hdr *hdr; + struct usd_wq *wq; + uint8_t *copybuf; + uint32_t last_post; + struct usd_wq_post_info *info; + + qp = to_qpi(uqp); + wq = &qp->uq_wq; + copybuf = wq->uwq_copybuf + wq->uwq_post_index * USD_SEND_MAX_COPY; + + hdr = (struct usd_udp_hdr *) copybuf; + memcpy(hdr, &dest->ds_dest.ds_udp.u_hdr, sizeof(*hdr)); + memcpy(hdr + 1, buf, len); + + /* adjust lengths and insert source port */ + hdr->uh_ip.tot_len = htons(len + sizeof(struct usd_udp_hdr) - + sizeof(struct ether_header)); + hdr->uh_udp.len = 
htons((sizeof(struct usd_udp_hdr) - + sizeof(struct ether_header) - + sizeof(struct iphdr)) + len); + hdr->uh_udp.source = + qp->uq_attrs.uqa_local_addr.ul_addr.ul_udp.u_addr.sin_port; + + last_post = + _usd_post_send_one(wq, hdr, len + sizeof(struct usd_udp_hdr), + USD_SF_ISSET(flags, SIGNAL)); + + info = &wq->uwq_post_info[last_post]; + info->wp_context = context; + info->wp_len = len; + + return 0; +} + +static int +usd_post_send_one_prefixed_ud_udp( + struct usd_qp *uqp, + struct usd_dest *dest, + const void *buf, + size_t len, + uint32_t flags, + void *context) +{ + struct usd_qp_impl *qp; + struct usd_udp_hdr *hdr; + struct usd_wq *wq; + uint32_t last_post; + struct usd_wq_post_info *info; + + qp = to_qpi(uqp); + wq = &qp->uq_wq; + + hdr = (struct usd_udp_hdr *) buf - 1; + memcpy(hdr, &dest->ds_dest.ds_udp.u_hdr, sizeof(*hdr)); + + /* adjust lengths and insert source port */ + hdr->uh_ip.tot_len = htons(len + sizeof(struct usd_udp_hdr) - + sizeof(struct ether_header)); + hdr->uh_udp.len = htons((sizeof(struct usd_udp_hdr) - + sizeof(struct ether_header) - + sizeof(struct iphdr)) + len); + hdr->uh_udp.source = + qp->uq_attrs.uqa_local_addr.ul_addr.ul_udp.u_addr.sin_port; + + last_post = + _usd_post_send_one(wq, hdr, len + sizeof(struct usd_udp_hdr), + USD_SF_ISSET(flags, SIGNAL)); + + info = &wq->uwq_post_info[last_post]; + info->wp_context = context; + info->wp_len = len; + + return 0; +} + +/* + * 2 WQEs - our header plus user header in 1st one, user packet in 2nd + */ +static int +usd_post_send_two_copy_ud_udp( + struct usd_qp *uqp, + struct usd_dest *dest, + const void *uhdr, + size_t uhdrlen, + const void *pkt, + size_t pktlen, + uint32_t flags, + void *context) +{ + struct usd_qp_impl *qp; + struct usd_udp_hdr *hdr; + struct usd_wq *wq; + uint8_t *copybuf; + size_t tot_ulen; + uint32_t last_post; + struct usd_wq_post_info *info; + + qp = to_qpi(uqp); + wq = &qp->uq_wq; + copybuf = wq->uwq_copybuf + wq->uwq_post_index * USD_SEND_MAX_COPY; + + hdr = (struct usd_udp_hdr *) copybuf; + memcpy(hdr, &dest->ds_dest.ds_udp.u_hdr, sizeof(*hdr)); + memcpy(hdr + 1, uhdr, uhdrlen); + memcpy((char *) (hdr + 1) + uhdrlen, pkt, pktlen); + + /* adjust lengths and insert source port */ + tot_ulen = uhdrlen + pktlen; + hdr->uh_ip.tot_len = htons(tot_ulen + sizeof(struct usd_udp_hdr) - + sizeof(struct ether_header)); + hdr->uh_udp.len = htons((sizeof(struct usd_udp_hdr) - + sizeof(struct ether_header) - + sizeof(struct iphdr)) + tot_ulen); + hdr->uh_udp.source = + qp->uq_attrs.uqa_local_addr.ul_addr.ul_udp.u_addr.sin_port; + + last_post = + _usd_post_send_one(wq, hdr, uhdrlen + sizeof(*hdr) + pktlen, + USD_SF_ISSET(flags, SIGNAL)); + + info = &wq->uwq_post_info[last_post]; + info->wp_context = context; + info->wp_len = uhdrlen + pktlen; + + return 0; +} + +static int +usd_post_send_iov_ud_udp(struct usd_qp *uqp, + struct usd_dest *dest, const struct iovec* iov, + size_t iov_count, uint32_t flags, void *context) +{ + struct usd_qp_impl *qp; + struct usd_udp_hdr *hdr; + struct usd_wq *wq; + uint32_t last_post; + uint8_t *copybuf; + struct usd_wq_post_info *info; + struct iovec send_iov[USD_SEND_MAX_SGE + 1]; + size_t len; + unsigned i; + + qp = to_qpi(uqp); + wq = &qp->uq_wq; + copybuf = wq->uwq_copybuf + wq->uwq_post_index * USD_SEND_MAX_COPY; + + for (i = 0, len = 0; i < iov_count; i++) { + len += iov[i].iov_len; + } + + hdr = (struct usd_udp_hdr *)copybuf; + memcpy(hdr, &dest->ds_dest.ds_udp.u_hdr, sizeof(*hdr)); + + /* adjust lengths and insert source port */ + hdr->uh_ip.tot_len = htons(len + 
sizeof(struct usd_udp_hdr) - + sizeof(struct ether_header)); + hdr->uh_udp.len = htons((sizeof(struct usd_udp_hdr) - + sizeof(struct ether_header) - + sizeof(struct iphdr)) + len); + hdr->uh_udp.source = + qp->uq_attrs.uqa_local_addr.ul_addr.ul_udp.u_addr.sin_port; + + send_iov[0].iov_base = hdr; + send_iov[0].iov_len = sizeof(*hdr); + memcpy(&send_iov[1], iov, sizeof(struct iovec) * iov_count); + + last_post = _usd_post_send_iov(wq, send_iov, iov_count + 1, + USD_SF_ISSET(flags, SIGNAL)); + info = &wq->uwq_post_info[last_post]; + info->wp_context = context; + info->wp_len = len; + + return 0; +} + +struct usd_qp_ops usd_qp_ops_ud_udp = { + .qo_post_send_one = usd_post_send_one_ud_udp, + .qo_post_send_one_prefixed = usd_post_send_one_prefixed_ud_udp, + .qo_post_send_one_copy = usd_post_send_one_copy_ud_udp, + .qo_post_send_two_copy = usd_post_send_two_copy_ud_udp, + .qo_post_send_iov = usd_post_send_iov_ud_udp, + .qo_post_send_one_vlan = usd_post_send_one_vlan_ud_udp, +}; diff --git a/prov/usnic/src/usnic_direct/usd_queue.h b/prov/usnic/src/usnic_direct/usd_queue.h new file mode 100644 index 00000000000..cb912da2638 --- /dev/null +++ b/prov/usnic/src/usnic_direct/usd_queue.h @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2014, Cisco Systems, Inc. All rights reserved. + * + * LICENSE_BEGIN + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * LICENSE_END + * + * + */ + +#ifndef _USD_QUEUE_H_ +#define _USD_QUEUE_H_ + +#include <sys/queue.h> + +#define TAILQ_FOREACH_SAFE(var, tmpvar, head, field) \ + for ((var) = ((head)->tqh_first), \ + (tmpvar) = (var)?((var)->field.tqe_next):NULL; \ + (var); \ + (var) = (tmpvar), \ + (tmpvar) = (var)?((var)->field.tqe_next):NULL) + +#endif /* _USD_QUEUE_H_ */ diff --git a/prov/usnic/src/usnic_direct/usd_queues.c b/prov/usnic/src/usnic_direct/usd_queues.c new file mode 100644 index 00000000000..9ceabef7838 --- /dev/null +++ b/prov/usnic/src/usnic_direct/usd_queues.c @@ -0,0 +1,1370 @@ +/* + * Copyright (c) 2014, Cisco Systems, Inc. All rights reserved. + * + * LICENSE_BEGIN + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE.
+ * + * LICENSE_END + * + * + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "kcompat.h" +#include "cq_enet_desc.h" +#include "wq_enet_desc.h" +#include "rq_enet_desc.h" + +#include "usnic_abi.h" +#include "usnic_direct.h" +#include "usd.h" +#include "usd_ib_cmd.h" +#include "usd_util.h" +#include "usd_vnic.h" +#include "usd_device.h" + +static int usd_create_qp_ud(struct usd_qp_impl *qp); + +/* + * Remove a usecount on a VF, free it if it goes to zero + */ +static void +usd_unmap_vf( + struct usd_device *dev, + struct usd_vf *vf) +{ + uint32_t i; + --vf->vf_refcnt; + + if (vf->vf_refcnt == 0) { + + /* unlink from list (logic works for uninit struct also) */ + if (vf->vf_next != NULL) + vf->vf_next->vf_prev = vf->vf_prev; + if (vf->vf_prev != NULL) + vf->vf_prev->vf_next = vf->vf_next; + if (dev->ud_vf_list == vf) + dev->ud_vf_list = vf->vf_next; + + if (vf->vf_vdev != NULL) + vnic_dev_unregister(vf->vf_vdev); + if (vf->vf_bar0.vaddr != MAP_FAILED) { + munmap(vf->vf_bar0.vaddr, vf->vf_bar_map_len); + } + for (i = 0; i < sizeof(vf->iomaps)/sizeof(vf->iomaps[0]); i++) { + if (vf->iomaps[i].bus_addr != 0 && + vf->iomaps[i].vaddr != MAP_FAILED) { + munmap(vf->iomaps[i].vaddr, vf->iomaps[i].len); + } + } + + free(vf); + } +} + +static int +usd_map_one_res(struct usd_device *dev, struct usd_vf *vf, + struct usnic_vnic_barres_info *barres) +{ + struct vnic_dev_iomap_info* iomap; + off64_t offset; + uint64_t page_size = sysconf(_SC_PAGE_SIZE); + + iomap = &vf->iomaps[barres->type]; + iomap->bus_addr = barres->bus_addr; + iomap->len = (barres->len + (page_size - 1)) & (~(page_size - 1)); + + offset = USNIC_ENCODE_PGOFF(vf->vf_id, USNIC_MMAP_RES, barres->type); + iomap->vaddr = mmap64(NULL, iomap->len, PROT_READ + PROT_WRITE, + MAP_SHARED, dev->ud_ctx->ucx_ib_dev_fd, offset); + if (iomap->vaddr == MAP_FAILED) { + usd_err("Failed to map res type %d, bus_addr 0x%lx, len 0x%lx\n", + barres->type, iomap->bus_addr, iomap->len); + return -errno; + } + vnic_dev_upd_res_vaddr(vf->vf_vdev, iomap); + + return 0; +} + +static int +usd_map_vnic_res(struct usd_device *dev, struct usd_vf *vf, + struct usd_vf_info *vfip) +{ + int i, err; + + /* unmap bar0 */ + if (vf->vf_bar0.vaddr != MAP_FAILED) { + munmap(vf->vf_bar0.vaddr, vf->vf_bar_map_len); + vf->vf_bar0.vaddr = MAP_FAILED; + } + + for (i = RES_TYPE_EOL + 1; i < RES_TYPE_MAX; i++) { + if (vfip->barres[i].bus_addr != 0) { + err = usd_map_one_res(dev, vf, &vfip->barres[i]); + if (err) + return err; + } else { + /* Disable any other res not reported by kernel module */ + struct vnic_dev_iomap_info iomap; + iomap.vaddr = 0; + iomap.bus_addr = vnic_dev_get_res_bus_addr( + vf->vf_vdev, i, 0); + iomap.len = vnic_dev_get_res_type_len( + vf->vf_vdev, i); + vnic_dev_upd_res_vaddr(vf->vf_vdev, &iomap); + } + } + + return 0; +} + +/* + * Create a VF structure if we don't already have one in use, + * update refcnt + */ +static int +usd_map_vf( + struct usd_device *dev, + struct usd_vf_info *vfip, + struct usd_vf **vf_o) +{ + struct usd_vf *vf; + off64_t offset; + int ret; + + /* find matching VF */ + vf = dev->ud_vf_list; + while (vf != NULL) { + if (vf->vf_id == vfip->vi_vfid) break; + vf = vf->vf_next; + } + + /* Was VF actually found? 
If not, create and add */ + if (vf == NULL) { + vf = calloc(sizeof(*vf), 1); + if (vf == NULL) { + ret = -errno; + goto out; + } + + /* Fill in function */ + vf->vf_id = vfip->vi_vfid; + vf->vf_refcnt = 1; + vf->vf_bar0.bus_addr = vfip->vi_bar_bus_addr; + vf->vf_bar0.len = vfip->vi_bar_len; + + /* map BAR0 HEAD first to get res info */ + if (vfip->vi_barhead_len > 0) { + offset = USNIC_ENCODE_PGOFF(vf->vf_id, USNIC_MMAP_BARHEAD, 0); + vf->vf_bar_map_len = vfip->vi_barhead_len; + } else { + offset = USNIC_ENCODE_PGOFF(vf->vf_id, USNIC_MMAP_BAR, 0); + vf->vf_bar_map_len = vfip->vi_bar_len; + } + vf->vf_bar0.vaddr = mmap64(NULL, vf->vf_bar_map_len, + PROT_READ + PROT_WRITE, MAP_SHARED, + dev->ud_ctx->ucx_ib_dev_fd, + offset); + if (vf->vf_bar0.vaddr == MAP_FAILED) { + usd_err("Failed to map bar0\n"); + ret = -errno; + goto out; + } + + /* Register it */ + vf->vf_vdev = vnic_dev_alloc_discover(NULL, NULL, (void *)dev, + &vf->vf_bar0, 1); + if (vf->vf_vdev == NULL) { + ret = -ENOENT; + goto out; + } + + /* map individual vnic resource seperately */ + if (dev->ud_ctx->ucx_caps[USNIC_CAP_MAP_PER_RES] > 0) { + ret = usd_map_vnic_res(dev, vf, vfip); + if (ret) + goto out; + } + + /* link it in */ + vf->vf_next = dev->ud_vf_list; + dev->ud_vf_list = vf; + + if (vf->vf_next != NULL) + vf->vf_next->vf_prev = vf; + vf->vf_prev = NULL; + + /* Found existing VF, bump reference count */ + } else { + ++vf->vf_refcnt; + } + + *vf_o = vf; + + return 0; + + out: + if (vf != NULL) + usd_unmap_vf(dev, vf); + return ret; +} + +static void +usd_get_vf( + struct usd_vf *vf) +{ + ++vf->vf_refcnt; +} + +/* + * Get a cq interrupt source + */ +static struct usd_cq_comp_intr * +usd_get_cq_intr( + struct usd_cq_impl *cq, + struct usd_vf *vf) +{ + struct usd_context *uctx; + struct usd_cq_comp_intr *intr; + int ret; + + uctx = cq->ucq_dev->ud_ctx; + + pthread_mutex_lock(&uctx->ucx_mutex); + LIST_FOREACH(intr, &uctx->ucx_intr_list, uci_ctx_link) { + if (intr->uci_offset == cq->intr_offset) { + intr->uci_refcnt ++; + goto out; + } + } + + intr = calloc(sizeof(*intr), 1); + if (intr != NULL) { + ret = vnic_grpmbrintr_alloc(vf->vf_vdev, &intr->uci_vintr, + cq->intr_offset); + if (ret) { + usd_err("Failed to alloc cq completion intr\n"); + free(intr); + pthread_mutex_unlock(&uctx->ucx_mutex); + return NULL; + } + + /* init host interrupt registers */ + iowrite32(0, &intr->uci_vintr.ctrl->coalescing_timer); + iowrite32(0, &intr->uci_vintr.ctrl->coalescing_type); + iowrite32(1, &intr->uci_vintr.ctrl->mask_on_assertion); + iowrite32(0, &intr->uci_vintr.ctrl->int_credits); + iowrite32(0, &intr->uci_vintr.ctrl->mask); /* unmask */ + + intr->uci_offset = cq->intr_offset; + intr->uci_refcnt = 1; + LIST_INSERT_HEAD(&uctx->ucx_intr_list, intr, uci_ctx_link); + } + +out: + pthread_mutex_unlock(&uctx->ucx_mutex); + return intr; +} + +/* + * put a cq interrupt source + */ +static void +usd_put_cq_intr( + struct usd_cq_impl *cq) +{ + struct usd_context *uctx; + struct usd_cq_comp_intr *intr; + + uctx = cq->ucq_dev->ud_ctx; + + pthread_mutex_lock(&uctx->ucx_mutex); + LIST_FOREACH(intr, &uctx->ucx_intr_list, uci_ctx_link) { + if (intr->uci_offset == cq->intr_offset) { + intr->uci_refcnt--; + if (intr->uci_refcnt == 0) + vnic_grpmbrintr_free(&intr->uci_vintr); + break; + } + } + + if (intr != NULL) { + LIST_REMOVE(intr, uci_ctx_link); + free(intr); + } + pthread_mutex_unlock(&uctx->ucx_mutex); +} + + + +/* + * Function that does whatever is needed to make a CQ go away + */ +int +usd_destroy_cq( + struct usd_cq *ucq) +{ + struct usd_cq_impl 
*cq; + + cq = to_cqi(ucq); + + if (cq->ucq_intr != NULL) { + usd_put_cq_intr(cq); + cq->ucq_intr = NULL; + } + if (cq->ucq_state & USD_QS_VERBS_CREATED) + usd_ib_cmd_destroy_cq(cq->ucq_dev, cq); + + if (cq->ucq_state & USD_QS_VF_MAPPED) + usd_unmap_vf(cq->ucq_dev, cq->ucq_vf); + + if (cq->ucq_desc_ring != NULL) + usd_free_mr(cq->ucq_desc_ring); + if (cq->ucq_rq_map != NULL) + free(cq->ucq_rq_map); + if (cq->ucq_wq_map != NULL) + free(cq->ucq_wq_map); + free(cq); + + return 0; +} + +static int +usd_vnic_wq_init( + struct usd_wq *wq, + struct usd_vf *vf, + uint64_t desc_ring) +{ + struct vnic_wq *vwq; + int ret; + + vwq = &wq->uwq_vnic_wq; + + /* get address of control register */ + vwq->ctrl = vnic_dev_get_res(vf->vf_vdev, RES_TYPE_WQ, wq->uwq_index); + if (vwq->ctrl == NULL) + return -EINVAL; + + ret = vnic_wq_disable(vwq); + if (ret != 0) + return ret; + + writeq(desc_ring, &vwq->ctrl->ring_base); + iowrite32(wq->uwq_num_entries, &vwq->ctrl->ring_size); + iowrite32(0, &vwq->ctrl->fetch_index); + iowrite32(0, &vwq->ctrl->posted_index); + iowrite32(wq->uwq_cq->ucq_index, &vwq->ctrl->cq_index); + iowrite32(0, &vwq->ctrl->error_interrupt_enable); + iowrite32(0, &vwq->ctrl->error_interrupt_offset); + iowrite32(0, &vwq->ctrl->error_status); + + wq->uwq_state |= USD_QS_VNIC_INITIALIZED; + wq->uwq_next_desc = wq->uwq_desc_ring; + wq->uwq_send_credits = wq->uwq_num_entries - 1; + + return 0; +} + +/* + * Allocate the resources for a previously created WQ for UD QP + */ +static int +usd_create_wq_ud( + struct usd_qp_impl *qp) +{ + struct usd_wq *wq; + uint32_t ring_size; + int ret; + + wq = &qp->uq_wq; + + /* Allocate resources for WQ */ + ring_size = sizeof(struct wq_enet_desc) * wq->uwq_num_entries; + ret = usd_alloc_mr(qp->uq_dev, ring_size, (void **)&wq->uwq_desc_ring); + if (ret != 0) + return ret; + + ret = usd_vnic_wq_init(wq, qp->uq_vf, (uint64_t)wq->uwq_desc_ring); + if (ret != 0) + goto out; + + return 0; + +out: + if (wq->uwq_desc_ring != NULL) { + usd_free_mr(wq->uwq_desc_ring); + wq->uwq_desc_ring = NULL; + } + return ret; +} + +/* + * Allocate the resources for a previously created WQ + */ +static int +usd_create_wq_pio( + struct usd_qp_impl *qp) +{ + uint32_t pio_memsize; + uint32_t used_size; + uint32_t ring_size; + void *pio_vaddr; + uint64_t pio_paddr; + uint64_t ivaddr; + struct usd_wq *wq; + struct usd_device *dev; + int ret; + + dev = qp->uq_dev; + if (dev->ud_ctx->ucx_caps[USNIC_CAP_PIO] == 0 || + vnic_dev_get_res_bus_addr(qp->uq_vf->vf_vdev, RES_TYPE_MEM, 0) == 0) { + usd_err("dev does not support PIO\n"); + return -ENODEV; + } + + pio_memsize = vnic_dev_get_res_count(qp->uq_vf->vf_vdev, RES_TYPE_MEM); + pio_vaddr = vnic_dev_get_res(qp->uq_vf->vf_vdev, RES_TYPE_MEM, 0); + + ret = usd_get_piopa(qp); + if (ret != 0) + return ret; + pio_paddr = qp->uq_attrs.uqa_pio_paddr; + + /* 512-byte alignment must match */ + if ((((uint64_t)pio_vaddr ^ pio_paddr) & 511) != 0) { + fprintf(stderr, "Alignment mismatch, %p vs 0x%lx, cannot do PIO\n", + pio_vaddr, pio_paddr); + return -ENXIO; + } + + /* skip past size */ + ivaddr = (uintptr_t)pio_vaddr; + ivaddr += sizeof(uint64_t); + + /* round up to 512 bytes */ + ivaddr = (ivaddr + 511) & ~511; + + /* WQ ring goes first. 
Allow space for 64-byte write of last desc */ + wq = &qp->uq_wq; + ring_size = wq->uwq_num_entries * sizeof(struct wq_enet_desc); + ring_size += 64 - sizeof(struct wq_enet_desc); + wq->pio_v_wq_addr = (void *)ivaddr; + wq->pio_p_wq_addr = pio_paddr + ivaddr - (uint64_t)pio_vaddr; + ivaddr += ring_size; + + /* round up to 64 bytes */ + ivaddr = (ivaddr + 63) & ~63; + + /* we keep a copy of the ring, also */ + ret = usd_alloc_mr(qp->uq_dev, ring_size, (void **)&wq->uwq_desc_ring); + if (ret != 0) + return ret; + + /* packet buffer */ + wq->pio_v_pkt_buf = (void *)ivaddr; + wq->pio_p_pkt_buf = pio_paddr + ivaddr - (uint64_t)pio_vaddr; + ivaddr += ((uint64_t) wq->uwq_num_entries) * 256; + + used_size = ivaddr - (uintptr_t)pio_vaddr; + if (used_size > pio_memsize) { + ret = -ENOMEM; + goto out; + } + + ret = usd_vnic_wq_init(wq, qp->uq_vf, wq->pio_p_wq_addr); + if (ret != 0) + goto out; + + return 0; + +out: + if (wq->uwq_desc_ring != NULL) { + usd_free_mr(wq->uwq_desc_ring); + wq->uwq_desc_ring = NULL; + } + return ret; +} + +/* + * Allocate the resources for a previously created WQ + */ +static int +usd_create_wq( + struct usd_qp_impl *qp) +{ + struct usd_wq *wq; + int ret; + + switch (qp->uq_attrs.uqa_qtype) { + case USD_QTY_UD_PIO: + ret = usd_create_wq_pio(qp); + break; + case USD_QTY_UD: + ret = usd_create_wq_ud(qp); + break; + default: + ret = -1; + break; + } + + if (ret == 0) { + wq = &qp->uq_wq; + wq->uwq_post_index_mask = (wq->uwq_num_entries-1); + wq->uwq_post_index = 1; + wq->uwq_last_comp = (wq->uwq_num_entries-1); + } + + return ret; +} + +static int +usd_vnic_rq_init( + struct usd_rq *rq, + struct usd_vf *vf, + uint64_t desc_ring) +{ + struct vnic_rq *vrq; + int ret; + + vrq = &rq->urq_vnic_rq; + + /* get address of control register */ + vrq->ctrl = vnic_dev_get_res(vf->vf_vdev, RES_TYPE_RQ, rq->urq_index); + if (vrq->ctrl == NULL) + return -EINVAL; + + ret = vnic_rq_disable(vrq); + if (ret != 0) + return ret; + + writeq(desc_ring, &vrq->ctrl->ring_base); + iowrite32(rq->urq_num_entries, &vrq->ctrl->ring_size); + iowrite32(0, &vrq->ctrl->fetch_index); + iowrite32(0, &vrq->ctrl->posted_index); + iowrite32(rq->urq_cq->ucq_index, &vrq->ctrl->cq_index); + iowrite32(0, &vrq->ctrl->error_interrupt_enable); + iowrite32(0, &vrq->ctrl->error_interrupt_offset); + iowrite32(0, &vrq->ctrl->error_status); + + rq->urq_state |= USD_QS_VNIC_INITIALIZED; + rq->urq_next_desc = rq->urq_desc_ring; + rq->urq_recv_credits = rq->urq_num_entries - 1; + + return 0; +} + +/* + * Allocate the resources for a previously created RQ + */ +static int +usd_create_rq(struct usd_qp_impl *qp) +{ + struct usd_rq *rq; + uint32_t ring_size; + int ret; + + rq = &qp->uq_rq; + + /* Allocate resources for RQ */ + ring_size = sizeof(struct rq_enet_desc) * rq->urq_num_entries; + ret = usd_alloc_mr(qp->uq_dev, ring_size, (void **)&rq->urq_desc_ring); + if (ret != 0) + return ret; + + ret = usd_vnic_rq_init(rq, qp->uq_vf, (uint64_t)rq->urq_desc_ring); + if (ret != 0) + goto out; + + rq->urq_post_index_mask = (rq->urq_num_entries-1); + rq->urq_post_index = 0; + rq->urq_last_comp = (rq->urq_num_entries-1); + + return 0; +out: + if (rq->urq_desc_ring != NULL) { + usd_free_mr(rq->urq_desc_ring); + rq->urq_desc_ring = NULL; + } + return ret; +} + +static int +usd_vnic_disable_qp( + struct usd_qp_impl *qp) +{ + struct usd_rq *rq; + struct usd_wq *wq; + int ret; + + wq = &qp->uq_wq; + rq = &qp->uq_rq; + + /* disable both queues */ + ret = vnic_wq_disable(&wq->uwq_vnic_wq); + if (ret != 0) + return ret; + ret = 
vnic_rq_disable(&rq->urq_vnic_rq); + + return ret; +} + +static void +usd_vnic_enable_qp( + struct usd_qp_impl *qp) +{ + struct usd_rq *rq; + struct usd_wq *wq; + + wq = &qp->uq_wq; + rq = &qp->uq_rq; + + vnic_rq_enable(&rq->urq_vnic_rq); + vnic_wq_enable(&wq->uwq_vnic_wq); +} + +/* + * QP has been created and resources allocated. Issue the IB commands to + * change the state to INIT/RTR/RTS to trigger filter creation and enable the + * QP to send and receive traffic. + */ +static int +usd_enable_verbs_qp( + struct usd_qp_impl *qp) +{ + struct usd_rq *rq; + struct usd_wq *wq; + struct usd_device *dev; + int ret; + + dev = qp->uq_dev; + wq = &qp->uq_wq; + rq = &qp->uq_rq; + + /* XXX is this really necessary? */ + ret = usd_vnic_disable_qp(qp); + if (ret != 0) { + goto out; + } + + /* state to INIT */ + ret = usd_ib_cmd_modify_qp(dev, qp, IBV_QPS_INIT); + if (ret != 0) { + goto out; + } + + /* state to "ready to receive," enable rq */ + ret = usd_ib_cmd_modify_qp(dev, qp, IBV_QPS_RTR); + if (ret != 0) { + goto out; + } + + /* state to "ready to send," enable wq */ + ret = usd_ib_cmd_modify_qp(dev, qp, IBV_QPS_RTS); + if (ret != 0) { + goto out; + } + + usd_vnic_enable_qp(qp); + rq->urq_state |= USD_QS_READY; + wq->uwq_state |= USD_QS_READY; + + out: + return ret; +} + +/* + * Public interface to disable a QP + */ +int +usd_disable_qp( + struct usd_qp *uqp) +{ + struct usd_qp_impl *qp; + + qp = to_qpi(uqp); + usd_vnic_disable_qp(qp); + return 0; +} + +/* + * Public interface to enable a QP + */ +int +usd_enable_qp( + struct usd_qp *uqp) +{ + struct usd_qp_impl *qp; + + qp = to_qpi(uqp); + usd_vnic_enable_qp(qp); + return 0; +} + +/* + * Public interface to create a CQ + * First, issue the verbs command to create a CW instance in the driver. + * Second, allocate the data structures so that poll_cq can succeed, though + * we will not actually have VIC resources allocated until the first create_qp + * that uses this CQ. We will finish configuring the CQ at that time. 
+ */ +int +usd_create_cq( + struct usd_device *dev, + struct usd_cq_init_attr *init_attr, + struct usd_cq **cq_o) +{ + unsigned num_entries; + int comp_vec; + unsigned qp_per_vf; + struct usd_cq *ucq; + struct usd_cq_impl *cq; + unsigned ring_size; + int ret; + + if (init_attr == NULL) + return -EINVAL; + + num_entries = init_attr->num_entries; + comp_vec = init_attr->comp_vec; + + /* Make sure device ready */ + ret = usd_device_ready(dev); + if (ret != 0) { + return ret; + } + + if (num_entries > dev->ud_attrs.uda_max_cqe) { + return -EINVAL; + } + + if (init_attr->comp_fd != -1) { + if (dev->ud_ctx->ucx_caps[USD_CAP_GRP_INTR] == 0) { + usd_err("CQ completion event is not supported\n"); + return -EINVAL; + } + if (comp_vec >= (int)dev->ud_attrs.uda_num_comp_vectors) { + usd_err("too large comp_vec (%d) requested, num_comp_vectors=%d\n", + comp_vec, (int)dev->ud_attrs.uda_num_comp_vectors); + return -EINVAL; + } + } + + cq = (struct usd_cq_impl *)calloc(sizeof(*cq), 1); + if (cq == NULL) { + ret = -errno; + goto out; + } + + qp_per_vf = dev->ud_attrs.uda_qp_per_vf; + + cq->ucq_wq_map = calloc(qp_per_vf, sizeof(struct usd_wq *)); + cq->ucq_rq_map = calloc(qp_per_vf, sizeof(struct usd_rq *)); + if (cq->ucq_wq_map == NULL || cq->ucq_rq_map == NULL) { + ret = -ENOMEM; + goto out; + } + + cq->ucq_dev = dev; + + /* add 1 and round up to next POW2 and min() with 64 */ + num_entries = 1 << msbit(num_entries); + if (num_entries < 64) { + num_entries = 64; + } + + cq->ucq_num_entries = num_entries; + + ring_size = sizeof(struct cq_desc) * num_entries; + ret = usd_alloc_mr(dev, ring_size, &cq->ucq_desc_ring); + if (ret != 0) + goto out; + memset(cq->ucq_desc_ring, 0, ring_size); + + /* + * kernel currently has no support for handling negative comp_vec values, + * just use 0 which is guaranteed to be available + */ + if (comp_vec < 0) + comp_vec = 0; + + ret = usd_ib_cmd_create_cq(dev, cq, init_attr->ibv_cq, init_attr->comp_fd, + comp_vec); + if (ret != 0) + goto out; + + cq->ucq_state |= USD_QS_VERBS_CREATED; + + /* initialize polling variables */ + cq->ucq_cqe_mask = num_entries - 1; + cq->ucq_color_shift = msbit(num_entries) - 1; + cq->comp_fd = init_attr->comp_fd; + cq->comp_vec = comp_vec; + cq->comp_req_notify = init_attr->comp_req_notify; + + ucq = to_usdcq(cq); + ucq->ucq_num_entries = num_entries - 1; + *cq_o = to_usdcq(cq); + return 0; + +out: + if (cq != NULL) { + usd_destroy_cq(to_usdcq(cq)); + } + return ret; +} + +/* + * Finish CQ creation after first QP has been created. Associate a vf + * and configure the CQ on the VIC. It's OK if CQ is already configured, but + * VFs must match. 
+ */ +static int +usd_finish_create_cq( + struct usd_cq_impl *cq, + struct usd_vf *vf) +{ + struct vnic_cq *vcq; + + if (cq->ucq_state & USD_QS_VNIC_INITIALIZED) { + if (cq->ucq_vf == vf) { + return 0; + } else { + usd_err("Cannot share CQ across VFs\n"); + return -EINVAL; + } + } + + vcq = &cq->ucq_vnic_cq; + vcq->index = cq->ucq_index; + vcq->vdev = vf->vf_vdev; + + vcq->ctrl = vnic_dev_get_res(vcq->vdev, RES_TYPE_CQ, vcq->index); + if (vcq->ctrl == NULL) + return -EINVAL; + + cq->ucq_vf = vf; + usd_get_vf(vf); /* bump the reference count */ + cq->ucq_state |= USD_QS_VF_MAPPED; + + /* + * Tell the VIC about this CQ + */ + { + unsigned int cq_flow_control_enable = 0; + unsigned int cq_color_enable = 1; + unsigned int cq_head = 0; + unsigned int cq_tail = 0; + unsigned int cq_tail_color = 1; + unsigned int cq_entry_enable = 1; + unsigned int cq_msg_enable = 0; + unsigned int cq_intr_enable = 0; + unsigned int cq_intr_offset = 0; + uint64_t cq_msg_addr = 0; + + if (cq->comp_fd != -1) { + cq->ucq_intr = usd_get_cq_intr(cq, vf); + if (cq->ucq_intr == NULL) { + usd_err("Failed to alloc cq completion intr\n"); + return -ENOMEM; + } else { + cq_intr_enable = 1; + cq_intr_offset = cq->intr_offset; + } + } + + cq->ucq_vnic_cq.ring.base_addr = (uintptr_t)cq->ucq_desc_ring; + cq->ucq_vnic_cq.ring.desc_count = cq->ucq_num_entries; + + vnic_cq_init(&cq->ucq_vnic_cq, cq_flow_control_enable, + cq_color_enable, cq_head, cq_tail, cq_tail_color, + cq_intr_enable, cq_entry_enable, cq_msg_enable, + cq_intr_offset, cq_msg_addr); + } + cq->ucq_state |= USD_QS_VNIC_INITIALIZED; + + return 0; +} + +/* + * Fill in ops field for QP + */ +static int +usd_qp_get_ops( + struct usd_qp_impl *qp) +{ + int tt; + +#define USD_TT(TY,TR) ((TY)<<16|(TR)) + tt = USD_TT(qp->uq_attrs.uqa_transport, qp->uq_attrs.uqa_qtype); + + switch (tt) { + case USD_TT(USD_QTR_UDP, USD_QTY_UD): + qp->uq_qp.uq_ops = usd_qp_ops_ud_udp; + break; + case USD_TT(USD_QTR_UDP, USD_QTY_UD_PIO): + qp->uq_qp.uq_ops = usd_qp_ops_ud_pio_udp; + break; + case USD_TT(USD_QTR_RAW, USD_QTY_UD): + qp->uq_qp.uq_ops = usd_qp_ops_ud_raw; + break; + default: + return -EINVAL; + } + + return 0; +} + +/* + * Convert user's filter into internal representation + */ +static int +usd_filter_alloc( + struct usd_device *dev, + struct usd_filter *filt, + struct usd_qp_filter *qfilt) +{ + struct sockaddr_in sin; + int ret; + int s; + + switch (filt->uf_type) { + case USD_FTY_UDP_SOCK: + qfilt->qf_type = USD_FTY_UDP_SOCK; + qfilt->qf_filter.qf_udp.u_sockfd = filt->uf_filter.uf_udp_sock.u_sock; + break; + + case USD_FTY_UDP: + qfilt->qf_type = USD_FTY_UDP; + qfilt->qf_filter.qf_udp.u_sockfd = -1; + + s = socket(AF_INET, SOCK_DGRAM, 0); + if (s == -1) + return -errno; + memset(&sin, 0, sizeof(sin)); + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = dev->ud_attrs.uda_ipaddr_be; + sin.sin_port = htons(filt->uf_filter.uf_udp.u_port); + ret = bind(s, (struct sockaddr *)&sin, sizeof(sin)); + if (ret == -1) { + ret = -errno; + close(s); + return ret; + } + + /* save the socket */ + qfilt->qf_filter.qf_udp.u_sockfd = s; + break; + + default: + return -EINVAL; + } + + return 0; +} + +/* + * Fill in local address given filter and return from verbs QP create + */ +static int +usd_get_qp_local_addr( + struct usd_qp_impl *qp) +{ + socklen_t addrlen; + int ret; + + switch (qp->uq_attrs.uqa_transport) { + + case USD_QTR_UDP: + /* find out what address we got */ + addrlen = sizeof(qp->uq_attrs.uqa_local_addr.ul_addr.ul_udp.u_addr); + ret = 
getsockname(qp->uq_filter.qf_filter.qf_udp.u_sockfd, + (struct sockaddr *) &qp->uq_attrs.uqa_local_addr.ul_addr.ul_udp.u_addr, + &addrlen); + if (ret == -1) + return -errno; + break; + + default: + break; + } + return 0; +} + +static void +usd_filter_free( + struct usd_qp_filter *qfilt) +{ + switch (qfilt->qf_type) { + case USD_FTY_UDP: + close(qfilt->qf_filter.qf_udp.u_sockfd); + break; + default: + break; + } +} + +/* + * Destroy a QP + */ +int +usd_destroy_qp( + struct usd_qp *uqp) +{ + struct usd_wq *wq; + struct usd_rq *rq; + struct usd_qp_impl *qp; + + qp = to_qpi(uqp); + + wq = &qp->uq_wq; + rq = &qp->uq_rq; + + if (wq->uwq_state & USD_QS_READY) + usd_disable_qp(uqp); + + if (rq->urq_state & USD_QS_VNIC_ALLOCATED) + vnic_rq_free(&rq->urq_vnic_rq); + + if (wq->uwq_state & USD_QS_VF_MAPPED) + usd_unmap_vf(qp->uq_dev, qp->uq_vf); + + if (wq->uwq_state & USD_QS_VERBS_CREATED) + usd_ib_cmd_destroy_qp(qp->uq_dev, qp); + + if (rq->urq_state & USD_QS_FILTER_ALLOC) + usd_filter_free(&qp->uq_filter); + + if (rq->urq_context != NULL) + free(rq->urq_context); + if (wq->uwq_post_info != NULL) + free(wq->uwq_post_info); + if (wq->uwq_copybuf != NULL) + usd_free_mr(wq->uwq_copybuf); + if (wq->uwq_desc_ring != NULL) + usd_free_mr(wq->uwq_desc_ring); + if (rq->urq_desc_ring != NULL) + usd_free_mr(rq->urq_desc_ring); + + free(qp); + + return 0; +} + +/* + * Create a normal or PIO UD QP + */ +static int +usd_create_qp_ud( + struct usd_qp_impl *qp) +{ + struct usd_device *dev; + unsigned num_wq_entries; + unsigned num_rq_entries; + struct usd_vf_info vf_info; + struct usd_vf *vf; + struct usd_rq *rq; + struct usd_wq *wq; + struct usd_cq_impl *wcq; + struct usd_cq_impl *rcq; + size_t copybuf_size; + int ret; + + dev = qp->uq_dev; + vf = NULL; + + wq = &qp->uq_wq; + rq = &qp->uq_rq; + wcq = wq->uwq_cq; + rcq = rq->urq_cq; + + ret = usd_qp_get_ops(qp); + if (ret != 0) { + goto fail; + } + + num_wq_entries = wq->uwq_num_entries; + num_rq_entries = rq->urq_num_entries; + + rq->urq_context = calloc(sizeof(void *), num_rq_entries); + wq->uwq_post_info = calloc(sizeof(struct usd_wq_post_info), num_wq_entries); + if (rq->urq_context == NULL || wq->uwq_post_info == NULL) { + ret = -ENOMEM; + goto fail; + } + + /* + * Issue verbs command to create the QP. This does not actually + * instanstiate the filter in the VIC yet, need to bring the + * verbs QP up to RTR state for that + */ + memset(&vf_info, 0, sizeof(vf_info)); + ret = usd_ib_cmd_create_qp(dev, qp, &vf_info); + if (ret != 0) { + goto fail; + } + + /* verbs create_qp command has been completed */ + rq->urq_state |= USD_QS_VERBS_CREATED; + wq->uwq_state |= USD_QS_VERBS_CREATED; + + /* + * Create/regmr for wq copybuf after verbs QP is created + * because QP number information may be needed to register + * mr under shared PD + */ + copybuf_size = USD_SEND_MAX_COPY * num_wq_entries; + ret = usd_alloc_mr(dev, copybuf_size, (void **)&wq->uwq_copybuf); + if (ret != 0) + goto fail; + + ret = usd_map_vf(dev, &vf_info, &vf); + if (ret != 0) { + goto fail; + } + + qp->uq_vf = vf; + rq->urq_state |= USD_QS_VF_MAPPED; + wq->uwq_state |= USD_QS_VF_MAPPED; + + /* + * Now that we have a VF, we can finish creating the CQs. 
+ * It's OK if rcq==wcq, finish_create_cq allows for CQ sharing + */ + ret = usd_finish_create_cq(wcq, vf); + if (ret != 0) { + goto fail; + } + ret = usd_finish_create_cq(rcq, vf); + if (ret != 0) { + goto fail; + } + + /* define the WQ and RQ to the VIC */ + ret = usd_create_wq(qp); + if (ret != 0) { + goto fail; + } + ret = usd_create_rq(qp); + if (ret != 0) { + goto fail; + } + + /* Issue commands to driver to enable the QP */ + ret = usd_enable_verbs_qp(qp); + if (ret != 0) { + goto fail; + } + + /* Attach WQ and RQ to CW */ + rcq->ucq_rq_map[rq->urq_index] = rq; + wcq->ucq_wq_map[wq->uwq_index] = wq; + + qp->uq_attrs.uqa_max_send_credits = wq->uwq_num_entries - 1; + qp->uq_attrs.uqa_max_recv_credits = rq->urq_num_entries - 1; + qp->uq_attrs.uqa_max_inline = USD_SEND_MAX_COPY - + qp->uq_attrs.uqa_hdr_len; + + /* build local address */ + ret = usd_get_qp_local_addr(qp); + if (ret != 0) { + goto fail; + } + + return 0; + + fail: + return ret; +} + +/* + * Public interface to create QP + */ +int +usd_create_qp( + struct usd_device *dev, + enum usd_qp_transport transport, + enum usd_qp_type qtype, + struct usd_cq *wucq, + struct usd_cq *rucq, + unsigned num_send_credits, + unsigned num_recv_credits, + struct usd_filter *filt, + struct usd_qp **uqp_o) +{ + struct usd_qp_impl *qp; + unsigned num_rq_entries; + unsigned num_wq_entries; + struct usd_cq_impl *wcq; + struct usd_cq_impl *rcq; + struct usd_rq *rq; + struct usd_wq *wq; + int ret; + + qp = NULL; + + /* Make sure device ready */ + ret = usd_device_ready(dev); + if (ret != 0) { + goto fail; + } + + qp = calloc(sizeof(*qp), 1); + if (qp == NULL) { + ret = -ENOMEM; + goto fail; + } + + qp->uq_dev = dev; + qp->uq_attrs.uqa_transport = transport; + qp->uq_attrs.uqa_qtype = qtype; + + ret = usd_qp_get_ops(qp); + if (ret != 0) { + goto fail; + } + + if (num_recv_credits > dev->ud_attrs.uda_max_recv_credits) { + ret = -EINVAL; + goto fail; + } + /* Add 1 and round num_entries up to POW2 and min to 32 */ + num_rq_entries = 1 << msbit(num_recv_credits); + if (num_rq_entries < 32) num_rq_entries = 32; + + if (num_send_credits > dev->ud_attrs.uda_max_send_credits) { + ret = -EINVAL; + goto fail; + } + num_wq_entries = 1 << msbit(num_send_credits); + if (num_wq_entries < 32) num_wq_entries = 32; + + rcq = to_cqi(rucq); + wcq = to_cqi(wucq); + + rq = &qp->uq_rq; + rq->urq_num_entries = num_rq_entries; + rq->urq_cq = rcq; + + wq = &qp->uq_wq; + wq->uwq_num_entries = num_wq_entries; + wq->uwq_cq = wcq; + + /* do filter setup */ + ret = usd_filter_alloc(dev, filt, &qp->uq_filter); + if (ret != 0) { + goto fail; + } + rq->urq_state |= USD_QS_FILTER_ALLOC; + + /* Fill in some attrs */ + switch (transport) { + case USD_QTR_UDP: + qp->uq_attrs.uqa_hdr_len = sizeof(struct usd_udp_hdr); + break; + case USD_QTR_RAW: + qp->uq_attrs.uqa_hdr_len = 0; + break; + } + + /* + * Now, do the type-specific configuration + */ + switch (qtype) { + case USD_QTY_UD: + case USD_QTY_UD_PIO: + ret = usd_create_qp_ud(qp); + if (ret != 0) { + goto fail; + } + break; + default: + ret = -EINVAL; + goto fail; + break; + } + + *uqp_o = to_usdqp(qp); + return 0; + +fail: + if (qp != NULL) { + usd_destroy_qp(to_usdqp(qp)); + } + return ret; +} + +/* + * Return attributes of a QP + */ +int +usd_get_qp_attrs( + struct usd_qp *uqp, + struct usd_qp_attrs *qattrs) +{ + struct usd_qp_impl *qp; + + qp = to_qpi(uqp); + *qattrs = qp->uq_attrs; + return 0; +} + +int usd_get_completion_fd(struct usd_device *dev, int *comp_fd_o) +{ + if (dev == NULL || comp_fd_o == NULL) + return -EINVAL; + + 
return usd_ib_cmd_create_comp_channel(dev, comp_fd_o); +} + +int usd_put_completion_fd(struct usd_device *dev, int comp_fd) +{ + if (dev == NULL || comp_fd < 0) + return -EINVAL; + + if (close(comp_fd) == -1) + return -errno; + + return 0; +} diff --git a/prov/usnic/src/usnic_direct/usd_socket.c b/prov/usnic/src/usnic_direct/usd_socket.c new file mode 100644 index 00000000000..4066d0eb6e1 --- /dev/null +++ b/prov/usnic/src/usnic_direct/usd_socket.c @@ -0,0 +1,126 @@ +/* + * Copyright (c) 2014, Cisco Systems, Inc. All rights reserved. + * + * LICENSE_BEGIN + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * LICENSE_END + * + * + */ + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "usd.h" +#include "usd_util.h" +#include "usd_socket.h" + +/* + * Get the IP address and other information associated with this + * device's interface. 
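A quick aside on the credit arithmetic in usd_create_qp() above ("Add 1 and round num_entries up to POW2 and min to 32"): 1 << msbit(n) is the smallest power of two strictly greater than n, so the ring always has at least one spare slot and the usable credits (num_entries - 1) never drop below what was requested. A standalone restatement of that arithmetic, not code from the patch:

    #include <assert.h>

    /* mirrors 1 << msbit(n), clamped to 32, as used for num_rq/wq_entries */
    static unsigned ring_size(unsigned credits)
    {
        unsigned n = 1;

        while (n <= credits)      /* smallest power of two strictly > credits */
            n <<= 1;
        return n < 32 ? 32 : n;   /* same minimum-of-32 clamp as usd_create_qp() */
    }

    int main(void)
    {
        assert(ring_size(0)  == 32);   /* clamped to the minimum */
        assert(ring_size(31) == 32);   /* 31 usable credits fit in 32 entries */
        assert(ring_size(32) == 64);   /* 32 credits need 64 entries, one slot spare */
        assert(ring_size(63) == 64);
        assert(ring_size(64) == 128);
        return 0;
    }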
+ */ +int +usd_get_dev_if_info( + struct usd_device *dev) +{ + struct sockaddr_in sin; + struct ifreq ifr; + struct usd_device_attrs *dp; + uint32_t netmask; + int s; + int ret; + + s = socket(AF_INET, SOCK_DGRAM, 0); + if (s == -1) + return -errno; + + dp = &dev->ud_attrs; + + dp->uda_ifindex = if_nametoindex(dp->uda_ifname); + if (dp->uda_ifindex == 0) + goto out; + +#pragma GCC diagnostic push +#if defined(__GNUC__) && (__GNUC__ >= 8) +#pragma GCC diagnostic ignored "-Wstringop-truncation" +#endif + ifr.ifr_addr.sa_family = AF_INET; + strncpy(ifr.ifr_name, dp->uda_ifname, IFNAMSIZ - 1); +#pragma GCC diagnostic pop + + ret = ioctl(s, SIOCGIFADDR, &ifr); + if (ret == 0) { + dp->uda_ipaddr_be = + ((struct sockaddr_in *) &ifr.ifr_addr)->sin_addr.s_addr; + } + + ret = ioctl(s, SIOCGIFNETMASK, &ifr); + if (ret == 0) { + dp->uda_netmask_be = + ((struct sockaddr_in *) &ifr.ifr_netmask)->sin_addr.s_addr; + netmask = ntohl(dp->uda_netmask_be); + dp->uda_prefixlen = 32 - msbit(~netmask); + } + + ret = ioctl(s, SIOCGIFMTU, &ifr); + if (ret == 0) { + dp->uda_mtu = ifr.ifr_mtu; + } + + if (dp->uda_ipaddr_be != 0) { + memset(&sin, 0, sizeof(sin)); + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = dp->uda_ipaddr_be; + sin.sin_port = 0; + ret = bind(s, (struct sockaddr *) &sin, sizeof(sin)); + if (ret == -1) + goto out; + dev->ud_arp_sockfd = s; + } else { + close(s); + } + + return 0; + out: + close(s); + return -errno; +} diff --git a/prov/usnic/src/usnic_direct/usd_socket.h b/prov/usnic/src/usnic_direct/usd_socket.h new file mode 100644 index 00000000000..a8c015f8c0a --- /dev/null +++ b/prov/usnic/src/usnic_direct/usd_socket.h @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2014, Cisco Systems, Inc. All rights reserved. + * + * LICENSE_BEGIN + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
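One detail of usd_get_dev_if_info() above worth spelling out: the prefix length is derived from the netmask as 32 - msbit(~netmask), i.e. 32 minus the number of host bits. A minimal standalone check of that formula (msbit restated locally; names here are illustrative only):

    #include <arpa/inet.h>
    #include <stdint.h>
    #include <stdio.h>

    /* highest set bit position, 1-based; 0 for val == 0 (same contract as msbit()) */
    static int msbit32(uint32_t val)
    {
        int bit = 0;

        while (val) {
            val >>= 1;
            bit++;
        }
        return bit;
    }

    int main(void)
    {
        uint32_t netmask = ntohl(inet_addr("255.255.255.0"));

        /* ~netmask == 0x000000ff, msbit32() == 8, so prefixlen == 24 */
        printf("prefixlen = %d\n", 32 - msbit32(~netmask));
        return 0;
    }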
+ * + * LICENSE_END + * + * + */ + +#ifndef _USD_SOCKET_ +#define _USD_SOCKET_ + +/* + * Forward structure defs + */ +struct usd_device; + +int usd_get_dev_if_info(struct usd_device *dev); +int usd_ip_to_mac(struct usd_device *dev, uint32_t ipaddr, + uint8_t * mac_o); +#endif /* _USD_SOCKET_ */ diff --git a/prov/usnic/src/usnic_direct/usd_time.h b/prov/usnic/src/usnic_direct/usd_time.h new file mode 100644 index 00000000000..c331025dd24 --- /dev/null +++ b/prov/usnic/src/usnic_direct/usd_time.h @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2014, Cisco Systems, Inc. All rights reserved. + * + * LICENSE_BEGIN + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * LICENSE_END + * + * + * definitions about time + */ + +#ifndef _USD_TIME_H_ +#define _USD_TIME_H_ + +#include + +typedef uint64_t usd_time_t; + +static inline void usd_get_time(usd_time_t * timep) +{ + struct timespec now; + + clock_gettime(CLOCK_MONOTONIC, &now); + *timep = now.tv_sec * 1000 + now.tv_nsec / 1000000; +} + +/* + * Returns time delta in ms + */ +static inline int usd_time_diff(usd_time_t time1, usd_time_t time2) +{ + return time2 - time1; +} +#endif /* _USD_TIME_H_ */ diff --git a/prov/usnic/src/usnic_direct/usd_util.h b/prov/usnic/src/usnic_direct/usd_util.h new file mode 100644 index 00000000000..a7736309b34 --- /dev/null +++ b/prov/usnic/src/usnic_direct/usd_util.h @@ -0,0 +1,148 @@ +/* + * Copyright (c) 2014, Cisco Systems, Inc. All rights reserved. + * + * LICENSE_BEGIN + * + * This software is available to you under a choice of one of two + * licenses. 
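The millisecond clock in usd_time.h above is easy to misuse if the units are unclear: usd_get_time() stores CLOCK_MONOTONIC converted to milliseconds, and usd_time_diff() is simply a signed difference of two such timestamps. A small usage sketch under that reading (the helper is restated so the snippet stands alone):

    #include <stdint.h>
    #include <stdio.h>
    #include <time.h>
    #include <unistd.h>

    typedef uint64_t usd_time_t;

    static void get_time_ms(usd_time_t *timep)   /* same math as usd_get_time() */
    {
        struct timespec now;

        clock_gettime(CLOCK_MONOTONIC, &now);
        *timep = (usd_time_t)now.tv_sec * 1000 + now.tv_nsec / 1000000;
    }

    int main(void)
    {
        usd_time_t start, end;

        get_time_ms(&start);
        usleep(50 * 1000);                         /* ~50 ms of "work" */
        get_time_ms(&end);

        printf("elapsed: %d ms\n", (int)(end - start));  /* usd_time_diff(start, end) */
        return 0;
    }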
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * LICENSE_END + * + * + */ + +#ifndef _USD_UTIL_H_ +#define _USD_UTIL_H_ + +#include +#include + +static uint8_t bittab[] = { + 0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, +}; + +static inline int +msbit( + uint32_t val) +{ + int bit; + + bit = 0; + + if (val & (0xffff << 16)) { + val >>= 16; + bit += 16; + } + if (val & (0xff << 8)) { + val >>= 8; + bit += 8; + } + return bittab[val] + bit; +} + +#define usd_offset_of(type, memb) \ + ((unsigned long)(&((type *)0)->memb)) +#define usd_container_of(obj, type, memb) \ + ((type *)(((char *)obj) - usd_offset_of(type, memb))) + +static inline void hex(void *vcp, int len) +{ + uint8_t *cp = vcp; + int i; + for (i = 0; i < len; ++i) { + printf("%02x%c", *cp++, ((i & 15) == 15) ? 10 : 32); + } + if (i & 15) + puts(""); +} + +/* + * 48-bit type. Byte aligned. + */ +typedef struct { + unsigned char net_data[6]; +} net48_t; + +/** + * net48_get(net) - fetch from a network-order 48-bit field. + * + * @param net pointer to type net48_t, network-order 48-bit data. + * @return the host-order value. 
+ */ +static inline u_int64_t net48_get(const net48_t * net) +{ + return ((u_int64_t) net->net_data[0] << 40) | + ((u_int64_t) net->net_data[1] << 32) | + ((u_int64_t) net->net_data[2] << 24) | + ((u_int64_t) net->net_data[3] << 16) | + ((u_int64_t) net->net_data[4] << 8) | + (u_int64_t) net->net_data[5]; +} + +/** + * net48_put(net, val) - store to a network-order 48-bit field. + * + * @param net pointer to a net48_t, network-order 48-bit data. + * @param val host-order value to be stored at net. + */ +static inline void net48_put(net48_t * net, u_int64_t val) +{ + net->net_data[0] = (u_int8_t)((val >> 40) & 0xFF); + net->net_data[1] = (u_int8_t)((val >> 32) & 0xFF); + net->net_data[2] = (u_int8_t)((val >> 24) & 0xFF); + net->net_data[3] = (u_int8_t)((val >> 16) & 0xFF); + net->net_data[4] = (u_int8_t)((val >> 8) & 0xFF); + net->net_data[5] = (u_int8_t)(val & 0xFF); +} + +static inline void usd_perror(const char *s) +{ + if (USD_DEBUG) { + perror(s); + } +} +#endif /* _USD_UTIL_H_ */ diff --git a/prov/usnic/src/usnic_direct/usd_vnic.c b/prov/usnic/src/usnic_direct/usd_vnic.c new file mode 100644 index 00000000000..60e25c63291 --- /dev/null +++ b/prov/usnic/src/usnic_direct/usd_vnic.c @@ -0,0 +1,213 @@ +/* + * Copyright (c) 2014, Cisco Systems, Inc. All rights reserved. + * + * LICENSE_BEGIN + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
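usd_offset_of()/usd_container_of() in usd_util.h above are the standard container_of idiom: given a pointer to an embedded member, recover the enclosing structure. This is presumably how conversions such as to_cqi()/to_qpi() used earlier in this patch turn a caller-visible handle back into the provider's private implementation struct. A self-contained illustration of the pattern (the struct names below are made up):

    #include <assert.h>
    #include <stdio.h>

    #define offset_of(type, memb)  ((unsigned long)(&((type *)0)->memb))
    #define container_of(obj, type, memb) \
        ((type *)(((char *)(obj)) - offset_of(type, memb)))

    struct public_handle { int dummy; };
    struct private_impl {
        struct public_handle pub;   /* what callers see */
        int internal_state;
    };

    int main(void)
    {
        struct private_impl impl = { .pub = { 0 }, .internal_state = 42 };
        struct public_handle *handle = &impl.pub;

        /* walk back from the embedded member to the containing struct */
        struct private_impl *back = container_of(handle, struct private_impl, pub);
        assert(back == &impl && back->internal_state == 42);
        printf("recovered internal_state = %d\n", back->internal_state);
        return 0;
    }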
+ * + * LICENSE_END + * + * + */ + +#include +#include + +#include "kcompat.h" +#include "vnic_dev.h" +#include "vnic_enet.h" + +#include "usd.h" +#include "usd_util.h" +#include "usd_vnic.h" +#include "usd_ib_cmd.h" + +#define GET_CONFIG(m) \ + do { \ + ret = usd_dev_spec(qp->uq_dev, \ + usd_offset_of(struct vnic_enet_config, m), \ + sizeof(c->m), &c->m); \ + if (ret) { \ + printf("Error %d getting " #m "\n", ret); \ + } else { \ + printf(#m " = 0x%lx\n", (u64)c->m); \ + } \ + } while (0) + + +int usd_vnic_dev_cmd(struct usd_device *dev, enum vnic_devcmd_cmd cmd, + u64 *a0, u64 *a1, int wait) +{ + return usd_ib_cmd_devcmd(dev, cmd, a0, a1, wait); +} + +#if 0 +/* + * Dump the devspec (for debugging) + */ +int +usd_dump_devspec( + struct usd_qp_impl *qp) +{ + struct vnic_enet_config config; + struct vnic_enet_config *c; + int ret; + + c = &config; + memset(&config, 0, sizeof(config)); + + GET_CONFIG(flags); + GET_CONFIG(wq_desc_count); + GET_CONFIG(rq_desc_count); + GET_CONFIG(mtu); + GET_CONFIG(intr_timer_deprecated); + GET_CONFIG(intr_timer_type); + GET_CONFIG(intr_mode); + GET_CONFIG(intr_timer_usec); + GET_CONFIG(loop_tag); + GET_CONFIG(vf_rq_count); + GET_CONFIG(num_arfs); + GET_CONFIG(mem_paddr); + + ret = vnic_dev_spec(qp->uq_vf->vf_vdev, + usd_offset_of(struct vnic_enet_config, devname), + 8, &c->devname[0]); + ret |= vnic_dev_spec(qp->uq_vf->vf_vdev, + usd_offset_of(struct vnic_enet_config, devname) + 8, + 8, &c->devname[8]); + printf("devname = \"%s\", ret = %d\n", c->devname, ret); + + return 0; +} +#endif + +/* + * Get some QP settings from devspec + */ +/* +int +usd_get_devspec( + struct usd_qp_impl *qp) +{ + struct vnic_enet_config config; + unsigned int offset; + int ret; + + offset = usd_offset_of(struct vnic_enet_config, mem_paddr); + ret = vnic_dev_spec(qp->uq_vf->vf_vdev, offset, + sizeof(config.mem_paddr), &config.mem_paddr); + if (ret != 0) { + return ret; + } + + qp->uq_attrs.uqa_pio_paddr = config.mem_paddr; + + return 0; +} +*/ + +/* + * general dev_spec function to replace vnic_dev_spec + */ +int usd_dev_spec(struct usd_device *dev, unsigned int offset, + size_t size, void *value) +{ + u64 a0, a1; + int wait = 1000; + int err; + + a0 = offset; + a1 = size; + + err = usd_vnic_dev_cmd(dev, CMD_DEV_SPEC, &a0, &a1, wait); + + switch (size) { + case 1: + *(u8 *)value = (u8)a0; + break; + case 2: + *(u16 *)value = (u16)a0; + break; + case 4: + *(u32 *)value = (u32)a0; + break; + case 8: + *(u64 *)value = a0; + break; + default: + return -EINVAL; + break; + } + + return err; +} + +int usd_get_piopa(struct usd_qp_impl *qp) +{ + struct vnic_enet_config config; + unsigned int offset; + int ret; + + offset = usd_offset_of(struct vnic_enet_config, mem_paddr); + ret = usd_dev_spec(qp->uq_dev, offset, + sizeof(config.mem_paddr), &config.mem_paddr); + if (ret != 0) { + return ret; + } + + qp->uq_attrs.uqa_pio_paddr = config.mem_paddr; + + return 0; +} + +/* + * Issue HANG_NOTIFY to the VNIC + */ +int +usd_vnic_hang_notify( + struct usd_qp *uqp) +{ + struct usd_qp_impl *qp; + u64 a0 = 0; + int ret; + + qp = to_qpi(uqp); + ret = usd_vnic_dev_cmd(qp->uq_dev, CMD_HANG_NOTIFY, + &a0, &a0, 1000); + if (ret != 0) { + fprintf(stderr, "hang_notify ret = %d\n", ret); + return ret; + } + + return 0; +} diff --git a/prov/usnic/src/usnic_direct/usd_vnic.h b/prov/usnic/src/usnic_direct/usd_vnic.h new file mode 100644 index 00000000000..80051a8a41f --- /dev/null +++ b/prov/usnic/src/usnic_direct/usd_vnic.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2014, Cisco Systems, Inc. All rights reserved. 
+ * + * LICENSE_BEGIN + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * LICENSE_END + * + * + */ + +#ifndef _USD_VNIC_H_ +#define _USD_VNIC_H_ + +int usd_vnic_dev_cmd(struct usd_device *dev, enum vnic_devcmd_cmd cmd, + u64 *a0, u64 *a1, int wait); +int usd_dev_spec(struct usd_device *dev, unsigned int offset, + size_t size, void *value); +int usd_get_piopa(struct usd_qp_impl *qp); +int usd_vnic_hang_notify(struct usd_qp *uqp); + +#endif /* _USD_VNIC_H_ */ diff --git a/prov/usnic/src/usnic_direct/usnic_abi.h b/prov/usnic/src/usnic_direct/usnic_abi.h new file mode 100644 index 00000000000..1554a0d446a --- /dev/null +++ b/prov/usnic/src/usnic_direct/usnic_abi.h @@ -0,0 +1,361 @@ +/* + * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved. + * + * LICENSE_BEGIN + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * LICENSE_END + * + * + */ + + +#ifndef USNIC_ABI_H +#define USNIC_ABI_H + +/* ABI between userspace and kernel */ +#define USNIC_UVERBS_ABI_VERSION 4 + +#define USNIC_QP_GRP_MAX_WQS 8 +#define USNIC_QP_GRP_MAX_RQS 8 +#define USNIC_QP_GRP_MAX_CQS 16 + +#define USNIC_DECODE_PGOFF_VFID(pgoff) ((pgoff) & ((1ULL << 32) - 1)) +#define USNIC_DECODE_PGOFF_TYPE(pgoff) ((pgoff) >> 48) +#define USNIC_DECODE_PGOFF_RES(pgoff) (((pgoff) >> 32) & ((1ULL << 16) - 1)) +#define USNIC_DECODE_PGOFF_BAR(pgoff) (((pgoff) >> 32) & ((1ULL << 16) - 1)) + +#define USNIC_ENCODE_PGOFF(vfid, map_type, res_type_bar_id) \ + (((((uint64_t)map_type & 0xffff) << 48) | \ + (((uint64_t)res_type_bar_id & 0xffff) << 32) | \ + ((uint64_t)vfid & ((1ULL << 32) - 1))) * sysconf(_SC_PAGE_SIZE)) + +/* + * The kernel module eventually issues proxy the devcmd through enic and the + * maximum number of devcmd arguments supported for a vnic is VNIC_DEVCMD_NARGS + * (= 15). Among them, 2 arguments are consumed by proxy command for + * proxy_index and proxy devcmd. Hence, only a maximum of 13 arguments are + * supported on input in practice, even though the ABI between user space and + * the kernel has space for 15. + * + * Keep _NARGS as 15 for backwards compatibility (newer user space with older + * kernel), otherwise usnic_ucmd_devcmd() on the older kernel will fail with + * -EINVAL. + */ +#define USNIC_UDEVCMD_NARGS 15 +#define USNIC_UDEVCMD_MAX_IN_ARGS (USNIC_UDEVCMD_NARGS - 2) + +enum usnic_mmap_type { + USNIC_MMAP_BAR = 0, + USNIC_MMAP_RES = 1, + USNIC_MMAP_BARHEAD = 2, + USNIC_MMAP_GRPVECT = 3, +}; + +enum usnic_transport_type { + USNIC_TRANSPORT_UNKNOWN = 0, + USNIC_TRANSPORT_ROCE_CUSTOM = 1, + USNIC_TRANSPORT_IPV4_UDP = 2, + USNIC_TRANSPORT_IPV4_TCP_3T = 3, + USNIC_TRANSPORT_ROCEV2 = 4, + USNIC_TRANSPORT_MAX = 5, +}; + +#define ROCEV2_PORT 4791 + +enum usnic_ucmd_type { + USNIC_USER_CMD_DEVCMD, + USNIC_USER_CMD_MAX, +}; + +struct usnic_user_cmd { + u32 ucmd; + u32 pad_to_8byte; + u64 inbuf; + u64 outbuf; + u32 inlen; + u32 outlen; +}; + +struct usnic_udevcmd_cmd { + u32 vnic_idx; + u32 devcmd; + u32 wait; + u32 num_args; + u64 args[USNIC_UDEVCMD_NARGS]; +}; + +struct usnic_udevcmd_resp { + u32 num_args; + u64 args[USNIC_UDEVCMD_NARGS]; +}; + +/* + * This is the version of the transport_spec structure that is used + * in CREATE_QP versions 0..2 + */ +struct usnic_transport_spec_v2 { + enum usnic_transport_type trans_type; + union { + struct { + uint16_t port_num; + } usnic_roce; + struct { + uint32_t sock_fd; + } ip; + }; +}; + +/* + * This is the version of the transport_spec structure that is used + * in CREATE_QP versions 3.. 
+ */ +struct usnic_transport_spec { + enum usnic_transport_type trans_type; + union { + struct { + uint16_t port_num; + } usnic_roce; + struct { + uint32_t sock_fd; + } ip; + struct { + uint32_t qpn; + uint32_t ipaddr_be; + } rocev2; + u_int8_t pad[256]; + }; +}; + +#define USNIC_IB_ALLOC_PD_VERSION 1 + +struct usnic_ib_alloc_pd_cmd { + u32 resp_version; /* response version requested */ + u32 pad_to_8byte; +}; + +struct usnic_ib_alloc_pd_resp { + u32 resp_version; + u32 pad_to_8byte; + union { + struct { + u32 vfid; + u32 grp_vect_buf_len; + } cur; /* v1 */ + }; +}; + +#define USNIC_IB_CREATE_QP_VERSION 3 + +struct usnic_ib_create_qp_cmd_v0 { + struct usnic_transport_spec_v2 spec_v2; +}; + +struct usnic_ib_create_qp_cmd_v2 { + struct usnic_transport_spec_v2 spec_v2; + u32 cmd_version; + union { + struct { + /* length in bytes of resources array */ + u32 resources_len; + + /* ptr to array of struct usnic_vnic_barres_info */ + u64 resources; + } cur; /* v1 and v2 cmd */ + } u; +}; + +struct usnic_ib_create_qp_cmd { + /* + * This is the old transport spec struct that must stay as the + * first member of this struct for backwards compatibility/ABI + * reasons.. It is "v2" because it is used with CREATE_QP + * versions 0, 1, and 2. + */ + struct usnic_transport_spec_v2 spec_v2; + u32 cmd_version; + union { + struct { + /* length in bytes of resources array */ + u32 resources_len; + + /* ptr to array of struct usnic_vnic_barres_info */ + u64 resources; + } cur; /* v1 and v2 cmd */ + } u; + /* + * This is the current version of the transport spec struct. + */ + struct usnic_transport_spec spec; +}; + + +/* + * infomation of vnic bar resource + */ +struct usnic_vnic_barres_info { + int32_t type; + uint32_t padding; + uint64_t bus_addr; + uint64_t len; +}; + +/* + * All create_qp responses must start with this for backwards compatability + */ +#define USNIC_IB_CREATE_QP_RESP_V0_FIELDS \ + u32 vfid; \ + u32 qp_grp_id; \ + u64 bar_bus_addr; \ + u32 bar_len; \ + u32 wq_cnt; \ + u32 rq_cnt; \ + u32 cq_cnt; \ + u32 wq_idx[USNIC_QP_GRP_MAX_WQS]; \ + u32 rq_idx[USNIC_QP_GRP_MAX_RQS]; \ + u32 cq_idx[USNIC_QP_GRP_MAX_CQS]; \ + u32 transport; + +struct usnic_ib_create_qp_resp_v0 { + USNIC_IB_CREATE_QP_RESP_V0_FIELDS + u32 reserved[9]; +}; + +struct usnic_ib_create_qp_resp { + USNIC_IB_CREATE_QP_RESP_V0_FIELDS + /* the above fields end on 4-byte alignment boundary */ + u32 cmd_version; + union { + struct { + u32 num_barres; + u32 pad_to_8byte; + } v1; + struct { + u32 num_barres; + u32 wq_err_intr_offset; + u32 rq_err_intr_offset; + u32 wcq_intr_offset; + u32 rcq_intr_offset; + u32 barhead_len; + } cur; /* v2 */ + } u; + + /* v0 had a "reserved[9]" field, must not shrink the response or we can + * corrupt newer clients running on older kernels */ + u32 reserved[2]; +}; + +#define USNIC_CTX_RESP_VERSION 2 + +/* + * Make this structure packed in order to make sure v1.num_caps not aligned + * at 8 byte boundary, hence still being able to support user libary + * requesting version 1 response. 
+ */ +struct __attribute__((__packed__)) usnic_ib_get_context_cmd { + u32 resp_version; /* response version requested */ + union { + struct { + u32 num_caps; /* number of capabilities requested */ + } v1; + struct { + u32 encap_subcmd; /* whether encapsulate subcmd */ + union { + u32 num_caps; + struct usnic_user_cmd usnic_ucmd; + }; + } v2; + }; +}; + +/* + * Note that this enum must never have members removed or re-ordered in order + * to retain backwards compatability + */ +enum usnic_capability { + USNIC_CAP_CQ_SHARING, /* CQ sharing version */ + USNIC_CAP_MAP_PER_RES, /* Map individual RES */ + USNIC_CAP_PIO, /* PIO send */ + USNIC_CAP_CQ_INTR, /* CQ interrupts (via comp channels) */ + USNIC_CAP_GRP_INTR, /* Group interrupt */ + USNIC_CAP_DPKT, /* Direct Packet Interface */ + USNIC_CAP_CNT +}; + +/* + * If and when there become multiple versions of this struct, it will + * become a union for cross-version compatability to make sure there is always + * space for older and larger versions of the contents. + */ +struct usnic_ib_get_context_resp { + u32 resp_version; /* response version returned */ + u32 num_caps; /* number of capabilities returned */ + u32 cap_info[USNIC_CAP_CNT]; +}; + +#define USNIC_IB_CREATE_CQ_VERSION 2 + +struct usnic_ib_create_cq_v0 { + u64 reserved; +}; + +#define USNIC_CQ_COMP_SIGNAL_VERBS 0x1 /* whether to signal cq + * completion event via verbs + */ + +struct usnic_ib_create_cq { + u32 resp_version; /* response version requested */ + union { + struct { + u32 intr_arm_mode; + } v1; + struct { + u32 flags; + __s64 comp_event_fd; /* wait fd for cq completion */ + u64 affinity_mask_ptr; /* process affinity mask ptr*/ + u64 affinity_mask_len; + } cur; /* v2 */ + }; +}; + +struct usnic_ib_create_cq_resp_v0 { + u64 reserved; +}; + +struct usnic_ib_create_cq_resp { + u32 resp_version; /* response version returned */ + u32 pad_to_8byte; +}; + +#endif /* USNIC_ABI_H */ diff --git a/prov/usnic/src/usnic_direct/usnic_direct.h b/prov/usnic/src/usnic_direct/usnic_direct.h new file mode 100644 index 00000000000..a224e1cc0f0 --- /dev/null +++ b/prov/usnic/src/usnic_direct/usnic_direct.h @@ -0,0 +1,719 @@ +/* + * Copyright (c) 2014, Cisco Systems, Inc. All rights reserved. + * + * LICENSE_BEGIN + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
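The USNIC_ENCODE_PGOFF/USNIC_DECODE_PGOFF_* macros in usnic_abi.h above pack a VF id, a map type and a resource/BAR id into an mmap(2) offset; the multiply by the page size on the encode side is what the kernel's pgoff (offset >> PAGE_SHIFT) undoes before the decode macros are applied. A self-contained round-trip with the same bit layout (macros restated locally so the sketch compiles on its own):

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <unistd.h>

    /* local restatements of the ABI macros above */
    #define DECODE_PGOFF_VFID(pgoff) ((pgoff) & ((1ULL << 32) - 1))
    #define DECODE_PGOFF_TYPE(pgoff) ((pgoff) >> 48)
    #define DECODE_PGOFF_RES(pgoff)  (((pgoff) >> 32) & ((1ULL << 16) - 1))
    #define ENCODE_PGOFF(vfid, map_type, res_id) \
        (((((uint64_t)(map_type) & 0xffff) << 48) | \
          (((uint64_t)(res_id) & 0xffff) << 32) | \
          ((uint64_t)(vfid) & ((1ULL << 32) - 1))) * sysconf(_SC_PAGE_SIZE))

    int main(void)
    {
        uint64_t offset = ENCODE_PGOFF(3, 1 /* USNIC_MMAP_RES */, 5);
        uint64_t pgoff = offset / sysconf(_SC_PAGE_SIZE);   /* what the kernel sees */

        assert(DECODE_PGOFF_VFID(pgoff) == 3);
        assert(DECODE_PGOFF_TYPE(pgoff) == 1);
        assert(DECODE_PGOFF_RES(pgoff) == 5);
        printf("mmap offset 0x%llx decodes to vfid 3, type 1, res 5\n",
               (unsigned long long)offset);
        return 0;
    }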
IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * LICENSE_END + * + * + */ + +#ifndef _USNIC_DIRECT_H_ +#define _USNIC_DIRECT_H_ + +#include +#include +#include +#include +#include +#include + +#define USD_MAX_DEVICES 8 +#define USD_MAX_DEVNAME 16 +#define USD_RECV_MAX_SGE 8 +#define USD_SEND_MAX_SGE 8 + +enum usd_link_state { + USD_LINK_DOWN, + USD_LINK_UP +}; + +/* forward structure defs */ +struct usd_context; +struct usd_qp; +struct usd_device; +struct usd_dest; +struct usd_connection; +struct usd_mr; + +struct usd_device_attrs { + char uda_devname[USD_MAX_DEVNAME]; + char uda_ifname[IFNAMSIZ]; + int uda_ifindex; + uint8_t uda_mac_addr[ETH_ALEN]; + + /* IP config */ + uint32_t uda_ipaddr_be; + uint32_t uda_netmask_be; + uint32_t uda_prefixlen; /* netmask length */ + uint32_t uda_mtu; + enum usd_link_state uda_link_state; + + /* HW info */ + uint32_t uda_vendor_id; + uint32_t uda_vendor_part_id; + uint32_t uda_device_id; + char uda_firmware[64]; + + /* usnic config */ + unsigned uda_num_vf; + unsigned uda_cq_per_vf; + unsigned uda_qp_per_vf; + unsigned uda_intr_per_vf; + unsigned uda_num_comp_vectors; + unsigned uda_max_cq; + unsigned uda_max_qp; + + /* VIC constants */ + uint32_t uda_bandwidth; + unsigned uda_max_cqe; + unsigned uda_max_send_credits; + unsigned uda_max_recv_credits; + + /* fd that can be used to poll for device events */ + int uda_event_fd; +}; + +enum usd_completion_status { + USD_COMPSTAT_SUCCESS, + USD_COMPSTAT_ERROR_CRC, + USD_COMPSTAT_ERROR_TRUNC, + USD_COMPSTAT_ERROR_TIMEOUT, + USD_COMPSTAT_ERROR_INTERNAL +}; +enum usd_completion_type { + USD_COMPTYPE_SEND=0, + USD_COMPTYPE_RECV=7, +}; + +struct usd_completion { + enum usd_completion_status uc_status; + enum usd_completion_type uc_type; + uint32_t uc_bytes; + uint16_t uc_rkey; + struct usd_qp *uc_qp; + void *uc_context; + u_int16_t uc_retrans; +}; + +struct usd_recv_desc { + void *urd_context; + struct iovec urd_iov[USD_RECV_MAX_SGE]; + size_t urd_iov_cnt; + struct usd_recv_desc *urd_next; +}; + +/* + * Operations that may vary based on transport/QP type + */ +struct usd_qp_ops { + int (*qo_post_send_one)(struct usd_qp *qp, + struct usd_dest *dest, const void *buf, size_t len, + uint32_t flags, void *context); + int (*qo_post_send_one_prefixed)(struct usd_qp *qp, + struct usd_dest *dest, const void *buf, size_t len, + uint32_t flags, void *context); + int (*qo_post_send_one_copy)(struct usd_qp *qp, + struct usd_dest *dest, const void *buf, size_t len, + uint32_t flags, void *context); + int (*qo_post_send_two_copy)(struct usd_qp *qp, + struct usd_dest *dest, const void *hdr, size_t hdrlen, + const void *pkt, size_t pktlen, uint32_t flags, void *context); + int (*qo_post_send_iov)(struct usd_qp *qp, + struct usd_dest *dest, const struct iovec* iov, + size_t iov_count, uint32_t flags, void *context); + int (*qo_post_send_one_vlan)(struct usd_qp *qp, + struct usd_dest *dest, const void *buf, size_t len, + u_int16_t vlan, uint32_t flags, void *context); +}; + +/* + * user's view of a CQ + */ +struct usd_cq { + unsigned 
ucq_num_entries; +}; + +/* + * User's view of a QP + */ +struct usd_qp { + struct usd_qp_ops uq_ops; + void *uq_context; /* place for user to scribble */ +}; + +/* + * Filters for QPs + */ +enum usd_filter_type { + USD_FTY_UDP, + USD_FTY_UDP_SOCK, + USD_FTY_TCP, + USD_FTY_MCAST, + USD_FTY_8915 +}; +struct usd_filter { + enum usd_filter_type uf_type; + union { + struct { + uint16_t u_port; + } uf_udp; + struct { + int u_sock; + } uf_udp_sock; + struct { + int t_sock; + struct sockaddr_in t_remote; + } uf_tcp; + struct { + struct sockaddr_in m_addr; + } uf_mcast; + } uf_filter; +}; + +/* + * Local address - much like a filter + * Type is defined by transport specified in create_qp + */ +struct usd_local_addr { + union { + struct { + struct sockaddr_in u_addr; + } ul_udp; + struct { + uint32_t qp_num; + } ul_8915; + } ul_addr; +}; + +enum usd_qp_transport { + USD_QTR_RAW, /* no header added */ + USD_QTR_UDP /* create UDP header based on dest */ +}; + +enum usd_qp_type { + USD_QTY_UD, + USD_QTY_UD_PIO, +}; + +/* + * Attributes of a queue pair + */ +struct usd_qp_attrs { + enum usd_qp_transport uqa_transport; + enum usd_qp_type uqa_qtype; + struct usd_local_addr uqa_local_addr; + + unsigned uqa_max_send_credits; + unsigned uqa_max_recv_credits; + uint64_t uqa_pio_paddr; + + unsigned uqa_max_inline; + unsigned uqa_hdr_len; /* length of header for this QP */ +}; + +/* + * Description of a device event which has occurred + */ +enum usd_device_event_type { + USD_EVENT_LINK_UP, + USD_EVENT_LINK_DOWN +}; +struct usd_device_event { + union { + void *ude_context; + } ude_context; + enum usd_device_event_type ude_type; +}; + +/* + * Returned form usd_get_available_devices() - array of currently + * available usd device names + */ +struct usd_device_entry { + char ude_devname[USD_MAX_DEVNAME]; +}; + +/* + * Send flags + */ +enum usd_send_flag_shift { + USD_SFS_SIGNAL, +}; +#define USD_SF_SIGNAL (1 << USD_SFS_SIGNAL) + + /* + * cq creation parameters + */ +struct usd_cq_init_attr { + unsigned num_entries; /* number of requested cq elements */ + int comp_fd; /* completion fd */ + int comp_vec; /* requested completion vector */ + int comp_req_notify; /* whether need to request notify for each completion */ + void *ibv_cq; /* verbs userspace cq object if signaling through uverbs */ +}; + +/* + * Headers for defined transport types + */ +struct usd_udp_hdr { + struct ether_header uh_eth; + struct iphdr uh_ip; + struct udphdr uh_udp; +} __attribute__ ((__packed__)); + +/* + * Struct and defines for usd open parameters + */ +#define UOPF_SKIP_LINK_CHECK 0x1 +#define UOPF_SKIP_PD_ALLOC 0x2 + +struct usd_open_params { + int flags; + int cmd_fd; + struct usd_context *context; +}; + +/* + **************************************************************** + * Device management + **************************************************************** + */ +int usd_get_device_list(struct usd_device_entry *dev_array, + int *num_devs); + +int usd_open(const char *devname, struct usd_device **dev_o); + +int usd_open_with_params(const char *dev_name, + struct usd_open_params *uop_param, + struct usd_device **dev_o); + +int usd_close(struct usd_device *dev); + +int usd_get_device_attrs(struct usd_device *dev, + struct usd_device_attrs *attr); + +int usd_get_device_event(struct usd_device *dev, + struct usd_device_event *event); + +enum usd_capability { + USD_CAP_CQ_SHARING, + USD_CAP_MAP_PER_RES, + USD_CAP_PIO, + USD_CAP_CQ_INTR, + USD_CAP_GRP_INTR, + USD_CAP_MAX +}; +int usd_get_cap(struct usd_device *dev, enum usd_capability 
cap); + +/* + **************************************************************** + * Queue management + **************************************************************** + */ + +/* + * Get a file descriptor which can be used to poll for completions. The + * returned file descriptor will be different on each call to + * usd_get_completion_fd, so that coordination is not needed when using these + * fds in syscalls like poll(2). + */ +int usd_get_completion_fd(struct usd_device *dev, int *comp_fd_o); + +int usd_put_completion_fd(struct usd_device *dev, int comp_fd); + +/* + * Request a CQ with specified attributes: + * dev - device on which to create this CQ + * init_attr - CQ creation parameters + */ +int usd_create_cq(struct usd_device *dev, struct usd_cq_init_attr *init_attr, + struct usd_cq **cq_o); + +int usd_destroy_cq(struct usd_cq *cq); + +int usd_cq_intr_enable(struct usd_cq *cq); +int usd_cq_intr_disable(struct usd_cq *cq); + +/* + * Get and set interrupt coalescing delay, units are in microseconds + */ +int usd_cq_set_intr_coal(struct usd_cq *cq, unsigned intr_coal_delay); +unsigned usd_cq_get_intr_coal(struct usd_cq *cq); + +/* + * IN: + * dev - device on which QP is to be created + * transport - what transport to use on this queue + * type - type of queue to create + * wcq - CQ handle for send completions + * rcq - CQ handle for receive completions + * send_credits - Number of send credits requested + * recv_credite - Number of receive buffer credits requested + * port - Requested local port for QP (0 lets library choose) + * qp_o - Address to receive QP handle on successful completion + * OUT: + * Returns 0 or code from errno.h + * 0 - successful completion + * EBUSY - port is in use + * XXX + */ +int usd_create_qp(struct usd_device *dev, + enum usd_qp_transport transport, + enum usd_qp_type qtype, + struct usd_cq *wcq, struct usd_cq *rcq, + unsigned send_credits, unsigned recv_credits, + struct usd_filter *filt, struct usd_qp **qp_o); + +int usd_destroy_qp(struct usd_qp *qp); + +int usd_enable_qp(struct usd_qp *qp); +int usd_disable_qp(struct usd_qp *qp); + +int usd_get_qp_attrs(struct usd_qp *qp, + struct usd_qp_attrs *qp_attrs_o); + +/* + * Add a filter to a QP + */ +int usd_qp_add_filter(struct usd_qp *qp, struct usd_filter *filter); + +/* + * Get current send credits + */ +unsigned usd_get_send_credits(struct usd_qp *uqp); + +/* + * Get current recv credits + */ +unsigned usd_get_recv_credits(struct usd_qp *uqp); + +/* + **************************************************************** + * Memory management + **************************************************************** + */ + +int usd_reg_mr(struct usd_device *dev, + void *buffer, size_t size, struct usd_mr **mr_o); +int usd_dereg_mr(struct usd_mr *mr); + +int usd_alloc_mr(struct usd_device *dev, size_t size, void **vaddr_o); +int usd_free_mr(void *vaddr); + +/* + **************************************************************** + * Destination management + **************************************************************** + */ + +/* + * Return the distance metric to a specified IP address + * Metric is: + * 0 - same VLAN + * 1..MAXINT - relative distance metric + * -1 - unreachable + */ +int usd_get_dest_distance(struct usd_device *dev, uint32_t daddr_be, + int *metric_o); + +/* + * Settings for address resolution timeout and retry + */ +struct usd_dest_params { + unsigned dp_arp_timeout; /* per-try timeout in ms */ + unsigned dp_max_arps; +}; + +/* + * Get address resolution settings + */ +int usd_get_dest_params(struct 
usd_dest_params *params); + +/* + * Set address resolution settings + * Settings may not be changed while any resolution requests are in progress. + */ +int usd_set_dest_params(struct usd_dest_params *params); + +/* + * Used to create a destination with MAC address is already known. + */ +int usd_create_dest_with_mac(struct usd_device *dev, uint32_t daddr_be, + uint16_t port_be, uint8_t *dmac, struct usd_dest **dest_o); + +/* + * Synchronously creates a destination + */ +int usd_create_dest(struct usd_device *dev, uint32_t daddr_be, + uint16_t port_be, struct usd_dest **dest_o); + +/* + * Start the necessary ARP resolution to create a destination + * Resolution progress is performed in usd_create_dest_query() and + * usd_create_dest_poll() + */ +int usd_create_dest_start(struct usd_device *dev, uint32_t daddr_be, + uint16_t dport_be, void *context); + +/* + * Cancel resolution on a not-yet-completed create_dest request + */ +int usd_create_dest_cancel(struct usd_device *dev, void *context); + +/* + * Extract dest port and IP from a destination + */ +int usd_expand_dest(struct usd_dest *dest, uint32_t *dest_ip_be_o, + uint16_t *dest_port_be_o); + +/* + * Query completion status of a given create_dest request + * If complete, newly allocated destination is returned in dest_o + * Returns: + * 0 - request completed, *status is valid + * dest_o valid if *status == 0 + * -EAGAIN - nothing is complete + * other - negative errno code + */ +int usd_create_dest_query(struct usd_device *dev, void *context, int *status, + struct usd_dest **dest_o); + +/* + * Checks for completed destination creation. + * context specified in call to usd_create_dest_start is returned, + * newly allocated destination is returned in dest_o + * Returns: + * 0 - request completed, status and context_o valid + * dest_o valid if *status == 0 + * -EAGAIN - nothing is complete + * other - negative errno code + */ +int usd_create_dest_poll(struct usd_device *dev, void **context_o, int *status, + struct usd_dest **dest_o); + + +int usd_destroy_dest(struct usd_dest *dest); + +/* + **************************************************************** + * Sending, receiving, and completions + **************************************************************** + */ + +/* + * Post a receive. 
The number of receive credits consumed is equal + * to the number of entries in the SG list of the recv_desc, or + * recv_desc.urd_iov_cnt + */ +int usd_post_recv(struct usd_qp *qp, + struct usd_recv_desc *recv_list); + +int usd_poll_cq_multi(struct usd_cq *cq, int max_comps, + struct usd_completion *comps); +int usd_poll_cq(struct usd_cq *cq, struct usd_completion *comp); +int usd_poll_req_notify(struct usd_cq *ucq); + +unsigned usd_get_send_credits(struct usd_qp *qp); + +unsigned usd_get_recv_credits(struct usd_qp *qp); + +/* + * post a single-buffer send from registered memory + * IN: + * qp + * dest + * buf - + * Requires 2 send credits + */ +static inline int +usd_post_send_one( + struct usd_qp *qp, + struct usd_dest *dest, + const void *buf, + size_t len, + uint32_t flags, + void *context) +{ + return qp->uq_ops.qo_post_send_one( + qp, dest, buf, len, flags, context); +} + +/* + * post a single-buffer send from registered memory to specified VLAN + * IN: + * qp + * dest + * buf - + * Requires 2 send credits + */ +static inline int +usd_post_send_one_vlan( + struct usd_qp *qp, + struct usd_dest *dest, + const void *buf, + size_t len, + u_int16_t vlan, + uint32_t flags, + void *context) +{ + return qp->uq_ops.qo_post_send_one_vlan( + qp, dest, buf, len, vlan, flags, context); +} + +/* + * post a single-buffer send from registered memory + * Caller must allow sufficient space *before* the packet for usd header + * For optimal efficieny, the buffer should be aligned on XXX boundary + * IN: + * qp + * dest + * buf - + * Requires 1 send credit + */ +static inline int +usd_post_send_one_prefixed( + struct usd_qp *qp, + struct usd_dest *dest, + const void *buf, + size_t len, + uint32_t flags, + void *context) +{ + return qp->uq_ops.qo_post_send_one_prefixed( + qp, dest, buf, len, flags, context); +} + +/* + * post a single-buffer send from anywhere + * Data is copied into registered memory by the lib for sending + * IN: + * qp + * dest + * buf - + * len - number of bytes in buffer, must be less than max_inline for the QP + * Requires 1 send credit + */ +static inline int +usd_post_send_one_copy(struct usd_qp *qp, struct usd_dest *dest, + const void *buf, size_t len, uint32_t flags, void *context) +{ + return qp->uq_ops.qo_post_send_one_copy( + qp, dest, buf, len, flags, context); +} + +/* + * post a two-buffer send, the first buffer is a usually a header and must + * allow space *before* it for our header. + * For optimal efficieny, the first buffer should be aligned XXX + * Requires 2 send credits + */ +int usd_post_send_two_prefixed(struct usd_qp *qp, struct usd_dest *dest, + const void *hdr, size_t hdr_len, const void *pkt, size_t pkt_len, + uint32_t flags, void *context); + +/* + * post a two-buffer send, the first buffer is a usually a header. + * The header and the packet will be both be copied into registered + * memory by usnic_direct and sent. + * Requires 2 send credits + */ +static inline int +usd_post_send_two_copy(struct usd_qp *qp, struct usd_dest *dest, + const void *hdr, size_t hdrlen, const void *pkt, size_t pktlen, + uint32_t flags, void *context) +{ + return qp->uq_ops.qo_post_send_two_copy( + qp, dest, hdr, hdrlen, pkt, pktlen, flags, context); +} + +/* + * Post an N-buffer send + * All buffers must be in registered memory. 
+ * Requires iov_count + 1 send credits + */ +static inline int +usd_post_send_iov(struct usd_qp *qp, struct usd_dest *dest, + const struct iovec *iov, size_t iov_count, uint32_t flags, + void *context) +{ + return qp->uq_ops.qo_post_send_iov( + qp, dest, iov, iov_count, flags, context); +} + +/**************************************************************** + * enum-to-string utility functions (for prettyprinting) + ****************************************************************/ + +const char *usd_link_state_str(enum usd_link_state state); + +const char *usd_completion_status_str(enum usd_completion_status cstatus); + +const char *usd_completion_type_str(enum usd_completion_type ctype); + +const char *usd_filter_type_str(enum usd_filter_type ftype); + +const char *usd_qp_transport_str(enum usd_qp_transport qpt); + +const char *usd_qp_type_str(enum usd_qp_type); + +const char *usd_qp_event_event_type_str(enum usd_device_event_type det); + +const char *usd_send_flag_sift_str(enum usd_send_flag_shift sfs); + +const char *usd_capability(enum usd_capability cap); + +const char *usd_devid_to_nicname(uint32_t vendor_id, uint32_t device_id); + +const char *usd_devid_to_pid(uint32_t vendor_id, uint32_t device_id); + +/**************************************************************** + * special API holes punched for implementing verbs + ****************************************************************/ +/* open a context, mapped to a verbs open_device call */ +int usd_open_context(const char *dev_name, int cmd_fd, + struct usd_context **ctx_o); + +int usd_close_context(struct usd_context *ctx); + +/* modify the destination UDP port in a usd_dest */ +void usd_dest_set_udp_ports(struct usd_dest *dest, struct usd_qp *src_qp, + uint16_t dest_port_be); + +/* create a dest with only IP addresses set */ +int usd_create_ip_dest(struct usd_device *dev, uint32_t dest_ip_be, + struct usd_dest **dest_o); + +#endif /* _USNIC_DIRECT_H_ */ diff --git a/prov/usnic/src/usnic_direct/usnic_ib_abi.h b/prov/usnic/src/usnic_direct/usnic_ib_abi.h new file mode 100644 index 00000000000..dbf7cbc9914 --- /dev/null +++ b/prov/usnic/src/usnic_direct/usnic_ib_abi.h @@ -0,0 +1,153 @@ +/* + * Copyright (c) 2013-2017, Cisco Systems, Inc. All rights reserved. + * + * LICENSE_BEGIN + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
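usnic_direct.h above is the whole public surface, so a rough end-to-end sketch of the intended call order may help: open a device, create a CQ, create a UDP queue pair with a filter, resolve a destination, post a copy-send, and poll for its completion. This illustrates only the declarations in the header and is not code from the patch; error cleanup is omitted, and a few conventions are assumptions (comp_fd == -1 meaning no completion channel, -EAGAIN from usd_poll_cq() meaning an empty CQ, port 0 in the filter letting the library choose a port):

    #include <errno.h>
    #include <stdint.h>
    #include <string.h>
    #include "usnic_direct.h"

    int send_one_datagram(const char *devname, uint32_t dest_ip_be,
                          uint16_t dest_port_be, const void *payload, size_t len)
    {
        struct usd_device *dev;
        struct usd_cq_init_attr cq_attr;
        struct usd_cq *cq;
        struct usd_filter filt;
        struct usd_qp *qp;
        struct usd_dest *dest;
        struct usd_completion comp;
        int ret;

        ret = usd_open(devname, &dev);
        if (ret != 0)
            return ret;

        memset(&cq_attr, 0, sizeof(cq_attr));
        cq_attr.num_entries = 64;
        cq_attr.comp_fd = -1;                 /* assumption: no completion channel */
        ret = usd_create_cq(dev, &cq_attr, &cq);
        if (ret != 0)
            return ret;

        memset(&filt, 0, sizeof(filt));
        filt.uf_type = USD_FTY_UDP;
        filt.uf_filter.uf_udp.u_port = 0;     /* assumption: 0 lets the library pick */

        /* one CQ shared by send and receive completions; 32 credits each way */
        ret = usd_create_qp(dev, USD_QTR_UDP, USD_QTY_UD, cq, cq, 32, 32,
                            &filt, &qp);
        if (ret != 0)
            return ret;

        ret = usd_create_dest(dev, dest_ip_be, dest_port_be, &dest);
        if (ret != 0)
            return ret;

        /* copy-send: payload is copied into registered memory by the library;
         * len must stay below the QP's uqa_max_inline */
        ret = usd_post_send_one_copy(qp, dest, payload, len, USD_SF_SIGNAL, NULL);
        if (ret != 0)
            return ret;

        while ((ret = usd_poll_cq(cq, &comp)) == -EAGAIN)
            ;                                 /* assumption: -EAGAIN means CQ empty */
        if (ret != 0)
            return ret;

        return comp.uc_status == USD_COMPSTAT_SUCCESS ? 0 : -1;
    }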
IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * LICENSE_END + * + * + */ + + +#ifndef USNIC_IB_ABI_H +#define USNIC_IB_ABI_H + +#include "kcompat.h" +#include + +/* + * Pick up common file with driver + */ +#include "usnic_abi.h" + +struct usnic_query_device { + struct ib_uverbs_cmd_hdr ibv_cmd_hdr; + struct ib_uverbs_query_device ibv_cmd; +}; + +struct usnic_query_port { + struct ib_uverbs_cmd_hdr ibv_cmd_hdr; + struct ib_uverbs_query_port ibv_cmd; +}; + +struct usnic_get_context { + struct ib_uverbs_cmd_hdr ibv_cmd_hdr; + struct ib_uverbs_get_context ibv_cmd; + struct usnic_ib_get_context_cmd usnic_cmd; + __u64 reserved; +}; + +struct usnic_get_context_resp { + struct ib_uverbs_get_context_resp ibv_resp; + struct usnic_ib_get_context_resp usnic_resp; + __u64 reserved; +}; + +struct usnic_alloc_pd { + struct ib_uverbs_cmd_hdr ibv_cmd_hdr; + struct ib_uverbs_alloc_pd ibv_cmd; + struct usnic_ib_alloc_pd_cmd usnic_cmd; +}; + +struct usnic_alloc_pd_resp { + struct ib_uverbs_alloc_pd_resp ibv_resp; + struct usnic_ib_alloc_pd_resp usnic_resp; +}; + +struct usnic_dealloc_pd { + struct ib_uverbs_cmd_hdr ibv_cmd_hdr; + struct ib_uverbs_dealloc_pd ibv_cmd; +}; + +struct usnic_create_comp_channel { + struct ib_uverbs_cmd_hdr ibv_cmd_hdr; + struct ib_uverbs_create_comp_channel ibv_cmd; +}; + +struct usnic_reg_mr { + struct ib_uverbs_cmd_hdr ibv_cmd_hdr; + struct ib_uverbs_reg_mr ibv_cmd; + __u64 reserved; +}; + +struct usnic_reg_mr_resp { + struct ib_uverbs_reg_mr_resp ibv_resp; + __u64 reserved; +}; + +struct usnic_dereg_mr { + struct ib_uverbs_cmd_hdr ibv_cmd_hdr; + struct ib_uverbs_dereg_mr ibv_cmd; +}; + +struct usnic_create_qp { + struct ib_uverbs_cmd_hdr ibv_cmd_hdr; + struct ib_uverbs_create_qp ibv_cmd; + struct usnic_ib_create_qp_cmd usnic_cmd; + __u64 reserved[8]; +}; + +struct usnic_create_qp_resp { + struct ib_uverbs_create_qp_resp ibv_resp; + struct usnic_ib_create_qp_resp usnic_resp; +}; + +struct usnic_modify_qp { + struct ib_uverbs_cmd_hdr ibv_cmd_hdr; + struct ib_uverbs_modify_qp ibv_cmd; +}; + +struct usnic_destroy_qp { + struct ib_uverbs_cmd_hdr ibv_cmd_hdr; + struct ib_uverbs_destroy_qp ibv_cmd; +}; + +struct usnic_create_cq { + struct ib_uverbs_cmd_hdr ibv_cmd_hdr; + struct ib_uverbs_create_cq ibv_cmd; + struct usnic_ib_create_cq usnic_cmd; +}; + +struct usnic_create_cq_resp { + struct ib_uverbs_create_cq_resp ibv_resp; + struct usnic_ib_create_cq_resp usnic_resp; +}; + +struct usnic_destroy_cq { + struct ib_uverbs_cmd_hdr ibv_cmd_hdr; + struct ib_uverbs_destroy_cq ibv_cmd; +}; + +#endif /* USNIC_IB_ABI_H */ diff --git a/prov/usnic/src/usnic_direct/usnic_ip_utils.c b/prov/usnic/src/usnic_direct/usnic_ip_utils.c new file mode 100644 index 00000000000..aa8a66eae7a --- /dev/null +++ b/prov/usnic/src/usnic_direct/usnic_ip_utils.c @@ -0,0 +1,199 @@ +/* + * Copyright (c) 2014, Cisco Systems, Inc. All rights reserved. + * + * LICENSE_BEGIN + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * LICENSE_END + * + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "libnl_utils.h" +#include "usnic_user_utils.h" +#include "usnic_ip_utils.h" + +int usnic_arp_lookup(char *ifname, uint32_t ipaddr, int sockfd, uint8_t *macaddr) +{ + struct arpreq req; + struct sockaddr_in sinp; + int err; + int status; + + memset(&req, 0, sizeof req); + strncpy(req.arp_dev, ifname, sizeof(req.arp_dev) - 1); + memset(&sinp, 0, sizeof(sinp)); + sinp.sin_family = AF_INET; + sinp.sin_addr.s_addr = ipaddr; + memcpy(&req.arp_pa, &sinp, sizeof(sinp)); + + err = 0; + status = ioctl(sockfd, SIOCGARP, (char *)&req, sizeof(req)); + if (status != -1 && (req.arp_flags & ATF_COM)) + memcpy(macaddr, req.arp_ha.sa_data, 6); + else if (status != -1) /* req.arp_flags & ATF_COM == 0 */ + err = EAGAIN; + else if (errno == ENXIO) /* ENXIO means no ARP entry was found */ + err = EAGAIN; + else /* status == -1 */ + err = errno; + + return err; +} + +static int usnic_arp_lookup_index(int if_index, uint32_t ipaddr, int sockfd, uint8_t *macaddr) +{ + char ifname[IF_NAMESIZE]; + + if (if_indextoname((unsigned int)if_index, ifname) == NULL) { + usnic_perr("if_indextoname failed. ifindex: %d", if_index); + return errno; + } + + return usnic_arp_lookup(ifname, ipaddr, sockfd, macaddr); +} + +int usnic_arp_request(uint32_t ipaddr, int sockfd) +{ + struct sockaddr_in sin; + int err = 0; + + memset(&sin, 0, sizeof(sin)); + sin.sin_addr.s_addr = ipaddr; + sin.sin_port = htons(9); /* Send to Discard Protocol */ + err = sendto(sockfd, NULL, 0, 0, (struct sockaddr *)&sin, sizeof(sin)); + if (err == -1) { + char buf[INET_ADDRSTRLEN]; + inet_ntop(AF_INET, &ipaddr, buf, sizeof(buf)); + usnic_perr("Arp triggering socket sendto() failed. 
ip: %s", + buf); + } + else + err = 0; + + return err; +} + +static +int usnic_resolve_arp(int if_index, uint32_t ipaddr, uint8_t *macaddr) +{ + int sockfd; + int err; + char buf[INET_ADDRSTRLEN]; + + inet_ntop(AF_INET, &ipaddr, buf, sizeof(buf)); + sockfd = socket(AF_INET, SOCK_DGRAM, 0); + if (sockfd == -1) { + usnic_perr( + "socket() failed when creating socket for arp resolution, ip: %s", + buf); + return ENXIO; + } + + err = usnic_arp_lookup_index(if_index, ipaddr, sockfd, macaddr); + if (err == EAGAIN || err == ENXIO) { + /* entry is FAILED or INCOMPLETE or does not exist, send a dummy packet */ + err = usnic_arp_request(ipaddr, sockfd); + if (err) /* sendto failure, abort */ + err = ENXIO; + else + err = EAGAIN; + } + + close(sockfd); + return err; +} + +int usnic_resolve_dst(int if_index, uint32_t src_ip_addr, + uint32_t dst_ip_addr, uint8_t *macaddr) +{ + uint32_t nh_ip_addr = 0; + int err; + + err = usnic_nl_rt_lookup(src_ip_addr, dst_ip_addr, if_index, + &nh_ip_addr); + if (err) { + char ifname[IFNAMSIZ]; + char src_buf[INET_ADDRSTRLEN]; + char dst_buf[INET_ADDRSTRLEN]; + + if_indextoname((unsigned int)if_index, ifname); + inet_ntop(AF_INET, &src_ip_addr, src_buf, sizeof(src_buf)); + inet_ntop(AF_INET, &dst_ip_addr, dst_buf, sizeof(dst_buf)); + + usnic_err( + "ip route lookup for dst: %s on if: %d device: %s src ip: %s failed\n", + dst_buf, if_index, ifname, src_buf); + return EHOSTUNREACH; + } + + if (nh_ip_addr) { + char nh_buf[INET_ADDRSTRLEN]; + char src_buf[INET_ADDRSTRLEN]; + char dst_buf[INET_ADDRSTRLEN]; + + inet_ntop(AF_INET, &nh_ip_addr, nh_buf, sizeof(nh_buf)); + inet_ntop(AF_INET, &src_ip_addr, src_buf, sizeof(src_buf)); + inet_ntop(AF_INET, &dst_ip_addr, dst_buf, sizeof(dst_buf)); + + usnic_info("ip route for dest %s src %s is via %s\n", + dst_buf, src_buf, nh_buf); + } else { + char src_buf[INET_ADDRSTRLEN]; + char dst_buf[INET_ADDRSTRLEN]; + + inet_ntop(AF_INET, &src_ip_addr, src_buf, sizeof(src_buf)); + inet_ntop(AF_INET, &dst_ip_addr, dst_buf, sizeof(dst_buf)); + usnic_info("ip route for dest %s src %s is directly connected\n", + dst_buf, src_buf); + } + + if (nh_ip_addr) + return usnic_resolve_arp(if_index, nh_ip_addr, macaddr); + else + return usnic_resolve_arp(if_index, dst_ip_addr, macaddr); +} diff --git a/prov/usnic/src/usnic_direct/usnic_ip_utils.h b/prov/usnic/src/usnic_direct/usnic_ip_utils.h new file mode 100644 index 00000000000..e9f1f600da9 --- /dev/null +++ b/prov/usnic/src/usnic_direct/usnic_ip_utils.h @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2014, Cisco Systems, Inc. All rights reserved. + * + * LICENSE_BEGIN + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
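usnic_resolve_dst() above first asks the routing table (via usnic_nl_rt_lookup) whether the destination sits behind a gateway, then resolves the chosen next hop with ARP; EAGAIN means an ARP request was just triggered and the caller should retry. A minimal caller sketch under that assumption — the wrapper name and retry/backoff policy are illustrative, not part of this patch:

#include <errno.h>
#include <stdint.h>
#include <unistd.h>
#include "usnic_ip_utils.h"

/* Hypothetical polling wrapper: retry while the kernel completes ARP. */
static int resolve_dst_blocking(int if_index, uint32_t src_ip, uint32_t dst_ip,
                                uint8_t *macaddr, int max_tries)
{
    int err;

    while (max_tries-- > 0) {
        err = usnic_resolve_dst(if_index, src_ip, dst_ip, macaddr);
        if (err != EAGAIN)
            return err;     /* 0 on success, or a hard error */
        usleep(1000);       /* give the ARP reply time to arrive */
    }
    return EAGAIN;
}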
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * LICENSE_END + * + * + */ + +#ifndef USNIC_IP_UTILS_H +#define USNIC_IP_UTILS_H + +int usnic_arp_lookup(char *ifname, uint32_t ipaddr, int sockfd, + uint8_t *macaddr); +int usnic_arp_request(uint32_t ipaddr, int sockfd); +int usnic_resolve_dst(int if_index, uint32_t src_ip_addr, + uint32_t dst_ip_addr, uint8_t *macaddr); + +#endif /* USNIC_IP_UTILS_H */ diff --git a/prov/usnic/src/usnic_direct/usnic_user_utils.h b/prov/usnic/src/usnic_direct/usnic_user_utils.h new file mode 100644 index 00000000000..2a92100d335 --- /dev/null +++ b/prov/usnic/src/usnic_direct/usnic_user_utils.h @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2014, Cisco Systems, Inc. All rights reserved. + * + * LICENSE_BEGIN + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * LICENSE_END + * + * + */ + +#ifndef USNIC_USER_UTILS_H +#define USNIC_USER_UTILS_H + +#include +#include +#include + +#ifndef __CHAR_BIT__ +#define __CHAR_BIT__ 8 +#endif +#define CHAR_BIT __CHAR_BIT__ + +#define USNIC_LOG_LVL_NONE 0 +#define USNIC_LOG_LVL_ERR 1 +#define USNIC_LOG_LVL_INFO 2 +#define USNIC_LOG_LVL_VERBOSE 3 + +#if WANT_DEBUG_MSGS +#define USNIC_LOG_LVL USNIC_LOG_LVL_INFO +#else +#define USNIC_LOG_LVL USNIC_LOG_LVL_NONE +#endif + +#define usnic_printf(fd, args...) \ + do { \ + fprintf(fd, "usnic:%-22s:%5d: ", __func__, __LINE__); \ + fprintf(fd, args); \ + } while (0) + +#if USNIC_LOG_LVL >= USNIC_LOG_LVL_ERR +#define usnic_err(args...) usnic_printf(stderr, args) +#else +#define usnic_err(args...) {} +#endif + +#if USNIC_LOG_LVL >= USNIC_LOG_LVL_ERR +#define usnic_strerror(err, args, ...) \ + do { \ + char err_buf[50]; \ + char *estr = strerror_r(err, err_buf, sizeof(err_buf)); \ + fprintf(stderr, "usnic:%-22s:%5d: ", __func__, __LINE__); \ + fprintf(stderr, args " error: %s\n", ## __VA_ARGS__, \ + estr); \ + } while (0) +#else +#define usnic_strerror(err, args, ...) +#endif + +#if USNIC_LOG_LVL >= USNIC_LOG_LVL_ERR +#define usnic_perr(args, ...) \ + do { \ + char err_buf[50]; \ + char *estr = strerror_r(errno, err_buf, sizeof(err_buf)); \ + fprintf(stderr, "usnic:%-22s:%5d: ", __func__, __LINE__); \ + fprintf(stderr, args " error: %s\n", ## __VA_ARGS__, \ + estr); \ + } while (0) +#else +#define usnic_perr(args, ...) {} +#endif + +#if USNIC_LOG_LVL >= USNIC_LOG_LVL_INFO +#define usnic_info(args...) usnic_printf(stdout, args) +#else +#define usnic_info(args...) {} +#endif + +#if USNIC_LOG_LVL >= USNIC_LOG_LVL_VERBOSE +#define usnic_verbose(args...) usnic_printf(stdout, args) +#else +#define usnic_verbose(args...) {} +#endif + +#endif /* USNIC_USER_UTILS_H */ diff --git a/prov/usnic/src/usnic_direct/vnic_cq.c b/prov/usnic/src/usnic_direct/vnic_cq.c new file mode 100644 index 00000000000..12fc5cdb53c --- /dev/null +++ b/prov/usnic/src/usnic_direct/vnic_cq.c @@ -0,0 +1,128 @@ +/* + * Copyright 2008-2018 Cisco Systems, Inc. All rights reserved. + * Copyright 2007 Nuova Systems, Inc. All rights reserved. + * + * LICENSE_BEGIN + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
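The usnic_* logging macros above compile down to empty statements unless USNIC_LOG_LVL (driven by WANT_DEBUG_MSGS) is high enough, so call sites cost nothing in release builds. A small usage sketch, assuming the header is on the include path; the function and message are placeholders:

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include "usnic_user_utils.h"

static int open_config(const char *path)
{
    /* compiled out unless USNIC_LOG_LVL >= USNIC_LOG_LVL_INFO */
    usnic_info("opening %s\n", path);

    errno = ENOENT;              /* pretend the open failed */
    /* usnic_perr() appends strerror(errno) to the message */
    usnic_perr("could not open %s", path);
    return -ENOENT;
}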
IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * LICENSE_END + * + * + */ + +#include +#include +#include +#include + +#include "kcompat.h" +#include "vnic_dev.h" +#include "vnic_cq.h" + +#ifndef NOT_FOR_OPEN_ENIC +int vnic_cq_mem_size(struct vnic_cq *cq, unsigned int desc_count, + unsigned int desc_size) +{ + int mem_size; + + mem_size = vnic_dev_desc_ring_size(&cq->ring, desc_count, desc_size); + + return mem_size; +} + +#endif +void vnic_cq_free(struct vnic_cq *cq) +{ + vnic_dev_free_desc_ring(cq->vdev, &cq->ring); + + cq->ctrl = NULL; +} + +int vnic_cq_alloc(struct vnic_dev *vdev, struct vnic_cq *cq, unsigned int index, + unsigned int desc_count, unsigned int desc_size) +{ + int err; + + cq->index = index; + cq->vdev = vdev; + + cq->ctrl = vnic_dev_get_res(vdev, RES_TYPE_CQ, index); + if (!cq->ctrl) { + pr_err("Failed to hook CQ[%d] resource\n", index); + return -EINVAL; + } + + err = vnic_dev_alloc_desc_ring(vdev, &cq->ring, desc_count, desc_size); + if (err) + return err; + + return 0; +} + +void vnic_cq_init(struct vnic_cq *cq, unsigned int flow_control_enable, + unsigned int color_enable, unsigned int cq_head, unsigned int cq_tail, + unsigned int cq_tail_color, unsigned int interrupt_enable, + unsigned int cq_entry_enable, unsigned int cq_message_enable, + unsigned int interrupt_offset, u64 cq_message_addr) +{ + u64 paddr; + + paddr = (u64)cq->ring.base_addr | VNIC_PADDR_TARGET; + writeq(paddr, &cq->ctrl->ring_base); + iowrite32(cq->ring.desc_count, &cq->ctrl->ring_size); + iowrite32(flow_control_enable, &cq->ctrl->flow_control_enable); + iowrite32(color_enable, &cq->ctrl->color_enable); + iowrite32(cq_head, &cq->ctrl->cq_head); + iowrite32(cq_tail, &cq->ctrl->cq_tail); + iowrite32(cq_tail_color, &cq->ctrl->cq_tail_color); + iowrite32(interrupt_enable, &cq->ctrl->interrupt_enable); + iowrite32(cq_entry_enable, &cq->ctrl->cq_entry_enable); + iowrite32(cq_message_enable, &cq->ctrl->cq_message_enable); + iowrite32(interrupt_offset, &cq->ctrl->interrupt_offset); + writeq(cq_message_addr, &cq->ctrl->cq_message_addr); + + cq->interrupt_offset = interrupt_offset; +} + +void vnic_cq_clean(struct vnic_cq *cq) +{ + cq->to_clean = 0; + cq->last_color = 0; + + iowrite32(0, &cq->ctrl->cq_head); + iowrite32(0, &cq->ctrl->cq_tail); + iowrite32(1, &cq->ctrl->cq_tail_color); + + vnic_dev_clear_desc_ring(&cq->ring); +} diff --git a/prov/usnic/src/usnic_direct/vnic_cq.h b/prov/usnic/src/usnic_direct/vnic_cq.h new file mode 100644 index 00000000000..af1c19f769d --- /dev/null +++ b/prov/usnic/src/usnic_direct/vnic_cq.h @@ -0,0 +1,154 @@ +/* + * Copyright 2008-2018 Cisco Systems, Inc. All rights reserved. + * Copyright 2007 Nuova Systems, Inc. All rights reserved. + * + * LICENSE_BEGIN + * + * This software is available to you under a choice of one of two + * licenses. 
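vnic_cq.c above defines the CQ lifecycle: vnic_cq_alloc() binds the struct to its RES_TYPE_CQ control registers and allocates the descriptor ring, vnic_cq_init() programs the ring base, size, and tail color into hardware, vnic_cq_clean() resets the software state, and vnic_cq_free() releases the ring. A rough setup sketch, assuming a vdev that has already been registered; the index, ring geometry, and flag values are illustrative only:

#include "kcompat.h"
#include "vnic_dev.h"
#include "vnic_cq.h"

/* Illustrative only: bring up CQ 0 with 64 16-byte descriptors. */
static int example_cq_setup(struct vnic_dev *vdev, struct vnic_cq *cq)
{
    int err;

    err = vnic_cq_alloc(vdev, cq, 0 /* index */, 64, 16);
    if (err)
        return err;

    vnic_cq_init(cq,
                 0,          /* flow_control_enable */
                 1,          /* color_enable */
                 0, 0,       /* cq_head, cq_tail */
                 1,          /* cq_tail_color */
                 0,          /* interrupt_enable */
                 1,          /* cq_entry_enable */
                 0,          /* cq_message_enable */
                 0,          /* interrupt_offset */
                 0);         /* cq_message_addr */
    return 0;
}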
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * LICENSE_END + * + * + */ + +#ifndef _VNIC_CQ_H_ +#define _VNIC_CQ_H_ + +#include "cq_desc.h" +#include "vnic_dev.h" + +/* Completion queue control */ +struct vnic_cq_ctrl { + u64 ring_base; /* 0x00 */ + u32 ring_size; /* 0x08 */ + u32 pad0; + u32 flow_control_enable; /* 0x10 */ + u32 pad1; + u32 color_enable; /* 0x18 */ + u32 pad2; + u32 cq_head; /* 0x20 */ + u32 pad3; + u32 cq_tail; /* 0x28 */ + u32 pad4; + u32 cq_tail_color; /* 0x30 */ + u32 pad5; + u32 interrupt_enable; /* 0x38 */ + u32 pad6; + u32 cq_entry_enable; /* 0x40 */ + u32 pad7; + u32 cq_message_enable; /* 0x48 */ + u32 pad8; + u32 interrupt_offset; /* 0x50 */ + u32 pad9; + u64 cq_message_addr; /* 0x58 */ + u32 pad10; +}; + +#ifdef ENIC_AIC +struct vnic_rx_bytes_counter { + unsigned int small_pkt_bytes_cnt; + unsigned int large_pkt_bytes_cnt; +}; +#endif + +struct vnic_cq { + unsigned int index; + struct vnic_dev *vdev; + struct vnic_cq_ctrl __iomem *ctrl; /* memory-mapped */ + struct vnic_dev_ring ring; + unsigned int to_clean; + unsigned int last_color; + unsigned int interrupt_offset; +#ifdef ENIC_AIC + struct vnic_rx_bytes_counter pkt_size_counter; + unsigned int cur_rx_coal_timeval; + unsigned int tobe_rx_coal_timeval; + ktime_t prev_ts; +#endif +}; + +static inline unsigned int vnic_cq_service(struct vnic_cq *cq, + unsigned int work_to_do, + int (*q_service)(struct vnic_dev *vdev, struct cq_desc *cq_desc, + u8 type, u16 q_number, u16 completed_index, void *opaque), + void *opaque) +{ + struct cq_desc *cq_desc; + unsigned int work_done = 0; + u16 q_number, completed_index; + u8 type, color; + + cq_desc = (struct cq_desc *)((u8 *)cq->ring.descs + + cq->ring.desc_size * cq->to_clean); + cq_desc_dec(cq_desc, &type, &color, + &q_number, &completed_index); + + while (color != cq->last_color) { + if ((*q_service)(cq->vdev, cq_desc, type, + q_number, completed_index, opaque)) + break; + + cq->to_clean++; + if (cq->to_clean == cq->ring.desc_count) { + cq->to_clean = 0; + cq->last_color = cq->last_color ? 
0 : 1; + } + + cq_desc = (struct cq_desc *)((u8 *)cq->ring.descs + + cq->ring.desc_size * cq->to_clean); + cq_desc_dec(cq_desc, &type, &color, + &q_number, &completed_index); + + work_done++; + if (work_done >= work_to_do) + break; + } + + return work_done; +} + +void vnic_cq_free(struct vnic_cq *cq); +int vnic_cq_alloc(struct vnic_dev *vdev, struct vnic_cq *cq, unsigned int index, + unsigned int desc_count, unsigned int desc_size); +void vnic_cq_init(struct vnic_cq *cq, unsigned int flow_control_enable, + unsigned int color_enable, unsigned int cq_head, unsigned int cq_tail, + unsigned int cq_tail_color, unsigned int interrupt_enable, + unsigned int cq_entry_enable, unsigned int message_enable, + unsigned int interrupt_offset, u64 message_addr); +void vnic_cq_clean(struct vnic_cq *cq); +#ifndef NOT_FOR_OPEN_ENIC +int vnic_cq_mem_size(struct vnic_cq *cq, unsigned int desc_count, + unsigned int desc_size); +#endif + +#endif /* _VNIC_CQ_H_ */ diff --git a/prov/usnic/src/usnic_direct/vnic_dev.c b/prov/usnic/src/usnic_direct/vnic_dev.c new file mode 100644 index 00000000000..ca393aec83e --- /dev/null +++ b/prov/usnic/src/usnic_direct/vnic_dev.c @@ -0,0 +1,1787 @@ +/* + * Copyright 2008-2018 Cisco Systems, Inc. All rights reserved. + * Copyright 2007 Nuova Systems, Inc. All rights reserved. + * + * LICENSE_BEGIN + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
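vnic_cq_service() above walks the ring until a descriptor's color bit stops matching last_color, hands each completion to the caller's q_service callback, and flips last_color on wrap. A sketch of what a callback and its invocation could look like; the function names, the opaque payload, and the budget of 32 entries are assumptions, not part of this patch:

#include "kcompat.h"
#include "vnic_dev.h"
#include "vnic_cq.h"

/* Hypothetical per-completion handler: return non-zero to stop early. */
static int example_q_service(struct vnic_dev *vdev, struct cq_desc *cq_desc,
                             u8 type, u16 q_number, u16 completed_index,
                             void *opaque)
{
    unsigned int *completions = opaque;

    (*completions)++;
    return 0;       /* keep draining */
}

static unsigned int example_poll_cq(struct vnic_cq *cq)
{
    unsigned int completions = 0;

    /* drain at most 32 entries per call to bound time in the poll loop */
    vnic_cq_service(cq, 32, example_q_service, &completions);
    return completions;
}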
+ * + * LICENSE_END + * + * + */ + +#include +#include +#include +#include +#include +#include + +#include "kcompat.h" +#include "vnic_resource.h" +#include "vnic_devcmd.h" +#include "vnic_dev.h" +#include "vnic_stats.h" +#include "vnic_wq.h" + +struct devcmd2_controller { + struct vnic_wq_ctrl *wq_ctrl; + struct vnic_dev_ring results_ring; + struct vnic_wq wq; + struct vnic_devcmd2 *cmd_ring; + struct devcmd2_result *result; + u16 next_result; + u16 result_size; + int color; + u32 posted; +}; + +enum vnic_proxy_type { + PROXY_NONE, + PROXY_BY_BDF, + PROXY_BY_INDEX, +}; + +struct vnic_res { + void __iomem *vaddr; + dma_addr_t bus_addr; + unsigned int count; + u8 bar_num; + u32 bar_offset; + unsigned long len; +}; + +struct vnic_intr_coal_timer_info { + u32 mul; + u32 div; + u32 max_usec; +}; + +struct vnic_dev { + void *priv; + struct pci_dev *pdev; + struct vnic_res res[RES_TYPE_MAX]; + enum vnic_dev_intr_mode intr_mode; + struct vnic_devcmd __iomem *devcmd; + struct vnic_devcmd_notify *notify; + struct vnic_devcmd_notify notify_copy; + dma_addr_t notify_pa; + u32 notify_sz; + dma_addr_t linkstatus_pa; + struct vnic_stats *stats; + dma_addr_t stats_pa; + struct vnic_devcmd_fw_info *fw_info; + dma_addr_t fw_info_pa; + enum vnic_proxy_type proxy; + u32 proxy_index; + u64 args[VNIC_DEVCMD_NARGS]; + struct vnic_intr_coal_timer_info intr_coal_timer_info; + struct devcmd2_controller *devcmd2; + int (*devcmd_rtn)(struct vnic_dev *vdev, enum vnic_devcmd_cmd cmd, int wait); +}; + +#define VNIC_MAX_RES_HDR_SIZE \ + (sizeof(struct vnic_resource_header) + \ + sizeof(struct vnic_resource) * RES_TYPE_MAX) +#define VNIC_RES_STRIDE 128 + +void *vnic_dev_priv(struct vnic_dev *vdev) +{ + return vdev->priv; +} + +int vnic_dev_get_size(void) +{ + return sizeof(struct vnic_dev); +} + +static int vnic_dev_discover_res(struct vnic_dev *vdev, + struct vnic_dev_bar *bar, unsigned int num_bars) +{ + struct vnic_resource_header __iomem *rh; + struct mgmt_barmap_hdr __iomem *mrh; + struct vnic_resource __iomem *r; + u8 type; + + if (num_bars == 0) + return -EINVAL; + + if (bar->len < VNIC_MAX_RES_HDR_SIZE) { + pr_err("vNIC BAR0 res hdr length error\n"); + return -EINVAL; + } + + rh = bar->vaddr; + mrh = bar->vaddr; + if (!rh) { + pr_err("vNIC BAR0 res hdr not mem-mapped\n"); + return -EINVAL; + } + + /* Check for mgmt vnic in addition to normal vnic */ + if ((ioread32(&rh->magic) != VNIC_RES_MAGIC) || + (ioread32(&rh->version) != VNIC_RES_VERSION)) { + if ((ioread32(&mrh->magic) != MGMTVNIC_MAGIC) || + (ioread32(&mrh->version) != MGMTVNIC_VERSION)) { + pr_err("vNIC BAR0 res magic/version error " + "exp (%lx/%lx) or (%lx/%lx), curr (%x/%x)\n", + VNIC_RES_MAGIC, VNIC_RES_VERSION, + MGMTVNIC_MAGIC, MGMTVNIC_VERSION, + ioread32(&rh->magic), ioread32(&rh->version)); + return -EINVAL; + } + } + + if (ioread32(&mrh->magic) == MGMTVNIC_MAGIC) + r = (struct vnic_resource __iomem *)(mrh + 1); + else + r = (struct vnic_resource __iomem *)(rh + 1); + + while ((type = ioread8(&r->type)) != RES_TYPE_EOL) { + + u8 bar_num = ioread8(&r->bar); + u32 bar_offset = ioread32(&r->bar_offset); + u32 count = ioread32(&r->count); + u32 len; + + r++; + + if (bar_num >= num_bars) + continue; + + if (!bar[bar_num].len || !bar[bar_num].vaddr) + continue; + + switch (type) { + case RES_TYPE_WQ: + case RES_TYPE_RQ: + case RES_TYPE_CQ: + case RES_TYPE_INTR_CTRL: + case RES_TYPE_GRPMBR_INTR: + /* each count is stride bytes long */ + len = count * VNIC_RES_STRIDE; + if (len + bar_offset > bar[bar_num].len) { + pr_err("vNIC BAR0 resource %d " + 
"out-of-bounds, offset 0x%x + " + "size 0x%x > bar len 0x%lx\n", + type, bar_offset, + len, + bar[bar_num].len); + return -EINVAL; + } + break; + case RES_TYPE_DPKT: + case RES_TYPE_MEM: + case RES_TYPE_INTR_PBA_LEGACY: +#ifdef CONFIG_MIPS + case RES_TYPE_DEV: +#endif + case RES_TYPE_DEVCMD2: + case RES_TYPE_DEVCMD: + len = count; + break; + default: + continue; + } + + vdev->res[type].count = count; + vdev->res[type].vaddr = (char __iomem *)bar[bar_num].vaddr + + bar_offset; + vdev->res[type].bus_addr = bar[bar_num].bus_addr + bar_offset; + vdev->res[type].bar_num = bar_num; + vdev->res[type].bar_offset = bar_offset; + vdev->res[type].len = len; + } + + return 0; +} + +/* + * Assign virtual addresses to all resources whose bus address falls + * within the specified map. + * vnic_dev_discover_res assigns res vaddrs based on the assumption that + * the entire bar is mapped once. When memory regions on the bar + * are mapped seperately, the vnic res for those regions need to be updated + * with new virutal addresses. + * Notice that the mapping and virtual address update need to be done before + * other VNIC APIs that might use the old virtual address, + * such as vdev->devcmd + */ +void vnic_dev_upd_res_vaddr(struct vnic_dev *vdev, + struct vnic_dev_iomap_info *map) +{ + int i; + + for (i = RES_TYPE_EOL; i < RES_TYPE_MAX; i++) { + if (i == RES_TYPE_EOL) + continue; + if (vdev->res[i].bus_addr >= map->bus_addr && + vdev->res[i].bus_addr < map->bus_addr + map->len) + vdev->res[i].vaddr = ((uint8_t *)map->vaddr) + + (vdev->res[i].bus_addr - map->bus_addr); + } +} +EXPORT_SYMBOL(vnic_dev_upd_res_vaddr); + +unsigned int vnic_dev_get_res_count(struct vnic_dev *vdev, + enum vnic_res_type type) +{ + return vdev->res[type].count; +} +EXPORT_SYMBOL(vnic_dev_get_res_count); + +void __iomem *vnic_dev_get_res(struct vnic_dev *vdev, enum vnic_res_type type, + unsigned int index) +{ + if (!vdev->res[type].vaddr) + return NULL; + + switch (type) { + case RES_TYPE_WQ: + case RES_TYPE_RQ: + case RES_TYPE_CQ: + case RES_TYPE_INTR_CTRL: + case RES_TYPE_GRPMBR_INTR: + return (char __iomem *)vdev->res[type].vaddr + + index * VNIC_RES_STRIDE; + default: + return (char __iomem *)vdev->res[type].vaddr; + } +} +EXPORT_SYMBOL(vnic_dev_get_res); + +dma_addr_t vnic_dev_get_res_bus_addr(struct vnic_dev *vdev, + enum vnic_res_type type, unsigned int index) +{ + switch (type) { + case RES_TYPE_WQ: + case RES_TYPE_RQ: + case RES_TYPE_CQ: + case RES_TYPE_INTR_CTRL: + case RES_TYPE_GRPMBR_INTR: + return vdev->res[type].bus_addr + + index * VNIC_RES_STRIDE; + default: + return vdev->res[type].bus_addr; + } +} +EXPORT_SYMBOL(vnic_dev_get_res_bus_addr); + +uint8_t vnic_dev_get_res_bar(struct vnic_dev *vdev, + enum vnic_res_type type) +{ + return vdev->res[type].bar_num; +} +EXPORT_SYMBOL(vnic_dev_get_res_bar); + +uint32_t vnic_dev_get_res_offset(struct vnic_dev *vdev, + enum vnic_res_type type, unsigned int index) +{ + switch (type) { + case RES_TYPE_WQ: + case RES_TYPE_RQ: + case RES_TYPE_CQ: + case RES_TYPE_INTR_CTRL: + case RES_TYPE_GRPMBR_INTR: + return vdev->res[type].bar_offset + + index * VNIC_RES_STRIDE; + default: + return vdev->res[type].bar_offset; + } +} +EXPORT_SYMBOL(vnic_dev_get_res_offset); + +/* + * Get the length of the res type + */ +unsigned long vnic_dev_get_res_type_len(struct vnic_dev *vdev, + enum vnic_res_type type) +{ + return vdev->res[type].len; +} +EXPORT_SYMBOL(vnic_dev_get_res_type_len); + +unsigned int vnic_dev_desc_ring_size(struct vnic_dev_ring *ring, + unsigned int desc_count, unsigned int 
desc_size) +{ + /* The base address of the desc rings must be 512 byte aligned. + * Descriptor count is aligned to groups of 32 descriptors. A + * count of 0 means the maximum 4096 descriptors. Descriptor + * size is aligned to 16 bytes. + */ + + unsigned int count_align = 32; + unsigned int desc_align = 16; + + ring->base_align = 512; + + if (desc_count == 0) + desc_count = 4096; + + ring->desc_count = ALIGN(desc_count, count_align); + + ring->desc_size = ALIGN(desc_size, desc_align); + + ring->size = ring->desc_count * ring->desc_size; + ring->size_unaligned = ring->size + ring->base_align; + + return ring->size_unaligned; +} + +void vnic_dev_clear_desc_ring(struct vnic_dev_ring *ring) +{ + memset(ring->descs, 0, ring->size); +} + +int vnic_dev_alloc_desc_ring(struct vnic_dev *vdev, struct vnic_dev_ring *ring, + unsigned int desc_count, unsigned int desc_size) +{ + vnic_dev_desc_ring_size(ring, desc_count, desc_size); + + ring->descs_unaligned = pci_alloc_consistent(vdev->pdev, + ring->size_unaligned, + &ring->base_addr_unaligned); + + if (!ring->descs_unaligned) { + pr_err("Failed to allocate ring (size=%d), aborting\n", + (int)ring->size); + return -ENOMEM; + } + + ring->base_addr = ALIGN(ring->base_addr_unaligned, + ring->base_align); + ring->descs = (u8 *)ring->descs_unaligned + + (ring->base_addr - ring->base_addr_unaligned); + + vnic_dev_clear_desc_ring(ring); + + ring->desc_avail = ring->desc_count - 1; + + return 0; +} + +void vnic_dev_free_desc_ring(struct vnic_dev *vdev, struct vnic_dev_ring *ring) +{ + if (ring->descs) { + pci_free_consistent(vdev->pdev, + ring->size_unaligned, + ring->descs_unaligned, + ring->base_addr_unaligned); + ring->descs = NULL; + } +} + +static int _vnic_dev_cmd(struct vnic_dev *vdev, enum vnic_devcmd_cmd cmd, + int wait) +{ +#if defined(CONFIG_MIPS) || defined(MGMT_VNIC) + return 0; +#else + struct vnic_devcmd __iomem *devcmd = vdev->devcmd; + unsigned int i; + int delay; + u32 status; + int err; + + status = ioread32(&devcmd->status); + if (status == 0xFFFFFFFF) { + /* PCI-e target device is gone */ + return -ENODEV; + } + if (status & STAT_BUSY) { + pr_err("%s: Busy devcmd %d\n", + pci_name(vdev->pdev), _CMD_N(cmd)); + return -EBUSY; + } + + if (_CMD_DIR(cmd) & _CMD_DIR_WRITE) { + for (i = 0; i < VNIC_DEVCMD_NARGS; i++) + writeq(vdev->args[i], &devcmd->args[i]); + wmb(); + } + + iowrite32(cmd, &devcmd->cmd); + + if ((_CMD_FLAGS(cmd) & _CMD_FLAGS_NOWAIT)) + return 0; + + for (delay = 0; delay < wait; delay++) { + + udelay(100); + + status = ioread32(&devcmd->status); + if (status == 0xFFFFFFFF) { + /* PCI-e target device is gone */ + return -ENODEV; + } + + if (!(status & STAT_BUSY)) { + if (status & STAT_ERROR) { + err = -(int)readq(&devcmd->args[0]); + if (cmd != CMD_CAPABILITY) + pr_err("%s: Devcmd %d failed " + "with error code %d\n", + pci_name(vdev->pdev), + _CMD_N(cmd), err); + return err; + } + + if (_CMD_DIR(cmd) & _CMD_DIR_READ) { + rmb(); + for (i = 0; i < VNIC_DEVCMD_NARGS; i++) + vdev->args[i] = readq(&devcmd->args[i]); + } + + return 0; + } + } + + pr_err("%s: Timedout devcmd %d\n", + pci_name(vdev->pdev), _CMD_N(cmd)); + return -ETIMEDOUT; +#endif +} + +static int _vnic_dev_cmd2(struct vnic_dev *vdev, enum vnic_devcmd_cmd cmd, + int wait) +{ +#if defined(CONFIG_MIPS) || defined(MGMT_VNIC) + return 0; +#else + struct devcmd2_controller *dc2c = vdev->devcmd2; + struct devcmd2_result *result; + u8 color; + unsigned int i; + int delay; + int err; + u32 fetch_index; + u32 posted = dc2c->posted; + u32 new_posted; + + fetch_index = 
ioread32(&dc2c->wq_ctrl->fetch_index); + + if (fetch_index == 0xFFFFFFFF) { /* check for hardware gone */ + /* Hardware surprise removal: return error */ + return -ENODEV; + + } + new_posted = (posted + 1) % DEVCMD2_RING_SIZE; + + if (new_posted == fetch_index) { + pr_err("%s: wq is full while issuing devcmd2 command %d, " + "fetch index: %u, posted index: %u\n", + pci_name(vdev->pdev), + _CMD_N(cmd), + fetch_index, posted); + return -EBUSY; + + } + dc2c->cmd_ring[posted].cmd = cmd; + dc2c->cmd_ring[posted].flags = 0; + + if ((_CMD_FLAGS(cmd) & _CMD_FLAGS_NOWAIT)) + dc2c->cmd_ring[posted].flags |= DEVCMD2_FNORESULT; + if (_CMD_DIR(cmd) & _CMD_DIR_WRITE) { + for (i = 0; i < VNIC_DEVCMD_NARGS; i++) + dc2c->cmd_ring[posted].args[i] = vdev->args[i]; + + } + + /* Adding write memory barrier prevents compiler and/or CPU + * reordering, thus avoiding descriptor posting before + * descriptor is initialized. Otherwise, hardware can read + * stale descriptor fields. + */ + wmb(); + iowrite32(new_posted, &dc2c->wq_ctrl->posted_index); + dc2c->posted = new_posted; + + if (dc2c->cmd_ring[posted].flags & DEVCMD2_FNORESULT) + return 0; + + result = dc2c->result + dc2c->next_result; + color = dc2c->color; + + dc2c->next_result++; + if (dc2c->next_result == dc2c->result_size) { + dc2c->next_result = 0; + dc2c->color = dc2c->color ? 0 : 1; + } + + for (delay = 0; delay < wait; delay++) { + udelay(100); + if (result->color == color) { + if (result->error) { + err = -(int) result->error; + if (err != ERR_ECMDUNKNOWN || cmd != CMD_CAPABILITY) + pr_err("%s:Error %d devcmd %d\n", + pci_name(vdev->pdev), + err, _CMD_N(cmd)); + return err; + } + if (_CMD_DIR(cmd) & _CMD_DIR_READ) { + for (i = 0; i < VNIC_DEVCMD_NARGS; i++) + vdev->args[i] = result->results[i]; + } + return 0; + } + } + + pr_err("%s:Timed out devcmd %d\n", pci_name(vdev->pdev), + _CMD_N(cmd)); + + return -ETIMEDOUT; +#endif +} + +int vnic_dev_init_devcmd1(struct vnic_dev *vdev) +{ +#if !defined(CONFIG_MIPS) && !defined(MGMT_VNIC) + vdev->devcmd = vnic_dev_get_res(vdev, RES_TYPE_DEVCMD, 0); + if (!vdev->devcmd) + return -ENODEV; + + vdev->devcmd_rtn = &_vnic_dev_cmd; + return 0; +#else + return 0; +#endif +} + +static int vnic_dev_init_devcmd2(struct vnic_dev *vdev) +{ +#if !defined(CONFIG_MIPS) && !defined(MGMT_VNIC) + int err; + unsigned int fetch_index; + + if (vdev->devcmd2) + return 0; + + vdev->devcmd2 = kzalloc(sizeof(*vdev->devcmd2), GFP_ATOMIC); + if (!vdev->devcmd2) + return -ENOMEM; + + vdev->devcmd2->color = 1; + vdev->devcmd2->result_size = DEVCMD2_RING_SIZE; + err = vnic_wq_devcmd2_alloc(vdev, &vdev->devcmd2->wq, + DEVCMD2_RING_SIZE, DEVCMD2_DESC_SIZE); + if (err) + goto err_free_devcmd2; + + fetch_index = ioread32(&vdev->devcmd2->wq.ctrl->fetch_index); + if (fetch_index == 0xFFFFFFFF) { /* check for hardware gone */ + pr_err("Fatal error in devcmd2 init - hardware surprise removal"); + return -ENODEV; + } + + /* + * Don't change fetch_index ever and + * set posted_index same as fetch_index + * when setting up the WQ for devmcd2. 
+ */ + vnic_wq_init_start(&vdev->devcmd2->wq, 0, fetch_index, fetch_index, 0, 0); + vdev->devcmd2->posted = fetch_index; + vnic_wq_enable(&vdev->devcmd2->wq); + + err = vnic_dev_alloc_desc_ring(vdev, &vdev->devcmd2->results_ring, + DEVCMD2_RING_SIZE, DEVCMD2_DESC_SIZE); + if (err) + goto err_free_wq; + + vdev->devcmd2->result = + (struct devcmd2_result *) vdev->devcmd2->results_ring.descs; + vdev->devcmd2->cmd_ring = + (struct vnic_devcmd2 *) vdev->devcmd2->wq.ring.descs; + vdev->devcmd2->wq_ctrl = vdev->devcmd2->wq.ctrl; + vdev->args[0] = (u64) vdev->devcmd2->results_ring.base_addr | + VNIC_PADDR_TARGET; + vdev->args[1] = DEVCMD2_RING_SIZE; + + err = _vnic_dev_cmd2(vdev, CMD_INITIALIZE_DEVCMD2, 1000); + if (err) + goto err_free_desc_ring; + + vdev->devcmd_rtn = &_vnic_dev_cmd2; + + return 0; + +err_free_desc_ring: + vnic_dev_free_desc_ring(vdev, &vdev->devcmd2->results_ring); +err_free_wq: + vnic_wq_disable(&vdev->devcmd2->wq); + vnic_wq_free(&vdev->devcmd2->wq); +err_free_devcmd2: + kfree(vdev->devcmd2); + vdev->devcmd2 = NULL; + + return err; +#else + return 0; +#endif +} + +static void vnic_dev_deinit_devcmd2(struct vnic_dev *vdev) +{ +#if !defined(CONFIG_MIPS) && !defined(MGMT_VNIC) + vnic_dev_free_desc_ring(vdev, &vdev->devcmd2->results_ring); + vnic_wq_disable(&vdev->devcmd2->wq); + vnic_wq_free(&vdev->devcmd2->wq); + kfree(vdev->devcmd2); +#endif +} + +static int vnic_dev_cmd_proxy(struct vnic_dev *vdev, + enum vnic_devcmd_cmd proxy_cmd, enum vnic_devcmd_cmd cmd, + u64 *args, int nargs, int wait) +{ + u32 status; + int err; + + /* + * Proxy command consumes 2 arguments. One for proxy index, + * the other is for command to be proxied + */ + if (nargs > VNIC_DEVCMD_NARGS - 2) { + pr_err("number of args %d exceeds the maximum\n", nargs); + return -EINVAL; + } + memset(vdev->args, 0, sizeof(vdev->args)); + + vdev->args[0] = vdev->proxy_index; + vdev->args[1] = cmd; + memcpy(&vdev->args[2], args, nargs * sizeof(args[0])); + + err = (*vdev->devcmd_rtn)(vdev, proxy_cmd, wait); + if (err) + return err; + + status = (u32)vdev->args[0]; + if (status & STAT_ERROR) { + err = (int)vdev->args[1]; + if (err != ERR_ECMDUNKNOWN || + cmd != CMD_CAPABILITY) + pr_err("Error %d proxy devcmd %d\n", err, _CMD_N(cmd)); + return err; + } + + memcpy(args, &vdev->args[1], nargs * sizeof(args[0])); + + return 0; +} + +static int vnic_dev_cmd_no_proxy(struct vnic_dev *vdev, + enum vnic_devcmd_cmd cmd, u64 *args, int nargs, int wait) +{ + int err; + + if (nargs > VNIC_DEVCMD_NARGS) { + pr_err("number of args %d exceeds the maximum\n", nargs); + return -EINVAL; + } + memset(vdev->args, 0, sizeof(vdev->args)); + memcpy(vdev->args, args, nargs * sizeof(args[0])); + + err = (*vdev->devcmd_rtn)(vdev, cmd, wait); + + memcpy(args, vdev->args, nargs * sizeof(args[0])); + + return err; +} + +void vnic_dev_cmd_proxy_by_index_start(struct vnic_dev *vdev, u16 index) +{ + vdev->proxy = PROXY_BY_INDEX; + vdev->proxy_index = index; +} + +void vnic_dev_cmd_proxy_by_bdf_start(struct vnic_dev *vdev, u16 bdf) +{ + vdev->proxy = PROXY_BY_BDF; + vdev->proxy_index = bdf; +} + +void vnic_dev_cmd_proxy_end(struct vnic_dev *vdev) +{ + vdev->proxy = PROXY_NONE; + vdev->proxy_index = 0; +} + +int vnic_dev_cmd(struct vnic_dev *vdev, enum vnic_devcmd_cmd cmd, + u64 *a0, u64 *a1, int wait) +{ + u64 args[2]; + int err; + + args[0] = *a0; + args[1] = *a1; + memset(vdev->args, 0, sizeof(vdev->args)); + + switch (vdev->proxy) { + case PROXY_BY_INDEX: + err = vnic_dev_cmd_proxy(vdev, CMD_PROXY_BY_INDEX, cmd, + args, ARRAY_SIZE(args), wait); + 
break; + case PROXY_BY_BDF: + err = vnic_dev_cmd_proxy(vdev, CMD_PROXY_BY_BDF, cmd, + args, ARRAY_SIZE(args), wait); + break; + case PROXY_NONE: + default: + err = vnic_dev_cmd_no_proxy(vdev, cmd, args, 2, wait); + break; + } + + if (err == 0) { + *a0 = args[0]; + *a1 = args[1]; + } + + return err; +} + +int vnic_dev_cmd_args(struct vnic_dev *vdev, enum vnic_devcmd_cmd cmd, + u64 *args, int nargs, int wait) +{ + switch (vdev->proxy) { + case PROXY_BY_INDEX: + return vnic_dev_cmd_proxy(vdev, CMD_PROXY_BY_INDEX, cmd, + args, nargs, wait); + case PROXY_BY_BDF: + return vnic_dev_cmd_proxy(vdev, CMD_PROXY_BY_BDF, cmd, + args, nargs, wait); + case PROXY_NONE: + default: + return vnic_dev_cmd_no_proxy(vdev, cmd, args, nargs, wait); + } +} + +static int vnic_dev_capable(struct vnic_dev *vdev, enum vnic_devcmd_cmd cmd) +{ + u64 a0 = (u32)cmd, a1 = 0; + int wait = 1000; + int err; + + err = vnic_dev_cmd(vdev, CMD_CAPABILITY, &a0, &a1, wait); + + return !(err || a0); +} + +int vnic_dev_fw_info(struct vnic_dev *vdev, + struct vnic_devcmd_fw_info **fw_info) +{ + u64 a0, a1 = 0; + int wait = 1000; + int err = 0; + + if (!vdev->fw_info) { + vdev->fw_info = pci_alloc_consistent(vdev->pdev, + sizeof(struct vnic_devcmd_fw_info), + &vdev->fw_info_pa); + if (!vdev->fw_info) + return -ENOMEM; + + memset(vdev->fw_info, 0, sizeof(struct vnic_devcmd_fw_info)); + + a0 = vdev->fw_info_pa; + a1 = sizeof(struct vnic_devcmd_fw_info); + + /* only get fw_info once and cache it */ + if (vnic_dev_capable(vdev, CMD_MCPU_FW_INFO)) + err = vnic_dev_cmd(vdev, CMD_MCPU_FW_INFO, + &a0, &a1, wait); + else + err = vnic_dev_cmd(vdev, CMD_MCPU_FW_INFO_OLD, + &a0, &a1, wait); + } + + *fw_info = vdev->fw_info; + + return err; +} + +int vnic_dev_asic_info(struct vnic_dev *vdev, u16 *asic_type, u16 *asic_rev) +{ + struct vnic_devcmd_fw_info *fw_info; + int err; + + err = vnic_dev_fw_info(vdev, &fw_info); + if (err) + return err; + + *asic_type = fw_info->asic_type; + *asic_rev = fw_info->asic_rev; + + return 0; +} + +int vnic_dev_spec(struct vnic_dev *vdev, unsigned int offset, unsigned int size, + void *value) +{ +#ifdef CONFIG_MIPS + u8 *v = vnic_dev_get_res(vdev, RES_TYPE_DEV, 0); + if (!v) { + pr_err("vNIC device-specific region not found.\n"); + return -EINVAL; + } + + switch (size) { + case 1: + *(u8 *)value = ioread8(v + offset); + break; + case 2: + *(u16 *)value = ioread16(v + offset); + break; + case 4: + *(u32 *)value = ioread32(v + offset); + break; + case 8: + *(u64 *)value = readq(v + offset); + break; + default: + BUG(); + break; + } + + return 0; +#else + u64 a0, a1; + int wait = 1000; + int err; + + a0 = offset; + a1 = size; + + err = vnic_dev_cmd(vdev, CMD_DEV_SPEC, &a0, &a1, wait); + + switch (size) { + case 1: + *(u8 *)value = (u8)a0; + break; + case 2: + *(u16 *)value = (u16)a0; + break; + case 4: + *(u32 *)value = (u32)a0; + break; + case 8: + *(u64 *)value = a0; + break; + default: + BUG(); + break; + } + + return err; +#endif +} + +int vnic_dev_stats_clear(struct vnic_dev *vdev) +{ + u64 a0 = 0, a1 = 0; + int wait = 1000; + return vnic_dev_cmd(vdev, CMD_STATS_CLEAR, &a0, &a1, wait); +} + +int vnic_dev_stats_dump(struct vnic_dev *vdev, struct vnic_stats **stats) +{ + u64 a0, a1; + int wait = 1000; + + if (!vdev->stats) { + vdev->stats = pci_alloc_consistent(vdev->pdev, + sizeof(struct vnic_stats), &vdev->stats_pa); + if (!vdev->stats) + return -ENOMEM; + } + + *stats = vdev->stats; + a0 = vdev->stats_pa; + a1 = sizeof(struct vnic_stats); + + return vnic_dev_cmd(vdev, CMD_STATS_DUMP, &a0, &a1, wait); +} + +int 
vnic_dev_close(struct vnic_dev *vdev) +{ + u64 a0 = 0, a1 = 0; + int wait = 1000; + return vnic_dev_cmd(vdev, CMD_CLOSE, &a0, &a1, wait); +} + +/** Deprecated. @see vnic_dev_enable_wait */ +int vnic_dev_enable(struct vnic_dev *vdev) +{ + u64 a0 = 0, a1 = 0; + int wait = 1000; + return vnic_dev_cmd(vdev, CMD_ENABLE, &a0, &a1, wait); +} + +int vnic_dev_enable_wait(struct vnic_dev *vdev) +{ + u64 a0 = 0, a1 = 0; + int wait = 1000; + + if (vnic_dev_capable(vdev, CMD_ENABLE_WAIT)) + return vnic_dev_cmd(vdev, CMD_ENABLE_WAIT, &a0, &a1, wait); + else + return vnic_dev_cmd(vdev, CMD_ENABLE, &a0, &a1, wait); +} + +int vnic_dev_disable(struct vnic_dev *vdev) +{ + u64 a0 = 0, a1 = 0; + int wait = 1000; + return vnic_dev_cmd(vdev, CMD_DISABLE, &a0, &a1, wait); +} + +int vnic_dev_open(struct vnic_dev *vdev, int arg) +{ + u64 a0 = (u32)arg, a1 = 0; + int wait = 1000; + return vnic_dev_cmd(vdev, CMD_OPEN, &a0, &a1, wait); +} + +int vnic_dev_open_done(struct vnic_dev *vdev, int *done) +{ + u64 a0 = 0, a1 = 0; + int wait = 1000; + int err; + + *done = 0; + + err = vnic_dev_cmd(vdev, CMD_OPEN_STATUS, &a0, &a1, wait); + if (err) + return err; + + *done = (a0 == 0); + + return 0; +} + +int vnic_dev_soft_reset(struct vnic_dev *vdev, int arg) +{ + u64 a0 = (u32)arg, a1 = 0; + int wait = 1000; + + return vnic_dev_cmd(vdev, CMD_SOFT_RESET, &a0, &a1, wait); +} + +int vnic_dev_soft_reset_done(struct vnic_dev *vdev, int *done) +{ + u64 a0 = 0, a1 = 0; + int wait = 1000; + int err; + + *done = 0; + + err = vnic_dev_cmd(vdev, CMD_SOFT_RESET_STATUS, &a0, &a1, wait); + if (err) + return err; + + *done = (a0 == 0); + + return 0; +} + +int vnic_dev_hang_reset(struct vnic_dev *vdev, int arg) +{ + u64 a0 = (u32)arg, a1 = 0; + int wait = 1000; + int err; + + if (vnic_dev_capable(vdev, CMD_HANG_RESET)) { + return vnic_dev_cmd(vdev, CMD_HANG_RESET, + &a0, &a1, wait); + } else { + err = vnic_dev_soft_reset(vdev, arg); + if (err) + return err; + return vnic_dev_init(vdev, 0); + } +} + +int vnic_dev_hang_reset_done(struct vnic_dev *vdev, int *done) +{ + u64 a0 = 0, a1 = 0; + int wait = 1000; + int err; + + *done = 0; + + if (vnic_dev_capable(vdev, CMD_HANG_RESET_STATUS)) { + err = vnic_dev_cmd(vdev, CMD_HANG_RESET_STATUS, + &a0, &a1, wait); + if (err) + return err; + } else { + return vnic_dev_soft_reset_done(vdev, done); + } + + *done = (a0 == 0); + + return 0; +} + +int vnic_dev_hang_notify(struct vnic_dev *vdev) +{ + u64 a0 = 0, a1 = 0; + int wait = 1000; + return vnic_dev_cmd(vdev, CMD_HANG_NOTIFY, &a0, &a1, wait); +} + +int vnic_dev_get_mac_addr(struct vnic_dev *vdev, u8 *mac_addr) +{ +#if defined(CONFIG_MIPS) || defined(MGMT_VNIC) + u64 laa = 0x02; + memcpy(mac_addr, &laa, ETH_ALEN); + return 0; +#else + u64 a0 = 0, a1 = 0; + int wait = 1000; + int err, i; + + for (i = 0; i < ETH_ALEN; i++) + mac_addr[i] = 0; + + err = vnic_dev_cmd(vdev, CMD_GET_MAC_ADDR, &a0, &a1, wait); + if (err) + return err; + + for (i = 0; i < ETH_ALEN; i++) + mac_addr[i] = ((u8 *)&a0)[i]; + + return 0; +#endif +} + +int vnic_dev_packet_filter(struct vnic_dev *vdev, int directed, int multicast, + int broadcast, int promisc, int allmulti) +{ + u64 a0, a1 = 0; + int wait = 1000; + int err; + + a0 = (directed ? CMD_PFILTER_DIRECTED : 0) | + (multicast ? CMD_PFILTER_MULTICAST : 0) | + (broadcast ? CMD_PFILTER_BROADCAST : 0) | + (promisc ? CMD_PFILTER_PROMISCUOUS : 0) | + (allmulti ? 
CMD_PFILTER_ALL_MULTICAST : 0); + + err = vnic_dev_cmd(vdev, CMD_PACKET_FILTER, &a0, &a1, wait); + if (err) + pr_err("Can't set packet filter\n"); + + return err; +} + +int vnic_dev_packet_filter_all(struct vnic_dev *vdev, int directed, + int multicast, int broadcast, int promisc, int allmulti) +{ + u64 a0, a1 = 0; + int wait = 1000; + int err; + + a0 = (directed ? CMD_PFILTER_DIRECTED : 0) | + (multicast ? CMD_PFILTER_MULTICAST : 0) | + (broadcast ? CMD_PFILTER_BROADCAST : 0) | + (promisc ? CMD_PFILTER_PROMISCUOUS : 0) | + (allmulti ? CMD_PFILTER_ALL_MULTICAST : 0); + + err = vnic_dev_cmd(vdev, CMD_PACKET_FILTER_ALL, &a0, &a1, wait); + if (err) + pr_err("Can't set packet filter\n"); + + return err; +} + +int vnic_dev_add_addr(struct vnic_dev *vdev, u8 *addr) +{ + u64 a0 = 0, a1 = 0; + int wait = 1000; + int err; + int i; + + for (i = 0; i < ETH_ALEN; i++) + ((u8 *)&a0)[i] = addr[i]; + + err = vnic_dev_cmd(vdev, CMD_ADDR_ADD, &a0, &a1, wait); + if (err) + pr_err("Can't add addr [%02x:%02x:%02x:%02x:%02x:%02x], %d\n", + addr[0], addr[1], addr[2], addr[3], addr[4], addr[5], + err); + + return err; +} + +int vnic_dev_del_addr(struct vnic_dev *vdev, u8 *addr) +{ + u64 a0 = 0, a1 = 0; + int wait = 1000; + int err; + int i; + + for (i = 0; i < ETH_ALEN; i++) + ((u8 *)&a0)[i] = addr[i]; + + err = vnic_dev_cmd(vdev, CMD_ADDR_DEL, &a0, &a1, wait); + if (err) + pr_err("Can't del addr [%02x:%02x:%02x:%02x:%02x:%02x], %d\n", + addr[0], addr[1], addr[2], addr[3], addr[4], addr[5], + err); + + return err; +} + +int vnic_dev_set_ig_vlan_rewrite_mode(struct vnic_dev *vdev, + u8 ig_vlan_rewrite_mode) +{ + u64 a0 = ig_vlan_rewrite_mode, a1 = 0; + int wait = 1000; + + if (vnic_dev_capable(vdev, CMD_IG_VLAN_REWRITE_MODE)) + return vnic_dev_cmd(vdev, CMD_IG_VLAN_REWRITE_MODE, + &a0, &a1, wait); + else + return 0; +} + +int vnic_dev_raise_intr(struct vnic_dev *vdev, u16 intr) +{ + u64 a0 = intr, a1 = 0; + int wait = 1000; + int err; + + err = vnic_dev_cmd(vdev, CMD_IAR, &a0, &a1, wait); + if (err) + pr_err("Failed to raise INTR[%d], err %d\n", intr, err); + + return err; +} + +static int vnic_dev_notify_setcmd(struct vnic_dev *vdev, + void *notify_addr, dma_addr_t notify_pa, u16 intr) +{ + u64 a0, a1; + int wait = 1000; + int r; + + memset(notify_addr, 0, sizeof(struct vnic_devcmd_notify)); + vdev->notify = notify_addr; + vdev->notify_pa = notify_pa; + + a0 = (u64)notify_pa; + a1 = ((u64)intr << 32) & 0x0000ffff00000000ULL; + a1 += sizeof(struct vnic_devcmd_notify); + + r = vnic_dev_cmd(vdev, CMD_NOTIFY, &a0, &a1, wait); + vdev->notify_sz = (r == 0) ? 
(u32)a1 : 0; + return r; +} + +int vnic_dev_notify_set(struct vnic_dev *vdev, u16 intr) +{ + void *notify_addr; + dma_addr_t notify_pa; + + if (vdev->notify || vdev->notify_pa) { + pr_err("notify block %p still allocated", vdev->notify); + return -EINVAL; + } + + notify_addr = pci_alloc_consistent(vdev->pdev, + sizeof(struct vnic_devcmd_notify), + ¬ify_pa); + if (!notify_addr) + return -ENOMEM; + + return vnic_dev_notify_setcmd(vdev, notify_addr, notify_pa, intr); +} + +static int vnic_dev_notify_unsetcmd(struct vnic_dev *vdev) +{ + u64 a0, a1; + int wait = 1000; + int err; + + a0 = 0; /* paddr = 0 to unset notify buffer */ + a1 = 0x0000ffff00000000ULL; /* intr num = -1 to unreg for intr */ + a1 += sizeof(struct vnic_devcmd_notify); + + err = vnic_dev_cmd(vdev, CMD_NOTIFY, &a0, &a1, wait); + vdev->notify = NULL; + vdev->notify_pa = 0; + vdev->notify_sz = 0; + + return err; +} + +int vnic_dev_notify_unset(struct vnic_dev *vdev) +{ + if (vdev->notify) { + pci_free_consistent(vdev->pdev, + sizeof(struct vnic_devcmd_notify), + vdev->notify, + vdev->notify_pa); + } + + return vnic_dev_notify_unsetcmd(vdev); +} + +static int vnic_dev_notify_ready(struct vnic_dev *vdev) +{ + u32 *words; + unsigned int nwords = vdev->notify_sz / 4; + unsigned int i; + u32 csum; + + if (!vdev->notify || !vdev->notify_sz) + return 0; + + do { + csum = 0; + memcpy(&vdev->notify_copy, vdev->notify, vdev->notify_sz); + words = (u32 *)&vdev->notify_copy; + for (i = 1; i < nwords; i++) + csum += words[i]; + } while (csum != words[0]); + + return 1; +} + +int vnic_dev_init(struct vnic_dev *vdev, int arg) +{ + u64 a0 = (u32)arg, a1 = 0; + int wait = 1000; + int r = 0; + + if (vnic_dev_capable(vdev, CMD_INIT)) + r = vnic_dev_cmd(vdev, CMD_INIT, &a0, &a1, wait); + else { + vnic_dev_cmd(vdev, CMD_INIT_v1, &a0, &a1, wait); + if (a0 & CMD_INITF_DEFAULT_MAC) { + /* Emulate these for old CMD_INIT_v1 which + * didn't pass a0 so no CMD_INITF_*. + */ + vnic_dev_cmd(vdev, CMD_GET_MAC_ADDR, &a0, &a1, wait); + vnic_dev_cmd(vdev, CMD_ADDR_ADD, &a0, &a1, wait); + } + } + return r; +} + +int vnic_dev_init_done(struct vnic_dev *vdev, int *done, int *err) +{ + u64 a0 = 0, a1 = 0; + int wait = 1000; + int ret; + + *done = 0; + + ret = vnic_dev_cmd(vdev, CMD_INIT_STATUS, &a0, &a1, wait); + if (ret) + return ret; + + *done = (a0 == 0); + + *err = (a0 == 0) ? 
(int)a1 : 0; + + return 0; +} + +int vnic_dev_init_prov(struct vnic_dev *vdev, u8 *buf, u32 len) +{ + u64 a0, a1 = len; + int wait = 1000; + dma_addr_t prov_pa; + void *prov_buf; + int ret; + + prov_buf = pci_alloc_consistent(vdev->pdev, len, &prov_pa); + if (!prov_buf) + return -ENOMEM; + + memcpy(prov_buf, buf, len); + + a0 = prov_pa; + + ret = vnic_dev_cmd(vdev, CMD_INIT_PROV_INFO, &a0, &a1, wait); + + pci_free_consistent(vdev->pdev, len, prov_buf, prov_pa); + + return ret; +} + +int vnic_dev_deinit(struct vnic_dev *vdev) +{ + u64 a0 = 0, a1 = 0; + int wait = 1000; + + return vnic_dev_cmd(vdev, CMD_DEINIT, &a0, &a1, wait); +} + +EXPORT_SYMBOL(vnic_dev_intr_coal_timer_info_default); +void vnic_dev_intr_coal_timer_info_default(struct vnic_dev *vdev) +{ + /* Default: hardware intr coal timer is in units of 1.5 usecs */ + vdev->intr_coal_timer_info.mul = 2; + vdev->intr_coal_timer_info.div = 3; + vdev->intr_coal_timer_info.max_usec = + vnic_dev_intr_coal_timer_hw_to_usec(vdev, 0xffff); +} + +int vnic_dev_intr_coal_timer_info(struct vnic_dev *vdev) +{ + int wait = 1000; + int err; + + memset(vdev->args, 0, sizeof(vdev->args)); + + if (vnic_dev_capable(vdev, CMD_INTR_COAL_CONVERT)) + err = (*vdev->devcmd_rtn)(vdev, CMD_INTR_COAL_CONVERT, wait); + else + err = ERR_ECMDUNKNOWN; + + /* Use defaults when firmware doesn't support the devcmd at all or + * supports it for only specific hardware + */ + if ((err == ERR_ECMDUNKNOWN) || + (!err && !(vdev->args[0] && vdev->args[1] && vdev->args[2]))) { + pr_warning("Using default conversion factor for " + "interrupt coalesce timer\n"); + vnic_dev_intr_coal_timer_info_default(vdev); + return 0; + } + + if (!err) { + vdev->intr_coal_timer_info.mul = (u32) vdev->args[0]; + vdev->intr_coal_timer_info.div = (u32) vdev->args[1]; + vdev->intr_coal_timer_info.max_usec = (u32) vdev->args[2]; + } + + return err; +} + +int vnic_dev_link_status(struct vnic_dev *vdev) +{ +#ifdef CONFIG_MIPS + return 1; +#else + if (!vnic_dev_notify_ready(vdev)) + return 0; + + return vdev->notify_copy.link_state; +#endif +} + +u32 vnic_dev_port_speed(struct vnic_dev *vdev) +{ + if (!vnic_dev_notify_ready(vdev)) + return 0; + + return vdev->notify_copy.port_speed; +} + +u32 vnic_dev_msg_lvl(struct vnic_dev *vdev) +{ + if (!vnic_dev_notify_ready(vdev)) + return 0; + + return vdev->notify_copy.msglvl; +} + +u32 vnic_dev_mtu(struct vnic_dev *vdev) +{ +#if defined(CONFIG_MIPS) || defined(MGMT_VNIC) + return 1500; +#else + if (!vnic_dev_notify_ready(vdev)) + return 0; + + return vdev->notify_copy.mtu; +#endif +} + +u32 vnic_dev_link_down_cnt(struct vnic_dev *vdev) +{ + if (!vnic_dev_notify_ready(vdev)) + return 0; + + return vdev->notify_copy.link_down_cnt; +} + +u32 vnic_dev_notify_status(struct vnic_dev *vdev) +{ + if (!vnic_dev_notify_ready(vdev)) + return 0; + + return vdev->notify_copy.status; +} + +u32 vnic_dev_uif(struct vnic_dev *vdev) +{ + if (!vnic_dev_notify_ready(vdev)) + return 0; + + return vdev->notify_copy.uif; +} + +u32 vnic_dev_perbi_rebuild_cnt(struct vnic_dev *vdev) +{ + if (!vnic_dev_notify_ready(vdev)) + return 0; + + return vdev->notify_copy.perbi_rebuild_cnt; +} + +EXPORT_SYMBOL(vnic_dev_set_intr_mode); +void vnic_dev_set_intr_mode(struct vnic_dev *vdev, + enum vnic_dev_intr_mode intr_mode) +{ + vdev->intr_mode = intr_mode; +} + +EXPORT_SYMBOL(vnic_dev_get_intr_mode); +enum vnic_dev_intr_mode vnic_dev_get_intr_mode( + struct vnic_dev *vdev) +{ + return vdev->intr_mode; +} + +u32 vnic_dev_intr_coal_timer_usec_to_hw(struct vnic_dev *vdev, u32 usec) +{ + return (usec * 
vdev->intr_coal_timer_info.mul) / + vdev->intr_coal_timer_info.div; +} + +u32 vnic_dev_intr_coal_timer_hw_to_usec(struct vnic_dev *vdev, u32 hw_cycles) +{ + return (hw_cycles * vdev->intr_coal_timer_info.div) / + vdev->intr_coal_timer_info.mul; +} + +u32 vnic_dev_get_intr_coal_timer_max(struct vnic_dev *vdev) +{ + return vdev->intr_coal_timer_info.max_usec; +} + +void vnic_dev_unregister(struct vnic_dev *vdev) +{ + if (vdev) { + if (vdev->notify) + pci_free_consistent(vdev->pdev, + sizeof(struct vnic_devcmd_notify), + vdev->notify, + vdev->notify_pa); + if (vdev->stats) + pci_free_consistent(vdev->pdev, + sizeof(struct vnic_stats), + vdev->stats, vdev->stats_pa); + if (vdev->fw_info) + pci_free_consistent(vdev->pdev, + sizeof(struct vnic_devcmd_fw_info), + vdev->fw_info, vdev->fw_info_pa); + if (vdev->devcmd2) + vnic_dev_deinit_devcmd2(vdev); + + kfree(vdev); + } +} +EXPORT_SYMBOL(vnic_dev_unregister); + +struct vnic_dev *vnic_dev_alloc_discover(struct vnic_dev *vdev, + void *priv, struct pci_dev *pdev, struct vnic_dev_bar *bar, + unsigned int num_bars) +{ + if (!vdev) { + vdev = kzalloc(sizeof(struct vnic_dev), GFP_ATOMIC); + if (!vdev) + return NULL; + } + + vdev->priv = priv; + vdev->pdev = pdev; + + if (vnic_dev_discover_res(vdev, bar, num_bars)) + goto err_out; + + return vdev; + +err_out: + vnic_dev_unregister(vdev); + return NULL; +} +EXPORT_SYMBOL(vnic_dev_alloc_discover); + +struct vnic_dev *vnic_dev_register(struct vnic_dev *vdev, + void *priv, struct pci_dev *pdev, struct vnic_dev_bar *bar, + unsigned int num_bars) +{ + vdev = vnic_dev_alloc_discover(vdev, priv, pdev, bar, num_bars); + if (!vdev) + goto err_out; + + if (vnic_dev_init_devcmd1(vdev)) + goto err_free; + + return vdev; + +err_free: + vnic_dev_unregister(vdev); +err_out: + return NULL; +} +EXPORT_SYMBOL(vnic_dev_register); + +struct pci_dev *vnic_dev_get_pdev(struct vnic_dev *vdev) +{ + return vdev->pdev; +} +EXPORT_SYMBOL(vnic_dev_get_pdev); + +int vnic_devcmd_init(struct vnic_dev *vdev, int fallback) +{ +#if !defined(CONFIG_MIPS) && !defined(MGMT_VNIC) + int err; + void *p; + + p = vnic_dev_get_res(vdev, RES_TYPE_DEVCMD2, 0); + if (p) + err = vnic_dev_init_devcmd2(vdev); + else if (fallback) { + pr_warning("DEVCMD2 resource not found, fall back to devcmd\n"); + err = vnic_dev_init_devcmd1(vdev); + } else { + pr_err("DEVCMD2 resource not found, no fall back to devcmd allowed\n"); + err = -ENODEV; + } + + return err; +#else + return 0; +#endif +} + +int vnic_dev_int13(struct vnic_dev *vdev, u64 arg, u32 op) +{ + u64 a0 = arg, a1 = op; + int wait = 1000; + int r = 0; + + r = vnic_dev_cmd(vdev, CMD_INT13, &a0, &a1, wait); + return r; +} + +int vnic_dev_perbi(struct vnic_dev *vdev, u64 arg, u32 op) +{ + u64 a0 = arg, a1 = op; + int wait = 5000; + int r = 0; + + r = vnic_dev_cmd(vdev, CMD_PERBI, &a0, &a1, wait); + + return r; +} + +int vnic_dev_init_prov2(struct vnic_dev *vdev, u8 *buf, u32 len) +{ + u64 a0, a1 = len; + int wait = 1000; + dma_addr_t prov_pa; + void *prov_buf; + int ret; + + prov_buf = pci_alloc_consistent(vdev->pdev, len, &prov_pa); + if (!prov_buf) + return -ENOMEM; + + memcpy(prov_buf, buf, len); + + a0 = prov_pa; + + ret = vnic_dev_cmd(vdev, CMD_INIT_PROV_INFO2, &a0, &a1, wait); + + pci_free_consistent(vdev->pdev, len, prov_buf, prov_pa); + + return ret; +} + +int vnic_dev_enable2(struct vnic_dev *vdev, int active) +{ + u64 a0, a1 = 0; + int wait = 1000; + + a0 = (active ? 
CMD_ENABLE2_ACTIVE : 0); + + return vnic_dev_cmd(vdev, CMD_ENABLE2, &a0, &a1, wait); +} + +static int vnic_dev_cmd_status(struct vnic_dev *vdev, enum vnic_devcmd_cmd cmd, + int *status) +{ + u64 a0 = cmd, a1 = 0; + int wait = 1000; + int ret; + + ret = vnic_dev_cmd(vdev, CMD_STATUS, &a0, &a1, wait); + if (!ret) + *status = (int)a0; + + return ret; +} + +int vnic_dev_enable2_done(struct vnic_dev *vdev, int *status) +{ + return vnic_dev_cmd_status(vdev, CMD_ENABLE2, status); +} + +int vnic_dev_deinit_done(struct vnic_dev *vdev, int *status) +{ + return vnic_dev_cmd_status(vdev, CMD_DEINIT, status); +} + +int vnic_dev_set_mac_addr(struct vnic_dev *vdev, u8 *mac_addr) +{ + u64 a0 = 0, a1 = 0; + int wait = 1000; + int i; + + for (i = 0; i < ETH_ALEN; i++) + ((u8 *)&a0)[i] = mac_addr[i]; + + return vnic_dev_cmd(vdev, CMD_SET_MAC_ADDR, &a0, &a1, wait); +} + +/* + * vnic_dev_classifier: Add/Delete classifier entries + * @vdev: vdev of the device + * @cmd: CLSF_ADD for Add filter + * CLSF_DEL for Delete filter + * @entry: In case of ADD filter, the caller passes the RQ number in this variable. + * This function stores the filter_id returned by the + * firmware in the same variable before return; + * + * In case of DEL filter, the caller passes the RQ number. Return + * value is irrelevant. + * @data: filter data + */ +int vnic_dev_classifier(struct vnic_dev *vdev, u8 cmd, u16 *entry, struct filter *data) +{ + u64 a0, a1; + int wait = 1000; + dma_addr_t tlv_pa; + int ret = -EINVAL; + struct filter_tlv *tlv, *tlv_va; + struct filter_action *action; + u64 tlv_size; + + if (cmd == CLSF_ADD) { + tlv_size = sizeof(struct filter) + + sizeof(struct filter_action) + + 2*sizeof(struct filter_tlv); + tlv_va = pci_alloc_consistent(vdev->pdev, tlv_size, &tlv_pa); + if (!tlv_va) + return -ENOMEM; + tlv = tlv_va; + a0 = tlv_pa; + a1 = tlv_size; + memset(tlv, 0, tlv_size); + tlv->type = CLSF_TLV_FILTER; + tlv->length = sizeof(struct filter); + *(struct filter *)&tlv->val = *data; + + tlv = (struct filter_tlv *)((char *)tlv + + sizeof(struct filter_tlv) + + sizeof(struct filter)); + + tlv->type = CLSF_TLV_ACTION; + tlv->length = sizeof (struct filter_action); + action = (struct filter_action *)&tlv->val; + action->type = FILTER_ACTION_RQ_STEERING; + action->u.rq_idx = *entry; + + ret = vnic_dev_cmd(vdev, CMD_ADD_FILTER, &a0, &a1, wait); + *entry = (u16)a0; + pci_free_consistent(vdev->pdev, tlv_size, tlv_va, tlv_pa); + } else if (cmd == CLSF_DEL) { + a0 = *entry; + a1 = 0; + ret = vnic_dev_cmd(vdev, CMD_DEL_FILTER, &a0, &a1, wait); + } + + return ret; +} + +int vnic_dev_overlay_offload_ctrl(struct vnic_dev *vdev, u8 overlay, + u8 config) +{ + u64 a0, a1; + int wait = 1000; + int ret = -EINVAL; + + a0 = overlay; + a1 = config; + + ret = vnic_dev_cmd(vdev, CMD_OVERLAY_OFFLOAD_CTRL, &a0, &a1, wait); + + return ret; +} + +int vnic_dev_overlay_offload_cfg(struct vnic_dev *vdev, u8 overlay, + u16 vxlan_udp_port_number) +{ + u64 a0, a1; + int wait = 1000; + int ret = -EINVAL; + + a0 = overlay; + a1 = vxlan_udp_port_number; + + ret = vnic_dev_cmd(vdev, CMD_OVERLAY_OFFLOAD_CFG, &a0, &a1, wait); + + return ret; +} + +int vnic_dev_get_supported_feature_ver(struct vnic_dev *vdev, u8 feature, + u64 *supported_versions) +{ + u64 a0 = feature, a1 = 0; + int wait = 1000; + int ret = -EINVAL; + + ret = vnic_dev_cmd(vdev, CMD_GET_SUPP_FEATURE_VER, &a0, &a1, wait); + if (!ret) + *supported_versions = a0; + + return ret; +} diff --git a/prov/usnic/src/usnic_direct/vnic_dev.h b/prov/usnic/src/usnic_direct/vnic_dev.h new file mode 100644 
index 00000000000..d21f6372cad --- /dev/null +++ b/prov/usnic/src/usnic_direct/vnic_dev.h @@ -0,0 +1,214 @@ +/* + * Copyright 2008-2018 Cisco Systems, Inc. All rights reserved. + * Copyright 2007 Nuova Systems, Inc. All rights reserved. + * + * LICENSE_BEGIN + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * LICENSE_END + * + * + */ + +#ifndef _VNIC_DEV_H_ +#define _VNIC_DEV_H_ + +#ifdef __KERNEL__ +#include +#endif /* __KERNEL__ */ +#include "vnic_resource.h" +#include "vnic_devcmd.h" + +#ifndef VNIC_PADDR_TARGET +#define VNIC_PADDR_TARGET 0x0000000000000000ULL +#endif + +#ifndef readq +static inline u64 readq(void __iomem *reg) +{ + return ((u64)readl((char *)reg + 0x4UL) << 32) | + (u64)readl(reg); +} + +static inline void writeq(u64 val, void __iomem *reg) +{ + writel(val & 0xffffffff, reg); + writel(val >> 32, (char *)reg + 0x4UL); +} +#endif + +#undef pr_fmt +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +enum vnic_dev_intr_mode { + VNIC_DEV_INTR_MODE_UNKNOWN, + VNIC_DEV_INTR_MODE_INTX, + VNIC_DEV_INTR_MODE_MSI, + VNIC_DEV_INTR_MODE_MSIX, +}; + +struct vnic_dev_bar { + void __iomem *vaddr; + dma_addr_t bus_addr; + unsigned long len; +}; + +struct vnic_dev_ring { + void *descs; + size_t size; + dma_addr_t base_addr; + size_t base_align; + void *descs_unaligned; + size_t size_unaligned; + dma_addr_t base_addr_unaligned; + unsigned int desc_size; + unsigned int desc_count; + unsigned int desc_avail; +}; + +struct vnic_dev_iomap_info { + dma_addr_t bus_addr; + unsigned long len; + void __iomem *vaddr; +}; + +struct vnic_dev; +struct vnic_stats; + +void *vnic_dev_priv(struct vnic_dev *vdev); +unsigned int vnic_dev_get_res_count(struct vnic_dev *vdev, + enum vnic_res_type type); +void __iomem *vnic_dev_get_res(struct vnic_dev *vdev, enum vnic_res_type type, + unsigned int index); +dma_addr_t vnic_dev_get_res_bus_addr(struct vnic_dev *vdev, + enum vnic_res_type type, unsigned int index); +uint8_t vnic_dev_get_res_bar(struct vnic_dev *vdev, + enum vnic_res_type type); +uint32_t vnic_dev_get_res_offset(struct vnic_dev *vdev, + enum vnic_res_type type, unsigned int index); +unsigned long vnic_dev_get_res_type_len(struct vnic_dev *vdev, + enum vnic_res_type type); +unsigned int vnic_dev_desc_ring_size(struct vnic_dev_ring *ring, + unsigned int desc_count, unsigned int desc_size); +void vnic_dev_clear_desc_ring(struct vnic_dev_ring *ring); +int vnic_dev_alloc_desc_ring(struct vnic_dev *vdev, struct vnic_dev_ring *ring, + unsigned int desc_count, unsigned int desc_size); +void vnic_dev_free_desc_ring(struct vnic_dev *vdev, + struct vnic_dev_ring *ring); +int vnic_dev_cmd(struct vnic_dev *vdev, enum vnic_devcmd_cmd cmd, + u64 *a0, u64 *a1, int wait); +int vnic_dev_cmd_args(struct vnic_dev *vdev, enum vnic_devcmd_cmd cmd, + u64 *args, int nargs, int wait); +void vnic_dev_cmd_proxy_by_index_start(struct vnic_dev *vdev, u16 index); +void vnic_dev_cmd_proxy_by_bdf_start(struct vnic_dev *vdev, u16 bdf); +void vnic_dev_cmd_proxy_end(struct vnic_dev *vdev); +int vnic_dev_fw_info(struct vnic_dev *vdev, + struct vnic_devcmd_fw_info **fw_info); +int vnic_dev_asic_info(struct vnic_dev *vdev, u16 *asic_type, u16 *asic_rev); +int vnic_dev_spec(struct vnic_dev *vdev, unsigned int offset, unsigned int size, + void *value); +int vnic_dev_stats_clear(struct vnic_dev *vdev); +int vnic_dev_stats_dump(struct vnic_dev *vdev, struct vnic_stats **stats); +int vnic_dev_hang_notify(struct vnic_dev *vdev); +int vnic_dev_packet_filter(struct vnic_dev *vdev, int directed, int multicast, + int broadcast, int promisc, int allmulti); +int vnic_dev_packet_filter_all(struct vnic_dev *vdev, int directed, + int multicast, int broadcast, int promisc, int allmulti); +int vnic_dev_add_addr(struct vnic_dev *vdev, u8 *addr); +int vnic_dev_del_addr(struct vnic_dev *vdev, u8 *addr); +int vnic_dev_get_mac_addr(struct vnic_dev *vdev, u8 
*mac_addr); +int vnic_dev_raise_intr(struct vnic_dev *vdev, u16 intr); +int vnic_dev_notify_set(struct vnic_dev *vdev, u16 intr); +void vnic_dev_set_reset_flag(struct vnic_dev *vdev, int state); +int vnic_dev_notify_unset(struct vnic_dev *vdev); +int vnic_dev_link_status(struct vnic_dev *vdev); +u32 vnic_dev_port_speed(struct vnic_dev *vdev); +u32 vnic_dev_msg_lvl(struct vnic_dev *vdev); +u32 vnic_dev_mtu(struct vnic_dev *vdev); +u32 vnic_dev_link_down_cnt(struct vnic_dev *vdev); +u32 vnic_dev_notify_status(struct vnic_dev *vdev); +u32 vnic_dev_uif(struct vnic_dev *vdev); +int vnic_dev_close(struct vnic_dev *vdev); +int vnic_dev_enable(struct vnic_dev *vdev); +int vnic_dev_enable_wait(struct vnic_dev *vdev); +int vnic_dev_disable(struct vnic_dev *vdev); +int vnic_dev_open(struct vnic_dev *vdev, int arg); +int vnic_dev_open_done(struct vnic_dev *vdev, int *done); +int vnic_dev_init(struct vnic_dev *vdev, int arg); +int vnic_dev_init_done(struct vnic_dev *vdev, int *done, int *err); +int vnic_dev_init_prov(struct vnic_dev *vdev, u8 *buf, u32 len); +int vnic_dev_deinit(struct vnic_dev *vdev); +void vnic_dev_intr_coal_timer_info_default(struct vnic_dev *vdev); +int vnic_dev_intr_coal_timer_info(struct vnic_dev *vdev); +int vnic_dev_soft_reset(struct vnic_dev *vdev, int arg); +int vnic_dev_soft_reset_done(struct vnic_dev *vdev, int *done); +int vnic_dev_hang_reset(struct vnic_dev *vdev, int arg); +int vnic_dev_hang_reset_done(struct vnic_dev *vdev, int *done); +void vnic_dev_set_intr_mode(struct vnic_dev *vdev, + enum vnic_dev_intr_mode intr_mode); +enum vnic_dev_intr_mode vnic_dev_get_intr_mode(struct vnic_dev *vdev); +u32 vnic_dev_intr_coal_timer_usec_to_hw(struct vnic_dev *vdev, u32 usec); +u32 vnic_dev_intr_coal_timer_hw_to_usec(struct vnic_dev *vdev, u32 hw_cycles); +u32 vnic_dev_get_intr_coal_timer_max(struct vnic_dev *vdev); +void vnic_dev_unregister(struct vnic_dev *vdev); +int vnic_dev_set_ig_vlan_rewrite_mode(struct vnic_dev *vdev, + u8 ig_vlan_rewrite_mode); +struct vnic_dev *vnic_dev_alloc_discover(struct vnic_dev *vdev, + void *priv, struct pci_dev *pdev, struct vnic_dev_bar *bar, + unsigned int num_bars); +struct vnic_dev *vnic_dev_register(struct vnic_dev *vdev, + void *priv, struct pci_dev *pdev, struct vnic_dev_bar *bar, + unsigned int num_bars); +void vnic_dev_upd_res_vaddr(struct vnic_dev *vdev, + struct vnic_dev_iomap_info *maps); +struct pci_dev *vnic_dev_get_pdev(struct vnic_dev *vdev); +int vnic_devcmd_init(struct vnic_dev *vdev, int fallback); +int vnic_dev_get_size(void); +int vnic_dev_int13(struct vnic_dev *vdev, u64 arg, u32 op); +int vnic_dev_perbi(struct vnic_dev *vdev, u64 arg, u32 op); +u32 vnic_dev_perbi_rebuild_cnt(struct vnic_dev *vdev); +int vnic_dev_init_prov2(struct vnic_dev *vdev, u8 *buf, u32 len); +int vnic_dev_enable2(struct vnic_dev *vdev, int active); +int vnic_dev_enable2_done(struct vnic_dev *vdev, int *status); +int vnic_dev_deinit_done(struct vnic_dev *vdev, int *status); +int vnic_dev_set_mac_addr(struct vnic_dev *vdev, u8 *mac_addr); +int vnic_dev_classifier(struct vnic_dev *vdev, u8 cmd, u16 *entry, + struct filter *data); +int vnic_dev_overlay_offload_ctrl(struct vnic_dev *vdev, u8 overlay, u8 config); +int vnic_dev_overlay_offload_cfg(struct vnic_dev *vdev, u8 overlay, + u16 vxlan_udp_port_number); +int vnic_dev_get_supported_feature_ver(struct vnic_dev *vdev, u8 feature, + u64 *supported_versions); +int vnic_dev_init_devcmd1(struct vnic_dev *vdev); +#endif /* _VNIC_DEV_H_ */ diff --git a/prov/usnic/src/usnic_direct/vnic_devcmd.h 
b/prov/usnic/src/usnic_direct/vnic_devcmd.h new file mode 100644 index 00000000000..90872381c1c --- /dev/null +++ b/prov/usnic/src/usnic_direct/vnic_devcmd.h @@ -0,0 +1,1413 @@ +/* + * Copyright 2008-2016 Cisco Systems, Inc. All rights reserved. + * Copyright 2007 Nuova Systems, Inc. All rights reserved. + * + * LICENSE_BEGIN + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * LICENSE_END + * + * + */ + +#ifndef _VNIC_DEVCMD_H_ +#define _VNIC_DEVCMD_H_ + +#define _CMD_NBITS 14 +#define _CMD_VTYPEBITS 10 +#define _CMD_FLAGSBITS 6 +#define _CMD_DIRBITS 2 + +#define _CMD_NMASK ((1 << _CMD_NBITS)-1) +#define _CMD_VTYPEMASK ((1 << _CMD_VTYPEBITS)-1) +#define _CMD_FLAGSMASK ((1 << _CMD_FLAGSBITS)-1) +#define _CMD_DIRMASK ((1 << _CMD_DIRBITS)-1) + +#define _CMD_NSHIFT 0 +#define _CMD_VTYPESHIFT (_CMD_NSHIFT+_CMD_NBITS) +#define _CMD_FLAGSSHIFT (_CMD_VTYPESHIFT+_CMD_VTYPEBITS) +#define _CMD_DIRSHIFT (_CMD_FLAGSSHIFT+_CMD_FLAGSBITS) + +/* + * Direction bits (from host perspective). + */ +#define _CMD_DIR_NONE 0U +#define _CMD_DIR_WRITE 1U +#define _CMD_DIR_READ 2U +#define _CMD_DIR_RW (_CMD_DIR_WRITE | _CMD_DIR_READ) + +/* + * Flag bits. + */ +#define _CMD_FLAGS_NONE 0U +#define _CMD_FLAGS_NOWAIT 1U + +/* + * vNIC type bits. + */ +#define _CMD_VTYPE_NONE 0U +#define _CMD_VTYPE_ENET 1U +#define _CMD_VTYPE_FC 2U +#define _CMD_VTYPE_SCSI 4U +#define _CMD_VTYPE_ALL (_CMD_VTYPE_ENET | _CMD_VTYPE_FC | _CMD_VTYPE_SCSI) + +/* + * Used to create cmds.. + */ +#define _CMDCF(dir, flags, vtype, nr) \ + (((dir) << _CMD_DIRSHIFT) | \ + ((flags) << _CMD_FLAGSSHIFT) | \ + ((vtype) << _CMD_VTYPESHIFT) | \ + ((nr) << _CMD_NSHIFT)) +#define _CMDC(dir, vtype, nr) _CMDCF(dir, 0, vtype, nr) +#define _CMDCNW(dir, vtype, nr) _CMDCF(dir, _CMD_FLAGS_NOWAIT, vtype, nr) + +/* + * Used to decode cmds.. 
+ */ +#define _CMD_DIR(cmd) (((cmd) >> _CMD_DIRSHIFT) & _CMD_DIRMASK) +#define _CMD_FLAGS(cmd) (((cmd) >> _CMD_FLAGSSHIFT) & _CMD_FLAGSMASK) +#define _CMD_VTYPE(cmd) (((cmd) >> _CMD_VTYPESHIFT) & _CMD_VTYPEMASK) +#define _CMD_N(cmd) (((cmd) >> _CMD_NSHIFT) & _CMD_NMASK) + +enum vnic_devcmd_cmd { + CMD_NONE = _CMDC(_CMD_DIR_NONE, _CMD_VTYPE_NONE, 0), + + /* + * mcpu fw info in mem: + * in: + * (u64)a0=paddr to struct vnic_devcmd_fw_info + * action: + * Fills in struct vnic_devcmd_fw_info (128 bytes) + * note: + * An old definition of CMD_MCPU_FW_INFO + */ + CMD_MCPU_FW_INFO_OLD = _CMDC(_CMD_DIR_WRITE, _CMD_VTYPE_ALL, 1), + + /* + * mcpu fw info in mem: + * in: + * (u64)a0=paddr to struct vnic_devcmd_fw_info + * (u16)a1=size of the structure + * out: + * (u16)a1=0 for in:a1 = 0, + * data size actually written for other values. + * action: + * Fills in first 128 bytes of vnic_devcmd_fw_info for in:a1 = 0, + * first in:a1 bytes for 0 < in:a1 <= 132, + * 132 bytes for other values of in:a1. + * note: + * CMD_MCPU_FW_INFO and CMD_MCPU_FW_INFO_OLD have the same enum 1 + * for source compatibility. + */ + CMD_MCPU_FW_INFO = _CMDC(_CMD_DIR_RW, _CMD_VTYPE_ALL, 1), + + /* dev-specific block member: + * in: (u16)a0=offset,(u8)a1=size + * out: a0=value + */ + CMD_DEV_SPEC = _CMDC(_CMD_DIR_RW, _CMD_VTYPE_ALL, 2), + + /* stats clear */ + CMD_STATS_CLEAR = _CMDCNW(_CMD_DIR_NONE, _CMD_VTYPE_ALL, 3), + + /* stats dump in mem: (u64)a0=paddr to stats area, + * (u16)a1=sizeof stats area */ + CMD_STATS_DUMP = _CMDC(_CMD_DIR_WRITE, _CMD_VTYPE_ALL, 4), + + /* set Rx packet filter: (u32)a0=filters (see CMD_PFILTER_*) */ + CMD_PACKET_FILTER = _CMDCNW(_CMD_DIR_WRITE, _CMD_VTYPE_ENET, 7), + + /* set Rx packet filter for all: (u32)a0=filters (see CMD_PFILTER_*) */ + CMD_PACKET_FILTER_ALL = _CMDCNW(_CMD_DIR_WRITE, _CMD_VTYPE_ALL, 7), + + /* hang detection notification */ + CMD_HANG_NOTIFY = _CMDC(_CMD_DIR_NONE, _CMD_VTYPE_ALL, 8), + + /* MAC address in (u48)a0 */ + CMD_MAC_ADDR = _CMDC(_CMD_DIR_READ, + _CMD_VTYPE_ENET | _CMD_VTYPE_FC, 9), +#define CMD_GET_MAC_ADDR CMD_MAC_ADDR /* some uses are aliased */ + + /* add addr from (u48)a0 */ + CMD_ADDR_ADD = _CMDCNW(_CMD_DIR_WRITE, + _CMD_VTYPE_ENET | _CMD_VTYPE_FC, 12), + + /* del addr from (u48)a0 */ + CMD_ADDR_DEL = _CMDCNW(_CMD_DIR_WRITE, + _CMD_VTYPE_ENET | _CMD_VTYPE_FC, 13), + + /* add VLAN id in (u16)a0 */ + CMD_VLAN_ADD = _CMDCNW(_CMD_DIR_WRITE, _CMD_VTYPE_ENET, 14), + + /* del VLAN id in (u16)a0 */ + CMD_VLAN_DEL = _CMDCNW(_CMD_DIR_WRITE, _CMD_VTYPE_ENET, 15), + + /* nic_cfg in (u32)a0 */ + CMD_NIC_CFG = _CMDCNW(_CMD_DIR_WRITE, _CMD_VTYPE_ALL, 16), + + /* union vnic_rss_key in mem: (u64)a0=paddr, (u16)a1=len */ + CMD_RSS_KEY = _CMDC(_CMD_DIR_WRITE, _CMD_VTYPE_ENET, 17), + + /* union vnic_rss_cpu in mem: (u64)a0=paddr, (u16)a1=len */ + CMD_RSS_CPU = _CMDC(_CMD_DIR_WRITE, _CMD_VTYPE_ENET, 18), + + /* initiate softreset */ + CMD_SOFT_RESET = _CMDCNW(_CMD_DIR_NONE, _CMD_VTYPE_ALL, 19), + + /* softreset status: + * out: a0=0 reset complete, a0=1 reset in progress */ + CMD_SOFT_RESET_STATUS = _CMDC(_CMD_DIR_READ, _CMD_VTYPE_ALL, 20), + + /* set struct vnic_devcmd_notify buffer in mem: + * in: + * (u64)a0=paddr to notify (set paddr=0 to unset) + * (u32)a1 & 0x00000000ffffffff=sizeof(struct vnic_devcmd_notify) + * (u16)a1 & 0x0000ffff00000000=intr num (-1 for no intr) + * out: + * (u32)a1 = effective size + */ + CMD_NOTIFY = _CMDC(_CMD_DIR_RW, _CMD_VTYPE_ALL, 21), + + /* UNDI API: (u64)a0=paddr to s_PXENV_UNDI_ struct, + * (u8)a1=PXENV_UNDI_xxx */ + CMD_UNDI = 
_CMDC(_CMD_DIR_WRITE, _CMD_VTYPE_ENET, 22), + + /* initiate open sequence (u32)a0=flags (see CMD_OPENF_*) */ + CMD_OPEN = _CMDCNW(_CMD_DIR_WRITE, _CMD_VTYPE_ALL, 23), + + /* open status: + * out: a0=0 open complete, a0=1 open in progress */ + CMD_OPEN_STATUS = _CMDC(_CMD_DIR_READ, _CMD_VTYPE_ALL, 24), + + /* close vnic */ + CMD_CLOSE = _CMDC(_CMD_DIR_NONE, _CMD_VTYPE_ALL, 25), + + /* initialize virtual link: (u32)a0=flags (see CMD_INITF_*) */ +/***** Replaced by CMD_INIT *****/ + CMD_INIT_v1 = _CMDCNW(_CMD_DIR_READ, _CMD_VTYPE_ALL, 26), + + /* variant of CMD_INIT, with provisioning info + * (u64)a0=paddr of vnic_devcmd_provinfo + * (u32)a1=sizeof provision info */ + CMD_INIT_PROV_INFO = _CMDC(_CMD_DIR_WRITE, _CMD_VTYPE_ENET, 27), + + /* enable virtual link */ + CMD_ENABLE = _CMDCNW(_CMD_DIR_WRITE, _CMD_VTYPE_ALL, 28), + + /* enable virtual link, waiting variant. */ + CMD_ENABLE_WAIT = _CMDC(_CMD_DIR_WRITE, _CMD_VTYPE_ALL, 28), + + /* disable virtual link */ + CMD_DISABLE = _CMDC(_CMD_DIR_NONE, _CMD_VTYPE_ALL, 29), + + /* stats dump sum of all vnic stats on same uplink in mem: + * (u64)a0=paddr + * (u16)a1=sizeof stats area */ + CMD_STATS_DUMP_ALL = _CMDC(_CMD_DIR_WRITE, _CMD_VTYPE_ALL, 30), + + /* init status: + * out: a0=0 init complete, a0=1 init in progress + * if a0=0, a1=errno */ + CMD_INIT_STATUS = _CMDC(_CMD_DIR_READ, _CMD_VTYPE_ALL, 31), + + /* INT13 API: (u64)a0=paddr to vnic_int13_params struct + * (u32)a1=INT13_CMD_xxx */ + CMD_INT13 = _CMDC(_CMD_DIR_WRITE, _CMD_VTYPE_FC, 32), + + /* logical uplink enable/disable: (u64)a0: 0/1=disable/enable */ + CMD_LOGICAL_UPLINK = _CMDCNW(_CMD_DIR_WRITE, _CMD_VTYPE_ENET, 33), + + /* undo initialize of virtual link */ + CMD_DEINIT = _CMDCNW(_CMD_DIR_NONE, _CMD_VTYPE_ALL, 34), + + /* initialize virtual link: (u32)a0=flags (see CMD_INITF_*) */ + CMD_INIT = _CMDCNW(_CMD_DIR_WRITE, _CMD_VTYPE_ALL, 35), + + /* check fw capability of a cmd: + * in: (u32)a0=cmd + * out: (u32)a0=errno, 0:valid cmd, a1=supported VNIC_STF_* bits */ + CMD_CAPABILITY = _CMDC(_CMD_DIR_RW, _CMD_VTYPE_ALL, 36), + + /* persistent binding info + * in: (u64)a0=paddr of arg + * (u32)a1=CMD_PERBI_XXX */ + CMD_PERBI = _CMDC(_CMD_DIR_RW, _CMD_VTYPE_FC, 37), + + /* Interrupt Assert Register functionality + * in: (u16)a0=interrupt number to assert + */ + CMD_IAR = _CMDCNW(_CMD_DIR_WRITE, _CMD_VTYPE_ALL, 38), + + /* initiate hangreset, like softreset after hang detected */ + CMD_HANG_RESET = _CMDC(_CMD_DIR_NONE, _CMD_VTYPE_ALL, 39), + + /* hangreset status: + * out: a0=0 reset complete, a0=1 reset in progress */ + CMD_HANG_RESET_STATUS = _CMDC(_CMD_DIR_READ, _CMD_VTYPE_ALL, 40), + + /* + * Set hw ingress packet vlan rewrite mode: + * in: (u32)a0=new vlan rewrite mode + * out: (u32)a0=old vlan rewrite mode */ + CMD_IG_VLAN_REWRITE_MODE = _CMDC(_CMD_DIR_RW, _CMD_VTYPE_ENET, 41), + + /* + * in: (u16)a0=bdf of target vnic + * (u32)a1=cmd to proxy + * a2-a15=args to cmd in a1 + * out: (u32)a0=status of proxied cmd + * a1-a15=out args of proxied cmd */ + CMD_PROXY_BY_BDF = _CMDC(_CMD_DIR_RW, _CMD_VTYPE_ALL, 42), + + /* + * As for BY_BDF except a0 is index of hvnlink subordinate vnic + * or SR-IOV virtual vnic + */ + CMD_PROXY_BY_INDEX = _CMDC(_CMD_DIR_RW, _CMD_VTYPE_ALL, 43), + + /* + * For HPP toggle: + * adapter-info-get + * in: (u64)a0=phsical address of buffer passed in from caller. + * (u16)a1=size of buffer specified in a0. + * out: (u64)a0=phsical address of buffer passed in from caller. 
+ * (u16)a1=actual bytes from VIF-CONFIG-INFO TLV, or + * 0 if no VIF-CONFIG-INFO TLV was ever received. */ + CMD_CONFIG_INFO_GET = _CMDC(_CMD_DIR_RW, _CMD_VTYPE_ALL, 44), + + /* + * INT13 API: (u64)a0=paddr to vnic_int13_params struct + * (u32)a1=INT13_CMD_xxx + */ + CMD_INT13_ALL = _CMDC(_CMD_DIR_WRITE, _CMD_VTYPE_ALL, 45), + + /* + * Set default vlan: + * in: (u16)a0=new default vlan + * (u16)a1=zero for overriding vlan with param a0, + * non-zero for resetting vlan to the default + * out: (u16)a0=old default vlan + */ + CMD_SET_DEFAULT_VLAN = _CMDC(_CMD_DIR_RW, _CMD_VTYPE_ALL, 46), + + /* init_prov_info2: + * Variant of CMD_INIT_PROV_INFO, where it will not try to enable + * the vnic until CMD_ENABLE2 is issued. + * (u64)a0=paddr of vnic_devcmd_provinfo + * (u32)a1=sizeof provision info */ + CMD_INIT_PROV_INFO2 = _CMDC(_CMD_DIR_WRITE, _CMD_VTYPE_ENET, 47), + + /* enable2: + * (u32)a0=0 ==> standby + * =CMD_ENABLE2_ACTIVE ==> active + */ + CMD_ENABLE2 = _CMDC(_CMD_DIR_WRITE, _CMD_VTYPE_ENET, 48), + + /* + * cmd_status: + * Returns the status of the specified command + * Input: + * a0 = command for which status is being queried. + * Possible values are: + * CMD_SOFT_RESET + * CMD_HANG_RESET + * CMD_OPEN + * CMD_INIT + * CMD_INIT_PROV_INFO + * CMD_DEINIT + * CMD_INIT_PROV_INFO2 + * CMD_ENABLE2 + * Output: + * if status == STAT_ERROR + * a0 = ERR_ENOTSUPPORTED - status for command in a0 is + * not supported + * if status == STAT_NONE + * a0 = status of the devcmd specified in a0 as follows. + * ERR_SUCCESS - command in a0 completed successfully + * ERR_EINPROGRESS - command in a0 is still in progress + */ + CMD_STATUS = _CMDC(_CMD_DIR_RW, _CMD_VTYPE_ALL, 49), + + /* + * Returns interrupt coalescing timer conversion factors. + * After calling this devcmd, ENIC driver can convert + * interrupt coalescing timer in usec into CPU cycles as follows: + * + * intr_timer_cycles = intr_timer_usec * multiplier / divisor + * + * Interrupt coalescing timer in usecs can be be converted/obtained + * from CPU cycles as follows: + * + * intr_timer_usec = intr_timer_cycles * divisor / multiplier + * + * in: none + * out: (u32)a0 = multiplier + * (u32)a1 = divisor + * (u32)a2 = maximum timer value in usec + */ + CMD_INTR_COAL_CONVERT = _CMDC(_CMD_DIR_READ, _CMD_VTYPE_ALL, 50), + + /* + * ISCSI DUMP API: + * in: (u64)a0=paddr of the param or param itself + * (u32)a1=ISCSI_CMD_xxx + */ + CMD_ISCSI_DUMP_REQ = _CMDC(_CMD_DIR_WRITE, _CMD_VTYPE_ALL, 51), + + /* + * ISCSI DUMP STATUS API: + * in: (u32)a0=cmd tag + * in: (u32)a1=ISCSI_CMD_xxx + * out: (u32)a0=cmd status + */ + CMD_ISCSI_DUMP_STATUS = _CMDC(_CMD_DIR_RW, _CMD_VTYPE_ALL, 52), + + /* + * Subvnic migration from MQ <--> VF. + * Enable the LIF migration from MQ to VF and vice versa. MQ and VF + * indexes are statically bound at the time of initialization. + * Based on the direction of migration, the resources of either MQ or + * the VF shall be attached to the LIF. 
+ * in: (u32)a0=Direction of Migration + * 0=> Migrate to VF + * 1=> Migrate to MQ + * (u32)a1=VF index (MQ index) + */ + CMD_MIGRATE_SUBVNIC = _CMDC(_CMD_DIR_WRITE, _CMD_VTYPE_ENET, 53), + + /* + * Register / Deregister the notification block for MQ subvnics + * in: + * (u64)a0=paddr to notify (set paddr=0 to unset) + * (u32)a1 & 0x00000000ffffffff=sizeof(struct vnic_devcmd_notify) + * (u16)a1 & 0x0000ffff00000000=intr num (-1 for no intr) + * out: + * (u32)a1 = effective size + */ + CMD_SUBVNIC_NOTIFY = _CMDC(_CMD_DIR_RW, _CMD_VTYPE_ALL, 54), + + /* + * Set the predefined mac address as default + * in: + * (u48)a0=mac addr + */ + CMD_SET_MAC_ADDR = _CMDC(_CMD_DIR_WRITE, _CMD_VTYPE_ENET, 55), + + /* Update the provisioning info of the given VIF + * (u64)a0=paddr of vnic_devcmd_provinfo + * (u32)a1=sizeof provision info */ + CMD_PROV_INFO_UPDATE = _CMDC(_CMD_DIR_WRITE, _CMD_VTYPE_ENET, 56), + + /* + * Initialization for the devcmd2 interface. + * in: (u64) a0=host result buffer physical address + * in: (u16) a1=number of entries in result buffer + */ + CMD_INITIALIZE_DEVCMD2 = _CMDC(_CMD_DIR_WRITE, _CMD_VTYPE_ALL, 57), + + /* + * Add a filter. + * in: (u64) a0= filter address + * (u32) a1= size of filter + * out: (u32) a0=filter identifier + * + * Capability query: + * out: (u64) a0= 1 if capability query supported + * (u64) a1= MAX filter type supported + */ + CMD_ADD_FILTER = _CMDC(_CMD_DIR_RW, _CMD_VTYPE_ENET, 58), + + /* + * Delete a filter. + * in: (u32) a0=filter identifier + */ + CMD_DEL_FILTER = _CMDC(_CMD_DIR_WRITE, _CMD_VTYPE_ENET, 59), + + /* + * Enable a Queue Pair in User space NIC + * in: (u32) a0=Queue Pair number + * (u32) a1= command + */ + CMD_QP_ENABLE = _CMDC(_CMD_DIR_WRITE, _CMD_VTYPE_ENET, 60), + + /* + * Disable a Queue Pair in User space NIC + * in: (u32) a0=Queue Pair number + * (u32) a1= command + */ + CMD_QP_DISABLE = _CMDC(_CMD_DIR_WRITE, _CMD_VTYPE_ENET, 61), + + /* + * Stats dump Queue Pair in User space NIC + * in: (u32) a0=Queue Pair number + * (u64) a1=host buffer addr for status dump + * (u32) a2=length of the buffer + */ + CMD_QP_STATS_DUMP = _CMDC(_CMD_DIR_WRITE, _CMD_VTYPE_ENET, 62), + + /* + * Clear stats for Queue Pair in User space NIC + * in: (u32) a0=Queue Pair number + */ + CMD_QP_STATS_CLEAR = _CMDC(_CMD_DIR_WRITE, _CMD_VTYPE_ENET, 63), + + /* + * UEFI BOOT API: (u64)a0= UEFI FLS_CMD_xxx + * (ui64)a1= paddr for the info buffer + */ + CMD_FC_REQ = _CMDC(_CMD_DIR_RW, _CMD_VTYPE_FC, 64), + + /* + * Return the iSCSI config details required by the EFI Option ROM + * in: (u32) a0=0 Get Boot Info for PXE eNIC as per pxe_boot_config_t + * a0=1 Get Boot info for iSCSI enic as per + * iscsi_boot_efi_cfg_t + * in: (u64) a1=Host address where iSCSI config info is returned + */ + CMD_VNIC_BOOT_CONFIG_INFO = _CMDC(_CMD_DIR_RW, _CMD_VTYPE_ALL, 65), + + /* + * Create a Queue Pair (RoCE) + * in: (u32) a0 = Queue Pair number + * (u32) a1 = Remote QP + * (u32) a2 = RDMA-RQ + * (u16) a3 = RQ Res Group + * (u16) a4 = SQ Res Group + * (u32) a5 = Protection Domain + * (u64) a6 = Remote MAC + * (u32) a7 = start PSN + * (u16) a8 = MSS + * (u32) a9 = protocol version + */ + CMD_RDMA_QP_CREATE = _CMDC(_CMD_DIR_WRITE, _CMD_VTYPE_ENET, 66), + + /* + * Delete a Queue Pair (RoCE) + * in: (u32) a0 = Queue Pair number + */ + CMD_RDMA_QP_DELETE = _CMDC(_CMD_DIR_WRITE, _CMD_VTYPE_ENET, 67), + + /* + * Retrieve a Queue Pair's status information (RoCE) + * in: (u32) a0 = Queue Pair number + * (u64) a1 = host buffer addr for QP status struct + * (u32) a2 = length of the buffer + */ + 
CMD_RDMA_QP_STATUS = _CMDC(_CMD_DIR_RW, _CMD_VTYPE_ENET, 68), + + /* + * Use this devcmd for agreeing on the highest common version supported + * by both driver and fw for by features who need such a facility. + * in: (u64) a0 = feature (driver requests for the supported versions on + * this feature) + * out: (u64) a0 = bitmap of all supported versions for that feature + */ + CMD_GET_SUPP_FEATURE_VER = _CMDC(_CMD_DIR_RW, _CMD_VTYPE_ENET, 69), + + /* + * Initialize the RDMA notification work queue + * in: (u64) a0 = host buffer address + * in: (u16) a1 = number of entries in buffer + * in: (u16) a2 = resource group number + * in: (u16) a3 = CQ number to post completion + */ + CMD_RDMA_INIT_INFO_BUF = _CMDC(_CMD_DIR_WRITE, _CMD_VTYPE_ENET, 70), + + /* + * De-init the RDMA notification work queue + * in: (u64) a0=resource group number + */ + CMD_RDMA_DEINIT_INFO_BUF = _CMDC(_CMD_DIR_WRITE, _CMD_VTYPE_ENET, 71), + + /* + * Control (Enable/Disable) overlay offloads on the given vnic + * in: (u8) a0 = OVERLAY_FEATURE_NVGRE : NVGRE + * a0 = OVERLAY_FEATURE_VXLAN : VxLAN + * in: (u8) a1 = OVERLAY_OFFLOAD_ENABLE : Enable or + * a1 = OVERLAY_OFFLOAD_DISABLE : Disable or + * a1 = OVERLAY_OFFLOAD_ENABLE_V2 : Enable with version 2 + */ + CMD_OVERLAY_OFFLOAD_CTRL = _CMDC(_CMD_DIR_WRITE, _CMD_VTYPE_ENET, 72), + + /* + * Configuration of overlay offloads feature on a given vNIC + * in: (u8) a0 = OVERLAY_CFG_VXLAN_PORT_UPDATE : VxLAN + * in: (u16) a1 = unsigned short int port information + */ + CMD_OVERLAY_OFFLOAD_CFG = _CMDC(_CMD_DIR_WRITE, _CMD_VTYPE_ENET, 73), + + /* + * Return the configured name for the device + * in: (u64) a0=Host address where the name is copied + * (u32) a1=Size of the buffer + */ + CMD_GET_CONFIG_NAME = _CMDC(_CMD_DIR_WRITE, _CMD_VTYPE_ALL, 74), + + /* + * Enable group interrupt for the VF + * in: (u32) a0 = GRPINTR_ENABLE : enable + * a0 = GRPINTR_DISABLE : disable + * a0 = GRPINTR_UPD_VECT: update group vector addr + * in: (u32) a1 = interrupt group count + * in: (u64) a2 = Start of host buffer address for DMAing group + * vector bitmap + * in: (u64) a3 = Stride between group vectors + */ + CMD_CONFIG_GRPINTR = _CMDC(_CMD_DIR_WRITE, _CMD_VTYPE_ENET, 75), + + /* + * Set cq arrary base and size in a list of consective wqs and + * rqs for a device + * in: (u16) a0 = the wq relative index in the device. + * -1 indicates skipping wq configuration + * in: (u16) a1 = the wcq relative index in the device + * in: (u16) a2 = the rq relative index in the device + * -1 indicates skipping rq configuration + * in: (u16) a3 = the rcq relative index in the device + */ + CMD_CONFIG_CQ_ARRAY = _CMDC(_CMD_DIR_WRITE, _CMD_VTYPE_ENET, 76), + + /* + * Add an advanced filter. + * in: (u64) a0= filter address + * (u32) a1= size of filter + * out: (u32) a0=filter identifier + * + * Capability query: + * in: (u64) a1= supported filter capability exchange modes + * out: (u64) a0= 1 if capability query supported + * if (u64) a1 = 0: a1 = MAX filter type supported + * if (u64) a1 & FILTER_CAP_MODE_V1_FLAG: + * a1 = bitmask of supported filters + * a2 = FILTER_CAP_MODE_V1 + * a3 = bitmask of supported actions + */ + CMD_ADD_ADV_FILTER = _CMDC(_CMD_DIR_RW, _CMD_VTYPE_ENET, 77), + + /* + * Add a MAC address and VLAN pair to a LIF. This is like CMD_ADDR_ADD + * but with the ability to specify a VLAN as well. 
+ * in: (u64) a0 = MAC address + * (u16) a1 = VLAN (0 means default VLAN) + * (u32) a2 = flags (see AVF_xxx below) + */ + CMD_ADDR_VLAN_ADD = _CMDC(_CMD_DIR_WRITE, _CMD_VTYPE_ENET, 78), + + /* + * Delete a MAC address and VLAN pair from a LIF. This is like CMD_ADDR_DEL + * but with the ability to specify a VLAN as well. + * in: (u64) a0 = MAC address + * (u16) a1 = VLAN (0 means default VLAN) + * (u32) a2 = flags (see AVF_xxx below) + */ + CMD_ADDR_VLAN_DEL = _CMDC(_CMD_DIR_WRITE, _CMD_VTYPE_ENET, 79), + + /* + * Bind resources to an MQ sub-vnic. To detach a sub-vnic from all + * resources, call with all 0s. A sub-vnic may not be attached to + * different resources until it is detached from current resources. + * This may only be issued as proxy-by-index on a MQ sub-vnic + * in: (u32) a0 = WQ base (relative) + * (u32) a1 = WQ count + * (u32) a2 = RQ base + * (u32) a3 = RQ count + * (u32) a4 = CQ base + * (u32) a5 = CQ count + */ + CMD_SUBVNIC_RES_BIND = _CMDC(_CMD_DIR_WRITE, _CMD_VTYPE_ENET, 80), + + /* + * Configure RDMA Resource + * in: (u32) a0 = sub-command + * (u32) a1 = resource domain, 0xffffffff for domain-less commands + * (u32) a2 = (command-specific) + * ... + * + * All arguments that have not been assigned a meaning should be + * initialized to 0 to allow for better driver forward compatibility. + */ + CMD_RDMA_CTRL = _CMDC(_CMD_DIR_RW, _CMD_VTYPE_ENET, 81), + + /* + * Set a rate limit on a vnic + * in: (u32) a0 = rate limit in units of Mb/s + * (u32) a1 = traffic class + */ + CMD_RATE_LIMIT_SET = _CMDC(_CMD_DIR_WRITE, _CMD_VTYPE_ENET, 82), + + /* + * Query rate limit on a vnic + * in: (u32) a0 = traffic class + * out:(u32) a0 = latest devcmd specified rate limit (Mb/s) + * a1 = aurrent actual rate limit (Mb/s) + */ + CMD_RATE_LIMIT_GET = _CMDC(_CMD_DIR_READ, _CMD_VTYPE_ENET, 82), + + /* + * Write QoS settings to a vnic + * in: (u32) a0 = flags + * (u32) a1 = PFC map + * (u32) a2 = PGS grouping + * ((a2 >> (PRI * 4)) & 0xf) = pri group (15 = strict priority) + * (u32) a3 = PGS BW allocation + * ((a3 >> (PG * 8)) & 0xff) = BW % for priority group + * (must sum to 100) + */ + CMD_QOS_SET = _CMDC(_CMD_DIR_WRITE, _CMD_VTYPE_ENET, 83), + + /* + * Read QoS settings from a vnic + * out:(u32) a0 = flags + * (u32) a1 = PFC map + * (u32) a2 = PGS grouping + * ((a2 >> (PRI * 4)) & 0xf) = pri group (15 = strict priority) + * (u32) a3 = PGS BW allocation + * ((a3 >> (PG * 8)) & 0xff) = BW % for priority group + */ + CMD_QOS_GET = _CMDC(_CMD_DIR_READ, _CMD_VTYPE_ENET, 83), + + /* + * Command for tests on bodega-dev + * in: (u32) a0=requested operation + * a1..aN=operation specific + * out: a0..aN=operation specific + */ + CMD_TEST_OP = _CMDC(_CMD_DIR_RW, _CMD_VTYPE_ENET, 96), +}; + +/* Modes for exchanging advanced filter capabilities. The modes supported by + * the driver are passed in the CMD_ADD_ADV_FILTER capability command and the + * mode selected is returned. 
+ * V0: the maximum filter type supported is returned + * V1: bitmasks of supported filters and actions are returned + */ +enum filter_cap_mode { + FILTER_CAP_MODE_V0 = 0, /* Must always be 0 for legacy drivers */ + FILTER_CAP_MODE_V1 = 1, +}; +#define FILTER_CAP_MODE_V1_FLAG (1 << FILTER_CAP_MODE_V1) + +/* CMD_ENABLE2 flags */ +#define CMD_ENABLE2_STANDBY 0x0 +#define CMD_ENABLE2_ACTIVE 0x1 + +/* flags for CMD_OPEN */ +#define CMD_OPENF_OPROM 0x1 /* open coming from option rom */ +#define CMD_OPENF_RQ_ENABLE_THEN_POST 0x2 /* Enable IG DESC cache on open */ + +/* flags for CMD_INIT */ +#define CMD_INITF_DEFAULT_MAC 0x1 /* init with default mac addr */ + +/* flags for CMD_PACKET_FILTER */ +#define CMD_PFILTER_DIRECTED 0x01 +#define CMD_PFILTER_MULTICAST 0x02 +#define CMD_PFILTER_BROADCAST 0x04 +#define CMD_PFILTER_PROMISCUOUS 0x08 +#define CMD_PFILTER_ALL_MULTICAST 0x10 + +/* Commands for CMD_QP_ENABLE/CM_QP_DISABLE */ +#define CMD_QP_RQWQ 0x0 + +/* rewrite modes for CMD_IG_VLAN_REWRITE_MODE */ +#define IG_VLAN_REWRITE_MODE_DEFAULT_TRUNK 0 +#define IG_VLAN_REWRITE_MODE_UNTAG_DEFAULT_VLAN 1 +#define IG_VLAN_REWRITE_MODE_PRIORITY_TAG_DEFAULT_VLAN 2 +#define IG_VLAN_REWRITE_MODE_PASS_THRU 3 + +enum vnic_devcmd_status { + STAT_NONE = 0, + STAT_BUSY = 1 << 0, /* cmd in progress */ + STAT_ERROR = 1 << 1, /* last cmd caused error (code in a0) */ + STAT_FAILOVER = 1 << 2, /* always set on vnics in pci standby state + if seen a failover to the standby happened */ +}; + +enum vnic_devcmd_error { + ERR_SUCCESS = 0, + ERR_EINVAL = 1, + ERR_EFAULT = 2, + ERR_EPERM = 3, + ERR_EBUSY = 4, + ERR_ECMDUNKNOWN = 5, + ERR_EBADSTATE = 6, + ERR_ENOMEM = 7, + ERR_ETIMEDOUT = 8, + ERR_ELINKDOWN = 9, + ERR_EMAXRES = 10, + ERR_ENOTSUPPORTED = 11, + ERR_EINPROGRESS = 12, + ERR_MAX +}; + +/* + * note: hw_version and asic_rev refer to the same thing, + * but have different formats. hw_version is + * a 32-byte string (e.g. "A2") and asic_rev is + * a 16-bit integer (e.g. 0xA2). + */ +struct vnic_devcmd_fw_info { + char fw_version[32]; + char fw_build[32]; + char hw_version[32]; + char hw_serial_number[32]; + u16 asic_type; + u16 asic_rev; +}; + +#ifndef FOR_UPSTREAM_KERNEL +enum fwinfo_asic_type { + FWINFO_ASIC_TYPE_UNKNOWN, + FWINFO_ASIC_TYPE_PALO, + FWINFO_ASIC_TYPE_SERENO, + FWINFO_ASIC_TYPE_CRUZ, +}; +#endif + +struct vnic_devcmd_notify { + u32 csum; /* checksum over following words */ + + u32 link_state; /* link up == 1 */ + u32 port_speed; /* effective port speed (rate limit) */ + u32 mtu; /* MTU */ + u32 msglvl; /* requested driver msg lvl */ + u32 uif; /* uplink interface */ + u32 status; /* status bits (see VNIC_STF_*) */ + u32 error; /* error code (see ERR_*) for first ERR */ + u32 link_down_cnt; /* running count of link down transitions */ + u32 perbi_rebuild_cnt; /* running count of perbi rebuilds */ +}; +#define VNIC_STF_FATAL_ERR 0x0001 /* fatal fw error */ +#define VNIC_STF_STD_PAUSE 0x0002 /* standard link-level pause on */ +#define VNIC_STF_PFC_PAUSE 0x0004 /* priority flow control pause on */ +/* all supported status flags */ +#define VNIC_STF_ALL (VNIC_STF_FATAL_ERR |\ + VNIC_STF_STD_PAUSE |\ + VNIC_STF_PFC_PAUSE |\ + 0) + +struct vnic_devcmd_provinfo { + u8 oui[3]; + u8 type; + u8 data[]; +}; + +/* + * These are used in flags field of different filters to denote + * valid fields used. 
+ */ +#define FILTER_FIELD_VALID(fld) (1 << (fld - 1)) + +#define FILTER_FIELD_USNIC_VLAN FILTER_FIELD_VALID(1) +#define FILTER_FIELD_USNIC_ETHTYPE FILTER_FIELD_VALID(2) +#define FILTER_FIELD_USNIC_PROTO FILTER_FIELD_VALID(3) +#define FILTER_FIELD_USNIC_ID FILTER_FIELD_VALID(4) + +#define FILTER_FIELDS_USNIC (FILTER_FIELD_USNIC_VLAN | \ + FILTER_FIELD_USNIC_ETHTYPE | \ + FILTER_FIELD_USNIC_PROTO | \ + FILTER_FIELD_USNIC_ID) + +struct filter_usnic_id { + u32 flags; + u16 vlan; + u16 ethtype; + u8 proto_version; + u32 usnic_id; +} __attribute__((packed)); + +#define FILTER_FIELD_5TUP_PROTO FILTER_FIELD_VALID(1) +#define FILTER_FIELD_5TUP_SRC_AD FILTER_FIELD_VALID(2) +#define FILTER_FIELD_5TUP_DST_AD FILTER_FIELD_VALID(3) +#define FILTER_FIELD_5TUP_SRC_PT FILTER_FIELD_VALID(4) +#define FILTER_FIELD_5TUP_DST_PT FILTER_FIELD_VALID(5) + +#define FILTER_FIELDS_IPV4_5TUPLE (FILTER_FIELD_5TUP_PROTO | \ + FILTER_FIELD_5TUP_SRC_AD | \ + FILTER_FIELD_5TUP_DST_AD | \ + FILTER_FIELD_5TUP_SRC_PT | \ + FILTER_FIELD_5TUP_DST_PT) + +/* Enums for the protocol field. */ +enum protocol_e { + PROTO_UDP = 0, + PROTO_TCP = 1, + PROTO_IPV4 = 2, + PROTO_IPV6 = 3 +}; + +struct filter_ipv4_5tuple { + u32 flags; + u32 protocol; + u32 src_addr; + u32 dst_addr; + u16 src_port; + u16 dst_port; +} __attribute__((packed)); + +#define FILTER_FIELD_VMQ_VLAN FILTER_FIELD_VALID(1) +#define FILTER_FIELD_VMQ_MAC FILTER_FIELD_VALID(2) + +#define FILTER_FIELDS_MAC_VLAN (FILTER_FIELD_VMQ_VLAN | \ + FILTER_FIELD_VMQ_MAC) + +#define FILTER_FIELDS_NVGRE FILTER_FIELD_VMQ_MAC + +struct filter_mac_vlan { + u32 flags; + u16 vlan; + u8 mac_addr[6]; +} __attribute__((packed)); + +#define FILTER_FIELD_VLAN_IP_3TUP_VLAN FILTER_FIELD_VALID(1) +#define FILTER_FIELD_VLAN_IP_3TUP_L3_PROTO FILTER_FIELD_VALID(2) +#define FILTER_FIELD_VLAN_IP_3TUP_DST_AD FILTER_FIELD_VALID(3) +#define FILTER_FIELD_VLAN_IP_3TUP_L4_PROTO FILTER_FIELD_VALID(4) +#define FILTER_FIELD_VLAN_IP_3TUP_DST_PT FILTER_FIELD_VALID(5) + +#define FILTER_FIELDS_VLAN_IP_3TUP (FILTER_FIELD_VLAN_IP_3TUP_VLAN | \ + FILTER_FIELD_VLAN_IP_3TUP_L3_PROTO | \ + FILTER_FIELD_VLAN_IP_3TUP_DST_AD | \ + FILTER_FIELD_VLAN_IP_3TUP_L4_PROTO | \ + FILTER_FIELD_VLAN_IP_3TUP_DST_PT) + +struct filter_vlan_ip_3tuple { + u32 flags; + u16 vlan; + u16 l3_protocol; + union { + u32 dst_addr_v4; + u8 dst_addr_v6[16]; + } u; + u32 l4_protocol; + u16 dst_port; +} __attribute__((packed)); + +#define FILTER_GENERIC_1_BYTES 64 + +enum filter_generic_1_layer { + FILTER_GENERIC_1_L2, + FILTER_GENERIC_1_L3, + FILTER_GENERIC_1_L4, + FILTER_GENERIC_1_L5, + FILTER_GENERIC_1_NUM_LAYERS +}; + +#define FILTER_GENERIC_1_IPV4 (1 << 0) +#define FILTER_GENERIC_1_IPV6 (1 << 1) +#define FILTER_GENERIC_1_UDP (1 << 2) +#define FILTER_GENERIC_1_TCP (1 << 3) +#define FILTER_GENERIC_1_TCP_OR_UDP (1 << 4) +#define FILTER_GENERIC_1_IP4SUM_OK (1 << 5) +#define FILTER_GENERIC_1_L4SUM_OK (1 << 6) +#define FILTER_GENERIC_1_IPFRAG (1 << 7) + +#define FILTER_GENERIC_1_KEY_LEN 64 + +/* + * Version 1 of generic filter specification + * position is only 16 bits, reserving positions > 64k to be used by firmware + */ +struct filter_generic_1 { + u16 position; // lower position comes first + u32 mask_flags; + u32 val_flags; + u16 mask_vlan; + u16 val_vlan; + struct { + u8 mask[FILTER_GENERIC_1_KEY_LEN]; // 0 bit means "don't care" + u8 val[FILTER_GENERIC_1_KEY_LEN]; + } __attribute__((packed)) layer[FILTER_GENERIC_1_NUM_LAYERS]; +} __attribute__((packed)); + +/* Specifies the filter_action type. 
*/ +enum { + FILTER_ACTION_RQ_STEERING = 0, + FILTER_ACTION_V2 = 1, + FILTER_ACTION_MAX +}; + +struct filter_action { + u32 type; + union { + u32 rq_idx; + } u; +} __attribute__((packed)); + +#define FILTER_ACTION_RQ_STEERING_FLAG (1 << 0) +#define FILTER_ACTION_FILTER_ID_FLAG (1 << 1) +#define FILTER_ACTION_DROP_FLAG (1 << 2) + +/* Version 2 of filter action must be a strict extension of struct filter_action + * where the first fields exactly match in size and meaning. + */ +struct filter_action_v2 { + u32 type; + u32 rq_idx; + u32 flags; // use FILTER_ACTION_XXX_FLAG defines + u16 filter_id; + u_int8_t reserved[32]; // for future expansion +} __attribute__((packed)); + +/* Specifies the filter type. */ +enum filter_type { + FILTER_USNIC_ID = 0, + FILTER_IPV4_5TUPLE = 1, + FILTER_MAC_VLAN = 2, + FILTER_VLAN_IP_3TUPLE = 3, + FILTER_NVGRE_VMQ = 4, + FILTER_USNIC_IP = 5, + FILTER_DPDK_1 = 6, + FILTER_MAX +}; + +#define FILTER_USNIC_ID_FLAG (1 << FILTER_USNIC_ID) +#define FILTER_IPV4_5TUPLE_FLAG (1 << FILTER_IPV4_5TUPLE) +#define FILTER_MAC_VLAN_FLAG (1 << FILTER_MAC_VLAN) +#define FILTER_VLAN_IP_3TUPLE_FLAG (1 << FILTER_VLAN_IP_3TUPLE) +#define FILTER_NVGRE_VMQ_FLAG (1 << FILTER_NVGRE_VMQ) +#define FILTER_USNIC_IP_FLAG (1 << FILTER_USNIC_IP) +#define FILTER_DPDK_1_FLAG (1 << FILTER_DPDK_1) + +struct filter { + u32 type; + union { + struct filter_usnic_id usnic; + struct filter_ipv4_5tuple ipv4; + struct filter_mac_vlan mac_vlan; + struct filter_vlan_ip_3tuple vlan_3tuple; + } u; +} __attribute__((packed)); + +/* + * This is a strict superset of "struct filter" and exists only + * because many drivers use "sizeof (struct filter)" in deciding TLV size. + * This new, larger struct filter would cause any code that uses that method + * to not work with older firmware, so we add filter_v2 to hold the + * new filter types. Drivers should use vnic_filter_size() to determine + * the TLV size instead of sizeof (struct fiter_v2) to guard against future + * growth. + */ +struct filter_v2 { + u32 type; + union { + struct filter_usnic_id usnic; + struct filter_ipv4_5tuple ipv4; + struct filter_mac_vlan mac_vlan; + struct filter_vlan_ip_3tuple vlan_3tuple; + struct filter_generic_1 generic_1; + } u; +} __attribute__((packed)); + +enum { + CLSF_TLV_FILTER = 0, + CLSF_TLV_ACTION = 1, +}; + +struct filter_tlv { + u_int32_t type; + u_int32_t length; + u_int32_t val[]; +}; + +/* Data for CMD_ADD_FILTER is 2 TLV and filter + action structs */ +#define FILTER_MAX_BUF_SIZE 100 +#define FILTER_V2_MAX_BUF_SIZE (sizeof (struct filter_v2) + \ + sizeof (struct filter_action_v2) + \ + (2 * sizeof (struct filter_tlv))) + +/* + * Compute actual structure size given filter type. To be "future-proof," + * drivers should use this instead of "sizeof (struct filter_v2)" when + * computing length for TLV. + */ +static inline u_int32_t +vnic_filter_size( + struct filter_v2 *fp) +{ + u_int32_t size; + + switch (fp->type) { + case FILTER_USNIC_ID: + size = sizeof (fp->u.usnic); + break; + case FILTER_IPV4_5TUPLE: + size = sizeof (fp->u.ipv4); + break; + case FILTER_MAC_VLAN: + case FILTER_NVGRE_VMQ: + size = sizeof (fp->u.mac_vlan); + break; + case FILTER_VLAN_IP_3TUPLE: + size = sizeof (fp->u.vlan_3tuple); + break; + case FILTER_USNIC_IP: + case FILTER_DPDK_1: + size = sizeof (fp->u.generic_1); + break; + default: + size = sizeof (fp->u); + break; + } + size += sizeof (fp->type); + return (size); +} + + +enum { + CLSF_ADD = 0, + CLSF_DEL = 1, +}; + +/* + * Get the action structure size given action type. 
To be "future-proof," + * drivers should use this instead of "sizeof (struct filter_action_v2)" + * when computing length for TLV. + */ +static inline u_int32_t +vnic_action_size(struct filter_action_v2 *fap) +{ + u_int32_t size; + + switch (fap->type) { + case FILTER_ACTION_RQ_STEERING: + size = sizeof (struct filter_action); + break; + case FILTER_ACTION_V2: + size = sizeof (struct filter_action_v2); + break; + default: + /* this should never happen and will cause a devcmd error */ + size = sizeof (struct filter_action); + break; + } + return (size); +} + +/* + * Writing cmd register causes STAT_BUSY to get set in status register. + * When cmd completes, STAT_BUSY will be cleared. + * + * If cmd completed successfully STAT_ERROR will be clear + * and args registers contain cmd-specific results. + * + * If cmd error, STAT_ERROR will be set and args[0] contains error code. + * + * status register is read-only. While STAT_BUSY is set, + * all other register contents are read-only. + */ + +/* Make sizeof(vnic_devcmd) a power-of-2 for I/O BAR. */ +#define VNIC_DEVCMD_NARGS 15 +struct vnic_devcmd { + u32 status; /* RO */ + u32 cmd; /* RW */ + u64 args[VNIC_DEVCMD_NARGS]; /* RW cmd args (little-endian) */ +}; + +/* + * Version 2 of the interface. + * + * Some things are carried over, notably the vnic_devcmd_cmd enum. + */ + +/* + * Flags for vnic_devcmd2.flags + */ + +#define DEVCMD2_FNORESULT 0x1 /* Don't copy result to host */ + +#define VNIC_DEVCMD2_NARGS VNIC_DEVCMD_NARGS +struct vnic_devcmd2 { + u16 pad; + u16 flags; + u32 cmd; /* same command #defines as original */ + u64 args[VNIC_DEVCMD2_NARGS]; +}; + +#define VNIC_DEVCMD2_NRESULTS VNIC_DEVCMD_NARGS +struct devcmd2_result { + u64 results[VNIC_DEVCMD2_NRESULTS]; + u32 pad; + u16 completed_index; /* into copy WQ */ + u8 error; /* same error codes as original */ + u8 color; /* 0 or 1 as with completion queues */ +}; + +#define DEVCMD2_RING_SIZE 32 +#define DEVCMD2_DESC_SIZE 128 + +#define DEVCMD2_RESULTS_SIZE_MAX ((1 << 16) - 1) + +/* Overlay related definitions */ + +/* + * This enum lists the flag associated with each of the overlay features + */ +typedef enum { + OVERLAY_FEATURE_NVGRE = 1, + OVERLAY_FEATURE_VXLAN, + OVERLAY_FEATURE_MAX, +} overlay_feature_t; + +typedef enum { + OVERLAY_OFFLOAD_ENABLE, + OVERLAY_OFFLOAD_DISABLE, + OVERLAY_OFFLOAD_ENABLE_V2, + OVERLAY_OFFLOAD_MAX, +} overlay_ofld_cmd; + +#define OVERLAY_CFG_VXLAN_PORT_UPDATE 0 + +/* + * Use this enum to get the supported versions for each of these features + * If you need to use the devcmd_get_supported_feature_version(), add + * the new feature into this enum and install function handler in devcmd.c + */ +typedef enum { + VIC_FEATURE_VXLAN, + VIC_FEATURE_RDMA, + VIC_FEATURE_VXLAN_PATCH, + VIC_FEATURE_MAX, +} vic_feature_t; + +/* this previously lived in vnic_rdma.h */ +#define MK_RDMA_FW_VER(ver) (1 << (ver)) +enum vnic_rdma_fw_versions { + RDMA_FW_VER_1, + RDMA_FW_VER_2 +}; + +/* + * CMD_CONFIG_GRPINTR subcommands + */ +typedef enum { + GRPINTR_ENABLE = 1, + GRPINTR_DISABLE, + GRPINTR_UPD_VECT, +} grpintr_subcmd_t; + +/* + * CMD_RDMA_CTRL subcommands + * + * Unless otherwise stated, all arguments are in little endian (as with regular + * devcmds). + * + * MAC address arguments are encoded in u64 arguments. A little endian host + * should encode 11:22:33:44:55:66 as 0x0000112233445566. The high order bytes + * of the u64 value must be 0 or the argument will be considered an invalid MAC + * address. 
+ */ + +#define RDMA_QP_STATE_INVALID 0 +#define RDMA_QP_STATE_RESET (1<<0) +#define RDMA_QP_STATE_INIT (1<<1) +#define RDMA_QP_STATE_RTR (1<<2) +#define RDMA_QP_STATE_RTS (1<<3) +#define RDMA_QP_STATE_SQD (1<<4) +#define RDMA_QP_STATE_SQE (1<<5) +#define RDMA_QP_STATE_ERR (1<<6) +#define RDMA_QP_STATE_RSRV1 (1<<7) + +#define RDMA_QP_STATE_VALID_RQ (RDMA_QP_STATE_INIT | \ + RDMA_QP_STATE_RTR | \ + RDMA_QP_STATE_RTS | \ + RDMA_QP_STATE_SQD | \ + RDMA_QP_STATE_SQE) + +#define RDMA_QP_STATE_VALID_RESP (RDMA_QP_STATE_RTR | \ + RDMA_QP_STATE_RTS | \ + RDMA_QP_STATE_SQD) + +#define RDMA_QP_STATE_SQD_SQE (RDMA_QP_STATE_SQD | \ + RDMA_QP_STATE_SQE) + +#define RDMA_QP_TYPE_INVALID 0 +#define RDMA_QP_TYPE_RC 1 +#define RDMA_QP_TYPE_UD 2 + +#define RDMA_INTR_NULL_IDX 0xffffffff +#define RDMA_ANY_QPN 0xffffffff +#define RDMA_NULL_QP_ID 0xffffffff +#define RDMA_PSN_UNCHANGED 0xffffffff + +#define RDMA_PROTO_ROCEV2 0 + +/* + * Initialize a specific resource domain associated with the current vNIC. The + * number of resource domains for the current vNIC is specified in the vNIC + * devspec. + * + * in: (u32) a0 = RDMA_SUBCMD_CFG_RESOURCE_DOMAIN + * (u32) a1 = resource domain id (0-indexed) + * (u32) a2 = protocol type (only RDMA_PROTO_ROCEV2 for now) + * (u64) a3 = source MAC address (see note above about MAC encoding) + * (u64) a4 = ring base addr of rdma_reg_cmd_result ring + * (u32) a5 = result ring size, should equal command WQ ring size + * (u32) a6 = rcmd soft cq interrupt vector (idx w/in vnic's intr range) + * (pass RDMA_INTR_NULL_IDX for no interrupt) + */ +#define RDMA_SUBCMD_CFG_RESOURCE_DOMAIN 0 + +/* + * Allocate a soft CQ from the resource domain. + * + * in: (u32) a0 = RDMA_SUBCMD_CREATE_CQ + * (u32) a1 = resource domain ID + * (u64) a2 = ring base address + * (u32) a3 = ring size + * (u32) a4 = interrupt vector (idx w/in vnic's intr range) + * (pass RDMA_INTR_NULL_IDX for no interrupt) + * + * out: (u32) a0 = CQ ID + */ +#define RDMA_SUBCMD_CREATE_CQ 1 + +/* + * Deallocate a soft CQ. + * + * in: (u32) a0 = RDMA_SUBCMD_DESTROY_CQ + * (u32) a1 = resource domain ID + * (u32) a2 = CQ ID + */ +#define RDMA_SUBCMD_DESTROY_CQ 2 + +/* + * Allocate a QP (with one SQ and one RQ) from the resource domain. + * + * in: (u32) a0 = RDMA_SUBCMD_CREATE_QP + * (u32) a1 = resource domain ID + * (u32) a2 = QP type (see RDMA_QP_TYPE_xxx) + * (u32) a3 = max SQ WRs + * (u32) a4 = max RQ WRs + * (u32) a5 = SQ CQ ID + * (u32) a6 = RQ CQ ID + * (u32) a7 = desired QPN (or RDMA_ANY_QPN if don't care) + * (u32) a8 = QP flags + * (u64) a9 = SQ ring base ptr + * (u64) a10 = RQ ring base ptr + * + * out: (u32) a0 = QP ID + * (u32) a1 = actual QPN (XXX could just obtain from QUERY_QP) + */ +#define RDMA_SUBCMD_CREATE_QP 3 + +/* + * Modify the state of an existing QP. This is primarily used to transition + * the QP from one state to the next. The "current state" argument must match + * the QP's actual current state or the command will fail. If the driver and + * firmware get out of sync, the actual current state can be queried with + * RDMA_SUBCMD_QUERY_QP. + * + * The next-hop MAC, peer IP, and peer QPN arguments are ignored if the new + * state is not RTR. 
+ * + * in: (u32) a0 = RDMA_SUBCMD_MODIFY_QP + * (u32) a1 = resource domain ID + * (u32) a2 = QP ID + * (u32) a3 = current state + * (u32) a4 = new state + * (u64) a5 = next-hop MAC to destination IP (see MAC encoding note above) + * (u64) a6 = peer IP address + * (u32) a7 = peer QPN + * (u32) a8 = path MTU (one of: 512/1024/2048/4096, 0 means no change) + * (u64) a9 = upper 32-bits: SQ PSN (RDMA_PSN_UNCHANGED means no change) + * lower 32-bits: RQ PSN (RDMA_PSN_UNCHANGED means no change) + * (u32) a10 = Q_Key (UD QPs only) + * (u32) a11 = source IPv4 address in network byte order + */ +#define RDMA_SUBCMD_MODIFY_QP 4 + +/* + * Query current QP status. + * + * in: (u32) a0 = RDMA_SUBCMD_QUERY_QP + * (u32) a1 = resource domain ID + * (u32) a2 = QP ID + * + * out: (u32) a0 = QPN + * (u32) a1 = current QP state + * (u32) a2 = path MTU + * (u32) a3 = current SQ PSN + * (u32) a4 = current RQ PSN + */ +#define RDMA_SUBCMD_QUERY_QP 5 + +/* + * Deallocate a QP. + * + * in: (u32) a0 = RDMA_SUBCMD_DESTROY_QP + * (u32) a1 = resource domain ID + * (u32) a2 = QP ID + */ +#define RDMA_SUBCMD_DESTROY_QP 6 + +/* + * Retrieve a snapshot of current statistics for this vnic's + * rdma engine + * + * in: (u32) a0 = RDMA_SUBCMD_GET_STATS + * + * out: (u64) a0 = IG packet count + * (u64) a1 = IG byte count + * (u64) a2 = EG packet count + * (u64) a3 = EG byte count + */ +#define RDMA_SUBCMD_GET_STATS 7 + +/* + * in: (u32) a0 = RDMA_SUBCMD_RST_RESOURCE_DOMAIN + * (u32) a1 = resource domain ID + */ +#define RDMA_SUBCMD_RST_RESOURCE_DOMAIN 8 + +/* + * Status for deallocate QP dev_cmd. + * + * in: (u32) a0 = RDMA_SUBCMD_DESTROY_QP_STATUS + * (u32) a1 = resource domain ID + * (u32) a2 = QP ID + * + * out: (u32) a0 = ERR_EINPROGRESS/ERR_EBADSTATE/ERR_SUCCESS + */ +#define RDMA_SUBCMD_DESTROY_QP_STATUS 9 + +/* + * Flags for CMD_ADDR_VLAN_ADD and CMD_ADDR_VLAN_DEL + */ +#define AVF_VLAN_VALID 0x0001 // use VLAN from a1 in match + // (else VLAN is wildcard) +#define AVF_INNER_PKT 0x0002 // match on inner packet + +#endif /* _VNIC_DEVCMD_H_ */ diff --git a/prov/usnic/src/usnic_direct/vnic_enet.h b/prov/usnic/src/usnic_direct/vnic_enet.h new file mode 100644 index 00000000000..94ce66702f2 --- /dev/null +++ b/prov/usnic/src/usnic_direct/vnic_enet.h @@ -0,0 +1,86 @@ +/* + * Copyright 2008-2018 Cisco Systems, Inc. All rights reserved. + * Copyright 2007 Nuova Systems, Inc. All rights reserved. + * + * LICENSE_BEGIN + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * LICENSE_END + * + * + */ + +#ifndef _VNIC_ENIC_H_ +#define _VNIC_ENIC_H_ + +/* Device-specific region: enet configuration */ +struct vnic_enet_config { + u32 flags; + u32 wq_desc_count; + u32 rq_desc_count; + u16 mtu; + u16 intr_timer_deprecated; + u8 intr_timer_type; + u8 intr_mode; + char devname[16]; + u32 intr_timer_usec; + u16 loop_tag; + u16 vf_rq_count; + u16 num_arfs; + u64 mem_paddr; +}; + +#define VENETF_TSO 0x1 /* TSO enabled */ +#define VENETF_LRO 0x2 /* LRO enabled */ +#define VENETF_RXCSUM 0x4 /* RX csum enabled */ +#define VENETF_TXCSUM 0x8 /* TX csum enabled */ +#define VENETF_RSS 0x10 /* RSS enabled */ +#define VENETF_RSSHASH_IPV4 0x20 /* Hash on IPv4 fields */ +#define VENETF_RSSHASH_TCPIPV4 0x40 /* Hash on TCP + IPv4 fields */ +#define VENETF_RSSHASH_IPV6 0x80 /* Hash on IPv6 fields */ +#define VENETF_RSSHASH_TCPIPV6 0x100 /* Hash on TCP + IPv6 fields */ +#define VENETF_RSSHASH_IPV6_EX 0x200 /* Hash on IPv6 extended fields */ +#define VENETF_RSSHASH_TCPIPV6_EX 0x400 /* Hash on TCP + IPv6 ext. fields */ +#define VENETF_LOOP 0x800 /* Loopback enabled */ +#define VENETF_VMQ 0x4000 /* using VMQ flag for VMware NETQ */ +#define VENETF_VXLAN 0x10000 /* VxLAN offload */ +#define VENETF_NVGRE 0x20000 /* NVGRE offload */ +#define VENET_INTR_TYPE_MIN 0 /* Timer specs min interrupt spacing */ +#define VENET_INTR_TYPE_IDLE 1 /* Timer specs idle time before irq */ + +#define VENET_INTR_MODE_ANY 0 /* Try MSI-X, then MSI, then INTx */ +#define VENET_INTR_MODE_MSI 1 /* Try MSI then INTx */ +#define VENET_INTR_MODE_INTX 2 /* Try INTx only */ + +#endif /* _VNIC_ENIC_H_ */ diff --git a/prov/usnic/src/usnic_direct/vnic_intr.c b/prov/usnic/src/usnic_direct/vnic_intr.c new file mode 100644 index 00000000000..51f98b327f3 --- /dev/null +++ b/prov/usnic/src/usnic_direct/vnic_intr.c @@ -0,0 +1,123 @@ +/* + * Copyright 2008-2018 Cisco Systems, Inc. All rights reserved. + * Copyright 2007 Nuova Systems, Inc. All rights reserved. + * + * LICENSE_BEGIN + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * LICENSE_END + * + * + */ + +#include +#include +#include +#include +#include + +#include "kcompat.h" +#include "vnic_dev.h" +#include "vnic_intr.h" + +EXPORT_SYMBOL(vnic_intr_free); +void vnic_intr_free(struct vnic_intr *intr) +{ + intr->ctrl = NULL; +} + +EXPORT_SYMBOL(vnic_intr_alloc); +int vnic_intr_alloc(struct vnic_dev *vdev, struct vnic_intr *intr, + unsigned int index) +{ + intr->index = index; + intr->vdev = vdev; + + intr->ctrl = vnic_dev_get_res(vdev, RES_TYPE_INTR_CTRL, index); + if (!intr->ctrl) { + pr_err("Failed to hook INTR[%d].ctrl resource\n", index); + return -EINVAL; + } + + return 0; +} + +EXPORT_SYMBOL(vnic_intr_init); +void vnic_intr_init(struct vnic_intr *intr, u32 coalescing_timer, + unsigned int coalescing_type, unsigned int mask_on_assertion) +{ + vnic_intr_coalescing_timer_set(intr, coalescing_timer); + iowrite32(coalescing_type, &intr->ctrl->coalescing_type); + iowrite32(mask_on_assertion, &intr->ctrl->mask_on_assertion); + iowrite32(0, &intr->ctrl->int_credits); +} + +void vnic_intr_coalescing_timer_set(struct vnic_intr *intr, + u32 coalescing_timer) +{ + iowrite32(vnic_dev_intr_coal_timer_usec_to_hw(intr->vdev, + coalescing_timer), &intr->ctrl->coalescing_timer); +} + +void vnic_intr_clean(struct vnic_intr *intr) +{ + iowrite32(0, &intr->ctrl->int_credits); +} + +void vnic_intr_raise(struct vnic_intr *intr) +{ + vnic_dev_raise_intr(intr->vdev, (u16)intr->index); +} + +EXPORT_SYMBOL(vnic_grpmbrintr_free); +void vnic_grpmbrintr_free(struct vnic_intr *intr) +{ + intr->ctrl = NULL; +} + +EXPORT_SYMBOL(vnic_grpmbrintr_alloc); +int vnic_grpmbrintr_alloc(struct vnic_dev *vdev, struct vnic_intr *intr, + unsigned int index) +{ + intr->index = index; + intr->vdev = vdev; + + intr->ctrl = vnic_dev_get_res(vdev, RES_TYPE_GRPMBR_INTR, index); + if (!intr->ctrl) { + pr_err("Failed to hook INTR[%d].ctrl resource\n", index); + return -EINVAL; + } + + return 0; +} diff --git a/prov/usnic/src/usnic_direct/vnic_intr.h b/prov/usnic/src/usnic_direct/vnic_intr.h new file mode 100644 index 00000000000..cc8152134e4 --- /dev/null +++ b/prov/usnic/src/usnic_direct/vnic_intr.h @@ -0,0 +1,140 @@ +/* + * Copyright 2008-2018 Cisco Systems, Inc. All rights reserved. + * Copyright 2007 Nuova Systems, Inc. All rights reserved. + * + * LICENSE_BEGIN + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * LICENSE_END + * + * + */ + +#ifndef _VNIC_INTR_H_ +#define _VNIC_INTR_H_ + +#include + +#include "vnic_dev.h" + +#define VNIC_INTR_TIMER_TYPE_ABS 0 +#define VNIC_INTR_TIMER_TYPE_QUIET 1 + +/* Interrupt control */ +struct vnic_intr_ctrl { + u32 coalescing_timer; /* 0x00 */ + u32 pad0; + u32 coalescing_value; /* 0x08 */ + u32 pad1; + u32 coalescing_type; /* 0x10 */ + u32 pad2; + u32 mask_on_assertion; /* 0x18 */ + u32 pad3; + u32 mask; /* 0x20 */ + u32 pad4; + u32 int_credits; /* 0x28 */ + u32 pad5; + u32 int_credit_return; /* 0x30 */ + u32 pad6; +}; + +struct vnic_intr { + unsigned int index; + struct vnic_dev *vdev; + struct vnic_intr_ctrl __iomem *ctrl; /* memory-mapped */ +}; + +static inline void vnic_intr_unmask(struct vnic_intr *intr) +{ + iowrite32(0, &intr->ctrl->mask); +} + +static inline void vnic_intr_mask(struct vnic_intr *intr) +{ + iowrite32(1, &intr->ctrl->mask); +} + +static inline int vnic_intr_masked(struct vnic_intr *intr) +{ + return ioread32(&intr->ctrl->mask); +} + +static inline void vnic_intr_return_credits(struct vnic_intr *intr, + unsigned int credits, int unmask, int reset_timer) +{ +#define VNIC_INTR_UNMASK_SHIFT 16 +#define VNIC_INTR_RESET_TIMER_SHIFT 17 + + u32 int_credit_return = (credits & 0xffff) | + (unmask ? (1 << VNIC_INTR_UNMASK_SHIFT) : 0) | + (reset_timer ? 
(1 << VNIC_INTR_RESET_TIMER_SHIFT) : 0); + + iowrite32(int_credit_return, &intr->ctrl->int_credit_return); +} + +static inline unsigned int vnic_intr_credits(struct vnic_intr *intr) +{ + return ioread32(&intr->ctrl->int_credits); +} + +static inline void vnic_intr_return_all_credits(struct vnic_intr *intr) +{ + unsigned int credits = vnic_intr_credits(intr); + int unmask = 1; + int reset_timer = 1; + + vnic_intr_return_credits(intr, credits, unmask, reset_timer); +} + +static inline u32 vnic_intr_legacy_pba(u32 __iomem *legacy_pba) +{ + /* read PBA without clearing */ + return ioread32(legacy_pba); +} + +void vnic_intr_free(struct vnic_intr *intr); +int vnic_intr_alloc(struct vnic_dev *vdev, struct vnic_intr *intr, + unsigned int index); +void vnic_intr_init(struct vnic_intr *intr, u32 coalescing_timer, + unsigned int coalescing_type, unsigned int mask_on_assertion); +void vnic_intr_coalescing_timer_set(struct vnic_intr *intr, + u32 coalescing_timer); +void vnic_intr_clean(struct vnic_intr *intr); + +void vnic_grpmbrintr_free(struct vnic_intr *intr); +int vnic_grpmbrintr_alloc(struct vnic_dev *vdev, struct vnic_intr *intr, + unsigned int index); + + +#endif /* _VNIC_INTR_H_ */ diff --git a/prov/usnic/src/usnic_direct/vnic_resource.h b/prov/usnic/src/usnic_direct/vnic_resource.h new file mode 100644 index 00000000000..ebfe3cc329d --- /dev/null +++ b/prov/usnic/src/usnic_direct/vnic_resource.h @@ -0,0 +1,119 @@ +/* +* Copyright 2008-2010 Cisco Systems, Inc. All rights reserved. + * Copyright 2007 Nuova Systems, Inc. All rights reserved. + * + * LICENSE_BEGIN + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * LICENSE_END + * + * + */ + +#ifndef _VNIC_RESOURCE_H_ +#define _VNIC_RESOURCE_H_ + +#define VNIC_RES_MAGIC 0x766E6963L /* 'vnic' */ +#define VNIC_RES_VERSION 0x00000000L +#define MGMTVNIC_MAGIC 0x544d474dL /* 'MGMT' */ +#define MGMTVNIC_VERSION 0x00000000L + +/* The MAC address assigned to the CFG vNIC is fixed. 
*/ +#define MGMTVNIC_MAC { 0x02, 0x00, 0x54, 0x4d, 0x47, 0x4d } + +/* vNIC resource types */ +enum vnic_res_type { + RES_TYPE_EOL, /* End-of-list */ + RES_TYPE_WQ, /* Work queues */ + RES_TYPE_RQ, /* Receive queues */ + RES_TYPE_CQ, /* Completion queues */ + RES_TYPE_MEM, /* Window to dev memory */ + RES_TYPE_NIC_CFG, /* Enet NIC config registers */ + RES_TYPE_RSS_KEY, /* Enet RSS secret key */ + RES_TYPE_RSS_CPU, /* Enet RSS indirection table */ + RES_TYPE_TX_STATS, /* Netblock Tx statistic regs */ + RES_TYPE_RX_STATS, /* Netblock Rx statistic regs */ + RES_TYPE_INTR_CTRL, /* Interrupt ctrl table */ + RES_TYPE_INTR_TABLE, /* MSI/MSI-X Interrupt table */ + RES_TYPE_INTR_PBA, /* MSI/MSI-X PBA table */ + RES_TYPE_INTR_PBA_LEGACY, /* Legacy intr status */ + RES_TYPE_DEBUG, /* Debug-only info */ + RES_TYPE_DEV, /* Device-specific region */ + RES_TYPE_DEVCMD, /* Device command region */ + RES_TYPE_PASS_THRU_PAGE, /* Pass-thru page */ + RES_TYPE_SUBVNIC, /* subvnic resource type */ + RES_TYPE_MQ_WQ, /* MQ Work queues */ + RES_TYPE_MQ_RQ, /* MQ Receive queues */ + RES_TYPE_MQ_CQ, /* MQ Completion queues */ + RES_TYPE_DEPRECATED1, /* Old version of devcmd 2 */ + RES_TYPE_DEPRECATED2, /* Old version of devcmd 2 */ + RES_TYPE_DEVCMD2, /* Device control region */ + RES_TYPE_RDMA_WQ, /* RDMA WQ */ + RES_TYPE_RDMA_RQ, /* RDMA RQ */ + RES_TYPE_RDMA_CQ, /* RDMA CQ */ + RES_TYPE_RDMA_RKEY_TABLE, /* RDMA RKEY table */ + RES_TYPE_RDMA_RQ_HEADER_TABLE, /* RDMA RQ Header Table */ + RES_TYPE_RDMA_RQ_TABLE, /* RDMA RQ Table */ + RES_TYPE_RDMA_RD_RESP_HEADER_TABLE, /* RDMA Read Response Header Table */ + RES_TYPE_RDMA_RD_RESP_TABLE, /* RDMA Read Response Table */ + RES_TYPE_RDMA_QP_STATS_TABLE, /* RDMA per QP stats table */ + RES_TYPE_WQ_MREGS, /* XXX snic proto only */ + RES_TYPE_GRPMBR_INTR, /* Group member interrupt control */ + RES_TYPE_DPKT, /* Direct Packet memory region */ + + RES_TYPE_MAX, /* Count of resource types */ +}; + +struct vnic_resource_header { + u32 magic; + u32 version; +}; + +struct mgmt_barmap_hdr { + u32 magic; /* magic number */ + u32 version; /* header format version */ + u16 lif; /* loopback lif for mgmt frames */ + u16 pci_slot; /* installed pci slot */ + char serial[16]; /* card serial number */ +}; + +struct vnic_resource { + u8 type; + u8 bar; + u8 pad[2]; + u32 bar_offset; + u32 count; +}; + +#endif /* _VNIC_RESOURCE_H_ */ diff --git a/prov/usnic/src/usnic_direct/vnic_rq.c b/prov/usnic/src/usnic_direct/vnic_rq.c new file mode 100644 index 00000000000..7e8624d351b --- /dev/null +++ b/prov/usnic/src/usnic_direct/vnic_rq.c @@ -0,0 +1,272 @@ +/* + * Copyright 2008-2018 Cisco Systems, Inc. All rights reserved. + * Copyright 2007 Nuova Systems, Inc. All rights reserved. + * + * LICENSE_BEGIN + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * LICENSE_END + * + * + */ + +#include +#include +#include +#include +#ifdef __KERNEL__ +#include +#include +#endif + +#include "kcompat.h" +#include "vnic_dev.h" +#include "vnic_rq.h" + +static int vnic_rq_alloc_bufs(struct vnic_rq *rq) +{ + struct vnic_rq_buf *buf; + unsigned int i, j, count = rq->ring.desc_count; + unsigned int blks = VNIC_RQ_BUF_BLKS_NEEDED(count); + + for (i = 0; i < blks; i++) { + rq->bufs[i] = kzalloc(VNIC_RQ_BUF_BLK_SZ(count), GFP_ATOMIC); + if (!rq->bufs[i]) + return -ENOMEM; + } + + for (i = 0; i < blks; i++) { + buf = rq->bufs[i]; + for (j = 0; j < VNIC_RQ_BUF_BLK_ENTRIES(count); j++) { + buf->index = i * VNIC_RQ_BUF_BLK_ENTRIES(count) + j; + buf->desc = (u8 *)rq->ring.descs + + rq->ring.desc_size * buf->index; + if (buf->index + 1 == count) { + buf->next = rq->bufs[0]; + break; + } else if (j + 1 == VNIC_RQ_BUF_BLK_ENTRIES(count)) { + buf->next = rq->bufs[i + 1]; + } else { + buf->next = buf + 1; + buf++; + } + } + } + + rq->to_use = rq->to_clean = rq->bufs[0]; + + return 0; +} + +#ifndef NOT_FOR_OPEN_ENIC +int vnic_rq_mem_size(struct vnic_rq *rq, unsigned int desc_count, + unsigned int desc_size) +{ + int mem_size = 0; + + mem_size += vnic_dev_desc_ring_size(&rq->ring, desc_count, desc_size); + + mem_size += VNIC_RQ_BUF_BLKS_NEEDED(rq->ring.desc_count) * + VNIC_RQ_BUF_BLK_SZ(rq->ring.desc_count); + + return mem_size; +} + +#endif +void vnic_rq_free(struct vnic_rq *rq) +{ + struct vnic_dev *vdev; + unsigned int i; + + vdev = rq->vdev; + + vnic_dev_free_desc_ring(vdev, &rq->ring); + + for (i = 0; i < VNIC_RQ_BUF_BLKS_MAX; i++) { + if (rq->bufs[i]) { + kfree(rq->bufs[i]); + rq->bufs[i] = NULL; + } + } + + rq->ctrl = NULL; +} + +int vnic_rq_alloc(struct vnic_dev *vdev, struct vnic_rq *rq, unsigned int index, + unsigned int desc_count, unsigned int desc_size) +{ + int err; + + rq->index = index; + rq->vdev = vdev; + + rq->ctrl = vnic_dev_get_res(vdev, RES_TYPE_RQ, index); + if (!rq->ctrl) { + pr_err("Failed to hook RQ[%d] resource\n", index); + return -EINVAL; + } + + vnic_rq_disable(rq); + + err = vnic_dev_alloc_desc_ring(vdev, &rq->ring, desc_count, desc_size); + if (err) + return err; + + err = vnic_rq_alloc_bufs(rq); + if (err) { + vnic_rq_free(rq); + return err; + } + + return 0; +} + +static void vnic_rq_init_start(struct vnic_rq *rq, unsigned int cq_index, + unsigned int fetch_index, unsigned int posted_index, + unsigned int error_interrupt_enable, + unsigned int error_interrupt_offset) +{ + u64 paddr; + unsigned int count = 
rq->ring.desc_count; + + paddr = (u64)rq->ring.base_addr | VNIC_PADDR_TARGET; + writeq(paddr, &rq->ctrl->ring_base); + iowrite32(count, &rq->ctrl->ring_size); + iowrite32(cq_index, &rq->ctrl->cq_index); + iowrite32(error_interrupt_enable, &rq->ctrl->error_interrupt_enable); + iowrite32(error_interrupt_offset, &rq->ctrl->error_interrupt_offset); + iowrite32(0, &rq->ctrl->data_ring); + iowrite32(0, &rq->ctrl->header_split); + iowrite32(0, &rq->ctrl->error_status); + iowrite32(fetch_index, &rq->ctrl->fetch_index); + iowrite32(posted_index, &rq->ctrl->posted_index); + + rq->to_use = rq->to_clean = + &rq->bufs[fetch_index / VNIC_RQ_BUF_BLK_ENTRIES(count)] + [fetch_index % VNIC_RQ_BUF_BLK_ENTRIES(count)]; +} + +void vnic_rq_init(struct vnic_rq *rq, unsigned int cq_index, + unsigned int error_interrupt_enable, + unsigned int error_interrupt_offset) +{ + vnic_rq_init_start(rq, cq_index, 0, 0, error_interrupt_enable, + error_interrupt_offset); +} + +void vnic_rq_error_out(struct vnic_rq *rq, unsigned int error) +{ + iowrite32(error, &rq->ctrl->error_status); +} + +unsigned int vnic_rq_error_status(struct vnic_rq *rq) +{ + return vnic_rq_ctrl_error_status(rq->ctrl); +} + +EXPORT_SYMBOL(vnic_rq_ctrl_error_status); +unsigned int vnic_rq_ctrl_error_status(struct vnic_rq_ctrl *ctrl) +{ + return ioread32(&ctrl->error_status); +} + +void vnic_rq_enable(struct vnic_rq *rq) +{ + iowrite32(1, &rq->ctrl->enable); +} + +int vnic_rq_disable(struct vnic_rq *rq) +{ + unsigned int wait; + int i; + + /* + * Due to a race condition with clearing RQ "mini-cache", we need to + * disable the RQ twice to guarantee that stale descriptors are not + * used when this RQ is re-enabled. + */ + for (i = 0; i < 2; ++i) { + iowrite32(0, &rq->ctrl->enable); + + /* Wait for HW to ACK disable request */ + for (wait = 20000; wait > 0; wait--) { + if (ioread32(&rq->ctrl->running) == 0) + break; + } + + if (wait == 0) { + pr_err("Failed to disable RQ[%d]\n", rq->index); + return -ETIMEDOUT; + } + } + return 0; +} + +void vnic_rq_clean(struct vnic_rq *rq, + void (*buf_clean)(struct vnic_rq *rq, struct vnic_rq_buf *buf)) +{ + struct vnic_rq_buf *buf; + u32 fetch_index; + unsigned int count = rq->ring.desc_count; + size_t i; + + buf = rq->to_clean; + + for (i = 0; i < rq->ring.desc_count; i++) { + (*buf_clean)(rq, buf); + buf = buf->next; + } + rq->ring.desc_avail = rq->ring.desc_count - 1; + + /* Use current fetch_index as the ring starting point */ + fetch_index = ioread32(&rq->ctrl->fetch_index); + + if (fetch_index == 0xFFFFFFFF) { /* check for hardware gone */ + /* Hardware surprise removal: reset fetch_index */ + fetch_index = 0; + } + rq->to_use = rq->to_clean = + &rq->bufs[fetch_index / VNIC_RQ_BUF_BLK_ENTRIES(count)] + [fetch_index % VNIC_RQ_BUF_BLK_ENTRIES(count)]; + iowrite32(fetch_index, &rq->ctrl->posted_index); + + /* + * Anytime we write fetch_index, we need to re-write 0 to RQ.enable + * to re-sync internal VIC state on Sereno. + */ + iowrite32(0, &rq->ctrl->enable); + + vnic_dev_clear_desc_ring(&rq->ring); +} + diff --git a/prov/usnic/src/usnic_direct/vnic_rq.h b/prov/usnic/src/usnic_direct/vnic_rq.h new file mode 100644 index 00000000000..0625760ae07 --- /dev/null +++ b/prov/usnic/src/usnic_direct/vnic_rq.h @@ -0,0 +1,296 @@ +/* + * Copyright 2008-2018 Cisco Systems, Inc. All rights reserved. + * Copyright 2007 Nuova Systems, Inc. All rights reserved. + * + * LICENSE_BEGIN + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * LICENSE_END + * + * + */ + +#ifndef _VNIC_RQ_H_ +#define _VNIC_RQ_H_ + +#include "vnic_dev.h" +#include "vnic_cq.h" + +/* Receive queue control */ +struct vnic_rq_ctrl { + u64 ring_base; /* 0x00 */ + u32 ring_size; /* 0x08 */ + u32 pad0; + u32 posted_index; /* 0x10 */ + u32 pad1; + u32 cq_index; /* 0x18 */ + u32 pad2; + u32 enable; /* 0x20 */ + u32 pad3; + u32 running; /* 0x28 */ + u32 pad4; + u32 fetch_index; /* 0x30 */ + u32 pad5; + u32 error_interrupt_enable; /* 0x38 */ + u32 pad6; + u32 error_interrupt_offset; /* 0x40 */ + u32 pad7; + u32 error_status; /* 0x48 */ + u32 pad8; + u32 tcp_sn; /* 0x50 */ + u32 pad9[3]; + u32 dca_select; /* 0x60 */ + u32 pad10[3]; + u32 data_ring; /* 0x70 */ + u32 pad11; + u32 header_split; /* 0x78 */ + u32 pad12; +}; + +/* Break the vnic_rq_buf allocations into blocks of 32/64 entries */ +#define VNIC_RQ_BUF_MIN_BLK_ENTRIES 32 +#define VNIC_RQ_BUF_DFLT_BLK_ENTRIES 64 +#define VNIC_RQ_BUF_BLK_ENTRIES(entries) \ + ((unsigned int)((entries < VNIC_RQ_BUF_DFLT_BLK_ENTRIES) ? 
\ + VNIC_RQ_BUF_MIN_BLK_ENTRIES : VNIC_RQ_BUF_DFLT_BLK_ENTRIES)) +#define VNIC_RQ_BUF_BLK_SZ(entries) \ + (VNIC_RQ_BUF_BLK_ENTRIES(entries) * sizeof(struct vnic_rq_buf)) +#define VNIC_RQ_BUF_BLKS_NEEDED(entries) \ + DIV_ROUND_UP(entries, VNIC_RQ_BUF_BLK_ENTRIES(entries)) +#define VNIC_RQ_BUF_BLKS_MAX VNIC_RQ_BUF_BLKS_NEEDED(4096) + +struct vnic_rq_buf { + struct vnic_rq_buf *next; + dma_addr_t dma_addr; + void *os_buf; + unsigned int os_buf_index; + unsigned int len; + unsigned int index; + void *desc; + uint64_t wr_id; +}; + +struct vnic_rq { + unsigned int index; + struct vnic_dev *vdev; + struct vnic_rq_ctrl __iomem *ctrl; /* memory-mapped */ + struct vnic_dev_ring ring; + struct vnic_rq_buf *bufs[VNIC_RQ_BUF_BLKS_MAX]; + struct vnic_rq_buf *to_use; + struct vnic_rq_buf *to_clean; + void *os_buf_head; + unsigned int pkts_outstanding; +#if defined(__LIBUSNIC__) + uint32_t qp_num; +#endif + +#ifdef ENIC_BUSY_POLL + atomic_t bpoll_state; +#endif /*ENIC_BUSY_POLL*/ +}; + +static inline unsigned int vnic_rq_desc_avail(struct vnic_rq *rq) +{ + /* how many does SW own? */ + return rq->ring.desc_avail; +} + +static inline unsigned int vnic_rq_desc_used(struct vnic_rq *rq) +{ + /* how many does HW own? */ + return rq->ring.desc_count - rq->ring.desc_avail - 1; +} + +static inline void *vnic_rq_next_desc(struct vnic_rq *rq) +{ + return rq->to_use->desc; +} + +static inline unsigned int vnic_rq_next_index(struct vnic_rq *rq) +{ + return rq->to_use->index; +} + +static inline void vnic_rq_post(struct vnic_rq *rq, + void *os_buf, unsigned int os_buf_index, + dma_addr_t dma_addr, unsigned int len, + uint64_t wrid) +{ + struct vnic_rq_buf *buf = rq->to_use; + + buf->os_buf = os_buf; + buf->os_buf_index = os_buf_index; + buf->dma_addr = dma_addr; + buf->len = len; + buf->wr_id = wrid; + + buf = buf->next; + rq->to_use = buf; + rq->ring.desc_avail--; + + /* Move the posted_index every nth descriptor + */ +#if defined(__LIBUSNIC__) +#define VNIC_RQ_RETURN_RATE 0x0 +#endif + +#ifndef VNIC_RQ_RETURN_RATE +#define VNIC_RQ_RETURN_RATE 0xf /* keep 2^n - 1 */ +#endif + + if ((buf->index & VNIC_RQ_RETURN_RATE) == 0) { + /* Adding write memory barrier prevents compiler and/or CPU + * reordering, thus avoiding descriptor posting before + * descriptor is initialized. Otherwise, hardware can read + * stale descriptor fields. + */ + wmb(); + iowrite32(buf->index, &rq->ctrl->posted_index); + } +} + +static inline void vnic_rq_post_commit(struct vnic_rq *rq, + void *os_buf, unsigned int os_buf_index, + dma_addr_t dma_addr, unsigned int len) +{ + struct vnic_rq_buf *buf = rq->to_use; + + buf->os_buf = os_buf; + buf->os_buf_index = os_buf_index; + buf->dma_addr = dma_addr; + buf->len = len; + + buf = buf->next; + rq->to_use = buf; + rq->ring.desc_avail--; + + /* Move the posted_index every descriptor + */ + + /* Adding write memory barrier prevents compiler and/or CPU + * reordering, thus avoiding descriptor posting before + * descriptor is initialized. Otherwise, hardware can read + * stale descriptor fields. 
+ */ + wmb(); + iowrite32(buf->index, &rq->ctrl->posted_index); +} + +static inline void vnic_rq_return_descs(struct vnic_rq *rq, unsigned int count) +{ + rq->ring.desc_avail += count; +} + +enum desc_return_options { + VNIC_RQ_RETURN_DESC, + VNIC_RQ_DEFER_RETURN_DESC, +}; + +static inline void vnic_rq_service(struct vnic_rq *rq, + struct cq_desc *cq_desc, u16 completed_index, + int desc_return, void (*buf_service)(struct vnic_rq *rq, + struct cq_desc *cq_desc, struct vnic_rq_buf *buf, + int skipped, void *opaque), void *opaque) +{ + struct vnic_rq_buf *buf; + int skipped; + + buf = rq->to_clean; + while (1) { + + skipped = (buf->index != completed_index); + + (*buf_service)(rq, cq_desc, buf, skipped, opaque); + + if (desc_return == VNIC_RQ_RETURN_DESC) + rq->ring.desc_avail++; + + rq->to_clean = buf->next; + + if (!skipped) + break; + + buf = rq->to_clean; + } +} + +static inline int vnic_rq_fill(struct vnic_rq *rq, + int (*buf_fill)(struct vnic_rq *rq)) +{ + int err; + + while (vnic_rq_desc_avail(rq) > 0) { + + err = (*buf_fill)(rq); + if (err) + return err; + } + + return 0; +} + +static inline int vnic_rq_fill_count(struct vnic_rq *rq, + int (*buf_fill)(struct vnic_rq *rq), unsigned int count) +{ + int err; + + while ((vnic_rq_desc_avail(rq) > 0) && (count--)) { + + err = (*buf_fill)(rq); + if (err) + return err; + } + + return 0; +} + +void vnic_rq_free(struct vnic_rq *rq); +int vnic_rq_alloc(struct vnic_dev *vdev, struct vnic_rq *rq, unsigned int index, + unsigned int desc_count, unsigned int desc_size); +void vnic_rq_init(struct vnic_rq *rq, unsigned int cq_index, + unsigned int error_interrupt_enable, + unsigned int error_interrupt_offset); +void vnic_rq_error_out(struct vnic_rq *rq, unsigned int error); +unsigned int vnic_rq_error_status(struct vnic_rq *rq); +unsigned int vnic_rq_ctrl_error_status(struct vnic_rq_ctrl *ctrl); +void vnic_rq_enable(struct vnic_rq *rq); +int vnic_rq_disable(struct vnic_rq *rq); +void vnic_rq_clean(struct vnic_rq *rq, + void (*buf_clean)(struct vnic_rq *rq, struct vnic_rq_buf *buf)); +#ifndef NOT_FOR_OPEN_ENIC +int vnic_rq_mem_size(struct vnic_rq *rq, unsigned int desc_count, + unsigned int desc_size); +#endif + +#endif /* _VNIC_RQ_H_ */ diff --git a/prov/usnic/src/usnic_direct/vnic_stats.h b/prov/usnic/src/usnic_direct/vnic_stats.h new file mode 100644 index 00000000000..c42074b058d --- /dev/null +++ b/prov/usnic/src/usnic_direct/vnic_stats.h @@ -0,0 +1,99 @@ +/* + * Copyright 2008-2018 Cisco Systems, Inc. All rights reserved. + * Copyright 2007 Nuova Systems, Inc. All rights reserved. + * + * LICENSE_BEGIN + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * LICENSE_END + * + * + */ + +#ifndef _VNIC_STATS_H_ +#define _VNIC_STATS_H_ + +/* Tx statistics */ +struct vnic_tx_stats { + u64 tx_frames_ok; + u64 tx_unicast_frames_ok; + u64 tx_multicast_frames_ok; + u64 tx_broadcast_frames_ok; + u64 tx_bytes_ok; + u64 tx_unicast_bytes_ok; + u64 tx_multicast_bytes_ok; + u64 tx_broadcast_bytes_ok; + u64 tx_drops; + u64 tx_errors; + u64 tx_tso; + u64 rsvd[16]; +}; + +/* Rx statistics */ +struct vnic_rx_stats { + u64 rx_frames_ok; + u64 rx_frames_total; + u64 rx_unicast_frames_ok; + u64 rx_multicast_frames_ok; + u64 rx_broadcast_frames_ok; + u64 rx_bytes_ok; + u64 rx_unicast_bytes_ok; + u64 rx_multicast_bytes_ok; + u64 rx_broadcast_bytes_ok; + u64 rx_drop; + u64 rx_no_bufs; + u64 rx_errors; + u64 rx_rss; + u64 rx_crc_errors; + u64 rx_frames_64; + u64 rx_frames_127; + u64 rx_frames_255; + u64 rx_frames_511; + u64 rx_frames_1023; + u64 rx_frames_1518; + u64 rx_frames_to_max; + u64 rsvd[16]; +}; + +/* Generic statistics */ +struct vnic_gen_stats { + u64 dma_map_error; +}; + +struct vnic_stats { + struct vnic_tx_stats tx; + struct vnic_rx_stats rx; +}; + +#endif /* _VNIC_STATS_H_ */ diff --git a/prov/usnic/src/usnic_direct/vnic_wq.c b/prov/usnic/src/usnic_direct/vnic_wq.c new file mode 100644 index 00000000000..5711dff8b07 --- /dev/null +++ b/prov/usnic/src/usnic_direct/vnic_wq.c @@ -0,0 +1,288 @@ +/* + * Copyright 2008-2018 Cisco Systems, Inc. All rights reserved. + * Copyright 2007 Nuova Systems, Inc. All rights reserved. + * + * LICENSE_BEGIN + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * LICENSE_END + * + * + */ + +#include +#include +#include +#include +#include +#include + +#include "kcompat.h" +#include "vnic_dev.h" +#include "vnic_wq.h" + +static inline +int vnic_wq_get_ctrl(struct vnic_dev *vdev, struct vnic_wq *wq, + unsigned int index, enum vnic_res_type res_type) +{ + wq->ctrl = vnic_dev_get_res(vdev, res_type, index); + if (!wq->ctrl) + return -EINVAL; + return 0; +} + +static inline +int vnic_wq_alloc_ring(struct vnic_dev *vdev, struct vnic_wq *wq, + unsigned int desc_count, unsigned int desc_size) +{ + return vnic_dev_alloc_desc_ring(vdev, &wq->ring, desc_count, desc_size); +} + +static int vnic_wq_alloc_bufs(struct vnic_wq *wq) +{ + struct vnic_wq_buf *buf; + unsigned int i, j, count = wq->ring.desc_count; + unsigned int blks = VNIC_WQ_BUF_BLKS_NEEDED(count); + + for (i = 0; i < blks; i++) { + wq->bufs[i] = kzalloc(VNIC_WQ_BUF_BLK_SZ(count), GFP_ATOMIC); + if (!wq->bufs[i]) + return -ENOMEM; + } + + for (i = 0; i < blks; i++) { + buf = wq->bufs[i]; + for (j = 0; j < VNIC_WQ_BUF_BLK_ENTRIES(count); j++) { + buf->index = i * VNIC_WQ_BUF_BLK_ENTRIES(count) + j; + buf->desc = (u8 *)wq->ring.descs + + wq->ring.desc_size * buf->index; + if (buf->index + 1 == count) { + buf->next = wq->bufs[0]; + buf->next->prev = buf; + break; + } else if (j + 1 == VNIC_WQ_BUF_BLK_ENTRIES(count)) { + buf->next = wq->bufs[i + 1]; + buf->next->prev = buf; + } else { + buf->next = buf + 1; + buf->next->prev = buf; + buf++; + } + } + } + + wq->to_use = wq->to_clean = wq->bufs[0]; + + return 0; +} + +void vnic_wq_free(struct vnic_wq *wq) +{ + struct vnic_dev *vdev; + unsigned int i; + + vdev = wq->vdev; + + vnic_dev_free_desc_ring(vdev, &wq->ring); + + for (i = 0; i < VNIC_WQ_BUF_BLKS_MAX; i++) { + if (wq->bufs[i]) { + kfree(wq->bufs[i]); + wq->bufs[i] = NULL; + } + } + + wq->ctrl = NULL; +} + +#ifndef NOT_FOR_OPEN_ENIC +int vnic_wq_mem_size(struct vnic_wq *wq, unsigned int desc_count, + unsigned int desc_size) +{ + int mem_size = 0; + + mem_size += vnic_dev_desc_ring_size(&wq->ring, desc_count, desc_size); + + mem_size += VNIC_WQ_BUF_BLKS_NEEDED(wq->ring.desc_count) * + VNIC_WQ_BUF_BLK_SZ(wq->ring.desc_count); + + return mem_size; +} + +#endif + +int vnic_wq_alloc(struct vnic_dev *vdev, struct vnic_wq *wq, unsigned int index, + unsigned int desc_count, unsigned int desc_size) +{ + int err; + + wq->index = index; + wq->vdev = vdev; + + err = vnic_wq_get_ctrl(vdev, wq, index, RES_TYPE_WQ); + if (err) { + pr_err("Failed to hook WQ[%d] resource, err %d\n", index, err); + return err; + } + + vnic_wq_disable(wq); + + err = vnic_wq_alloc_ring(vdev, wq, desc_count, desc_size); + if (err) + return err; + + err = vnic_wq_alloc_bufs(wq); + if (err) { + vnic_wq_free(wq); + return err; + } + + return 0; +} + +int vnic_wq_devcmd2_alloc(struct vnic_dev *vdev, struct vnic_wq *wq, + unsigned int desc_count, unsigned int desc_size) +{ + int err; + + wq->index = 0; + wq->vdev = vdev; + + err = vnic_wq_get_ctrl(vdev, wq, 0, RES_TYPE_DEVCMD2); + if (err) { + 
pr_err("Failed to get devcmd2 resource\n"); + return err; + } + vnic_wq_disable(wq); + + err = vnic_wq_alloc_ring(vdev, wq, desc_count, desc_size); + if (err) + return err; + return 0; +} + +void vnic_wq_init_start(struct vnic_wq *wq, unsigned int cq_index, + unsigned int fetch_index, unsigned int posted_index, + unsigned int error_interrupt_enable, + unsigned int error_interrupt_offset) +{ + u64 paddr; + unsigned int count = wq->ring.desc_count; + + paddr = (u64)wq->ring.base_addr | VNIC_PADDR_TARGET; + writeq(paddr, &wq->ctrl->ring_base); + iowrite32(count, &wq->ctrl->ring_size); + iowrite32(fetch_index, &wq->ctrl->fetch_index); + iowrite32(posted_index, &wq->ctrl->posted_index); + iowrite32(cq_index, &wq->ctrl->cq_index); + iowrite32(error_interrupt_enable, &wq->ctrl->error_interrupt_enable); + iowrite32(error_interrupt_offset, &wq->ctrl->error_interrupt_offset); + iowrite32(0, &wq->ctrl->error_status); + + wq->to_use = wq->to_clean = + &wq->bufs[fetch_index / VNIC_WQ_BUF_BLK_ENTRIES(count)] + [fetch_index % VNIC_WQ_BUF_BLK_ENTRIES(count)]; +} + +void vnic_wq_init(struct vnic_wq *wq, unsigned int cq_index, + unsigned int error_interrupt_enable, + unsigned int error_interrupt_offset) +{ + vnic_wq_init_start(wq, cq_index, 0, 0, + error_interrupt_enable, + error_interrupt_offset); +} + +void vnic_wq_error_out(struct vnic_wq *wq, unsigned int error) +{ + iowrite32(error, &wq->ctrl->error_status); +} + +unsigned int vnic_wq_error_status(struct vnic_wq *wq) +{ + return vnic_wq_ctrl_error_status(wq->ctrl); +} + +EXPORT_SYMBOL(vnic_wq_ctrl_error_status); +unsigned int vnic_wq_ctrl_error_status(struct vnic_wq_ctrl *ctrl) +{ + return ioread32(&ctrl->error_status); +} + +void vnic_wq_enable(struct vnic_wq *wq) +{ + iowrite32(1, &wq->ctrl->enable); +} + +int vnic_wq_disable(struct vnic_wq *wq) +{ + unsigned int wait; + + iowrite32(0, &wq->ctrl->enable); + + /* Wait for HW to ACK disable request */ + for (wait = 0; wait < 1000; wait++) { + if (!(ioread32(&wq->ctrl->running))) + return 0; + udelay(10); + } + + pr_err("Failed to disable WQ[%d]\n", wq->index); + + return -ETIMEDOUT; +} + +void vnic_wq_clean(struct vnic_wq *wq, + void (*buf_clean)(struct vnic_wq *wq, struct vnic_wq_buf *buf)) +{ + struct vnic_wq_buf *buf; + + buf = wq->to_clean; + + while (vnic_wq_desc_used(wq) > 0) { + + (*buf_clean)(wq, buf); + + buf = wq->to_clean = buf->next; + wq->ring.desc_avail++; + } + + wq->to_use = wq->to_clean = wq->bufs[0]; + + iowrite32(0, &wq->ctrl->fetch_index); + iowrite32(0, &wq->ctrl->posted_index); + iowrite32(0, &wq->ctrl->error_status); + + vnic_dev_clear_desc_ring(&wq->ring); +} diff --git a/prov/usnic/src/usnic_direct/vnic_wq.h b/prov/usnic/src/usnic_direct/vnic_wq.h new file mode 100644 index 00000000000..c979a7d9404 --- /dev/null +++ b/prov/usnic/src/usnic_direct/vnic_wq.h @@ -0,0 +1,302 @@ +/* + * Copyright 2008-2018 Cisco Systems, Inc. All rights reserved. + * Copyright 2007 Nuova Systems, Inc. All rights reserved. + * + * LICENSE_BEGIN + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * LICENSE_END + * + * + */ + +#ifndef _VNIC_WQ_H_ +#define _VNIC_WQ_H_ + +#include + +#include "vnic_dev.h" +#include "vnic_cq.h" + +/* Work queue control */ +struct vnic_wq_ctrl { + u64 ring_base; /* 0x00 */ + u32 ring_size; /* 0x08 */ + u32 pad0; + u32 posted_index; /* 0x10 */ + u32 pad1; + u32 cq_index; /* 0x18 */ + u32 pad2; + u32 enable; /* 0x20 */ + u32 pad3; + u32 running; /* 0x28 */ + u32 pad4; + u32 fetch_index; /* 0x30 */ + u32 pad5; + u32 dca_value; /* 0x38 */ + u32 pad6; + u32 error_interrupt_enable; /* 0x40 */ + u32 pad7; + u32 error_interrupt_offset; /* 0x48 */ + u32 pad8; + u32 error_status; /* 0x50 */ + u32 pad9; +}; + +struct vnic_wq_buf { + struct vnic_wq_buf *next; + dma_addr_t dma_addr; + void *os_buf; + unsigned int len; + unsigned int index; + int sop; + void *desc; + uint64_t wr_id; /* Cookie */ + uint8_t cq_entry; /* Gets completion event from hw */ + uint8_t desc_skip_cnt; /* Num descs to occupy */ + uint8_t compressed_send; /* Both hdr and payload in one desc */ + struct vnic_wq_buf *prev; +}; + +/* Break the vnic_wq_buf allocations into blocks of 32/64 entries */ +#define VNIC_WQ_BUF_MIN_BLK_ENTRIES 32 +#define VNIC_WQ_BUF_DFLT_BLK_ENTRIES 64 +#define VNIC_WQ_BUF_BLK_ENTRIES(entries) \ + ((unsigned int)((entries < VNIC_WQ_BUF_DFLT_BLK_ENTRIES) ? 
\ + VNIC_WQ_BUF_MIN_BLK_ENTRIES : VNIC_WQ_BUF_DFLT_BLK_ENTRIES)) +#define VNIC_WQ_BUF_BLK_SZ(entries) \ + (VNIC_WQ_BUF_BLK_ENTRIES(entries) * sizeof(struct vnic_wq_buf)) +#define VNIC_WQ_BUF_BLKS_NEEDED(entries) \ + DIV_ROUND_UP(entries, VNIC_WQ_BUF_BLK_ENTRIES(entries)) +#define VNIC_WQ_BUF_BLKS_MAX VNIC_WQ_BUF_BLKS_NEEDED(4096) + +struct vnic_wq { + unsigned int index; + struct vnic_dev *vdev; + struct vnic_wq_ctrl __iomem *ctrl; /* memory-mapped */ + struct vnic_dev_ring ring; + struct vnic_wq_buf *bufs[VNIC_WQ_BUF_BLKS_MAX]; + struct vnic_wq_buf *to_use; + struct vnic_wq_buf *to_clean; + unsigned int pkts_outstanding; +#if defined(__LIBUSNIC__) + uint32_t qp_num; +#endif +}; + +static inline unsigned int vnic_wq_desc_avail(struct vnic_wq *wq) +{ + /* how many does SW own? */ + return wq->ring.desc_avail; +} + +static inline unsigned int vnic_wq_desc_used(struct vnic_wq *wq) +{ + /* how many does HW own? */ + return wq->ring.desc_count - wq->ring.desc_avail - 1; +} + +static inline void *vnic_wq_next_desc(struct vnic_wq *wq) +{ + return wq->to_use->desc; +} + +#define PI_LOG2_CACHE_LINE_SIZE 5 +#define PI_INDEX_BITS 12 +#define PI_INDEX_MASK ((1U << PI_INDEX_BITS) - 1) +#define PI_PREFETCH_LEN_MASK ((1U << PI_LOG2_CACHE_LINE_SIZE) - 1) +#define PI_PREFETCH_LEN_OFF 16 +#define PI_PREFETCH_ADDR_BITS 43 +#define PI_PREFETCH_ADDR_MASK ((1ULL << PI_PREFETCH_ADDR_BITS) - 1) +#define PI_PREFETCH_ADDR_OFF 21 + +/** How many cache lines are touched by buffer (addr, len). */ +static inline unsigned int num_cache_lines_touched(dma_addr_t addr, + unsigned int len) +{ + const unsigned long mask = PI_PREFETCH_LEN_MASK; + const unsigned long laddr = (unsigned long)addr; + unsigned long lines, equiv_len; + /* A. If addr is aligned, our solution is just to round up len to the + next boundary. + + e.g. addr = 0, len = 48 + +--------------------+ + |XXXXXXXXXXXXXXXXXXXX| 32-byte cacheline a + +--------------------+ + |XXXXXXXXXX | cacheline b + +--------------------+ + + B. If addr is not aligned, however, we may use an extra + cacheline. e.g. addr = 12, len = 22 + + +--------------------+ + | XXXXXXXXXXXXX| + +--------------------+ + |XX | + +--------------------+ + + Our solution is to make the problem equivalent to case A + above by adding the empty space in the first cacheline to the length: + unsigned long len; + + +--------------------+ + |eeeeeeeXXXXXXXXXXXXX| "e" is empty space, which we add to len + +--------------------+ + |XX | + +--------------------+ + + */ + equiv_len = len + (laddr & mask); + + /* Now we can just round up this len to the next 32-byte boundary. */ + lines = (equiv_len + mask) & (~mask); + + /* Scale bytes -> cachelines. */ + return lines >> PI_LOG2_CACHE_LINE_SIZE; +} + +static inline u64 vnic_cached_posted_index(dma_addr_t addr, unsigned int len, + unsigned int index) +{ + unsigned int num_cache_lines = num_cache_lines_touched(addr, len); + /* Wish we could avoid a branch here. We could have separate + * vnic_wq_post() and vinc_wq_post_inline(), the latter + * only supporting < 1k (2^5 * 2^5) sends, I suppose. This would + * eliminate the if (eop) branch as well. 
+ */ + if (num_cache_lines > PI_PREFETCH_LEN_MASK) + num_cache_lines = 0; + return (index & PI_INDEX_MASK) | + ((num_cache_lines & PI_PREFETCH_LEN_MASK) << PI_PREFETCH_LEN_OFF) | + (((addr >> PI_LOG2_CACHE_LINE_SIZE) & + PI_PREFETCH_ADDR_MASK) << PI_PREFETCH_ADDR_OFF); +} + +static inline void vnic_wq_post(struct vnic_wq *wq, + void *os_buf, dma_addr_t dma_addr, + unsigned int len, int sop, int eop, + uint8_t desc_skip_cnt, uint8_t cq_entry, + uint8_t compressed_send, uint64_t wrid) +{ + struct vnic_wq_buf *buf = wq->to_use; + + buf->sop = sop; + buf->cq_entry = cq_entry; + buf->compressed_send = compressed_send; + buf->desc_skip_cnt = desc_skip_cnt; + buf->os_buf = eop ? os_buf : NULL; + buf->dma_addr = dma_addr; + buf->len = len; + buf->wr_id = wrid; + + buf = buf->next; + if (eop) { +#ifdef DO_PREFETCH + uint64_t wr = vnic_cached_posted_index(dma_addr, len, + buf->index); +#endif + /* Adding write memory barrier prevents compiler and/or CPU + * reordering, thus avoiding descriptor posting before + * descriptor is initialized. Otherwise, hardware can read + * stale descriptor fields. + */ + wmb(); +#ifdef DO_PREFETCH + /* Intel chipsets seem to limit the rate of PIOs that we can + * push on the bus. Thus, it is very important to do a single + * 64 bit write here. With two 32-bit writes, my maximum + * pkt/sec rate was cut almost in half. -AJF + */ + iowrite64((uint64_t)wr, &wq->ctrl->posted_index); +#else + iowrite32(buf->index, &wq->ctrl->posted_index); +#endif + } + wq->to_use = buf; + + wq->ring.desc_avail -= desc_skip_cnt; +} + +static inline void vnic_wq_service(struct vnic_wq *wq, + struct cq_desc *cq_desc, u16 completed_index, + void (*buf_service)(struct vnic_wq *wq, + struct cq_desc *cq_desc, struct vnic_wq_buf *buf, void *opaque), + void *opaque) +{ + struct vnic_wq_buf *buf; + + buf = wq->to_clean; + while (1) { + + (*buf_service)(wq, cq_desc, buf, opaque); + + wq->ring.desc_avail++; + + wq->to_clean = buf->next; + + if (buf->index == completed_index) + break; + + buf = wq->to_clean; + } +} + +void vnic_wq_free(struct vnic_wq *wq); +int vnic_wq_alloc(struct vnic_dev *vdev, struct vnic_wq *wq, unsigned int index, + unsigned int desc_count, unsigned int desc_size); +int vnic_wq_devcmd2_alloc(struct vnic_dev *vdev, struct vnic_wq *wq, + unsigned int desc_count, unsigned int desc_size); +#ifndef FOR_UPSTREAM_KERNEL +void vnic_wq_init_start(struct vnic_wq *wq, unsigned int cq_index, + unsigned int fetch_index, unsigned int posted_index, + unsigned int error_interrupt_enable, + unsigned int error_interrupt_offset); +#endif +void vnic_wq_init(struct vnic_wq *wq, unsigned int cq_index, + unsigned int error_interrupt_enable, + unsigned int error_interrupt_offset); +void vnic_wq_error_out(struct vnic_wq *wq, unsigned int error); +unsigned int vnic_wq_error_status(struct vnic_wq *wq); +unsigned int vnic_wq_ctrl_error_status(struct vnic_wq_ctrl *ctrl); +void vnic_wq_enable(struct vnic_wq *wq); +int vnic_wq_disable(struct vnic_wq *wq); +void vnic_wq_clean(struct vnic_wq *wq, + void (*buf_clean)(struct vnic_wq *wq, struct vnic_wq_buf *buf)); +#ifndef NOT_FOR_OPEN_ENIC +int vnic_wq_mem_size(struct vnic_wq *wq, unsigned int desc_count, + unsigned int desc_size); +#endif + +#endif /* _VNIC_WQ_H_ */ diff --git a/prov/usnic/src/usnic_direct/wq_enet_desc.h b/prov/usnic/src/usnic_direct/wq_enet_desc.h new file mode 100644 index 00000000000..0f6077891fa --- /dev/null +++ b/prov/usnic/src/usnic_direct/wq_enet_desc.h @@ -0,0 +1,122 @@ +/* + * Copyright 2008-2010 Cisco Systems, Inc. 
All rights reserved. + * Copyright 2007 Nuova Systems, Inc. All rights reserved. + * + * LICENSE_BEGIN + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * LICENSE_END + * + * + */ + +#ifndef _WQ_ENET_DESC_H_ +#define _WQ_ENET_DESC_H_ + +/* Ethernet work queue descriptor: 16B */ +struct wq_enet_desc { + __le64 address; + __le16 length; + __le16 mss_loopback; + __le16 header_length_flags; + __le16 vlan_tag; +}; + +#define WQ_ENET_ADDR_BITS 64 +#define WQ_ENET_LEN_BITS 14 +#define WQ_ENET_LEN_MASK ((1 << WQ_ENET_LEN_BITS) - 1) +#define WQ_ENET_MSS_BITS 14 +#define WQ_ENET_MSS_MASK ((1 << WQ_ENET_MSS_BITS) - 1) +#define WQ_ENET_MSS_SHIFT 2 +#define WQ_ENET_LOOPBACK_SHIFT 1 +#define WQ_ENET_HDRLEN_BITS 10 +#define WQ_ENET_HDRLEN_MASK ((1 << WQ_ENET_HDRLEN_BITS) - 1) +#define WQ_ENET_FLAGS_OM_BITS 2 +#define WQ_ENET_FLAGS_OM_MASK ((1 << WQ_ENET_FLAGS_OM_BITS) - 1) +#define WQ_ENET_FLAGS_EOP_SHIFT 12 +#define WQ_ENET_FLAGS_CQ_ENTRY_SHIFT 13 +#define WQ_ENET_FLAGS_FCOE_ENCAP_SHIFT 14 +#define WQ_ENET_FLAGS_VLAN_TAG_INSERT_SHIFT 15 + +#define WQ_ENET_OFFLOAD_MODE_CSUM 0 +#define WQ_ENET_OFFLOAD_MODE_RESERVED 1 +#define WQ_ENET_OFFLOAD_MODE_CSUM_L4 2 +#define WQ_ENET_OFFLOAD_MODE_TSO 3 + +static inline void wq_enet_desc_enc(struct wq_enet_desc *desc, + u64 address, u16 length, u16 mss, u16 header_length, + u8 offload_mode, u8 eop, u8 cq_entry, u8 fcoe_encap, + u8 vlan_tag_insert, u16 vlan_tag, u8 loopback) +{ + desc->address = cpu_to_le64(address); + desc->length = cpu_to_le16(length & WQ_ENET_LEN_MASK); + desc->mss_loopback = cpu_to_le16((mss & WQ_ENET_MSS_MASK) << + WQ_ENET_MSS_SHIFT | (loopback & 1) << WQ_ENET_LOOPBACK_SHIFT); + desc->header_length_flags = cpu_to_le16( + (header_length & WQ_ENET_HDRLEN_MASK) | + (offload_mode & WQ_ENET_FLAGS_OM_MASK) << WQ_ENET_HDRLEN_BITS | + (eop & 1) << WQ_ENET_FLAGS_EOP_SHIFT | + (cq_entry & 1) << WQ_ENET_FLAGS_CQ_ENTRY_SHIFT | + (fcoe_encap & 1) << WQ_ENET_FLAGS_FCOE_ENCAP_SHIFT | + (vlan_tag_insert & 
1) << WQ_ENET_FLAGS_VLAN_TAG_INSERT_SHIFT); + desc->vlan_tag = cpu_to_le16(vlan_tag); +} + +static inline void wq_enet_desc_dec(struct wq_enet_desc *desc, + u64 *address, u16 *length, u16 *mss, u16 *header_length, + u8 *offload_mode, u8 *eop, u8 *cq_entry, u8 *fcoe_encap, + u8 *vlan_tag_insert, u16 *vlan_tag, u8 *loopback) +{ + *address = le64_to_cpu(desc->address); + *length = le16_to_cpu(desc->length) & WQ_ENET_LEN_MASK; + *mss = (le16_to_cpu(desc->mss_loopback) >> WQ_ENET_MSS_SHIFT) & + WQ_ENET_MSS_MASK; + *loopback = (u8)((le16_to_cpu(desc->mss_loopback) >> + WQ_ENET_LOOPBACK_SHIFT) & 1); + *header_length = le16_to_cpu(desc->header_length_flags) & + WQ_ENET_HDRLEN_MASK; + *offload_mode = (u8)((le16_to_cpu(desc->header_length_flags) >> + WQ_ENET_HDRLEN_BITS) & WQ_ENET_FLAGS_OM_MASK); + *eop = (u8)((le16_to_cpu(desc->header_length_flags) >> + WQ_ENET_FLAGS_EOP_SHIFT) & 1); + *cq_entry = (u8)((le16_to_cpu(desc->header_length_flags) >> + WQ_ENET_FLAGS_CQ_ENTRY_SHIFT) & 1); + *fcoe_encap = (u8)((le16_to_cpu(desc->header_length_flags) >> + WQ_ENET_FLAGS_FCOE_ENCAP_SHIFT) & 1); + *vlan_tag_insert = (u8)((le16_to_cpu(desc->header_length_flags) >> + WQ_ENET_FLAGS_VLAN_TAG_INSERT_SHIFT) & 1); + *vlan_tag = le16_to_cpu(desc->vlan_tag); +} + +#endif /* _WQ_ENET_DESC_H_ */ diff --git a/prov/util/src/util_attr.c b/prov/util/src/util_attr.c index f310e08a4d0..1665e2e5f1a 100644 --- a/prov/util/src/util_attr.c +++ b/prov/util/src/util_attr.c @@ -247,7 +247,12 @@ static int ofi_info_to_util(uint32_t version, const struct fi_provider *prov, if (ofi_dup_addr(core_info, *util_info)) goto err; - assert(core_info->domain_attr->name); + /* Release 1.4 brought standardized domain names across IP based + * providers. Before this release, the usNIC provider would return a + * NULL domain name from fi_getinfo. For compatibility reasons, allow a + * NULL domain name when apps are requesting version < 1.4. 
+ */ + assert(FI_VERSION_LT(1, 4) || core_info->domain_attr->name); if (core_info->domain_attr->name) { (*util_info)->domain_attr->name = @@ -454,8 +459,10 @@ static int fi_progress_level(enum fi_progress progress_model) return 1; case FI_PROGRESS_MANUAL: return 2; - case FI_PROGRESS_UNSPEC: + case FI_PROGRESS_CONTROL_UNIFIED: return 3; + case FI_PROGRESS_UNSPEC: + return 4; default: return -1; } @@ -572,7 +579,8 @@ int ofi_check_domain_attr(const struct fi_provider *prov, uint32_t api_version, return -FI_ENODATA; } - if (fi_progress_level(user_attr->data_progress) < + if (user_attr->data_progress == FI_PROGRESS_CONTROL_UNIFIED || + fi_progress_level(user_attr->data_progress) < fi_progress_level(prov_attr->data_progress)) { FI_INFO(prov, FI_LOG_CORE, "Invalid data progress model\n"); return -FI_ENODATA; diff --git a/prov/util/src/util_av.c b/prov/util/src/util_av.c index 9d39044783d..46c6216c393 100644 --- a/prov/util/src/util_av.c +++ b/prov/util/src/util_av.c @@ -536,6 +536,7 @@ int ofi_av_init_lightweight(struct util_domain *domain, const struct fi_av_attr struct util_av *av, void *context) { int ret; + enum ofi_lock_type ep_list_lock_type; ret = util_verify_av_attr(domain, attr); if (ret) @@ -553,7 +554,10 @@ int ofi_av_init_lightweight(struct util_domain *domain, const struct fi_av_attr av->context = context; av->domain = domain; - ret = ofi_genlock_init(&av->ep_list_lock, OFI_LOCK_MUTEX); + + ep_list_lock_type = ofi_progress_lock_type(av->domain->threading, + av->domain->control_progress); + ret = ofi_genlock_init(&av->ep_list_lock, ep_list_lock_type); if (ret) return ret; diff --git a/prov/util/src/util_cntr.c b/prov/util/src/util_cntr.c index 8576f2c9182..2958728d1f0 100644 --- a/prov/util/src/util_cntr.c +++ b/prov/util/src/util_cntr.c @@ -336,6 +336,7 @@ int ofi_cntr_init(const struct fi_provider *prov, struct fid_domain *domain, int ret; struct fi_wait_attr wait_attr; struct fid_wait *wait; + enum ofi_lock_type ep_list_lock_type; assert(progress); ret = ofi_check_cntr_attr(prov, attr); @@ -390,7 +391,9 @@ int ofi_cntr_init(const struct fi_provider *prov, struct fid_domain *domain, goto errout_close_wait; } - ret = ofi_genlock_init(&cntr->ep_list_lock, OFI_LOCK_MUTEX); + ep_list_lock_type = ofi_progress_lock_type(cntr->domain->threading, + cntr->domain->control_progress); + ret = ofi_genlock_init(&cntr->ep_list_lock, ep_list_lock_type); if (ret) goto errout_close_wait; diff --git a/prov/util/src/util_cq.c b/prov/util/src/util_cq.c index 553fc1f1d77..29cec9ba08e 100644 --- a/prov/util/src/util_cq.c +++ b/prov/util/src/util_cq.c @@ -708,7 +708,8 @@ int ofi_cq_init(const struct fi_provider *prov, struct fid_domain *domain, { struct fi_wait_attr wait_attr; struct fid_wait *wait; - enum ofi_lock_type lock_type; + enum ofi_lock_type cq_lock_type; + enum ofi_lock_type ep_list_lock_type; int ret; assert(progress); @@ -728,16 +729,17 @@ int ofi_cq_init(const struct fi_provider *prov, struct fid_domain *domain, if (cq->domain->threading == FI_THREAD_COMPLETION || cq->domain->threading == FI_THREAD_DOMAIN) - lock_type = OFI_LOCK_NOOP; + cq_lock_type = OFI_LOCK_NOOP; else - lock_type = cq->domain->lock.lock_type; + cq_lock_type = cq->domain->lock.lock_type; - ret = ofi_genlock_init(&cq->cq_lock, lock_type); + ret = ofi_genlock_init(&cq->cq_lock, cq_lock_type); if (ret) return ret; - /* TODO Figure out how to optimize this lock for rdm and msg endpoints */ - ret = ofi_genlock_init(&cq->ep_list_lock, OFI_LOCK_MUTEX); + ep_list_lock_type = ofi_progress_lock_type(cq->domain->threading, + 
cq->domain->control_progress); + ret = ofi_genlock_init(&cq->ep_list_lock, ep_list_lock_type); if (ret) goto destroy1; diff --git a/prov/util/src/util_domain.c b/prov/util/src/util_domain.c index 9ca40a0e456..783de5d7253 100644 --- a/prov/util/src/util_domain.c +++ b/prov/util/src/util_domain.c @@ -116,6 +116,7 @@ util_domain_init(struct util_domain *domain, const struct fi_info *info, domain->av_type = info->domain_attr->av_type; domain->threading = info->domain_attr->threading; domain->data_progress = info->domain_attr->data_progress; + domain->control_progress = info->domain_attr->control_progress; domain->name = strdup(info->domain_attr->name); if (!domain->name) { ofi_genlock_destroy(&domain->lock); diff --git a/prov/util/src/util_mem_monitor.c b/prov/util/src/util_mem_monitor.c index a6159dae370..f7385c5d11f 100644 --- a/prov/util/src/util_mem_monitor.c +++ b/prov/util/src/util_mem_monitor.c @@ -4,6 +4,7 @@ * Copyright (c) 2019-2021 Amazon.com, Inc. or its affiliates. * All rights reserved. * (C) Copyright 2020 Hewlett Packard Enterprise Development LP + * Copyright (C) 2024 Cornelis Networks. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -410,7 +411,7 @@ int ofi_monitors_add_cache(struct ofi_mem_monitor **monitors, void ofi_monitors_del_cache(struct ofi_mr_cache *cache) { - struct ofi_mem_monitor *stop_list[OFI_HMEM_MAX]; + struct ofi_mem_monitor *update_list[OFI_HMEM_MAX]; struct ofi_mem_monitor *monitor; enum fi_hmem_iface iface; int ret; @@ -427,7 +428,7 @@ void ofi_monitors_del_cache(struct ofi_mr_cache *cache) } while (ret); for (iface = 0; iface < OFI_HMEM_MAX; iface++) { - stop_list[iface] = NULL; + update_list[iface] = NULL; monitor = cache->monitors[iface]; if (!monitor) continue; @@ -436,12 +437,14 @@ void ofi_monitors_del_cache(struct ofi_mr_cache *cache) if (dlist_empty(&monitor->list)) { pthread_mutex_lock(&mm_state_lock); - stop_list[iface] = monitor; /* See comment above ofi_monitors_update for details */ - if (monitor->state == FI_MM_STATE_RUNNING) + if (monitor->state == FI_MM_STATE_RUNNING) { monitor->state = FI_MM_STATE_STOPPING; - else if (monitor->state == FI_MM_STATE_STARTING) + update_list[iface] = monitor; + } else if (monitor->state == FI_MM_STATE_STARTING) { monitor->state = FI_MM_STATE_RUNNING; + update_list[iface] = monitor; + } pthread_mutex_unlock(&mm_state_lock); } @@ -451,7 +454,7 @@ void ofi_monitors_del_cache(struct ofi_mr_cache *cache) pthread_rwlock_unlock(&mm_list_rwlock); - ofi_monitors_update(stop_list); + ofi_monitors_update(update_list); return; } @@ -853,6 +856,9 @@ static void ofi_import_monitor_notify(struct fid_mem_monitor *monitor, static int ofi_close_import(struct fid *fid) { + pthread_mutex_lock(&mm_state_lock); + impmon.monitor.state = FI_MM_STATE_IDLE; + pthread_mutex_unlock(&mm_state_lock); impmon.impfid = NULL; return 0; } diff --git a/prov/verbs/src/verbs_cm.c b/prov/verbs/src/verbs_cm.c index c9febbee4a3..bf34b9f9500 100644 --- a/prov/verbs/src/verbs_cm.c +++ b/prov/verbs/src/verbs_cm.c @@ -133,7 +133,7 @@ vrb_ep_prepare_rdma_cm_param(struct rdma_conn_param *conn_param, conn_param->rnr_retry_count = 7; } -static void +void vrb_msg_ep_prepare_rdma_cm_hdr(void *priv_data, const struct rdma_cm_id *id) { @@ -159,8 +159,7 @@ vrb_msg_ep_connect(struct fid_ep *ep_fid, const void *addr, struct vrb_ep *ep = container_of(ep_fid, struct vrb_ep, util_ep.ep_fid); size_t priv_data_len; struct vrb_cm_data_hdr *cm_hdr; - off_t 
rdma_cm_hdr_len = 0; - int ret; + int ret = 0; if (OFI_UNLIKELY(paramlen > VERBS_CM_DATA_SIZE)) return -FI_EINVAL; @@ -173,18 +172,12 @@ vrb_msg_ep_connect(struct fid_ep *ep_fid, const void *addr, } } - if (ep->id->route.addr.src_addr.sa_family == AF_IB) - rdma_cm_hdr_len = sizeof(struct vrb_rdma_cm_hdr); - - priv_data_len = sizeof(*cm_hdr) + paramlen + rdma_cm_hdr_len; + priv_data_len = sizeof(*cm_hdr) + paramlen + sizeof(struct vrb_rdma_cm_hdr); ep->cm_priv_data = malloc(priv_data_len); if (!ep->cm_priv_data) return -FI_ENOMEM; - if (rdma_cm_hdr_len) - vrb_msg_ep_prepare_rdma_cm_hdr(ep->cm_priv_data, ep->id); - - cm_hdr = (void *)((char *)ep->cm_priv_data + rdma_cm_hdr_len); + cm_hdr = (void *)((char *)ep->cm_priv_data + sizeof(struct vrb_rdma_cm_hdr)); vrb_msg_ep_prepare_cm_data(param, paramlen, cm_hdr); vrb_ep_prepare_rdma_cm_param(&ep->conn_param, ep->cm_priv_data, priv_data_len); @@ -193,13 +186,28 @@ vrb_msg_ep_connect(struct fid_ep *ep_fid, const void *addr, if (ep->srx) ep->conn_param.srq = 1; + if (addr) { + free(ep->info_attr.dest_addr); + ep->info_attr.dest_addr = mem_dup(addr, ofi_sizeofaddr(addr)); + if (!ep->info_attr.dest_addr) { + free(ep->cm_priv_data); + ep->cm_priv_data = NULL; + return -FI_ENOMEM; + } + ep->info_attr.dest_addrlen = ofi_sizeofaddr(addr); + } + ofi_genlock_lock(&vrb_ep2_progress(ep)->ep_lock); assert(ep->state == VRB_IDLE); - ep->state = VRB_RESOLVE_ROUTE; - ret = rdma_resolve_route(ep->id, VERBS_RESOLVE_TIMEOUT); - if (ret) { + ep->state = VRB_RESOLVE_ADDR; + if (rdma_resolve_addr(ep->id, ep->info_attr.src_addr, + ep->info_attr.dest_addr, VERBS_RESOLVE_TIMEOUT)) { ret = -errno; - VRB_WARN_ERRNO(FI_LOG_EP_CTRL, "rdma_resolve_route"); + VRB_WARN_ERRNO(FI_LOG_EP_CTRL, "rdma_resolve_addr"); + ofi_straddr_log(&vrb_prov, FI_LOG_WARN, FI_LOG_EP_CTRL, + "src addr", ep->info_attr.src_addr); + ofi_straddr_log(&vrb_prov, FI_LOG_WARN, FI_LOG_EP_CTRL, + "dst addr", ep->info_attr.dest_addr); free(ep->cm_priv_data); ep->cm_priv_data = NULL; ep->state = VRB_IDLE; diff --git a/prov/verbs/src/verbs_cq.c b/prov/verbs/src/verbs_cq.c index b9b6bc9eecc..84a252f7251 100644 --- a/prov/verbs/src/verbs_cq.c +++ b/prov/verbs/src/verbs_cq.c @@ -608,7 +608,7 @@ int vrb_cq_open(struct fid_domain *domain_fid, struct fi_cq_attr *attr, } -int vrb_init_progress(struct vrb_progress *progress, struct ibv_context *verbs) +int vrb_init_progress(struct vrb_progress *progress, struct fi_info *info) { int ret; @@ -624,8 +624,17 @@ int vrb_init_progress(struct vrb_progress *progress, struct ibv_context *verbs) if (ret) goto err1; + ret = ofi_bufpool_create(&progress->recv_wr_pool, + sizeof(struct vrb_recv_wr) + + info->rx_attr->iov_limit * sizeof(struct ibv_sge), + 16, 0, 1024, OFI_BUFPOOL_NO_TRACK); + if (ret) + goto err2; + return 0; +err2: + ofi_bufpool_destroy(progress->ctx_pool); err1: ofi_genlock_destroy(&progress->ep_lock); return ret; @@ -633,6 +642,7 @@ int vrb_init_progress(struct vrb_progress *progress, struct ibv_context *verbs) void vrb_close_progress(struct vrb_progress *progress) { + ofi_bufpool_destroy(progress->recv_wr_pool); ofi_bufpool_destroy(progress->ctx_pool); ofi_genlock_destroy(&progress->ep_lock); } diff --git a/prov/verbs/src/verbs_domain.c b/prov/verbs/src/verbs_domain.c index f184e6cdf4f..e58ad12bfe8 100644 --- a/prov/verbs/src/verbs_domain.c +++ b/prov/verbs/src/verbs_domain.c @@ -416,7 +416,7 @@ vrb_domain(struct fid_fabric *fabric, struct fi_info *info, goto err4; } - ret = vrb_init_progress(&_domain->progress, _domain->verbs); + ret = 
vrb_init_progress(&_domain->progress, _domain->info); if (ret) goto err4; diff --git a/prov/verbs/src/verbs_ep.c b/prov/verbs/src/verbs_ep.c index be59391424f..4b99e091065 100755 --- a/prov/verbs/src/verbs_ep.c +++ b/prov/verbs/src/verbs_ep.c @@ -57,19 +57,18 @@ void vrb_add_credits(struct fid_ep *ep_fid, uint64_t credits) ofi_genlock_unlock(&vrb_ep2_progress(ep)->ep_lock); } -ssize_t vrb_post_recv(struct vrb_ep *ep, struct ibv_recv_wr *wr) +int vrb_post_recv_internal(struct vrb_ep *ep, struct ibv_recv_wr *wr) { struct vrb_context *ctx; struct ibv_recv_wr *bad_wr; uint64_t credits_to_give; int ret, err; - ofi_genlock_lock(&vrb_ep2_progress(ep)->ep_lock); + assert(ofi_genlock_held(&vrb_ep2_progress(ep)->ep_lock)); + ctx = vrb_alloc_ctx(vrb_ep2_progress(ep)); - if (!ctx) { - ret = -FI_EAGAIN; - goto unlock; - } + if (!ctx) + return -FI_EAGAIN; ctx->ep = ep; ctx->user_ctx = (void *) (uintptr_t) wr->wr_id; @@ -80,8 +79,7 @@ ssize_t vrb_post_recv(struct vrb_ep *ep, struct ibv_recv_wr *wr) wr->wr_id = (uintptr_t) ctx->user_ctx; if (ret) { vrb_free_ctx(vrb_ep2_progress(ep), ctx); - ret = -FI_EAGAIN; - goto unlock; + return -FI_EAGAIN; } slist_insert_tail(&ctx->entry, &ep->rq_list); @@ -109,7 +107,45 @@ ssize_t vrb_post_recv(struct vrb_ep *ep, struct ibv_recv_wr *wr) ep->rq_credits_avail += credits_to_give; } -unlock: + return ret; +} + +static int vrb_prepost_recv(struct vrb_ep *ep, struct ibv_recv_wr *wr) +{ + struct vrb_recv_wr *save_wr; + size_t i; + + assert(ofi_genlock_held(&vrb_ep2_progress(ep)->ep_lock)); + + if (wr->next) + return -FI_EINVAL; + + save_wr = vrb_alloc_recv_wr(vrb_ep2_progress(ep)); + if (!save_wr) + return -FI_ENOMEM; + + save_wr->wr.wr_id = wr->wr_id; + save_wr->wr.next = NULL; + save_wr->wr.num_sge = wr->num_sge; + for (i = 0; i < wr->num_sge; i++) + save_wr->sge[i] = wr->sg_list[i]; + save_wr->wr.sg_list = save_wr->sge; + slist_insert_tail(&save_wr->entry, &ep->prepost_wr_list); + return 0; +} + +ssize_t vrb_post_recv(struct vrb_ep *ep, struct ibv_recv_wr *wr) +{ + int ret; + + if (wr->num_sge > ep->info_attr.rx_iov_limit) + return -FI_EINVAL; + + ofi_genlock_lock(&vrb_ep2_progress(ep)->ep_lock); + if (!ep->ibv_qp) + ret = vrb_prepost_recv(ep, wr); + else + ret = vrb_post_recv_internal(ep, wr); ofi_genlock_unlock(&vrb_ep2_progress(ep)->ep_lock); return ret; } @@ -155,7 +191,7 @@ ssize_t vrb_post_send(struct vrb_ep *ep, struct ibv_send_wr *wr, uint64_t flags) } if (vrb_wr_consumes_recv(wr) && !--ep->peer_rq_credits && - !(flags & FI_PRIORITY)) { + !(flags & OFI_PRIORITY)) { /* Last credit is reserved for credit update */ ep->peer_rq_credits++; goto freectx; @@ -437,6 +473,7 @@ vrb_alloc_init_ep(struct fi_info *info, struct vrb_domain *domain, slist_init(&ep->sq_list); slist_init(&ep->rq_list); + slist_init(&ep->prepost_wr_list); ep->util_ep.ep_fid.msg = calloc(1, sizeof(*ep->util_ep.ep_fid.msg)); if (!ep->util_ep.ep_fid.msg) goto err3; @@ -511,6 +548,34 @@ static void vrb_flush_rq(struct vrb_ep *ep) } } +static void vrb_flush_prepost_wr(struct vrb_ep *ep) +{ + struct vrb_recv_wr *wr; + struct vrb_cq *cq; + struct slist_entry *entry; + struct ibv_wc wc = {0}; + + assert(ofi_genlock_held(vrb_ep2_progress(ep)->active_lock)); + if (!ep->util_ep.rx_cq) + return; + + cq = container_of(ep->util_ep.rx_cq, struct vrb_cq, util_cq); + wc.status = IBV_WC_WR_FLUSH_ERR; + wc.vendor_err = FI_ECANCELED; + + while (!slist_empty(&ep->prepost_wr_list)) { + entry = slist_remove_head(&ep->prepost_wr_list); + wr = container_of(entry, struct vrb_recv_wr, entry); + + wc.wr_id = (uintptr_t) 
wr->wr.wr_id; + wc.opcode = IBV_WC_RECV; + vrb_free_recv_wr(vrb_ep2_progress(ep), wr); + + if (wc.wr_id != VERBS_NO_COMP_FLAG) + vrb_report_wc(cq, &wc); + } +} + static int vrb_close_free_ep(struct vrb_ep *ep) { int ret; @@ -580,6 +645,7 @@ static int vrb_ep_close(fid_t fid) ofi_genlock_lock(&vrb_ep2_progress(ep)->ep_lock); vrb_cleanup_cq(ep); vrb_flush_sq(ep); + vrb_flush_prepost_wr(ep); vrb_flush_rq(ep); ofi_genlock_unlock(&vrb_ep2_progress(ep)->ep_lock); break; @@ -599,6 +665,7 @@ static int vrb_ep_close(fid_t fid) ofi_genlock_lock(&vrb_ep2_progress(ep)->ep_lock); vrb_cleanup_cq(ep); vrb_flush_sq(ep); + vrb_flush_prepost_wr(ep); vrb_flush_rq(ep); ofi_genlock_unlock(&vrb_ep2_progress(ep)->ep_lock); break; @@ -966,15 +1033,19 @@ static int vrb_ep_enable(struct fid_ep *ep_fid) return -FI_EINVAL; } - ret = rdma_create_qp(ep->id, domain->pd, &attr); - if (ret) { - VRB_WARN_ERRNO(FI_LOG_EP_CTRL, "rdma_create_qp"); - return -errno; - } + /* Server-side QP creation, after RDMA_CM_EVENT_CONNECT_REQUEST + * is received */ + if (ep->id->verbs && ep->ibv_qp == NULL) { + ret = rdma_create_qp(ep->id, domain->pd, &attr); + if (ret) { + VRB_WARN_ERRNO(FI_LOG_EP_CTRL, "rdma_create_qp"); + return -errno; + } - /* Allow shared XRC INI QP not controlled by RDMA CM - * to share same post functions as RC QP. */ - ep->ibv_qp = ep->id->qp; + /* Allow shared XRC INI QP not controlled by RDMA CM + * to share same post functions as RC QP. */ + ep->ibv_qp = ep->id->qp; + } break; case FI_EP_DGRAM: assert(domain); diff --git a/prov/verbs/src/verbs_eq.c b/prov/verbs/src/verbs_eq.c old mode 100644 new mode 100755 index 6d4a908faa5..1e3223ccfa7 --- a/prov/verbs/src/verbs_eq.c +++ b/prov/verbs/src/verbs_eq.c @@ -861,6 +861,63 @@ vrb_eq_xrc_disconnect_event(struct vrb_eq *eq, } } +static int +vrb_eq_addr_resolved_event(struct vrb_ep *ep) +{ + struct vrb_recv_wr *wr; + struct slist_entry *entry; + struct ibv_qp_init_attr attr = { 0 }; + int ret; + + assert(ofi_genlock_held(&vrb_ep2_progress(ep)->ep_lock)); + assert(ep->state == VRB_RESOLVE_ADDR); + + if (ep->util_ep.type == FI_EP_MSG) { + vrb_msg_ep_get_qp_attr(ep, &attr); + + /* Client-side QP creation */ + if (rdma_create_qp(ep->id, vrb_ep2_domain(ep)->pd, &attr)) { + ep->state = VRB_DISCONNECTED; + ret = -errno; + VRB_WARN(FI_LOG_EP_CTRL, + "rdma_create_qp failed: %d\n", -ret); + return ret; + } + + /* Allow shared XRC INI QP not controlled by RDMA CM + * to share same post functions as RC QP. 
*/ + ep->ibv_qp = ep->id->qp; + } + + assert(ep->ibv_qp); + while (!slist_empty(&ep->prepost_wr_list)) { + entry = ep->prepost_wr_list.head; + wr = container_of(entry, struct vrb_recv_wr, entry); + + ret = vrb_post_recv_internal(ep, &wr->wr); + if (ret) { + VRB_WARN(FI_LOG_EP_CTRL, + "Failed to post receive buffers: %d\n", -ret); + + return ret; + } + vrb_free_recv_wr(vrb_ep2_progress(ep), wr); + slist_remove_head(&ep->prepost_wr_list); + } + + ep->state = VRB_RESOLVE_ROUTE; + if (rdma_resolve_route(ep->id, VERBS_RESOLVE_TIMEOUT)) { + ep->state = VRB_DISCONNECTED; + ret = -errno; + VRB_WARN(FI_LOG_EP_CTRL, + "rdma_resolve_route failed: %d\n", + -ret); + return ret; + } + + return -FI_EAGAIN; +} + static ssize_t vrb_eq_cm_process_event(struct vrb_eq *eq, struct rdma_cm_event *cma_event, uint32_t *event, @@ -879,11 +936,31 @@ vrb_eq_cm_process_event(struct vrb_eq *eq, assert(ofi_mutex_held(&eq->event_lock)); switch (cma_event->event) { + case RDMA_CM_EVENT_ADDR_RESOLVED: + ep = container_of(fid, struct vrb_ep, util_ep.ep_fid); + ofi_genlock_lock(&vrb_ep2_progress(ep)->ep_lock); + ret = vrb_eq_addr_resolved_event(ep); + ofi_genlock_unlock(&vrb_ep2_progress(ep)->ep_lock); + if (ret != -FI_EAGAIN) { + eq->err.err = -ret; + eq->err.prov_errno = ret; + goto err; + } + goto ack; + case RDMA_CM_EVENT_ROUTE_RESOLVED: ep = container_of(fid, struct vrb_ep, util_ep.ep_fid); ofi_genlock_lock(&vrb_ep2_progress(ep)->ep_lock); assert(ep->state == VRB_RESOLVE_ROUTE); ep->state = VRB_CONNECTING; + + if (cma_event->id->route.addr.src_addr.sa_family != AF_IB) { + vrb_eq_skip_rdma_cm_hdr((const void **)&ep->conn_param.private_data, + (size_t *)&ep->conn_param.private_data_len); + } else { + vrb_msg_ep_prepare_rdma_cm_hdr(ep->cm_priv_data, ep->id); + } + if (rdma_connect(ep->id, &ep->conn_param)) { ep->state = VRB_DISCONNECTED; ret = -errno; @@ -899,6 +976,11 @@ vrb_eq_cm_process_event(struct vrb_eq *eq, ret = -FI_EAGAIN; } ofi_genlock_unlock(&vrb_ep2_progress(ep)->ep_lock); + if (ret != -FI_EAGAIN) { + eq->err.err = -ret; + eq->err.prov_errno = ret; + goto err; + } goto ack; case RDMA_CM_EVENT_CONNECT_REQUEST: *event = FI_CONNREQ; diff --git a/prov/verbs/src/verbs_info.c b/prov/verbs/src/verbs_info.c index e4e98d6d0d1..e4def4c6d5d 100644 --- a/prov/verbs/src/verbs_info.c +++ b/prov/verbs/src/verbs_info.c @@ -689,6 +689,7 @@ static int vrb_get_device_attrs(struct ibv_context *ctx, } /* + * USNIC plugs into the verbs framework, but is not a usable device. * Manually check for devices and fail gracefully if none are present. * This avoids the lower libraries (libibverbs and librdmacm) from * reporting error messages to stderr. 
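The verbs changes above defer rdma_resolve_addr() from endpoint creation to fi_connect(), so an application may legally post receives before the QP exists; vrb_prepost_recv() copies such work requests onto ep->prepost_wr_list, and vrb_eq_addr_resolved_event() replays them once rdma_create_qp() has run. The following is a minimal, self-contained sketch of that queue-then-replay pattern only; it uses plain C and made-up names (struct endpoint, pending_recv, hw_post) rather than the provider's actual structures.

#include <stdio.h>
#include <stdlib.h>

struct pending_recv {
	struct pending_recv *next;
	char desc[64];			/* stand-in for a copied ibv_recv_wr + SGL */
};

struct endpoint {
	int qp_ready;			/* 0 until "address resolved"/QP created */
	struct pending_recv *head;
	struct pending_recv *tail;
};

static void hw_post(const char *desc)
{
	printf("posted to QP: %s\n", desc);
}

/* Post now if the QP exists; otherwise queue a private copy of the request. */
static int post_recv(struct endpoint *ep, const char *desc)
{
	struct pending_recv *p;

	if (ep->qp_ready) {
		hw_post(desc);
		return 0;
	}

	p = calloc(1, sizeof(*p));
	if (!p)
		return -1;
	snprintf(p->desc, sizeof(p->desc), "%s", desc);
	if (ep->tail)
		ep->tail->next = p;
	else
		ep->head = p;
	ep->tail = p;
	return 0;
}

/* "Address resolved": the QP now exists, so replay everything queued so far. */
static void flush_preposted(struct endpoint *ep)
{
	struct pending_recv *p;

	ep->qp_ready = 1;
	while ((p = ep->head) != NULL) {
		ep->head = p->next;
		hw_post(p->desc);
		free(p);
	}
	ep->tail = NULL;
}

int main(void)
{
	struct endpoint ep = {0};

	post_recv(&ep, "recv #1");	/* queued: connect not started yet */
	post_recv(&ep, "recv #2");	/* queued */
	flush_preposted(&ep);		/* QP created: both replayed in order */
	post_recv(&ep, "recv #3");	/* posted directly */
	return 0;
}

The deep copy of wr->sg_list into save_wr->sge[] in the patch preserves ibv_post_recv() semantics: a caller may reuse its ibv_recv_wr and ibv_sge arrays as soon as the post call returns, so the provider cannot simply hold a pointer to them while the post is deferred.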
diff --git a/prov/verbs/src/verbs_init.c b/prov/verbs/src/verbs_init.c index 23b0fbf8d0a..05183d0f9d4 100644 --- a/prov/verbs/src/verbs_init.c +++ b/prov/verbs/src/verbs_init.c @@ -295,17 +295,20 @@ int vrb_get_rai_id(const char *node, const char *service, uint64_t flags, return 0; } - ret = rdma_resolve_addr(*id, (*rai)->ai_src_addr, - (*rai)->ai_dst_addr, VERBS_RESOLVE_TIMEOUT); - if (ret) { - VRB_WARN_ERRNO(FI_LOG_FABRIC, "rdma_resolve_addr"); - ofi_straddr_log(&vrb_prov, FI_LOG_INFO, FI_LOG_FABRIC, - "src addr", (*rai)->ai_src_addr); - ofi_straddr_log(&vrb_prov, FI_LOG_INFO, FI_LOG_FABRIC, - "dst addr", (*rai)->ai_dst_addr); - ret = -errno; - goto err2; + if (node || (hints && hints->dest_addr)) { + ret = rdma_resolve_addr(*id, (*rai)->ai_src_addr, + (*rai)->ai_dst_addr, VERBS_RESOLVE_TIMEOUT); + if (ret) { + VRB_WARN_ERRNO(FI_LOG_FABRIC, "rdma_resolve_addr"); + ofi_straddr_log(&vrb_prov, FI_LOG_INFO, FI_LOG_FABRIC, + "src addr", (*rai)->ai_src_addr); + ofi_straddr_log(&vrb_prov, FI_LOG_INFO, FI_LOG_FABRIC, + "dst addr", (*rai)->ai_dst_addr); + ret = -errno; + goto err2; + } } + return 0; err2: if (rdma_destroy_id(*id)) @@ -335,29 +338,9 @@ int vrb_create_ep(struct vrb_ep *ep, enum rdma_port_space ps, goto err1; } - /* TODO convert this call to non-blocking (use event channel) as well: - * This may likely be needed for better scaling when running large - * MPI jobs. - * Making this non-blocking would mean we can't create QP at EP enable - * time. We need to wait for RDMA_CM_EVENT_ADDR_RESOLVED event before - * creating the QP using rdma_create_qp. It would also require a SW - * receive queue to store recvs posted by app after enabling the EP. - */ - if (rdma_resolve_addr(*id, rai->ai_src_addr, rai->ai_dst_addr, - VERBS_RESOLVE_TIMEOUT)) { - ret = -errno; - VRB_WARN_ERRNO(FI_LOG_EP_CTRL, "rdma_resolve_addr"); - ofi_straddr_log(&vrb_prov, FI_LOG_WARN, FI_LOG_EP_CTRL, - "src addr", rai->ai_src_addr); - ofi_straddr_log(&vrb_prov, FI_LOG_WARN, FI_LOG_EP_CTRL, - "dst addr", rai->ai_dst_addr); - goto err2; - } rdma_freeaddrinfo(rai); return 0; -err2: - rdma_destroy_id(*id); err1: rdma_freeaddrinfo(rai); return ret; diff --git a/prov/verbs/src/verbs_ofi.h b/prov/verbs/src/verbs_ofi.h index 78b8c40369e..82454763efe 100644 --- a/prov/verbs/src/verbs_ofi.h +++ b/prov/verbs/src/verbs_ofi.h @@ -262,9 +262,10 @@ struct vrb_progress { struct ofi_genlock *active_lock; struct ofi_bufpool *ctx_pool; + struct ofi_bufpool *recv_wr_pool; }; -int vrb_init_progress(struct vrb_progress *progress, struct ibv_context *verbs); +int vrb_init_progress(struct vrb_progress *progress, struct fi_info *info); void vrb_close_progress(struct vrb_progress *progress); struct vrb_eq_entry { @@ -574,6 +575,7 @@ struct vrb_xrc_ep_conn_setup { enum vrb_ep_state { VRB_IDLE, + VRB_RESOLVE_ADDR, VRB_RESOLVE_ROUTE, VRB_CONNECTING, VRB_REQ_RCVD, @@ -592,6 +594,7 @@ struct vrb_ep { uint64_t saved_peer_rq_credits; struct slist sq_list; struct slist rq_list; + struct slist prepost_wr_list; /* Protected by recv CQ lock */ int64_t rq_credits_avail; int64_t threshold; @@ -781,6 +784,9 @@ void vrb_eq_remove_sidr_conn(struct vrb_xrc_ep *ep); void vrb_msg_ep_get_qp_attr(struct vrb_ep *ep, struct ibv_qp_init_attr *attr); +void vrb_msg_ep_prepare_rdma_cm_hdr(void *priv_data, + const struct rdma_cm_id *id); + int vrb_process_xrc_connreq(struct vrb_ep *ep, struct vrb_connreq *connreq); @@ -933,9 +939,16 @@ do { \ ( wr->opcode == IBV_WR_SEND || wr->opcode == IBV_WR_SEND_WITH_IMM \ || wr->opcode == IBV_WR_RDMA_WRITE_WITH_IMM ) +struct vrb_recv_wr 
{ + struct slist_entry entry; + struct ibv_recv_wr wr; + struct ibv_sge sge[0]; +}; + void vrb_shutdown_ep(struct vrb_ep *ep); ssize_t vrb_post_send(struct vrb_ep *ep, struct ibv_send_wr *wr, uint64_t flags); ssize_t vrb_post_recv(struct vrb_ep *ep, struct ibv_recv_wr *wr); +int vrb_post_recv_internal(struct vrb_ep *ep, struct ibv_recv_wr *wr); static inline ssize_t vrb_send_buf(struct vrb_ep *ep, struct ibv_send_wr *wr, @@ -991,4 +1004,17 @@ vrb_free_ctx(struct vrb_progress *progress, struct vrb_context *ctx) ofi_buf_free(ctx); } +static inline struct vrb_recv_wr *vrb_alloc_recv_wr(struct vrb_progress *progress) +{ + assert(ofi_genlock_held(progress->active_lock)); + return ofi_buf_alloc(progress->recv_wr_pool); +} + +static inline void +vrb_free_recv_wr(struct vrb_progress *progress, struct vrb_recv_wr *wr) +{ + assert(ofi_genlock_held(progress->active_lock)); + ofi_buf_free(wr); +} + #endif /* VERBS_OFI_H */ diff --git a/src/fabric.c b/src/fabric.c index 5a78152f3b4..1d890ed16a3 100644 --- a/src/fabric.c +++ b/src/fabric.c @@ -65,6 +65,7 @@ struct ofi_prov { struct fi_provider *provider; void *dlhandle; bool hidden; + bool preferred; }; enum ofi_prov_order { @@ -79,6 +80,7 @@ struct ofi_info_match { static struct ofi_prov *prov_head, *prov_tail; static enum ofi_prov_order prov_order = OFI_PROV_ORDER_VERSION; +static bool prov_preferred = false; int ofi_init = 0; extern struct ofi_common_locks common_locks; @@ -109,6 +111,7 @@ ofi_init_prov(struct ofi_prov *prov, struct fi_provider *provider, { prov->provider = provider; prov->dlhandle = dlhandle; + prov->preferred = prov_preferred; } static void ofi_cleanup_prov(struct fi_provider *provider, void *dlhandle) @@ -134,6 +137,19 @@ static void ofi_free_prov(struct ofi_prov *prov) free(prov); } +static inline bool ofi_hide_cur_prov(struct ofi_prov *cur, + struct ofi_prov *new) +{ + if (cur->preferred) + return false; + + if (new->preferred) + return true; + + return (prov_order == OFI_PROV_ORDER_VERSION && + FI_VERSION_LT(cur->provider->version, new->provider->version)); +} + static void ofi_insert_prov(struct ofi_prov *prov) { struct ofi_prov *cur, *prev; @@ -141,9 +157,7 @@ static void ofi_insert_prov(struct ofi_prov *prov) for (prev = NULL, cur = prov_head; cur; prev = cur, cur = cur->next) { if ((strlen(prov->prov_name) == strlen(cur->prov_name)) && !strcasecmp(prov->prov_name, cur->prov_name)) { - if ((prov_order == OFI_PROV_ORDER_VERSION) && - FI_VERSION_LT(cur->provider->version, - prov->provider->version)) { + if (ofi_hide_cur_prov(cur, prov)) { cur->hidden = true; prov->next = cur; if (prev) @@ -445,8 +459,8 @@ static struct fi_provider *ofi_get_hook(const char *name) static void ofi_ordered_provs_init(void) { char *ordered_prov_names[] = { - "efa", "psm2", "opx", "verbs", "cxi", - "netdir", "psm3", "ucx", "ofi_rxm", "ofi_rxd", "shm", + "efa", "psm2", "opx", "usnic", "verbs", "cxi", + "psm3", "ucx", "ofi_rxm", "ofi_rxd", "shm", /* Initialize the socket based providers last of the * standard providers. 
This will result in them being @@ -637,7 +651,7 @@ void ofi_create_filter(struct ofi_filter *filter, const char *raw_filter) } #ifdef HAVE_LIBDL -static void ofi_reg_dl_prov(const char *lib) +static void ofi_reg_dl_prov(const char *lib, bool lib_known_to_exist) { void *dlhandle; struct fi_provider* (*inif)(void); @@ -646,8 +660,13 @@ static void ofi_reg_dl_prov(const char *lib) dlhandle = dlopen(lib, RTLD_NOW); if (dlhandle == NULL) { - FI_DBG(&core_prov, FI_LOG_CORE, - "dlopen(%s): %s\n", lib, dlerror()); + if (lib_known_to_exist) { + FI_WARN(&core_prov, FI_LOG_CORE, + "dlopen(%s): %s\n", lib, dlerror()); + } else { + FI_DBG(&core_prov, FI_LOG_CORE, + "dlopen(%s): %s\n", lib, dlerror()); + } return; } @@ -676,7 +695,7 @@ static void ofi_ini_dir(const char *dir) "asprintf failed to allocate memory\n"); goto libdl_done; } - ofi_reg_dl_prov(lib); + ofi_reg_dl_prov(lib, true); free(liblist[n]); free(lib); @@ -717,11 +736,38 @@ static void ofi_find_prov_libs(void) continue; } - ofi_reg_dl_prov(lib); + ofi_reg_dl_prov(lib, false); free(lib); } } +static void ofi_load_preferred_dl_prov(const char *path) +{ + if (!path || !strlen(path)) + return; + + if (path[0] != '/') { + FI_WARN(&core_prov, FI_LOG_CORE, + "invalid format for preferred provider: \"%s\"\n", + path); + return; + } + + if (access(path, F_OK) != 0) { + FI_WARN(&core_prov, FI_LOG_CORE, + "preferred provider not found: \"%s\"\n", + path); + return; + } + + FI_INFO(&core_prov, FI_LOG_CORE, + "loading preferred provider: \"%s\"\n", path); + + prov_preferred = true; + ofi_reg_dl_prov(path, true); + prov_preferred = false; +} + static void ofi_load_dl_prov(void) { char **dirs; @@ -740,6 +786,11 @@ static void ofi_load_dl_prov(void) "specified similar to dir1:dir2:dir3. If the path " "starts with @, loaded providers are given preference " "based on discovery order, rather than version. " + "Optionally, any of the directories can be replaced with + " + "followed by the full path to a provider library, " + "which specifies a preferred provider. If registered " + "successfully, a preferred provider has priority over " + "other providers with the same name. " "(default: " PROVDLDIR ")"); fi_param_get_str(NULL, "provider_path", &provdir); @@ -766,10 +817,32 @@ static void ofi_load_dl_prov(void) } if (dirs) { - for (i = 0; dirs[i]; i++) - ofi_ini_dir(dirs[i]); + int num_dirs = 0; + + for (i = 0; dirs[i]; i++) { + if (dirs[i][0] == '+') { + ofi_load_preferred_dl_prov(dirs[i]+1); + } else { + ofi_ini_dir(dirs[i]); + num_dirs++; + } + } ofi_free_string_array(dirs); + + if (num_dirs) + return; + + /* + * When FI_PROVIDER_PATH contains only preferred providers, fall + * back to searching under the default path. 
+ */ + dirs = ofi_split_and_alloc(PROVDLDIR, ":", NULL); + if (dirs) { + for (i = 0; dirs[i]; i++) + ofi_ini_dir(dirs[i]); + ofi_free_string_array(dirs); + } } } @@ -900,6 +973,7 @@ void fi_ini(void) ofi_register_provider(PSM3_INIT, NULL); ofi_register_provider(PSM2_INIT, NULL); ofi_register_provider(CXI_INIT, NULL); + ofi_register_provider(USNIC_INIT, NULL); ofi_register_provider(SHM_INIT, NULL); ofi_register_provider(SM2_INIT, NULL); diff --git a/src/fi_tostr.c b/src/fi_tostr.c index 2b95be5b87c..9cc20e4f4d7 100644 --- a/src/fi_tostr.c +++ b/src/fi_tostr.c @@ -126,10 +126,7 @@ static void ofi_tostr_addr_format(char *buf, size_t len, uint32_t addr_format) CASEENUMSTRN(FI_ADDR_OPX, len); CASEENUMSTRN(FI_ADDR_CXI, len); default: - if (addr_format & FI_PROV_SPECIFIC) - ofi_strncatf(buf, len, "Provider specific"); - else - ofi_strncatf(buf, len, "Unknown"); + ofi_strncatf(buf, len, "Unknown"); break; } } @@ -140,6 +137,7 @@ static void ofi_tostr_progress(char *buf, size_t len, enum fi_progress progress) CASEENUMSTRN(FI_PROGRESS_UNSPEC, len); CASEENUMSTRN(FI_PROGRESS_AUTO, len); CASEENUMSTRN(FI_PROGRESS_MANUAL, len); + CASEENUMSTRN(FI_PROGRESS_CONTROL_UNIFIED, len); default: ofi_strncatf(buf, len, "Unknown"); break; @@ -219,7 +217,6 @@ static void ofi_tostr_caps(char *buf, size_t len, uint64_t caps) IFFLAGSTRN(caps, FI_TRIGGER, len); IFFLAGSTRN(caps, FI_FENCE, len); - IFFLAGSTRN(caps, FI_VARIABLE_MSG, len); IFFLAGSTRN(caps, FI_RMA_PMEM, len); IFFLAGSTRN(caps, FI_SOURCE_ERR, len); IFFLAGSTRN(caps, FI_LOCAL_COMM, len); @@ -241,8 +238,6 @@ static void ofi_tostr_ep_type(char *buf, size_t len, enum fi_ep_type ep_type) CASEENUMSTRN(FI_EP_MSG, len); CASEENUMSTRN(FI_EP_DGRAM, len); CASEENUMSTRN(FI_EP_RDM, len); - CASEENUMSTRN(FI_EP_SOCK_STREAM, len); - CASEENUMSTRN(FI_EP_SOCK_DGRAM, len); default: ofi_strncatf(buf, len, "Unknown"); break; @@ -275,11 +270,9 @@ static void ofi_tostr_protocol(char *buf, size_t len, uint32_t protocol) CASEENUMSTRN(FI_PROTO_CXI, len); CASEENUMSTRN(FI_PROTO_XNET, len); CASEENUMSTRN(FI_PROTO_SM2, len); + CASEENUMSTRN(FI_PROTO_CXI_RNR, len); default: - if (protocol & FI_PROV_SPECIFIC) - ofi_strncatf(buf, len, "Provider specific"); - else - ofi_strncatf(buf, len, "Unknown"); + ofi_strncatf(buf, len, "Unknown"); break; } } @@ -291,8 +284,6 @@ static void ofi_tostr_mode(char *buf, size_t len, uint64_t mode) IFFLAGSTRN(mode, FI_ASYNC_IOV, len); IFFLAGSTRN(mode, FI_RX_CQ_DATA, len); IFFLAGSTRN(mode, FI_LOCAL_MR, len); - IFFLAGSTRN(mode, FI_NOTIFY_FLAGS_ONLY, len); - IFFLAGSTRN(mode, FI_RESTRICTED_COMP, len); IFFLAGSTRN(mode, FI_CONTEXT2, len); IFFLAGSTRN(mode, FI_BUFFERED_RECV, len); @@ -905,6 +896,7 @@ ofi_tostr_cntr_events(char *buf, size_t len, enum fi_cntr_events events) { switch (events) { CASEENUMSTRN(FI_CNTR_EVENTS_COMP, len); + CASEENUMSTRN(FI_CNTR_EVENTS_BYTES, len); default: ofi_strncatf(buf, len, "Unknown"); break; diff --git a/src/hmem.c b/src/hmem.c index 3e4b2e270eb..ffc3aca63a4 100644 --- a/src/hmem.c +++ b/src/hmem.c @@ -598,13 +598,14 @@ void ofi_hmem_set_iface_filter(const char* iface_filter_str, bool* filter) "synapseai" }; char *iface_filter_str_copy = strdup(iface_filter_str); + char *saveptr; memset(filter, false, sizeof(bool) * ARRAY_SIZE(hmem_ops)); /* always enable system hmem interface */ filter[FI_HMEM_SYSTEM] = true; - entry = strtok(iface_filter_str_copy, token); + entry = strtok_r(iface_filter_str_copy, token, &saveptr); while (entry != NULL) { for (iface = 0; iface < ARRAY_SIZE(hmem_ops); iface++) { if (!strcasecmp(iface_labels[iface], entry)) { @@ 
-619,7 +620,7 @@ void ofi_hmem_set_iface_filter(const char* iface_filter_str, bool* filter) entry); } - entry = strtok(NULL, token); + entry = strtok_r(NULL, token, &saveptr); } free(iface_filter_str_copy); diff --git a/src/hmem_cuda.c b/src/hmem_cuda.c index 5d83905384e..25c89ff590c 100644 --- a/src/hmem_cuda.c +++ b/src/hmem_cuda.c @@ -65,6 +65,7 @@ _(cuGetErrorName) \ _(cuGetErrorString) \ _(cuPointerGetAttribute) \ + _(cuPointerGetAttributes) \ _(cuPointerSetAttribute) \ _(cuDeviceCanAccessPeer) \ _(cuMemGetAddressRange) \ @@ -126,6 +127,10 @@ static struct { CUresult (*cuPointerGetAttribute)(void *data, CUpointer_attribute attribute, CUdeviceptr ptr); + CUresult (*cuPointerGetAttributes)(unsigned int num_attributes, + CUpointer_attribute *attributes, + void **data, + CUdeviceptr ptr); CUresult (*cuPointerSetAttribute)(const void *data, CUpointer_attribute attribute, CUdeviceptr ptr); @@ -222,6 +227,14 @@ CUresult ofi_cuPointerGetAttribute(void *data, CUpointer_attribute attribute, return cuda_ops.cuPointerGetAttribute(data, attribute, ptr); } +CUresult ofi_cuPointerGetAttributes(unsigned int num_attributes, + CUpointer_attribute *attributes, + void **data, CUdeviceptr ptr) +{ + return cuda_ops.cuPointerGetAttributes(num_attributes, attributes, + data, ptr); +} + #define CUDA_DRIVER_LOG_ERR(cu_result, cuda_api_name) \ { \ const char *cu_error_name; \ @@ -815,26 +828,33 @@ int cuda_hmem_cleanup(void) bool cuda_is_addr_valid(const void *addr, uint64_t *device, uint64_t *flags) { CUresult cuda_ret; - unsigned int data; + unsigned int mem_type; + unsigned int is_managed; + uint64_t device_ord; + + /* Each pointer in 'data' needs to have the same array index + as the corresponding attribute in 'cuda_attributes' */ + void *data[] = {&mem_type, &is_managed, &device_ord}; + + CUpointer_attribute cuda_attributes[] = { + CU_POINTER_ATTRIBUTE_MEMORY_TYPE, + CU_POINTER_ATTRIBUTE_IS_MANAGED, + CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL + }; + + cuda_ret = ofi_cuPointerGetAttributes(ARRAY_SIZE(cuda_attributes), + cuda_attributes, data, + (CUdeviceptr) addr); - cuda_ret = ofi_cuPointerGetAttribute(&data, - CU_POINTER_ATTRIBUTE_MEMORY_TYPE, - (CUdeviceptr)addr); switch (cuda_ret) { case CUDA_SUCCESS: - if (data == CU_MEMORYTYPE_DEVICE) { - if (flags) + if (mem_type == CU_MEMORYTYPE_DEVICE) { + if (flags && !is_managed) *flags = FI_HMEM_DEVICE_ONLY; - if (device) { - *device = 0; - cuda_ret = ofi_cuPointerGetAttribute( - (int *) device, - CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL, - (CUdeviceptr) addr); - if (cuda_ret) - break; - } + if (device) + *device = device_ord; + return true; } break; diff --git a/src/hmem_cuda_gdrcopy.c b/src/hmem_cuda_gdrcopy.c index e013777dfbc..e58a2b0240a 100644 --- a/src/hmem_cuda_gdrcopy.c +++ b/src/hmem_cuda_gdrcopy.c @@ -368,7 +368,7 @@ int cuda_gdrcopy_dev_unregister(uint64_t handle) err = global_gdrcopy_ops.gdr_unpin_buffer(global_gdr, gdrcopy->mh); if (err) { FI_WARN(&core_prov, FI_LOG_MR, - "gdr_unmap failed! error: %s\n", + "gdr_unpin_buffer failed! 
error: %s\n", strerror(err)); goto exit; } diff --git a/src/hmem_ze.c b/src/hmem_ze.c index 9722385588c..6b4056c43e4 100644 --- a/src/hmem_ze.c +++ b/src/hmem_ze.c @@ -393,6 +393,7 @@ static int ze_hmem_init_fds(void) char *str; uint16_t domain_id; uint8_t pci_id; + char *saveptr; dir = opendir(dev_dir); if (dir == NULL) @@ -411,10 +412,10 @@ static int ze_hmem_init_fds(void) dev_fds[num_pci_devices] = open(dev_name, O_RDWR); if (dev_fds[num_pci_devices] == -1) goto err; - str = strtok(ent->d_name, "-"); - str = strtok(NULL, ":"); + str = strtok_r(ent->d_name, "-", &saveptr); + str = strtok_r(NULL, ":", &saveptr); domain_id = (uint16_t) strtol(str, NULL, 16); - str = strtok(NULL, ":"); + str = strtok_r(NULL, ":", &saveptr); pci_id = (uint8_t) strtol(str, NULL, 16); for (i = 0; i < num_devices; i++) { if (dev_info[i].uuid.id[8] == pci_id && diff --git a/src/xpmem.c b/src/xpmem.c index 884e4c2088b..799dbde4a09 100644 --- a/src/xpmem.c +++ b/src/xpmem.c @@ -164,7 +164,7 @@ int xpmem_copy(struct iovec *local, unsigned long local_cnt, { int ret, i; struct iovec iov; - uint64_t offset, copy_len; + uint64_t offset, copy_len, delta; void *mapped_addr; struct ofi_mr_entry *mr_entry; long page_size = ofi_get_page_size(); @@ -175,19 +175,23 @@ int xpmem_copy(struct iovec *local, unsigned long local_cnt, iov.iov_base = (void *) ofi_get_page_start(remote[i].iov_base, page_size); iov.iov_len = - (uintptr_t) ofi_get_page_end(remote[i].iov_base + - remote[i].iov_len, page_size) - + (uintptr_t) ofi_get_page_end( + (void*)((uintptr_t)remote[i].iov_base + + remote[i].iov_len), page_size) - (uintptr_t)iov.iov_base; - offset = (uintptr_t)((uintptr_t) remote[i].iov_base - - (uintptr_t) iov.iov_base); ret = ofi_xpmem_cache_search(xpmem_cache, &iov, pid, &mr_entry, (struct xpmem_client *)user_data); if (ret) return ret; + delta = (uintptr_t) iov.iov_base - + (uintptr_t) mr_entry->info.iov.iov_base; + offset = (uintptr_t)((uintptr_t) remote[i].iov_base - + (uintptr_t) iov.iov_base); + mapped_addr = (char*) (uintptr_t)mr_entry->info.mapped_addr + - offset; + offset + delta; copy_len = (local[i].iov_len <= iov.iov_len - offset) ? local[i].iov_len : iov.iov_len - offset; diff --git a/util/info.c b/util/info.c index 6dc7789f2d1..698b340c38f 100644 --- a/util/info.c +++ b/util/info.c @@ -162,8 +162,6 @@ static int str2mode(char *inputstr, uint64_t *value) ORCASE(FI_ASYNC_IOV); ORCASE(FI_RX_CQ_DATA); ORCASE(FI_LOCAL_MR); - ORCASE(FI_NOTIFY_FLAGS_ONLY); - ORCASE(FI_RESTRICTED_COMP); ORCASE(FI_CONTEXT2); fprintf(stderr, "error: Unrecognized mode: %s\n", inputstr); @@ -177,8 +175,6 @@ static int str2ep_type(char *inputstr, enum fi_ep_type *value) ORCASE(FI_EP_MSG); ORCASE(FI_EP_DGRAM); ORCASE(FI_EP_RDM); - ORCASE(FI_EP_SOCK_STREAM); - ORCASE(FI_EP_SOCK_DGRAM); fprintf(stderr, "error: Unrecognized endpoint type: %s\n", inputstr); diff --git a/util/pingpong.c b/util/pingpong.c index b38c23d39a3..f8af6943320 100644 --- a/util/pingpong.c +++ b/util/pingpong.c @@ -87,6 +87,7 @@ enum { struct pp_opts { uint16_t src_port; uint16_t dst_port; + char *src_addr; char *dst_addr; int iterations; int transfer_size; @@ -280,6 +281,8 @@ static void pp_banner_options(struct ct_pingpong *ct) } PP_DEBUG(" * PingPong options:\n"); + PP_DEBUG(" - %-20s: [%s]\n", "src_addr", + (opts.src_addr)? 
opts.src_addr: "None"); PP_DEBUG(" - %-20s: [%" PRIu16 "]\n", "src_port", opts.src_port); PP_DEBUG(" - %-20s: [%s]\n", "dst_addr", opts.dst_addr); PP_DEBUG(" - %-20s: [%" PRIu16 "]\n", "dst_port", opts.dst_port); @@ -1999,6 +2002,8 @@ static void pp_pingpong_usage(struct ct_pingpong *ct, char *name, char *desc) "destination control port number (client: 47592)"); fprintf(stderr, " %-20s %s\n", "-d ", "domain name"); + fprintf(stderr, " %-20s %s\n", "-s ", + "source address associated with domain name"); fprintf(stderr, " %-20s %s\n", "-p ", "specific provider name eg sockets, verbs"); fprintf(stderr, " %-20s %s\n", "-e ", @@ -2071,6 +2076,11 @@ static void pp_parse_opts(struct ct_pingpong *ct, int op, char *optarg) } break; + /* Source address */ + case 's': + ct->opts.src_addr = optarg; + break; + /* Check data */ case 'c': ct->opts.options |= PP_OPT_VERIFY_DATA; @@ -2274,6 +2284,25 @@ static int run_pingpong_msg(struct ct_pingpong *ct) return ret; } +static void pp_set_src_hint(struct ct_pingpong *ct) +{ + struct addrinfo *results = NULL; + + if (getaddrinfo(ct->opts.src_addr, NULL, NULL, &results)) + goto out; + + ct->hints->src_addr = calloc(1, results->ai_addrlen); + if (!ct->hints->src_addr) + goto out; + + ct->hints->src_addrlen = results->ai_addrlen; + memcpy(ct->hints->src_addr, results->ai_addr, results->ai_addrlen); + ct->hints->addr_format = results->ai_family; + +out: + freeaddrinfo(results); +} + int main(int argc, char **argv) { int op, ret = EXIT_SUCCESS; @@ -2298,7 +2327,7 @@ int main(int argc, char **argv) ofi_osd_init(); - while ((op = getopt(argc, argv, "hvd:p:e:I:S:B:P:cm:6")) != -1) { + while ((op = getopt(argc, argv, "hvd:p:e:I:S:s:B:P:cm:6")) != -1) { switch (op) { default: pp_parse_opts(&ct, op, optarg); @@ -2314,6 +2343,9 @@ int main(int argc, char **argv) if (optind < argc) ct.opts.dst_addr = argv[optind]; + if (ct.opts.src_addr) + pp_set_src_hint(&ct); + pp_banner_options(&ct); switch (ct.hints->ep_attr->type) {